// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/page_alloc.c
 *
 *  Manages the free list, the system allocates free pages here.
 *  Note that kmalloc() lives in slab.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
 *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/interrupt.h>
#include <linux/jiffies.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/kasan.h>
#include <linux/kmsan.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/ratelimit.h>
#include <linux/oom.h>
#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmstat.h>
#include <linux/fault-inject.h>
#include <linux/compaction.h>
#include <trace/events/kmem.h>
#include <trace/events/oom.h>
#include <linux/prefetch.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/sched/mm.h>
#include <linux/page_owner.h>
#include <linux/page_table_check.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
#include <linux/lockdep.h>
#include <linux/psi.h>
#include <linux/khugepaged.h>
#include <linux/delayacct.h>
#include <asm/div64.h>
#include "internal.h"
#include "shuffle.h"
#include "page_reporting.h"

/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
typedef int __bitwise fpi_t;

/* No special request */
#define FPI_NONE		((__force fpi_t)0)

/*
 * Skip free page reporting notification for the (possibly merged) page.
 * This does not hinder free page reporting from grabbing the page,
 * reporting it and marking it "reported" - it only skips notifying
 * the free page reporting infrastructure about a newly freed page. For
 * example, used when temporarily pulling a page from a freelist and
 * putting it back unmodified.
 */
#define FPI_SKIP_REPORT_NOTIFY	((__force fpi_t)BIT(0))

/*
 * Place the (possibly merged) page to the tail of the freelist. Will ignore
 * page shuffling (relevant code - e.g., memory onlining - is expected to
 * shuffle the whole zone).
 *
 * Note: No code should rely on this flag for correctness - it's purely
 *       to allow for optimizations when handing back either fresh pages
 *       (memory onlining) or untouched pages (page isolation, free page
 *       reporting).
 */
#define FPI_TO_TAIL		((__force fpi_t)BIT(1))

/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
/*
 * On SMP, spin_trylock is sufficient protection.
 * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
 */
#define pcp_trylock_prepare(flags)	do { } while (0)
#define pcp_trylock_finish(flag)	do { } while (0)
#else

/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
#define pcp_trylock_prepare(flags)	local_irq_save(flags)
#define pcp_trylock_finish(flags)	local_irq_restore(flags)
#endif

/*
 * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid
 * a migration causing the wrong PCP to be locked and remote memory being
 * potentially allocated, pin the task to the CPU for the lookup+lock.
 * preempt_disable is used on !RT because it is faster than migrate_disable.
 * migrate_disable is used on RT because otherwise RT spinlock usage is
 * interfered with and a high priority task cannot preempt the allocator.
 */
#ifndef CONFIG_PREEMPT_RT
#define pcpu_task_pin()		preempt_disable()
#define pcpu_task_unpin()	preempt_enable()
#else
#define pcpu_task_pin()		migrate_disable()
#define pcpu_task_unpin()	migrate_enable()
#endif

/*
 * Generic helper to look up a per-cpu variable with an embedded spinlock.
 * Return value should be used with equivalent unlock helper.
 */
#define pcpu_spin_lock(type, member, ptr)			\
({								\
	type *_ret;						\
	pcpu_task_pin();					\
	_ret = this_cpu_ptr(ptr);				\
	spin_lock(&_ret->member);				\
	_ret;							\
})

#define pcpu_spin_trylock(type, member, ptr)			\
({								\
	type *_ret;						\
	pcpu_task_pin();					\
	_ret = this_cpu_ptr(ptr);				\
	if (!spin_trylock(&_ret->member)) {			\
		pcpu_task_unpin();				\
		_ret = NULL;					\
	}							\
	_ret;							\
})

#define pcpu_spin_unlock(member, ptr)				\
({								\
	spin_unlock(&ptr->member);				\
	pcpu_task_unpin();					\
})

/* struct per_cpu_pages specific helpers. */
#define pcp_spin_lock(ptr)					\
	pcpu_spin_lock(struct per_cpu_pages, lock, ptr)

#define pcp_spin_trylock(ptr)					\
	pcpu_spin_trylock(struct per_cpu_pages, lock, ptr)

#define pcp_spin_unlock(ptr)					\
	pcpu_spin_unlock(lock, ptr)

#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
#endif

DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);

#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
 * defined in <linux/topology.h>.
 */
DEFINE_PER_CPU(int, _numa_mem_);	/* Kernel "local memory" node */
EXPORT_PER_CPU_SYMBOL(_numa_mem_);
#endif

static DEFINE_MUTEX(pcpu_drain_mutex);

#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
volatile unsigned long latent_entropy __latent_entropy;
EXPORT_SYMBOL(latent_entropy);
#endif

/*
 * Array of node states.
 */
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
	[N_POSSIBLE] = NODE_MASK_ALL,
	[N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
	[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
	[N_MEMORY] = { { [0] = 1UL } },
	[N_CPU] = { { [0] = 1UL } },
#endif	/* NUMA */
};
EXPORT_SYMBOL(node_states);

gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
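
/*
 * Illustrative sketch (not part of this file's logic) of how the pcp locking
 * helpers above are meant to be combined by callers such as the pcp free and
 * allocation paths; "pcp" and "UP_flags" are just local names here:
 *
 *	struct per_cpu_pages *pcp;
 *	unsigned long UP_flags;
 *
 *	pcp_trylock_prepare(UP_flags);
 *	pcp = pcp_spin_trylock(zone->per_cpu_pageset);
 *	if (pcp) {
 *		... operate on this CPU's pcp lists ...
 *		pcp_spin_unlock(pcp);
 *	}
 *	pcp_trylock_finish(UP_flags);
 *
 * pcp_spin_trylock() returns NULL when the lock is contended, in which case
 * callers fall back to freeing/allocating directly against the zone.
 */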
/*
 * A cached value of the page's pageblock's migratetype, used when the page is
 * put on a pcplist. Used to avoid the pageblock migratetype lookup when
 * freeing from pcplists in most cases, at the cost of possibly becoming stale.
 * Also the migratetype set in the page does not necessarily match the pcplist
 * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
 * other index - this ensures that it will be put on the correct CMA freelist.
 */
static inline int get_pcppage_migratetype(struct page *page)
{
	return page->index;
}

static inline void set_pcppage_migratetype(struct page *page, int migratetype)
{
	page->index = migratetype;
}

#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
unsigned int pageblock_order __read_mostly;
#endif

static void __free_pages_ok(struct page *page, unsigned int order,
			    fpi_t fpi_flags);

/*
 * results with 256, 32 in the lowmem_reserve sysctl:
 *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
 *	1G machine -> (16M dma, 784M normal, 224M high)
 *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
 *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
 *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
 *
 * TBD: should special case ZONE_DMA32 machines here - in those we normally
 * don't need any ZONE_NORMAL reservation
 */
static int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	[ZONE_DMA] = 256,
#endif
#ifdef CONFIG_ZONE_DMA32
	[ZONE_DMA32] = 256,
#endif
	[ZONE_NORMAL] = 32,
#ifdef CONFIG_HIGHMEM
	[ZONE_HIGHMEM] = 0,
#endif
	[ZONE_MOVABLE] = 0,
};

char * const zone_names[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	 "DMA",
#endif
#ifdef CONFIG_ZONE_DMA32
	 "DMA32",
#endif
	 "Normal",
#ifdef CONFIG_HIGHMEM
	 "HighMem",
#endif
	 "Movable",
#ifdef CONFIG_ZONE_DEVICE
	 "Device",
#endif
};

const char * const migratetype_names[MIGRATE_TYPES] = {
	"Unmovable",
	"Movable",
	"Reclaimable",
	"HighAtomic",
#ifdef CONFIG_CMA
	"CMA",
#endif
#ifdef CONFIG_MEMORY_ISOLATION
	"Isolate",
#endif
};

static compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
	[NULL_COMPOUND_DTOR] = NULL,
	[COMPOUND_PAGE_DTOR] = free_compound_page,
#ifdef CONFIG_HUGETLB_PAGE
	[HUGETLB_PAGE_DTOR] = free_huge_page,
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	[TRANSHUGE_PAGE_DTOR] = free_transhuge_page,
#endif
};

int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
static int watermark_boost_factor __read_mostly = 15000;
static int watermark_scale_factor = 10;

/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
EXPORT_SYMBOL(movable_zone);

#if MAX_NUMNODES > 1
unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
unsigned int nr_online_nodes __read_mostly = 1;
EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif

static bool page_contains_unaccepted(struct page *page, unsigned int order);
static void accept_page(struct page *page, unsigned int order);
static bool try_to_accept_memory(struct zone *zone, unsigned int order);
static inline bool has_unaccepted_memory(void);
static bool __free_unaccepted(struct page *page);

int page_group_by_mobility_disabled __read_mostly;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
 * During boot we initialize deferred pages on-demand, as needed, but once
 * page_alloc_init_late() has finished, the deferred pages are all initialized,
 * and we can permanently disable that path.
 */
DEFINE_STATIC_KEY_TRUE(deferred_pages);

static inline bool deferred_pages_enabled(void)
{
	return static_branch_unlikely(&deferred_pages);
}

/*
 * deferred_grow_zone() is __init, but it is called from
 * get_page_from_freelist() during early boot until deferred_pages permanently
 * disables this call. This is why we have the __ref wrapper: it avoids a
 * section mismatch warning and ensures that the function body gets unloaded.
 */
static bool __ref
_deferred_grow_zone(struct zone *zone, unsigned int order)
{
	return deferred_grow_zone(zone, order);
}
#else
static inline bool deferred_pages_enabled(void)
{
	return false;
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

/* Return a pointer to the bitmap storing bits affecting a block of pages */
static inline unsigned long *get_pageblock_bitmap(const struct page *page,
							unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
	return section_to_usemap(__pfn_to_section(pfn));
#else
	return page_zone(page)->pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
}

static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
	pfn &= (PAGES_PER_SECTION-1);
#else
	pfn = pfn - pageblock_start_pfn(page_zone(page)->zone_start_pfn);
#endif /* CONFIG_SPARSEMEM */
	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
}
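
/*
 * Worked example for the two helpers above (illustrative; the exact numbers
 * depend on the configuration): with SPARSEMEM, 4 KiB pages, a pageblock_order
 * of 9 and NR_PAGEBLOCK_BITS == 4, a pfn whose offset within its section is
 * 0x2345 lies in pageblock 0x2345 >> 9 = 17, so its 4-bit group starts at bit
 * index 17 * 4 = 68 of the section's usemap, i.e. bit 4 of word 1 on a 64-bit
 * machine. __get_pfnblock_flags_mask() below turns that bit index into a word
 * index, a shift and a mask.
 */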
static __always_inline
unsigned long __get_pfnblock_flags_mask(const struct page *page,
					unsigned long pfn,
					unsigned long mask)
{
	unsigned long *bitmap;
	unsigned long bitidx, word_bitidx;
	unsigned long word;

	bitmap = get_pageblock_bitmap(page, pfn);
	bitidx = pfn_to_bitidx(page, pfn);
	word_bitidx = bitidx / BITS_PER_LONG;
	bitidx &= (BITS_PER_LONG-1);
	/*
	 * This races, without locks, with set_pfnblock_flags_mask(). Ensure
	 * a consistent read of the memory array, so that results, even though
	 * racy, are not corrupted.
	 */
	word = READ_ONCE(bitmap[word_bitidx]);
	return (word >> bitidx) & mask;
}

/**
 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @pfn: The target page frame number
 * @mask: mask of bits that the caller is interested in
 *
 * Return: pageblock_bits flags
 */
unsigned long get_pfnblock_flags_mask(const struct page *page,
					unsigned long pfn, unsigned long mask)
{
	return __get_pfnblock_flags_mask(page, pfn, mask);
}

static __always_inline int get_pfnblock_migratetype(const struct page *page,
						    unsigned long pfn)
{
	return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
}

/**
 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @flags: The flags to set
 * @pfn: The target page frame number
 * @mask: mask of bits that the caller is interested in
 */
void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
					unsigned long pfn,
					unsigned long mask)
{
	unsigned long *bitmap;
	unsigned long bitidx, word_bitidx;
	unsigned long word;

	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
	BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));

	bitmap = get_pageblock_bitmap(page, pfn);
	bitidx = pfn_to_bitidx(page, pfn);
	word_bitidx = bitidx / BITS_PER_LONG;
	bitidx &= (BITS_PER_LONG-1);

	VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);

	mask <<= bitidx;
	flags <<= bitidx;

	word = READ_ONCE(bitmap[word_bitidx]);
	do {
	} while (!try_cmpxchg(&bitmap[word_bitidx], &word, (word & ~mask) | flags));
}

void set_pageblock_migratetype(struct page *page, int migratetype)
{
	if (unlikely(page_group_by_mobility_disabled &&
		     migratetype < MIGRATE_PCPTYPES))
		migratetype = MIGRATE_UNMOVABLE;

	set_pfnblock_flags_mask(page, (unsigned long)migratetype,
				page_to_pfn(page), MIGRATETYPE_MASK);
}
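
/*
 * The empty do/while in set_pfnblock_flags_mask() above is a lock-free
 * read-modify-write: on failure, try_cmpxchg() reloads "word" with the
 * current value, so the loop retries until the 4-bit group is replaced
 * without clobbering concurrent updates to the other pageblocks sharing the
 * word. Equivalent pattern on an arbitrary word (illustrative only; "p",
 * "val" and "mask" are hypothetical):
 *
 *	old = READ_ONCE(*p);
 *	do {
 *		new = (old & ~mask) | val;
 *	} while (!try_cmpxchg(p, &old, new));
 */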
#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
	int ret = 0;
	unsigned seq;
	unsigned long pfn = page_to_pfn(page);
	unsigned long sp, start_pfn;

	do {
		seq = zone_span_seqbegin(zone);
		start_pfn = zone->zone_start_pfn;
		sp = zone->spanned_pages;
		if (!zone_spans_pfn(zone, pfn))
			ret = 1;
	} while (zone_span_seqretry(zone, seq));

	if (ret)
		pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
			pfn, zone_to_nid(zone), zone->name,
			start_pfn, start_pfn + sp);

	return ret;
}

/*
 * Temporary debugging check for pages not lying within a given zone.
 */
static int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
	if (page_outside_zone_boundaries(zone, page))
		return 1;
	if (zone != page_zone(page))
		return 1;

	return 0;
}
#else
static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
	return 0;
}
#endif

static void bad_page(struct page *page, const char *reason)
{
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			goto out;
		}
		if (nr_unshown) {
			pr_alert(
			      "BUG: Bad page state: %lu messages suppressed\n",
				nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
		current->comm, page_to_pfn(page));
	dump_page(page, reason);

	print_modules();
	dump_stack();
out:
	/* Leave bad fields for debug, except PageBuddy could make trouble */
	page_mapcount_reset(page); /* remove PageBuddy */
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}

static inline unsigned int order_to_pindex(int migratetype, int order)
{
	int base = order;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (order > PAGE_ALLOC_COSTLY_ORDER) {
		VM_BUG_ON(order != pageblock_order);
		return NR_LOWORDER_PCP_LISTS;
	}
#else
	VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
#endif

	return (MIGRATE_PCPTYPES * base) + migratetype;
}

static inline int pindex_to_order(unsigned int pindex)
{
	int order = pindex / MIGRATE_PCPTYPES;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pindex == NR_LOWORDER_PCP_LISTS)
		order = pageblock_order;
#else
	VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
#endif

	return order;
}

static inline bool pcp_allowed_order(unsigned int order)
{
	if (order <= PAGE_ALLOC_COSTLY_ORDER)
		return true;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (order == pageblock_order)
		return true;
#endif
	return false;
}

static inline void free_the_page(struct page *page, unsigned int order)
{
	if (pcp_allowed_order(order))		/* Via pcp? */
		free_unref_page(page, order);
	else
		__free_pages_ok(page, order, FPI_NONE);
}
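
/*
 * Illustrative round trip for the helpers above (assuming MIGRATE_PCPTYPES is
 * 3 and PAGE_ALLOC_COSTLY_ORDER is 3): an order-2 MIGRATE_MOVABLE page maps to
 * pindex 3 * 2 + 1 = 7, and pindex_to_order(7) = 7 / 3 = 2. With THP enabled,
 * the single pcp list for pageblock_order-sized pages sits at
 * NR_LOWORDER_PCP_LISTS and is special-cased in both directions.
 */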
/*
 * Higher-order pages are called "compound pages". They are structured thusly:
 *
 * The first PAGE_SIZE page is called the "head page" and has PG_head set.
 *
 * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
 * in bit 0 of page->compound_head. The rest of the bits are a pointer to the
 * head page.
 *
 * The first tail page's ->compound_dtor holds the offset in array of compound
 * page destructors. See compound_page_dtors.
 *
 * The first tail page's ->compound_order holds the order of allocation.
 * This usage means that zero-order pages may not be compound.
 */

void free_compound_page(struct page *page)
{
	mem_cgroup_uncharge(page_folio(page));
	free_the_page(page, compound_order(page));
}

void prep_compound_page(struct page *page, unsigned int order)
{
	int i;
	int nr_pages = 1 << order;

	__SetPageHead(page);
	for (i = 1; i < nr_pages; i++)
		prep_compound_tail(page, i);

	prep_compound_head(page, order);
}

void destroy_large_folio(struct folio *folio)
{
	enum compound_dtor_id dtor = folio->_folio_dtor;

	VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio);
	compound_page_dtors[dtor](&folio->page);
}

static inline void set_buddy_order(struct page *page, unsigned int order)
{
	set_page_private(page, order);
	__SetPageBuddy(page);
}

#ifdef CONFIG_COMPACTION
static inline struct capture_control *task_capc(struct zone *zone)
{
	struct capture_control *capc = current->capture_control;

	return unlikely(capc) &&
		!(current->flags & PF_KTHREAD) &&
		!capc->page &&
		capc->cc->zone == zone ? capc : NULL;
}

static inline bool
compaction_capture(struct capture_control *capc, struct page *page,
		   int order, int migratetype)
{
	if (!capc || order != capc->cc->order)
		return false;

	/* Do not accidentally pollute CMA or isolated regions */
	if (is_migrate_cma(migratetype) ||
	    is_migrate_isolate(migratetype))
		return false;

	/*
	 * Do not let lower order allocations pollute a movable pageblock.
	 * This might let an unmovable request use a reclaimable pageblock
	 * and vice-versa but no more than normal fallback logic which can
	 * have trouble finding a high-order free page.
	 */
	if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
		return false;

	capc->page = page;
	return true;
}

#else
static inline struct capture_control *task_capc(struct zone *zone)
{
	return NULL;
}

static inline bool
compaction_capture(struct capture_control *capc, struct page *page,
		   int order, int migratetype)
{
	return false;
}
#endif /* CONFIG_COMPACTION */

/* Used for pages not on another list */
static inline void add_to_free_list(struct page *page, struct zone *zone,
				    unsigned int order, int migratetype)
{
	struct free_area *area = &zone->free_area[order];

	list_add(&page->buddy_list, &area->free_list[migratetype]);
	area->nr_free++;
}

/* Used for pages not on another list */
static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
					 unsigned int order, int migratetype)
{
	struct free_area *area = &zone->free_area[order];

	list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
	area->nr_free++;
}
/*
 * Used for pages which are on another list. Move the pages to the tail
 * of the list - so the moved pages won't immediately be considered for
 * allocation again (e.g., optimization for memory onlining).
 */
static inline void move_to_free_list(struct page *page, struct zone *zone,
				     unsigned int order, int migratetype)
{
	struct free_area *area = &zone->free_area[order];

	list_move_tail(&page->buddy_list, &area->free_list[migratetype]);
}

static inline void del_page_from_free_list(struct page *page, struct zone *zone,
					   unsigned int order)
{
	/* clear reported state and update reported page count */
	if (page_reported(page))
		__ClearPageReported(page);

	list_del(&page->buddy_list);
	__ClearPageBuddy(page);
	set_page_private(page, 0);
	zone->free_area[order].nr_free--;
}

static inline struct page *get_page_from_free_area(struct free_area *area,
						   int migratetype)
{
	return list_first_entry_or_null(&area->free_list[migratetype],
					struct page, buddy_list);
}

/*
 * If this is not the largest possible page, check if the buddy
 * of the next-highest order is free. If it is, it's possible
 * that pages are being freed that will coalesce soon. In case that
 * is happening, add the free page to the tail of the list
 * so it's less likely to be used soon and more likely to be merged
 * as a higher-order page.
 */
static inline bool
buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
		   struct page *page, unsigned int order)
{
	unsigned long higher_page_pfn;
	struct page *higher_page;

	if (order >= MAX_ORDER - 1)
		return false;

	higher_page_pfn = buddy_pfn & pfn;
	higher_page = page + (higher_page_pfn - pfn);

	return find_buddy_page_pfn(higher_page, higher_page_pfn, order + 1,
			NULL) != NULL;
}
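
/*
 * Worked example for the buddy arithmetic used above (illustrative): for an
 * order-3 page at pfn 0x1238, the buddy is at 0x1238 ^ (1 << 3) = 0x1230, and
 * buddy_pfn & pfn = 0x1230 is the start of the order-4 block a merge would
 * produce. buddy_merge_likely() then asks find_buddy_page_pfn() whether that
 * combined block's own order-4 buddy (pfn 0x1220) is already free, i.e.
 * whether a further merge is imminent.
 */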
/*
 * Freeing function for a buddy system allocator.
 *
 * The concept of a buddy system is to maintain a direct-mapped table
 * (containing bit values) for memory blocks of various "orders".
 * The bottom level table contains the map for the smallest allocatable
 * units of memory (here, pages), and each level above it describes
 * pairs of units from the levels below, hence, "buddies".
 * At a high level, all that happens here is marking the table entry
 * at the bottom level available, and propagating the changes upward
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep a list of pages, which are heads of contiguous
 * free pages of length (1 << order) and marked with PageBuddy.
 * Page's order is recorded in page_private(page) field.
 * So when we are allocating or freeing one, we can derive the state of the
 * other. That is, if we allocate a small block, and both were
 * free, the remainder of the region must be split into blocks.
 * If a block is freed, and its buddy is also free, then this
 * triggers coalescing into a block of larger size.
 *
 * -- nyc
 */

static inline void __free_one_page(struct page *page,
		unsigned long pfn,
		struct zone *zone, unsigned int order,
		int migratetype, fpi_t fpi_flags)
{
	struct capture_control *capc = task_capc(zone);
	unsigned long buddy_pfn = 0;
	unsigned long combined_pfn;
	struct page *buddy;
	bool to_tail;

	VM_BUG_ON(!zone_is_initialized(zone));
	VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);

	VM_BUG_ON(migratetype == -1);
	if (likely(!is_migrate_isolate(migratetype)))
		__mod_zone_freepage_state(zone, 1 << order, migratetype);

	VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
	VM_BUG_ON_PAGE(bad_range(zone, page), page);

	while (order < MAX_ORDER) {
		if (compaction_capture(capc, page, order, migratetype)) {
			__mod_zone_freepage_state(zone, -(1 << order),
								migratetype);
			return;
		}

		buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn);
		if (!buddy)
			goto done_merging;

		if (unlikely(order >= pageblock_order)) {
			/*
			 * We want to prevent merge between freepages on pageblock
			 * without fallbacks and normal pageblock. Without this,
			 * pageblock isolation could cause incorrect freepage or CMA
			 * accounting or HIGHATOMIC accounting.
			 */
			int buddy_mt = get_pageblock_migratetype(buddy);

			if (migratetype != buddy_mt
					&& (!migratetype_is_mergeable(migratetype) ||
						!migratetype_is_mergeable(buddy_mt)))
				goto done_merging;
		}

		/*
		 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
		 * merge with it and move up one order.
		 */
		if (page_is_guard(buddy))
			clear_page_guard(zone, buddy, order, migratetype);
		else
			del_page_from_free_list(buddy, zone, order);
		combined_pfn = buddy_pfn & pfn;
		page = page + (combined_pfn - pfn);
		pfn = combined_pfn;
		order++;
	}

done_merging:
	set_buddy_order(page, order);

	if (fpi_flags & FPI_TO_TAIL)
		to_tail = true;
	else if (is_shuffle_order(order))
		to_tail = shuffle_pick_tail();
	else
		to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);

	if (to_tail)
		add_to_free_list_tail(page, zone, order, migratetype);
	else
		add_to_free_list(page, zone, order, migratetype);

	/* Notify page reporting subsystem of freed page */
	if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
		page_reporting_notify_free(order);
}
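
/*
 * Merging walk-through for __free_one_page() (illustrative): freeing an
 * order-0 page at pfn 0x1001 first looks for its order-0 buddy at 0x1000; if
 * that buddy is free (or a guard page) it is taken off its free list, the
 * pair becomes an order-1 block at 0x1000, and the loop retries at order 1
 * with buddy 0x1002, and so on. The loop stops when no free buddy is found or
 * when the pageblock migratetypes may not be merged, and the block is then
 * queued on the free list at the resulting order (unless compaction captured
 * it first).
 */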
/**
 * split_free_page() -- split a free page at split_pfn_offset
 * @free_page:		the original free page
 * @order:		the order of the page
 * @split_pfn_offset:	split offset within the page
 *
 * Return -ENOENT if the free page is changed, otherwise 0
 *
 * It is used when the free page crosses two pageblocks with different migratetypes
 * at split_pfn_offset within the page. The split free page will be put into
 * separate migratetype lists afterwards. Otherwise, the function achieves
 * nothing.
 */
int split_free_page(struct page *free_page,
			unsigned int order, unsigned long split_pfn_offset)
{
	struct zone *zone = page_zone(free_page);
	unsigned long free_page_pfn = page_to_pfn(free_page);
	unsigned long pfn;
	unsigned long flags;
	int free_page_order;
	int mt;
	int ret = 0;

	if (split_pfn_offset == 0)
		return ret;

	spin_lock_irqsave(&zone->lock, flags);

	if (!PageBuddy(free_page) || buddy_order(free_page) != order) {
		ret = -ENOENT;
		goto out;
	}

	mt = get_pageblock_migratetype(free_page);
	if (likely(!is_migrate_isolate(mt)))
		__mod_zone_freepage_state(zone, -(1UL << order), mt);

	del_page_from_free_list(free_page, zone, order);
	for (pfn = free_page_pfn;
	     pfn < free_page_pfn + (1UL << order);) {
		int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);

		free_page_order = min_t(unsigned int,
					pfn ? __ffs(pfn) : order,
					__fls(split_pfn_offset));
		__free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
				mt, FPI_NONE);
		pfn += 1UL << free_page_order;
		split_pfn_offset -= (1UL << free_page_order);
		/* we have done the first part, now switch to second part */
		if (split_pfn_offset == 0)
			split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
	}
out:
	spin_unlock_irqrestore(&zone->lock, flags);
	return ret;
}
/*
 * A bad page could be due to a number of fields. Instead of multiple branches,
 * try and check multiple fields with one check. The caller must do a detailed
 * check if necessary.
 */
static inline bool page_expected_state(struct page *page,
					unsigned long check_flags)
{
	if (unlikely(atomic_read(&page->_mapcount) != -1))
		return false;

	if (unlikely((unsigned long)page->mapping |
			page_ref_count(page) |
#ifdef CONFIG_MEMCG
			page->memcg_data |
#endif
			(page->flags & check_flags)))
		return false;

	return true;
}

static const char *page_bad_reason(struct page *page, unsigned long flags)
{
	const char *bad_reason = NULL;

	if (unlikely(atomic_read(&page->_mapcount) != -1))
		bad_reason = "nonzero mapcount";
	if (unlikely(page->mapping != NULL))
		bad_reason = "non-NULL mapping";
	if (unlikely(page_ref_count(page) != 0))
		bad_reason = "nonzero _refcount";
	if (unlikely(page->flags & flags)) {
		if (flags == PAGE_FLAGS_CHECK_AT_PREP)
			bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
		else
			bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
	}
#ifdef CONFIG_MEMCG
	if (unlikely(page->memcg_data))
		bad_reason = "page still charged to cgroup";
#endif
	return bad_reason;
}

static void free_page_is_bad_report(struct page *page)
{
	bad_page(page,
		 page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
}

static inline bool free_page_is_bad(struct page *page)
{
	if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
		return false;

	/* Something has gone sideways, find it */
	free_page_is_bad_report(page);
	return true;
}

static inline bool is_check_pages_enabled(void)
{
	return static_branch_unlikely(&check_pages_enabled);
}
static int free_tail_page_prepare(struct page *head_page, struct page *page)
{
	struct folio *folio = (struct folio *)head_page;
	int ret = 1;

	/*
	 * We rely on page->lru.next never having bit 0 set, unless the page
	 * is PageTail(). Let's make sure that's true even for poisoned ->lru.
	 */
	BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);

	if (!is_check_pages_enabled()) {
		ret = 0;
		goto out;
	}
	switch (page - head_page) {
	case 1:
		/* the first tail page: these may be in place of ->mapping */
		if (unlikely(folio_entire_mapcount(folio))) {
			bad_page(page, "nonzero entire_mapcount");
			goto out;
		}
		if (unlikely(atomic_read(&folio->_nr_pages_mapped))) {
			bad_page(page, "nonzero nr_pages_mapped");
			goto out;
		}
		if (unlikely(atomic_read(&folio->_pincount))) {
			bad_page(page, "nonzero pincount");
			goto out;
		}
		break;
	case 2:
		/*
		 * the second tail page: ->mapping is
		 * deferred_list.next -- ignore value.
		 */
		break;
	default:
		if (page->mapping != TAIL_MAPPING) {
			bad_page(page, "corrupted mapping in tail page");
			goto out;
		}
		break;
	}
	if (unlikely(!PageTail(page))) {
		bad_page(page, "PageTail not set");
		goto out;
	}
	if (unlikely(compound_head(page) != head_page)) {
		bad_page(page, "compound_head not consistent");
		goto out;
	}
	ret = 0;
out:
	page->mapping = NULL;
	clear_compound_head(page);
	return ret;
}

/*
 * Skip KASAN memory poisoning when either:
 *
 * 1. For generic KASAN: deferred memory initialization has not yet completed.
 *    Tag-based KASAN modes skip pages freed via deferred memory initialization
 *    using page tags instead (see below).
 * 2. For tag-based KASAN modes: the page has a match-all KASAN tag, indicating
 *    that error detection is disabled for accesses via the page address.
 *
 * Pages will have match-all tags in the following circumstances:
 *
 * 1. Pages are being initialized for the first time, including during deferred
 *    memory init; see the call to page_kasan_tag_reset in __init_single_page.
 * 2. The allocation was not unpoisoned due to __GFP_SKIP_KASAN, with the
 *    exception of pages unpoisoned by kasan_unpoison_vmalloc.
 * 3. The allocation was excluded from being checked due to sampling,
 *    see the call to kasan_unpoison_pages.
 *
 * Poisoning pages during deferred memory init will greatly lengthen the
 * process and cause problems in large memory systems as the deferred pages
 * initialization is done with interrupts disabled.
 *
 * Assuming that there will be no reference to those newly initialized
 * pages before they are ever allocated, this should have no effect on
 * KASAN memory tracking as the poison will be properly inserted at page
 * allocation time. The only corner case is when pages are allocated by
 * on-demand allocation and then freed again before the deferred pages
 * initialization is done, but this is not likely to happen.
 */
static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
{
	if (IS_ENABLED(CONFIG_KASAN_GENERIC))
		return deferred_pages_enabled();

	return page_kasan_tag(page) == 0xff;
}
static void kernel_init_pages(struct page *page, int numpages)
{
	int i;

	/* s390's use of memset() could override KASAN redzones. */
	kasan_disable_current();
	for (i = 0; i < numpages; i++)
		clear_highpage_kasan_tagged(page + i);
	kasan_enable_current();
}

static __always_inline bool free_pages_prepare(struct page *page,
			unsigned int order, fpi_t fpi_flags)
{
	int bad = 0;
	bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
	bool init = want_init_on_free();

	VM_BUG_ON_PAGE(PageTail(page), page);

	trace_mm_page_free(page, order);
	kmsan_free_page(page, order);

	if (unlikely(PageHWPoison(page)) && !order) {
		/*
		 * Do not let hwpoison pages hit pcplists/buddy.
		 * Untie memcg state and reset page's owner.
		 */
		if (memcg_kmem_online() && PageMemcgKmem(page))
			__memcg_kmem_uncharge_page(page, order);
		reset_page_owner(page, order);
		page_table_check_free(page, order);
		return false;
	}

	/*
	 * Check tail pages before head page information is cleared to
	 * avoid checking PageCompound for order-0 pages.
	 */
	if (unlikely(order)) {
		bool compound = PageCompound(page);
		int i;

		VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);

		if (compound)
			ClearPageHasHWPoisoned(page);
		for (i = 1; i < (1 << order); i++) {
			if (compound)
				bad += free_tail_page_prepare(page, page + i);
			if (is_check_pages_enabled()) {
				if (free_page_is_bad(page + i)) {
					bad++;
					continue;
				}
			}
			(page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
		}
	}
	if (PageMappingFlags(page))
		page->mapping = NULL;
	if (memcg_kmem_online() && PageMemcgKmem(page))
		__memcg_kmem_uncharge_page(page, order);
	if (is_check_pages_enabled()) {
		if (free_page_is_bad(page))
			bad++;
		if (bad)
			return false;
	}

	page_cpupid_reset_last(page);
	page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
	reset_page_owner(page, order);
	page_table_check_free(page, order);

	if (!PageHighMem(page)) {
		debug_check_no_locks_freed(page_address(page),
					   PAGE_SIZE << order);
		debug_check_no_obj_freed(page_address(page),
					   PAGE_SIZE << order);
	}

	kernel_poison_pages(page, 1 << order);

	/*
	 * As memory initialization might be integrated into KASAN,
	 * KASAN poisoning and memory initialization code must be
	 * kept together to avoid discrepancies in behavior.
	 *
	 * With hardware tag-based KASAN, memory tags must be set before the
	 * page becomes unavailable via debug_pagealloc or arch_free_page.
	 */
	if (!skip_kasan_poison) {
		kasan_poison_pages(page, order, init);

		/* Memory is already initialized if KASAN did it internally. */
		if (kasan_has_integrated_init())
			init = false;
	}
	if (init)
		kernel_init_pages(page, 1 << order);

	/*
	 * arch_free_page() can make the page's contents inaccessible. s390
	 * does this. So nothing which can access the page's contents should
	 * happen after this.
	 */
	arch_free_page(page, order);

	debug_pagealloc_unmap_pages(page, 1 << order);

	return true;
}
/*
 * Frees a number of pages from the PCP lists.
 * Assumes all pages on list are in same zone.
 * count is the number of pages to free.
 */
static void free_pcppages_bulk(struct zone *zone, int count,
					struct per_cpu_pages *pcp,
					int pindex)
{
	unsigned long flags;
	int min_pindex = 0;
	int max_pindex = NR_PCP_LISTS - 1;
	unsigned int order;
	bool isolated_pageblocks;
	struct page *page;

	/*
	 * Ensure a proper count is passed; otherwise the
	 * while (list_empty(list)) loop below could get stuck.
	 */
	count = min(pcp->count, count);

	/* Ensure requested pindex is drained first. */
	pindex = pindex - 1;

	spin_lock_irqsave(&zone->lock, flags);
	isolated_pageblocks = has_isolate_pageblock(zone);

	while (count > 0) {
		struct list_head *list;
		int nr_pages;

		/* Remove pages from lists in a round-robin fashion. */
		do {
			if (++pindex > max_pindex)
				pindex = min_pindex;
			list = &pcp->lists[pindex];
			if (!list_empty(list))
				break;

			if (pindex == max_pindex)
				max_pindex--;
			if (pindex == min_pindex)
				min_pindex++;
		} while (1);

		order = pindex_to_order(pindex);
		nr_pages = 1 << order;
		do {
			int mt;

			page = list_last_entry(list, struct page, pcp_list);
			mt = get_pcppage_migratetype(page);

			/* must delete to avoid corrupting pcp list */
			list_del(&page->pcp_list);
			count -= nr_pages;
			pcp->count -= nr_pages;

			/* MIGRATE_ISOLATE page should not go to pcplists */
			VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
			/* Pageblock could have been isolated meanwhile */
			if (unlikely(isolated_pageblocks))
				mt = get_pageblock_migratetype(page);

			__free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE);
			trace_mm_page_pcpu_drain(page, order, mt);
		} while (count > 0 && !list_empty(list));
	}

	spin_unlock_irqrestore(&zone->lock, flags);
}
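
/*
 * Note on the round-robin drain above (illustrative): pindex is
 * pre-decremented before the loop, so the first ++pindex lands back on the
 * requested list and drains it first. Whenever the list at the current
 * max_pindex or min_pindex turns out to be empty, the [min_pindex, max_pindex]
 * search window shrinks so already-empty lists are not rescanned on later
 * rounds.
 */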
static void free_one_page(struct zone *zone,
				struct page *page, unsigned long pfn,
				unsigned int order,
				int migratetype, fpi_t fpi_flags)
{
	unsigned long flags;

	spin_lock_irqsave(&zone->lock, flags);
	if (unlikely(has_isolate_pageblock(zone) ||
		is_migrate_isolate(migratetype))) {
		migratetype = get_pfnblock_migratetype(page, pfn);
	}
	__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
	spin_unlock_irqrestore(&zone->lock, flags);
}

static void __free_pages_ok(struct page *page, unsigned int order,
			    fpi_t fpi_flags)
{
	unsigned long flags;
	int migratetype;
	unsigned long pfn = page_to_pfn(page);
	struct zone *zone = page_zone(page);

	if (!free_pages_prepare(page, order, fpi_flags))
		return;

	/*
	 * Call get_pfnblock_migratetype() without spin_lock_irqsave() here;
	 * doing the lookup outside the lock reduces the lock holding time.
	 */
	migratetype = get_pfnblock_migratetype(page, pfn);

	spin_lock_irqsave(&zone->lock, flags);
	if (unlikely(has_isolate_pageblock(zone) ||
		is_migrate_isolate(migratetype))) {
		migratetype = get_pfnblock_migratetype(page, pfn);
	}
	__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
	spin_unlock_irqrestore(&zone->lock, flags);

	__count_vm_events(PGFREE, 1 << order);
}

void __free_pages_core(struct page *page, unsigned int order)
{
	unsigned int nr_pages = 1 << order;
	struct page *p = page;
	unsigned int loop;

	/*
	 * When initializing the memmap, __init_single_page() sets the refcount
	 * of all pages to 1 ("allocated"/"not free"). We have to set the
	 * refcount of all involved pages to 0.
	 */
	prefetchw(p);
	for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
		prefetchw(p + 1);
		__ClearPageReserved(p);
		set_page_count(p, 0);
	}
	__ClearPageReserved(p);
	set_page_count(p, 0);

	atomic_long_add(nr_pages, &page_zone(page)->managed_pages);

	if (page_contains_unaccepted(page, order)) {
		if (order == MAX_ORDER && __free_unaccepted(page))
			return;

		accept_page(page, order);
	}

	/*
	 * Bypass PCP and place fresh pages right to the tail, primarily
	 * relevant for memory onlining.
	 */
	__free_pages_ok(page, order, FPI_TO_TAIL);
}
/*
 * Check that the whole (or subset of) a pageblock given by the interval of
 * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
 * with compaction's migration or free scanner.
 *
 * Return struct page pointer of start_pfn, or NULL if checks were not passed.
 *
 * It's possible on some configurations to have a setup like node0 node1 node0
 * i.e. it's possible that all pages within a zone's range of pages do not
 * belong to a single zone. We assume that a border between node0 and node1
 * can occur within a single pageblock, but not a node0 node1 node0
 * interleaving within a single pageblock. It is therefore sufficient to check
 * the first and last page of a pageblock and avoid checking each individual
 * page in a pageblock.
 *
 * Note: the function may return non-NULL struct page even for a page block
 * which contains a memory hole (i.e. there is no physical memory for a subset
 * of the pfn range). For example, if the pageblock order is MAX_ORDER, which
 * will fall into 2 sub-sections, and the end pfn of the pageblock may be hole
 * even though the start pfn is online and valid. This should be safe most of
 * the time because struct pages are still initialized via init_unavailable_range()
 * and pfn walkers shouldn't touch any physical memory range for which they do
 * not recognize any specific metadata in struct pages.
 */
struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
				     unsigned long end_pfn, struct zone *zone)
{
	struct page *start_page;
	struct page *end_page;

	/* end_pfn is one past the range we are checking */
	end_pfn--;

	if (!pfn_valid(end_pfn))
		return NULL;

	start_page = pfn_to_online_page(start_pfn);
	if (!start_page)
		return NULL;

	if (page_zone(start_page) != zone)
		return NULL;

	end_page = pfn_to_page(end_pfn);

	/* This gives a shorter code than deriving page_zone(end_page) */
	if (page_zone_id(start_page) != page_zone_id(end_page))
		return NULL;

	return start_page;
}

/*
 * The order of subdivision here is critical for the IO subsystem.
 * Please do not alter this order without good reasons and regression
 * testing. Specifically, as large blocks of memory are subdivided,
 * the order in which smaller blocks are delivered depends on the order
 * they're subdivided in this function. This is the primary factor
 * influencing the order in which pages are delivered to the IO
 * subsystem according to empirical testing, and this is also justified
 * by considering the behavior of a buddy system containing a single
 * large block of memory acted on by a series of small allocations.
 * This behavior is a critical factor in sglist merging's success.
 *
 * -- nyc
 */
static inline void expand(struct zone *zone, struct page *page,
	int low, int high, int migratetype)
{
	unsigned long size = 1 << high;

	while (high > low) {
		high--;
		size >>= 1;
		VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);

		/*
		 * Mark as guard pages (or a guard page), so they can be merged
		 * back into the allocator when the buddy is freed.
		 * Corresponding page table entries will not be touched,
		 * pages will stay not present in virtual address space.
		 */
		if (set_page_guard(zone, &page[size], high, migratetype))
			continue;

		add_to_free_list(&page[size], zone, high, migratetype);
		set_buddy_order(&page[size], high);
	}
}
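
/*
 * Splitting walk-through for expand() (illustrative): serving an order-0
 * request from an order-3 free block calls expand() with low = 0, high = 3.
 * The loop hands the upper halves back: page[4] as an order-2 block, then
 * page[2] as order-1, then page[1] as order-0, each added to its free list
 * (or turned into a guard page when page guards are enabled), leaving page[0]
 * for the caller.
 */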
static void check_new_page_bad(struct page *page)
{
	if (unlikely(page->flags & __PG_HWPOISON)) {
		/* Don't complain about hwpoisoned pages */
		page_mapcount_reset(page); /* remove PageBuddy */
		return;
	}

	bad_page(page,
		 page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP));
}

/*
 * This page is about to be returned from the page allocator
 */
static int check_new_page(struct page *page)
{
	if (likely(page_expected_state(page,
				PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
		return 0;

	check_new_page_bad(page);
	return 1;
}

static inline bool check_new_pages(struct page *page, unsigned int order)
{
	if (is_check_pages_enabled()) {
		for (int i = 0; i < (1 << order); i++) {
			struct page *p = page + i;

			if (check_new_page(p))
				return true;
		}
	}

	return false;
}

static inline bool should_skip_kasan_unpoison(gfp_t flags)
{
	/* Don't skip if a software KASAN mode is enabled. */
	if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
	    IS_ENABLED(CONFIG_KASAN_SW_TAGS))
		return false;

	/* Skip if hardware tag-based KASAN is not enabled. */
	if (!kasan_hw_tags_enabled())
		return true;

	/*
	 * With hardware tag-based KASAN enabled, skip if this has been
	 * requested via __GFP_SKIP_KASAN.
	 */
	return flags & __GFP_SKIP_KASAN;
}

static inline bool should_skip_init(gfp_t flags)
{
	/* Don't skip if hardware tag-based KASAN is not enabled. */
	if (!kasan_hw_tags_enabled())
		return false;

	/* For hardware tag-based KASAN, skip if requested. */
	return (flags & __GFP_SKIP_ZERO);
}

inline void post_alloc_hook(struct page *page, unsigned int order,
				gfp_t gfp_flags)
{
	bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
			!should_skip_init(gfp_flags);
	bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS);
	int i;

	set_page_private(page, 0);
	set_page_refcounted(page);

	arch_alloc_page(page, order);
	debug_pagealloc_map_pages(page, 1 << order);

	/*
	 * Page unpoisoning must happen before memory initialization.
	 * Otherwise, the poison pattern will be overwritten for __GFP_ZERO
	 * allocations and the page unpoisoning code will complain.
	 */
	kernel_unpoison_pages(page, 1 << order);

	/*
	 * As memory initialization might be integrated into KASAN,
	 * KASAN unpoisoning and memory initialization code must be
	 * kept together to avoid discrepancies in behavior.
	 */

	/*
	 * If memory tags should be zeroed
	 * (which happens only when memory should be initialized as well).
	 */
	if (zero_tags) {
		/* Initialize both memory and memory tags. */
		for (i = 0; i != 1 << order; ++i)
			tag_clear_highpage(page + i);

		/* Take note that memory was initialized by the loop above. */
		init = false;
	}
	if (!should_skip_kasan_unpoison(gfp_flags) &&
	    kasan_unpoison_pages(page, order, init)) {
		/* Take note that memory was initialized by KASAN. */
		if (kasan_has_integrated_init())
			init = false;
	} else {
		/*
		 * If memory tags have not been set by KASAN, reset the page
		 * tags to ensure page_address() dereferencing does not fault.
		 */
		for (i = 0; i != 1 << order; ++i)
			page_kasan_tag_reset(page + i);
	}
	/* If memory is still not initialized, initialize it now. */
	if (init)
		kernel_init_pages(page, 1 << order);

	set_page_owner(page, order, gfp_flags);
	page_table_check_alloc(page, order);
}
static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
							unsigned int alloc_flags)
{
	post_alloc_hook(page, order, gfp_flags);

	if (order && (gfp_flags & __GFP_COMP))
		prep_compound_page(page, order);

	/*
	 * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
	 * allocate the page. The expectation is that the caller is taking
	 * steps that will free more memory. The caller should avoid the page
	 * being used for !PFMEMALLOC purposes.
	 */
	if (alloc_flags & ALLOC_NO_WATERMARKS)
		set_page_pfmemalloc(page);
	else
		clear_page_pfmemalloc(page);
}

/*
 * Go through the free lists for the given migratetype and remove
 * the smallest available page from the freelists
 */
static __always_inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
						int migratetype)
{
	unsigned int current_order;
	struct free_area *area;
	struct page *page;

	/* Find a page of the appropriate size in the preferred list */
	for (current_order = order; current_order <= MAX_ORDER; ++current_order) {
		area = &(zone->free_area[current_order]);
		page = get_page_from_free_area(area, migratetype);
		if (!page)
			continue;
		del_page_from_free_list(page, zone, current_order);
		expand(zone, page, order, current_order, migratetype);
		set_pcppage_migratetype(page, migratetype);
		trace_mm_page_alloc_zone_locked(page, order, migratetype,
				pcp_allowed_order(order) &&
				migratetype < MIGRATE_PCPTYPES);
		return page;
	}

	return NULL;
}


/*
 * This array describes the order in which free lists are fallen back on
 * when the free lists for the desired migratetype are depleted.
 *
 * The other migratetypes do not have fallbacks.
 */
static int fallbacks[MIGRATE_TYPES][MIGRATE_PCPTYPES - 1] = {
	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE   },
	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE },
	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE   },
};

#ifdef CONFIG_CMA
static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone,
					unsigned int order)
{
	return __rmqueue_smallest(zone, order, MIGRATE_CMA);
}
#else
static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
					unsigned int order) { return NULL; }
#endif
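
/*
 * Illustrative reading of the fallbacks[] table above: an UNMOVABLE request
 * whose own free lists are empty is retried (by __rmqueue_fallback(), further
 * below) first against RECLAIMABLE and then MOVABLE pageblocks. HIGHATOMIC,
 * CMA and ISOLATE never appear as fallback targets here; CMA is only used via
 * the dedicated __rmqueue_cma_fallback() path above.
 */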
1669 */ 1670 if (num_movable && 1671 (PageLRU(page) || __PageMovable(page))) 1672 (*num_movable)++; 1673 pfn++; 1674 continue; 1675 } 1676 1677 /* Make sure we are not inadvertently changing nodes */ 1678 VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); 1679 VM_BUG_ON_PAGE(page_zone(page) != zone, page); 1680 1681 order = buddy_order(page); 1682 move_to_free_list(page, zone, order, migratetype); 1683 pfn += 1 << order; 1684 pages_moved += 1 << order; 1685 } 1686 1687 return pages_moved; 1688 } 1689 1690 int move_freepages_block(struct zone *zone, struct page *page, 1691 int migratetype, int *num_movable) 1692 { 1693 unsigned long start_pfn, end_pfn, pfn; 1694 1695 if (num_movable) 1696 *num_movable = 0; 1697 1698 pfn = page_to_pfn(page); 1699 start_pfn = pageblock_start_pfn(pfn); 1700 end_pfn = pageblock_end_pfn(pfn) - 1; 1701 1702 /* Do not cross zone boundaries */ 1703 if (!zone_spans_pfn(zone, start_pfn)) 1704 start_pfn = pfn; 1705 if (!zone_spans_pfn(zone, end_pfn)) 1706 return 0; 1707 1708 return move_freepages(zone, start_pfn, end_pfn, migratetype, 1709 num_movable); 1710 } 1711 1712 static void change_pageblock_range(struct page *pageblock_page, 1713 int start_order, int migratetype) 1714 { 1715 int nr_pageblocks = 1 << (start_order - pageblock_order); 1716 1717 while (nr_pageblocks--) { 1718 set_pageblock_migratetype(pageblock_page, migratetype); 1719 pageblock_page += pageblock_nr_pages; 1720 } 1721 } 1722 1723 /* 1724 * When we are falling back to another migratetype during allocation, try to 1725 * steal extra free pages from the same pageblocks to satisfy further 1726 * allocations, instead of polluting multiple pageblocks. 1727 * 1728 * If we are stealing a relatively large buddy page, it is likely there will 1729 * be more free pages in the pageblock, so try to steal them all. For 1730 * reclaimable and unmovable allocations, we steal regardless of page size, 1731 * as fragmentation caused by those allocations polluting movable pageblocks 1732 * is worse than movable allocations stealing from unmovable and reclaimable 1733 * pageblocks. 1734 */ 1735 static bool can_steal_fallback(unsigned int order, int start_mt) 1736 { 1737 /* 1738 * Leaving this order check is intended, although there is 1739 * relaxed order check in next check. The reason is that 1740 * we can actually steal whole pageblock if this condition met, 1741 * but, below check doesn't guarantee it and that is just heuristic 1742 * so could be changed anytime. 1743 */ 1744 if (order >= pageblock_order) 1745 return true; 1746 1747 if (order >= pageblock_order / 2 || 1748 start_mt == MIGRATE_RECLAIMABLE || 1749 start_mt == MIGRATE_UNMOVABLE || 1750 page_group_by_mobility_disabled) 1751 return true; 1752 1753 return false; 1754 } 1755 1756 static inline bool boost_watermark(struct zone *zone) 1757 { 1758 unsigned long max_boost; 1759 1760 if (!watermark_boost_factor) 1761 return false; 1762 /* 1763 * Don't bother in zones that are unlikely to produce results. 1764 * On small machines, including kdump capture kernels running 1765 * in a small area, boosting the watermark can cause an out of 1766 * memory situation immediately. 1767 */ 1768 if ((pageblock_nr_pages * 4) > zone_managed_pages(zone)) 1769 return false; 1770 1771 max_boost = mult_frac(zone->_watermark[WMARK_HIGH], 1772 watermark_boost_factor, 10000); 1773 1774 /* 1775 * high watermark may be uninitialised if fragmentation occurs 1776 * very early in boot so do not boost. 
static inline bool boost_watermark(struct zone *zone)
{
	unsigned long max_boost;

	if (!watermark_boost_factor)
		return false;
	/*
	 * Don't bother in zones that are unlikely to produce results.
	 * On small machines, including kdump capture kernels running
	 * in a small area, boosting the watermark can cause an out of
	 * memory situation immediately.
	 */
	if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
		return false;

	max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
			watermark_boost_factor, 10000);

	/*
	 * high watermark may be uninitialised if fragmentation occurs
	 * very early in boot so do not boost. We do not fall
	 * through and boost by pageblock_nr_pages as failing
	 * allocations that early means that reclaim is not going
	 * to help and it may even be impossible to reclaim the
	 * boosted watermark resulting in a hang.
	 */
	if (!max_boost)
		return false;

	max_boost = max(pageblock_nr_pages, max_boost);

	zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
		max_boost);

	return true;
}

/*
 * This function implements actual steal behaviour. If order is large enough,
 * we can steal whole pageblock. If not, we first move freepages in this
 * pageblock to our migratetype and determine how many already-allocated pages
 * are there in the pageblock with a compatible migratetype. If at least half
 * of pages are free or compatible, we can change migratetype of the pageblock
 * itself, so pages freed in the future will be put on the correct free list.
 */
static void steal_suitable_fallback(struct zone *zone, struct page *page,
		unsigned int alloc_flags, int start_type, bool whole_block)
{
	unsigned int current_order = buddy_order(page);
	int free_pages, movable_pages, alike_pages;
	int old_block_type;

	old_block_type = get_pageblock_migratetype(page);

	/*
	 * This can happen due to races and we want to prevent broken
	 * highatomic accounting.
	 */
	if (is_migrate_highatomic(old_block_type))
		goto single_page;

	/* Take ownership for orders >= pageblock_order */
	if (current_order >= pageblock_order) {
		change_pageblock_range(page, current_order, start_type);
		goto single_page;
	}

	/*
	 * Boost watermarks to increase reclaim pressure to reduce the
	 * likelihood of future fallbacks. Wake kswapd now as the node
	 * may be balanced overall and kswapd will not wake naturally.
	 */
	if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
		set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);

	/* We are not allowed to try stealing from the whole block */
	if (!whole_block)
		goto single_page;

	free_pages = move_freepages_block(zone, page, start_type,
						&movable_pages);
	/*
	 * Determine how many pages are compatible with our allocation.
	 * For movable allocation, it's the number of movable pages which
	 * we just obtained. For other types it's a bit more tricky.
	 */
	if (start_type == MIGRATE_MOVABLE) {
		alike_pages = movable_pages;
	} else {
		/*
		 * If we are falling back a RECLAIMABLE or UNMOVABLE allocation
		 * to MOVABLE pageblock, consider all non-movable pages as
		 * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
		 * vice versa, be conservative since we can't distinguish the
		 * exact migratetype of non-movable pages.
		 */
		if (old_block_type == MIGRATE_MOVABLE)
			alike_pages = pageblock_nr_pages
						- (free_pages + movable_pages);
		else
			alike_pages = 0;
	}

	/* moving whole block can fail due to zone boundary conditions */
	if (!free_pages)
		goto single_page;

	/*
	 * If a sufficient number of pages in the block are either free or of
	 * comparable migratability as our allocation, claim the whole block.
	 */
	if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
			page_group_by_mobility_disabled)
		set_pageblock_migratetype(page, start_type);

	return;

single_page:
	move_to_free_list(page, zone, current_order, start_type);
}
1866 */ 1867 if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || 1868 page_group_by_mobility_disabled) 1869 set_pageblock_migratetype(page, start_type); 1870 1871 return; 1872 1873 single_page: 1874 move_to_free_list(page, zone, current_order, start_type); 1875 } 1876 1877 /* 1878 * Check whether there is a suitable fallback freepage with requested order. 1879 * If only_stealable is true, this function returns fallback_mt only if 1880 * we can steal other freepages all together. This would help to reduce 1881 * fragmentation due to mixed migratetype pages in one pageblock. 1882 */ 1883 int find_suitable_fallback(struct free_area *area, unsigned int order, 1884 int migratetype, bool only_stealable, bool *can_steal) 1885 { 1886 int i; 1887 int fallback_mt; 1888 1889 if (area->nr_free == 0) 1890 return -1; 1891 1892 *can_steal = false; 1893 for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) { 1894 fallback_mt = fallbacks[migratetype][i]; 1895 if (free_area_empty(area, fallback_mt)) 1896 continue; 1897 1898 if (can_steal_fallback(order, migratetype)) 1899 *can_steal = true; 1900 1901 if (!only_stealable) 1902 return fallback_mt; 1903 1904 if (*can_steal) 1905 return fallback_mt; 1906 } 1907 1908 return -1; 1909 } 1910 1911 /* 1912 * Reserve a pageblock for exclusive use of high-order atomic allocations if 1913 * there are no empty page blocks that contain a page with a suitable order 1914 */ 1915 static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, 1916 unsigned int alloc_order) 1917 { 1918 int mt; 1919 unsigned long max_managed, flags; 1920 1921 /* 1922 * Limit the number reserved to 1 pageblock or roughly 1% of a zone. 1923 * Check is race-prone but harmless. 1924 */ 1925 max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages; 1926 if (zone->nr_reserved_highatomic >= max_managed) 1927 return; 1928 1929 spin_lock_irqsave(&zone->lock, flags); 1930 1931 /* Recheck the nr_reserved_highatomic limit under the lock */ 1932 if (zone->nr_reserved_highatomic >= max_managed) 1933 goto out_unlock; 1934 1935 /* Yoink! */ 1936 mt = get_pageblock_migratetype(page); 1937 /* Only reserve normal pageblocks (i.e., they can merge with others) */ 1938 if (migratetype_is_mergeable(mt)) { 1939 zone->nr_reserved_highatomic += pageblock_nr_pages; 1940 set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); 1941 move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL); 1942 } 1943 1944 out_unlock: 1945 spin_unlock_irqrestore(&zone->lock, flags); 1946 } 1947 1948 /* 1949 * Used when an allocation is about to fail under memory pressure. This 1950 * potentially hurts the reliability of high-order allocations when under 1951 * intense memory pressure but failed atomic allocations should be easier 1952 * to recover from than an OOM. 1953 * 1954 * If @force is true, try to unreserve a pageblock even though highatomic 1955 * pageblock is exhausted. 1956 */ 1957 static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, 1958 bool force) 1959 { 1960 struct zonelist *zonelist = ac->zonelist; 1961 unsigned long flags; 1962 struct zoneref *z; 1963 struct zone *zone; 1964 struct page *page; 1965 int order; 1966 bool ret; 1967 1968 for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, 1969 ac->nodemask) { 1970 /* 1971 * Preserve at least one pageblock unless memory pressure 1972 * is really high. 
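 *
 * Together with the cap applied in reserve_highatomic_pageblock() above,
 * the highatomic reserve is bounded on both ends. A user-space sketch of
 * the two bounds, for illustration only (hypothetical helper names):
 *
 *	// upper bound used when reserving: about 1% of the managed pages,
 *	// rounded up by one pageblock
 *	static unsigned long highatomic_limit(unsigned long managed_pages,
 *					      unsigned long pages_per_block)
 *	{
 *		return managed_pages / 100 + pages_per_block;
 *	}
 *
 *	// lower bound used when unreserving: keep at least one pageblock
 *	// unless the caller forces a full unreserve
 *	static bool keep_last_pageblock(unsigned long reserved,
 *					unsigned long pages_per_block,
 *					bool force)
 *	{
 *		return !force && reserved <= pages_per_block;
 *	}
 *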
1973 */ 1974 if (!force && zone->nr_reserved_highatomic <= 1975 pageblock_nr_pages) 1976 continue; 1977 1978 spin_lock_irqsave(&zone->lock, flags); 1979 for (order = 0; order <= MAX_ORDER; order++) { 1980 struct free_area *area = &(zone->free_area[order]); 1981 1982 page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); 1983 if (!page) 1984 continue; 1985 1986 /* 1987 * In page freeing path, migratetype change is racy so 1988 * we can counter several free pages in a pageblock 1989 * in this loop although we changed the pageblock type 1990 * from highatomic to ac->migratetype. So we should 1991 * adjust the count once. 1992 */ 1993 if (is_migrate_highatomic_page(page)) { 1994 /* 1995 * It should never happen but changes to 1996 * locking could inadvertently allow a per-cpu 1997 * drain to add pages to MIGRATE_HIGHATOMIC 1998 * while unreserving so be safe and watch for 1999 * underflows. 2000 */ 2001 zone->nr_reserved_highatomic -= min( 2002 pageblock_nr_pages, 2003 zone->nr_reserved_highatomic); 2004 } 2005 2006 /* 2007 * Convert to ac->migratetype and avoid the normal 2008 * pageblock stealing heuristics. Minimally, the caller 2009 * is doing the work and needs the pages. More 2010 * importantly, if the block was always converted to 2011 * MIGRATE_UNMOVABLE or another type then the number 2012 * of pageblocks that cannot be completely freed 2013 * may increase. 2014 */ 2015 set_pageblock_migratetype(page, ac->migratetype); 2016 ret = move_freepages_block(zone, page, ac->migratetype, 2017 NULL); 2018 if (ret) { 2019 spin_unlock_irqrestore(&zone->lock, flags); 2020 return ret; 2021 } 2022 } 2023 spin_unlock_irqrestore(&zone->lock, flags); 2024 } 2025 2026 return false; 2027 } 2028 2029 /* 2030 * Try finding a free buddy page on the fallback list and put it on the free 2031 * list of requested migratetype, possibly along with other pages from the same 2032 * block, depending on fragmentation avoidance heuristics. Returns true if 2033 * fallback was found so that __rmqueue_smallest() can grab it. 2034 * 2035 * The use of signed ints for order and current_order is a deliberate 2036 * deviation from the rest of this file, to make the for loop 2037 * condition simpler. 2038 */ 2039 static __always_inline bool 2040 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, 2041 unsigned int alloc_flags) 2042 { 2043 struct free_area *area; 2044 int current_order; 2045 int min_order = order; 2046 struct page *page; 2047 int fallback_mt; 2048 bool can_steal; 2049 2050 /* 2051 * Do not steal pages from freelists belonging to other pageblocks 2052 * i.e. orders < pageblock_order. If there are no local zones free, 2053 * the zonelists will be reiterated without ALLOC_NOFRAGMENT. 2054 */ 2055 if (order < pageblock_order && alloc_flags & ALLOC_NOFRAGMENT) 2056 min_order = pageblock_order; 2057 2058 /* 2059 * Find the largest available free page in the other list. This roughly 2060 * approximates finding the pageblock with the most free pages, which 2061 * would be too costly to do exactly. 2062 */ 2063 for (current_order = MAX_ORDER; current_order >= min_order; 2064 --current_order) { 2065 area = &(zone->free_area[current_order]); 2066 fallback_mt = find_suitable_fallback(area, current_order, 2067 start_migratetype, false, &can_steal); 2068 if (fallback_mt == -1) 2069 continue; 2070 2071 /* 2072 * We cannot steal all free pages from the pageblock and the 2073 * requested migratetype is movable. 
In that case it's better to 2074 * steal and split the smallest available page instead of the 2075 * largest available page, because even if the next movable 2076 * allocation falls back into a different pageblock than this 2077 * one, it won't cause permanent fragmentation. 2078 */ 2079 if (!can_steal && start_migratetype == MIGRATE_MOVABLE 2080 && current_order > order) 2081 goto find_smallest; 2082 2083 goto do_steal; 2084 } 2085 2086 return false; 2087 2088 find_smallest: 2089 for (current_order = order; current_order <= MAX_ORDER; 2090 current_order++) { 2091 area = &(zone->free_area[current_order]); 2092 fallback_mt = find_suitable_fallback(area, current_order, 2093 start_migratetype, false, &can_steal); 2094 if (fallback_mt != -1) 2095 break; 2096 } 2097 2098 /* 2099 * This should not happen - we already found a suitable fallback 2100 * when looking for the largest page. 2101 */ 2102 VM_BUG_ON(current_order > MAX_ORDER); 2103 2104 do_steal: 2105 page = get_page_from_free_area(area, fallback_mt); 2106 2107 steal_suitable_fallback(zone, page, alloc_flags, start_migratetype, 2108 can_steal); 2109 2110 trace_mm_page_alloc_extfrag(page, order, current_order, 2111 start_migratetype, fallback_mt); 2112 2113 return true; 2114 2115 } 2116 2117 /* 2118 * Do the hard work of removing an element from the buddy allocator. 2119 * Call me with the zone->lock already held. 2120 */ 2121 static __always_inline struct page * 2122 __rmqueue(struct zone *zone, unsigned int order, int migratetype, 2123 unsigned int alloc_flags) 2124 { 2125 struct page *page; 2126 2127 if (IS_ENABLED(CONFIG_CMA)) { 2128 /* 2129 * Balance movable allocations between regular and CMA areas by 2130 * allocating from CMA when over half of the zone's free memory 2131 * is in the CMA area. 2132 */ 2133 if (alloc_flags & ALLOC_CMA && 2134 zone_page_state(zone, NR_FREE_CMA_PAGES) > 2135 zone_page_state(zone, NR_FREE_PAGES) / 2) { 2136 page = __rmqueue_cma_fallback(zone, order); 2137 if (page) 2138 return page; 2139 } 2140 } 2141 retry: 2142 page = __rmqueue_smallest(zone, order, migratetype); 2143 if (unlikely(!page)) { 2144 if (alloc_flags & ALLOC_CMA) 2145 page = __rmqueue_cma_fallback(zone, order); 2146 2147 if (!page && __rmqueue_fallback(zone, order, migratetype, 2148 alloc_flags)) 2149 goto retry; 2150 } 2151 return page; 2152 } 2153 2154 /* 2155 * Obtain a specified number of elements from the buddy allocator, all under 2156 * a single hold of the lock, for efficiency. Add them to the supplied list. 2157 * Returns the number of new pages which were placed at *list. 2158 */ 2159 static int rmqueue_bulk(struct zone *zone, unsigned int order, 2160 unsigned long count, struct list_head *list, 2161 int migratetype, unsigned int alloc_flags) 2162 { 2163 unsigned long flags; 2164 int i; 2165 2166 spin_lock_irqsave(&zone->lock, flags); 2167 for (i = 0; i < count; ++i) { 2168 struct page *page = __rmqueue(zone, order, migratetype, 2169 alloc_flags); 2170 if (unlikely(page == NULL)) 2171 break; 2172 2173 /* 2174 * Split buddy pages returned by expand() are received here in 2175 * physical page order. The page is added to the tail of 2176 * caller's list. From the callers perspective, the linked list 2177 * is ordered by page number under some conditions. This is 2178 * useful for IO devices that can forward direction from the 2179 * head, thus also in the physical page order. This is useful 2180 * for IO devices that can merge IO requests if the physical 2181 * pages are ordered properly. 
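 *
 * The CMA balancing rule in __rmqueue() above reduces to one comparison.
 * A stand-alone sketch, for illustration only (try_cma_first() is a
 * hypothetical name):
 *
 *	static bool try_cma_first(bool cma_allowed, long free_cma_pages,
 *				  long free_pages)
 *	{
 *		// prefer CMA once it holds more than half of the free memory
 *		return cma_allowed && free_cma_pages > free_pages / 2;
 *	}
 *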
2182 */ 2183 list_add_tail(&page->pcp_list, list); 2184 if (is_migrate_cma(get_pcppage_migratetype(page))) 2185 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 2186 -(1 << order)); 2187 } 2188 2189 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 2190 spin_unlock_irqrestore(&zone->lock, flags); 2191 2192 return i; 2193 } 2194 2195 #ifdef CONFIG_NUMA 2196 /* 2197 * Called from the vmstat counter updater to drain pagesets of this 2198 * currently executing processor on remote nodes after they have 2199 * expired. 2200 */ 2201 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) 2202 { 2203 int to_drain, batch; 2204 2205 batch = READ_ONCE(pcp->batch); 2206 to_drain = min(pcp->count, batch); 2207 if (to_drain > 0) { 2208 spin_lock(&pcp->lock); 2209 free_pcppages_bulk(zone, to_drain, pcp, 0); 2210 spin_unlock(&pcp->lock); 2211 } 2212 } 2213 #endif 2214 2215 /* 2216 * Drain pcplists of the indicated processor and zone. 2217 */ 2218 static void drain_pages_zone(unsigned int cpu, struct zone *zone) 2219 { 2220 struct per_cpu_pages *pcp; 2221 2222 pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); 2223 if (pcp->count) { 2224 spin_lock(&pcp->lock); 2225 free_pcppages_bulk(zone, pcp->count, pcp, 0); 2226 spin_unlock(&pcp->lock); 2227 } 2228 } 2229 2230 /* 2231 * Drain pcplists of all zones on the indicated processor. 2232 */ 2233 static void drain_pages(unsigned int cpu) 2234 { 2235 struct zone *zone; 2236 2237 for_each_populated_zone(zone) { 2238 drain_pages_zone(cpu, zone); 2239 } 2240 } 2241 2242 /* 2243 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 2244 */ 2245 void drain_local_pages(struct zone *zone) 2246 { 2247 int cpu = smp_processor_id(); 2248 2249 if (zone) 2250 drain_pages_zone(cpu, zone); 2251 else 2252 drain_pages(cpu); 2253 } 2254 2255 /* 2256 * The implementation of drain_all_pages(), exposing an extra parameter to 2257 * drain on all cpus. 2258 * 2259 * drain_all_pages() is optimized to only execute on cpus where pcplists are 2260 * not empty. The check for non-emptiness can however race with a free to 2261 * pcplist that has not yet increased the pcp->count from 0 to 1. Callers 2262 * that need the guarantee that every CPU has drained can disable the 2263 * optimizing racy check. 2264 */ 2265 static void __drain_all_pages(struct zone *zone, bool force_all_cpus) 2266 { 2267 int cpu; 2268 2269 /* 2270 * Allocate in the BSS so we won't require allocation in 2271 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y 2272 */ 2273 static cpumask_t cpus_with_pcps; 2274 2275 /* 2276 * Do not drain if one is already in progress unless it's specific to 2277 * a zone. Such callers are primarily CMA and memory hotplug and need 2278 * the drain to be complete when the call returns. 2279 */ 2280 if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) { 2281 if (!zone) 2282 return; 2283 mutex_lock(&pcpu_drain_mutex); 2284 } 2285 2286 /* 2287 * We don't care about racing with CPU hotplug event 2288 * as offline notification will cause the notified 2289 * cpu to drain that CPU pcps and on_each_cpu_mask 2290 * disables preemption as part of its processing 2291 */ 2292 for_each_online_cpu(cpu) { 2293 struct per_cpu_pages *pcp; 2294 struct zone *z; 2295 bool has_pcps = false; 2296 2297 if (force_all_cpus) { 2298 /* 2299 * The pcp.count check is racy, some callers need a 2300 * guarantee that no cpu is missed. 
2301 */ 2302 has_pcps = true; 2303 } else if (zone) { 2304 pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); 2305 if (pcp->count) 2306 has_pcps = true; 2307 } else { 2308 for_each_populated_zone(z) { 2309 pcp = per_cpu_ptr(z->per_cpu_pageset, cpu); 2310 if (pcp->count) { 2311 has_pcps = true; 2312 break; 2313 } 2314 } 2315 } 2316 2317 if (has_pcps) 2318 cpumask_set_cpu(cpu, &cpus_with_pcps); 2319 else 2320 cpumask_clear_cpu(cpu, &cpus_with_pcps); 2321 } 2322 2323 for_each_cpu(cpu, &cpus_with_pcps) { 2324 if (zone) 2325 drain_pages_zone(cpu, zone); 2326 else 2327 drain_pages(cpu); 2328 } 2329 2330 mutex_unlock(&pcpu_drain_mutex); 2331 } 2332 2333 /* 2334 * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 2335 * 2336 * When zone parameter is non-NULL, spill just the single zone's pages. 2337 */ 2338 void drain_all_pages(struct zone *zone) 2339 { 2340 __drain_all_pages(zone, false); 2341 } 2342 2343 static bool free_unref_page_prepare(struct page *page, unsigned long pfn, 2344 unsigned int order) 2345 { 2346 int migratetype; 2347 2348 if (!free_pages_prepare(page, order, FPI_NONE)) 2349 return false; 2350 2351 migratetype = get_pfnblock_migratetype(page, pfn); 2352 set_pcppage_migratetype(page, migratetype); 2353 return true; 2354 } 2355 2356 static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch, 2357 bool free_high) 2358 { 2359 int min_nr_free, max_nr_free; 2360 2361 /* Free everything if batch freeing high-order pages. */ 2362 if (unlikely(free_high)) 2363 return pcp->count; 2364 2365 /* Check for PCP disabled or boot pageset */ 2366 if (unlikely(high < batch)) 2367 return 1; 2368 2369 /* Leave at least pcp->batch pages on the list */ 2370 min_nr_free = batch; 2371 max_nr_free = high - batch; 2372 2373 /* 2374 * Double the number of pages freed each time there is subsequent 2375 * freeing of pages without any allocation. 2376 */ 2377 batch <<= pcp->free_factor; 2378 if (batch < max_nr_free) 2379 pcp->free_factor++; 2380 batch = clamp(batch, min_nr_free, max_nr_free); 2381 2382 return batch; 2383 } 2384 2385 static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, 2386 bool free_high) 2387 { 2388 int high = READ_ONCE(pcp->high); 2389 2390 if (unlikely(!high || free_high)) 2391 return 0; 2392 2393 if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) 2394 return high; 2395 2396 /* 2397 * If reclaim is active, limit the number of pages that can be 2398 * stored on pcp lists 2399 */ 2400 return min(READ_ONCE(pcp->batch) << 2, high); 2401 } 2402 2403 static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, 2404 struct page *page, int migratetype, 2405 unsigned int order) 2406 { 2407 int high; 2408 int pindex; 2409 bool free_high; 2410 2411 __count_vm_events(PGFREE, 1 << order); 2412 pindex = order_to_pindex(migratetype, order); 2413 list_add(&page->pcp_list, &pcp->lists[pindex]); 2414 pcp->count += 1 << order; 2415 2416 /* 2417 * As high-order pages other than THP's stored on PCP can contribute 2418 * to fragmentation, limit the number stored when PCP is heavily 2419 * freeing without allocation. The remainder after bulk freeing 2420 * stops will be drained from vmstat refresh context. 
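 *
 * The sizing done by nr_pcp_free() above can be written as a pure
 * function. A user-space sketch, for illustration only (pcp_free_count()
 * is a hypothetical name and the clamp is open-coded):
 *
 *	static int pcp_free_count(int count, int high, int batch,
 *				  int *free_factor, bool free_high)
 *	{
 *		int min_free, max_free, nr;
 *
 *		if (free_high)
 *			return count;		// flush everything
 *		if (high < batch)
 *			return 1;		// pcp disabled or boot pageset
 *
 *		min_free = batch;		// keep at least one batch cached
 *		max_free = high - batch;
 *
 *		// double the amount freed on each back-to-back free
 *		nr = batch << *free_factor;
 *		if (nr < max_free)
 *			(*free_factor)++;
 *
 *		if (nr < min_free)
 *			nr = min_free;
 *		if (nr > max_free)
 *			nr = max_free;
 *		return nr;
 *	}
 *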
2421 */ 2422 free_high = (pcp->free_factor && order && order <= PAGE_ALLOC_COSTLY_ORDER); 2423 2424 high = nr_pcp_high(pcp, zone, free_high); 2425 if (pcp->count >= high) { 2426 int batch = READ_ONCE(pcp->batch); 2427 2428 free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch, free_high), pcp, pindex); 2429 } 2430 } 2431 2432 /* 2433 * Free a pcp page 2434 */ 2435 void free_unref_page(struct page *page, unsigned int order) 2436 { 2437 unsigned long __maybe_unused UP_flags; 2438 struct per_cpu_pages *pcp; 2439 struct zone *zone; 2440 unsigned long pfn = page_to_pfn(page); 2441 int migratetype; 2442 2443 if (!free_unref_page_prepare(page, pfn, order)) 2444 return; 2445 2446 /* 2447 * We only track unmovable, reclaimable and movable on pcp lists. 2448 * Place ISOLATE pages on the isolated list because they are being 2449 * offlined but treat HIGHATOMIC as movable pages so we can get those 2450 * areas back if necessary. Otherwise, we may have to free 2451 * excessively into the page allocator 2452 */ 2453 migratetype = get_pcppage_migratetype(page); 2454 if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { 2455 if (unlikely(is_migrate_isolate(migratetype))) { 2456 free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE); 2457 return; 2458 } 2459 migratetype = MIGRATE_MOVABLE; 2460 } 2461 2462 zone = page_zone(page); 2463 pcp_trylock_prepare(UP_flags); 2464 pcp = pcp_spin_trylock(zone->per_cpu_pageset); 2465 if (pcp) { 2466 free_unref_page_commit(zone, pcp, page, migratetype, order); 2467 pcp_spin_unlock(pcp); 2468 } else { 2469 free_one_page(zone, page, pfn, order, migratetype, FPI_NONE); 2470 } 2471 pcp_trylock_finish(UP_flags); 2472 } 2473 2474 /* 2475 * Free a list of 0-order pages 2476 */ 2477 void free_unref_page_list(struct list_head *list) 2478 { 2479 unsigned long __maybe_unused UP_flags; 2480 struct page *page, *next; 2481 struct per_cpu_pages *pcp = NULL; 2482 struct zone *locked_zone = NULL; 2483 int batch_count = 0; 2484 int migratetype; 2485 2486 /* Prepare pages for freeing */ 2487 list_for_each_entry_safe(page, next, list, lru) { 2488 unsigned long pfn = page_to_pfn(page); 2489 if (!free_unref_page_prepare(page, pfn, 0)) { 2490 list_del(&page->lru); 2491 continue; 2492 } 2493 2494 /* 2495 * Free isolated pages directly to the allocator, see 2496 * comment in free_unref_page. 2497 */ 2498 migratetype = get_pcppage_migratetype(page); 2499 if (unlikely(is_migrate_isolate(migratetype))) { 2500 list_del(&page->lru); 2501 free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE); 2502 continue; 2503 } 2504 } 2505 2506 list_for_each_entry_safe(page, next, list, lru) { 2507 struct zone *zone = page_zone(page); 2508 2509 list_del(&page->lru); 2510 migratetype = get_pcppage_migratetype(page); 2511 2512 /* 2513 * Either different zone requiring a different pcp lock or 2514 * excessive lock hold times when freeing a large list of 2515 * pages. 2516 */ 2517 if (zone != locked_zone || batch_count == SWAP_CLUSTER_MAX) { 2518 if (pcp) { 2519 pcp_spin_unlock(pcp); 2520 pcp_trylock_finish(UP_flags); 2521 } 2522 2523 batch_count = 0; 2524 2525 /* 2526 * trylock is necessary as pages may be getting freed 2527 * from IRQ or SoftIRQ context after an IO completion. 
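 *
 * The pattern used by free_unref_page() above, and repeated by this loop,
 * is "use the pcplist opportunistically, otherwise free straight to the
 * buddy allocator". In outline, for illustration only:
 *
 *	pcp_trylock_prepare(UP_flags);
 *	pcp = pcp_spin_trylock(zone->per_cpu_pageset);
 *	if (pcp) {
 *		// fast path: batch the page on the per-cpu list
 *		free_unref_page_commit(zone, pcp, page, migratetype, order);
 *		pcp_spin_unlock(pcp);
 *	} else {
 *		// contended or re-entered: bypass the pcplist entirely
 *		free_one_page(zone, page, pfn, order, migratetype, FPI_NONE);
 *	}
 *	pcp_trylock_finish(UP_flags);
 *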
2528 */ 2529 pcp_trylock_prepare(UP_flags); 2530 pcp = pcp_spin_trylock(zone->per_cpu_pageset); 2531 if (unlikely(!pcp)) { 2532 pcp_trylock_finish(UP_flags); 2533 free_one_page(zone, page, page_to_pfn(page), 2534 0, migratetype, FPI_NONE); 2535 locked_zone = NULL; 2536 continue; 2537 } 2538 locked_zone = zone; 2539 } 2540 2541 /* 2542 * Non-isolated types over MIGRATE_PCPTYPES get added 2543 * to the MIGRATE_MOVABLE pcp list. 2544 */ 2545 if (unlikely(migratetype >= MIGRATE_PCPTYPES)) 2546 migratetype = MIGRATE_MOVABLE; 2547 2548 trace_mm_page_free_batched(page); 2549 free_unref_page_commit(zone, pcp, page, migratetype, 0); 2550 batch_count++; 2551 } 2552 2553 if (pcp) { 2554 pcp_spin_unlock(pcp); 2555 pcp_trylock_finish(UP_flags); 2556 } 2557 } 2558 2559 /* 2560 * split_page takes a non-compound higher-order page, and splits it into 2561 * n (1<<order) sub-pages: page[0..n] 2562 * Each sub-page must be freed individually. 2563 * 2564 * Note: this is probably too low level an operation for use in drivers. 2565 * Please consult with lkml before using this in your driver. 2566 */ 2567 void split_page(struct page *page, unsigned int order) 2568 { 2569 int i; 2570 2571 VM_BUG_ON_PAGE(PageCompound(page), page); 2572 VM_BUG_ON_PAGE(!page_count(page), page); 2573 2574 for (i = 1; i < (1 << order); i++) 2575 set_page_refcounted(page + i); 2576 split_page_owner(page, 1 << order); 2577 split_page_memcg(page, 1 << order); 2578 } 2579 EXPORT_SYMBOL_GPL(split_page); 2580 2581 int __isolate_free_page(struct page *page, unsigned int order) 2582 { 2583 struct zone *zone = page_zone(page); 2584 int mt = get_pageblock_migratetype(page); 2585 2586 if (!is_migrate_isolate(mt)) { 2587 unsigned long watermark; 2588 /* 2589 * Obey watermarks as if the page was being allocated. We can 2590 * emulate a high-order watermark check with a raised order-0 2591 * watermark, because we already know our high-order page 2592 * exists. 2593 */ 2594 watermark = zone->_watermark[WMARK_MIN] + (1UL << order); 2595 if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) 2596 return 0; 2597 2598 __mod_zone_freepage_state(zone, -(1UL << order), mt); 2599 } 2600 2601 del_page_from_free_list(page, zone, order); 2602 2603 /* 2604 * Set the pageblock if the isolated page is at least half of a 2605 * pageblock 2606 */ 2607 if (order >= pageblock_order - 1) { 2608 struct page *endpage = page + (1 << order) - 1; 2609 for (; page < endpage; page += pageblock_nr_pages) { 2610 int mt = get_pageblock_migratetype(page); 2611 /* 2612 * Only change normal pageblocks (i.e., they can merge 2613 * with others) 2614 */ 2615 if (migratetype_is_mergeable(mt)) 2616 set_pageblock_migratetype(page, 2617 MIGRATE_MOVABLE); 2618 } 2619 } 2620 2621 return 1UL << order; 2622 } 2623 2624 /** 2625 * __putback_isolated_page - Return a now-isolated page back where we got it 2626 * @page: Page that was isolated 2627 * @order: Order of the isolated page 2628 * @mt: The page's pageblock's migratetype 2629 * 2630 * This function is meant to return a page pulled from the free lists via 2631 * __isolate_free_page back to the free lists they were pulled from. 2632 */ 2633 void __putback_isolated_page(struct page *page, unsigned int order, int mt) 2634 { 2635 struct zone *zone = page_zone(page); 2636 2637 /* zone lock should be held when this function is called */ 2638 lockdep_assert_held(&zone->lock); 2639 2640 /* Return isolated page to tail of freelist. 
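 *
 * For reference, the intended calling pattern for split_page() above is
 * roughly the following (illustration only, error handling trimmed):
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 2);	// one order-2 page
 *
 *	if (page) {
 *		split_page(page, 2);		// now four order-0 pages
 *		__free_page(page + 1);		// each sub-page is freed
 *		__free_page(page + 2);		// individually
 *		__free_page(page + 3);
 *		// page itself stays allocated until __free_page(page)
 *	}
 *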
*/ 2641 __free_one_page(page, page_to_pfn(page), zone, order, mt, 2642 FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL); 2643 } 2644 2645 /* 2646 * Update NUMA hit/miss statistics 2647 */ 2648 static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, 2649 long nr_account) 2650 { 2651 #ifdef CONFIG_NUMA 2652 enum numa_stat_item local_stat = NUMA_LOCAL; 2653 2654 /* skip numa counters update if numa stats is disabled */ 2655 if (!static_branch_likely(&vm_numa_stat_key)) 2656 return; 2657 2658 if (zone_to_nid(z) != numa_node_id()) 2659 local_stat = NUMA_OTHER; 2660 2661 if (zone_to_nid(z) == zone_to_nid(preferred_zone)) 2662 __count_numa_events(z, NUMA_HIT, nr_account); 2663 else { 2664 __count_numa_events(z, NUMA_MISS, nr_account); 2665 __count_numa_events(preferred_zone, NUMA_FOREIGN, nr_account); 2666 } 2667 __count_numa_events(z, local_stat, nr_account); 2668 #endif 2669 } 2670 2671 static __always_inline 2672 struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, 2673 unsigned int order, unsigned int alloc_flags, 2674 int migratetype) 2675 { 2676 struct page *page; 2677 unsigned long flags; 2678 2679 do { 2680 page = NULL; 2681 spin_lock_irqsave(&zone->lock, flags); 2682 /* 2683 * order-0 request can reach here when the pcplist is skipped 2684 * due to non-CMA allocation context. HIGHATOMIC area is 2685 * reserved for high-order atomic allocation, so order-0 2686 * request should skip it. 2687 */ 2688 if (alloc_flags & ALLOC_HIGHATOMIC) 2689 page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); 2690 if (!page) { 2691 page = __rmqueue(zone, order, migratetype, alloc_flags); 2692 2693 /* 2694 * If the allocation fails, allow OOM handling access 2695 * to HIGHATOMIC reserves as failing now is worse than 2696 * failing a high-order atomic allocation in the 2697 * future. 2698 */ 2699 if (!page && (alloc_flags & ALLOC_OOM)) 2700 page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); 2701 2702 if (!page) { 2703 spin_unlock_irqrestore(&zone->lock, flags); 2704 return NULL; 2705 } 2706 } 2707 __mod_zone_freepage_state(zone, -(1 << order), 2708 get_pcppage_migratetype(page)); 2709 spin_unlock_irqrestore(&zone->lock, flags); 2710 } while (check_new_pages(page, order)); 2711 2712 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); 2713 zone_statistics(preferred_zone, zone, 1); 2714 2715 return page; 2716 } 2717 2718 /* Remove page from the per-cpu list, caller must protect the list */ 2719 static inline 2720 struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, 2721 int migratetype, 2722 unsigned int alloc_flags, 2723 struct per_cpu_pages *pcp, 2724 struct list_head *list) 2725 { 2726 struct page *page; 2727 2728 do { 2729 if (list_empty(list)) { 2730 int batch = READ_ONCE(pcp->batch); 2731 int alloced; 2732 2733 /* 2734 * Scale batch relative to order if batch implies 2735 * free pages can be stored on the PCP. Batch can 2736 * be 1 for small zones or for boot pagesets which 2737 * should never store free pages as the pages may 2738 * belong to arbitrary zones. 
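 *
 * The scaling applied below keeps a refill roughly constant in pages
 * rather than in buddies. A stand-alone sketch, for illustration only
 * (refill_count() is a hypothetical name):
 *
 *	static int refill_count(int batch, unsigned int order)
 *	{
 *		if (batch <= 1)
 *			return batch;	// boot pageset or tiny zone
 *
 *		// halve the number of buddies per order step, but always
 *		// take at least two so one can be cached for the next request
 *		batch >>= order;
 *		return batch < 2 ? 2 : batch;
 *	}
 *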
2739 */ 2740 if (batch > 1) 2741 batch = max(batch >> order, 2); 2742 alloced = rmqueue_bulk(zone, order, 2743 batch, list, 2744 migratetype, alloc_flags); 2745 2746 pcp->count += alloced << order; 2747 if (unlikely(list_empty(list))) 2748 return NULL; 2749 } 2750 2751 page = list_first_entry(list, struct page, pcp_list); 2752 list_del(&page->pcp_list); 2753 pcp->count -= 1 << order; 2754 } while (check_new_pages(page, order)); 2755 2756 return page; 2757 } 2758 2759 /* Lock and remove page from the per-cpu list */ 2760 static struct page *rmqueue_pcplist(struct zone *preferred_zone, 2761 struct zone *zone, unsigned int order, 2762 int migratetype, unsigned int alloc_flags) 2763 { 2764 struct per_cpu_pages *pcp; 2765 struct list_head *list; 2766 struct page *page; 2767 unsigned long __maybe_unused UP_flags; 2768 2769 /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */ 2770 pcp_trylock_prepare(UP_flags); 2771 pcp = pcp_spin_trylock(zone->per_cpu_pageset); 2772 if (!pcp) { 2773 pcp_trylock_finish(UP_flags); 2774 return NULL; 2775 } 2776 2777 /* 2778 * On allocation, reduce the number of pages that are batch freed. 2779 * See nr_pcp_free() where free_factor is increased for subsequent 2780 * frees. 2781 */ 2782 pcp->free_factor >>= 1; 2783 list = &pcp->lists[order_to_pindex(migratetype, order)]; 2784 page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list); 2785 pcp_spin_unlock(pcp); 2786 pcp_trylock_finish(UP_flags); 2787 if (page) { 2788 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); 2789 zone_statistics(preferred_zone, zone, 1); 2790 } 2791 return page; 2792 } 2793 2794 /* 2795 * Allocate a page from the given zone. 2796 * Use pcplists for THP or "cheap" high-order allocations. 2797 */ 2798 2799 /* 2800 * Do not instrument rmqueue() with KMSAN. This function may call 2801 * __msan_poison_alloca() through a call to set_pfnblock_flags_mask(). 2802 * If __msan_poison_alloca() attempts to allocate pages for the stack depot, it 2803 * may call rmqueue() again, which will result in a deadlock. 2804 */ 2805 __no_sanitize_memory 2806 static inline 2807 struct page *rmqueue(struct zone *preferred_zone, 2808 struct zone *zone, unsigned int order, 2809 gfp_t gfp_flags, unsigned int alloc_flags, 2810 int migratetype) 2811 { 2812 struct page *page; 2813 2814 /* 2815 * We most definitely don't want callers attempting to 2816 * allocate greater than order-1 page units with __GFP_NOFAIL. 2817 */ 2818 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); 2819 2820 if (likely(pcp_allowed_order(order))) { 2821 /* 2822 * MIGRATE_MOVABLE pcplist could have the pages on CMA area and 2823 * we need to skip it when CMA area isn't allowed. 
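 *
 * The test below can be read as "use the pcplist unless this is a movable
 * allocation that must not touch CMA", since the cached movable pages may
 * come from a CMA area. As a predicate, for illustration only
 * (may_use_pcplist() is a hypothetical name):
 *
 *	static bool may_use_pcplist(bool cma_configured, bool cma_allowed,
 *				    int migratetype)
 *	{
 *		return !cma_configured || cma_allowed ||
 *		       migratetype != MIGRATE_MOVABLE;
 *	}
 *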
2824 */ 2825 if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA || 2826 migratetype != MIGRATE_MOVABLE) { 2827 page = rmqueue_pcplist(preferred_zone, zone, order, 2828 migratetype, alloc_flags); 2829 if (likely(page)) 2830 goto out; 2831 } 2832 } 2833 2834 page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags, 2835 migratetype); 2836 2837 out: 2838 /* Separate test+clear to avoid unnecessary atomics */ 2839 if ((alloc_flags & ALLOC_KSWAPD) && 2840 unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) { 2841 clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); 2842 wakeup_kswapd(zone, 0, 0, zone_idx(zone)); 2843 } 2844 2845 VM_BUG_ON_PAGE(page && bad_range(zone, page), page); 2846 return page; 2847 } 2848 2849 noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 2850 { 2851 return __should_fail_alloc_page(gfp_mask, order); 2852 } 2853 ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE); 2854 2855 static inline long __zone_watermark_unusable_free(struct zone *z, 2856 unsigned int order, unsigned int alloc_flags) 2857 { 2858 long unusable_free = (1 << order) - 1; 2859 2860 /* 2861 * If the caller does not have rights to reserves below the min 2862 * watermark then subtract the high-atomic reserves. This will 2863 * over-estimate the size of the atomic reserve but it avoids a search. 2864 */ 2865 if (likely(!(alloc_flags & ALLOC_RESERVES))) 2866 unusable_free += z->nr_reserved_highatomic; 2867 2868 #ifdef CONFIG_CMA 2869 /* If allocation can't use CMA areas don't use free CMA pages */ 2870 if (!(alloc_flags & ALLOC_CMA)) 2871 unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES); 2872 #endif 2873 #ifdef CONFIG_UNACCEPTED_MEMORY 2874 unusable_free += zone_page_state(z, NR_UNACCEPTED); 2875 #endif 2876 2877 return unusable_free; 2878 } 2879 2880 /* 2881 * Return true if free base pages are above 'mark'. For high-order checks it 2882 * will return true of the order-0 watermark is reached and there is at least 2883 * one free page of a suitable size. Checking now avoids taking the zone lock 2884 * to check in the allocation paths if no pages are free. 2885 */ 2886 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, 2887 int highest_zoneidx, unsigned int alloc_flags, 2888 long free_pages) 2889 { 2890 long min = mark; 2891 int o; 2892 2893 /* free_pages may go negative - that's OK */ 2894 free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags); 2895 2896 if (unlikely(alloc_flags & ALLOC_RESERVES)) { 2897 /* 2898 * __GFP_HIGH allows access to 50% of the min reserve as well 2899 * as OOM. 2900 */ 2901 if (alloc_flags & ALLOC_MIN_RESERVE) { 2902 min -= min / 2; 2903 2904 /* 2905 * Non-blocking allocations (e.g. GFP_ATOMIC) can 2906 * access more reserves than just __GFP_HIGH. Other 2907 * non-blocking allocations requests such as GFP_NOWAIT 2908 * or (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) do not get 2909 * access to the min reserve. 2910 */ 2911 if (alloc_flags & ALLOC_NON_BLOCK) 2912 min -= min / 4; 2913 } 2914 2915 /* 2916 * OOM victims can try even harder than the normal reserve 2917 * users on the grounds that it's definitely going to be in 2918 * the exit path shortly and free memory. Any allocation it 2919 * makes during the free path will be small and short-lived. 2920 */ 2921 if (alloc_flags & ALLOC_OOM) 2922 min -= min / 2; 2923 } 2924 2925 /* 2926 * Check watermarks for an order-0 allocation request. 
If these 2927 * are not met, then a high-order request also cannot go ahead 2928 * even if a suitable page happened to be free. 2929 */ 2930 if (free_pages <= min + z->lowmem_reserve[highest_zoneidx]) 2931 return false; 2932 2933 /* If this is an order-0 request then the watermark is fine */ 2934 if (!order) 2935 return true; 2936 2937 /* For a high-order request, check at least one suitable page is free */ 2938 for (o = order; o <= MAX_ORDER; o++) { 2939 struct free_area *area = &z->free_area[o]; 2940 int mt; 2941 2942 if (!area->nr_free) 2943 continue; 2944 2945 for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { 2946 if (!free_area_empty(area, mt)) 2947 return true; 2948 } 2949 2950 #ifdef CONFIG_CMA 2951 if ((alloc_flags & ALLOC_CMA) && 2952 !free_area_empty(area, MIGRATE_CMA)) { 2953 return true; 2954 } 2955 #endif 2956 if ((alloc_flags & (ALLOC_HIGHATOMIC|ALLOC_OOM)) && 2957 !free_area_empty(area, MIGRATE_HIGHATOMIC)) { 2958 return true; 2959 } 2960 } 2961 return false; 2962 } 2963 2964 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, 2965 int highest_zoneidx, unsigned int alloc_flags) 2966 { 2967 return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, 2968 zone_page_state(z, NR_FREE_PAGES)); 2969 } 2970 2971 static inline bool zone_watermark_fast(struct zone *z, unsigned int order, 2972 unsigned long mark, int highest_zoneidx, 2973 unsigned int alloc_flags, gfp_t gfp_mask) 2974 { 2975 long free_pages; 2976 2977 free_pages = zone_page_state(z, NR_FREE_PAGES); 2978 2979 /* 2980 * Fast check for order-0 only. If this fails then the reserves 2981 * need to be calculated. 2982 */ 2983 if (!order) { 2984 long usable_free; 2985 long reserved; 2986 2987 usable_free = free_pages; 2988 reserved = __zone_watermark_unusable_free(z, 0, alloc_flags); 2989 2990 /* reserved may over estimate high-atomic reserves. */ 2991 usable_free -= min(usable_free, reserved); 2992 if (usable_free > mark + z->lowmem_reserve[highest_zoneidx]) 2993 return true; 2994 } 2995 2996 if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, 2997 free_pages)) 2998 return true; 2999 3000 /* 3001 * Ignore watermark boosting for __GFP_HIGH order-0 allocations 3002 * when checking the min watermark. The min watermark is the 3003 * point where boosting is ignored so that kswapd is woken up 3004 * when below the low watermark. 
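 *
 * For reference, the reserve handling in __zone_watermark_ok() above
 * scales the requested mark roughly as follows. A sketch with worked
 * numbers, for illustration only (effective_min() is a hypothetical name):
 *
 *	static long effective_min(long mark, bool min_reserve,
 *				  bool non_block, bool oom)
 *	{
 *		long min = mark;
 *
 *		if (min_reserve) {
 *			min -= min / 2;		// __GFP_HIGH: half the reserve
 *			if (non_block)
 *				min -= min / 4;	// e.g. GFP_ATOMIC: a further
 *						// quarter
 *		}
 *		if (oom)
 *			min -= min / 2;		// OOM victims dig deepest
 *
 *		return min;
 *	}
 *
 *	// e.g. mark = 1024: __GFP_HIGH alone leaves 512, a non-blocking
 *	// __GFP_HIGH request leaves 384, ALLOC_OOM by itself leaves 512.
 *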
3005 */ 3006 if (unlikely(!order && (alloc_flags & ALLOC_MIN_RESERVE) && z->watermark_boost 3007 && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) { 3008 mark = z->_watermark[WMARK_MIN]; 3009 return __zone_watermark_ok(z, order, mark, highest_zoneidx, 3010 alloc_flags, free_pages); 3011 } 3012 3013 return false; 3014 } 3015 3016 bool zone_watermark_ok_safe(struct zone *z, unsigned int order, 3017 unsigned long mark, int highest_zoneidx) 3018 { 3019 long free_pages = zone_page_state(z, NR_FREE_PAGES); 3020 3021 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 3022 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 3023 3024 return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0, 3025 free_pages); 3026 } 3027 3028 #ifdef CONFIG_NUMA 3029 int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; 3030 3031 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 3032 { 3033 return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <= 3034 node_reclaim_distance; 3035 } 3036 #else /* CONFIG_NUMA */ 3037 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 3038 { 3039 return true; 3040 } 3041 #endif /* CONFIG_NUMA */ 3042 3043 /* 3044 * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid 3045 * fragmentation is subtle. If the preferred zone was HIGHMEM then 3046 * premature use of a lower zone may cause lowmem pressure problems that 3047 * are worse than fragmentation. If the next zone is ZONE_DMA then it is 3048 * probably too small. It only makes sense to spread allocations to avoid 3049 * fragmentation between the Normal and DMA32 zones. 3050 */ 3051 static inline unsigned int 3052 alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask) 3053 { 3054 unsigned int alloc_flags; 3055 3056 /* 3057 * __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD 3058 * to save a branch. 3059 */ 3060 alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM); 3061 3062 #ifdef CONFIG_ZONE_DMA32 3063 if (!zone) 3064 return alloc_flags; 3065 3066 if (zone_idx(zone) != ZONE_NORMAL) 3067 return alloc_flags; 3068 3069 /* 3070 * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and 3071 * the pointer is within zone->zone_pgdat->node_zones[]. Also assume 3072 * on UMA that if Normal is populated then so is DMA32. 3073 */ 3074 BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1); 3075 if (nr_online_nodes > 1 && !populated_zone(--zone)) 3076 return alloc_flags; 3077 3078 alloc_flags |= ALLOC_NOFRAGMENT; 3079 #endif /* CONFIG_ZONE_DMA32 */ 3080 return alloc_flags; 3081 } 3082 3083 /* Must be called after current_gfp_context() which can change gfp_mask */ 3084 static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask, 3085 unsigned int alloc_flags) 3086 { 3087 #ifdef CONFIG_CMA 3088 if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE) 3089 alloc_flags |= ALLOC_CMA; 3090 #endif 3091 return alloc_flags; 3092 } 3093 3094 /* 3095 * get_page_from_freelist goes through the zonelist trying to allocate 3096 * a page. 3097 */ 3098 static struct page * 3099 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, 3100 const struct alloc_context *ac) 3101 { 3102 struct zoneref *z; 3103 struct zone *zone; 3104 struct pglist_data *last_pgdat = NULL; 3105 bool last_pgdat_dirty_ok = false; 3106 bool no_fallback; 3107 3108 retry: 3109 /* 3110 * Scan zonelist, looking for a zone with enough free. 3111 * See also cpuset_node_allowed() comment in kernel/cgroup/cpuset.c. 
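 *
 * The conditions under which alloc_flags_nofragment() above sets
 * ALLOC_NOFRAGMENT can be summarised as a predicate. For illustration
 * only (want_nofragment() is a hypothetical name):
 *
 *	static bool want_nofragment(bool has_dma32, bool preferred_is_normal,
 *				    bool multiple_nodes, bool dma32_populated)
 *	{
 *		if (!has_dma32 || !preferred_is_normal)
 *			return false;
 *		// on NUMA, only bother if the local DMA32 zone has memory
 *		if (multiple_nodes && !dma32_populated)
 *			return false;
 *		return true;
 *	}
 *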
3112 */ 3113 no_fallback = alloc_flags & ALLOC_NOFRAGMENT; 3114 z = ac->preferred_zoneref; 3115 for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx, 3116 ac->nodemask) { 3117 struct page *page; 3118 unsigned long mark; 3119 3120 if (cpusets_enabled() && 3121 (alloc_flags & ALLOC_CPUSET) && 3122 !__cpuset_zone_allowed(zone, gfp_mask)) 3123 continue; 3124 /* 3125 * When allocating a page cache page for writing, we 3126 * want to get it from a node that is within its dirty 3127 * limit, such that no single node holds more than its 3128 * proportional share of globally allowed dirty pages. 3129 * The dirty limits take into account the node's 3130 * lowmem reserves and high watermark so that kswapd 3131 * should be able to balance it without having to 3132 * write pages from its LRU list. 3133 * 3134 * XXX: For now, allow allocations to potentially 3135 * exceed the per-node dirty limit in the slowpath 3136 * (spread_dirty_pages unset) before going into reclaim, 3137 * which is important when on a NUMA setup the allowed 3138 * nodes are together not big enough to reach the 3139 * global limit. The proper fix for these situations 3140 * will require awareness of nodes in the 3141 * dirty-throttling and the flusher threads. 3142 */ 3143 if (ac->spread_dirty_pages) { 3144 if (last_pgdat != zone->zone_pgdat) { 3145 last_pgdat = zone->zone_pgdat; 3146 last_pgdat_dirty_ok = node_dirty_ok(zone->zone_pgdat); 3147 } 3148 3149 if (!last_pgdat_dirty_ok) 3150 continue; 3151 } 3152 3153 if (no_fallback && nr_online_nodes > 1 && 3154 zone != ac->preferred_zoneref->zone) { 3155 int local_nid; 3156 3157 /* 3158 * If moving to a remote node, retry but allow 3159 * fragmenting fallbacks. Locality is more important 3160 * than fragmentation avoidance. 3161 */ 3162 local_nid = zone_to_nid(ac->preferred_zoneref->zone); 3163 if (zone_to_nid(zone) != local_nid) { 3164 alloc_flags &= ~ALLOC_NOFRAGMENT; 3165 goto retry; 3166 } 3167 } 3168 3169 mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); 3170 if (!zone_watermark_fast(zone, order, mark, 3171 ac->highest_zoneidx, alloc_flags, 3172 gfp_mask)) { 3173 int ret; 3174 3175 if (has_unaccepted_memory()) { 3176 if (try_to_accept_memory(zone, order)) 3177 goto try_this_zone; 3178 } 3179 3180 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 3181 /* 3182 * Watermark failed for this zone, but see if we can 3183 * grow this zone if it contains deferred pages. 
3184 */ 3185 if (deferred_pages_enabled()) { 3186 if (_deferred_grow_zone(zone, order)) 3187 goto try_this_zone; 3188 } 3189 #endif 3190 /* Checked here to keep the fast path fast */ 3191 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 3192 if (alloc_flags & ALLOC_NO_WATERMARKS) 3193 goto try_this_zone; 3194 3195 if (!node_reclaim_enabled() || 3196 !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) 3197 continue; 3198 3199 ret = node_reclaim(zone->zone_pgdat, gfp_mask, order); 3200 switch (ret) { 3201 case NODE_RECLAIM_NOSCAN: 3202 /* did not scan */ 3203 continue; 3204 case NODE_RECLAIM_FULL: 3205 /* scanned but unreclaimable */ 3206 continue; 3207 default: 3208 /* did we reclaim enough */ 3209 if (zone_watermark_ok(zone, order, mark, 3210 ac->highest_zoneidx, alloc_flags)) 3211 goto try_this_zone; 3212 3213 continue; 3214 } 3215 } 3216 3217 try_this_zone: 3218 page = rmqueue(ac->preferred_zoneref->zone, zone, order, 3219 gfp_mask, alloc_flags, ac->migratetype); 3220 if (page) { 3221 prep_new_page(page, order, gfp_mask, alloc_flags); 3222 3223 /* 3224 * If this is a high-order atomic allocation then check 3225 * if the pageblock should be reserved for the future 3226 */ 3227 if (unlikely(alloc_flags & ALLOC_HIGHATOMIC)) 3228 reserve_highatomic_pageblock(page, zone, order); 3229 3230 return page; 3231 } else { 3232 if (has_unaccepted_memory()) { 3233 if (try_to_accept_memory(zone, order)) 3234 goto try_this_zone; 3235 } 3236 3237 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 3238 /* Try again if zone has deferred pages */ 3239 if (deferred_pages_enabled()) { 3240 if (_deferred_grow_zone(zone, order)) 3241 goto try_this_zone; 3242 } 3243 #endif 3244 } 3245 } 3246 3247 /* 3248 * It's possible on a UMA machine to get through all zones that are 3249 * fragmented. If avoiding fragmentation, reset and try again. 3250 */ 3251 if (no_fallback) { 3252 alloc_flags &= ~ALLOC_NOFRAGMENT; 3253 goto retry; 3254 } 3255 3256 return NULL; 3257 } 3258 3259 static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) 3260 { 3261 unsigned int filter = SHOW_MEM_FILTER_NODES; 3262 3263 /* 3264 * This documents exceptions given to allocations in certain 3265 * contexts that are allowed to allocate outside current's set 3266 * of allowed nodes. 3267 */ 3268 if (!(gfp_mask & __GFP_NOMEMALLOC)) 3269 if (tsk_is_oom_victim(current) || 3270 (current->flags & (PF_MEMALLOC | PF_EXITING))) 3271 filter &= ~SHOW_MEM_FILTER_NODES; 3272 if (!in_task() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) 3273 filter &= ~SHOW_MEM_FILTER_NODES; 3274 3275 __show_mem(filter, nodemask, gfp_zone(gfp_mask)); 3276 } 3277 3278 void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) 
3279 { 3280 struct va_format vaf; 3281 va_list args; 3282 static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1); 3283 3284 if ((gfp_mask & __GFP_NOWARN) || 3285 !__ratelimit(&nopage_rs) || 3286 ((gfp_mask & __GFP_DMA) && !has_managed_dma())) 3287 return; 3288 3289 va_start(args, fmt); 3290 vaf.fmt = fmt; 3291 vaf.va = &args; 3292 pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl", 3293 current->comm, &vaf, gfp_mask, &gfp_mask, 3294 nodemask_pr_args(nodemask)); 3295 va_end(args); 3296 3297 cpuset_print_current_mems_allowed(); 3298 pr_cont("\n"); 3299 dump_stack(); 3300 warn_alloc_show_mem(gfp_mask, nodemask); 3301 } 3302 3303 static inline struct page * 3304 __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order, 3305 unsigned int alloc_flags, 3306 const struct alloc_context *ac) 3307 { 3308 struct page *page; 3309 3310 page = get_page_from_freelist(gfp_mask, order, 3311 alloc_flags|ALLOC_CPUSET, ac); 3312 /* 3313 * fallback to ignore cpuset restriction if our nodes 3314 * are depleted 3315 */ 3316 if (!page) 3317 page = get_page_from_freelist(gfp_mask, order, 3318 alloc_flags, ac); 3319 3320 return page; 3321 } 3322 3323 static inline struct page * 3324 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 3325 const struct alloc_context *ac, unsigned long *did_some_progress) 3326 { 3327 struct oom_control oc = { 3328 .zonelist = ac->zonelist, 3329 .nodemask = ac->nodemask, 3330 .memcg = NULL, 3331 .gfp_mask = gfp_mask, 3332 .order = order, 3333 }; 3334 struct page *page; 3335 3336 *did_some_progress = 0; 3337 3338 /* 3339 * Acquire the oom lock. If that fails, somebody else is 3340 * making progress for us. 3341 */ 3342 if (!mutex_trylock(&oom_lock)) { 3343 *did_some_progress = 1; 3344 schedule_timeout_uninterruptible(1); 3345 return NULL; 3346 } 3347 3348 /* 3349 * Go through the zonelist yet one more time, keep very high watermark 3350 * here, this is only to catch a parallel oom killing, we must fail if 3351 * we're still under heavy pressure. But make sure that this reclaim 3352 * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY 3353 * allocation which will never fail due to oom_lock already held. 3354 */ 3355 page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) & 3356 ~__GFP_DIRECT_RECLAIM, order, 3357 ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac); 3358 if (page) 3359 goto out; 3360 3361 /* Coredumps can quickly deplete all memory reserves */ 3362 if (current->flags & PF_DUMPCORE) 3363 goto out; 3364 /* The OOM killer will not help higher order allocs */ 3365 if (order > PAGE_ALLOC_COSTLY_ORDER) 3366 goto out; 3367 /* 3368 * We have already exhausted all our reclaim opportunities without any 3369 * success so it is time to admit defeat. We will skip the OOM killer 3370 * because it is very likely that the caller has a more reasonable 3371 * fallback than shooting a random task. 3372 * 3373 * The OOM killer may not free memory on a specific node. 3374 */ 3375 if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE)) 3376 goto out; 3377 /* The OOM killer does not needlessly kill tasks for lowmem */ 3378 if (ac->highest_zoneidx < ZONE_NORMAL) 3379 goto out; 3380 if (pm_suspended_storage()) 3381 goto out; 3382 /* 3383 * XXX: GFP_NOFS allocations should rather fail than rely on 3384 * other request to make a forward progress. 3385 * We are in an unfortunate situation where out_of_memory cannot 3386 * do much for this context but let's try it to at least get 3387 * access to memory reserved if the current task is killed (see 3388 * out_of_memory). 
Once filesystems are ready to handle allocation 3389 * failures more gracefully we should just bail out here. 3390 */ 3391 3392 /* Exhausted what can be done so it's blame time */ 3393 if (out_of_memory(&oc) || 3394 WARN_ON_ONCE_GFP(gfp_mask & __GFP_NOFAIL, gfp_mask)) { 3395 *did_some_progress = 1; 3396 3397 /* 3398 * Help non-failing allocations by giving them access to memory 3399 * reserves 3400 */ 3401 if (gfp_mask & __GFP_NOFAIL) 3402 page = __alloc_pages_cpuset_fallback(gfp_mask, order, 3403 ALLOC_NO_WATERMARKS, ac); 3404 } 3405 out: 3406 mutex_unlock(&oom_lock); 3407 return page; 3408 } 3409 3410 /* 3411 * Maximum number of compaction retries with a progress before OOM 3412 * killer is consider as the only way to move forward. 3413 */ 3414 #define MAX_COMPACT_RETRIES 16 3415 3416 #ifdef CONFIG_COMPACTION 3417 /* Try memory compaction for high-order allocations before reclaim */ 3418 static struct page * 3419 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 3420 unsigned int alloc_flags, const struct alloc_context *ac, 3421 enum compact_priority prio, enum compact_result *compact_result) 3422 { 3423 struct page *page = NULL; 3424 unsigned long pflags; 3425 unsigned int noreclaim_flag; 3426 3427 if (!order) 3428 return NULL; 3429 3430 psi_memstall_enter(&pflags); 3431 delayacct_compact_start(); 3432 noreclaim_flag = memalloc_noreclaim_save(); 3433 3434 *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, 3435 prio, &page); 3436 3437 memalloc_noreclaim_restore(noreclaim_flag); 3438 psi_memstall_leave(&pflags); 3439 delayacct_compact_end(); 3440 3441 if (*compact_result == COMPACT_SKIPPED) 3442 return NULL; 3443 /* 3444 * At least in one zone compaction wasn't deferred or skipped, so let's 3445 * count a compaction stall 3446 */ 3447 count_vm_event(COMPACTSTALL); 3448 3449 /* Prep a captured page if available */ 3450 if (page) 3451 prep_new_page(page, order, gfp_mask, alloc_flags); 3452 3453 /* Try get a page from the freelist if available */ 3454 if (!page) 3455 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 3456 3457 if (page) { 3458 struct zone *zone = page_zone(page); 3459 3460 zone->compact_blockskip_flush = false; 3461 compaction_defer_reset(zone, order, true); 3462 count_vm_event(COMPACTSUCCESS); 3463 return page; 3464 } 3465 3466 /* 3467 * It's bad if compaction run occurs and fails. The most likely reason 3468 * is that pages exist, but not enough to satisfy watermarks. 3469 */ 3470 count_vm_event(COMPACTFAIL); 3471 3472 cond_resched(); 3473 3474 return NULL; 3475 } 3476 3477 static inline bool 3478 should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, 3479 enum compact_result compact_result, 3480 enum compact_priority *compact_priority, 3481 int *compaction_retries) 3482 { 3483 int max_retries = MAX_COMPACT_RETRIES; 3484 int min_priority; 3485 bool ret = false; 3486 int retries = *compaction_retries; 3487 enum compact_priority priority = *compact_priority; 3488 3489 if (!order) 3490 return false; 3491 3492 if (fatal_signal_pending(current)) 3493 return false; 3494 3495 /* 3496 * Compaction was skipped due to a lack of free order-0 3497 * migration targets. Continue if reclaim can help. 3498 */ 3499 if (compact_result == COMPACT_SKIPPED) { 3500 ret = compaction_zonelist_suitable(ac, order, alloc_flags); 3501 goto out; 3502 } 3503 3504 /* 3505 * Compaction managed to coalesce some page blocks, but the 3506 * allocation failed presumably due to a race. Retry some. 
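 *
 * The retry budget used just below works out to MAX_COMPACT_RETRIES for
 * !costly orders and a quarter of that for costly ones. As a helper, for
 * illustration only (compact_retry_budget() is a hypothetical name):
 *
 *	static int compact_retry_budget(unsigned int order)
 *	{
 *		int budget = MAX_COMPACT_RETRIES;	// 16 as defined above
 *
 *		if (order > PAGE_ALLOC_COSTLY_ORDER)
 *			budget /= 4;			// costly: 4 attempts
 *		return budget;
 *	}
 *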
3507 */ 3508 if (compact_result == COMPACT_SUCCESS) { 3509 /* 3510 * !costly requests are much more important than 3511 * __GFP_RETRY_MAYFAIL costly ones because they are de 3512 * facto nofail and invoke OOM killer to move on while 3513 * costly can fail and users are ready to cope with 3514 * that. 1/4 retries is rather arbitrary but we would 3515 * need much more detailed feedback from compaction to 3516 * make a better decision. 3517 */ 3518 if (order > PAGE_ALLOC_COSTLY_ORDER) 3519 max_retries /= 4; 3520 3521 if (++(*compaction_retries) <= max_retries) { 3522 ret = true; 3523 goto out; 3524 } 3525 } 3526 3527 /* 3528 * Compaction failed. Retry with increasing priority. 3529 */ 3530 min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ? 3531 MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY; 3532 3533 if (*compact_priority > min_priority) { 3534 (*compact_priority)--; 3535 *compaction_retries = 0; 3536 ret = true; 3537 } 3538 out: 3539 trace_compact_retry(order, priority, compact_result, retries, max_retries, ret); 3540 return ret; 3541 } 3542 #else 3543 static inline struct page * 3544 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 3545 unsigned int alloc_flags, const struct alloc_context *ac, 3546 enum compact_priority prio, enum compact_result *compact_result) 3547 { 3548 *compact_result = COMPACT_SKIPPED; 3549 return NULL; 3550 } 3551 3552 static inline bool 3553 should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, 3554 enum compact_result compact_result, 3555 enum compact_priority *compact_priority, 3556 int *compaction_retries) 3557 { 3558 struct zone *zone; 3559 struct zoneref *z; 3560 3561 if (!order || order > PAGE_ALLOC_COSTLY_ORDER) 3562 return false; 3563 3564 /* 3565 * There are setups with compaction disabled which would prefer to loop 3566 * inside the allocator rather than hit the oom killer prematurely. 3567 * Let's give them a good hope and keep retrying while the order-0 3568 * watermarks are OK. 
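 *
 * For the CONFIG_COMPACTION case above, once the retry budget is spent the
 * caller escalates to a more aggressive compaction priority and starts
 * counting again. In outline, for illustration only:
 *
 *	min_priority = costly ? MIN_COMPACT_COSTLY_PRIORITY
 *			      : MIN_COMPACT_PRIORITY;
 *	if (*compact_priority > min_priority) {
 *		(*compact_priority)--;	// numerically lower, more aggressive
 *		*compaction_retries = 0;
 *		retry = true;
 *	}
 *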
3569 */ 3570 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, 3571 ac->highest_zoneidx, ac->nodemask) { 3572 if (zone_watermark_ok(zone, 0, min_wmark_pages(zone), 3573 ac->highest_zoneidx, alloc_flags)) 3574 return true; 3575 } 3576 return false; 3577 } 3578 #endif /* CONFIG_COMPACTION */ 3579 3580 #ifdef CONFIG_LOCKDEP 3581 static struct lockdep_map __fs_reclaim_map = 3582 STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map); 3583 3584 static bool __need_reclaim(gfp_t gfp_mask) 3585 { 3586 /* no reclaim without waiting on it */ 3587 if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) 3588 return false; 3589 3590 /* this guy won't enter reclaim */ 3591 if (current->flags & PF_MEMALLOC) 3592 return false; 3593 3594 if (gfp_mask & __GFP_NOLOCKDEP) 3595 return false; 3596 3597 return true; 3598 } 3599 3600 void __fs_reclaim_acquire(unsigned long ip) 3601 { 3602 lock_acquire_exclusive(&__fs_reclaim_map, 0, 0, NULL, ip); 3603 } 3604 3605 void __fs_reclaim_release(unsigned long ip) 3606 { 3607 lock_release(&__fs_reclaim_map, ip); 3608 } 3609 3610 void fs_reclaim_acquire(gfp_t gfp_mask) 3611 { 3612 gfp_mask = current_gfp_context(gfp_mask); 3613 3614 if (__need_reclaim(gfp_mask)) { 3615 if (gfp_mask & __GFP_FS) 3616 __fs_reclaim_acquire(_RET_IP_); 3617 3618 #ifdef CONFIG_MMU_NOTIFIER 3619 lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); 3620 lock_map_release(&__mmu_notifier_invalidate_range_start_map); 3621 #endif 3622 3623 } 3624 } 3625 EXPORT_SYMBOL_GPL(fs_reclaim_acquire); 3626 3627 void fs_reclaim_release(gfp_t gfp_mask) 3628 { 3629 gfp_mask = current_gfp_context(gfp_mask); 3630 3631 if (__need_reclaim(gfp_mask)) { 3632 if (gfp_mask & __GFP_FS) 3633 __fs_reclaim_release(_RET_IP_); 3634 } 3635 } 3636 EXPORT_SYMBOL_GPL(fs_reclaim_release); 3637 #endif 3638 3639 /* 3640 * Zonelists may change due to hotplug during allocation. Detect when zonelists 3641 * have been rebuilt so allocation retries. Reader side does not lock and 3642 * retries the allocation if zonelist changes. Writer side is protected by the 3643 * embedded spin_lock. 
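 *
 * The read side follows the usual seqlock retry pattern. A rough sketch
 * of how the slow path is expected to use the two helpers defined just
 * below, for illustration only:
 *
 *	struct page *page = NULL;
 *	unsigned int cookie;
 *
 *	do {
 *		cookie = zonelist_iter_begin();
 *		// ... walk the zonelist and try to allocate page ...
 *	} while (!page && check_retry_zonelist(cookie));
 *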
3644 */ 3645 static DEFINE_SEQLOCK(zonelist_update_seq); 3646 3647 static unsigned int zonelist_iter_begin(void) 3648 { 3649 if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) 3650 return read_seqbegin(&zonelist_update_seq); 3651 3652 return 0; 3653 } 3654 3655 static unsigned int check_retry_zonelist(unsigned int seq) 3656 { 3657 if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) 3658 return read_seqretry(&zonelist_update_seq, seq); 3659 3660 return seq; 3661 } 3662 3663 /* Perform direct synchronous page reclaim */ 3664 static unsigned long 3665 __perform_reclaim(gfp_t gfp_mask, unsigned int order, 3666 const struct alloc_context *ac) 3667 { 3668 unsigned int noreclaim_flag; 3669 unsigned long progress; 3670 3671 cond_resched(); 3672 3673 /* We now go into synchronous reclaim */ 3674 cpuset_memory_pressure_bump(); 3675 fs_reclaim_acquire(gfp_mask); 3676 noreclaim_flag = memalloc_noreclaim_save(); 3677 3678 progress = try_to_free_pages(ac->zonelist, order, gfp_mask, 3679 ac->nodemask); 3680 3681 memalloc_noreclaim_restore(noreclaim_flag); 3682 fs_reclaim_release(gfp_mask); 3683 3684 cond_resched(); 3685 3686 return progress; 3687 } 3688 3689 /* The really slow allocator path where we enter direct reclaim */ 3690 static inline struct page * 3691 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 3692 unsigned int alloc_flags, const struct alloc_context *ac, 3693 unsigned long *did_some_progress) 3694 { 3695 struct page *page = NULL; 3696 unsigned long pflags; 3697 bool drained = false; 3698 3699 psi_memstall_enter(&pflags); 3700 *did_some_progress = __perform_reclaim(gfp_mask, order, ac); 3701 if (unlikely(!(*did_some_progress))) 3702 goto out; 3703 3704 retry: 3705 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 3706 3707 /* 3708 * If an allocation failed after direct reclaim, it could be because 3709 * pages are pinned on the per-cpu lists or in high alloc reserves. 3710 * Shrink them and try again 3711 */ 3712 if (!page && !drained) { 3713 unreserve_highatomic_pageblock(ac, false); 3714 drain_all_pages(NULL); 3715 drained = true; 3716 goto retry; 3717 } 3718 out: 3719 psi_memstall_leave(&pflags); 3720 3721 return page; 3722 } 3723 3724 static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask, 3725 const struct alloc_context *ac) 3726 { 3727 struct zoneref *z; 3728 struct zone *zone; 3729 pg_data_t *last_pgdat = NULL; 3730 enum zone_type highest_zoneidx = ac->highest_zoneidx; 3731 3732 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx, 3733 ac->nodemask) { 3734 if (!managed_zone(zone)) 3735 continue; 3736 if (last_pgdat != zone->zone_pgdat) { 3737 wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx); 3738 last_pgdat = zone->zone_pgdat; 3739 } 3740 } 3741 } 3742 3743 static inline unsigned int 3744 gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order) 3745 { 3746 unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 3747 3748 /* 3749 * __GFP_HIGH is assumed to be the same as ALLOC_MIN_RESERVE 3750 * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD 3751 * to save two branches. 3752 */ 3753 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_MIN_RESERVE); 3754 BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD); 3755 3756 /* 3757 * The caller may dip into page reserves a bit more if the caller 3758 * cannot run direct reclaim, or if the caller has realtime scheduling 3759 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 3760 * set both ALLOC_NON_BLOCK and ALLOC_MIN_RESERVE(__GFP_HIGH). 
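 *
 * As a worked example of the mapping described here (illustration only):
 * GFP_ATOMIC is __GFP_HIGH | __GFP_KSWAPD_RECLAIM and cannot enter direct
 * reclaim, so an order-0 GFP_ATOMIC request ends up with roughly
 *
 *	alloc_flags = ALLOC_WMARK_MIN | ALLOC_MIN_RESERVE |
 *		      ALLOC_KSWAPD | ALLOC_NON_BLOCK;
 *		      // ALLOC_CPUSET is dropped again below
 *
 * while an order > 0 request additionally gets ALLOC_HIGHATOMIC.
 *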
3761 */ 3762 alloc_flags |= (__force int) 3763 (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM)); 3764 3765 if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { 3766 /* 3767 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even 3768 * if it can't schedule. 3769 */ 3770 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 3771 alloc_flags |= ALLOC_NON_BLOCK; 3772 3773 if (order > 0) 3774 alloc_flags |= ALLOC_HIGHATOMIC; 3775 } 3776 3777 /* 3778 * Ignore cpuset mems for non-blocking __GFP_HIGH (probably 3779 * GFP_ATOMIC) rather than fail, see the comment for 3780 * cpuset_node_allowed(). 3781 */ 3782 if (alloc_flags & ALLOC_MIN_RESERVE) 3783 alloc_flags &= ~ALLOC_CPUSET; 3784 } else if (unlikely(rt_task(current)) && in_task()) 3785 alloc_flags |= ALLOC_MIN_RESERVE; 3786 3787 alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags); 3788 3789 return alloc_flags; 3790 } 3791 3792 static bool oom_reserves_allowed(struct task_struct *tsk) 3793 { 3794 if (!tsk_is_oom_victim(tsk)) 3795 return false; 3796 3797 /* 3798 * !MMU doesn't have oom reaper so give access to memory reserves 3799 * only to the thread with TIF_MEMDIE set 3800 */ 3801 if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE)) 3802 return false; 3803 3804 return true; 3805 } 3806 3807 /* 3808 * Distinguish requests which really need access to full memory 3809 * reserves from oom victims which can live with a portion of it 3810 */ 3811 static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask) 3812 { 3813 if (unlikely(gfp_mask & __GFP_NOMEMALLOC)) 3814 return 0; 3815 if (gfp_mask & __GFP_MEMALLOC) 3816 return ALLOC_NO_WATERMARKS; 3817 if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) 3818 return ALLOC_NO_WATERMARKS; 3819 if (!in_interrupt()) { 3820 if (current->flags & PF_MEMALLOC) 3821 return ALLOC_NO_WATERMARKS; 3822 else if (oom_reserves_allowed(current)) 3823 return ALLOC_OOM; 3824 } 3825 3826 return 0; 3827 } 3828 3829 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) 3830 { 3831 return !!__gfp_pfmemalloc_flags(gfp_mask); 3832 } 3833 3834 /* 3835 * Checks whether it makes sense to retry the reclaim to make a forward progress 3836 * for the given allocation request. 3837 * 3838 * We give up when we either have tried MAX_RECLAIM_RETRIES in a row 3839 * without success, or when we couldn't even meet the watermark if we 3840 * reclaimed all remaining pages on the LRU lists. 3841 * 3842 * Returns true if a retry is viable or false to enter the oom path. 3843 */ 3844 static inline bool 3845 should_reclaim_retry(gfp_t gfp_mask, unsigned order, 3846 struct alloc_context *ac, int alloc_flags, 3847 bool did_some_progress, int *no_progress_loops) 3848 { 3849 struct zone *zone; 3850 struct zoneref *z; 3851 bool ret = false; 3852 3853 /* 3854 * Costly allocations might have made a progress but this doesn't mean 3855 * their order will become available due to high fragmentation so 3856 * always increment the no progress counter for them 3857 */ 3858 if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) 3859 *no_progress_loops = 0; 3860 else 3861 (*no_progress_loops)++; 3862 3863 /* 3864 * Make sure we converge to OOM if we cannot make any progress 3865 * several times in the row. 3866 */ 3867 if (*no_progress_loops > MAX_RECLAIM_RETRIES) { 3868 /* Before OOM, exhaust highatomic_reserve */ 3869 return unreserve_highatomic_pageblock(ac, true); 3870 } 3871 3872 /* 3873 * Keep reclaiming pages while there is a chance this will lead 3874 * somewhere. 
If none of the target zones can satisfy our allocation 3875 * request even if all reclaimable pages are considered then we are 3876 * screwed and have to go OOM. 3877 */ 3878 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, 3879 ac->highest_zoneidx, ac->nodemask) { 3880 unsigned long available; 3881 unsigned long reclaimable; 3882 unsigned long min_wmark = min_wmark_pages(zone); 3883 bool wmark; 3884 3885 available = reclaimable = zone_reclaimable_pages(zone); 3886 available += zone_page_state_snapshot(zone, NR_FREE_PAGES); 3887 3888 /* 3889 * Would the allocation succeed if we reclaimed all 3890 * reclaimable pages? 3891 */ 3892 wmark = __zone_watermark_ok(zone, order, min_wmark, 3893 ac->highest_zoneidx, alloc_flags, available); 3894 trace_reclaim_retry_zone(z, order, reclaimable, 3895 available, min_wmark, *no_progress_loops, wmark); 3896 if (wmark) { 3897 ret = true; 3898 break; 3899 } 3900 } 3901 3902 /* 3903 * Memory allocation/reclaim might be called from a WQ context and the 3904 * current implementation of the WQ concurrency control doesn't 3905 * recognize that a particular WQ is congested if the worker thread is 3906 * looping without ever sleeping. Therefore we have to do a short sleep 3907 * here rather than calling cond_resched(). 3908 */ 3909 if (current->flags & PF_WQ_WORKER) 3910 schedule_timeout_uninterruptible(1); 3911 else 3912 cond_resched(); 3913 return ret; 3914 } 3915 3916 static inline bool 3917 check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac) 3918 { 3919 /* 3920 * It's possible that cpuset's mems_allowed and the nodemask from 3921 * mempolicy don't intersect. This should be normally dealt with by 3922 * policy_nodemask(), but it's possible to race with cpuset update in 3923 * such a way the check therein was true, and then it became false 3924 * before we got our cpuset_mems_cookie here. 3925 * This assumes that for all allocations, ac->nodemask can come only 3926 * from MPOL_BIND mempolicy (whose documented semantics is to be ignored 3927 * when it does not intersect with the cpuset restrictions) or the 3928 * caller can deal with a violated nodemask. 3929 */ 3930 if (cpusets_enabled() && ac->nodemask && 3931 !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) { 3932 ac->nodemask = NULL; 3933 return true; 3934 } 3935 3936 /* 3937 * When updating a task's mems_allowed or mempolicy nodemask, it is 3938 * possible to race with parallel threads in such a way that our 3939 * allocation can fail while the mask is being updated. If we are about 3940 * to fail, check if the cpuset changed during allocation and if so, 3941 * retry. 
3942 */ 3943 if (read_mems_allowed_retry(cpuset_mems_cookie)) 3944 return true; 3945 3946 return false; 3947 } 3948 3949 static inline struct page * 3950 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 3951 struct alloc_context *ac) 3952 { 3953 bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; 3954 const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER; 3955 struct page *page = NULL; 3956 unsigned int alloc_flags; 3957 unsigned long did_some_progress; 3958 enum compact_priority compact_priority; 3959 enum compact_result compact_result; 3960 int compaction_retries; 3961 int no_progress_loops; 3962 unsigned int cpuset_mems_cookie; 3963 unsigned int zonelist_iter_cookie; 3964 int reserve_flags; 3965 3966 restart: 3967 compaction_retries = 0; 3968 no_progress_loops = 0; 3969 compact_priority = DEF_COMPACT_PRIORITY; 3970 cpuset_mems_cookie = read_mems_allowed_begin(); 3971 zonelist_iter_cookie = zonelist_iter_begin(); 3972 3973 /* 3974 * The fast path uses conservative alloc_flags to succeed only until 3975 * kswapd needs to be woken up, and to avoid the cost of setting up 3976 * alloc_flags precisely. So we do that now. 3977 */ 3978 alloc_flags = gfp_to_alloc_flags(gfp_mask, order); 3979 3980 /* 3981 * We need to recalculate the starting point for the zonelist iterator 3982 * because we might have used different nodemask in the fast path, or 3983 * there was a cpuset modification and we are retrying - otherwise we 3984 * could end up iterating over non-eligible zones endlessly. 3985 */ 3986 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, 3987 ac->highest_zoneidx, ac->nodemask); 3988 if (!ac->preferred_zoneref->zone) 3989 goto nopage; 3990 3991 /* 3992 * Check for insane configurations where the cpuset doesn't contain 3993 * any suitable zone to satisfy the request - e.g. non-movable 3994 * GFP_HIGHUSER allocations from MOVABLE nodes only. 3995 */ 3996 if (cpusets_insane_config() && (gfp_mask & __GFP_HARDWALL)) { 3997 struct zoneref *z = first_zones_zonelist(ac->zonelist, 3998 ac->highest_zoneidx, 3999 &cpuset_current_mems_allowed); 4000 if (!z->zone) 4001 goto nopage; 4002 } 4003 4004 if (alloc_flags & ALLOC_KSWAPD) 4005 wake_all_kswapds(order, gfp_mask, ac); 4006 4007 /* 4008 * The adjusted alloc_flags might result in immediate success, so try 4009 * that first 4010 */ 4011 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 4012 if (page) 4013 goto got_pg; 4014 4015 /* 4016 * For costly allocations, try direct compaction first, as it's likely 4017 * that we have enough base pages and don't need to reclaim. For non- 4018 * movable high-order allocations, do that as well, as compaction will 4019 * try prevent permanent fragmentation by migrating from blocks of the 4020 * same migratetype. 4021 * Don't try this for allocations that are allowed to ignore 4022 * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen. 
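 *
 * ("Costly" here means order > PAGE_ALLOC_COSTLY_ORDER; a typical example
 * is an order-9 THP fault on x86-64 with 4K base pages.)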
4023 */ 4024 if (can_direct_reclaim && 4025 (costly_order || 4026 (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) 4027 && !gfp_pfmemalloc_allowed(gfp_mask)) { 4028 page = __alloc_pages_direct_compact(gfp_mask, order, 4029 alloc_flags, ac, 4030 INIT_COMPACT_PRIORITY, 4031 &compact_result); 4032 if (page) 4033 goto got_pg; 4034 4035 /* 4036 * Checks for costly allocations with __GFP_NORETRY, which 4037 * includes some THP page fault allocations 4038 */ 4039 if (costly_order && (gfp_mask & __GFP_NORETRY)) { 4040 /* 4041 * If allocating entire pageblock(s) and compaction 4042 * failed because all zones are below low watermarks 4043 * or is prohibited because it recently failed at this 4044 * order, fail immediately unless the allocator has 4045 * requested compaction and reclaim retry. 4046 * 4047 * Reclaim is 4048 * - potentially very expensive because zones are far 4049 * below their low watermarks or this is part of very 4050 * bursty high order allocations, 4051 * - not guaranteed to help because isolate_freepages() 4052 * may not iterate over freed pages as part of its 4053 * linear scan, and 4054 * - unlikely to make entire pageblocks free on its 4055 * own. 4056 */ 4057 if (compact_result == COMPACT_SKIPPED || 4058 compact_result == COMPACT_DEFERRED) 4059 goto nopage; 4060 4061 /* 4062 * Looks like reclaim/compaction is worth trying, but 4063 * sync compaction could be very expensive, so keep 4064 * using async compaction. 4065 */ 4066 compact_priority = INIT_COMPACT_PRIORITY; 4067 } 4068 } 4069 4070 retry: 4071 /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ 4072 if (alloc_flags & ALLOC_KSWAPD) 4073 wake_all_kswapds(order, gfp_mask, ac); 4074 4075 reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); 4076 if (reserve_flags) 4077 alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags) | 4078 (alloc_flags & ALLOC_KSWAPD); 4079 4080 /* 4081 * Reset the nodemask and zonelist iterators if memory policies can be 4082 * ignored. These allocations are high priority and system rather than 4083 * user oriented. 
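 *
 * (This is the case when ALLOC_CPUSET was dropped for a non-blocking
 * __GFP_HIGH request in gfp_to_alloc_flags(), or when
 * __gfp_pfmemalloc_flags() granted access to memory reserves above.)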
4084 */ 4085 if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { 4086 ac->nodemask = NULL; 4087 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, 4088 ac->highest_zoneidx, ac->nodemask); 4089 } 4090 4091 /* Attempt with potentially adjusted zonelist and alloc_flags */ 4092 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 4093 if (page) 4094 goto got_pg; 4095 4096 /* Caller is not willing to reclaim, we can't balance anything */ 4097 if (!can_direct_reclaim) 4098 goto nopage; 4099 4100 /* Avoid recursion of direct reclaim */ 4101 if (current->flags & PF_MEMALLOC) 4102 goto nopage; 4103 4104 /* Try direct reclaim and then allocating */ 4105 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, 4106 &did_some_progress); 4107 if (page) 4108 goto got_pg; 4109 4110 /* Try direct compaction and then allocating */ 4111 page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, 4112 compact_priority, &compact_result); 4113 if (page) 4114 goto got_pg; 4115 4116 /* Do not loop if specifically requested */ 4117 if (gfp_mask & __GFP_NORETRY) 4118 goto nopage; 4119 4120 /* 4121 * Do not retry costly high order allocations unless they are 4122 * __GFP_RETRY_MAYFAIL 4123 */ 4124 if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL)) 4125 goto nopage; 4126 4127 if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, 4128 did_some_progress > 0, &no_progress_loops)) 4129 goto retry; 4130 4131 /* 4132 * It doesn't make any sense to retry for the compaction if the order-0 4133 * reclaim is not able to make any progress because the current 4134 * implementation of the compaction depends on the sufficient amount 4135 * of free memory (see __compaction_suitable) 4136 */ 4137 if (did_some_progress > 0 && 4138 should_compact_retry(ac, order, alloc_flags, 4139 compact_result, &compact_priority, 4140 &compaction_retries)) 4141 goto retry; 4142 4143 4144 /* 4145 * Deal with possible cpuset update races or zonelist updates to avoid 4146 * a unnecessary OOM kill. 4147 */ 4148 if (check_retry_cpuset(cpuset_mems_cookie, ac) || 4149 check_retry_zonelist(zonelist_iter_cookie)) 4150 goto restart; 4151 4152 /* Reclaim has failed us, start killing things */ 4153 page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); 4154 if (page) 4155 goto got_pg; 4156 4157 /* Avoid allocations with no watermarks from looping endlessly */ 4158 if (tsk_is_oom_victim(current) && 4159 (alloc_flags & ALLOC_OOM || 4160 (gfp_mask & __GFP_NOMEMALLOC))) 4161 goto nopage; 4162 4163 /* Retry as long as the OOM killer is making progress */ 4164 if (did_some_progress) { 4165 no_progress_loops = 0; 4166 goto retry; 4167 } 4168 4169 nopage: 4170 /* 4171 * Deal with possible cpuset update races or zonelist updates to avoid 4172 * a unnecessary OOM kill. 
4173 */ 4174 if (check_retry_cpuset(cpuset_mems_cookie, ac) || 4175 check_retry_zonelist(zonelist_iter_cookie)) 4176 goto restart; 4177 4178 /* 4179 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure 4180 * we always retry 4181 */ 4182 if (gfp_mask & __GFP_NOFAIL) { 4183 /* 4184 * All existing users of the __GFP_NOFAIL are blockable, so warn 4185 * of any new users that actually require GFP_NOWAIT 4186 */ 4187 if (WARN_ON_ONCE_GFP(!can_direct_reclaim, gfp_mask)) 4188 goto fail; 4189 4190 /* 4191 * PF_MEMALLOC request from this context is rather bizarre 4192 * because we cannot reclaim anything and only can loop waiting 4193 * for somebody to do a work for us 4194 */ 4195 WARN_ON_ONCE_GFP(current->flags & PF_MEMALLOC, gfp_mask); 4196 4197 /* 4198 * non failing costly orders are a hard requirement which we 4199 * are not prepared for much so let's warn about these users 4200 * so that we can identify them and convert them to something 4201 * else. 4202 */ 4203 WARN_ON_ONCE_GFP(costly_order, gfp_mask); 4204 4205 /* 4206 * Help non-failing allocations by giving some access to memory 4207 * reserves normally used for high priority non-blocking 4208 * allocations but do not use ALLOC_NO_WATERMARKS because this 4209 * could deplete whole memory reserves which would just make 4210 * the situation worse. 4211 */ 4212 page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_MIN_RESERVE, ac); 4213 if (page) 4214 goto got_pg; 4215 4216 cond_resched(); 4217 goto retry; 4218 } 4219 fail: 4220 warn_alloc(gfp_mask, ac->nodemask, 4221 "page allocation failure: order:%u", order); 4222 got_pg: 4223 return page; 4224 } 4225 4226 static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, 4227 int preferred_nid, nodemask_t *nodemask, 4228 struct alloc_context *ac, gfp_t *alloc_gfp, 4229 unsigned int *alloc_flags) 4230 { 4231 ac->highest_zoneidx = gfp_zone(gfp_mask); 4232 ac->zonelist = node_zonelist(preferred_nid, gfp_mask); 4233 ac->nodemask = nodemask; 4234 ac->migratetype = gfp_migratetype(gfp_mask); 4235 4236 if (cpusets_enabled()) { 4237 *alloc_gfp |= __GFP_HARDWALL; 4238 /* 4239 * When we are in the interrupt context, it is irrelevant 4240 * to the current task context. It means that any node ok. 4241 */ 4242 if (in_task() && !ac->nodemask) 4243 ac->nodemask = &cpuset_current_mems_allowed; 4244 else 4245 *alloc_flags |= ALLOC_CPUSET; 4246 } 4247 4248 might_alloc(gfp_mask); 4249 4250 if (should_fail_alloc_page(gfp_mask, order)) 4251 return false; 4252 4253 *alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags); 4254 4255 /* Dirty zone balancing only done in the fast path */ 4256 ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); 4257 4258 /* 4259 * The preferred zone is used for statistics but crucially it is 4260 * also used as the starting point for the zonelist iterator. It 4261 * may get reset for allocations that ignore memory policies. 
4262 */ 4263 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, 4264 ac->highest_zoneidx, ac->nodemask); 4265 4266 return true; 4267 } 4268 4269 /* 4270 * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array 4271 * @gfp: GFP flags for the allocation 4272 * @preferred_nid: The preferred NUMA node ID to allocate from 4273 * @nodemask: Set of nodes to allocate from, may be NULL 4274 * @nr_pages: The number of pages desired on the list or array 4275 * @page_list: Optional list to store the allocated pages 4276 * @page_array: Optional array to store the pages 4277 * 4278 * This is a batched version of the page allocator that attempts to 4279 * allocate nr_pages quickly. Pages are added to page_list if page_list 4280 * is not NULL, otherwise it is assumed that the page_array is valid. 4281 * 4282 * For lists, nr_pages is the number of pages that should be allocated. 4283 * 4284 * For arrays, only NULL elements are populated with pages and nr_pages 4285 * is the maximum number of pages that will be stored in the array. 4286 * 4287 * Returns the number of pages on the list or array. 4288 */ 4289 unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, 4290 nodemask_t *nodemask, int nr_pages, 4291 struct list_head *page_list, 4292 struct page **page_array) 4293 { 4294 struct page *page; 4295 unsigned long __maybe_unused UP_flags; 4296 struct zone *zone; 4297 struct zoneref *z; 4298 struct per_cpu_pages *pcp; 4299 struct list_head *pcp_list; 4300 struct alloc_context ac; 4301 gfp_t alloc_gfp; 4302 unsigned int alloc_flags = ALLOC_WMARK_LOW; 4303 int nr_populated = 0, nr_account = 0; 4304 4305 /* 4306 * Skip populated array elements to determine if any pages need 4307 * to be allocated before disabling IRQs. 4308 */ 4309 while (page_array && nr_populated < nr_pages && page_array[nr_populated]) 4310 nr_populated++; 4311 4312 /* No pages requested? */ 4313 if (unlikely(nr_pages <= 0)) 4314 goto out; 4315 4316 /* Already populated array? */ 4317 if (unlikely(page_array && nr_pages - nr_populated == 0)) 4318 goto out; 4319 4320 /* Bulk allocator does not support memcg accounting. */ 4321 if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT)) 4322 goto failed; 4323 4324 /* Use the single page allocator for one page. */ 4325 if (nr_pages - nr_populated == 1) 4326 goto failed; 4327 4328 #ifdef CONFIG_PAGE_OWNER 4329 /* 4330 * PAGE_OWNER may recurse into the allocator to allocate space to 4331 * save the stack with pagesets.lock held. Releasing/reacquiring 4332 * removes much of the performance benefit of bulk allocation so 4333 * force the caller to allocate one page at a time as it'll have 4334 * similar performance to added complexity to the bulk allocator. 4335 */ 4336 if (static_branch_unlikely(&page_owner_inited)) 4337 goto failed; 4338 #endif 4339 4340 /* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */ 4341 gfp &= gfp_allowed_mask; 4342 alloc_gfp = gfp; 4343 if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags)) 4344 goto out; 4345 gfp = alloc_gfp; 4346 4347 /* Find an allowed local zone that meets the low watermark. 
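 * A zone qualifies if it is permitted by the cpuset, belongs to the
 * preferred node and has enough free pages above the low watermark to
 * cover the whole requested batch.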
*/ 4348 for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.highest_zoneidx, ac.nodemask) { 4349 unsigned long mark; 4350 4351 if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && 4352 !__cpuset_zone_allowed(zone, gfp)) { 4353 continue; 4354 } 4355 4356 if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone && 4357 zone_to_nid(zone) != zone_to_nid(ac.preferred_zoneref->zone)) { 4358 goto failed; 4359 } 4360 4361 mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages; 4362 if (zone_watermark_fast(zone, 0, mark, 4363 zonelist_zone_idx(ac.preferred_zoneref), 4364 alloc_flags, gfp)) { 4365 break; 4366 } 4367 } 4368 4369 /* 4370 * If there are no allowed local zones that meets the watermarks then 4371 * try to allocate a single page and reclaim if necessary. 4372 */ 4373 if (unlikely(!zone)) 4374 goto failed; 4375 4376 /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */ 4377 pcp_trylock_prepare(UP_flags); 4378 pcp = pcp_spin_trylock(zone->per_cpu_pageset); 4379 if (!pcp) 4380 goto failed_irq; 4381 4382 /* Attempt the batch allocation */ 4383 pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)]; 4384 while (nr_populated < nr_pages) { 4385 4386 /* Skip existing pages */ 4387 if (page_array && page_array[nr_populated]) { 4388 nr_populated++; 4389 continue; 4390 } 4391 4392 page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags, 4393 pcp, pcp_list); 4394 if (unlikely(!page)) { 4395 /* Try and allocate at least one page */ 4396 if (!nr_account) { 4397 pcp_spin_unlock(pcp); 4398 goto failed_irq; 4399 } 4400 break; 4401 } 4402 nr_account++; 4403 4404 prep_new_page(page, 0, gfp, 0); 4405 if (page_list) 4406 list_add(&page->lru, page_list); 4407 else 4408 page_array[nr_populated] = page; 4409 nr_populated++; 4410 } 4411 4412 pcp_spin_unlock(pcp); 4413 pcp_trylock_finish(UP_flags); 4414 4415 __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); 4416 zone_statistics(ac.preferred_zoneref->zone, zone, nr_account); 4417 4418 out: 4419 return nr_populated; 4420 4421 failed_irq: 4422 pcp_trylock_finish(UP_flags); 4423 4424 failed: 4425 page = __alloc_pages(gfp, 0, preferred_nid, nodemask); 4426 if (page) { 4427 if (page_list) 4428 list_add(&page->lru, page_list); 4429 else 4430 page_array[nr_populated] = page; 4431 nr_populated++; 4432 } 4433 4434 goto out; 4435 } 4436 EXPORT_SYMBOL_GPL(__alloc_pages_bulk); 4437 4438 /* 4439 * This is the 'heart' of the zoned buddy allocator. 4440 */ 4441 struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, 4442 nodemask_t *nodemask) 4443 { 4444 struct page *page; 4445 unsigned int alloc_flags = ALLOC_WMARK_LOW; 4446 gfp_t alloc_gfp; /* The gfp_t that was actually used for allocation */ 4447 struct alloc_context ac = { }; 4448 4449 /* 4450 * There are several places where we assume that the order value is sane 4451 * so bail out early if the request is out of bound. 4452 */ 4453 if (WARN_ON_ONCE_GFP(order > MAX_ORDER, gfp)) 4454 return NULL; 4455 4456 gfp &= gfp_allowed_mask; 4457 /* 4458 * Apply scoped allocation constraints. This is mainly about GFP_NOFS 4459 * resp. GFP_NOIO which has to be inherited for all allocation requests 4460 * from a particular context which has been marked by 4461 * memalloc_no{fs,io}_{save,restore}. And PF_MEMALLOC_PIN which ensures 4462 * movable zones are not used during allocation. 
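 *
 * For example (illustrative), a section bracketed by memalloc_nofs_save()
 * has __GFP_FS masked off here, so a GFP_KERNEL request is effectively
 * treated as GFP_NOFS:
 *
 *	flags = memalloc_nofs_save();
 *	page = alloc_pages(GFP_KERNEL, 0);
 *	memalloc_nofs_restore(flags);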
4463 */ 4464 gfp = current_gfp_context(gfp); 4465 alloc_gfp = gfp; 4466 if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac, 4467 &alloc_gfp, &alloc_flags)) 4468 return NULL; 4469 4470 /* 4471 * Forbid the first pass from falling back to types that fragment 4472 * memory until all local zones are considered. 4473 */ 4474 alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp); 4475 4476 /* First allocation attempt */ 4477 page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac); 4478 if (likely(page)) 4479 goto out; 4480 4481 alloc_gfp = gfp; 4482 ac.spread_dirty_pages = false; 4483 4484 /* 4485 * Restore the original nodemask if it was potentially replaced with 4486 * &cpuset_current_mems_allowed to optimize the fast-path attempt. 4487 */ 4488 ac.nodemask = nodemask; 4489 4490 page = __alloc_pages_slowpath(alloc_gfp, order, &ac); 4491 4492 out: 4493 if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT) && page && 4494 unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) { 4495 __free_pages(page, order); 4496 page = NULL; 4497 } 4498 4499 trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype); 4500 kmsan_alloc_page(page, order, alloc_gfp); 4501 4502 return page; 4503 } 4504 EXPORT_SYMBOL(__alloc_pages); 4505 4506 struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid, 4507 nodemask_t *nodemask) 4508 { 4509 struct page *page = __alloc_pages(gfp | __GFP_COMP, order, 4510 preferred_nid, nodemask); 4511 4512 if (page && order > 1) 4513 prep_transhuge_page(page); 4514 return (struct folio *)page; 4515 } 4516 EXPORT_SYMBOL(__folio_alloc); 4517 4518 /* 4519 * Common helper functions. Never use with __GFP_HIGHMEM because the returned 4520 * address cannot represent highmem pages. Use alloc_pages and then kmap if 4521 * you need to access high mem. 4522 */ 4523 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 4524 { 4525 struct page *page; 4526 4527 page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order); 4528 if (!page) 4529 return 0; 4530 return (unsigned long) page_address(page); 4531 } 4532 EXPORT_SYMBOL(__get_free_pages); 4533 4534 unsigned long get_zeroed_page(gfp_t gfp_mask) 4535 { 4536 return __get_free_page(gfp_mask | __GFP_ZERO); 4537 } 4538 EXPORT_SYMBOL(get_zeroed_page); 4539 4540 /** 4541 * __free_pages - Free pages allocated with alloc_pages(). 4542 * @page: The page pointer returned from alloc_pages(). 4543 * @order: The order of the allocation. 4544 * 4545 * This function can free multi-page allocations that are not compound 4546 * pages. It does not check that the @order passed in matches that of 4547 * the allocation, so it is easy to leak memory. Freeing more memory 4548 * than was allocated will probably emit a warning. 4549 * 4550 * If the last reference to this page is speculative, it will be released 4551 * by put_page() which only frees the first page of a non-compound 4552 * allocation. To prevent the remaining pages from being leaked, we free 4553 * the subsequent pages here. If you want to use the page's reference 4554 * count to decide when to free the allocation, you should allocate a 4555 * compound page, and use put_page() instead of __free_pages(). 4556 * 4557 * Context: May be called in interrupt context or while holding a normal 4558 * spinlock, but not in NMI context or while holding a raw spinlock. 
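 *
 * A minimal usage sketch with a hypothetical caller:
 *
 *	page = alloc_pages(GFP_KERNEL, 2);
 *	if (!page)
 *		return -ENOMEM;
 *	...
 *	__free_pages(page, 2);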
4559 */ 4560 void __free_pages(struct page *page, unsigned int order) 4561 { 4562 /* get PageHead before we drop reference */ 4563 int head = PageHead(page); 4564 4565 if (put_page_testzero(page)) 4566 free_the_page(page, order); 4567 else if (!head) 4568 while (order-- > 0) 4569 free_the_page(page + (1 << order), order); 4570 } 4571 EXPORT_SYMBOL(__free_pages); 4572 4573 void free_pages(unsigned long addr, unsigned int order) 4574 { 4575 if (addr != 0) { 4576 VM_BUG_ON(!virt_addr_valid((void *)addr)); 4577 __free_pages(virt_to_page((void *)addr), order); 4578 } 4579 } 4580 4581 EXPORT_SYMBOL(free_pages); 4582 4583 /* 4584 * Page Fragment: 4585 * An arbitrary-length arbitrary-offset area of memory which resides 4586 * within a 0 or higher order page. Multiple fragments within that page 4587 * are individually refcounted, in the page's reference counter. 4588 * 4589 * The page_frag functions below provide a simple allocation framework for 4590 * page fragments. This is used by the network stack and network device 4591 * drivers to provide a backing region of memory for use as either an 4592 * sk_buff->head, or to be used in the "frags" portion of skb_shared_info. 4593 */ 4594 static struct page *__page_frag_cache_refill(struct page_frag_cache *nc, 4595 gfp_t gfp_mask) 4596 { 4597 struct page *page = NULL; 4598 gfp_t gfp = gfp_mask; 4599 4600 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) 4601 gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY | 4602 __GFP_NOMEMALLOC; 4603 page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, 4604 PAGE_FRAG_CACHE_MAX_ORDER); 4605 nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE; 4606 #endif 4607 if (unlikely(!page)) 4608 page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); 4609 4610 nc->va = page ? page_address(page) : NULL; 4611 4612 return page; 4613 } 4614 4615 void __page_frag_cache_drain(struct page *page, unsigned int count) 4616 { 4617 VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); 4618 4619 if (page_ref_sub_and_test(page, count)) 4620 free_the_page(page, compound_order(page)); 4621 } 4622 EXPORT_SYMBOL(__page_frag_cache_drain); 4623 4624 void *page_frag_alloc_align(struct page_frag_cache *nc, 4625 unsigned int fragsz, gfp_t gfp_mask, 4626 unsigned int align_mask) 4627 { 4628 unsigned int size = PAGE_SIZE; 4629 struct page *page; 4630 int offset; 4631 4632 if (unlikely(!nc->va)) { 4633 refill: 4634 page = __page_frag_cache_refill(nc, gfp_mask); 4635 if (!page) 4636 return NULL; 4637 4638 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) 4639 /* if size can vary use size else just use PAGE_SIZE */ 4640 size = nc->size; 4641 #endif 4642 /* Even if we own the page, we do not use atomic_set(). 4643 * This would break get_page_unless_zero() users. 
4644 */ 4645 page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE); 4646 4647 /* reset page count bias and offset to start of new frag */ 4648 nc->pfmemalloc = page_is_pfmemalloc(page); 4649 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; 4650 nc->offset = size; 4651 } 4652 4653 offset = nc->offset - fragsz; 4654 if (unlikely(offset < 0)) { 4655 page = virt_to_page(nc->va); 4656 4657 if (!page_ref_sub_and_test(page, nc->pagecnt_bias)) 4658 goto refill; 4659 4660 if (unlikely(nc->pfmemalloc)) { 4661 free_the_page(page, compound_order(page)); 4662 goto refill; 4663 } 4664 4665 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) 4666 /* if size can vary use size else just use PAGE_SIZE */ 4667 size = nc->size; 4668 #endif 4669 /* OK, page count is 0, we can safely set it */ 4670 set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1); 4671 4672 /* reset page count bias and offset to start of new frag */ 4673 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; 4674 offset = size - fragsz; 4675 if (unlikely(offset < 0)) { 4676 /* 4677 * The caller is trying to allocate a fragment 4678 * with fragsz > PAGE_SIZE but the cache isn't big 4679 * enough to satisfy the request, this may 4680 * happen in low memory conditions. 4681 * We don't release the cache page because 4682 * it could make memory pressure worse 4683 * so we simply return NULL here. 4684 */ 4685 return NULL; 4686 } 4687 } 4688 4689 nc->pagecnt_bias--; 4690 offset &= align_mask; 4691 nc->offset = offset; 4692 4693 return nc->va + offset; 4694 } 4695 EXPORT_SYMBOL(page_frag_alloc_align); 4696 4697 /* 4698 * Frees a page fragment allocated out of either a compound or order 0 page. 4699 */ 4700 void page_frag_free(void *addr) 4701 { 4702 struct page *page = virt_to_head_page(addr); 4703 4704 if (unlikely(put_page_testzero(page))) 4705 free_the_page(page, compound_order(page)); 4706 } 4707 EXPORT_SYMBOL(page_frag_free); 4708 4709 static void *make_alloc_exact(unsigned long addr, unsigned int order, 4710 size_t size) 4711 { 4712 if (addr) { 4713 unsigned long nr = DIV_ROUND_UP(size, PAGE_SIZE); 4714 struct page *page = virt_to_page((void *)addr); 4715 struct page *last = page + nr; 4716 4717 split_page_owner(page, 1 << order); 4718 split_page_memcg(page, 1 << order); 4719 while (page < --last) 4720 set_page_refcounted(last); 4721 4722 last = page + (1UL << order); 4723 for (page += nr; page < last; page++) 4724 __free_pages_ok(page, 0, FPI_TO_TAIL); 4725 } 4726 return (void *)addr; 4727 } 4728 4729 /** 4730 * alloc_pages_exact - allocate an exact number physically-contiguous pages. 4731 * @size: the number of bytes to allocate 4732 * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP 4733 * 4734 * This function is similar to alloc_pages(), except that it allocates the 4735 * minimum number of pages to satisfy the request. alloc_pages() can only 4736 * allocate memory in power-of-two pages. 4737 * 4738 * This function is also limited by MAX_ORDER. 4739 * 4740 * Memory allocated by this function must be released by free_pages_exact(). 4741 * 4742 * Return: pointer to the allocated area or %NULL in case of error. 
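 *
 * Example (illustrative, assuming 4K pages): a 10 KiB request is backed by
 * three pages; the fourth page of the underlying order-2 allocation is
 * returned to the buddy allocator:
 *
 *	buf = alloc_pages_exact(10 * 1024, GFP_KERNEL);
 *	...
 *	free_pages_exact(buf, 10 * 1024);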
4743 */ 4744 void *alloc_pages_exact(size_t size, gfp_t gfp_mask) 4745 { 4746 unsigned int order = get_order(size); 4747 unsigned long addr; 4748 4749 if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM))) 4750 gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM); 4751 4752 addr = __get_free_pages(gfp_mask, order); 4753 return make_alloc_exact(addr, order, size); 4754 } 4755 EXPORT_SYMBOL(alloc_pages_exact); 4756 4757 /** 4758 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous 4759 * pages on a node. 4760 * @nid: the preferred node ID where memory should be allocated 4761 * @size: the number of bytes to allocate 4762 * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP 4763 * 4764 * Like alloc_pages_exact(), but try to allocate on node nid first before falling 4765 * back. 4766 * 4767 * Return: pointer to the allocated area or %NULL in case of error. 4768 */ 4769 void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) 4770 { 4771 unsigned int order = get_order(size); 4772 struct page *p; 4773 4774 if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM))) 4775 gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM); 4776 4777 p = alloc_pages_node(nid, gfp_mask, order); 4778 if (!p) 4779 return NULL; 4780 return make_alloc_exact((unsigned long)page_address(p), order, size); 4781 } 4782 4783 /** 4784 * free_pages_exact - release memory allocated via alloc_pages_exact() 4785 * @virt: the value returned by alloc_pages_exact. 4786 * @size: size of allocation, same value as passed to alloc_pages_exact(). 4787 * 4788 * Release the memory allocated by a previous call to alloc_pages_exact. 4789 */ 4790 void free_pages_exact(void *virt, size_t size) 4791 { 4792 unsigned long addr = (unsigned long)virt; 4793 unsigned long end = addr + PAGE_ALIGN(size); 4794 4795 while (addr < end) { 4796 free_page(addr); 4797 addr += PAGE_SIZE; 4798 } 4799 } 4800 EXPORT_SYMBOL(free_pages_exact); 4801 4802 /** 4803 * nr_free_zone_pages - count number of pages beyond high watermark 4804 * @offset: The zone index of the highest zone 4805 * 4806 * nr_free_zone_pages() counts the number of pages which are beyond the 4807 * high watermark within all zones at or below a given zone index. For each 4808 * zone, the number of pages is calculated as: 4809 * 4810 * nr_free_zone_pages = managed_pages - high_pages 4811 * 4812 * Return: number of pages beyond high watermark. 4813 */ 4814 static unsigned long nr_free_zone_pages(int offset) 4815 { 4816 struct zoneref *z; 4817 struct zone *zone; 4818 4819 /* Just pick one node, since fallback list is circular */ 4820 unsigned long sum = 0; 4821 4822 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 4823 4824 for_each_zone_zonelist(zone, z, zonelist, offset) { 4825 unsigned long size = zone_managed_pages(zone); 4826 unsigned long high = high_wmark_pages(zone); 4827 if (size > high) 4828 sum += size - high; 4829 } 4830 4831 return sum; 4832 } 4833 4834 /** 4835 * nr_free_buffer_pages - count number of pages beyond high watermark 4836 * 4837 * nr_free_buffer_pages() counts the number of pages which are beyond the high 4838 * watermark within ZONE_DMA and ZONE_NORMAL. 4839 * 4840 * Return: number of pages beyond high watermark within ZONE_DMA and 4841 * ZONE_NORMAL. 
4842 */ 4843 unsigned long nr_free_buffer_pages(void) 4844 { 4845 return nr_free_zone_pages(gfp_zone(GFP_USER)); 4846 } 4847 EXPORT_SYMBOL_GPL(nr_free_buffer_pages); 4848 4849 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) 4850 { 4851 zoneref->zone = zone; 4852 zoneref->zone_idx = zone_idx(zone); 4853 } 4854 4855 /* 4856 * Builds allocation fallback zone lists. 4857 * 4858 * Add all populated zones of a node to the zonelist. 4859 */ 4860 static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs) 4861 { 4862 struct zone *zone; 4863 enum zone_type zone_type = MAX_NR_ZONES; 4864 int nr_zones = 0; 4865 4866 do { 4867 zone_type--; 4868 zone = pgdat->node_zones + zone_type; 4869 if (populated_zone(zone)) { 4870 zoneref_set_zone(zone, &zonerefs[nr_zones++]); 4871 check_highest_zone(zone_type); 4872 } 4873 } while (zone_type); 4874 4875 return nr_zones; 4876 } 4877 4878 #ifdef CONFIG_NUMA 4879 4880 static int __parse_numa_zonelist_order(char *s) 4881 { 4882 /* 4883 * We used to support different zonelists modes but they turned 4884 * out to be just not useful. Let's keep the warning in place 4885 * if somebody still use the cmd line parameter so that we do 4886 * not fail it silently 4887 */ 4888 if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) { 4889 pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s); 4890 return -EINVAL; 4891 } 4892 return 0; 4893 } 4894 4895 static char numa_zonelist_order[] = "Node"; 4896 #define NUMA_ZONELIST_ORDER_LEN 16 4897 /* 4898 * sysctl handler for numa_zonelist_order 4899 */ 4900 static int numa_zonelist_order_handler(struct ctl_table *table, int write, 4901 void *buffer, size_t *length, loff_t *ppos) 4902 { 4903 if (write) 4904 return __parse_numa_zonelist_order(buffer); 4905 return proc_dostring(table, write, buffer, length, ppos); 4906 } 4907 4908 static int node_load[MAX_NUMNODES]; 4909 4910 /** 4911 * find_next_best_node - find the next node that should appear in a given node's fallback list 4912 * @node: node whose fallback list we're appending 4913 * @used_node_mask: nodemask_t of already used nodes 4914 * 4915 * We use a number of factors to determine which is the next node that should 4916 * appear on a given node's fallback list. The node should not have appeared 4917 * already in @node's fallback list, and it should be the next closest node 4918 * according to the distance array (which contains arbitrary distance values 4919 * from each node to each node in the system), and should also prefer nodes 4920 * with no CPUs, since presumably they'll have very little allocation pressure 4921 * on them otherwise. 4922 * 4923 * Return: node id of the found node or %NUMA_NO_NODE if no node is found. 
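 *
 * Concretely, each candidate is scored (lower is better) by its
 * node_distance() to @node, nudged up when it is numerically below @node,
 * penalized by PENALTY_FOR_NODE_WITH_CPUS when it has CPUs, then scaled by
 * MAX_NUMNODES and offset by node_load[] so that equally distant nodes end
 * up being picked round-robin.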
4924 */ 4925 int find_next_best_node(int node, nodemask_t *used_node_mask) 4926 { 4927 int n, val; 4928 int min_val = INT_MAX; 4929 int best_node = NUMA_NO_NODE; 4930 4931 /* Use the local node if we haven't already */ 4932 if (!node_isset(node, *used_node_mask)) { 4933 node_set(node, *used_node_mask); 4934 return node; 4935 } 4936 4937 for_each_node_state(n, N_MEMORY) { 4938 4939 /* Don't want a node to appear more than once */ 4940 if (node_isset(n, *used_node_mask)) 4941 continue; 4942 4943 /* Use the distance array to find the distance */ 4944 val = node_distance(node, n); 4945 4946 /* Penalize nodes under us ("prefer the next node") */ 4947 val += (n < node); 4948 4949 /* Give preference to headless and unused nodes */ 4950 if (!cpumask_empty(cpumask_of_node(n))) 4951 val += PENALTY_FOR_NODE_WITH_CPUS; 4952 4953 /* Slight preference for less loaded node */ 4954 val *= MAX_NUMNODES; 4955 val += node_load[n]; 4956 4957 if (val < min_val) { 4958 min_val = val; 4959 best_node = n; 4960 } 4961 } 4962 4963 if (best_node >= 0) 4964 node_set(best_node, *used_node_mask); 4965 4966 return best_node; 4967 } 4968 4969 4970 /* 4971 * Build zonelists ordered by node and zones within node. 4972 * This results in maximum locality--normal zone overflows into local 4973 * DMA zone, if any--but risks exhausting DMA zone. 4974 */ 4975 static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order, 4976 unsigned nr_nodes) 4977 { 4978 struct zoneref *zonerefs; 4979 int i; 4980 4981 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; 4982 4983 for (i = 0; i < nr_nodes; i++) { 4984 int nr_zones; 4985 4986 pg_data_t *node = NODE_DATA(node_order[i]); 4987 4988 nr_zones = build_zonerefs_node(node, zonerefs); 4989 zonerefs += nr_zones; 4990 } 4991 zonerefs->zone = NULL; 4992 zonerefs->zone_idx = 0; 4993 } 4994 4995 /* 4996 * Build gfp_thisnode zonelists 4997 */ 4998 static void build_thisnode_zonelists(pg_data_t *pgdat) 4999 { 5000 struct zoneref *zonerefs; 5001 int nr_zones; 5002 5003 zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs; 5004 nr_zones = build_zonerefs_node(pgdat, zonerefs); 5005 zonerefs += nr_zones; 5006 zonerefs->zone = NULL; 5007 zonerefs->zone_idx = 0; 5008 } 5009 5010 /* 5011 * Build zonelists ordered by zone and nodes within zones. 5012 * This results in conserving DMA zone[s] until all Normal memory is 5013 * exhausted, but results in overflowing to remote node while memory 5014 * may still exist in local DMA zone. 5015 */ 5016 5017 static void build_zonelists(pg_data_t *pgdat) 5018 { 5019 static int node_order[MAX_NUMNODES]; 5020 int node, nr_nodes = 0; 5021 nodemask_t used_mask = NODE_MASK_NONE; 5022 int local_node, prev_node; 5023 5024 /* NUMA-aware ordering of nodes */ 5025 local_node = pgdat->node_id; 5026 prev_node = local_node; 5027 5028 memset(node_order, 0, sizeof(node_order)); 5029 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 5030 /* 5031 * We don't want to pressure a particular node. 5032 * So adding penalty to the first node in same 5033 * distance group to make it round-robin. 
5034 */ 5035 if (node_distance(local_node, node) != 5036 node_distance(local_node, prev_node)) 5037 node_load[node] += 1; 5038 5039 node_order[nr_nodes++] = node; 5040 prev_node = node; 5041 } 5042 5043 build_zonelists_in_node_order(pgdat, node_order, nr_nodes); 5044 build_thisnode_zonelists(pgdat); 5045 pr_info("Fallback order for Node %d: ", local_node); 5046 for (node = 0; node < nr_nodes; node++) 5047 pr_cont("%d ", node_order[node]); 5048 pr_cont("\n"); 5049 } 5050 5051 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 5052 /* 5053 * Return node id of node used for "local" allocations. 5054 * I.e., first node id of first zone in arg node's generic zonelist. 5055 * Used for initializing percpu 'numa_mem', which is used primarily 5056 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. 5057 */ 5058 int local_memory_node(int node) 5059 { 5060 struct zoneref *z; 5061 5062 z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL), 5063 gfp_zone(GFP_KERNEL), 5064 NULL); 5065 return zone_to_nid(z->zone); 5066 } 5067 #endif 5068 5069 static void setup_min_unmapped_ratio(void); 5070 static void setup_min_slab_ratio(void); 5071 #else /* CONFIG_NUMA */ 5072 5073 static void build_zonelists(pg_data_t *pgdat) 5074 { 5075 int node, local_node; 5076 struct zoneref *zonerefs; 5077 int nr_zones; 5078 5079 local_node = pgdat->node_id; 5080 5081 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; 5082 nr_zones = build_zonerefs_node(pgdat, zonerefs); 5083 zonerefs += nr_zones; 5084 5085 /* 5086 * Now we build the zonelist so that it contains the zones 5087 * of all the other nodes. 5088 * We don't want to pressure a particular node, so when 5089 * building the zones for node N, we make sure that the 5090 * zones coming right after the local ones are those from 5091 * node N+1 (modulo N) 5092 */ 5093 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 5094 if (!node_online(node)) 5095 continue; 5096 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs); 5097 zonerefs += nr_zones; 5098 } 5099 for (node = 0; node < local_node; node++) { 5100 if (!node_online(node)) 5101 continue; 5102 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs); 5103 zonerefs += nr_zones; 5104 } 5105 5106 zonerefs->zone = NULL; 5107 zonerefs->zone_idx = 0; 5108 } 5109 5110 #endif /* CONFIG_NUMA */ 5111 5112 /* 5113 * Boot pageset table. One per cpu which is going to be used for all 5114 * zones and all nodes. The parameters will be set in such a way 5115 * that an item put on a list will immediately be handed over to 5116 * the buddy list. This is safe since pageset manipulation is done 5117 * with interrupts disabled. 5118 * 5119 * The boot_pagesets must be kept even after bootup is complete for 5120 * unused processors and/or zones. They do play a role for bootstrapping 5121 * hotplugged processors. 5122 * 5123 * zoneinfo_show() and maybe other functions do 5124 * not check if the processor is online before following the pageset pointer. 5125 * Other parts of the kernel may not check if the zone is available. 
5126 */ 5127 static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats); 5128 /* These effectively disable the pcplists in the boot pageset completely */ 5129 #define BOOT_PAGESET_HIGH 0 5130 #define BOOT_PAGESET_BATCH 1 5131 static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset); 5132 static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats); 5133 5134 static void __build_all_zonelists(void *data) 5135 { 5136 int nid; 5137 int __maybe_unused cpu; 5138 pg_data_t *self = data; 5139 unsigned long flags; 5140 5141 /* 5142 * Explicitly disable this CPU's interrupts before taking seqlock 5143 * to prevent any IRQ handler from calling into the page allocator 5144 * (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock. 5145 */ 5146 local_irq_save(flags); 5147 /* 5148 * Explicitly disable this CPU's synchronous printk() before taking 5149 * seqlock to prevent any printk() from trying to hold port->lock, for 5150 * tty_insert_flip_string_and_push_buffer() on other CPU might be 5151 * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held. 5152 */ 5153 printk_deferred_enter(); 5154 write_seqlock(&zonelist_update_seq); 5155 5156 #ifdef CONFIG_NUMA 5157 memset(node_load, 0, sizeof(node_load)); 5158 #endif 5159 5160 /* 5161 * This node is hotadded and no memory is yet present. So just 5162 * building zonelists is fine - no need to touch other nodes. 5163 */ 5164 if (self && !node_online(self->node_id)) { 5165 build_zonelists(self); 5166 } else { 5167 /* 5168 * All possible nodes have pgdat preallocated 5169 * in free_area_init 5170 */ 5171 for_each_node(nid) { 5172 pg_data_t *pgdat = NODE_DATA(nid); 5173 5174 build_zonelists(pgdat); 5175 } 5176 5177 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 5178 /* 5179 * We now know the "local memory node" for each node-- 5180 * i.e., the node of the first zone in the generic zonelist. 5181 * Set up numa_mem percpu variable for on-line cpus. During 5182 * boot, only the boot cpu should be on-line; we'll init the 5183 * secondary cpus' numa_mem as they come on-line. During 5184 * node/memory hotplug, we'll fixup all on-line cpus. 5185 */ 5186 for_each_online_cpu(cpu) 5187 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); 5188 #endif 5189 } 5190 5191 write_sequnlock(&zonelist_update_seq); 5192 printk_deferred_exit(); 5193 local_irq_restore(flags); 5194 } 5195 5196 static noinline void __init 5197 build_all_zonelists_init(void) 5198 { 5199 int cpu; 5200 5201 __build_all_zonelists(NULL); 5202 5203 /* 5204 * Initialize the boot_pagesets that are going to be used 5205 * for bootstrapping processors. The real pagesets for 5206 * each zone will be allocated later when the per cpu 5207 * allocator is available. 5208 * 5209 * boot_pagesets are used also for bootstrapping offline 5210 * cpus if the system is already booted because the pagesets 5211 * are needed to initialize allocators on a specific cpu too. 5212 * F.e. the percpu allocator needs the page allocator which 5213 * needs the percpu allocator in order to allocate its pagesets 5214 * (a chicken-egg dilemma). 5215 */ 5216 for_each_possible_cpu(cpu) 5217 per_cpu_pages_init(&per_cpu(boot_pageset, cpu), &per_cpu(boot_zonestats, cpu)); 5218 5219 mminit_verify_zonelist(); 5220 cpuset_init_current_mems_allowed(); 5221 } 5222 5223 /* 5224 * unless system_state == SYSTEM_BOOTING. 5225 * 5226 * __ref due to call of __init annotated helper build_all_zonelists_init 5227 * [protected by SYSTEM_BOOTING]. 
5228 */ 5229 void __ref build_all_zonelists(pg_data_t *pgdat) 5230 { 5231 unsigned long vm_total_pages; 5232 5233 if (system_state == SYSTEM_BOOTING) { 5234 build_all_zonelists_init(); 5235 } else { 5236 __build_all_zonelists(pgdat); 5237 /* cpuset refresh routine should be here */ 5238 } 5239 /* Get the number of free pages beyond high watermark in all zones. */ 5240 vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); 5241 /* 5242 * Disable grouping by mobility if the number of pages in the 5243 * system is too low to allow the mechanism to work. It would be 5244 * more accurate, but expensive to check per-zone. This check is 5245 * made on memory-hotadd so a system can start with mobility 5246 * disabled and enable it later 5247 */ 5248 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) 5249 page_group_by_mobility_disabled = 1; 5250 else 5251 page_group_by_mobility_disabled = 0; 5252 5253 pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n", 5254 nr_online_nodes, 5255 page_group_by_mobility_disabled ? "off" : "on", 5256 vm_total_pages); 5257 #ifdef CONFIG_NUMA 5258 pr_info("Policy zone: %s\n", zone_names[policy_zone]); 5259 #endif 5260 } 5261 5262 static int zone_batchsize(struct zone *zone) 5263 { 5264 #ifdef CONFIG_MMU 5265 int batch; 5266 5267 /* 5268 * The number of pages to batch allocate is either ~0.1% 5269 * of the zone or 1MB, whichever is smaller. The batch 5270 * size is striking a balance between allocation latency 5271 * and zone lock contention. 5272 */ 5273 batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE); 5274 batch /= 4; /* We effectively *= 4 below */ 5275 if (batch < 1) 5276 batch = 1; 5277 5278 /* 5279 * Clamp the batch to a 2^n - 1 value. Having a power 5280 * of 2 value was found to be more likely to have 5281 * suboptimal cache aliasing properties in some cases. 5282 * 5283 * For example if 2 tasks are alternately allocating 5284 * batches of pages, one task can end up with a lot 5285 * of pages of one half of the possible page colors 5286 * and the other with pages of the other colors. 5287 */ 5288 batch = rounddown_pow_of_two(batch + batch/2) - 1; 5289 5290 return batch; 5291 5292 #else 5293 /* The deferral and batching of frees should be suppressed under NOMMU 5294 * conditions. 5295 * 5296 * The problem is that NOMMU needs to be able to allocate large chunks 5297 * of contiguous memory as there's no hardware page translation to 5298 * assemble apparent contiguous memory from discontiguous pages. 5299 * 5300 * Queueing large contiguous runs of pages for batching, however, 5301 * causes the pages to actually be freed in smaller chunks. As there 5302 * can be a significant delay between the individual batches being 5303 * recycled, this leads to the once large chunks of space being 5304 * fragmented and becoming unavailable for high-order allocations. 5305 */ 5306 return 0; 5307 #endif 5308 } 5309 5310 static int percpu_pagelist_high_fraction; 5311 static int zone_highsize(struct zone *zone, int batch, int cpu_online) 5312 { 5313 #ifdef CONFIG_MMU 5314 int high; 5315 int nr_split_cpus; 5316 unsigned long total_pages; 5317 5318 if (!percpu_pagelist_high_fraction) { 5319 /* 5320 * By default, the high value of the pcp is based on the zone 5321 * low watermark so that if they are full then background 5322 * reclaim will not be started prematurely. 
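 * For example (illustrative): with this default, a zone whose low
 * watermark is 1024 pages and which has four local CPUs ends up with a
 * per-CPU high of roughly 256 pages, subject to the batch-based floor
 * applied below.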
5323 */ 5324 total_pages = low_wmark_pages(zone); 5325 } else { 5326 /* 5327 * If percpu_pagelist_high_fraction is configured, the high 5328 * value is based on a fraction of the managed pages in the 5329 * zone. 5330 */ 5331 total_pages = zone_managed_pages(zone) / percpu_pagelist_high_fraction; 5332 } 5333 5334 /* 5335 * Split the high value across all online CPUs local to the zone. Note 5336 * that early in boot that CPUs may not be online yet and that during 5337 * CPU hotplug that the cpumask is not yet updated when a CPU is being 5338 * onlined. For memory nodes that have no CPUs, split pcp->high across 5339 * all online CPUs to mitigate the risk that reclaim is triggered 5340 * prematurely due to pages stored on pcp lists. 5341 */ 5342 nr_split_cpus = cpumask_weight(cpumask_of_node(zone_to_nid(zone))) + cpu_online; 5343 if (!nr_split_cpus) 5344 nr_split_cpus = num_online_cpus(); 5345 high = total_pages / nr_split_cpus; 5346 5347 /* 5348 * Ensure high is at least batch*4. The multiple is based on the 5349 * historical relationship between high and batch. 5350 */ 5351 high = max(high, batch << 2); 5352 5353 return high; 5354 #else 5355 return 0; 5356 #endif 5357 } 5358 5359 /* 5360 * pcp->high and pcp->batch values are related and generally batch is lower 5361 * than high. They are also related to pcp->count such that count is lower 5362 * than high, and as soon as it reaches high, the pcplist is flushed. 5363 * 5364 * However, guaranteeing these relations at all times would require e.g. write 5365 * barriers here but also careful usage of read barriers at the read side, and 5366 * thus be prone to error and bad for performance. Thus the update only prevents 5367 * store tearing. Any new users of pcp->batch and pcp->high should ensure they 5368 * can cope with those fields changing asynchronously, and fully trust only the 5369 * pcp->count field on the local CPU with interrupts disabled. 5370 * 5371 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function 5372 * outside of boot time (or some other assurance that no concurrent updaters 5373 * exist). 5374 */ 5375 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, 5376 unsigned long batch) 5377 { 5378 WRITE_ONCE(pcp->batch, batch); 5379 WRITE_ONCE(pcp->high, high); 5380 } 5381 5382 static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats) 5383 { 5384 int pindex; 5385 5386 memset(pcp, 0, sizeof(*pcp)); 5387 memset(pzstats, 0, sizeof(*pzstats)); 5388 5389 spin_lock_init(&pcp->lock); 5390 for (pindex = 0; pindex < NR_PCP_LISTS; pindex++) 5391 INIT_LIST_HEAD(&pcp->lists[pindex]); 5392 5393 /* 5394 * Set batch and high values safe for a boot pageset. A true percpu 5395 * pageset's initialization will update them subsequently. Here we don't 5396 * need to be as careful as pageset_update() as nobody can access the 5397 * pageset yet. 5398 */ 5399 pcp->high = BOOT_PAGESET_HIGH; 5400 pcp->batch = BOOT_PAGESET_BATCH; 5401 pcp->free_factor = 0; 5402 } 5403 5404 static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high, 5405 unsigned long batch) 5406 { 5407 struct per_cpu_pages *pcp; 5408 int cpu; 5409 5410 for_each_possible_cpu(cpu) { 5411 pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); 5412 pageset_update(pcp, high, batch); 5413 } 5414 } 5415 5416 /* 5417 * Calculate and set new high and batch values for all per-cpu pagesets of a 5418 * zone based on the zone's size. 
5419 */ 5420 static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online) 5421 { 5422 int new_high, new_batch; 5423 5424 new_batch = max(1, zone_batchsize(zone)); 5425 new_high = zone_highsize(zone, new_batch, cpu_online); 5426 5427 if (zone->pageset_high == new_high && 5428 zone->pageset_batch == new_batch) 5429 return; 5430 5431 zone->pageset_high = new_high; 5432 zone->pageset_batch = new_batch; 5433 5434 __zone_set_pageset_high_and_batch(zone, new_high, new_batch); 5435 } 5436 5437 void __meminit setup_zone_pageset(struct zone *zone) 5438 { 5439 int cpu; 5440 5441 /* Size may be 0 on !SMP && !NUMA */ 5442 if (sizeof(struct per_cpu_zonestat) > 0) 5443 zone->per_cpu_zonestats = alloc_percpu(struct per_cpu_zonestat); 5444 5445 zone->per_cpu_pageset = alloc_percpu(struct per_cpu_pages); 5446 for_each_possible_cpu(cpu) { 5447 struct per_cpu_pages *pcp; 5448 struct per_cpu_zonestat *pzstats; 5449 5450 pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); 5451 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); 5452 per_cpu_pages_init(pcp, pzstats); 5453 } 5454 5455 zone_set_pageset_high_and_batch(zone, 0); 5456 } 5457 5458 /* 5459 * The zone indicated has a new number of managed_pages; batch sizes and percpu 5460 * page high values need to be recalculated. 5461 */ 5462 static void zone_pcp_update(struct zone *zone, int cpu_online) 5463 { 5464 mutex_lock(&pcp_batch_high_lock); 5465 zone_set_pageset_high_and_batch(zone, cpu_online); 5466 mutex_unlock(&pcp_batch_high_lock); 5467 } 5468 5469 /* 5470 * Allocate per cpu pagesets and initialize them. 5471 * Before this call only boot pagesets were available. 5472 */ 5473 void __init setup_per_cpu_pageset(void) 5474 { 5475 struct pglist_data *pgdat; 5476 struct zone *zone; 5477 int __maybe_unused cpu; 5478 5479 for_each_populated_zone(zone) 5480 setup_zone_pageset(zone); 5481 5482 #ifdef CONFIG_NUMA 5483 /* 5484 * Unpopulated zones continue using the boot pagesets. 5485 * The numa stats for these pagesets need to be reset. 5486 * Otherwise, they will end up skewing the stats of 5487 * the nodes these zones are associated with. 5488 */ 5489 for_each_possible_cpu(cpu) { 5490 struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu); 5491 memset(pzstats->vm_numa_event, 0, 5492 sizeof(pzstats->vm_numa_event)); 5493 } 5494 #endif 5495 5496 for_each_online_pgdat(pgdat) 5497 pgdat->per_cpu_nodestats = 5498 alloc_percpu(struct per_cpu_nodestat); 5499 } 5500 5501 __meminit void zone_pcp_init(struct zone *zone) 5502 { 5503 /* 5504 * per cpu subsystem is not up at this point. The following code 5505 * relies on the ability of the linker to provide the 5506 * offset of a (static) per cpu variable into the per cpu area. 
5507 */ 5508 zone->per_cpu_pageset = &boot_pageset; 5509 zone->per_cpu_zonestats = &boot_zonestats; 5510 zone->pageset_high = BOOT_PAGESET_HIGH; 5511 zone->pageset_batch = BOOT_PAGESET_BATCH; 5512 5513 if (populated_zone(zone)) 5514 pr_debug(" %s zone: %lu pages, LIFO batch:%u\n", zone->name, 5515 zone->present_pages, zone_batchsize(zone)); 5516 } 5517 5518 void adjust_managed_page_count(struct page *page, long count) 5519 { 5520 atomic_long_add(count, &page_zone(page)->managed_pages); 5521 totalram_pages_add(count); 5522 #ifdef CONFIG_HIGHMEM 5523 if (PageHighMem(page)) 5524 totalhigh_pages_add(count); 5525 #endif 5526 } 5527 EXPORT_SYMBOL(adjust_managed_page_count); 5528 5529 unsigned long free_reserved_area(void *start, void *end, int poison, const char *s) 5530 { 5531 void *pos; 5532 unsigned long pages = 0; 5533 5534 start = (void *)PAGE_ALIGN((unsigned long)start); 5535 end = (void *)((unsigned long)end & PAGE_MASK); 5536 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { 5537 struct page *page = virt_to_page(pos); 5538 void *direct_map_addr; 5539 5540 /* 5541 * 'direct_map_addr' might be different from 'pos' 5542 * because some architectures' virt_to_page() 5543 * work with aliases. Getting the direct map 5544 * address ensures that we get a _writeable_ 5545 * alias for the memset(). 5546 */ 5547 direct_map_addr = page_address(page); 5548 /* 5549 * Perform a kasan-unchecked memset() since this memory 5550 * has not been initialized. 5551 */ 5552 direct_map_addr = kasan_reset_tag(direct_map_addr); 5553 if ((unsigned int)poison <= 0xFF) 5554 memset(direct_map_addr, poison, PAGE_SIZE); 5555 5556 free_reserved_page(page); 5557 } 5558 5559 if (pages && s) 5560 pr_info("Freeing %s memory: %ldK\n", s, K(pages)); 5561 5562 return pages; 5563 } 5564 5565 static int page_alloc_cpu_dead(unsigned int cpu) 5566 { 5567 struct zone *zone; 5568 5569 lru_add_drain_cpu(cpu); 5570 mlock_drain_remote(cpu); 5571 drain_pages(cpu); 5572 5573 /* 5574 * Spill the event counters of the dead processor 5575 * into the current processors event counters. 5576 * This artificially elevates the count of the current 5577 * processor. 5578 */ 5579 vm_events_fold_cpu(cpu); 5580 5581 /* 5582 * Zero the differential counters of the dead processor 5583 * so that the vm statistics are consistent. 5584 * 5585 * This is only okay since the processor is dead and cannot 5586 * race with what we are doing. 5587 */ 5588 cpu_vm_stats_fold(cpu); 5589 5590 for_each_populated_zone(zone) 5591 zone_pcp_update(zone, 0); 5592 5593 return 0; 5594 } 5595 5596 static int page_alloc_cpu_online(unsigned int cpu) 5597 { 5598 struct zone *zone; 5599 5600 for_each_populated_zone(zone) 5601 zone_pcp_update(zone, 1); 5602 return 0; 5603 } 5604 5605 void __init page_alloc_init_cpuhp(void) 5606 { 5607 int ret; 5608 5609 ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC, 5610 "mm/page_alloc:pcp", 5611 page_alloc_cpu_online, 5612 page_alloc_cpu_dead); 5613 WARN_ON(ret < 0); 5614 } 5615 5616 /* 5617 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio 5618 * or min_free_kbytes changes. 
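 *
 * For each node this sums, over its zones, the zone's largest
 * lowmem_reserve[] entry plus its high watermark, capped at the zone's
 * managed pages. Illustrative numbers only: a zone with a high
 * watermark of 10000 pages and a maximum lowmem_reserve of 32768 pages
 * contributes min(10000 + 32768, managed_pages) to totalreserve_pages.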
5619 */ 5620 static void calculate_totalreserve_pages(void) 5621 { 5622 struct pglist_data *pgdat; 5623 unsigned long reserve_pages = 0; 5624 enum zone_type i, j; 5625 5626 for_each_online_pgdat(pgdat) { 5627 5628 pgdat->totalreserve_pages = 0; 5629 5630 for (i = 0; i < MAX_NR_ZONES; i++) { 5631 struct zone *zone = pgdat->node_zones + i; 5632 long max = 0; 5633 unsigned long managed_pages = zone_managed_pages(zone); 5634 5635 /* Find valid and maximum lowmem_reserve in the zone */ 5636 for (j = i; j < MAX_NR_ZONES; j++) { 5637 if (zone->lowmem_reserve[j] > max) 5638 max = zone->lowmem_reserve[j]; 5639 } 5640 5641 /* we treat the high watermark as reserved pages. */ 5642 max += high_wmark_pages(zone); 5643 5644 if (max > managed_pages) 5645 max = managed_pages; 5646 5647 pgdat->totalreserve_pages += max; 5648 5649 reserve_pages += max; 5650 } 5651 } 5652 totalreserve_pages = reserve_pages; 5653 } 5654 5655 /* 5656 * setup_per_zone_lowmem_reserve - called whenever 5657 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone 5658 * has a correct pages reserved value, so an adequate number of 5659 * pages are left in the zone after a successful __alloc_pages(). 5660 */ 5661 static void setup_per_zone_lowmem_reserve(void) 5662 { 5663 struct pglist_data *pgdat; 5664 enum zone_type i, j; 5665 5666 for_each_online_pgdat(pgdat) { 5667 for (i = 0; i < MAX_NR_ZONES - 1; i++) { 5668 struct zone *zone = &pgdat->node_zones[i]; 5669 int ratio = sysctl_lowmem_reserve_ratio[i]; 5670 bool clear = !ratio || !zone_managed_pages(zone); 5671 unsigned long managed_pages = 0; 5672 5673 for (j = i + 1; j < MAX_NR_ZONES; j++) { 5674 struct zone *upper_zone = &pgdat->node_zones[j]; 5675 5676 managed_pages += zone_managed_pages(upper_zone); 5677 5678 if (clear) 5679 zone->lowmem_reserve[j] = 0; 5680 else 5681 zone->lowmem_reserve[j] = managed_pages / ratio; 5682 } 5683 } 5684 } 5685 5686 /* update totalreserve_pages */ 5687 calculate_totalreserve_pages(); 5688 } 5689 5690 static void __setup_per_zone_wmarks(void) 5691 { 5692 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 5693 unsigned long lowmem_pages = 0; 5694 struct zone *zone; 5695 unsigned long flags; 5696 5697 /* Calculate total number of !ZONE_HIGHMEM pages */ 5698 for_each_zone(zone) { 5699 if (!is_highmem(zone)) 5700 lowmem_pages += zone_managed_pages(zone); 5701 } 5702 5703 for_each_zone(zone) { 5704 u64 tmp; 5705 5706 spin_lock_irqsave(&zone->lock, flags); 5707 tmp = (u64)pages_min * zone_managed_pages(zone); 5708 do_div(tmp, lowmem_pages); 5709 if (is_highmem(zone)) { 5710 /* 5711 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 5712 * need highmem pages, so cap pages_min to a small 5713 * value here. 5714 * 5715 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) 5716 * deltas control async page reclaim, and so should 5717 * not be capped for highmem. 5718 */ 5719 unsigned long min_pages; 5720 5721 min_pages = zone_managed_pages(zone) / 1024; 5722 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); 5723 zone->_watermark[WMARK_MIN] = min_pages; 5724 } else { 5725 /* 5726 * If it's a lowmem zone, reserve a number of pages 5727 * proportionate to the zone's size. 5728 */ 5729 zone->_watermark[WMARK_MIN] = tmp; 5730 } 5731 5732 /* 5733 * Set the kswapd watermarks distance according to the 5734 * scale factor in proportion to available memory, but 5735 * ensure a minimum size on small systems. 
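 *
 * Roughly: tmp = max(tmp / 4, managed_pages * watermark_scale_factor / 10000),
 * where tmp on entry is the zone's proportional share of pages_min.
 * With the default watermark_scale_factor of 10 this works out to 0.1%
 * of the zone, e.g. about 4000 pages for a zone with 4 million managed
 * pages (illustrative figures only).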
5736 */ 5737 tmp = max_t(u64, tmp >> 2, 5738 mult_frac(zone_managed_pages(zone), 5739 watermark_scale_factor, 10000)); 5740 5741 zone->watermark_boost = 0; 5742 zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; 5743 zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp; 5744 zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp; 5745 5746 spin_unlock_irqrestore(&zone->lock, flags); 5747 } 5748 5749 /* update totalreserve_pages */ 5750 calculate_totalreserve_pages(); 5751 } 5752 5753 /** 5754 * setup_per_zone_wmarks - called when min_free_kbytes changes 5755 * or when memory is hot-{added|removed} 5756 * 5757 * Ensures that the watermark[min,low,high] values for each zone are set 5758 * correctly with respect to min_free_kbytes. 5759 */ 5760 void setup_per_zone_wmarks(void) 5761 { 5762 struct zone *zone; 5763 static DEFINE_SPINLOCK(lock); 5764 5765 spin_lock(&lock); 5766 __setup_per_zone_wmarks(); 5767 spin_unlock(&lock); 5768 5769 /* 5770 * The watermark size have changed so update the pcpu batch 5771 * and high limits or the limits may be inappropriate. 5772 */ 5773 for_each_zone(zone) 5774 zone_pcp_update(zone, 0); 5775 } 5776 5777 /* 5778 * Initialise min_free_kbytes. 5779 * 5780 * For small machines we want it small (128k min). For large machines 5781 * we want it large (256MB max). But it is not linear, because network 5782 * bandwidth does not increase linearly with machine size. We use 5783 * 5784 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 5785 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 5786 * 5787 * which yields 5788 * 5789 * 16MB: 512k 5790 * 32MB: 724k 5791 * 64MB: 1024k 5792 * 128MB: 1448k 5793 * 256MB: 2048k 5794 * 512MB: 2896k 5795 * 1024MB: 4096k 5796 * 2048MB: 5792k 5797 * 4096MB: 8192k 5798 * 8192MB: 11584k 5799 * 16384MB: 16384k 5800 */ 5801 void calculate_min_free_kbytes(void) 5802 { 5803 unsigned long lowmem_kbytes; 5804 int new_min_free_kbytes; 5805 5806 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 5807 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 5808 5809 if (new_min_free_kbytes > user_min_free_kbytes) 5810 min_free_kbytes = clamp(new_min_free_kbytes, 128, 262144); 5811 else 5812 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", 5813 new_min_free_kbytes, user_min_free_kbytes); 5814 5815 } 5816 5817 int __meminit init_per_zone_wmark_min(void) 5818 { 5819 calculate_min_free_kbytes(); 5820 setup_per_zone_wmarks(); 5821 refresh_zone_stat_thresholds(); 5822 setup_per_zone_lowmem_reserve(); 5823 5824 #ifdef CONFIG_NUMA 5825 setup_min_unmapped_ratio(); 5826 setup_min_slab_ratio(); 5827 #endif 5828 5829 khugepaged_min_free_kbytes_update(); 5830 5831 return 0; 5832 } 5833 postcore_initcall(init_per_zone_wmark_min) 5834 5835 /* 5836 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 5837 * that we can call two helper functions whenever min_free_kbytes 5838 * changes. 
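 *
 * Illustrative use: "echo 65536 > /proc/sys/vm/min_free_kbytes" lands
 * here with write set, records the request in user_min_free_kbytes and
 * recomputes every zone's watermarks via setup_per_zone_wmarks().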
5839 */ 5840 static int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, 5841 void *buffer, size_t *length, loff_t *ppos) 5842 { 5843 int rc; 5844 5845 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5846 if (rc) 5847 return rc; 5848 5849 if (write) { 5850 user_min_free_kbytes = min_free_kbytes; 5851 setup_per_zone_wmarks(); 5852 } 5853 return 0; 5854 } 5855 5856 static int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, 5857 void *buffer, size_t *length, loff_t *ppos) 5858 { 5859 int rc; 5860 5861 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5862 if (rc) 5863 return rc; 5864 5865 if (write) 5866 setup_per_zone_wmarks(); 5867 5868 return 0; 5869 } 5870 5871 #ifdef CONFIG_NUMA 5872 static void setup_min_unmapped_ratio(void) 5873 { 5874 pg_data_t *pgdat; 5875 struct zone *zone; 5876 5877 for_each_online_pgdat(pgdat) 5878 pgdat->min_unmapped_pages = 0; 5879 5880 for_each_zone(zone) 5881 zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) * 5882 sysctl_min_unmapped_ratio) / 100; 5883 } 5884 5885 5886 static int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, 5887 void *buffer, size_t *length, loff_t *ppos) 5888 { 5889 int rc; 5890 5891 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5892 if (rc) 5893 return rc; 5894 5895 setup_min_unmapped_ratio(); 5896 5897 return 0; 5898 } 5899 5900 static void setup_min_slab_ratio(void) 5901 { 5902 pg_data_t *pgdat; 5903 struct zone *zone; 5904 5905 for_each_online_pgdat(pgdat) 5906 pgdat->min_slab_pages = 0; 5907 5908 for_each_zone(zone) 5909 zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) * 5910 sysctl_min_slab_ratio) / 100; 5911 } 5912 5913 static int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, 5914 void *buffer, size_t *length, loff_t *ppos) 5915 { 5916 int rc; 5917 5918 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5919 if (rc) 5920 return rc; 5921 5922 setup_min_slab_ratio(); 5923 5924 return 0; 5925 } 5926 #endif 5927 5928 /* 5929 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 5930 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 5931 * whenever sysctl_lowmem_reserve_ratio changes. 5932 * 5933 * The reserve ratio obviously has absolutely no relation with the 5934 * minimum watermarks. The lowmem reserve ratio can only make sense 5935 * if in function of the boot time zone sizes. 5936 */ 5937 static int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, 5938 int write, void *buffer, size_t *length, loff_t *ppos) 5939 { 5940 int i; 5941 5942 proc_dointvec_minmax(table, write, buffer, length, ppos); 5943 5944 for (i = 0; i < MAX_NR_ZONES; i++) { 5945 if (sysctl_lowmem_reserve_ratio[i] < 1) 5946 sysctl_lowmem_reserve_ratio[i] = 0; 5947 } 5948 5949 setup_per_zone_lowmem_reserve(); 5950 return 0; 5951 } 5952 5953 /* 5954 * percpu_pagelist_high_fraction - changes the pcp->high for each zone on each 5955 * cpu. It is the fraction of total pages in each zone that a hot per cpu 5956 * pagelist can have before it gets flushed back to buddy allocator. 
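 *
 * Illustrative use: "echo 8 > /proc/sys/vm/percpu_pagelist_high_fraction"
 * makes zone_highsize() base pcp->high on zone_managed_pages(zone) / 8,
 * split across the CPUs local to each zone; writing 0 restores the
 * default sizing based on the zone's low watermark. Nonzero values
 * below MIN_PERCPU_PAGELIST_HIGH_FRACTION are rejected with -EINVAL by
 * the handler below.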
5957 */ 5958 static int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table, 5959 int write, void *buffer, size_t *length, loff_t *ppos) 5960 { 5961 struct zone *zone; 5962 int old_percpu_pagelist_high_fraction; 5963 int ret; 5964 5965 mutex_lock(&pcp_batch_high_lock); 5966 old_percpu_pagelist_high_fraction = percpu_pagelist_high_fraction; 5967 5968 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 5969 if (!write || ret < 0) 5970 goto out; 5971 5972 /* Sanity checking to avoid pcp imbalance */ 5973 if (percpu_pagelist_high_fraction && 5974 percpu_pagelist_high_fraction < MIN_PERCPU_PAGELIST_HIGH_FRACTION) { 5975 percpu_pagelist_high_fraction = old_percpu_pagelist_high_fraction; 5976 ret = -EINVAL; 5977 goto out; 5978 } 5979 5980 /* No change? */ 5981 if (percpu_pagelist_high_fraction == old_percpu_pagelist_high_fraction) 5982 goto out; 5983 5984 for_each_populated_zone(zone) 5985 zone_set_pageset_high_and_batch(zone, 0); 5986 out: 5987 mutex_unlock(&pcp_batch_high_lock); 5988 return ret; 5989 } 5990 5991 static struct ctl_table page_alloc_sysctl_table[] = { 5992 { 5993 .procname = "min_free_kbytes", 5994 .data = &min_free_kbytes, 5995 .maxlen = sizeof(min_free_kbytes), 5996 .mode = 0644, 5997 .proc_handler = min_free_kbytes_sysctl_handler, 5998 .extra1 = SYSCTL_ZERO, 5999 }, 6000 { 6001 .procname = "watermark_boost_factor", 6002 .data = &watermark_boost_factor, 6003 .maxlen = sizeof(watermark_boost_factor), 6004 .mode = 0644, 6005 .proc_handler = proc_dointvec_minmax, 6006 .extra1 = SYSCTL_ZERO, 6007 }, 6008 { 6009 .procname = "watermark_scale_factor", 6010 .data = &watermark_scale_factor, 6011 .maxlen = sizeof(watermark_scale_factor), 6012 .mode = 0644, 6013 .proc_handler = watermark_scale_factor_sysctl_handler, 6014 .extra1 = SYSCTL_ONE, 6015 .extra2 = SYSCTL_THREE_THOUSAND, 6016 }, 6017 { 6018 .procname = "percpu_pagelist_high_fraction", 6019 .data = &percpu_pagelist_high_fraction, 6020 .maxlen = sizeof(percpu_pagelist_high_fraction), 6021 .mode = 0644, 6022 .proc_handler = percpu_pagelist_high_fraction_sysctl_handler, 6023 .extra1 = SYSCTL_ZERO, 6024 }, 6025 { 6026 .procname = "lowmem_reserve_ratio", 6027 .data = &sysctl_lowmem_reserve_ratio, 6028 .maxlen = sizeof(sysctl_lowmem_reserve_ratio), 6029 .mode = 0644, 6030 .proc_handler = lowmem_reserve_ratio_sysctl_handler, 6031 }, 6032 #ifdef CONFIG_NUMA 6033 { 6034 .procname = "numa_zonelist_order", 6035 .data = &numa_zonelist_order, 6036 .maxlen = NUMA_ZONELIST_ORDER_LEN, 6037 .mode = 0644, 6038 .proc_handler = numa_zonelist_order_handler, 6039 }, 6040 { 6041 .procname = "min_unmapped_ratio", 6042 .data = &sysctl_min_unmapped_ratio, 6043 .maxlen = sizeof(sysctl_min_unmapped_ratio), 6044 .mode = 0644, 6045 .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler, 6046 .extra1 = SYSCTL_ZERO, 6047 .extra2 = SYSCTL_ONE_HUNDRED, 6048 }, 6049 { 6050 .procname = "min_slab_ratio", 6051 .data = &sysctl_min_slab_ratio, 6052 .maxlen = sizeof(sysctl_min_slab_ratio), 6053 .mode = 0644, 6054 .proc_handler = sysctl_min_slab_ratio_sysctl_handler, 6055 .extra1 = SYSCTL_ZERO, 6056 .extra2 = SYSCTL_ONE_HUNDRED, 6057 }, 6058 #endif 6059 {} 6060 }; 6061 6062 void __init page_alloc_sysctl_init(void) 6063 { 6064 register_sysctl_init("vm", page_alloc_sysctl_table); 6065 } 6066 6067 #ifdef CONFIG_CONTIG_ALLOC 6068 /* Usage: See admin-guide/dynamic-debug-howto.rst */ 6069 static void alloc_contig_dump_pages(struct list_head *page_list) 6070 { 6071 DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure"); 6072 6073 if 
(DYNAMIC_DEBUG_BRANCH(descriptor)) { 6074 struct page *page; 6075 6076 dump_stack(); 6077 list_for_each_entry(page, page_list, lru) 6078 dump_page(page, "migration failure"); 6079 } 6080 } 6081 6082 /* [start, end) must belong to a single zone. */ 6083 int __alloc_contig_migrate_range(struct compact_control *cc, 6084 unsigned long start, unsigned long end) 6085 { 6086 /* This function is based on compact_zone() from compaction.c. */ 6087 unsigned int nr_reclaimed; 6088 unsigned long pfn = start; 6089 unsigned int tries = 0; 6090 int ret = 0; 6091 struct migration_target_control mtc = { 6092 .nid = zone_to_nid(cc->zone), 6093 .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, 6094 }; 6095 6096 lru_cache_disable(); 6097 6098 while (pfn < end || !list_empty(&cc->migratepages)) { 6099 if (fatal_signal_pending(current)) { 6100 ret = -EINTR; 6101 break; 6102 } 6103 6104 if (list_empty(&cc->migratepages)) { 6105 cc->nr_migratepages = 0; 6106 ret = isolate_migratepages_range(cc, pfn, end); 6107 if (ret && ret != -EAGAIN) 6108 break; 6109 pfn = cc->migrate_pfn; 6110 tries = 0; 6111 } else if (++tries == 5) { 6112 ret = -EBUSY; 6113 break; 6114 } 6115 6116 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, 6117 &cc->migratepages); 6118 cc->nr_migratepages -= nr_reclaimed; 6119 6120 ret = migrate_pages(&cc->migratepages, alloc_migration_target, 6121 NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL); 6122 6123 /* 6124 * On -ENOMEM, migrate_pages() bails out right away. It is pointless 6125 * to retry again over this error, so do the same here. 6126 */ 6127 if (ret == -ENOMEM) 6128 break; 6129 } 6130 6131 lru_cache_enable(); 6132 if (ret < 0) { 6133 if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY) 6134 alloc_contig_dump_pages(&cc->migratepages); 6135 putback_movable_pages(&cc->migratepages); 6136 return ret; 6137 } 6138 return 0; 6139 } 6140 6141 /** 6142 * alloc_contig_range() -- tries to allocate given range of pages 6143 * @start: start PFN to allocate 6144 * @end: one-past-the-last PFN to allocate 6145 * @migratetype: migratetype of the underlying pageblocks (either 6146 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks 6147 * in range must have the same migratetype and it must 6148 * be either of the two. 6149 * @gfp_mask: GFP mask to use during compaction 6150 * 6151 * The PFN range does not have to be pageblock aligned. The PFN range must 6152 * belong to a single zone. 6153 * 6154 * The first thing this routine does is attempt to MIGRATE_ISOLATE all 6155 * pageblocks in the range. Once isolated, the pageblocks should not 6156 * be modified by others. 6157 * 6158 * Return: zero on success or negative error code. On success all 6159 * pages which PFN is in [start, end) are allocated for the caller and 6160 * need to be freed with free_contig_range(). 6161 */ 6162 int alloc_contig_range(unsigned long start, unsigned long end, 6163 unsigned migratetype, gfp_t gfp_mask) 6164 { 6165 unsigned long outer_start, outer_end; 6166 int order; 6167 int ret = 0; 6168 6169 struct compact_control cc = { 6170 .nr_migratepages = 0, 6171 .order = -1, 6172 .zone = page_zone(pfn_to_page(start)), 6173 .mode = MIGRATE_SYNC, 6174 .ignore_skip_hint = true, 6175 .no_set_skip_hint = true, 6176 .gfp_mask = current_gfp_context(gfp_mask), 6177 .alloc_contig = true, 6178 }; 6179 INIT_LIST_HEAD(&cc.migratepages); 6180 6181 /* 6182 * What we do here is we mark all pageblocks in range as 6183 * MIGRATE_ISOLATE. 
Because pageblock and max order pages may 6184 * have different sizes, and due to the way page allocator 6185 * work, start_isolate_page_range() has special handlings for this. 6186 * 6187 * Once the pageblocks are marked as MIGRATE_ISOLATE, we 6188 * migrate the pages from an unaligned range (ie. pages that 6189 * we are interested in). This will put all the pages in 6190 * range back to page allocator as MIGRATE_ISOLATE. 6191 * 6192 * When this is done, we take the pages in range from page 6193 * allocator removing them from the buddy system. This way 6194 * page allocator will never consider using them. 6195 * 6196 * This lets us mark the pageblocks back as 6197 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the 6198 * aligned range but not in the unaligned, original range are 6199 * put back to page allocator so that buddy can use them. 6200 */ 6201 6202 ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask); 6203 if (ret) 6204 goto done; 6205 6206 drain_all_pages(cc.zone); 6207 6208 /* 6209 * In case of -EBUSY, we'd like to know which page causes problem. 6210 * So, just fall through. test_pages_isolated() has a tracepoint 6211 * which will report the busy page. 6212 * 6213 * It is possible that busy pages could become available before 6214 * the call to test_pages_isolated, and the range will actually be 6215 * allocated. So, if we fall through be sure to clear ret so that 6216 * -EBUSY is not accidentally used or returned to caller. 6217 */ 6218 ret = __alloc_contig_migrate_range(&cc, start, end); 6219 if (ret && ret != -EBUSY) 6220 goto done; 6221 ret = 0; 6222 6223 /* 6224 * Pages from [start, end) are within a pageblock_nr_pages 6225 * aligned blocks that are marked as MIGRATE_ISOLATE. What's 6226 * more, all pages in [start, end) are free in page allocator. 6227 * What we are going to do is to allocate all pages from 6228 * [start, end) (that is remove them from page allocator). 6229 * 6230 * The only problem is that pages at the beginning and at the 6231 * end of interesting range may be not aligned with pages that 6232 * page allocator holds, ie. they can be part of higher order 6233 * pages. Because of this, we reserve the bigger range and 6234 * once this is done free the pages we are not interested in. 6235 * 6236 * We don't have to hold zone->lock here because the pages are 6237 * isolated thus they won't get removed from buddy. 6238 */ 6239 6240 order = 0; 6241 outer_start = start; 6242 while (!PageBuddy(pfn_to_page(outer_start))) { 6243 if (++order > MAX_ORDER) { 6244 outer_start = start; 6245 break; 6246 } 6247 outer_start &= ~0UL << order; 6248 } 6249 6250 if (outer_start != start) { 6251 order = buddy_order(pfn_to_page(outer_start)); 6252 6253 /* 6254 * outer_start page could be small order buddy page and 6255 * it doesn't include start page. Adjust outer_start 6256 * in this case to report failed page properly 6257 * on tracepoint in test_pages_isolated() 6258 */ 6259 if (outer_start + (1UL << order) <= start) 6260 outer_start = start; 6261 } 6262 6263 /* Make sure the range is really isolated. */ 6264 if (test_pages_isolated(outer_start, end, 0)) { 6265 ret = -EBUSY; 6266 goto done; 6267 } 6268 6269 /* Grab isolated pages from freelists. 
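 * A zero return from isolate_freepages_range() below indicates that the
 * range could not be fully captured and is treated as -EBUSY.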
*/ 6270 outer_end = isolate_freepages_range(&cc, outer_start, end); 6271 if (!outer_end) { 6272 ret = -EBUSY; 6273 goto done; 6274 } 6275 6276 /* Free head and tail (if any) */ 6277 if (start != outer_start) 6278 free_contig_range(outer_start, start - outer_start); 6279 if (end != outer_end) 6280 free_contig_range(end, outer_end - end); 6281 6282 done: 6283 undo_isolate_page_range(start, end, migratetype); 6284 return ret; 6285 } 6286 EXPORT_SYMBOL(alloc_contig_range); 6287 6288 static int __alloc_contig_pages(unsigned long start_pfn, 6289 unsigned long nr_pages, gfp_t gfp_mask) 6290 { 6291 unsigned long end_pfn = start_pfn + nr_pages; 6292 6293 return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, 6294 gfp_mask); 6295 } 6296 6297 static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, 6298 unsigned long nr_pages) 6299 { 6300 unsigned long i, end_pfn = start_pfn + nr_pages; 6301 struct page *page; 6302 6303 for (i = start_pfn; i < end_pfn; i++) { 6304 page = pfn_to_online_page(i); 6305 if (!page) 6306 return false; 6307 6308 if (page_zone(page) != z) 6309 return false; 6310 6311 if (PageReserved(page)) 6312 return false; 6313 6314 if (PageHuge(page)) 6315 return false; 6316 } 6317 return true; 6318 } 6319 6320 static bool zone_spans_last_pfn(const struct zone *zone, 6321 unsigned long start_pfn, unsigned long nr_pages) 6322 { 6323 unsigned long last_pfn = start_pfn + nr_pages - 1; 6324 6325 return zone_spans_pfn(zone, last_pfn); 6326 } 6327 6328 /** 6329 * alloc_contig_pages() -- tries to find and allocate contiguous range of pages 6330 * @nr_pages: Number of contiguous pages to allocate 6331 * @gfp_mask: GFP mask to limit search and used during compaction 6332 * @nid: Target node 6333 * @nodemask: Mask for other possible nodes 6334 * 6335 * This routine is a wrapper around alloc_contig_range(). It scans over zones 6336 * on an applicable zonelist to find a contiguous pfn range which can then be 6337 * tried for allocation with alloc_contig_range(). This routine is intended 6338 * for allocation requests which can not be fulfilled with the buddy allocator. 6339 * 6340 * The allocated memory is always aligned to a page boundary. If nr_pages is a 6341 * power of two, then allocated range is also guaranteed to be aligned to same 6342 * nr_pages (e.g. 1GB request would be aligned to 1GB). 6343 * 6344 * Allocated pages can be freed with free_contig_range() or by manually calling 6345 * __free_page() on each allocated page. 6346 * 6347 * Return: pointer to contiguous pages on success, or NULL if not successful. 6348 */ 6349 struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, 6350 int nid, nodemask_t *nodemask) 6351 { 6352 unsigned long ret, pfn, flags; 6353 struct zonelist *zonelist; 6354 struct zone *zone; 6355 struct zoneref *z; 6356 6357 zonelist = node_zonelist(nid, gfp_mask); 6358 for_each_zone_zonelist_nodemask(zone, z, zonelist, 6359 gfp_zone(gfp_mask), nodemask) { 6360 spin_lock_irqsave(&zone->lock, flags); 6361 6362 pfn = ALIGN(zone->zone_start_pfn, nr_pages); 6363 while (zone_spans_last_pfn(zone, pfn, nr_pages)) { 6364 if (pfn_range_valid_contig(zone, pfn, nr_pages)) { 6365 /* 6366 * We release the zone lock here because 6367 * alloc_contig_range() will also lock the zone 6368 * at some point. If there's an allocation 6369 * spinning on this lock, it may win the race 6370 * and cause alloc_contig_range() to fail... 
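 * If that happens the error is simply dropped and the scan continues
 * with the next nr_pages-aligned candidate range (pfn += nr_pages
 * below).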
6371 */ 6372 spin_unlock_irqrestore(&zone->lock, flags); 6373 ret = __alloc_contig_pages(pfn, nr_pages, 6374 gfp_mask); 6375 if (!ret) 6376 return pfn_to_page(pfn); 6377 spin_lock_irqsave(&zone->lock, flags); 6378 } 6379 pfn += nr_pages; 6380 } 6381 spin_unlock_irqrestore(&zone->lock, flags); 6382 } 6383 return NULL; 6384 } 6385 #endif /* CONFIG_CONTIG_ALLOC */ 6386 6387 void free_contig_range(unsigned long pfn, unsigned long nr_pages) 6388 { 6389 unsigned long count = 0; 6390 6391 for (; nr_pages--; pfn++) { 6392 struct page *page = pfn_to_page(pfn); 6393 6394 count += page_count(page) != 1; 6395 __free_page(page); 6396 } 6397 WARN(count != 0, "%lu pages are still in use!\n", count); 6398 } 6399 EXPORT_SYMBOL(free_contig_range); 6400 6401 /* 6402 * Effectively disable pcplists for the zone by setting the high limit to 0 6403 * and draining all cpus. A concurrent page freeing on another CPU that's about 6404 * to put the page on pcplist will either finish before the drain and the page 6405 * will be drained, or observe the new high limit and skip the pcplist. 6406 * 6407 * Must be paired with a call to zone_pcp_enable(). 6408 */ 6409 void zone_pcp_disable(struct zone *zone) 6410 { 6411 mutex_lock(&pcp_batch_high_lock); 6412 __zone_set_pageset_high_and_batch(zone, 0, 1); 6413 __drain_all_pages(zone, true); 6414 } 6415 6416 void zone_pcp_enable(struct zone *zone) 6417 { 6418 __zone_set_pageset_high_and_batch(zone, zone->pageset_high, zone->pageset_batch); 6419 mutex_unlock(&pcp_batch_high_lock); 6420 } 6421 6422 void zone_pcp_reset(struct zone *zone) 6423 { 6424 int cpu; 6425 struct per_cpu_zonestat *pzstats; 6426 6427 if (zone->per_cpu_pageset != &boot_pageset) { 6428 for_each_online_cpu(cpu) { 6429 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); 6430 drain_zonestat(zone, pzstats); 6431 } 6432 free_percpu(zone->per_cpu_pageset); 6433 zone->per_cpu_pageset = &boot_pageset; 6434 if (zone->per_cpu_zonestats != &boot_zonestats) { 6435 free_percpu(zone->per_cpu_zonestats); 6436 zone->per_cpu_zonestats = &boot_zonestats; 6437 } 6438 } 6439 } 6440 6441 #ifdef CONFIG_MEMORY_HOTREMOVE 6442 /* 6443 * All pages in the range must be in a single zone, must not contain holes, 6444 * must span full sections, and must be isolated before calling this function. 6445 */ 6446 void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 6447 { 6448 unsigned long pfn = start_pfn; 6449 struct page *page; 6450 struct zone *zone; 6451 unsigned int order; 6452 unsigned long flags; 6453 6454 offline_mem_sections(pfn, end_pfn); 6455 zone = page_zone(pfn_to_page(pfn)); 6456 spin_lock_irqsave(&zone->lock, flags); 6457 while (pfn < end_pfn) { 6458 page = pfn_to_page(pfn); 6459 /* 6460 * The HWPoisoned page may be not in buddy system, and 6461 * page_count() is not 0. 6462 */ 6463 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { 6464 pfn++; 6465 continue; 6466 } 6467 /* 6468 * At this point all remaining PageOffline() pages have a 6469 * reference count of 0 and can simply be skipped. 6470 */ 6471 if (PageOffline(page)) { 6472 BUG_ON(page_count(page)); 6473 BUG_ON(PageBuddy(page)); 6474 pfn++; 6475 continue; 6476 } 6477 6478 BUG_ON(page_count(page)); 6479 BUG_ON(!PageBuddy(page)); 6480 order = buddy_order(page); 6481 del_page_from_free_list(page, zone, order); 6482 pfn += (1 << order); 6483 } 6484 spin_unlock_irqrestore(&zone->lock, flags); 6485 } 6486 #endif 6487 6488 /* 6489 * This function returns a stable result only if called under zone lock. 
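 *
 * The loop below probes each possible buddy head by masking off the low
 * pfn bits at every order; e.g. (illustrative) for pfn 0x1234 at order 4
 * the candidate head is pfn 0x1230. The page is reported free when some
 * candidate is PageBuddy() with a buddy order large enough to cover the
 * original pfn.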
6490 */ 6491 bool is_free_buddy_page(struct page *page) 6492 { 6493 unsigned long pfn = page_to_pfn(page); 6494 unsigned int order; 6495 6496 for (order = 0; order <= MAX_ORDER; order++) { 6497 struct page *page_head = page - (pfn & ((1 << order) - 1)); 6498 6499 if (PageBuddy(page_head) && 6500 buddy_order_unsafe(page_head) >= order) 6501 break; 6502 } 6503 6504 return order <= MAX_ORDER; 6505 } 6506 EXPORT_SYMBOL(is_free_buddy_page); 6507 6508 #ifdef CONFIG_MEMORY_FAILURE 6509 /* 6510 * Break down a higher-order page in sub-pages, and keep our target out of 6511 * buddy allocator. 6512 */ 6513 static void break_down_buddy_pages(struct zone *zone, struct page *page, 6514 struct page *target, int low, int high, 6515 int migratetype) 6516 { 6517 unsigned long size = 1 << high; 6518 struct page *current_buddy, *next_page; 6519 6520 while (high > low) { 6521 high--; 6522 size >>= 1; 6523 6524 if (target >= &page[size]) { 6525 next_page = page + size; 6526 current_buddy = page; 6527 } else { 6528 next_page = page; 6529 current_buddy = page + size; 6530 } 6531 6532 if (set_page_guard(zone, current_buddy, high, migratetype)) 6533 continue; 6534 6535 if (current_buddy != target) { 6536 add_to_free_list(current_buddy, zone, high, migratetype); 6537 set_buddy_order(current_buddy, high); 6538 page = next_page; 6539 } 6540 } 6541 } 6542 6543 /* 6544 * Take a page that will be marked as poisoned off the buddy allocator. 6545 */ 6546 bool take_page_off_buddy(struct page *page) 6547 { 6548 struct zone *zone = page_zone(page); 6549 unsigned long pfn = page_to_pfn(page); 6550 unsigned long flags; 6551 unsigned int order; 6552 bool ret = false; 6553 6554 spin_lock_irqsave(&zone->lock, flags); 6555 for (order = 0; order <= MAX_ORDER; order++) { 6556 struct page *page_head = page - (pfn & ((1 << order) - 1)); 6557 int page_order = buddy_order(page_head); 6558 6559 if (PageBuddy(page_head) && page_order >= order) { 6560 unsigned long pfn_head = page_to_pfn(page_head); 6561 int migratetype = get_pfnblock_migratetype(page_head, 6562 pfn_head); 6563 6564 del_page_from_free_list(page_head, zone, page_order); 6565 break_down_buddy_pages(zone, page_head, page, 0, 6566 page_order, migratetype); 6567 SetPageHWPoisonTakenOff(page); 6568 if (!is_migrate_isolate(migratetype)) 6569 __mod_zone_freepage_state(zone, -1, migratetype); 6570 ret = true; 6571 break; 6572 } 6573 if (page_count(page_head) > 0) 6574 break; 6575 } 6576 spin_unlock_irqrestore(&zone->lock, flags); 6577 return ret; 6578 } 6579 6580 /* 6581 * Cancel takeoff done by take_page_off_buddy(). 
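 *
 * That is: drop a reference and, if it was the last one, clear the
 * TakenOff state and hand the order-0 page back to the buddy allocator
 * via __free_one_page(); the return value reports whether the HWPoison
 * flag was still set and got cleared.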
6582 */ 6583 bool put_page_back_buddy(struct page *page) 6584 { 6585 struct zone *zone = page_zone(page); 6586 unsigned long pfn = page_to_pfn(page); 6587 unsigned long flags; 6588 int migratetype = get_pfnblock_migratetype(page, pfn); 6589 bool ret = false; 6590 6591 spin_lock_irqsave(&zone->lock, flags); 6592 if (put_page_testzero(page)) { 6593 ClearPageHWPoisonTakenOff(page); 6594 __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE); 6595 if (TestClearPageHWPoison(page)) { 6596 ret = true; 6597 } 6598 } 6599 spin_unlock_irqrestore(&zone->lock, flags); 6600 6601 return ret; 6602 } 6603 #endif 6604 6605 #ifdef CONFIG_ZONE_DMA 6606 bool has_managed_dma(void) 6607 { 6608 struct pglist_data *pgdat; 6609 6610 for_each_online_pgdat(pgdat) { 6611 struct zone *zone = &pgdat->node_zones[ZONE_DMA]; 6612 6613 if (managed_zone(zone)) 6614 return true; 6615 } 6616 return false; 6617 } 6618 #endif /* CONFIG_ZONE_DMA */ 6619 6620 #ifdef CONFIG_UNACCEPTED_MEMORY 6621 6622 /* Counts number of zones with unaccepted pages. */ 6623 static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages); 6624 6625 static bool lazy_accept = true; 6626 6627 static int __init accept_memory_parse(char *p) 6628 { 6629 if (!strcmp(p, "lazy")) { 6630 lazy_accept = true; 6631 return 0; 6632 } else if (!strcmp(p, "eager")) { 6633 lazy_accept = false; 6634 return 0; 6635 } else { 6636 return -EINVAL; 6637 } 6638 } 6639 early_param("accept_memory", accept_memory_parse); 6640 6641 static bool page_contains_unaccepted(struct page *page, unsigned int order) 6642 { 6643 phys_addr_t start = page_to_phys(page); 6644 phys_addr_t end = start + (PAGE_SIZE << order); 6645 6646 return range_contains_unaccepted_memory(start, end); 6647 } 6648 6649 static void accept_page(struct page *page, unsigned int order) 6650 { 6651 phys_addr_t start = page_to_phys(page); 6652 6653 accept_memory(start, start + (PAGE_SIZE << order)); 6654 } 6655 6656 static bool try_to_accept_memory_one(struct zone *zone) 6657 { 6658 unsigned long flags; 6659 struct page *page; 6660 bool last; 6661 6662 if (list_empty(&zone->unaccepted_pages)) 6663 return false; 6664 6665 spin_lock_irqsave(&zone->lock, flags); 6666 page = list_first_entry_or_null(&zone->unaccepted_pages, 6667 struct page, lru); 6668 if (!page) { 6669 spin_unlock_irqrestore(&zone->lock, flags); 6670 return false; 6671 } 6672 6673 list_del(&page->lru); 6674 last = list_empty(&zone->unaccepted_pages); 6675 6676 __mod_zone_freepage_state(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); 6677 __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES); 6678 spin_unlock_irqrestore(&zone->lock, flags); 6679 6680 accept_page(page, MAX_ORDER); 6681 6682 __free_pages_ok(page, MAX_ORDER, FPI_TO_TAIL); 6683 6684 if (last) 6685 static_branch_dec(&zones_with_unaccepted_pages); 6686 6687 return true; 6688 } 6689 6690 static bool try_to_accept_memory(struct zone *zone, unsigned int order) 6691 { 6692 long to_accept; 6693 int ret = false; 6694 6695 /* How much to accept to get to high watermark? 
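 * Roughly: to_accept = high_wmark - (free - unusable), so MAX_ORDER
 * chunks keep being accepted until the usable free pages reach the high
 * watermark; each successful acceptance below converts
 * MAX_ORDER_NR_PAGES from unaccepted to regular free pages.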
*/ 6696 to_accept = high_wmark_pages(zone) - 6697 (zone_page_state(zone, NR_FREE_PAGES) - 6698 __zone_watermark_unusable_free(zone, order, 0)); 6699 6700 /* Accept at least one page */ 6701 do { 6702 if (!try_to_accept_memory_one(zone)) 6703 break; 6704 ret = true; 6705 to_accept -= MAX_ORDER_NR_PAGES; 6706 } while (to_accept > 0); 6707 6708 return ret; 6709 } 6710 6711 static inline bool has_unaccepted_memory(void) 6712 { 6713 return static_branch_unlikely(&zones_with_unaccepted_pages); 6714 } 6715 6716 static bool __free_unaccepted(struct page *page) 6717 { 6718 struct zone *zone = page_zone(page); 6719 unsigned long flags; 6720 bool first = false; 6721 6722 if (!lazy_accept) 6723 return false; 6724 6725 spin_lock_irqsave(&zone->lock, flags); 6726 first = list_empty(&zone->unaccepted_pages); 6727 list_add_tail(&page->lru, &zone->unaccepted_pages); 6728 __mod_zone_freepage_state(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); 6729 __mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES); 6730 spin_unlock_irqrestore(&zone->lock, flags); 6731 6732 if (first) 6733 static_branch_inc(&zones_with_unaccepted_pages); 6734 6735 return true; 6736 } 6737 6738 #else 6739 6740 static bool page_contains_unaccepted(struct page *page, unsigned int order) 6741 { 6742 return false; 6743 } 6744 6745 static void accept_page(struct page *page, unsigned int order) 6746 { 6747 } 6748 6749 static bool try_to_accept_memory(struct zone *zone, unsigned int order) 6750 { 6751 return false; 6752 } 6753 6754 static inline bool has_unaccepted_memory(void) 6755 { 6756 return false; 6757 } 6758 6759 static bool __free_unaccepted(struct page *page) 6760 { 6761 BUILD_BUG(); 6762 return false; 6763 } 6764 6765 #endif /* CONFIG_UNACCEPTED_MEMORY */ 6766
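/*
 * Illustrative usage sketch (editorial example, not code used elsewhere
 * in this file): with CONFIG_CONTIG_ALLOC enabled, a caller needing a
 * large physically contiguous buffer could combine the interfaces
 * documented above roughly as follows; nr_pages and nid are assumed to
 * be supplied by the caller:
 *
 *	struct page *pages;
 *
 *	pages = alloc_contig_pages(nr_pages, GFP_KERNEL | __GFP_NOWARN,
 *				   nid, NULL);
 *	if (!pages)
 *		return -ENOMEM;
 *	... use pfns [page_to_pfn(pages), page_to_pfn(pages) + nr_pages) ...
 *	free_contig_range(page_to_pfn(pages), nr_pages);
 */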