Lines Matching +full:no +full:- +full:high +full:- +full:z

1 // SPDX-License-Identifier: GPL-2.0-only
39 #include <linux/fault-inject.h>
63 /* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
66 /* No special request */
72 * reporting it and marking it "reported" - it only skips notifying
81 * page shuffling (relevant code - e.g., memory onlining - is expected to
84 * Note: No code should rely on this flag for correctness - it's purely
91 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
104 /* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
115 * interfered with and a high priority task cannot preempt the allocator.
126 * Generic helper to look up and lock a per-cpu variable with an embedded spinlock.
134 spin_lock(&_ret->member); \
143 if (!spin_trylock(&_ret->member)) { \
152 spin_unlock(&ptr->member); \
219 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
220 * 1G machine -> (16M dma, 784M normal, 224M high)
225 * TBD: should special case ZONE_DMA32 machines here - in those we normally
273 int user_min_free_kbytes = -1;
296 * During boot we initialize deferred pages on-demand, as needed, but once
337 return page_zone(page)->pageblock_flags;
344 pfn &= (PAGES_PER_SECTION-1);
346 pfn = pfn - pageblock_start_pfn(page_zone(page)->zone_start_pfn);
352 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
369 bitidx &= (BITS_PER_LONG-1);
386 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
406 bitidx &= (BITS_PER_LONG-1);
438 start_pfn = zone->zone_start_pfn;
439 sp = zone->spanned_pages;
444 pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
445 pfn, zone_to_nid(zone), zone->name,
497 current->comm, page_to_pfn(page));
554 * Higher-order pages are called "compound pages". They are structured thusly:
559 * in bit 0 of page->compound_head. The rest of the bits is a pointer to the head page.
561 * The first tail page's ->compound_order holds the order of allocation.
562 * This usage means that zero-order pages may not be compound.
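The tail-page encoding described in lines 554-562 can be shown in a few lines. Below is a minimal userspace sketch, not the kernel's own compound_head() helper; the struct and function names are invented for illustration only.

#include <stdint.h>
#include <stdio.h>

struct fake_page {
        uintptr_t compound_head;        /* bit 0 set => this is a tail page */
};

/* Recover the head page from a possibly-tail page, per the encoding above. */
static struct fake_page *head_of(struct fake_page *page)
{
        if (page->compound_head & 1)    /* tail page: strip the tag bit */
                return (struct fake_page *)(page->compound_head - 1);
        return page;                    /* head or non-compound page */
}

int main(void)
{
        struct fake_page head = { 0 };
        struct fake_page tail = { (uintptr_t)&head | 1 };

        printf("head_of(tail) == &head? %d\n", head_of(&tail) == &head);
        return 0;
}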
586 struct capture_control *capc = current->capture_control;
589 !(current->flags & PF_KTHREAD) &&
590 !capc->page &&
591 capc->cc->zone == zone ? capc : NULL;
598 if (!capc || order != capc->cc->order)
610 * and vice-versa but no more than normal fallback logic which can
611 * have trouble finding a high-order free page.
614 capc->cc->migratetype != MIGRATE_MOVABLE)
617 capc->page = page;
638 lockdep_assert_held(&zone->lock);
648 WRITE_ONCE(zone->nr_free_highatomic,
649 zone->nr_free_highatomic + nr_pages);
657 struct free_area *area = &zone->free_area[order];
664 list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
666 list_add(&page->buddy_list, &area->free_list[migratetype]);
667 area->nr_free++;
672 * of the list - so the moved pages won't immediately be considered for
678 struct free_area *area = &zone->free_area[order];
685 list_move_tail(&page->buddy_list, &area->free_list[new_mt]);
687 account_freepages(zone, -(1 << order), old_mt);
702 list_del(&page->buddy_list);
705 zone->free_area[order].nr_free--;
712 account_freepages(zone, -(1 << order), migratetype);
718 return list_first_entry_or_null(&area->free_list[migratetype],
724 * of the next-higher order is free. If it is, it's possible
728 * as a 2-level higher order page
737 if (order >= MAX_PAGE_ORDER - 1)
741 higher_page = page + (higher_page_pfn - pfn);
750 * The concept of a buddy system is to maintain a direct-mapped table
755 * At a high level, all that happens here is marking the table entry
768 * -- nyc
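As a concrete illustration of the buddy bookkeeping sketched above (hypothetical pfn and order, plain C, not kernel code): a block's buddy is found by flipping bit 'order' of its pfn, and the merged block starts at the bitwise AND of the two pfns, the same arithmetic visible in the combined_pfn adjustment at line 837 below.

#include <stdio.h>

int main(void)
{
        unsigned long pfn = 0x1000;     /* hypothetical order-3 aligned block */
        unsigned int order = 3;

        unsigned long buddy_pfn = pfn ^ (1UL << order);    /* 0x1008 */
        unsigned long combined_pfn = buddy_pfn & pfn;      /* 0x1000: merged block start */

        printf("buddy of %#lx at order %u: %#lx, merged block starts at %#lx\n",
               pfn, order, buddy_pfn, combined_pfn);
        return 0;
}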
783 VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
785 VM_BUG_ON(migratetype == -1);
786 VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
795 account_freepages(zone, -(1 << order), migratetype);
830 * expand() down the line puts the sub-blocks
837 page = page + (combined_pfn - pfn);
867 if (unlikely(atomic_read(&page->_mapcount) != -1))
870 if (unlikely((unsigned long)page->mapping |
873 page->memcg_data |
876 ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) |
878 (page->flags & check_flags)))
888 if (unlikely(atomic_read(&page->_mapcount) != -1))
890 if (unlikely(page->mapping != NULL))
891 bad_reason = "non-NULL mapping";
894 if (unlikely(page->flags & flags)) {
901 if (unlikely(page->memcg_data))
905 if (unlikely((page->pp_magic & ~0x3UL) == PP_SIGNATURE))
938 * We rely on page->lru.next never having bit 0 set, unless the page
939 * is PageTail(). Let's make sure that's true even for poisoned ->lru.
947 switch (page - head_page) {
949 /* the first tail page: these may be in place of ->mapping */
958 if (unlikely(atomic_read(&folio->_nr_pages_mapped))) {
962 if (unlikely(atomic_read(&folio->_pincount))) {
968 /* the second tail page: deferred_list overlaps ->mapping */
969 if (unlikely(!list_empty(&folio->_deferred_list))) {
975 if (page->mapping != TAIL_MAPPING) {
991 page->mapping = NULL;
1000 * Tag-based KASAN modes skip pages freed via deferred memory initialization
1002 * 2. For tag-based KASAN modes: the page has a match-all KASAN tag, indicating
1005 * Pages will have match-all tags in the following circumstances:
1018 * Assuming that there will be no reference to those newly initialized
1019 * pages before they are ever allocated, this should have no effect on
1022 * on-demand allocation and then freed again before the deferred pages
1071 zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages);
1094 * avoid checking PageCompound for order-0 pages.
1110 (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1115 mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
1116 page->mapping = NULL;
1126 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1145 * With hardware tag-based KASAN, memory tags must be set before the
1187 count = min(pcp->count, count);
1190 pindex = pindex - 1;
1192 spin_lock_irqsave(&zone->lock, flags);
1198 /* Remove pages from lists in a round-robin fashion. */
1200 if (++pindex > NR_PCP_LISTS - 1)
1202 list = &pcp->lists[pindex];
1216 list_del(&page->pcp_list);
1217 count -= nr_pages;
1218 pcp->count -= nr_pages;
1225 spin_unlock_irqrestore(&zone->lock, flags);
1228 /* Split a multi-block free page into its individual pageblocks. */
1258 spin_lock_irqsave(&zone->lock, flags);
1260 spin_unlock_irqrestore(&zone->lock, flags);
1306 atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
1338 * Note: the function may return non-NULL struct page even for a page block
1339 * which contains a memory hole (i.e. there is no physical memory for a subset
1341 * will fall into 2 sub-sections, and the end pfn of the pageblock may be a hole
1354 end_pfn--;
1387 * -- nyc
1390 int high, int migratetype)
1392 unsigned int size = 1 << high;
1395 while (high > low) {
1396 high--;
1406 if (set_page_guard(zone, &page[size], high))
1409 __add_to_free_list(&page[size], zone, high, migratetype, false);
1410 set_buddy_order(&page[size], high);
1419 int high, int migratetype)
1421 int nr_pages = 1 << high;
1423 __del_page_from_free_list(page, zone, high, migratetype);
1424 nr_pages -= expand(zone, page, low, high, migratetype);
1425 account_freepages(zone, -nr_pages, migratetype);
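The expand() fragments above (lines 1390-1425) implement the classic halving loop: handing out an order-'low' chunk from an order-'high' block peels off the upper half on each pass and returns it to the free list one order lower. A standalone illustration of the offsets involved, not kernel code:

#include <stdio.h>

static void expand_demo(unsigned int low, unsigned int high)
{
        unsigned long size = 1UL << high;

        while (high > low) {
                high--;
                size >>= 1;
                /* page[size] becomes the head of a free block of order 'high' */
                printf("free sub-block at page offset %lu, order %u\n", size, high);
        }
        /* pages [0, 1 << low) are what the caller keeps */
}

int main(void)
{
        expand_demo(2, 5);      /* prints offsets 16, 8, 4 at orders 4, 3, 2 */
        return 0;
}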
1430 if (unlikely(page->flags & __PG_HWPOISON)) {
1475 /* Skip, if hardware tag-based KASAN is not enabled. */
1480 * With hardware tag-based KASAN enabled, skip if this has been
1488 /* Don't skip, if hardware tag-based KASAN is not enabled. */
1492 /* For hardware tag-based KASAN, skip if requested. */
1590 area = &(zone->free_area[current_order]);
1613 static int fallbacks[MIGRATE_PCPTYPES][MIGRATE_PCPTYPES - 1] = {
1642 VM_WARN_ON(start_pfn & (pageblock_nr_pages - 1));
1688 if (!zone_spans_pfn(zone, end - 1))
1725 return -1;
1756 * move_freepages_block_isolate - move free pages in block for page isolation
1782 /* No splits needed if buddies can't span multiple blocks */
1820 int nr_pageblocks = 1 << (start_order - pageblock_order);
1822 while (nr_pageblocks--) {
1884 max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
1888 * high watermark may be uninitialised if fragmentation occurs
1900 zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
1949 set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
1970 * to MOVABLE pageblock, consider all non-movable pages as
1973 * exact migratetype of non-movable pages.
1977 - (free_pages + movable_pages);
1985 if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
2008 if (area->nr_free == 0)
2009 return -1;
2012 for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) {
2027 return -1;
2032 * exclusive use of high-order atomic allocations if there are no
2045 * Check is race-prone but harmless.
2050 if (zone->nr_reserved_highatomic >= max_managed)
2053 spin_lock_irqsave(&zone->lock, flags);
2056 if (zone->nr_reserved_highatomic >= max_managed)
2066 if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1)
2068 zone->nr_reserved_highatomic += pageblock_nr_pages;
2071 zone->nr_reserved_highatomic += 1 << order;
2075 spin_unlock_irqrestore(&zone->lock, flags);
2080 * potentially hurts the reliability of high-order allocations when under
2090 struct zonelist *zonelist = ac->zonelist;
2092 struct zoneref *z;
2098 for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
2099 ac->nodemask) {
2102 * is really high.
2104 if (!force && zone->nr_reserved_highatomic <=
2108 spin_lock_irqsave(&zone->lock, flags);
2110 struct free_area *area = &(zone->free_area[order]);
2122 * from highatomic to ac->migratetype. So we should
2129 * locking could inadvertently allow a per-cpu
2135 size = min(size, zone->nr_reserved_highatomic);
2136 zone->nr_reserved_highatomic -= size;
2140 * Convert to ac->migratetype and avoid the normal
2150 ac->migratetype);
2153 ac->migratetype);
2155 ac->migratetype);
2162 WARN_ON_ONCE(ret == -1);
2164 spin_unlock_irqrestore(&zone->lock, flags);
2168 spin_unlock_irqrestore(&zone->lock, flags);
2197 * i.e. orders < pageblock_order. If there are no local zones free,
2209 --current_order) {
2210 area = &(zone->free_area[current_order]);
2213 if (fallback_mt == -1)
2235 area = &(zone->free_area[current_order]);
2238 if (fallback_mt != -1)
2243 * This should not happen - we already found a suitable fallback
2263 * Call me with the zone->lock already held.
2310 spin_lock_irqsave(&zone->lock, flags);
2327 list_add_tail(&page->pcp_list, list);
2329 spin_unlock_irqrestore(&zone->lock, flags);
2335 * Called from the vmstat counter updater to decay the PCP high.
2343 high_min = READ_ONCE(pcp->high_min);
2344 batch = READ_ONCE(pcp->batch);
2346 * Decrease pcp->high periodically to try to free possible
2348 * control latency. This caps pcp->high decrement too.
2350 if (pcp->high > high_min) {
2351 pcp->high = max3(pcp->count - (batch << CONFIG_PCP_BATCH_SCALE_MAX),
2352 pcp->high - (pcp->high >> 3), high_min);
2353 if (pcp->high > high_min)
2357 to_drain = pcp->count - pcp->high;
2359 spin_lock(&pcp->lock);
2361 spin_unlock(&pcp->lock);
2378 batch = READ_ONCE(pcp->batch);
2379 to_drain = min(pcp->count, batch);
2381 spin_lock(&pcp->lock);
2383 spin_unlock(&pcp->lock);
2393 struct per_cpu_pages *pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
2397 spin_lock(&pcp->lock);
2398 count = pcp->count;
2401 pcp->batch << CONFIG_PCP_BATCH_SCALE_MAX);
2404 count -= to_drain;
2406 spin_unlock(&pcp->lock);
2423 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
2440 * not empty. The check for non-emptiness can however race with a free to
2441 * pcplist that has not yet increased the pcp->count from 0 to 1. Callers
2474 struct zone *z;
2480 * guarantee that no cpu is missed.
2484 pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
2485 if (pcp->count)
2488 for_each_populated_zone(z) {
2489 pcp = per_cpu_ptr(z->per_cpu_pageset, cpu);
2490 if (pcp->count) {
2514 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
2516 * When zone parameter is non-NULL, spill just the single zone's pages.
2523 static int nr_pcp_free(struct per_cpu_pages *pcp, int batch, int high, bool free_high)
2527 /* Free as much as possible if batch freeing high-order pages. */
2529 return min(pcp->count, batch << CONFIG_PCP_BATCH_SCALE_MAX);
2532 if (unlikely(high < batch))
2535 /* Leave at least pcp->batch pages on the list */
2537 max_nr_free = high - batch;
2543 batch = clamp_t(int, pcp->free_count, min_nr_free, max_nr_free);
2551 int high, high_min, high_max;
2553 high_min = READ_ONCE(pcp->high_min);
2554 high_max = READ_ONCE(pcp->high_max);
2555 high = pcp->high = clamp(pcp->high, high_min, high_max);
2557 if (unlikely(!high))
2561 pcp->high = max(high - (batch << CONFIG_PCP_BATCH_SCALE_MAX),
2570 if (test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) {
2571 int free_count = max_t(int, pcp->free_count, batch);
2573 pcp->high = max(high - free_count, high_min);
2574 return min(batch << 2, pcp->high);
2578 return high;
2580 if (test_bit(ZONE_BELOW_HIGH, &zone->flags)) {
2581 int free_count = max_t(int, pcp->free_count, batch);
2583 pcp->high = max(high - free_count, high_min);
2584 high = max(pcp->count, high_min);
2585 } else if (pcp->count >= high) {
2586 int need_high = pcp->free_count + batch;
2588 /* pcp->high should be large enough to hold batch freed pages */
2589 if (pcp->high < need_high)
2590 pcp->high = clamp(need_high, high_min, high_max);
2593 return high;
2600 int high, batch;
2609 pcp->alloc_factor >>= 1;
2612 list_add(&page->pcp_list, &pcp->lists[pindex]);
2613 pcp->count += 1 << order;
2615 batch = READ_ONCE(pcp->batch);
2617 * As high-order pages other than THP's stored on PCP can contribute
2623 free_high = (pcp->free_count >= batch &&
2624 (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) &&
2625 (!(pcp->flags & PCPF_FREE_HIGH_BATCH) ||
2626 pcp->count >= READ_ONCE(batch)));
2627 pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER;
2628 } else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) {
2629 pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER;
2631 if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX))
2632 pcp->free_count += (1 << order);
2633 high = nr_pcp_high(pcp, zone, batch, free_high);
2634 if (pcp->count >= high) {
2635 free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
2637 if (test_bit(ZONE_BELOW_HIGH, &zone->flags) &&
2640 clear_bit(ZONE_BELOW_HIGH, &zone->flags);
2681 pcp = pcp_spin_trylock(zone->per_cpu_pageset);
2702 for (i = 0, j = 0; i < folios->nr; i++) {
2703 struct folio *folio = folios->folios[i];
2707 if (!free_pages_prepare(&folio->page, order))
2714 free_one_page(folio_zone(folio), &folio->page,
2718 folio->private = (void *)(unsigned long)order;
2720 folios->folios[j] = folio;
2723 folios->nr = j;
2725 for (i = 0; i < folios->nr; i++) {
2726 struct folio *folio = folios->folios[i];
2729 unsigned int order = (unsigned long)folio->private;
2732 folio->private = NULL;
2733 migratetype = get_pfnblock_migratetype(&folio->page, pfn);
2750 free_one_page(zone, &folio->page, pfn,
2760 pcp = pcp_spin_trylock(zone->per_cpu_pageset);
2763 free_one_page(zone, &folio->page, pfn,
2771 * Non-isolated types over MIGRATE_PCPTYPES get added
2777 trace_mm_page_free_batched(&folio->page);
2778 free_frozen_page_commit(zone, pcp, &folio->page, migratetype,
2790 * split_page takes a non-compound higher-order page, and splits it into
2791 * n (1<<order) sub-pages: page[0..n-1]
2792 * Each sub-page must be freed individually.
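Given the contract spelled out above, a typical caller looks roughly like the sketch below (assuming a kernel context; demo_split() is a hypothetical helper): allocate a non-compound order-2 block, split it, then release each base page on its own.

static int demo_split(void)
{
        struct page *page = alloc_pages(GFP_KERNEL, 2); /* 4 contiguous pages */
        int i;

        if (!page)
                return -ENOMEM;

        split_page(page, 2);            /* now four independent order-0 pages */
        for (i = 0; i < 4; i++)
                __free_page(page + i);  /* each sub-page freed individually */
        return 0;
}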
2821 * emulate a high-order watermark check with a raised order-0
2822 * watermark, because we already know our high-order page
2825 watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
2836 if (order >= pageblock_order - 1) {
2837 struct page *endpage = page + (1 << order) - 1;
2854 * __putback_isolated_page - Return a now-isolated page back where we got it
2867 lockdep_assert_held(&zone->lock);
2877 static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
2887 if (zone_to_nid(z) != numa_node_id())
2890 if (zone_to_nid(z) == zone_to_nid(preferred_zone))
2891 __count_numa_events(z, NUMA_HIT, nr_account);
2893 __count_numa_events(z, NUMA_MISS, nr_account);
2896 __count_numa_events(z, local_stat, nr_account);
2910 spin_lock_irqsave(&zone->lock, flags);
2918 * order-0 (atomic) allocs access to HIGHATOMIC
2920 * high-order atomic allocation in the future.
2926 spin_unlock_irqrestore(&zone->lock, flags);
2930 spin_unlock_irqrestore(&zone->lock, flags);
2941 int high, base_batch, batch, max_nr_alloc;
2944 base_batch = READ_ONCE(pcp->batch);
2945 high_min = READ_ONCE(pcp->high_min);
2946 high_max = READ_ONCE(pcp->high_max);
2947 high = pcp->high = clamp(pcp->high, high_min, high_max);
2950 if (unlikely(high < base_batch))
2956 batch = (base_batch << pcp->alloc_factor);
2959 * If we had a larger pcp->high, we could avoid allocating from
2962 if (high_min != high_max && !test_bit(ZONE_BELOW_HIGH, &zone->flags))
2963 high = pcp->high = min(high + batch, high_max);
2966 max_nr_alloc = max(high - pcp->count - base_batch, base_batch);
2969 * subsequent allocation of order-0 pages without any freeing.
2972 pcp->alloc_factor < CONFIG_PCP_BATCH_SCALE_MAX)
2973 pcp->alloc_factor++;
2989 /* Remove page from the per-cpu list, caller must protect the list */
3008 pcp->count += alloced << order;
3014 list_del(&page->pcp_list);
3015 pcp->count -= 1 << order;
3021 /* Lock and remove page from the per-cpu list */
3033 pcp = pcp_spin_trylock(zone->per_cpu_pageset);
3044 pcp->free_count >>= 1;
3045 list = &pcp->lists[order_to_pindex(migratetype, order)];
3058 * Use pcplists for THP or "cheap" high-order allocations.
3089 unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) {
3090 clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
3098 static inline long __zone_watermark_unusable_free(struct zone *z,
3101 long unusable_free = (1 << order) - 1;
3108 unusable_free += READ_ONCE(z->nr_free_highatomic);
3113 unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
3120 * Return true if free base pages are above 'mark'. For high-order checks it
3121 * will return true if the order-0 watermark is reached and there is at least
3123 * to check in the allocation paths if no pages are free.
3125 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3132 /* free_pages may go negative - that's OK */
3133 free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
3141 min -= min / 2;
3144 * Non-blocking allocations (e.g. GFP_ATOMIC) can
3146 * non-blocking allocation requests such as GFP_NOWAIT
3151 min -= min / 4;
3158 * makes during the free path will be small and short-lived.
3161 min -= min / 2;
3165 * Check watermarks for an order-0 allocation request. If these
3166 * are not met, then a high-order request also cannot go ahead
3169 if (free_pages <= min + z->lowmem_reserve[highest_zoneidx])
3172 /* If this is an order-0 request then the watermark is fine */
3176 /* For a high-order request, check at least one suitable page is free */
3178 struct free_area *area = &z->free_area[o];
3181 if (!area->nr_free)
3203 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3206 return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
3207 zone_page_state(z, NR_FREE_PAGES));
3210 static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
3216 free_pages = zone_page_state(z, NR_FREE_PAGES);
3219 * Fast check for order-0 only. If this fails then the reserves
3227 reserved = __zone_watermark_unusable_free(z, 0, alloc_flags);
3229 /* reserved may overestimate high-atomic reserves. */
3230 usable_free -= min(usable_free, reserved);
3231 if (usable_free > mark + z->lowmem_reserve[highest_zoneidx])
3235 if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
3240 * Ignore watermark boosting for __GFP_HIGH order-0 allocations
3245 if (unlikely(!order && (alloc_flags & ALLOC_MIN_RESERVE) && z->watermark_boost
3247 mark = z->_watermark[WMARK_MIN];
3248 return __zone_watermark_ok(z, order, mark, highest_zoneidx,
3255 bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
3258 long free_pages = zone_page_state(z, NR_FREE_PAGES);
3260 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
3261 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
3263 return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
3310 * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
3313 BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
3314 if (nr_online_nodes > 1 && !populated_zone(--zone))
3341 struct zoneref *z;
3353 z = ac->preferred_zoneref;
3354 for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx,
3355 ac->nodemask) {
3366 * limit, such that no single node holds more than its
3369 * lowmem reserves and high watermark so that kswapd
3374 * exceed the per-node dirty limit in the slowpath
3380 * dirty-throttling and the flusher threads.
3382 if (ac->spread_dirty_pages) {
3383 if (last_pgdat != zone->zone_pgdat) {
3384 last_pgdat = zone->zone_pgdat;
3385 last_pgdat_dirty_ok = node_dirty_ok(zone->zone_pgdat);
3393 zone != zonelist_zone(ac->preferred_zoneref)) {
3401 local_nid = zonelist_node_idx(ac->preferred_zoneref);
3411 * Detect whether the number of free pages is below high
3412 * watermark. If so, we will decrease pcp->high and free
3417 if (test_bit(ZONE_BELOW_HIGH, &zone->flags))
3422 ac->highest_zoneidx, alloc_flags,
3426 set_bit(ZONE_BELOW_HIGH, &zone->flags);
3431 ac->highest_zoneidx, alloc_flags,
3452 !zone_allows_reclaim(zonelist_zone(ac->preferred_zoneref), zone))
3455 ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
3466 ac->highest_zoneidx, alloc_flags))
3474 page = rmqueue(zonelist_zone(ac->preferred_zoneref), zone, order,
3475 gfp_mask, alloc_flags, ac->migratetype);
3480 * If this is a high-order atomic allocation then check
3522 (current->flags & (PF_MEMALLOC | PF_EXITING)))
3545 current->comm, &vaf, gfp_mask, &gfp_mask,
3579 .zonelist = ac->zonelist,
3580 .nodemask = ac->nodemask,
3600 * Go through the zonelist yet one more time, keep very high watermark
3613 if (current->flags & PF_DUMPCORE)
3629 if (ac->highest_zoneidx < ZONE_NORMAL)
3649 * Help non-failing allocations by giving them access to memory
3668 /* Try memory compaction for high-order allocations before reclaim */
3711 zone->compact_blockskip_flush = false;
3747 * Compaction was skipped due to a lack of free order-0
3785 (*compact_priority)--;
3810 struct zoneref *z;
3818 * Let's give them a good hope and keep retrying while the order-0
3821 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
3822 ac->highest_zoneidx, ac->nodemask) {
3824 ac->highest_zoneidx, alloc_flags))
3837 /* no reclaim without waiting on it */
3842 if (current->flags & PF_MEMALLOC)
3929 progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
3930 ac->nodemask);
3960 * pages are pinned on the per-cpu lists or in high alloc reserves.
3978 struct zoneref *z;
3981 enum zone_type highest_zoneidx = ac->highest_zoneidx;
3983 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
3984 ac->nodemask) {
3987 if (last_pgdat != zone->zone_pgdat) {
3989 last_pgdat = zone->zone_pgdat;
4029 * Ignore cpuset mems for non-blocking __GFP_HIGH (probably
4068 if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
4071 if (current->flags & PF_MEMALLOC)
4101 struct zoneref *z;
4106 * their order will become available due to high fragmentation so
4107 * always increment the no progress counter for them
4124 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
4125 ac->highest_zoneidx, ac->nodemask) {
4144 ac->highest_zoneidx, alloc_flags, available);
4145 trace_reclaim_retry_zone(z, order, reclaimable,
4160 if (current->flags & PF_WQ_WORKER)
4181 * This assumes that for all allocations, ac->nodemask can come only
4186 if (cpusets_enabled() && ac->nodemask &&
4187 !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) {
4188 ac->nodemask = NULL;
4227 * allocate greater than order-1 page units with __GFP_NOFAIL.
4240 WARN_ON_ONCE(current->flags & PF_MEMALLOC);
4260 * there was a cpuset modification and we are retrying - otherwise we
4261 * could end up iterating over non-eligible zones endlessly.
4263 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4264 ac->highest_zoneidx, ac->nodemask);
4265 if (!zonelist_zone(ac->preferred_zoneref))
4270 * any suitable zone to satisfy the request - e.g. non-movable
4274 struct zoneref *z = first_zones_zonelist(ac->zonelist,
4275 ac->highest_zoneidx,
4277 if (!zonelist_zone(z))
4294 * that we have enough base pages and don't need to reclaim. For non-
4295 * movable high-order allocations, do that as well, as compaction will
4303 (order > 0 && ac->migratetype != MIGRATE_MOVABLE))
4325 * - potentially very expensive because zones are far
4327 * bursty high order allocations,
4328 * - not guaranteed to help because isolate_freepages()
4331 * - unlikely to make entire pageblocks free on its
4359 * ignored. These allocations are high priority and system rather than
4363 ac->nodemask = NULL;
4364 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4365 ac->highest_zoneidx, ac->nodemask);
4378 if (current->flags & PF_MEMALLOC)
4398 * Do not retry costly high order allocations unless they are
4410 * It doesn't make any sense to retry for the compaction if the order-0
4435 /* Avoid allocations with no watermarks from looping endlessly */
4470 * Help non-failing allocations by giving some access to memory
4471 * reserves normally used for high priority non-blocking
4484 warn_alloc(gfp_mask, ac->nodemask,
4495 ac->highest_zoneidx = gfp_zone(gfp_mask);
4496 ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
4497 ac->nodemask = nodemask;
4498 ac->migratetype = gfp_migratetype(gfp_mask);
4506 if (in_task() && !ac->nodemask)
4507 ac->nodemask = &cpuset_current_mems_allowed;
4520 ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
4527 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4528 ac->highest_zoneidx, ac->nodemask);
4534 * __alloc_pages_bulk - Allocate a number of order-0 pages to an array
4556 struct zoneref *z;
4571 /* No pages requested? */
4576 if (unlikely(nr_pages - nr_populated == 0))
4584 if (nr_pages - nr_populated == 1)
4607 z = ac.preferred_zoneref;
4608 for_next_zone_zonelist_nodemask(zone, z, ac.highest_zoneidx, ac.nodemask) {
4641 * If there are no allowed local zones that meet the watermarks then
4649 pcp = pcp_spin_trylock(zone->per_cpu_pageset);
4654 pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
4748 * &cpuset_current_mems_allowed to optimize the fast-path attempt.
4792 * you need to access high mem.
4812 * __free_pages - Free pages allocated with alloc_pages().
4816 * This function can free multi-page allocations that are not compound
4822 * by put_page() which only frees the first page of a non-compound
4840 pgalloc_tag_sub_pages(tag, (1 << order) - 1);
4841 while (order-- > 0)
4868 while (page < --last)
4879 * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
4885 * allocate memory in power-of-two pages.
4907 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
4933 * free_pages_exact - release memory allocated via alloc_pages_exact()
4952 * nr_free_zone_pages - count number of pages beyond high watermark
4956 * high watermark within all zones at or below a given zone index. For each
4959 * nr_free_zone_pages = managed_pages - high_pages
4961 * Return: number of pages beyond high watermark.
4965 struct zoneref *z;
4973 for_each_zone_zonelist(zone, z, zonelist, offset) {
4975 unsigned long high = high_wmark_pages(zone);
4976 if (size > high)
4977 sum += size - high;
4984 * nr_free_buffer_pages - count number of pages beyond high watermark
4986 * nr_free_buffer_pages() counts the number of pages which are beyond the high
4989 * Return: number of pages beyond high watermark within ZONE_DMA and
5000 zoneref->zone = zone;
5001 zoneref->zone_idx = zone_idx(zone);
5016 zone_type--;
5017 zone = pgdat->node_zones + zone_type;
5039 return -EINVAL;
5060 * find_next_best_node - find the next node that should appear in a given node's fallback list
5069 * with no CPUs, since presumably they'll have very little allocation pressure
5072 * Return: node id of the found node or %NUMA_NO_NODE if no node is found.
5124 * This results in maximum locality--normal zone overflows into local
5125 * DMA zone, if any--but risks exhausting DMA zone.
5133 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
5143 zonerefs->zone = NULL;
5144 zonerefs->zone_idx = 0;
5155 zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
5158 zonerefs->zone = NULL;
5159 zonerefs->zone_idx = 0;
5169 /* NUMA-aware ordering of nodes */
5170 local_node = pgdat->node_id;
5178 * distance group to make it round-robin.
5205 struct zoneref *z;
5207 z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
5210 return zonelist_node_idx(z);
5223 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
5227 zonerefs->zone = NULL;
5228 zonerefs->zone_idx = 0;
5269 * trying to hold port->lock, for
5271 * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held.
5280 * This node is hotadded and no memory is yet present. So just
5281 * building zonelists is fine - no need to touch other nodes.
5283 if (self && !node_online(self->node_id)) {
5298 * We now know the "local memory node" for each node--
5300 * Set up numa_mem percpu variable for on-line cpus. During
5301 * boot, only the boot cpu should be on-line; we'll init the
5302 * secondary cpus' numa_mem as they come on-line. During
5303 * node/memory hotplug, we'll fixup all on-line cpus.
5332 * (a chicken-egg dilemma).
5357 /* Get the number of free pages beyond high watermark in all zones. */
5362 * more accurate, but expensive to check per-zone. This check is
5363 * made on memory-hotadd so a system can start with mobility
5397 * Clamp the batch to a 2^n - 1 value. Having a power
5406 batch = rounddown_pow_of_two(batch + batch/2) - 1;
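Line 5406's clamp snaps the computed batch to a 2^n - 1 value, as the comment at line 5397 explains. A quick userspace check (with a local stand-in for rounddown_pow_of_two and hypothetical batch sizes) of what the expression produces:

#include <stdio.h>

static unsigned long rounddown_pow_of_two(unsigned long n)
{
        unsigned long p = 1;

        while (p * 2 <= n)
                p *= 2;
        return p;
}

int main(void)
{
        unsigned long samples[] = { 7, 12, 31, 100 };

        for (int i = 0; i < 4; i++) {
                unsigned long b = samples[i];

                /* prints 7 -> 7, 12 -> 15, 31 -> 31, 100 -> 127 */
                printf("%lu -> %lu\n", b, rounddown_pow_of_two(b + b / 2) - 1);
        }
        return 0;
}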
5415 * of contiguous memory as there's no hardware page translation to
5422 * fragmented and becoming unavailable for high-order allocations.
5433 int high;
5439 * By default, the high value of the pcp is based on the zone
5446 * If percpu_pagelist_high_fraction is configured, the high
5454 * Split the high value across all online CPUs local to the zone. Note
5457 * onlined. For memory nodes that have no CPUs, split the high value
5464 high = total_pages / nr_split_cpus;
5467 * Ensure high is at least batch*4. The multiple is based on the
5468 * historical relationship between high and batch.
5470 high = max(high, batch << 2);
5472 return high;
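Pulling the surrounding fragments together (lines 5433-5472), the default sizing amounts to: take the zone's page budget, divide it across the CPUs that serve the zone's pcplists, and never drop below four batches. A simplified sketch with assumed names, not the kernel function:

static int pcp_high_estimate(unsigned long total_pages, int nr_split_cpus,
                             int batch)
{
        int high;

        if (nr_split_cpus < 1)
                nr_split_cpus = 1;      /* zone with no local CPUs: avoid /0 */

        high = total_pages / nr_split_cpus;     /* per-CPU share of the budget */

        /* keep the historical high >= batch * 4 relationship */
        return high > (batch << 2) ? high : (batch << 2);
}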
5479 * pcp->high and pcp->batch values are related and generally batch is lower
5480 * than high. They are also related to pcp->count such that count is lower
5481 * than high, and as soon as it reaches high, the pcplist is flushed.
5486 * store tearing. Any new users of pcp->batch, pcp->high_min and pcp->high_max
5488 * fully trust only the pcp->count field on the local CPU with interrupts
5492 * outside of boot time (or some other assurance that no concurrent updaters
5498 WRITE_ONCE(pcp->batch, batch);
5499 WRITE_ONCE(pcp->high_min, high_min);
5500 WRITE_ONCE(pcp->high_max, high_max);
5510 spin_lock_init(&pcp->lock);
5512 INIT_LIST_HEAD(&pcp->lists[pindex]);
5515 * Set batch and high values safe for a boot pageset. A true percpu
5520 pcp->high_min = BOOT_PAGESET_HIGH;
5521 pcp->high_max = BOOT_PAGESET_HIGH;
5522 pcp->batch = BOOT_PAGESET_BATCH;
5523 pcp->free_count = 0;
5533 pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
5539 * Calculate and set new high and batch values for all per-cpu pagesets of a
5551 * PCP high is tuned manually, disable auto-tuning via
5561 if (zone->pageset_high_min == new_high_min &&
5562 zone->pageset_high_max == new_high_max &&
5563 zone->pageset_batch == new_batch)
5566 zone->pageset_high_min = new_high_min;
5567 zone->pageset_high_max = new_high_max;
5568 zone->pageset_batch = new_batch;
5580 zone->per_cpu_zonestats = alloc_percpu(struct per_cpu_zonestat);
5582 zone->per_cpu_pageset = alloc_percpu(struct per_cpu_pages);
5587 pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
5588 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
5597 * page high values need to be recalculated.
5611 pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
5614 * If data cache slice of CPU is large enough, "pcp->batch"
5616 * consecutive high-order pages freeing without allocation.
5618 * cache-hot pages sharing.
5620 spin_lock(&pcp->lock);
5621 if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch)
5622 pcp->flags |= PCPF_FREE_HIGH_BATCH;
5624 pcp->flags &= ~PCPF_FREE_HIGH_BATCH;
5625 spin_unlock(&pcp->lock);
5658 memset(pzstats->vm_numa_event, 0,
5659 sizeof(pzstats->vm_numa_event));
5664 pgdat->per_cpu_nodestats =
5675 zone->per_cpu_pageset = &boot_pageset;
5676 zone->per_cpu_zonestats = &boot_zonestats;
5677 zone->pageset_high_min = BOOT_PAGESET_HIGH;
5678 zone->pageset_high_max = BOOT_PAGESET_HIGH;
5679 zone->pageset_batch = BOOT_PAGESET_BATCH;
5682 pr_debug(" %s zone: %lu pages, LIFO batch:%u\n", zone->name,
5683 zone->present_pages, zone_batchsize(zone));
5690 atomic_long_add(count, &page_zone(page)->managed_pages);
5716 * Perform a kasan-unchecked memset() since this memory
5794 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
5805 pgdat->totalreserve_pages = 0;
5808 struct zone *zone = pgdat->node_zones + i;
5814 if (zone->lowmem_reserve[j] > max)
5815 max = zone->lowmem_reserve[j];
5818 /* we treat the high watermark as reserved pages. */
5824 pgdat->totalreserve_pages += max;
5833 * setup_per_zone_lowmem_reserve - called whenever
5844 for (i = 0; i < MAX_NR_ZONES - 1; i++) {
5845 struct zone *zone = &pgdat->node_zones[i];
5851 struct zone *upper_zone = &pgdat->node_zones[j];
5857 zone->lowmem_reserve[j] = 0;
5859 zone->lowmem_reserve[j] = managed_pages / ratio;
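Worked example with hypothetical numbers for line 5859: if the higher zones an allocation could also have used together manage 262144 pages and the lowmem reserve ratio for this zone is 256, lowmem_reserve becomes 262144 / 256 = 1024 pages that such allocations must leave free here.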
5870 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
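Worked example with hypothetical values for line 5870: with min_free_kbytes = 65536 and 4 KiB pages (PAGE_SHIFT = 12), pages_min = 65536 >> 2 = 16384 pages.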
5884 spin_lock_irqsave(&zone->lock, flags);
5893 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
5901 zone->_watermark[WMARK_MIN] = min_pages;
5907 zone->_watermark[WMARK_MIN] = tmp;
5919 zone->watermark_boost = 0;
5920 zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
5921 zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp;
5922 zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp;
5924 spin_unlock_irqrestore(&zone->lock, flags);
5932 * setup_per_zone_wmarks - called when min_free_kbytes changes
5933 * or when memory is hot-{added|removed}
5935 * Ensures that the watermark[min,low,high] values for each zone are set
5949 * and high limits or the limits may be inappropriate.
6014 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
6056 pgdat->min_unmapped_pages = 0;
6059 zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) *
6084 pgdat->min_slab_pages = 0;
6087 zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) *
6107 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
6111 * The reserve ratio obviously has absolutely no relation with the
6132 * percpu_pagelist_high_fraction - changes the pcp->high for each zone on each
6154 ret = -EINVAL;
6158 /* No change? */
6245 /* Usage: See admin-guide/dynamic-debug-howto.rst */
6273 .nid = zone_to_nid(cc->zone),
6274 .gfp_mask = cc->gfp_mask,
6284 while (pfn < end || !list_empty(&cc->migratepages)) {
6286 ret = -EINTR;
6290 if (list_empty(&cc->migratepages)) {
6291 cc->nr_migratepages = 0;
6293 if (ret && ret != -EAGAIN)
6295 pfn = cc->migrate_pfn;
6298 ret = -EBUSY;
6302 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
6303 &cc->migratepages);
6304 cc->nr_migratepages -= nr_reclaimed;
6308 list_for_each_entry(page, &cc->migratepages, lru) {
6316 ret = migrate_pages(&cc->migratepages, alloc_migration_target,
6317 NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL);
6320 total_migrated += cc->nr_migratepages;
6323 * On -ENOMEM, migrate_pages() bails out right away. It is pointless
6326 if (ret == -ENOMEM)
6332 if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY)
6333 alloc_contig_dump_pages(&cc->migratepages);
6334 putback_movable_pages(&cc->migratepages);
6362 /* Add all subpages to the order-0 head, in sequence. */
6363 list_del(&page->lru);
6389 return -EINVAL;
6405 * alloc_contig_range() -- tries to allocate given range of pages
6407 * @end: one-past-the-last PFN to allocate
6435 .order = -1,
6446 return -EINVAL;
6476 * In case of -EBUSY, we'd like to know which page causes the problem.
6483 * -EBUSY is not accidentally used or returned to caller.
6486 if (ret && ret != -EBUSY)
6490 * When in-use hugetlb pages are migrated, they may simply be released
6492 * buddy system. After the migration of in-use huge pages is completed,
6513 * We don't have to hold zone->lock here because the pages are
6520 ret = -EBUSY;
6527 ret = -EBUSY;
6536 free_contig_range(outer_start, start - outer_start);
6538 free_contig_range(end, outer_end - end);
6539 } else if (start == outer_start && end == outer_end && is_power_of_2(end - start)) {
6541 int order = ilog2(end - start);
6547 ret = -EINVAL;
6566 static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
6577 if (page_zone(page) != z)
6592 unsigned long last_pfn = start_pfn + nr_pages - 1;
6598 * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
6626 struct zoneref *z;
6629 for_each_zone_zonelist_nodemask(zone, z, zonelist,
6631 spin_lock_irqsave(&zone->lock, flags);
6633 pfn = ALIGN(zone->zone_start_pfn, nr_pages);
6643 spin_unlock_irqrestore(&zone->lock, flags);
6648 spin_lock_irqsave(&zone->lock, flags);
6652 spin_unlock_irqrestore(&zone->lock, flags);
6674 for (; nr_pages--; pfn++) {
6685 * Effectively disable pcplists for the zone by setting the high limit to 0
6688 * will be drained, or observe the new high limit and skip the pcplist.
6701 __zone_set_pageset_high_and_batch(zone, zone->pageset_high_min,
6702 zone->pageset_high_max, zone->pageset_batch);
6711 if (zone->per_cpu_pageset != &boot_pageset) {
6713 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
6716 free_percpu(zone->per_cpu_pageset);
6717 zone->per_cpu_pageset = &boot_pageset;
6718 if (zone->per_cpu_zonestats != &boot_zonestats) {
6719 free_percpu(zone->per_cpu_zonestats);
6720 zone->per_cpu_zonestats = &boot_zonestats;
6730 * Returns the number of managed (non-PageOffline()) pages in the range: the
6745 spin_lock_irqsave(&zone->lock, flags);
6775 spin_unlock_irqrestore(&zone->lock, flags);
6777 return end_pfn - start_pfn - already_offline;
6790 const struct page *head = page - (pfn & ((1 << order) - 1));
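Worked example with a hypothetical pfn for line 6790: for pfn 0x1235 inside an order-4 block, pfn & ((1 << 4) - 1) = 5, so head points 5 pages back, at the page for pfn 0x1230.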
6811 * Break down a higher-order page into sub-pages, and keep our target out of
6815 struct page *target, int low, int high,
6818 unsigned long size = 1 << high;
6821 while (high > low) {
6822 high--;
6832 if (set_page_guard(zone, current_buddy, high))
6835 add_to_free_list(current_buddy, zone, high, migratetype, false);
6836 set_buddy_order(current_buddy, high);
6851 spin_lock_irqsave(&zone->lock, flags);
6853 struct page *page_head = page - (pfn & ((1 << order) - 1));
6872 spin_unlock_irqrestore(&zone->lock, flags);
6885 spin_lock_irqsave(&zone->lock, flags);
6896 spin_unlock_irqrestore(&zone->lock, flags);
6908 struct zone *zone = &pgdat->node_zones[ZONE_DMA];
6933 return -EINVAL;
6950 list_del(&page->lru);
6951 last = list_empty(&zone->unaccepted_pages);
6953 account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
6954 __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES);
6956 spin_unlock_irqrestore(&zone->lock, *flags);
6971 spin_lock_irqsave(&zone->lock, flags);
6973 spin_unlock_irqrestore(&zone->lock, flags);
6977 /* Unlocks zone->lock */
6986 spin_lock_irqsave(&zone->lock, flags);
6987 page = list_first_entry_or_null(&zone->unaccepted_pages,
6990 spin_unlock_irqrestore(&zone->lock, flags);
6994 /* Unlocks zone->lock */
7013 if (list_empty(&zone->unaccepted_pages))
7017 to_accept = promo_wmark_pages(zone) -
7018 (zone_page_state(zone, NR_FREE_PAGES) -
7019 __zone_watermark_unusable_free(zone, order, 0) -
7026 to_accept -= MAX_ORDER_NR_PAGES;
7041 spin_lock_irqsave(&zone->lock, flags);
7042 first = list_empty(&zone->unaccepted_pages);
7043 list_add_tail(&page->lru, &zone->unaccepted_pages);
7047 spin_unlock_irqrestore(&zone->lock, flags);