Lines Matching +full:high +full:- +full:end
1 // SPDX-License-Identifier: GPL-2.0-only
39 #include <linux/fault-inject.h>
63 /* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
72 * reporting it and marking it "reported" - it only skips notifying
81 * page shuffling (relevant code - e.g., memory onlining - is expected to
84 * Note: No code should rely on this flag for correctness - it's purely
94 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
107 /* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
118 * interfered with and a high priority task cannot preempt the allocator.
129 * Generic helper to look up and lock a per-cpu variable with an embedded spinlock.
137 spin_lock(&_ret->member); \
146 if (!spin_trylock(&_ret->member)) { \
155 spin_unlock(&ptr->member); \
222 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
223 * 1G machine -> (16M dma, 784M normal, 224M high)
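The second line above restates the first with the arithmetic carried out: 800M - 16M = 784M of normal memory and 1G - 800M = 224M of highmem.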
228 * TBD: should special case ZONE_DMA32 machines here - in those we normally
276 int user_min_free_kbytes = -1;
301 * During boot we initialize deferred pages on-demand, as needed, but once
342 return page_zone(page)->pageblock_flags; in get_pageblock_bitmap()
349 pfn &= (PAGES_PER_SECTION-1); in pfn_to_bitidx()
351 pfn = pfn - pageblock_start_pfn(page_zone(page)->zone_start_pfn); in pfn_to_bitidx()
379 *bitidx &= (BITS_PER_LONG - 1); in get_pfnblock_bitmap_bitidx()
385 * __get_pfnblock_flags_mask - Return the requested group of flags for
412 * get_pfnblock_bit - Check if a standalone bit of a pageblock is set
434 * get_pfnblock_migratetype - Return the migratetype of a pageblock
459 * __set_pfnblock_flags_mask - Set the requested group of flags for
484 * set_pfnblock_bit - Set a standalone bit of a pageblock
504 * clear_pfnblock_bit - Clear a standalone bit of a pageblock
524 * set_pageblock_migratetype - Set the migratetype of a pageblock
586 start_pfn = zone->zone_start_pfn; in page_outside_zone_boundaries()
587 sp = zone->spanned_pages; in page_outside_zone_boundaries()
592 pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n", in page_outside_zone_boundaries()
593 pfn, zone_to_nid(zone), zone->name, in page_outside_zone_boundaries()
645 current->comm, page_to_pfn(page)); in bad_page()
702 * Higher-order pages are called "compound pages". They are structured thusly:
707 * in bit 0 of page->compound_head. The rest of bits is pointer to head page.
709 * The first tail page's ->compound_order holds the order of allocation.
710 * This usage means that zero-order pages may not be compound.
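The head-page encoding described above can be shown with a minimal sketch, loosely modelled on the kernel's compound_head() helper; the function name and stripped-down logic here are illustrative, not the in-tree definition (assumes kernel context and struct page from <linux/mm_types.h>):

	/*
	 * If bit 0 of page->compound_head is set, the remaining bits point
	 * at the head page; otherwise the page is its own head.
	 */
	static inline struct page *sketch_compound_head(struct page *page)
	{
		unsigned long head = READ_ONCE(page->compound_head);

		if (head & 1)
			return (struct page *)(head - 1);
		return page;
	}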
734 struct capture_control *capc = current->capture_control; in task_capc()
737 !(current->flags & PF_KTHREAD) && in task_capc()
738 !capc->page && in task_capc()
739 capc->cc->zone == zone ? capc : NULL; in task_capc()
746 if (!capc || order != capc->cc->order) in compaction_capture()
758 * and vice-versa but no more than normal fallback logic which can in compaction_capture()
759 * have trouble finding a high-order free page. in compaction_capture()
762 capc->cc->migratetype != MIGRATE_MOVABLE) in compaction_capture()
765 if (migratetype != capc->cc->migratetype) in compaction_capture()
766 trace_mm_page_alloc_extfrag(page, capc->cc->order, order, in compaction_capture()
767 capc->cc->migratetype, migratetype); in compaction_capture()
769 capc->page = page; in compaction_capture()
790 lockdep_assert_held(&zone->lock); in account_freepages()
800 WRITE_ONCE(zone->nr_free_highatomic, in account_freepages()
801 zone->nr_free_highatomic + nr_pages); in account_freepages()
809 struct free_area *area = &zone->free_area[order]; in __add_to_free_list()
817 list_add_tail(&page->buddy_list, &area->free_list[migratetype]); in __add_to_free_list()
819 list_add(&page->buddy_list, &area->free_list[migratetype]); in __add_to_free_list()
820 area->nr_free++; in __add_to_free_list()
828 * of the list - so the moved pages won't immediately be considered for
834 struct free_area *area = &zone->free_area[order]; in move_to_free_list()
842 list_move_tail(&page->buddy_list, &area->free_list[new_mt]); in move_to_free_list()
844 account_freepages(zone, -nr_pages, old_mt); in move_to_free_list()
850 nr_pages = -nr_pages; in move_to_free_list()
868 list_del(&page->buddy_list); in __del_page_from_free_list()
871 zone->free_area[order].nr_free--; in __del_page_from_free_list()
874 __mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, -nr_pages); in __del_page_from_free_list()
881 account_freepages(zone, -(1 << order), migratetype); in del_page_from_free_list()
887 return list_first_entry_or_null(&area->free_list[migratetype], in get_page_from_free_area()
893 * of the next-higher order is free. If it is, it's possible
897 * as a 2-level higher order page
906 if (order >= MAX_PAGE_ORDER - 1) in buddy_merge_likely()
910 higher_page = page + (higher_page_pfn - pfn); in buddy_merge_likely()
919 * The concept of a buddy system is to maintain direct-mapped table
924 * At a high level, all that happens here is marking the table entry
937 * -- nyc
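The pairing the comment alludes to can be made concrete with a small sketch: the buddy of the block starting at a given pfn, at a given order, is found by flipping the bit that selects the pair (this mirrors the idea behind the kernel's __find_buddy_pfn(); the helper name below is illustrative):

	/* pfn of the buddy block at the given order */
	static inline unsigned long buddy_pfn_of(unsigned long pfn, unsigned int order)
	{
		return pfn ^ (1UL << order);
	}

For example, at order 3 the blocks starting at pfn 0 and pfn 8 are buddies; if both are free and of order 3, they merge into one order-4 block starting at pfn 0.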
952 VM_BUG_ON_PAGE(page->flags.f & PAGE_FLAGS_CHECK_AT_PREP, page); in __free_one_page()
954 VM_BUG_ON(migratetype == -1); in __free_one_page()
955 VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); in __free_one_page()
964 account_freepages(zone, -(1 << order), migratetype); in __free_one_page()
999 * expand() down the line puts the sub-blocks in __free_one_page()
1006 page = page + (combined_pfn - pfn); in __free_one_page()
1036 if (unlikely(atomic_read(&page->_mapcount) != -1)) in page_expected_state()
1039 if (unlikely((unsigned long)page->mapping | in page_expected_state()
1042 page->memcg_data | in page_expected_state()
1045 (page->flags.f & check_flags))) in page_expected_state()
1055 if (unlikely(atomic_read(&page->_mapcount) != -1)) in page_bad_reason()
1057 if (unlikely(page->mapping != NULL)) in page_bad_reason()
1058 bad_reason = "non-NULL mapping"; in page_bad_reason()
1061 if (unlikely(page->flags.f & flags)) { in page_bad_reason()
1068 if (unlikely(page->memcg_data)) in page_bad_reason()
1097 * We rely on page->lru.next never having bit 0 set, unless the page in free_tail_page_prepare()
1098 * is PageTail(). Let's make sure that's true even for poisoned ->lru. in free_tail_page_prepare()
1106 switch (page - head_page) { in free_tail_page_prepare()
1108 /* the first tail page: these may be in place of ->mapping */ in free_tail_page_prepare()
1114 unlikely(atomic_read(&folio->_nr_pages_mapped))) { in free_tail_page_prepare()
1119 if (unlikely(folio->_mm_id_mapcount[0] != -1)) { in free_tail_page_prepare()
1123 if (unlikely(folio->_mm_id_mapcount[1] != -1)) { in free_tail_page_prepare()
1129 if (unlikely(atomic_read(&folio->_entire_mapcount) + 1)) { in free_tail_page_prepare()
1133 if (unlikely(atomic_read(&folio->_pincount))) { in free_tail_page_prepare()
1140 /* the second tail page: deferred_list overlaps ->mapping */ in free_tail_page_prepare()
1141 if (unlikely(!list_empty(&folio->_deferred_list))) { in free_tail_page_prepare()
1146 if (unlikely(atomic_read(&folio->_entire_mapcount) + 1)) { in free_tail_page_prepare()
1150 if (unlikely(atomic_read(&folio->_pincount))) { in free_tail_page_prepare()
1157 /* the third tail page: hugetlb specifics overlap ->mappings */ in free_tail_page_prepare()
1162 if (page->mapping != TAIL_MAPPING) { in free_tail_page_prepare()
1178 page->mapping = NULL; in free_tail_page_prepare()
1187 * Tag-based KASAN modes skip pages freed via deferred memory initialization
1189 * 2. For tag-based KASAN modes: the page has a match-all KASAN tag, indicating
1192 * Pages will have match-all tags in the following circumstances:
1209 * on-demand allocation and then freed again before the deferred pages
1255 alloc_tag_add(&ref, task->alloc_tag, PAGE_SIZE * nr); in __pgalloc_tag_add()
1292 this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr); in pgalloc_tag_sub_pages()
1331 zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages); in free_pages_prepare()
1354 * avoid checking PageCompound for order-0 pages. in free_pages_prepare()
1362 folio->_nr_pages = 0; in free_pages_prepare()
1374 (page + i)->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP; in free_pages_prepare()
1378 mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1); in free_pages_prepare()
1379 folio->mapping = NULL; in free_pages_prepare()
1383 page->page_type = UINT_MAX; in free_pages_prepare()
1393 page->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP; in free_pages_prepare()
1412 * With hardware tag-based KASAN, memory tags must be set before the in free_pages_prepare()
1454 count = min(pcp->count, count); in free_pcppages_bulk()
1457 pindex = pindex - 1; in free_pcppages_bulk()
1459 spin_lock_irqsave(&zone->lock, flags); in free_pcppages_bulk()
1465 /* Remove pages from lists in a round-robin fashion. */ in free_pcppages_bulk()
1467 if (++pindex > NR_PCP_LISTS - 1) in free_pcppages_bulk()
1469 list = &pcp->lists[pindex]; in free_pcppages_bulk()
1483 list_del(&page->pcp_list); in free_pcppages_bulk()
1484 count -= nr_pages; in free_pcppages_bulk()
1485 pcp->count -= nr_pages; in free_pcppages_bulk()
1492 spin_unlock_irqrestore(&zone->lock, flags); in free_pcppages_bulk()
1495 /* Split a multi-block free page into its individual pageblocks. */
1499 unsigned long end = pfn + (1 << order); in split_large_buddy() local
1513 if (pfn == end) in split_large_buddy()
1523 page->private = order; in add_page_to_zone_llist()
1525 llist_add(&page->pcp_llist, &zone->trylock_free_pages); in add_page_to_zone_llist()
1536 if (!spin_trylock_irqsave(&zone->lock, flags)) { in free_one_page()
1541 spin_lock_irqsave(&zone->lock, flags); in free_one_page()
1545 llhead = &zone->trylock_free_pages; in free_one_page()
1552 unsigned int p_order = p->private; in free_one_page()
1559 spin_unlock_irqrestore(&zone->lock, flags); in free_one_page()
1605 atomic_long_add(nr_pages, &page_zone(page)->managed_pages); in __free_pages_core()
1637 * Note: the function may return non-NULL struct page even for a page block
1640 * will fall into 2 sub-sections, and the end pfn of the pageblock may be hole
1653 end_pfn--; in __pageblock_pfn_to_page()
1686 * -- nyc
1689 int high, int migratetype) in expand() argument
1691 unsigned int size = 1 << high; in expand()
1694 while (high > low) { in expand()
1695 high--; in expand()
1705 if (set_page_guard(zone, &page[size], high)) in expand()
1708 __add_to_free_list(&page[size], zone, high, migratetype, false); in expand()
1709 set_buddy_order(&page[size], high); in expand()
1718 int high, int migratetype) in page_del_and_expand() argument
1720 int nr_pages = 1 << high; in page_del_and_expand()
1722 __del_page_from_free_list(page, zone, high, migratetype); in page_del_and_expand()
1723 nr_pages -= expand(zone, page, low, high, migratetype); in page_del_and_expand()
1724 account_freepages(zone, -nr_pages, migratetype); in page_del_and_expand()
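A worked example of the split performed above, with illustrative numbers: asking page_del_and_expand() for low = 2 out of a high = 5 block starts with nr_pages = 1 << 5 = 32; expand() puts buddies of order 4, 3 and 2 (16 + 8 + 4 = 28 pages) back on the free lists and returns 28, so only the 32 - 28 = 4 pages actually handed to the caller are subtracted from the free-page accounting.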
1774 /* Skip, if hardware tag-based KASAN is not enabled. */ in should_skip_kasan_unpoison()
1779 * With hardware tag-based KASAN enabled, skip if this has been in should_skip_kasan_unpoison()
1787 /* Don't skip, if hardware tag-based KASAN is not enabled. */ in should_skip_init()
1791 /* For hardware tag-based KASAN, skip if requested. */ in should_skip_init()
1884 area = &(zone->free_area[current_order]); in __rmqueue_smallest()
1907 static int fallbacks[MIGRATE_PCPTYPES][MIGRATE_PCPTYPES - 1] = {
1936 VM_WARN_ON(start_pfn & (pageblock_nr_pages - 1)); in __move_freepages_block()
1965 unsigned long pfn, start, end; in prep_move_freepages_block() local
1969 end = pageblock_end_pfn(pfn); in prep_move_freepages_block()
1980 if (!zone_spans_pfn(zone, end - 1)) in prep_move_freepages_block()
1988 for (pfn = start; pfn < end;) { in prep_move_freepages_block()
2018 return -1; in move_freepages_block()
2032 * If start_pfn is not an order-0 PageBuddy, next PageBuddy containing in find_large_buddy()
2034 * the order with __ffs(start_pfn). If start_pfn is order-0 PageBuddy, in find_large_buddy()
2067 * __move_freepages_block_isolate - move free pages in block for page isolation
2120 /* Use MIGRATETYPE_MASK to get non-isolate migratetype */ in __move_freepages_block_isolate()
2152 int nr_pageblocks = 1 << (start_order - pageblock_order); in change_pageblock_range()
2154 while (nr_pageblocks--) { in change_pageblock_range()
2175 max_boost = mult_frac(zone->_watermark[WMARK_HIGH], in boost_watermark()
2179 * high watermark may be uninitialised if fragmentation occurs in boost_watermark()
2191 zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages, in boost_watermark()
2247 * we would do this whole-block claiming. This would help to reduce
2256 return -2; in find_suitable_fallback()
2258 if (area->nr_free == 0) in find_suitable_fallback()
2259 return -1; in find_suitable_fallback()
2261 for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) { in find_suitable_fallback()
2268 return -1; in find_suitable_fallback()
2303 set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); in try_to_claim_block()
2320 * to MOVABLE pageblock, consider all non-movable pages as in try_to_claim_block()
2323 * exact migratetype of non-movable pages. in try_to_claim_block()
2327 - (free_pages + movable_pages); in try_to_claim_block()
2335 if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || in try_to_claim_block()
2377 --current_order) { in __rmqueue_claim()
2378 area = &(zone->free_area[current_order]); in __rmqueue_claim()
2383 if (fallback_mt == -1) in __rmqueue_claim()
2387 if (fallback_mt == -2) in __rmqueue_claim()
2417 area = &(zone->free_area[current_order]); in __rmqueue_steal()
2420 if (fallback_mt == -1) in __rmqueue_steal()
2442 * Call me with the zone->lock already held.
2470 * a loop with the zone->lock held, meaning the freelists are in __rmqueue()
2523 if (!spin_trylock_irqsave(&zone->lock, flags)) in rmqueue_bulk()
2526 spin_lock_irqsave(&zone->lock, flags); in rmqueue_bulk()
2544 list_add_tail(&page->pcp_list, list); in rmqueue_bulk()
2546 spin_unlock_irqrestore(&zone->lock, flags); in rmqueue_bulk()
2552 * Called from the vmstat counter updater to decay the PCP high.
2560 high_min = READ_ONCE(pcp->high_min); in decay_pcp_high()
2561 batch = READ_ONCE(pcp->batch); in decay_pcp_high()
2563 * Decrease pcp->high periodically to try to free possible in decay_pcp_high()
2565 * control latency. This caps pcp->high decrement too. in decay_pcp_high()
2567 if (pcp->high > high_min) { in decay_pcp_high()
2568 pcp->high = max3(pcp->count - (batch << CONFIG_PCP_BATCH_SCALE_MAX), in decay_pcp_high()
2569 pcp->high - (pcp->high >> 3), high_min); in decay_pcp_high()
2570 if (pcp->high > high_min) in decay_pcp_high()
2574 to_drain = pcp->count - pcp->high; in decay_pcp_high()
2576 spin_lock(&pcp->lock); in decay_pcp_high()
2578 spin_unlock(&pcp->lock); in decay_pcp_high()
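A worked example of the decay above, assuming high_min = 64 and that the scaled-batch term (pcp->count minus batch << CONFIG_PCP_BATCH_SCALE_MAX) does not dominate: with pcp->high = 800 and pcp->count = 900, the new high becomes max(800 - 800/8, 64) = 700, so to_drain = 900 - 700 = 200 pages are freed back to the buddy allocator under pcp->lock.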
2595 batch = READ_ONCE(pcp->batch); in drain_zone_pages()
2596 to_drain = min(pcp->count, batch); in drain_zone_pages()
2598 spin_lock(&pcp->lock); in drain_zone_pages()
2600 spin_unlock(&pcp->lock); in drain_zone_pages()
2610 struct per_cpu_pages *pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); in drain_pages_zone()
2614 spin_lock(&pcp->lock); in drain_pages_zone()
2615 count = pcp->count; in drain_pages_zone()
2618 pcp->batch << CONFIG_PCP_BATCH_SCALE_MAX); in drain_pages_zone()
2621 count -= to_drain; in drain_pages_zone()
2623 spin_unlock(&pcp->lock); in drain_pages_zone()
2640 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
2657 * not empty. The check for non-emptiness can however race with a free to
2658 * pcplist that has not yet increased the pcp->count from 0 to 1. Callers
2701 pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); in __drain_all_pages()
2702 if (pcp->count) in __drain_all_pages()
2706 pcp = per_cpu_ptr(z->per_cpu_pageset, cpu); in __drain_all_pages()
2707 if (pcp->count) { in __drain_all_pages()
2731 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
2733 * When zone parameter is non-NULL, spill just the single zone's pages.
2740 static int nr_pcp_free(struct per_cpu_pages *pcp, int batch, int high, bool free_high) in nr_pcp_free() argument
2744 /* Free as much as possible if batch freeing high-order pages. */ in nr_pcp_free()
2746 return min(pcp->count, batch << CONFIG_PCP_BATCH_SCALE_MAX); in nr_pcp_free()
2749 if (unlikely(high < batch)) in nr_pcp_free()
2752 /* Leave at least pcp->batch pages on the list */ in nr_pcp_free()
2754 max_nr_free = high - batch; in nr_pcp_free()
2760 batch = clamp_t(int, pcp->free_count, min_nr_free, max_nr_free); in nr_pcp_free()
2768 int high, high_min, high_max; in nr_pcp_high() local
2770 high_min = READ_ONCE(pcp->high_min); in nr_pcp_high()
2771 high_max = READ_ONCE(pcp->high_max); in nr_pcp_high()
2772 high = pcp->high = clamp(pcp->high, high_min, high_max); in nr_pcp_high()
2774 if (unlikely(!high)) in nr_pcp_high()
2778 pcp->high = max(high - (batch << CONFIG_PCP_BATCH_SCALE_MAX), in nr_pcp_high()
2787 if (test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) { in nr_pcp_high()
2788 int free_count = max_t(int, pcp->free_count, batch); in nr_pcp_high()
2790 pcp->high = max(high - free_count, high_min); in nr_pcp_high()
2791 return min(batch << 2, pcp->high); in nr_pcp_high()
2795 return high; in nr_pcp_high()
2797 if (test_bit(ZONE_BELOW_HIGH, &zone->flags)) { in nr_pcp_high()
2798 int free_count = max_t(int, pcp->free_count, batch); in nr_pcp_high()
2800 pcp->high = max(high - free_count, high_min); in nr_pcp_high()
2801 high = max(pcp->count, high_min); in nr_pcp_high()
2802 } else if (pcp->count >= high) { in nr_pcp_high()
2803 int need_high = pcp->free_count + batch; in nr_pcp_high()
2805 /* pcp->high should be large enough to hold batch freed pages */ in nr_pcp_high()
2806 if (pcp->high < need_high) in nr_pcp_high()
2807 pcp->high = clamp(need_high, high_min, high_max); in nr_pcp_high()
2810 return high; in nr_pcp_high()
2817 int high, batch; in free_frozen_page_commit() local
2826 pcp->alloc_factor >>= 1; in free_frozen_page_commit()
2829 list_add(&page->pcp_list, &pcp->lists[pindex]); in free_frozen_page_commit()
2830 pcp->count += 1 << order; in free_frozen_page_commit()
2832 batch = READ_ONCE(pcp->batch); in free_frozen_page_commit()
2834 * As high-order pages other than THP's stored on PCP can contribute in free_frozen_page_commit()
2840 free_high = (pcp->free_count >= (batch + pcp->high_min / 2) && in free_frozen_page_commit()
2841 (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) && in free_frozen_page_commit()
2842 (!(pcp->flags & PCPF_FREE_HIGH_BATCH) || in free_frozen_page_commit()
2843 pcp->count >= batch)); in free_frozen_page_commit()
2844 pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER; in free_frozen_page_commit()
2845 } else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) { in free_frozen_page_commit()
2846 pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER; in free_frozen_page_commit()
2848 if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX)) in free_frozen_page_commit()
2849 pcp->free_count += (1 << order); in free_frozen_page_commit()
2853 * Do not attempt to take a zone lock. Let pcp->count get in free_frozen_page_commit()
2854 * over high mark temporarily. in free_frozen_page_commit()
2859 high = nr_pcp_high(pcp, zone, batch, free_high); in free_frozen_page_commit()
2860 if (pcp->count < high) in free_frozen_page_commit()
2863 free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high), in free_frozen_page_commit()
2865 if (test_bit(ZONE_BELOW_HIGH, &zone->flags) && in free_frozen_page_commit()
2868 struct pglist_data *pgdat = zone->zone_pgdat; in free_frozen_page_commit()
2869 clear_bit(ZONE_BELOW_HIGH, &zone->flags); in free_frozen_page_commit()
2878 if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES && in free_frozen_page_commit()
2879 next_memory_node(pgdat->node_id) < MAX_NUMNODES) in free_frozen_page_commit()
2880 atomic_set(&pgdat->kswapd_failures, 0); in free_frozen_page_commit()
2927 pcp = pcp_spin_trylock(zone->per_cpu_pageset); in __free_frozen_pages()
2953 for (i = 0, j = 0; i < folios->nr; i++) { in free_unref_folios()
2954 struct folio *folio = folios->folios[i]; in free_unref_folios()
2958 if (!free_pages_prepare(&folio->page, order)) in free_unref_folios()
2965 free_one_page(folio_zone(folio), &folio->page, in free_unref_folios()
2969 folio->private = (void *)(unsigned long)order; in free_unref_folios()
2971 folios->folios[j] = folio; in free_unref_folios()
2974 folios->nr = j; in free_unref_folios()
2976 for (i = 0; i < folios->nr; i++) { in free_unref_folios()
2977 struct folio *folio = folios->folios[i]; in free_unref_folios()
2980 unsigned int order = (unsigned long)folio->private; in free_unref_folios()
2983 folio->private = NULL; in free_unref_folios()
2984 migratetype = get_pfnblock_migratetype(&folio->page, pfn); in free_unref_folios()
3001 free_one_page(zone, &folio->page, pfn, in free_unref_folios()
3011 pcp = pcp_spin_trylock(zone->per_cpu_pageset); in free_unref_folios()
3014 free_one_page(zone, &folio->page, pfn, in free_unref_folios()
3022 * Non-isolated types over MIGRATE_PCPTYPES get added in free_unref_folios()
3028 trace_mm_page_free_batched(&folio->page); in free_unref_folios()
3029 free_frozen_page_commit(zone, pcp, &folio->page, migratetype, in free_unref_folios()
3041 * split_page takes a non-compound higher-order page, and splits it into
3042 * n (1<<order) sub-pages: page[0..n]
3043 * Each sub-page must be freed individually.
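A brief usage sketch of the pattern described above (the GFP flags, the order and the elided error handling are illustrative choices, not taken from this file):

	struct page *page = alloc_pages(GFP_KERNEL, 2);	/* one non-compound order-2 page */
	int i;

	if (page) {
		split_page(page, 2);		/* now four independent order-0 pages */
		for (i = 0; i < 4; i++)
			__free_page(page + i);	/* each sub-page freed individually */
	}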
3072 * emulate a high-order watermark check with a raised order-0 in __isolate_free_page()
3073 * watermark, because we already know our high-order page in __isolate_free_page()
3076 watermark = zone->_watermark[WMARK_MIN] + (1UL << order); in __isolate_free_page()
3087 if (order >= pageblock_order - 1) { in __isolate_free_page()
3088 struct page *endpage = page + (1 << order) - 1; in __isolate_free_page()
3105 * __putback_isolated_page - Return a now-isolated page back where we got it
3118 lockdep_assert_held(&zone->lock); in __putback_isolated_page()
3162 if (!spin_trylock_irqsave(&zone->lock, flags)) in rmqueue_buddy()
3165 spin_lock_irqsave(&zone->lock, flags); in rmqueue_buddy()
3176 * order-0 (atomic) allocs access to HIGHATOMIC in rmqueue_buddy()
3178 * high-order atomic allocation in the future. in rmqueue_buddy()
3184 spin_unlock_irqrestore(&zone->lock, flags); in rmqueue_buddy()
3188 spin_unlock_irqrestore(&zone->lock, flags); in rmqueue_buddy()
3199 int high, base_batch, batch, max_nr_alloc; in nr_pcp_alloc() local
3202 base_batch = READ_ONCE(pcp->batch); in nr_pcp_alloc()
3203 high_min = READ_ONCE(pcp->high_min); in nr_pcp_alloc()
3204 high_max = READ_ONCE(pcp->high_max); in nr_pcp_alloc()
3205 high = pcp->high = clamp(pcp->high, high_min, high_max); in nr_pcp_alloc()
3208 if (unlikely(high < base_batch)) in nr_pcp_alloc()
3214 batch = (base_batch << pcp->alloc_factor); in nr_pcp_alloc()
3217 * If we had larger pcp->high, we could avoid to allocate from in nr_pcp_alloc()
3220 if (high_min != high_max && !test_bit(ZONE_BELOW_HIGH, &zone->flags)) in nr_pcp_alloc()
3221 high = pcp->high = min(high + batch, high_max); in nr_pcp_alloc()
3224 max_nr_alloc = max(high - pcp->count - base_batch, base_batch); in nr_pcp_alloc()
3227 * subsequent allocation of order-0 pages without any freeing. in nr_pcp_alloc()
3230 pcp->alloc_factor < CONFIG_PCP_BATCH_SCALE_MAX) in nr_pcp_alloc()
3231 pcp->alloc_factor++; in nr_pcp_alloc()
3247 /* Remove page from the per-cpu list, caller must protect the list */
3266 pcp->count += alloced << order; in __rmqueue_pcplist()
3272 list_del(&page->pcp_list); in __rmqueue_pcplist()
3273 pcp->count -= 1 << order; in __rmqueue_pcplist()
3279 /* Lock and remove page from the per-cpu list */
3291 pcp = pcp_spin_trylock(zone->per_cpu_pageset); in rmqueue_pcplist()
3302 pcp->free_count >>= 1; in rmqueue_pcplist()
3303 list = &pcp->lists[order_to_pindex(migratetype, order)]; in rmqueue_pcplist()
3316 * Use pcplists for THP or "cheap" high-order allocations.
3347 unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) { in rmqueue()
3348 clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); in rmqueue()
3358 * exclusive use of high-order atomic allocations if there are no
3371 * Check is race-prone but harmless. in reserve_highatomic_pageblock()
3376 if (zone->nr_reserved_highatomic >= max_managed) in reserve_highatomic_pageblock()
3379 spin_lock_irqsave(&zone->lock, flags); in reserve_highatomic_pageblock()
3382 if (zone->nr_reserved_highatomic >= max_managed) in reserve_highatomic_pageblock()
3392 if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1) in reserve_highatomic_pageblock()
3394 zone->nr_reserved_highatomic += pageblock_nr_pages; in reserve_highatomic_pageblock()
3397 zone->nr_reserved_highatomic += 1 << order; in reserve_highatomic_pageblock()
3401 spin_unlock_irqrestore(&zone->lock, flags); in reserve_highatomic_pageblock()
3406 * potentially hurts the reliability of high-order allocations when under
3416 struct zonelist *zonelist = ac->zonelist; in unreserve_highatomic_pageblock()
3424 for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, in unreserve_highatomic_pageblock()
3425 ac->nodemask) { in unreserve_highatomic_pageblock()
3428 * is really high. in unreserve_highatomic_pageblock()
3430 if (!force && zone->nr_reserved_highatomic <= in unreserve_highatomic_pageblock()
3434 spin_lock_irqsave(&zone->lock, flags); in unreserve_highatomic_pageblock()
3436 struct free_area *area = &(zone->free_area[order]); in unreserve_highatomic_pageblock()
3446 * locking could inadvertently allow a per-cpu in unreserve_highatomic_pageblock()
3451 if (WARN_ON_ONCE(size > zone->nr_reserved_highatomic)) in unreserve_highatomic_pageblock()
3452 size = zone->nr_reserved_highatomic; in unreserve_highatomic_pageblock()
3453 zone->nr_reserved_highatomic -= size; in unreserve_highatomic_pageblock()
3456 * Convert to ac->migratetype and avoid the normal in unreserve_highatomic_pageblock()
3467 ac->migratetype); in unreserve_highatomic_pageblock()
3471 ac->migratetype); in unreserve_highatomic_pageblock()
3473 ac->migratetype); in unreserve_highatomic_pageblock()
3480 WARN_ON_ONCE(ret == -1); in unreserve_highatomic_pageblock()
3482 spin_unlock_irqrestore(&zone->lock, flags); in unreserve_highatomic_pageblock()
3486 spin_unlock_irqrestore(&zone->lock, flags); in unreserve_highatomic_pageblock()
3495 long unusable_free = (1 << order) - 1; in __zone_watermark_unusable_free()
3502 unusable_free += READ_ONCE(z->nr_free_highatomic); in __zone_watermark_unusable_free()
3514 * Return true if free base pages are above 'mark'. For high-order checks it
3515 * will return true if the order-0 watermark is reached and there is at least
3526 /* free_pages may go negative - that's OK */ in __zone_watermark_ok()
3527 free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags); in __zone_watermark_ok()
3535 min -= min / 2; in __zone_watermark_ok()
3538 * Non-blocking allocations (e.g. GFP_ATOMIC) can in __zone_watermark_ok()
3540 * non-blocking allocations requests such as GFP_NOWAIT in __zone_watermark_ok()
3545 min -= min / 4; in __zone_watermark_ok()
3552 * makes during the free path will be small and short-lived. in __zone_watermark_ok()
3555 min -= min / 2; in __zone_watermark_ok()
3559 * Check watermarks for an order-0 allocation request. If these in __zone_watermark_ok()
3560 * are not met, then a high-order request also cannot go ahead in __zone_watermark_ok()
3563 if (free_pages <= min + z->lowmem_reserve[highest_zoneidx]) in __zone_watermark_ok()
3566 /* If this is an order-0 request then the watermark is fine */ in __zone_watermark_ok()
3570 /* For a high-order request, check at least one suitable page is free */ in __zone_watermark_ok()
3572 struct free_area *area = &z->free_area[o]; in __zone_watermark_ok()
3575 if (!area->nr_free) in __zone_watermark_ok()
3613 * Fast check for order-0 only. If this fails then the reserves in zone_watermark_fast()
3623 /* reserved may over estimate high-atomic reserves. */ in zone_watermark_fast()
3624 usable_free -= min(usable_free, reserved); in zone_watermark_fast()
3625 if (usable_free > mark + z->lowmem_reserve[highest_zoneidx]) in zone_watermark_fast()
3634 * Ignore watermark boosting for __GFP_HIGH order-0 allocations in zone_watermark_fast()
3639 if (unlikely(!order && (alloc_flags & ALLOC_MIN_RESERVE) && z->watermark_boost in zone_watermark_fast()
3641 mark = z->_watermark[WMARK_MIN]; in zone_watermark_fast()
3697 * the pointer is within zone->zone_pgdat->node_zones[]. Also assume in alloc_flags_nofragment()
3700 BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1); in alloc_flags_nofragment()
3701 if (nr_online_nodes > 1 && !populated_zone(--zone)) in alloc_flags_nofragment()
3742 z = ac->preferred_zoneref; in get_page_from_freelist()
3743 for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx, in get_page_from_freelist()
3744 ac->nodemask) { in get_page_from_freelist()
3758 * lowmem reserves and high watermark so that kswapd in get_page_from_freelist()
3763 * exceed the per-node dirty limit in the slowpath in get_page_from_freelist()
3769 * dirty-throttling and the flusher threads. in get_page_from_freelist()
3771 if (ac->spread_dirty_pages) { in get_page_from_freelist()
3772 if (last_pgdat != zone->zone_pgdat) { in get_page_from_freelist()
3773 last_pgdat = zone->zone_pgdat; in get_page_from_freelist()
3774 last_pgdat_dirty_ok = node_dirty_ok(zone->zone_pgdat); in get_page_from_freelist()
3782 zone != zonelist_zone(ac->preferred_zoneref)) { in get_page_from_freelist()
3790 local_nid = zonelist_node_idx(ac->preferred_zoneref); in get_page_from_freelist()
3802 * "node_reclaim_mode"-like behavior in this case. in get_page_from_freelist()
3805 !waitqueue_active(&zone->zone_pgdat->kswapd_wait)) { in get_page_from_freelist()
3813 * Detect whether the number of free pages is below high in get_page_from_freelist()
3814 * watermark. If so, we will decrease pcp->high and free in get_page_from_freelist()
3819 if (test_bit(ZONE_BELOW_HIGH, &zone->flags)) in get_page_from_freelist()
3824 ac->highest_zoneidx, alloc_flags, in get_page_from_freelist()
3828 set_bit(ZONE_BELOW_HIGH, &zone->flags); in get_page_from_freelist()
3833 ac->highest_zoneidx, alloc_flags, in get_page_from_freelist()
3854 !zone_allows_reclaim(zonelist_zone(ac->preferred_zoneref), zone)) in get_page_from_freelist()
3857 ret = node_reclaim(zone->zone_pgdat, gfp_mask, order); in get_page_from_freelist()
3868 ac->highest_zoneidx, alloc_flags)) in get_page_from_freelist()
3876 page = rmqueue(zonelist_zone(ac->preferred_zoneref), zone, order, in get_page_from_freelist()
3877 gfp_mask, alloc_flags, ac->migratetype); in get_page_from_freelist()
3882 * If this is a high-order atomic allocation then check in get_page_from_freelist()
3933 (current->flags & (PF_MEMALLOC | PF_EXITING))) in warn_alloc_show_mem()
3956 current->comm, &vaf, gfp_mask, &gfp_mask, in warn_alloc()
3990 .zonelist = ac->zonelist, in __alloc_pages_may_oom()
3991 .nodemask = ac->nodemask, in __alloc_pages_may_oom()
4011 * Go through the zonelist yet one more time, keep very high watermark in __alloc_pages_may_oom()
4024 if (current->flags & PF_DUMPCORE) in __alloc_pages_may_oom()
4040 if (ac->highest_zoneidx < ZONE_NORMAL) in __alloc_pages_may_oom()
4060 * Help non-failing allocations by giving them access to memory in __alloc_pages_may_oom()
4079 /* Try memory compaction for high-order allocations before reclaim */
4122 zone->compact_blockskip_flush = false; in __alloc_pages_direct_compact()
4158 * Compaction was skipped due to a lack of free order-0 in should_compact_retry()
4196 (*compact_priority)--; in should_compact_retry()
4229 * Let's give them a good hope and keep retrying while the order-0 in should_compact_retry()
4232 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, in should_compact_retry()
4233 ac->highest_zoneidx, ac->nodemask) { in should_compact_retry()
4235 ac->highest_zoneidx, alloc_flags)) in should_compact_retry()
4253 if (current->flags & PF_MEMALLOC) in __need_reclaim()
4340 progress = try_to_free_pages(ac->zonelist, order, gfp_mask, in __perform_reclaim()
4341 ac->nodemask); in __perform_reclaim()
4371 * pages are pinned on the per-cpu lists or in high alloc reserves. in __alloc_pages_direct_reclaim()
4392 enum zone_type highest_zoneidx = ac->highest_zoneidx; in wake_all_kswapds()
4400 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx, in wake_all_kswapds()
4401 ac->nodemask) { in wake_all_kswapds()
4404 if (last_pgdat == zone->zone_pgdat) in wake_all_kswapds()
4407 last_pgdat = zone->zone_pgdat; in wake_all_kswapds()
4446 * Ignore cpuset mems for non-blocking __GFP_HIGH (probably in gfp_to_alloc_flags()
4488 if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) in __gfp_pfmemalloc_flags()
4491 if (current->flags & PF_MEMALLOC) in __gfp_pfmemalloc_flags()
4526 * their order will become available due to high fragmentation so in should_reclaim_retry()
4544 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, in should_reclaim_retry()
4545 ac->highest_zoneidx, ac->nodemask) { in should_reclaim_retry()
4564 ac->highest_zoneidx, alloc_flags, available); in should_reclaim_retry()
4580 if (current->flags & PF_WQ_WORKER) in should_reclaim_retry()
4601 * This assumes that for all allocations, ac->nodemask can come only in check_retry_cpuset()
4606 if (cpusets_enabled() && ac->nodemask && in check_retry_cpuset()
4607 !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) { in check_retry_cpuset()
4608 ac->nodemask = NULL; in check_retry_cpuset()
4647 * allocate greater than order-1 page units with __GFP_NOFAIL. in __alloc_pages_slowpath()
4660 WARN_ON_ONCE(current->flags & PF_MEMALLOC); in __alloc_pages_slowpath()
4681 * there was a cpuset modification and we are retrying - otherwise we in __alloc_pages_slowpath()
4682 * could end up iterating over non-eligible zones endlessly. in __alloc_pages_slowpath()
4684 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, in __alloc_pages_slowpath()
4685 ac->highest_zoneidx, ac->nodemask); in __alloc_pages_slowpath()
4686 if (!zonelist_zone(ac->preferred_zoneref)) in __alloc_pages_slowpath()
4691 * any suitable zone to satisfy the request - e.g. non-movable in __alloc_pages_slowpath()
4695 struct zoneref *z = first_zones_zonelist(ac->zonelist, in __alloc_pages_slowpath()
4696 ac->highest_zoneidx, in __alloc_pages_slowpath()
4715 * that we have enough base pages and don't need to reclaim. For non- in __alloc_pages_slowpath()
4716 * movable high-order allocations, do that as well, as compaction will in __alloc_pages_slowpath()
4724 (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) in __alloc_pages_slowpath()
4746 * - potentially very expensive because zones are far in __alloc_pages_slowpath()
4748 * bursty high order allocations, in __alloc_pages_slowpath()
4749 * - not guaranteed to help because isolate_freepages() in __alloc_pages_slowpath()
4752 * - unlikely to make entire pageblocks free on its in __alloc_pages_slowpath()
4788 * ignored. These allocations are high priority and system rather than in __alloc_pages_slowpath()
4792 ac->nodemask = NULL; in __alloc_pages_slowpath()
4793 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, in __alloc_pages_slowpath()
4794 ac->highest_zoneidx, ac->nodemask); in __alloc_pages_slowpath()
4807 if (current->flags & PF_MEMALLOC) in __alloc_pages_slowpath()
4827 * Do not retry costly high order allocations unless they are in __alloc_pages_slowpath()
4839 * It doesn't make any sense to retry for the compaction if the order-0 in __alloc_pages_slowpath()
4904 * Help non-failing allocations by giving some access to memory in __alloc_pages_slowpath()
4905 * reserves normally used for high priority non-blocking in __alloc_pages_slowpath()
4918 warn_alloc(gfp_mask, ac->nodemask, in __alloc_pages_slowpath()
4929 ac->highest_zoneidx = gfp_zone(gfp_mask); in prepare_alloc_pages()
4930 ac->zonelist = node_zonelist(preferred_nid, gfp_mask); in prepare_alloc_pages()
4931 ac->nodemask = nodemask; in prepare_alloc_pages()
4932 ac->migratetype = gfp_migratetype(gfp_mask); in prepare_alloc_pages()
4940 if (in_task() && !ac->nodemask) in prepare_alloc_pages()
4941 ac->nodemask = &cpuset_current_mems_allowed; in prepare_alloc_pages()
4959 ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); in prepare_alloc_pages()
4966 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, in prepare_alloc_pages()
4967 ac->highest_zoneidx, ac->nodemask); in prepare_alloc_pages()
4973 * __alloc_pages_bulk - Allocate a number of order-0 pages to an array
5015 if (unlikely(nr_pages - nr_populated == 0)) in alloc_pages_bulk_noprof()
5023 if (nr_pages - nr_populated == 1) in alloc_pages_bulk_noprof()
5088 pcp = pcp_spin_trylock(zone->per_cpu_pageset); in alloc_pages_bulk_noprof()
5093 pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)]; in alloc_pages_bulk_noprof()
5187 * &cpuset_current_mems_allowed to optimize the fast-path attempt. in __alloc_frozen_pages_noprof()
5231 * you need to access high mem.
5261 pgalloc_tag_sub_pages(tag, (1 << order) - 1); in ___free_pages()
5262 while (order-- > 0) { in ___free_pages()
5264 * The "tail" pages of this non-compound high-order in ___free_pages()
5276 * __free_pages - Free pages allocated with alloc_pages().
5280 * This function can free multi-page allocations that are not compound
5286 * by put_page() which only frees the first page of a non-compound
5311 * free_pages - Free pages allocated with __get_free_pages().
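A minimal sketch of the pairing referred to above (the GFP flags and order are arbitrary for illustration):

	unsigned long addr = __get_free_pages(GFP_KERNEL, 1);	/* two pages, as a kernel virtual address */

	if (addr)
		free_pages(addr, 1);		/* freed with the same order it was allocated with */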
5340 while (page < --last) in make_alloc_exact()
5351 * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
5357 * allocate memory in power-of-two pages.
5379 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
5405 * free_pages_exact - release memory allocated via alloc_pages_exact()
5414 unsigned long end = addr + PAGE_ALIGN(size); in free_pages_exact() local
5416 while (addr < end) { in free_pages_exact()
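A hedged usage sketch of the alloc_pages_exact()/free_pages_exact() pair documented above (the size and GFP flags are arbitrary for illustration):

	size_t size = 5 * PAGE_SIZE;		/* deliberately not a power-of-two number of pages */
	void *buf = alloc_pages_exact(size, GFP_KERNEL);

	if (buf)
		free_pages_exact(buf, size);	/* pass the same size used at allocation */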
5424 * nr_free_zone_pages - count number of pages beyond high watermark
5428 * high watermark within all zones at or below a given zone index. For each
5431 * nr_free_zone_pages = managed_pages - high_pages
5433 * Return: number of pages beyond high watermark.
5447 unsigned long high = high_wmark_pages(zone); in nr_free_zone_pages() local
5448 if (size > high) in nr_free_zone_pages()
5449 sum += size - high; in nr_free_zone_pages()
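For example (illustrative numbers), a zone with 1,000,000 managed pages and a high watermark of 12,000 pages contributes 1,000,000 - 12,000 = 988,000 pages to this sum.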
5456 * nr_free_buffer_pages - count number of pages beyond high watermark
5458 * nr_free_buffer_pages() counts the number of pages which are beyond the high
5461 * Return: number of pages beyond high watermark within ZONE_DMA and
5472 zoneref->zone = zone; in zoneref_set_zone()
5473 zoneref->zone_idx = zone_idx(zone); in zoneref_set_zone()
5488 zone_type--; in build_zonerefs_node()
5489 zone = pgdat->node_zones + zone_type; in build_zonerefs_node()
5511 return -EINVAL; in __parse_numa_zonelist_order()
5532 * find_next_best_node - find the next node that should appear in a given node's fallback list
5596 * This results in maximum locality--normal zone overflows into local
5597 * DMA zone, if any--but risks exhausting DMA zone.
5605 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; in build_zonelists_in_node_order()
5615 zonerefs->zone = NULL; in build_zonelists_in_node_order()
5616 zonerefs->zone_idx = 0; in build_zonelists_in_node_order()
5627 zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs; in build_thisnode_zonelists()
5630 zonerefs->zone = NULL; in build_thisnode_zonelists()
5631 zonerefs->zone_idx = 0; in build_thisnode_zonelists()
5641 /* NUMA-aware ordering of nodes */ in build_zonelists()
5642 local_node = pgdat->node_id; in build_zonelists()
5650 * distance group to make it round-robin. in build_zonelists()
5695 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; in build_zonelists()
5699 zonerefs->zone = NULL; in build_zonelists()
5700 zonerefs->zone_idx = 0; in build_zonelists()
5741 * trying to hold port->lock, for in __build_all_zonelists()
5743 * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held. in __build_all_zonelists()
5753 * building zonelists is fine - no need to touch other nodes. in __build_all_zonelists()
5755 if (self && !node_online(self->node_id)) { in __build_all_zonelists()
5770 * We now know the "local memory node" for each node-- in __build_all_zonelists()
5772 * Set up numa_mem percpu variable for on-line cpus. During in __build_all_zonelists()
5773 * boot, only the boot cpu should be on-line; we'll init the in __build_all_zonelists()
5774 * secondary cpus' numa_mem as they come on-line. During in __build_all_zonelists()
5775 * node/memory hotplug, we'll fixup all on-line cpus. in __build_all_zonelists()
5804 * (a chicken-egg dilemma). in build_all_zonelists_init()
5829 /* Get the number of free pages beyond high watermark in all zones. */ in build_all_zonelists()
5834 * more accurate, but expensive to check per-zone. This check is in build_all_zonelists()
5835 * made on memory-hotadd so a system can start with mobility in build_all_zonelists()
5869 * Clamp the batch to a 2^n - 1 value. Having a power in zone_batchsize()
5874 * batches of pages, one task can end up with a lot in zone_batchsize()
5878 batch = rounddown_pow_of_two(batch + batch/2) - 1; in zone_batchsize()
5894 * fragmented and becoming unavailable for high-order allocations. in zone_batchsize()
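As a worked example of the rounddown_pow_of_two() clamp shown above (illustrative input): an initial estimate of batch = 48 becomes rounddown_pow_of_two(48 + 48/2) - 1 = rounddown_pow_of_two(72) - 1 = 64 - 1 = 63.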
5905 int high; in zone_highsize() local
5911 * By default, the high value of the pcp is based on the zone in zone_highsize()
5918 * If percpu_pagelist_high_fraction is configured, the high in zone_highsize()
5926 * Split the high value across all online CPUs local to the zone. Note in zone_highsize()
5929 * onlined. For memory nodes that have no CPUs, split the high value in zone_highsize()
5936 high = total_pages / nr_split_cpus; in zone_highsize()
5939 * Ensure high is at least batch*4. The multiple is based on the in zone_highsize()
5940 * historical relationship between high and batch. in zone_highsize()
5942 high = max(high, batch << 2); in zone_highsize()
5944 return high; in zone_highsize()
5951 * pcp->high and pcp->batch values are related and generally batch is lower
5952 * than high. They are also related to pcp->count such that count is lower
5953 * than high, and as soon as it reaches high, the pcplist is flushed.
5958 * store tearing. Any new users of pcp->batch, pcp->high_min and pcp->high_max
5960 * fully trust only the pcp->count field on the local CPU with interrupts
5970 WRITE_ONCE(pcp->batch, batch); in pageset_update()
5971 WRITE_ONCE(pcp->high_min, high_min); in pageset_update()
5972 WRITE_ONCE(pcp->high_max, high_max); in pageset_update()
5982 spin_lock_init(&pcp->lock); in per_cpu_pages_init()
5984 INIT_LIST_HEAD(&pcp->lists[pindex]); in per_cpu_pages_init()
5987 * Set batch and high values safe for a boot pageset. A true percpu in per_cpu_pages_init()
5992 pcp->high_min = BOOT_PAGESET_HIGH; in per_cpu_pages_init()
5993 pcp->high_max = BOOT_PAGESET_HIGH; in per_cpu_pages_init()
5994 pcp->batch = BOOT_PAGESET_BATCH; in per_cpu_pages_init()
6004 pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); in __zone_set_pageset_high_and_batch()
6010 * Calculate and set new high and batch values for all per-cpu pagesets of a
6022 * PCP high is tuned manually, disable auto-tuning via in zone_set_pageset_high_and_batch()
6032 if (zone->pageset_high_min == new_high_min && in zone_set_pageset_high_and_batch()
6033 zone->pageset_high_max == new_high_max && in zone_set_pageset_high_and_batch()
6034 zone->pageset_batch == new_batch) in zone_set_pageset_high_and_batch()
6037 zone->pageset_high_min = new_high_min; in zone_set_pageset_high_and_batch()
6038 zone->pageset_high_max = new_high_max; in zone_set_pageset_high_and_batch()
6039 zone->pageset_batch = new_batch; in zone_set_pageset_high_and_batch()
6051 zone->per_cpu_zonestats = alloc_percpu(struct per_cpu_zonestat); in setup_zone_pageset()
6053 zone->per_cpu_pageset = alloc_percpu(struct per_cpu_pages); in setup_zone_pageset()
6058 pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); in setup_zone_pageset()
6059 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); in setup_zone_pageset()
6068 * page high values need to be recalculated.
6082 pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); in zone_pcp_update_cacheinfo()
6085 * If data cache slice of CPU is large enough, "pcp->batch" in zone_pcp_update_cacheinfo()
6087 * consecutive high-order pages freeing without allocation. in zone_pcp_update_cacheinfo()
6089 * cache-hot pages sharing. in zone_pcp_update_cacheinfo()
6091 spin_lock(&pcp->lock); in zone_pcp_update_cacheinfo()
6092 if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch) in zone_pcp_update_cacheinfo()
6093 pcp->flags |= PCPF_FREE_HIGH_BATCH; in zone_pcp_update_cacheinfo()
6095 pcp->flags &= ~PCPF_FREE_HIGH_BATCH; in zone_pcp_update_cacheinfo()
6096 spin_unlock(&pcp->lock); in zone_pcp_update_cacheinfo()
6124 * Otherwise, they will end up skewing the stats of in setup_per_cpu_pageset()
6129 memset(pzstats->vm_numa_event, 0, in setup_per_cpu_pageset()
6130 sizeof(pzstats->vm_numa_event)); in setup_per_cpu_pageset()
6135 pgdat->per_cpu_nodestats = in setup_per_cpu_pageset()
6146 zone->per_cpu_pageset = &boot_pageset; in zone_pcp_init()
6147 zone->per_cpu_zonestats = &boot_zonestats; in zone_pcp_init()
6148 zone->pageset_high_min = BOOT_PAGESET_HIGH; in zone_pcp_init()
6149 zone->pageset_high_max = BOOT_PAGESET_HIGH; in zone_pcp_init()
6150 zone->pageset_batch = BOOT_PAGESET_BATCH; in zone_pcp_init()
6153 pr_debug(" %s zone: %lu pages, LIFO batch:%u\n", zone->name, in zone_pcp_init()
6154 zone->present_pages, zone_batchsize(zone)); in zone_pcp_init()
6161 atomic_long_add(count, &page_zone(page)->managed_pages); in adjust_managed_page_count()
6167 unsigned long free_reserved_area(void *start, void *end, int poison, const char *s) in free_reserved_area() argument
6173 end = (void *)((unsigned long)end & PAGE_MASK); in free_reserved_area()
6174 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { in free_reserved_area()
6187 * Perform a kasan-unchecked memset() since this memory in free_reserved_area()
6265 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
6276 pgdat->totalreserve_pages = 0; in calculate_totalreserve_pages()
6279 struct zone *zone = pgdat->node_zones + i; in calculate_totalreserve_pages()
6285 max = max(max, zone->lowmem_reserve[j]); in calculate_totalreserve_pages()
6287 /* we treat the high watermark as reserved pages. */ in calculate_totalreserve_pages()
6292 pgdat->totalreserve_pages += max; in calculate_totalreserve_pages()
6302 * setup_per_zone_lowmem_reserve - called whenever
6313 for (i = 0; i < MAX_NR_ZONES - 1; i++) { in setup_per_zone_lowmem_reserve()
6314 struct zone *zone = &pgdat->node_zones[i]; in setup_per_zone_lowmem_reserve()
6320 struct zone *upper_zone = &pgdat->node_zones[j]; in setup_per_zone_lowmem_reserve()
6325 zone->lowmem_reserve[j] = 0; in setup_per_zone_lowmem_reserve()
6327 zone->lowmem_reserve[j] = managed_pages / ratio; in setup_per_zone_lowmem_reserve()
6329 zone->lowmem_reserve[j]); in setup_per_zone_lowmem_reserve()
6340 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); in __setup_per_zone_wmarks()
6354 spin_lock_irqsave(&zone->lock, flags); in __setup_per_zone_wmarks()
6363 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) in __setup_per_zone_wmarks()
6371 zone->_watermark[WMARK_MIN] = min_pages; in __setup_per_zone_wmarks()
6377 zone->_watermark[WMARK_MIN] = tmp; in __setup_per_zone_wmarks()
6389 zone->watermark_boost = 0; in __setup_per_zone_wmarks()
6390 zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; in __setup_per_zone_wmarks()
6391 zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp; in __setup_per_zone_wmarks()
6392 zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp; in __setup_per_zone_wmarks()
6395 spin_unlock_irqrestore(&zone->lock, flags); in __setup_per_zone_wmarks()
6403 * setup_per_zone_wmarks - called when min_free_kbytes changes
6404 * or when memory is hot-{added|removed}
6406 * Ensures that the watermark[min,low,high] values for each zone are set
6420 * and high limits or the limits may be inappropriate. in setup_per_zone_wmarks()
6485 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so in postcore_initcall()
6527 pgdat->min_unmapped_pages = 0; in setup_min_unmapped_ratio()
6530 zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) * in setup_min_unmapped_ratio()
6555 pgdat->min_slab_pages = 0; in setup_min_slab_ratio()
6558 zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) * in setup_min_slab_ratio()
6578 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
6603 * percpu_pagelist_high_fraction - changes the pcp->high for each zone on each
6625 ret = -EINVAL; in percpu_pagelist_high_fraction_sysctl_handler()
6725 /* Usage: See admin-guide/dynamic-debug-howto.rst */
6739 /* [start, end) must belong to a single zone. */
6741 unsigned long start, unsigned long end) in __alloc_contig_migrate_range() argument
6749 .nid = zone_to_nid(cc->zone), in __alloc_contig_migrate_range()
6750 .gfp_mask = cc->gfp_mask, in __alloc_contig_migrate_range()
6756 while (pfn < end || !list_empty(&cc->migratepages)) { in __alloc_contig_migrate_range()
6758 ret = -EINTR; in __alloc_contig_migrate_range()
6762 if (list_empty(&cc->migratepages)) { in __alloc_contig_migrate_range()
6763 cc->nr_migratepages = 0; in __alloc_contig_migrate_range()
6764 ret = isolate_migratepages_range(cc, pfn, end); in __alloc_contig_migrate_range()
6765 if (ret && ret != -EAGAIN) in __alloc_contig_migrate_range()
6767 pfn = cc->migrate_pfn; in __alloc_contig_migrate_range()
6770 ret = -EBUSY; in __alloc_contig_migrate_range()
6774 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, in __alloc_contig_migrate_range()
6775 &cc->migratepages); in __alloc_contig_migrate_range()
6776 cc->nr_migratepages -= nr_reclaimed; in __alloc_contig_migrate_range()
6778 ret = migrate_pages(&cc->migratepages, alloc_migration_target, in __alloc_contig_migrate_range()
6779 NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL); in __alloc_contig_migrate_range()
6782 * On -ENOMEM, migrate_pages() bails out right away. It is pointless in __alloc_contig_migrate_range()
6785 if (ret == -ENOMEM) in __alloc_contig_migrate_range()
6791 if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY) in __alloc_contig_migrate_range()
6792 alloc_contig_dump_pages(&cc->migratepages); in __alloc_contig_migrate_range()
6793 putback_movable_pages(&cc->migratepages); in __alloc_contig_migrate_range()
6817 /* Add all subpages to the order-0 head, in sequence. */ in split_free_pages()
6818 list_del(&page->lru); in split_free_pages()
6844 return -EINVAL; in __alloc_contig_verify_gfp_mask()
6860 * alloc_contig_range() -- tries to allocate given range of pages
6862 * @end: one-past-the-last PFN to allocate
6876 * pages which PFN is in [start, end) are allocated for the caller and
6879 int alloc_contig_range_noprof(unsigned long start, unsigned long end, in alloc_contig_range_noprof() argument
6882 const unsigned int order = ilog2(end - start); in alloc_contig_range_noprof()
6888 .order = -1, in alloc_contig_range_noprof()
6906 return -EINVAL; in alloc_contig_range_noprof()
6910 return -EINVAL; in alloc_contig_range_noprof()
6933 ret = start_isolate_page_range(start, end, mode); in alloc_contig_range_noprof()
6940 * In case of -EBUSY, we'd like to know which page causes problem. in alloc_contig_range_noprof()
6947 * -EBUSY is not accidentally used or returned to caller. in alloc_contig_range_noprof()
6949 ret = __alloc_contig_migrate_range(&cc, start, end); in alloc_contig_range_noprof()
6950 if (ret && ret != -EBUSY) in alloc_contig_range_noprof()
6954 * When in-use hugetlb pages are migrated, they may simply be released in alloc_contig_range_noprof()
6956 * buddy system. After the migration of in-use huge pages is completed, in alloc_contig_range_noprof()
6960 ret = replace_free_hugepage_folios(start, end); in alloc_contig_range_noprof()
6965 * Pages from [start, end) are within a pageblock_nr_pages in alloc_contig_range_noprof()
6967 * more, all pages in [start, end) are free in page allocator. in alloc_contig_range_noprof()
6969 * [start, end) (that is remove them from page allocator). in alloc_contig_range_noprof()
6972 * end of interesting range may be not aligned with pages that in alloc_contig_range_noprof()
6977 * We don't have to hold zone->lock here because the pages are in alloc_contig_range_noprof()
6983 if (test_pages_isolated(outer_start, end, mode)) { in alloc_contig_range_noprof()
6984 ret = -EBUSY; in alloc_contig_range_noprof()
6989 outer_end = isolate_freepages_range(&cc, outer_start, end); in alloc_contig_range_noprof()
6991 ret = -EBUSY; in alloc_contig_range_noprof()
7000 free_contig_range(outer_start, start - outer_start); in alloc_contig_range_noprof()
7001 if (end != outer_end) in alloc_contig_range_noprof()
7002 free_contig_range(end, outer_end - end); in alloc_contig_range_noprof()
7003 } else if (start == outer_start && end == outer_end && is_power_of_2(end - start)) { in alloc_contig_range_noprof()
7010 ret = -EINVAL; in alloc_contig_range_noprof()
7012 start, end, outer_start, outer_end); in alloc_contig_range_noprof()
7015 undo_isolate_page_range(start, end); in alloc_contig_range_noprof()
7055 unsigned long last_pfn = start_pfn + nr_pages - 1; in zone_spans_last_pfn()
7061 * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
7094 spin_lock_irqsave(&zone->lock, flags); in alloc_contig_pages_noprof()
7096 pfn = ALIGN(zone->zone_start_pfn, nr_pages); in alloc_contig_pages_noprof()
7106 spin_unlock_irqrestore(&zone->lock, flags); in alloc_contig_pages_noprof()
7111 spin_lock_irqsave(&zone->lock, flags); in alloc_contig_pages_noprof()
7115 spin_unlock_irqrestore(&zone->lock, flags); in alloc_contig_pages_noprof()
7137 for (; nr_pages--; pfn++) { in free_contig_range()
7148 * Effectively disable pcplists for the zone by setting the high limit to 0
7151 * will be drained, or observe the new high limit and skip the pcplist.
7164 __zone_set_pageset_high_and_batch(zone, zone->pageset_high_min, in zone_pcp_enable()
7165 zone->pageset_high_max, zone->pageset_batch); in zone_pcp_enable()
7174 if (zone->per_cpu_pageset != &boot_pageset) { in zone_pcp_reset()
7176 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); in zone_pcp_reset()
7179 free_percpu(zone->per_cpu_pageset); in zone_pcp_reset()
7180 zone->per_cpu_pageset = &boot_pageset; in zone_pcp_reset()
7181 if (zone->per_cpu_zonestats != &boot_zonestats) { in zone_pcp_reset()
7182 free_percpu(zone->per_cpu_zonestats); in zone_pcp_reset()
7183 zone->per_cpu_zonestats = &boot_zonestats; in zone_pcp_reset()
7193 * Returns the number of managed (non-PageOffline()) pages in the range: the
7208 spin_lock_irqsave(&zone->lock, flags); in __offline_isolated_pages()
7238 spin_unlock_irqrestore(&zone->lock, flags); in __offline_isolated_pages()
7240 return end_pfn - start_pfn - already_offline; in __offline_isolated_pages()
7253 const struct page *head = page - (pfn & ((1 << order) - 1)); in is_free_buddy_page()
7274 * Break down a higher-order page in sub-pages, and keep our target out of
7278 struct page *target, int low, int high, in break_down_buddy_pages() argument
7281 unsigned long size = 1 << high; in break_down_buddy_pages()
7284 while (high > low) { in break_down_buddy_pages()
7285 high--; in break_down_buddy_pages()
7295 if (set_page_guard(zone, current_buddy, high)) in break_down_buddy_pages()
7298 add_to_free_list(current_buddy, zone, high, migratetype, false); in break_down_buddy_pages()
7299 set_buddy_order(current_buddy, high); in break_down_buddy_pages()
7314 spin_lock_irqsave(&zone->lock, flags); in take_page_off_buddy()
7316 struct page *page_head = page - (pfn & ((1 << order) - 1)); in take_page_off_buddy()
7335 spin_unlock_irqrestore(&zone->lock, flags); in take_page_off_buddy()
7348 spin_lock_irqsave(&zone->lock, flags); in put_page_back_buddy()
7359 spin_unlock_irqrestore(&zone->lock, flags); in put_page_back_buddy()
7371 struct zone *zone = &pgdat->node_zones[ZONE_DMA]; in has_managed_dma()
7393 return -EINVAL; in accept_memory_parse()
7408 list_del(&page->lru); in __accept_page()
7409 account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); in __accept_page()
7410 __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES); in __accept_page()
7412 spin_unlock_irqrestore(&zone->lock, *flags); in __accept_page()
7424 spin_lock_irqsave(&zone->lock, flags); in accept_page()
7426 spin_unlock_irqrestore(&zone->lock, flags); in accept_page()
7430 /* Unlocks zone->lock */ in accept_page()
7439 spin_lock_irqsave(&zone->lock, flags); in try_to_accept_memory_one()
7440 page = list_first_entry_or_null(&zone->unaccepted_pages, in try_to_accept_memory_one()
7443 spin_unlock_irqrestore(&zone->lock, flags); in try_to_accept_memory_one()
7447 /* Unlocks zone->lock */ in try_to_accept_memory_one()
7459 if (list_empty(&zone->unaccepted_pages)) in cond_accept_memory()
7477 to_accept = wmark - in cond_accept_memory()
7478 (zone_page_state(zone, NR_FREE_PAGES) - in cond_accept_memory()
7479 __zone_watermark_unusable_free(zone, order, 0) - in cond_accept_memory()
7486 to_accept -= MAX_ORDER_NR_PAGES; in cond_accept_memory()
7500 spin_lock_irqsave(&zone->lock, flags); in __free_unaccepted()
7501 list_add_tail(&page->lru, &zone->unaccepted_pages); in __free_unaccepted()
7505 spin_unlock_irqrestore(&zone->lock, flags); in __free_unaccepted()
7588 * If it's empty attempt to spin_trylock zone->lock. in alloc_frozen_pages_nolock_noprof()
7604 * alloc_pages_nolock - opportunistic reentrant allocation from any context
7611 * allocator -> tracepoint -> alloc_pages_nolock_noprof).