xref: /linux/mm/ksm.c (revision ab1c247094e323177a578b38f0325bf79f0317ac)
17a338472SThomas Gleixner // SPDX-License-Identifier: GPL-2.0-only
2f8af4da3SHugh Dickins /*
331dbd01fSIzik Eidus  * Memory merging support.
431dbd01fSIzik Eidus  *
531dbd01fSIzik Eidus  * This code enables dynamic sharing of identical pages found in different
631dbd01fSIzik Eidus  * memory areas, even if they are not shared by fork()
731dbd01fSIzik Eidus  *
836b2528dSIzik Eidus  * Copyright (C) 2008-2009 Red Hat, Inc.
931dbd01fSIzik Eidus  * Authors:
1031dbd01fSIzik Eidus  *	Izik Eidus
1131dbd01fSIzik Eidus  *	Andrea Arcangeli
1231dbd01fSIzik Eidus  *	Chris Wright
1336b2528dSIzik Eidus  *	Hugh Dickins
14f8af4da3SHugh Dickins  */
15f8af4da3SHugh Dickins 
16f8af4da3SHugh Dickins #include <linux/errno.h>
1731dbd01fSIzik Eidus #include <linux/mm.h>
1836090defSArnd Bergmann #include <linux/mm_inline.h>
1931dbd01fSIzik Eidus #include <linux/fs.h>
20f8af4da3SHugh Dickins #include <linux/mman.h>
2131dbd01fSIzik Eidus #include <linux/sched.h>
226e84f315SIngo Molnar #include <linux/sched/mm.h>
23f7ccbae4SIngo Molnar #include <linux/sched/coredump.h>
2431dbd01fSIzik Eidus #include <linux/sched/cputime.h>
2531dbd01fSIzik Eidus #include <linux/rwsem.h>
2631dbd01fSIzik Eidus #include <linux/pagemap.h>
2731dbd01fSIzik Eidus #include <linux/rmap.h>
2859e1a2f4STimofey Titovets #include <linux/spinlock.h>
2931dbd01fSIzik Eidus #include <linux/xxhash.h>
3031dbd01fSIzik Eidus #include <linux/delay.h>
3131dbd01fSIzik Eidus #include <linux/kthread.h>
3231dbd01fSIzik Eidus #include <linux/wait.h>
3331dbd01fSIzik Eidus #include <linux/slab.h>
3462b61f61SHugh Dickins #include <linux/rbtree.h>
3531dbd01fSIzik Eidus #include <linux/memory.h>
362c6854fdSIzik Eidus #include <linux/mmu_notifier.h>
37f8af4da3SHugh Dickins #include <linux/swap.h>
384ca3a69bSSasha Levin #include <linux/ksm.h>
39878aee7dSAndrea Arcangeli #include <linux/hashtable.h>
4072788c38SDavid Rientjes #include <linux/freezer.h>
4190bd6fd3SPetr Holasek #include <linux/oom.h>
42d7c0e68dSDavid Hildenbrand #include <linux/numa.h>
43f8af4da3SHugh Dickins #include <linux/pagewalk.h>
4431dbd01fSIzik Eidus 
4573848b46SHugh Dickins #include <asm/tlbflush.h>
4658730ab6SQi Zheng #include "internal.h"
4731dbd01fSIzik Eidus #include "mm_slot.h"
48739100c8SStefan Roesch 
49739100c8SStefan Roesch #define CREATE_TRACE_POINTS
50739100c8SStefan Roesch #include <trace/events/ksm.h>
51e850dcf5SHugh Dickins 
52e850dcf5SHugh Dickins #ifdef CONFIG_NUMA
53e850dcf5SHugh Dickins #define NUMA(x)		(x)
54e850dcf5SHugh Dickins #define DO_NUMA(x)	do { (x); } while (0)
55e850dcf5SHugh Dickins #else
56e850dcf5SHugh Dickins #define NUMA(x)		(0)
57e850dcf5SHugh Dickins #define DO_NUMA(x)	do { } while (0)
58e850dcf5SHugh Dickins #endif
595e924ff5SStefan Roesch 
605e924ff5SStefan Roesch typedef u8 rmap_age_t;
615a2ca3efSMike Rapoport 
625a2ca3efSMike Rapoport /**
635a2ca3efSMike Rapoport  * DOC: Overview
6431dbd01fSIzik Eidus  *
6531dbd01fSIzik Eidus  * A few notes about the KSM scanning process,
6631dbd01fSIzik Eidus  * to make it easier to understand the data structures below:
6731dbd01fSIzik Eidus  *
6831dbd01fSIzik Eidus  * In order to reduce excessive scanning, KSM sorts the memory pages by their
6931dbd01fSIzik Eidus  * contents into a data structure that holds pointers to the pages' locations.
7031dbd01fSIzik Eidus  *
7131dbd01fSIzik Eidus  * Since the contents of the pages may change at any moment, KSM cannot just
7231dbd01fSIzik Eidus  * insert the pages into a normal sorted tree and expect it to find anything.
7331dbd01fSIzik Eidus  * Therefore KSM uses two data structures - the stable and the unstable tree.
7431dbd01fSIzik Eidus  *
7531dbd01fSIzik Eidus  * The stable tree holds pointers to all the merged pages (ksm pages), sorted
7631dbd01fSIzik Eidus  * by their contents.  Because each such page is write-protected, searching on
7731dbd01fSIzik Eidus  * this tree is fully assured to be working (except when pages are unmapped),
7831dbd01fSIzik Eidus  * and therefore this tree is called the stable tree.
795a2ca3efSMike Rapoport  *
805a2ca3efSMike Rapoport  * The stable tree node includes information required for reverse
815a2ca3efSMike Rapoport  * mapping from a KSM page to virtual addresses that map this page.
825a2ca3efSMike Rapoport  *
835a2ca3efSMike Rapoport  * In order to avoid large latencies of the rmap walks on KSM pages,
845a2ca3efSMike Rapoport  * KSM maintains two types of nodes in the stable tree:
855a2ca3efSMike Rapoport  *
865a2ca3efSMike Rapoport  * * the regular nodes that keep the reverse mapping structures in a
875a2ca3efSMike Rapoport  *   linked list
885a2ca3efSMike Rapoport  * * the "chains" that link nodes ("dups") that represent the same
895a2ca3efSMike Rapoport  *   write protected memory content, but each "dup" corresponds to a
905a2ca3efSMike Rapoport  *   different KSM page copy of that content
915a2ca3efSMike Rapoport  *
9221fbd591SQi Zheng  * Internally, the regular nodes, "dups" and "chains" are represented
935a2ca3efSMike Rapoport  * using the same struct ksm_stable_node structure.
9431dbd01fSIzik Eidus  *
9531dbd01fSIzik Eidus  * In addition to the stable tree, KSM uses a second data structure called the
9631dbd01fSIzik Eidus  * unstable tree: this tree holds pointers to pages which have been found to
9731dbd01fSIzik Eidus  * be "unchanged for a period of time".  The unstable tree sorts these pages
9831dbd01fSIzik Eidus  * by their contents, but since they are not write-protected, KSM cannot rely
9931dbd01fSIzik Eidus  * upon the unstable tree to work correctly - the unstable tree is liable to
10031dbd01fSIzik Eidus  * be corrupted as its contents are modified, and so it is called unstable.
10131dbd01fSIzik Eidus  *
10231dbd01fSIzik Eidus  * KSM solves this problem by several techniques:
10331dbd01fSIzik Eidus  *
10431dbd01fSIzik Eidus  * 1) The unstable tree is flushed every time KSM completes scanning all
10531dbd01fSIzik Eidus  *    memory areas, and then the tree is rebuilt again from the beginning.
10631dbd01fSIzik Eidus  * 2) KSM will only insert into the unstable tree pages whose hash value
10731dbd01fSIzik Eidus  *    has not changed since the previous scan of all memory areas.
10831dbd01fSIzik Eidus  * 3) The unstable tree is a red-black tree - so its balancing is based on the
10931dbd01fSIzik Eidus  *    colors of the nodes and not on their contents, assuring that even when
11031dbd01fSIzik Eidus  *    the tree gets "corrupted" it won't get out of balance, so scanning time
11131dbd01fSIzik Eidus  *    remains the same (also, searching and inserting nodes in an rbtree uses
11231dbd01fSIzik Eidus  *    the same algorithm, so we have no overhead when we flush and rebuild).
11331dbd01fSIzik Eidus  * 4) KSM never flushes the stable tree, which means that even if it were to
11431dbd01fSIzik Eidus  *    take 10 attempts to find a page in the unstable tree, once it is found,
11531dbd01fSIzik Eidus  *    it is secured in the stable tree.  (When we scan a new page, we first
1168fdb3dbfSHugh Dickins  *    compare it against the stable tree, and then against the unstable tree.)
1178fdb3dbfSHugh Dickins  *
1188fdb3dbfSHugh Dickins  * If the merge_across_nodes tunable is unset, then KSM maintains multiple
11931dbd01fSIzik Eidus  * stable trees and multiple unstable trees: one of each for each NUMA node.
12031dbd01fSIzik Eidus  */
12131dbd01fSIzik Eidus 
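/*
 * Editor's note -- illustrative userspace sketch, not part of ksm.c: the
 * memory areas mentioned above only become KSM candidates after a process
 * opts them in with madvise(MADV_MERGEABLE), and merging only proceeds
 * while ksmd is enabled via /sys/kernel/mm/ksm/run (its rate is governed
 * by the pages_to_scan and sleep_millisecs knobs).  The buffer size and
 * the sleep below are arbitrary choices for the example.
 */
#if 0	/* userspace illustration only; never built as part of the kernel */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 64 * 4096;
	/* Two private anonymous buffers filled with identical content. */
	char *a = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char *b = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (a == MAP_FAILED || b == MAP_FAILED)
		return 1;
	memset(a, 0x5a, len);
	memset(b, 0x5a, len);

	/* Ask KSM to consider both ranges for merging. */
	if (madvise(a, len, MADV_MERGEABLE) || madvise(b, len, MADV_MERGEABLE))
		perror("madvise(MADV_MERGEABLE)");

	/* Watch /sys/kernel/mm/ksm/pages_shared grow while this sleeps. */
	sleep(60);
	return 0;
}
#endif
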
12221fbd591SQi Zheng /**
12358730ab6SQi Zheng  * struct ksm_mm_slot - ksm information per mm that is being scanned
1246514d511SHugh Dickins  * @slot: hash lookup from mm to mm_slot
12531dbd01fSIzik Eidus  * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
12621fbd591SQi Zheng  */
12758730ab6SQi Zheng struct ksm_mm_slot {
12821fbd591SQi Zheng 	struct mm_slot slot;
12931dbd01fSIzik Eidus 	struct ksm_rmap_item *rmap_list;
13031dbd01fSIzik Eidus };
13131dbd01fSIzik Eidus 
13231dbd01fSIzik Eidus /**
13331dbd01fSIzik Eidus  * struct ksm_scan - cursor for scanning
13431dbd01fSIzik Eidus  * @mm_slot: the current mm_slot we are scanning
1356514d511SHugh Dickins  * @address: the next address inside that to be scanned
13631dbd01fSIzik Eidus  * @rmap_list: link to the next rmap to be scanned in the rmap_list
13731dbd01fSIzik Eidus  * @seqnr: count of completed full scans (needed when removing unstable node)
13831dbd01fSIzik Eidus  *
13931dbd01fSIzik Eidus  * There is only the one ksm_scan instance of this cursor structure.
14031dbd01fSIzik Eidus  */
14121fbd591SQi Zheng struct ksm_scan {
14231dbd01fSIzik Eidus 	struct ksm_mm_slot *mm_slot;
14321fbd591SQi Zheng 	unsigned long address;
14431dbd01fSIzik Eidus 	struct ksm_rmap_item **rmap_list;
14531dbd01fSIzik Eidus 	unsigned long seqnr;
14631dbd01fSIzik Eidus };
14731dbd01fSIzik Eidus 
14821fbd591SQi Zheng /**
1497b6ba2c7SHugh Dickins  * struct ksm_stable_node - node of the stable rbtree
1504146d2d6SHugh Dickins  * @node: rb node of this ksm page in the stable tree
1512c653d0eSAndrea Arcangeli  * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
1524146d2d6SHugh Dickins  * @hlist_dup: linked into the stable_node->hlist with a stable_node chain
1537b6ba2c7SHugh Dickins  * @list: linked into migrate_nodes, pending placement in the proper node tree
1544146d2d6SHugh Dickins  * @hlist: hlist head of rmap_items using this ksm page
1552c653d0eSAndrea Arcangeli  * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
1562c653d0eSAndrea Arcangeli  * @chain_prune_time: time of the last full garbage collection
1574146d2d6SHugh Dickins  * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN
1587b6ba2c7SHugh Dickins  * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
15921fbd591SQi Zheng  */
1604146d2d6SHugh Dickins struct ksm_stable_node {
1614146d2d6SHugh Dickins 	union {
1624146d2d6SHugh Dickins 		struct rb_node node;	/* when node of stable tree */
1634146d2d6SHugh Dickins 		struct {		/* when listed for migration */
1642c653d0eSAndrea Arcangeli 			struct list_head *head;
1652c653d0eSAndrea Arcangeli 			struct {
1664146d2d6SHugh Dickins 				struct hlist_node hlist_dup;
1674146d2d6SHugh Dickins 				struct list_head list;
1684146d2d6SHugh Dickins 			};
1692c653d0eSAndrea Arcangeli 		};
1707b6ba2c7SHugh Dickins 	};
1712c653d0eSAndrea Arcangeli 	struct hlist_head hlist;
17262b61f61SHugh Dickins 	union {
1732c653d0eSAndrea Arcangeli 		unsigned long kpfn;
1742c653d0eSAndrea Arcangeli 		unsigned long chain_prune_time;
1752c653d0eSAndrea Arcangeli 	};
1762c653d0eSAndrea Arcangeli 	/*
1772c653d0eSAndrea Arcangeli 	 * STABLE_NODE_CHAIN can be any negative number in
1782c653d0eSAndrea Arcangeli 	 * STABLE_NODE_CHAIN can be any negative number in the
1792c653d0eSAndrea Arcangeli 	 * rmap_hlist_len negative range, but it is better not to use -1,
1802c653d0eSAndrea Arcangeli 	 * so that underflows can be reliably detected.
1812c653d0eSAndrea Arcangeli #define STABLE_NODE_CHAIN -1024
1824146d2d6SHugh Dickins 	int rmap_hlist_len;
1834146d2d6SHugh Dickins #ifdef CONFIG_NUMA
1844146d2d6SHugh Dickins 	int nid;
1857b6ba2c7SHugh Dickins #endif
1867b6ba2c7SHugh Dickins };
1877b6ba2c7SHugh Dickins 
18821fbd591SQi Zheng /**
1896514d511SHugh Dickins  * struct ksm_rmap_item - reverse mapping item for virtual addresses
190db114b83SHugh Dickins  * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
191bc56620bSHugh Dickins  * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
19231dbd01fSIzik Eidus  * @nid: NUMA node id of unstable tree in which linked (may not match page)
19331dbd01fSIzik Eidus  * @mm: the memory structure this rmap_item is pointing into
19431dbd01fSIzik Eidus  * @address: the virtual address this rmap_item tracks (+ flags in low bits)
1957b6ba2c7SHugh Dickins  * @oldchecksum: previous checksum of the page at that virtual address
1967b6ba2c7SHugh Dickins  * @node: rb node of this rmap_item in the unstable tree
1977b6ba2c7SHugh Dickins  * @head: pointer to stable_node heading this list in the stable tree
1985e924ff5SStefan Roesch  * @hlist: link into hlist of rmap_items hanging off that stable_node
1995e924ff5SStefan Roesch  * @age: number of scan iterations since creation
20031dbd01fSIzik Eidus  * @remaining_skips: how many scans to skip
20121fbd591SQi Zheng  */
20221fbd591SQi Zheng struct ksm_rmap_item {
203bc56620bSHugh Dickins 	struct ksm_rmap_item *rmap_list;
204db114b83SHugh Dickins 	union {
205bc56620bSHugh Dickins 		struct anon_vma *anon_vma;	/* when stable */
206bc56620bSHugh Dickins #ifdef CONFIG_NUMA
207bc56620bSHugh Dickins 		int nid;		/* when node of unstable tree */
208bc56620bSHugh Dickins #endif
20931dbd01fSIzik Eidus 	};
21031dbd01fSIzik Eidus 	struct mm_struct *mm;
21131dbd01fSIzik Eidus 	unsigned long address;		/* + low bits used for flags below */
2125e924ff5SStefan Roesch 	unsigned int oldchecksum;	/* when unstable */
2135e924ff5SStefan Roesch 	rmap_age_t age;
21431dbd01fSIzik Eidus 	rmap_age_t remaining_skips;
2157b6ba2c7SHugh Dickins 	union {
2167b6ba2c7SHugh Dickins 		struct rb_node node;	/* when node of unstable tree */
21721fbd591SQi Zheng 		struct {		/* when listed from stable tree */
2187b6ba2c7SHugh Dickins 			struct ksm_stable_node *head;
2197b6ba2c7SHugh Dickins 			struct hlist_node hlist;
22031dbd01fSIzik Eidus 		};
22131dbd01fSIzik Eidus 	};
22231dbd01fSIzik Eidus };
22331dbd01fSIzik Eidus 
2247b6ba2c7SHugh Dickins #define SEQNR_MASK	0x0ff	/* low bits of unstable tree seqnr */
2257b6ba2c7SHugh Dickins #define UNSTABLE_FLAG	0x100	/* is a node of the unstable tree */
22631dbd01fSIzik Eidus #define STABLE_FLAG	0x200	/* is listed from the stable tree */
22731dbd01fSIzik Eidus 
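/*
 * Editor's worked example (illustrative, matching the flag layout above):
 * rmap_item->address keeps the page-aligned virtual address in its high
 * bits, so the low bits below PAGE_SHIFT are free for bookkeeping.  While
 * an item sits in the unstable tree, ksm.c stores both UNSTABLE_FLAG and
 * the low bits of the scan sequence number there, roughly:
 *
 *	rmap_item->address |= UNSTABLE_FLAG;
 *	rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
 *
 * and the plain address is recovered with rmap_item->address & PAGE_MASK,
 * as seen in remove_rmap_item_from_tree() below.
 */
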
228ef53d16cSHugh Dickins /* The stable and unstable tree heads */
229ef53d16cSHugh Dickins static struct rb_root one_stable_tree[1] = { RB_ROOT };
230ef53d16cSHugh Dickins static struct rb_root one_unstable_tree[1] = { RB_ROOT };
231ef53d16cSHugh Dickins static struct rb_root *root_stable_tree = one_stable_tree;
23231dbd01fSIzik Eidus static struct rb_root *root_unstable_tree = one_unstable_tree;
2334146d2d6SHugh Dickins 
2344146d2d6SHugh Dickins /* Recently migrated nodes of stable tree, pending proper placement */
2352c653d0eSAndrea Arcangeli static LIST_HEAD(migrate_nodes);
2364146d2d6SHugh Dickins #define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev)
2374ca3a69bSSasha Levin 
2384ca3a69bSSasha Levin #define MM_SLOTS_HASH_BITS 10
23931dbd01fSIzik Eidus static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
24021fbd591SQi Zheng 
24158730ab6SQi Zheng static struct ksm_mm_slot ksm_mm_head = {
24231dbd01fSIzik Eidus 	.slot.mm_node = LIST_HEAD_INIT(ksm_mm_head.slot.mm_node),
24331dbd01fSIzik Eidus };
24431dbd01fSIzik Eidus static struct ksm_scan ksm_scan = {
24531dbd01fSIzik Eidus 	.mm_slot = &ksm_mm_head,
24631dbd01fSIzik Eidus };
24731dbd01fSIzik Eidus 
2487b6ba2c7SHugh Dickins static struct kmem_cache *rmap_item_cache;
24931dbd01fSIzik Eidus static struct kmem_cache *stable_node_cache;
25031dbd01fSIzik Eidus static struct kmem_cache *mm_slot_cache;
251b348b5feSStefan Roesch 
252b348b5feSStefan Roesch /* Default number of pages to scan per batch */
253b348b5feSStefan Roesch #define DEFAULT_PAGES_TO_SCAN 100
25431dbd01fSIzik Eidus 
255b4028260SHugh Dickins /* The number of pages scanned */
25631dbd01fSIzik Eidus static unsigned long ksm_pages_scanned;
257e178dfdeSHugh Dickins 
258b4028260SHugh Dickins /* The number of nodes in the stable tree */
25931dbd01fSIzik Eidus static unsigned long ksm_pages_shared;
260473b0ce4SHugh Dickins 
261473b0ce4SHugh Dickins /* The number of page slots additionally sharing those nodes */
262473b0ce4SHugh Dickins static unsigned long ksm_pages_sharing;
263473b0ce4SHugh Dickins 
264473b0ce4SHugh Dickins /* The number of nodes in the unstable tree */
265473b0ce4SHugh Dickins static unsigned long ksm_pages_unshared;
2662c653d0eSAndrea Arcangeli 
2672c653d0eSAndrea Arcangeli /* The number of rmap_items in use: to calculate pages_volatile */
2682c653d0eSAndrea Arcangeli static unsigned long ksm_rmap_items;
2692c653d0eSAndrea Arcangeli 
2702c653d0eSAndrea Arcangeli /* The number of stable_node chains */
2712c653d0eSAndrea Arcangeli static unsigned long ksm_stable_node_chains;
2722c653d0eSAndrea Arcangeli 
273584ff0dfSZhansaya Bagdauletkyzy /* The number of stable_node dups linked to the stable_node chains */
2742c653d0eSAndrea Arcangeli static unsigned long ksm_stable_node_dups;
2752c653d0eSAndrea Arcangeli 
2762c653d0eSAndrea Arcangeli /* Delay in pruning stale stable_node_dups in the stable_node_chains */
2772c653d0eSAndrea Arcangeli static unsigned int ksm_stable_node_chains_prune_millisecs = 2000;
27831dbd01fSIzik Eidus 
2792c6854fdSIzik Eidus /* Maximum number of page slots sharing a stable node */
28031dbd01fSIzik Eidus static int ksm_max_page_sharing = 256;
28131dbd01fSIzik Eidus 
2822ffd8679SHugh Dickins /* Number of pages ksmd should scan in one batch */
28331dbd01fSIzik Eidus static unsigned int ksm_thread_pages_to_scan = DEFAULT_PAGES_TO_SCAN;
284e86c59b1SClaudio Imbrenda 
285e86c59b1SClaudio Imbrenda /* Milliseconds ksmd should sleep between batches */
286e86c59b1SClaudio Imbrenda static unsigned int ksm_thread_sleep_millisecs = 20;
287e86c59b1SClaudio Imbrenda 
288e86c59b1SClaudio Imbrenda /* Checksum of an empty (zeroed) page */
289e86c59b1SClaudio Imbrenda static unsigned int zero_checksum __read_mostly;
2905e924ff5SStefan Roesch 
2915e924ff5SStefan Roesch /* Whether to merge empty (zeroed) pages with actual zero pages */
2925e924ff5SStefan Roesch static bool ksm_use_zero_pages __read_mostly;
2935e924ff5SStefan Roesch 
294e2942062Sxu xin /* Skip pages that couldn't be de-duplicated previously */
295e2942062Sxu xin /* Default to true at least temporarily, for testing */
296e2942062Sxu xin static bool ksm_smart_scan = true;
297e5a68991SStefan Roesch 
298e5a68991SStefan Roesch /* The number of zero pages which is placed by KSM */
299e5a68991SStefan Roesch unsigned long ksm_zero_pages;
300e850dcf5SHugh Dickins 
30190bd6fd3SPetr Holasek /* The number of pages that have been skipped due to "smart scanning" */
30290bd6fd3SPetr Holasek static unsigned long ksm_pages_skipped;
303ef53d16cSHugh Dickins 
304e850dcf5SHugh Dickins /* Don't scan more than max pages per batch. */
305e850dcf5SHugh Dickins static unsigned long ksm_advisor_max_pages_to_scan = 30000;
306ef53d16cSHugh Dickins 
307e850dcf5SHugh Dickins /* Min CPU for scanning pages per scan */
30890bd6fd3SPetr Holasek #define KSM_ADVISOR_MIN_CPU 10
30931dbd01fSIzik Eidus 
31031dbd01fSIzik Eidus /* Max CPU for scanning pages per scan */
31131dbd01fSIzik Eidus static unsigned int ksm_advisor_max_cpu =  70;
312ef4d43a8SHugh Dickins 
313ef4d43a8SHugh Dickins /* Target scan time in seconds to analyze all KSM candidate pages. */
314ef4d43a8SHugh Dickins static unsigned long ksm_advisor_target_scan_time = 200;
31531dbd01fSIzik Eidus 
31631dbd01fSIzik Eidus /* Exponentially weighted moving average. */
317fcf9a0efSKirill Tkhai #define EWMA_WEIGHT 30
31831dbd01fSIzik Eidus 
31931dbd01fSIzik Eidus /**
32031dbd01fSIzik Eidus  * struct advisor_ctx - metadata for KSM advisor
32121fbd591SQi Zheng  * @start_scan: start time of the current scan
32231dbd01fSIzik Eidus  * @scan_time: scan time of previous scan
32331dbd01fSIzik Eidus  * @change: change in percent to pages_to_scan parameter
32431dbd01fSIzik Eidus  * @cpu_time: cpu time consumed by the ksmd thread in the previous scan
32531dbd01fSIzik Eidus  */
32631dbd01fSIzik Eidus struct advisor_ctx {
32721fbd591SQi Zheng 	ktime_t start_scan;
32831dbd01fSIzik Eidus 	unsigned long scan_time;
32931dbd01fSIzik Eidus 	unsigned long change;
33031dbd01fSIzik Eidus 	unsigned long long cpu_time;
33121fbd591SQi Zheng };
3327b6ba2c7SHugh Dickins static struct advisor_ctx advisor_ctx;
3337b6ba2c7SHugh Dickins 
3347b6ba2c7SHugh Dickins /* Define the different advisors */
33521fbd591SQi Zheng enum ksm_advisor_type {
33631dbd01fSIzik Eidus 	KSM_ADVISOR_NONE,
3377b6ba2c7SHugh Dickins 	KSM_ADVISOR_SCAN_TIME,
33831dbd01fSIzik Eidus };
33931dbd01fSIzik Eidus static enum ksm_advisor_type ksm_advisor;
34031dbd01fSIzik Eidus 
3417b6ba2c7SHugh Dickins #ifdef CONFIG_SYSFS
3427b6ba2c7SHugh Dickins /*
3437b6ba2c7SHugh Dickins  * Only called through the sysfs control interface:
34431dbd01fSIzik Eidus  */
34531dbd01fSIzik Eidus 
34631dbd01fSIzik Eidus /* At least scan this many pages per batch. */
34731dbd01fSIzik Eidus static unsigned long ksm_advisor_min_pages_to_scan = 500;
34831dbd01fSIzik Eidus 
34931dbd01fSIzik Eidus static void set_advisor_defaults(void)
35031dbd01fSIzik Eidus {
35131dbd01fSIzik Eidus 	if (ksm_advisor == KSM_ADVISOR_NONE) {
3527b6ba2c7SHugh Dickins 		ksm_thread_pages_to_scan = DEFAULT_PAGES_TO_SCAN;
35331dbd01fSIzik Eidus 	} else if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) {
35431dbd01fSIzik Eidus 		advisor_ctx = (const struct advisor_ctx){ 0 };
35531dbd01fSIzik Eidus 		ksm_thread_pages_to_scan = ksm_advisor_min_pages_to_scan;
35631dbd01fSIzik Eidus 	}
35721fbd591SQi Zheng }
3582c653d0eSAndrea Arcangeli #endif /* CONFIG_SYSFS */
3592c653d0eSAndrea Arcangeli 
3602c653d0eSAndrea Arcangeli static inline void advisor_start_scan(void)
3612c653d0eSAndrea Arcangeli {
36221fbd591SQi Zheng 	if (ksm_advisor == KSM_ADVISOR_SCAN_TIME)
3632c653d0eSAndrea Arcangeli 		advisor_ctx.start_scan = ktime_get();
3642c653d0eSAndrea Arcangeli }
3652c653d0eSAndrea Arcangeli 
3662c653d0eSAndrea Arcangeli /*
36721fbd591SQi Zheng  * Use previous scan time if available, otherwise use current scan time as an
36821fbd591SQi Zheng  * approximation for the previous scan time.
3692c653d0eSAndrea Arcangeli  */
3702c653d0eSAndrea Arcangeli static inline unsigned long prev_scan_time(struct advisor_ctx *ctx,
3712c653d0eSAndrea Arcangeli 					   unsigned long scan_time)
3722c653d0eSAndrea Arcangeli {
3732c653d0eSAndrea Arcangeli 	return ctx->scan_time ? ctx->scan_time : scan_time;
3742c653d0eSAndrea Arcangeli }
3752c653d0eSAndrea Arcangeli 
3762c653d0eSAndrea Arcangeli /* Calculate exponential weighted moving average */
37721fbd591SQi Zheng static unsigned long ewma(unsigned long prev, unsigned long curr)
3782c653d0eSAndrea Arcangeli {
379b4fecc67SAndrea Arcangeli 	return ((100 - EWMA_WEIGHT) * prev + EWMA_WEIGHT * curr) / 100;
3802c653d0eSAndrea Arcangeli }
3812c653d0eSAndrea Arcangeli 
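/*
 * Editor's worked example (using EWMA_WEIGHT == 30 as defined above):
 * with a previous value of 100 and a current sample of 200,
 *
 *	ewma(100, 200) = (70 * 100 + 30 * 200) / 100 = 130
 *
 * i.e. each call moves the average 30% of the way from the old value
 * toward the new sample, damping sudden swings in the measurement.
 */
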
3822c653d0eSAndrea Arcangeli /*
3832c653d0eSAndrea Arcangeli  * The scan time advisor is based on the current scan rate and the target
38421fbd591SQi Zheng  * scan rate.
3852c653d0eSAndrea Arcangeli  *
3862c653d0eSAndrea Arcangeli  *      new_pages_to_scan = pages_to_scan * (scan_time / target_scan_time)
3872c653d0eSAndrea Arcangeli  *
3882c653d0eSAndrea Arcangeli  * To avoid perturbations it calculates a change factor of previous changes.
3892c653d0eSAndrea Arcangeli  * A new change factor is calculated for each iteration and it uses an
3902c653d0eSAndrea Arcangeli  * exponentially weighted moving average. The new pages_to_scan value is
3912c653d0eSAndrea Arcangeli  * multiplied with that change factor:
3922c653d0eSAndrea Arcangeli  *
3932c653d0eSAndrea Arcangeli  *      new_pages_to_scan *= change factor
3942c653d0eSAndrea Arcangeli  *
3952c653d0eSAndrea Arcangeli  * The new_pages_to_scan value is limited by the cpu min and max values. It
39621fbd591SQi Zheng  * calculates the cpu percent for the last scan and calculates the new
39731dbd01fSIzik Eidus  * estimated cpu percent cost for the next scan. That value is capped by the
39821fbd591SQi Zheng  * cpu min and max setting.
399473b0ce4SHugh Dickins  *
4005b398e41Szhong jiang  * In addition the new pages_to_scan value is capped by the max and min
4015b398e41Szhong jiang  * limits.
402473b0ce4SHugh Dickins  */
403473b0ce4SHugh Dickins static void scan_time_advisor(void)
404473b0ce4SHugh Dickins {
40531dbd01fSIzik Eidus 	unsigned int cpu_percent;
40631dbd01fSIzik Eidus 	unsigned long cpu_time;
40721fbd591SQi Zheng 	unsigned long cpu_time_diff;
40831dbd01fSIzik Eidus 	unsigned long cpu_time_diff_ms;
409473b0ce4SHugh Dickins 	unsigned long pages;
410cb4df4caSxu xin 	unsigned long per_page_cost;
41131dbd01fSIzik Eidus 	unsigned long factor;
41231dbd01fSIzik Eidus 	unsigned long change;
41331dbd01fSIzik Eidus 	unsigned long last_scan_time;
41431dbd01fSIzik Eidus 	unsigned long scan_time;
41521fbd591SQi Zheng 
4167b6ba2c7SHugh Dickins 	/* Convert scan time to seconds */
4176213055fSzhong jiang 	scan_time = div_s64(ktime_ms_delta(ktime_get(), advisor_ctx.start_scan),
4186213055fSzhong jiang 			    MSEC_PER_SEC);
4196213055fSzhong jiang 	scan_time = scan_time ? scan_time : 1;
4206213055fSzhong jiang 
4216213055fSzhong jiang 	/* Calculate CPU consumption of ksmd background thread */
4226213055fSzhong jiang 	cpu_time = task_sched_runtime(current);
4237b6ba2c7SHugh Dickins 	cpu_time_diff = cpu_time - advisor_ctx.cpu_time;
4247b6ba2c7SHugh Dickins 	cpu_time_diff_ms = cpu_time_diff / 1000 / 1000;
42521fbd591SQi Zheng 
4267b6ba2c7SHugh Dickins 	cpu_percent = (cpu_time_diff_ms * 100) / (scan_time * 1000);
4272c653d0eSAndrea Arcangeli 	cpu_percent = cpu_percent ? cpu_percent : 1;
4282c653d0eSAndrea Arcangeli 	last_scan_time = prev_scan_time(&advisor_ctx, scan_time);
4297b6ba2c7SHugh Dickins 
4307b6ba2c7SHugh Dickins 	/* Calculate scan time as percentage of target scan time */
4317b6ba2c7SHugh Dickins 	factor = ksm_advisor_target_scan_time * 100 / scan_time;
43231dbd01fSIzik Eidus 	factor = factor ? factor : 1;
433a913e182SHugh Dickins 
434a913e182SHugh Dickins 	/*
435c1e8d7c6SMichel Lespinasse 	 * Calculate scan time as percentage of last scan time and use
436a913e182SHugh Dickins 	 * exponentially weighted average to smooth it
437a913e182SHugh Dickins 	 */
438a913e182SHugh Dickins 	change = scan_time * 100 / last_scan_time;
439a913e182SHugh Dickins 	change = change ? change : 1;
440a913e182SHugh Dickins 	change = ewma(advisor_ctx.change, change);
441a913e182SHugh Dickins 
442a913e182SHugh Dickins 	/* Calculate new scan rate based on target scan rate. */
443a913e182SHugh Dickins 	pages = ksm_thread_pages_to_scan * 100 / factor;
444a913e182SHugh Dickins 	/* Update pages_to_scan by weighted change percentage. */
445d7c0e68dSDavid Hildenbrand 	pages = pages * change / 100;
446d7c0e68dSDavid Hildenbrand 
447d7c0e68dSDavid Hildenbrand 	/* Cap new pages_to_scan value */
448d7c0e68dSDavid Hildenbrand 	per_page_cost = ksm_thread_pages_to_scan / cpu_percent;
449d7c0e68dSDavid Hildenbrand 	per_page_cost = per_page_cost ? per_page_cost : 1;
450d7c0e68dSDavid Hildenbrand 
451c33c7948SRyan Roberts 	pages = min(pages, per_page_cost * ksm_advisor_max_cpu);
452d7c0e68dSDavid Hildenbrand 	pages = max(pages, per_page_cost * KSM_ADVISOR_MIN_CPU);
453d7c0e68dSDavid Hildenbrand 	pages = min(pages, ksm_advisor_max_pages_to_scan);
454d7c0e68dSDavid Hildenbrand 
45504dee9e8SHugh Dickins 	/* Update advisor context */
45604dee9e8SHugh Dickins 	advisor_ctx.change = change;
457c33c7948SRyan Roberts 	advisor_ctx.scan_time = scan_time;
458c33c7948SRyan Roberts 	advisor_ctx.cpu_time = cpu_time;
459c33c7948SRyan Roberts 
460c33c7948SRyan Roberts 	ksm_thread_pages_to_scan = pages;
461c33c7948SRyan Roberts 	trace_ksm_advisor(scan_time, pages, cpu_percent);
462d7c0e68dSDavid Hildenbrand }
463d7c0e68dSDavid Hildenbrand 
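/*
 * Editor's worked example for scan_time_advisor() (illustrative numbers,
 * default tunables): assume pages_to_scan is currently 1000, the target
 * scan time is 200s, and the scan that just finished took 400s, the same
 * as the one before it.  Then
 *
 *	factor = 200 * 100 / 400 = 50
 *	pages  = 1000 * 100 / 50 = 2000
 *	change = 400 * 100 / 400 = 100, so the EWMA-smoothed change factor
 *		 stays near 100 and leaves pages at ~2000
 *
 * i.e. the advisor roughly doubles pages_to_scan so the next scan can
 * finish close to the 200s target, subject to the CPU and min/max caps
 * applied at the end of the function.
 */
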
464d7c0e68dSDavid Hildenbrand static void advisor_stop_scan(void)
465d7c0e68dSDavid Hildenbrand {
466d7c0e68dSDavid Hildenbrand 	if (ksm_advisor == KSM_ADVISOR_SCAN_TIME)
467d7c0e68dSDavid Hildenbrand 		scan_time_advisor();
468d7c0e68dSDavid Hildenbrand }
469d7c0e68dSDavid Hildenbrand 
47079271476Sxu xin #ifdef CONFIG_NUMA
471*afccb080SRyan Roberts /* Zeroed when merging across nodes is not allowed */
472d7c0e68dSDavid Hildenbrand static unsigned int ksm_merge_across_nodes = 1;
473d7c0e68dSDavid Hildenbrand static int ksm_nr_node_ids = 1;
474d7c0e68dSDavid Hildenbrand #else
475d7c0e68dSDavid Hildenbrand #define ksm_merge_across_nodes	1U
476d7c0e68dSDavid Hildenbrand #define ksm_nr_node_ids		1
477d7c0e68dSDavid Hildenbrand #endif
47849b06385SSuren Baghdasaryan 
47949b06385SSuren Baghdasaryan #define KSM_RUN_STOP	0
48049b06385SSuren Baghdasaryan #define KSM_RUN_MERGE	1
48149b06385SSuren Baghdasaryan #define KSM_RUN_UNMERGE	2
48249b06385SSuren Baghdasaryan #define KSM_RUN_OFFLINE	4
48349b06385SSuren Baghdasaryan static unsigned long ksm_run = KSM_RUN_STOP;
484d7c0e68dSDavid Hildenbrand static void wait_while_offlining(void);
485d7c0e68dSDavid Hildenbrand 
486a913e182SHugh Dickins static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
4876cce3314SDavid Hildenbrand static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait);
4886cce3314SDavid Hildenbrand static DEFINE_MUTEX(ksm_thread_mutex);
48931dbd01fSIzik Eidus static DEFINE_SPINLOCK(ksm_mmlist_lock);
4906cce3314SDavid Hildenbrand 
49131dbd01fSIzik Eidus #define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create(#__struct,\
49231dbd01fSIzik Eidus 		sizeof(struct __struct), __alignof__(struct __struct),\
493bbcd53c9SDavid Hildenbrand 		(__flags), NULL)
4941b2ee126SDave Hansen 
4956cce3314SDavid Hildenbrand static int __init ksm_slab_init(void)
4961b2ee126SDave Hansen {
4971b2ee126SDave Hansen 	rmap_item_cache = KSM_KMEM_CACHE(ksm_rmap_item, 0);
49831dbd01fSIzik Eidus 	if (!rmap_item_cache)
49949b06385SSuren Baghdasaryan 		goto out;
50031dbd01fSIzik Eidus 
50150a7ca3cSSouptick Joarder 	stable_node_cache = KSM_KMEM_CACHE(ksm_stable_node, 0);
50249b06385SSuren Baghdasaryan 	if (!stable_node_cache)
50349b06385SSuren Baghdasaryan 		goto out_free1;
50431dbd01fSIzik Eidus 
50531dbd01fSIzik Eidus 	mm_slot_cache = KSM_KMEM_CACHE(ksm_mm_slot, 0);
506d7c0e68dSDavid Hildenbrand 	if (!mm_slot_cache)
50758f595c6SDavid Hildenbrand 		goto out_free2;
50831dbd01fSIzik Eidus 
50949b06385SSuren Baghdasaryan 	return 0;
510d7c0e68dSDavid Hildenbrand 
511d7c0e68dSDavid Hildenbrand out_free2:
51258f595c6SDavid Hildenbrand 	kmem_cache_destroy(stable_node_cache);
51358f595c6SDavid Hildenbrand out_free1:
514dcddffd4SKirill A. Shutemov 	kmem_cache_destroy(rmap_item_cache);
5156cce3314SDavid Hildenbrand out:
516bce617edSPeter Xu 	return -ENOMEM;
51758f595c6SDavid Hildenbrand }
518d952b791SHugh Dickins 
51958f595c6SDavid Hildenbrand static void __init ksm_slab_free(void)
52058f595c6SDavid Hildenbrand {
52158f595c6SDavid Hildenbrand 	kmem_cache_destroy(mm_slot_cache);
522d952b791SHugh Dickins 	kmem_cache_destroy(stable_node_cache);
523d952b791SHugh Dickins 	kmem_cache_destroy(rmap_item_cache);
524d952b791SHugh Dickins 	mm_slot_cache = NULL;
525d952b791SHugh Dickins }
526d952b791SHugh Dickins 
527d952b791SHugh Dickins static __always_inline bool is_stable_node_chain(struct ksm_stable_node *chain)
528d952b791SHugh Dickins {
529d952b791SHugh Dickins 	return chain->rmap_hlist_len == STABLE_NODE_CHAIN;
530d952b791SHugh Dickins }
531d952b791SHugh Dickins 
532d952b791SHugh Dickins static __always_inline bool is_stable_node_dup(struct ksm_stable_node *dup)
533d952b791SHugh Dickins {
534d952b791SHugh Dickins 	return dup->head == STABLE_NODE_DUP_HEAD;
535d952b791SHugh Dickins }
536d952b791SHugh Dickins 
537d952b791SHugh Dickins static inline void stable_node_chain_add_dup(struct ksm_stable_node *dup,
538d952b791SHugh Dickins 					     struct ksm_stable_node *chain)
539d952b791SHugh Dickins {
540d952b791SHugh Dickins 	VM_BUG_ON(is_stable_node_dup(dup));
541d952b791SHugh Dickins 	dup->head = STABLE_NODE_DUP_HEAD;
542d952b791SHugh Dickins 	VM_BUG_ON(!is_stable_node_chain(chain));
543d952b791SHugh Dickins 	hlist_add_head(&dup->hlist_dup, &chain->hlist);
54431dbd01fSIzik Eidus 	ksm_stable_node_dups++;
54531dbd01fSIzik Eidus }
546d7597f59SStefan Roesch 
547d7597f59SStefan Roesch static inline void __stable_node_dup_del(struct ksm_stable_node *dup)
548d7597f59SStefan Roesch {
549d7597f59SStefan Roesch 	VM_BUG_ON(!is_stable_node_dup(dup));
550d7597f59SStefan Roesch 	hlist_del(&dup->hlist_dup);
551d7597f59SStefan Roesch 	ksm_stable_node_dups--;
552d7597f59SStefan Roesch }
553d7597f59SStefan Roesch 
554d7597f59SStefan Roesch static inline void stable_node_dup_del(struct ksm_stable_node *dup)
555d7597f59SStefan Roesch {
556d7597f59SStefan Roesch 	VM_BUG_ON(is_stable_node_chain(dup));
557d7597f59SStefan Roesch 	if (is_stable_node_dup(dup))
558d7597f59SStefan Roesch 		__stable_node_dup_del(dup);
559d7597f59SStefan Roesch 	else
560d7597f59SStefan Roesch 		rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid));
561d7597f59SStefan Roesch #ifdef CONFIG_DEBUG_VM
562d7597f59SStefan Roesch 	dup->head = NULL;
563d7597f59SStefan Roesch #endif
564d7597f59SStefan Roesch }
565d7597f59SStefan Roesch 
566d7597f59SStefan Roesch static inline struct ksm_rmap_item *alloc_rmap_item(void)
567d7597f59SStefan Roesch {
568ef694222SBob Liu 	struct ksm_rmap_item *rmap_item;
569ef694222SBob Liu 
570ef694222SBob Liu 	rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
571ef694222SBob Liu 						__GFP_NORETRY | __GFP_NOWARN);
572ef694222SBob Liu 	if (rmap_item)
573ef694222SBob Liu 		ksm_rmap_items++;
574ff69fb81SLiam Howlett 	return rmap_item;
575ff69fb81SLiam Howlett }
576ef694222SBob Liu 
577ef694222SBob Liu static inline void free_rmap_item(struct ksm_rmap_item *rmap_item)
578ef694222SBob Liu {
579ef694222SBob Liu 	ksm_rmap_items--;
58021fbd591SQi Zheng 	rmap_item->mm->ksm_rmap_items--;
58131dbd01fSIzik Eidus 	rmap_item->mm = NULL;	/* debug safety */
5828dd3557aSHugh Dickins 	kmem_cache_free(rmap_item_cache, rmap_item);
5838dd3557aSHugh Dickins }
58431dbd01fSIzik Eidus 
58531dbd01fSIzik Eidus static inline struct ksm_stable_node *alloc_stable_node(void)
5864035c07aSHugh Dickins {
5874035c07aSHugh Dickins 	/*
5884035c07aSHugh Dickins 	 * The allocation can take too long with GFP_KERNEL when memory is under
5894035c07aSHugh Dickins 	 * pressure, which may lead to hung task warnings.  Adding __GFP_HIGH
5909e60109fSPeter Zijlstra 	 * grants access to memory reserves, helping to avoid this problem.
5914035c07aSHugh Dickins 	 */
592d8ed45c5SMichel Lespinasse 	return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);
593ef694222SBob Liu }
594ef694222SBob Liu 
59549b06385SSuren Baghdasaryan static inline void free_stable_node(struct ksm_stable_node *stable_node)
596d8ed45c5SMichel Lespinasse {
59731dbd01fSIzik Eidus 	VM_BUG_ON(stable_node->rmap_hlist_len &&
59831dbd01fSIzik Eidus 		  !is_stable_node_chain(stable_node));
59921fbd591SQi Zheng 	kmem_cache_free(stable_node_cache, stable_node);
60031dbd01fSIzik Eidus }
60131dbd01fSIzik Eidus 
60231dbd01fSIzik Eidus /*
60331dbd01fSIzik Eidus  * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
60431dbd01fSIzik Eidus  * page tables after it has passed through ksm_exit() - which, if necessary,
60531dbd01fSIzik Eidus  * takes mmap_lock briefly to serialize against them.  ksm_exit() does not set
606d8ed45c5SMichel Lespinasse  * a special flag: they can just back out as soon as mm_users goes to zero.
607ef694222SBob Liu  * ksm_test_exit() is used throughout to make this test for exit: in some
608ef694222SBob Liu  * places for correctness, in some places just to avoid unnecessary work.
60931dbd01fSIzik Eidus  */
61031dbd01fSIzik Eidus static inline bool ksm_test_exit(struct mm_struct *mm)
61131dbd01fSIzik Eidus {
612f7091ed6SHaiyue Wang 	return atomic_read(&mm->mm_users) == 0;
61331dbd01fSIzik Eidus }
614f7091ed6SHaiyue Wang 
615f7091ed6SHaiyue Wang static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
616f765f540SKirill A. Shutemov 			struct mm_walk *walk)
61731dbd01fSIzik Eidus {
61831dbd01fSIzik Eidus 	struct page *page = NULL;
61931dbd01fSIzik Eidus 	spinlock_t *ptl;
620f7091ed6SHaiyue Wang 	pte_t *pte;
62131dbd01fSIzik Eidus 	pte_t ptent;
622c8f95ed1SAndrea Arcangeli 	int ret;
623c8f95ed1SAndrea Arcangeli 
62431dbd01fSIzik Eidus 	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
625d8ed45c5SMichel Lespinasse 	if (!pte)
62631dbd01fSIzik Eidus 		return 0;
62731dbd01fSIzik Eidus 	ptent = ptep_get(pte);
62831dbd01fSIzik Eidus 	if (pte_present(ptent)) {
62990bd6fd3SPetr Holasek 		page = vm_normal_page(walk->vma, addr, ptent);
63090bd6fd3SPetr Holasek 	} else if (!pte_none(ptent)) {
63190bd6fd3SPetr Holasek 		swp_entry_t entry = pte_to_swp_entry(ptent);
63290bd6fd3SPetr Holasek 
63390bd6fd3SPetr Holasek 		/*
63490bd6fd3SPetr Holasek 		 * As KSM pages remain KSM pages until freed, no need to wait
63590bd6fd3SPetr Holasek 		 * here for migration to end.
63690bd6fd3SPetr Holasek 		 */
637d8fc16a8SHugh Dickins 		if (is_migration_entry(entry))
63890bd6fd3SPetr Holasek 			page = pfn_swap_entry_to_page(entry);
63990bd6fd3SPetr Holasek 	}
64021fbd591SQi Zheng 	/* return 1 if the page is an normal ksm page or KSM-placed zero page */
6412c653d0eSAndrea Arcangeli 	ret = (page && PageKsm(page)) || is_ksm_zero_pte(ptent);
6422c653d0eSAndrea Arcangeli 	pte_unmap_unlock(pte, ptl);
64321fbd591SQi Zheng 	return ret;
6442c653d0eSAndrea Arcangeli }
6452c653d0eSAndrea Arcangeli 
6462c653d0eSAndrea Arcangeli static const struct mm_walk_ops break_ksm_ops = {
6472c653d0eSAndrea Arcangeli 	.pmd_entry = break_ksm_pmd_entry,
6482c653d0eSAndrea Arcangeli 	.walk_lock = PGWALK_RDLOCK,
6492c653d0eSAndrea Arcangeli };
65098fa15f3SAnshuman Khandual 
6512c653d0eSAndrea Arcangeli static const struct mm_walk_ops break_ksm_lock_vma_ops = {
6522c653d0eSAndrea Arcangeli 	.pmd_entry = break_ksm_pmd_entry,
6532c653d0eSAndrea Arcangeli 	.walk_lock = PGWALK_WRLOCK,
6542c653d0eSAndrea Arcangeli };
6552c653d0eSAndrea Arcangeli 
6562c653d0eSAndrea Arcangeli /*
6572c653d0eSAndrea Arcangeli  * We use break_ksm to break COW on a ksm page by triggering unsharing,
6582c653d0eSAndrea Arcangeli  * such that the ksm page will get replaced by an exclusive anonymous page.
6592c653d0eSAndrea Arcangeli  *
6602c653d0eSAndrea Arcangeli  * We take great care only to touch a ksm page, in a VM_MERGEABLE vma,
6612c653d0eSAndrea Arcangeli  * in case the application has unmapped and remapped mm,addr meanwhile.
6622c653d0eSAndrea Arcangeli  * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
6632c653d0eSAndrea Arcangeli  * mmap of /dev/mem, where we would not want to touch it.
6642c653d0eSAndrea Arcangeli  *
665457aef94SEthon Paul  * FAULT_FLAG_REMOTE/FOLL_REMOTE are because we do this outside the context
6662c653d0eSAndrea Arcangeli  * of the process that owns 'vma'.  We also do not want to enforce
6672c653d0eSAndrea Arcangeli  * protection keys here anyway.
6682c653d0eSAndrea Arcangeli  */
6692c653d0eSAndrea Arcangeli static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma)
6702c653d0eSAndrea Arcangeli {
6712c653d0eSAndrea Arcangeli 	vm_fault_t ret = 0;
6722c653d0eSAndrea Arcangeli 	const struct mm_walk_ops *ops = lock_vma ?
67321fbd591SQi Zheng 				&break_ksm_lock_vma_ops : &break_ksm_ops;
6742c653d0eSAndrea Arcangeli 
6752c653d0eSAndrea Arcangeli 	do {
6762c653d0eSAndrea Arcangeli 		int ksm_page;
6772c653d0eSAndrea Arcangeli 
6782c653d0eSAndrea Arcangeli 		cond_resched();
6792c653d0eSAndrea Arcangeli 		ksm_page = walk_page_range_vma(vma, addr, addr + 1, ops, NULL);
6802c653d0eSAndrea Arcangeli 		if (WARN_ON_ONCE(ksm_page < 0))
68121fbd591SQi Zheng 			return ksm_page;
6824035c07aSHugh Dickins 		if (!ksm_page)
68321fbd591SQi Zheng 			return 0;
6844035c07aSHugh Dickins 		ret = handle_mm_fault(vma, addr,
6852c653d0eSAndrea Arcangeli 				      FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
6862c653d0eSAndrea Arcangeli 				      NULL);
6872c653d0eSAndrea Arcangeli 	} while (!(ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
688b67bfe0dSSasha Levin 	/*
689739100c8SStefan Roesch 	 * We must loop until we no longer find a KSM page because
6904035c07aSHugh Dickins 	 * handle_mm_fault() may back out if there's any difficulty e.g. if
691739100c8SStefan Roesch 	 * pte accessed bit gets updated concurrently.
692739100c8SStefan Roesch 	 *
6934035c07aSHugh Dickins 	 * VM_FAULT_SIGBUS could occur if we race with truncation of the
694739100c8SStefan Roesch 	 * backing file, which also invalidates anonymous pages: that's
69576093853Sxu xin 	 * okay, that truncation will have unmapped the PageKsm for us.
69676093853Sxu xin 	 *
69776093853Sxu xin 	 * VM_FAULT_OOM: at the time of writing (late July 2009), setting
6982c653d0eSAndrea Arcangeli 	 * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
6992c653d0eSAndrea Arcangeli 	 * current task has TIF_MEMDIE set, and will be OOM killed on return
7009e60109fSPeter Zijlstra 	 * to user; and ksmd, having no mm, would never be chosen for that.
7014035c07aSHugh Dickins 	 *
7024035c07aSHugh Dickins 	 * But if the mm is in a limited mem_cgroup, then the fault may fail
7034035c07aSHugh Dickins 	 * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
7044035c07aSHugh Dickins 	 * even ksmd can fail in this way - though it's usually breaking ksm
7052c653d0eSAndrea Arcangeli 	 * just to undo a merge it made a moment before, so unlikely to oom.
7062c653d0eSAndrea Arcangeli 	 *
7072c653d0eSAndrea Arcangeli 	 * That's a pity: we might therefore have more kernel pages allocated
7082c653d0eSAndrea Arcangeli 	 * than we're counting as nodes in the stable tree; but ksm_do_scan
7092c653d0eSAndrea Arcangeli 	 * will retry to break_cow on each pass, so should recover the page
710815f0ddbSNick Desaulniers 	 * in due course.  The important thing is to not let VM_MERGEABLE
7112c653d0eSAndrea Arcangeli 	 * be cleared while any such pages might remain in the area.
7122c653d0eSAndrea Arcangeli 	 */
7132c653d0eSAndrea Arcangeli 	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
7142c653d0eSAndrea Arcangeli }
715739100c8SStefan Roesch 
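/*
 * Editor's note -- illustrative userspace sketch, not part of ksm.c:
 * break_ksm() is what ends up running, page by page, when a range is taken
 * back out of KSM, e.g. via madvise(MADV_UNMERGEABLE) -> unmerge_ksm_pages().
 * Every merged page in the range gets its own copy again, so the call may
 * need as much memory as the merging had saved and can fail with -ENOMEM.
 * addr and len below stand for a range registered earlier.
 */
#if 0	/* userspace illustration only */
	if (madvise(addr, len, MADV_UNMERGEABLE))
		perror("madvise(MADV_UNMERGEABLE)");
#endif
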
7164146d2d6SHugh Dickins static bool vma_ksm_compatible(struct vm_area_struct *vma)
7174146d2d6SHugh Dickins {
7184146d2d6SHugh Dickins 	if (vma->vm_flags & (VM_SHARED  | VM_MAYSHARE   | VM_PFNMAP  |
7192c653d0eSAndrea Arcangeli 			     VM_IO      | VM_DONTEXPAND | VM_HUGETLB |
7204035c07aSHugh Dickins 			     VM_MIXEDMAP))
7214035c07aSHugh Dickins 		return false;		/* just ignore the advice */
7224035c07aSHugh Dickins 
7232cee57d1SYang Shi 	if (vma_is_dax(vma))
7242cee57d1SYang Shi 		return false;
7252cee57d1SYang Shi 
7262cee57d1SYang Shi #ifdef VM_SAO
7272cee57d1SYang Shi 	if (vma->vm_flags & VM_SAO)
7282cee57d1SYang Shi 		return false;
7294035c07aSHugh Dickins #endif
7304035c07aSHugh Dickins #ifdef VM_SPARC_ADI
7314035c07aSHugh Dickins 	if (vma->vm_flags & VM_SPARC_ADI)
7324035c07aSHugh Dickins 		return false;
7334035c07aSHugh Dickins #endif
7344035c07aSHugh Dickins 
735c8d6553bSHugh Dickins 	return true;
7364035c07aSHugh Dickins }
7374035c07aSHugh Dickins 
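/*
 * Editor's sketch (hypothetical helper, not in ksm.c): one way the check
 * above could be used is to walk an mm and count how many VMAs KSM could
 * be asked to merge.  The helper name is made up for illustration and the
 * caller is assumed to hold mmap_lock for reading.
 */
#if 0
static unsigned long count_ksm_compatible_vmas(struct mm_struct *mm)
{
	struct vm_area_struct *vma;
	unsigned long nr = 0;
	VMA_ITERATOR(vmi, mm, 0);

	for_each_vma(vmi, vma)
		if (vma_ksm_compatible(vma))
			nr++;
	return nr;
}
#endif
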
7384035c07aSHugh Dickins static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
7394035c07aSHugh Dickins 		unsigned long addr)
7404035c07aSHugh Dickins {
7414035c07aSHugh Dickins 	struct vm_area_struct *vma;
7424035c07aSHugh Dickins 	if (ksm_test_exit(mm))
7434035c07aSHugh Dickins 		return NULL;
7444035c07aSHugh Dickins 	vma = vma_lookup(mm, addr);
7454035c07aSHugh Dickins 	if (!vma || !(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
7464035c07aSHugh Dickins 		return NULL;
7474035c07aSHugh Dickins 	return vma;
74821fbd591SQi Zheng }
7492cee57d1SYang Shi 
7504035c07aSHugh Dickins static void break_cow(struct ksm_rmap_item *rmap_item)
7514035c07aSHugh Dickins {
7524035c07aSHugh Dickins 	struct mm_struct *mm = rmap_item->mm;
753c8d6553bSHugh Dickins 	unsigned long addr = rmap_item->address;
7544035c07aSHugh Dickins 	struct vm_area_struct *vma;
755bda807d4SMinchan Kim 
756bda807d4SMinchan Kim 	/*
757c8d6553bSHugh Dickins 	 * It is not an accident that whenever we want to break COW
75808df4774SPaul E. McKenney 	 * to undo, we also need to drop a reference to the anon_vma.
759c8d6553bSHugh Dickins 	 */
7604db0c3c2SJason Low 	put_anon_vma(rmap_item->anon_vma);
7614035c07aSHugh Dickins 
762c8d6553bSHugh Dickins 	mmap_read_lock(mm);
763c8d6553bSHugh Dickins 	vma = find_mergeable_vma(mm, addr);
764c8d6553bSHugh Dickins 	if (vma)
765c8d6553bSHugh Dickins 		break_ksm(vma, addr, false);
766c8d6553bSHugh Dickins 	mmap_read_unlock(mm);
7671c4c3b99SJiang Biao }
768c8d6553bSHugh Dickins 
76952d1e606SKirill Tkhai static struct page *get_mergeable_page(struct ksm_rmap_item *rmap_item)
7709800562fSMatthew Wilcox (Oracle) {
77152d1e606SKirill Tkhai 	struct mm_struct *mm = rmap_item->mm;
772c8d6553bSHugh Dickins 	unsigned long addr = rmap_item->address;
773c8d6553bSHugh Dickins 	struct vm_area_struct *vma;
774c8d6553bSHugh Dickins 	struct page *page;
775c8d6553bSHugh Dickins 
776c8d6553bSHugh Dickins 	mmap_read_lock(mm);
777c8d6553bSHugh Dickins 	vma = find_mergeable_vma(mm, addr);
778c8d6553bSHugh Dickins 	if (!vma)
7791c4c3b99SJiang Biao 		goto out;
780c8d6553bSHugh Dickins 
781c8d6553bSHugh Dickins 	page = follow_page(vma, addr, FOLL_GET);
782c8d6553bSHugh Dickins 	if (IS_ERR_OR_NULL(page))
7834035c07aSHugh Dickins 		goto out;
784c8d6553bSHugh Dickins 	if (is_zone_device_page(page))
785c8d6553bSHugh Dickins 		goto out_putpage;
786c8d6553bSHugh Dickins 	if (PageAnon(page)) {
7874db0c3c2SJason Low 		flush_anon_page(vma, page, addr);
7884035c07aSHugh Dickins 		flush_dcache_page(page);
7894035c07aSHugh Dickins 	} else {
7904035c07aSHugh Dickins out_putpage:
791c8d6553bSHugh Dickins 		put_page(page);
7922cee57d1SYang Shi out:
7932cee57d1SYang Shi 		page = NULL;
7942cee57d1SYang Shi 	}
7952cee57d1SYang Shi 	mmap_read_unlock(mm);
7962cee57d1SYang Shi 	return page;
7972cee57d1SYang Shi }
7988aafa6a4SHugh Dickins 
7992cee57d1SYang Shi /*
8002cee57d1SYang Shi  * This helper is used for getting right index into array of tree roots.
8014db0c3c2SJason Low  * When merge_across_nodes knob is set to 1, there are only two rb-trees for
8028aafa6a4SHugh Dickins  * stable and unstable pages from all nodes with roots in index 0. Otherwise,
8038aafa6a4SHugh Dickins  * every node has its own stable and unstable tree.
8048aafa6a4SHugh Dickins  */
8058aafa6a4SHugh Dickins static inline int get_kpfn_nid(unsigned long kpfn)
8068aafa6a4SHugh Dickins {
8074035c07aSHugh Dickins 	return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
808c8d6553bSHugh Dickins }
8094035c07aSHugh Dickins 
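/*
 * Editor's note (illustrative): with merge_across_nodes unset the lookup
 * code picks the per-node trees along the lines of
 *
 *	nid  = get_kpfn_nid(page_to_pfn(page));
 *	root = root_stable_tree + nid;
 *
 * whereas with merge_across_nodes set every pfn maps to index 0, so only
 * the single tree in one_stable_tree[0] / one_unstable_tree[0] is used.
 */
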
810c8d6553bSHugh Dickins static struct ksm_stable_node *alloc_stable_node_chain(struct ksm_stable_node *dup,
811c8d6553bSHugh Dickins 						   struct rb_root *root)
812c8d6553bSHugh Dickins {
81319138349SMatthew Wilcox (Oracle) 	struct ksm_stable_node *chain = alloc_stable_node();
814c8d6553bSHugh Dickins 	VM_BUG_ON(is_stable_node_chain(dup));
815c8d6553bSHugh Dickins 	if (likely(chain)) {
816c8d6553bSHugh Dickins 		INIT_HLIST_HEAD(&chain->hlist);
8174db0c3c2SJason Low 		chain->chain_prune_time = jiffies;
818c8d6553bSHugh Dickins 		chain->rmap_hlist_len = STABLE_NODE_CHAIN;
8194035c07aSHugh Dickins #if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
8204035c07aSHugh Dickins 		chain->nid = NUMA_NO_NODE; /* debug */
8214035c07aSHugh Dickins #endif
8224035c07aSHugh Dickins 		ksm_stable_node_chains++;
82331dbd01fSIzik Eidus 
82431dbd01fSIzik Eidus 		/*
82531dbd01fSIzik Eidus 		 * Put the stable node chain in the first dimension of
82631dbd01fSIzik Eidus 		 * the stable tree and at the same time remove the old
82721fbd591SQi Zheng 		 * stable node.
82831dbd01fSIzik Eidus 		 */
8297b6ba2c7SHugh Dickins 		rb_replace_node(&dup->node, &chain->node, root);
83021fbd591SQi Zheng 
8315ad64688SHugh Dickins 		/*
83231dbd01fSIzik Eidus 		 * Move the old stable node to the second dimension
8337b6ba2c7SHugh Dickins 		 * queued in the hlist_dup. The invariant is that all
83462862290SHugh Dickins 		 * dup stable_nodes in the chain->hlist point to pages
8354035c07aSHugh Dickins 		 * that are write protected and have the exact same
8364035c07aSHugh Dickins 		 * content.
8375ad64688SHugh Dickins 		 */
8387b6ba2c7SHugh Dickins 		stable_node_chain_add_dup(dup, chain);
83962862290SHugh Dickins 	}
8405ad64688SHugh Dickins 	return chain;
84108beca44SHugh Dickins }
84298666f8aSAndrea Arcangeli 
8434035c07aSHugh Dickins static inline void free_stable_node_chain(struct ksm_stable_node *chain,
8444035c07aSHugh Dickins 					  struct rb_root *root)
845b4028260SHugh Dickins {
84676093853Sxu xin 	rb_erase(&chain->node, root);
84776093853Sxu xin 	free_stable_node(chain);
84876093853Sxu xin 	ksm_stable_node_chains--;
8492c653d0eSAndrea Arcangeli }
8502c653d0eSAndrea Arcangeli 
85131dbd01fSIzik Eidus static void remove_node_from_stable_tree(struct ksm_stable_node *stable_node)
8529e60109fSPeter Zijlstra {
853c89a384eSMiaohe Lin 	struct ksm_rmap_item *rmap_item;
85493d17715SHugh Dickins 
85531dbd01fSIzik Eidus 	/* check it's not STABLE_NODE_CHAIN or negative */
8567b6ba2c7SHugh Dickins 	BUG_ON(stable_node->rmap_hlist_len < 0);
85731dbd01fSIzik Eidus 
85831dbd01fSIzik Eidus 	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
8599ba69294SHugh Dickins 		if (rmap_item->hlist.next) {
86031dbd01fSIzik Eidus 			ksm_pages_sharing--;
8619ba69294SHugh Dickins 			trace_ksm_remove_rmap_item(stable_node->kpfn, rmap_item, rmap_item->mm);
8629ba69294SHugh Dickins 		} else {
8639ba69294SHugh Dickins 			ksm_pages_shared--;
86431dbd01fSIzik Eidus 		}
86531dbd01fSIzik Eidus 
866cd551f97SHugh Dickins 		rmap_item->mm->ksm_merging_pages--;
86731dbd01fSIzik Eidus 
86890bd6fd3SPetr Holasek 		VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
869ef53d16cSHugh Dickins 		stable_node->rmap_hlist_len--;
87093d17715SHugh Dickins 		put_anon_vma(rmap_item->anon_vma);
87131dbd01fSIzik Eidus 		rmap_item->address &= PAGE_MASK;
87293d17715SHugh Dickins 		cond_resched();
8734035c07aSHugh Dickins 	}
87431dbd01fSIzik Eidus 
87531dbd01fSIzik Eidus 	/*
87631dbd01fSIzik Eidus 	 * We need the second aligned pointer of the migrate_nodes
87721fbd591SQi Zheng 	 * list_head to stay clear from the rb_parent_color union
87831dbd01fSIzik Eidus 	 * (aligned and different than any node) and also different
8796514d511SHugh Dickins 	 * from &migrate_nodes. This will verify that future list.h changes
88021fbd591SQi Zheng 	 * don't break STABLE_NODE_DUP_HEAD. Only recent gcc can handle it.
8816514d511SHugh Dickins 	 */
88231dbd01fSIzik Eidus 	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
88331dbd01fSIzik Eidus 	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);
88431dbd01fSIzik Eidus 
88531dbd01fSIzik Eidus 	trace_ksm_remove_ksm_page(stable_node->kpfn);
88631dbd01fSIzik Eidus 	if (stable_node->head == &migrate_nodes)
88731dbd01fSIzik Eidus 		list_del(&stable_node->list);
888e850dcf5SHugh Dickins 	else
88931dbd01fSIzik Eidus 		stable_node_dup_del(stable_node);
89031dbd01fSIzik Eidus 	free_stable_node(stable_node);
891c1e8d7c6SMichel Lespinasse }
89231dbd01fSIzik Eidus 
89331dbd01fSIzik Eidus enum get_ksm_page_flags {
89481464e30SHugh Dickins 	GET_KSM_PAGE_NOLOCK,
89581464e30SHugh Dickins 	GET_KSM_PAGE_LOCK,
89681464e30SHugh Dickins 	GET_KSM_PAGE_TRYLOCK
89781464e30SHugh Dickins };
89881464e30SHugh Dickins 
89931dbd01fSIzik Eidus /*
900d952b791SHugh Dickins  * get_ksm_page: checks if the page indicated by the stable node
90149b06385SSuren Baghdasaryan  * is still its ksm page, despite having held no reference to it.
90231dbd01fSIzik Eidus  * In which case we can trust the content of the page, and it
90331dbd01fSIzik Eidus  * returns the gotten page; but if the page has now been zapped,
904d952b791SHugh Dickins  * remove the stale node from the stable tree and return NULL.
90531dbd01fSIzik Eidus  * But beware, the stable node's page might be being migrated.
906d952b791SHugh Dickins  *
9079ba69294SHugh Dickins  * You would expect the stable_node to hold a reference to the ksm page.
9089ba69294SHugh Dickins  * But if it increments the page's count, swapping out has to wait for
909d952b791SHugh Dickins  * ksmd to come around again before it can free the page, which may take
910d952b791SHugh Dickins  * seconds or even minutes: much too unresponsive.  So instead we use a
911d952b791SHugh Dickins  * "keyhole reference": access to the ksm page from the stable node peeps
91249b06385SSuren Baghdasaryan  * out through its keyhole to see if that page still holds the right key,
913d952b791SHugh Dickins  * pointing back to this stable node.  This relies on freeing a PageAnon
914d952b791SHugh Dickins  * page to reset its page->mapping to NULL, and relies on no other use of
91531dbd01fSIzik Eidus  * a page to put something that might look like our key in page->mapping.
91631dbd01fSIzik Eidus  * is on its way to being freed; but it is an anomaly to bear in mind.
91721fbd591SQi Zheng  */
91819138349SMatthew Wilcox (Oracle) static struct page *get_ksm_page(struct ksm_stable_node *stable_node,
91919138349SMatthew Wilcox (Oracle) 				 enum get_ksm_page_flags flags)
92019138349SMatthew Wilcox (Oracle) {
92119138349SMatthew Wilcox (Oracle) 	struct page *page;
92221fbd591SQi Zheng 	void *expected_mapping;
92388484826SMike Rapoport 	unsigned long kpfn;
92419138349SMatthew Wilcox (Oracle) 
92588484826SMike Rapoport 	expected_mapping = (void *)((unsigned long)stable_node |
92688484826SMike Rapoport 					PAGE_MAPPING_KSM);
92788484826SMike Rapoport again:
92821fbd591SQi Zheng 	kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */
92988484826SMike Rapoport 	page = pfn_to_page(kpfn);
9306c287605SDavid Hildenbrand 	if (READ_ONCE(page->mapping) != expected_mapping)
93188484826SMike Rapoport 		goto stale;
93288484826SMike Rapoport 
93388484826SMike Rapoport 	/*
9342ffd8679SHugh Dickins 	 * We cannot do anything with the page while its refcount is 0.
9352ffd8679SHugh Dickins 	 * Usually 0 means free, or tail of a higher-order page: in which
9362ffd8679SHugh Dickins 	 * case this node is no longer referenced, and should be freed;
9372ffd8679SHugh Dickins 	 * however, it might mean that the page is under page_ref_freeze().
93821fbd591SQi Zheng 	 * The __remove_mapping() case is easy, again the node is now stale;
939cbf86cfeSHugh Dickins 	 * the same is in reuse_ksm_page() case; but if page is swapcache
940cbf86cfeSHugh Dickins 	 * in folio_migrate_mapping(), it might still be our page,
941cbf86cfeSHugh Dickins 	 * in which case it's essential to keep the node.
942cbf86cfeSHugh Dickins 	 */
9432cee57d1SYang Shi 	while (!get_page_unless_zero(page)) {
944cbf86cfeSHugh Dickins 		/*
945cbf86cfeSHugh Dickins 		 * Another check for page->mapping != expected_mapping would
946cbf86cfeSHugh Dickins 		 * work here too.  We have chosen the !PageSwapCache test to
947cbf86cfeSHugh Dickins 		 * optimize the common case, when the page is or is about to
948cbf86cfeSHugh Dickins 		 * be freed: PageSwapCache is cleared (under spin_lock_irq)
949cbf86cfeSHugh Dickins 		 * in the ref_freeze section of __remove_mapping(); but Anon
950cbf86cfeSHugh Dickins 		 * page->mapping reset to NULL later, in free_pages_prepare().
951cbf86cfeSHugh Dickins 		 */
9529a63236fSAndrey Ryabinin 		if (!PageSwapCache(page))
9539a63236fSAndrey Ryabinin 			goto stale;
9549a63236fSAndrey Ryabinin 		cpu_relax();
9558fdb3dbfSHugh Dickins 	}
9568fdb3dbfSHugh Dickins 
9579a63236fSAndrey Ryabinin 	if (READ_ONCE(page->mapping) != expected_mapping) {
9588fdb3dbfSHugh Dickins 		put_page(page);
9598fdb3dbfSHugh Dickins 		goto stale;
9608fdb3dbfSHugh Dickins 	}
9618fdb3dbfSHugh Dickins 
9621fec6890SMatthew Wilcox (Oracle) 	if (flags == GET_KSM_PAGE_TRYLOCK) {
963cbf86cfeSHugh Dickins 		if (!trylock_page(page)) {
964cbf86cfeSHugh Dickins 			put_page(page);
965cbf86cfeSHugh Dickins 			return ERR_PTR(-EBUSY);
966cbf86cfeSHugh Dickins 		}
967cbf86cfeSHugh Dickins 	} else if (flags == GET_KSM_PAGE_LOCK)
968cbf86cfeSHugh Dickins 		lock_page(page);
969cbf86cfeSHugh Dickins 
970cbf86cfeSHugh Dickins 	if (flags != GET_KSM_PAGE_NOLOCK) {
971cbf86cfeSHugh Dickins 		if (READ_ONCE(page->mapping) != expected_mapping) {
972cbf86cfeSHugh Dickins 			unlock_page(page);
973cbf86cfeSHugh Dickins 			put_page(page);
974cbf86cfeSHugh Dickins 			goto stale;
975cbf86cfeSHugh Dickins 		}
97621fbd591SQi Zheng 	}
9772c653d0eSAndrea Arcangeli 	return page;
9782c653d0eSAndrea Arcangeli 
97921fbd591SQi Zheng stale:
9802c653d0eSAndrea Arcangeli 	/*
9812c653d0eSAndrea Arcangeli 	 * We come here from above when page->mapping or !PageSwapCache
9822c653d0eSAndrea Arcangeli 	 * suggests that the node is stale; but it might be under migration.
9832c653d0eSAndrea Arcangeli 	 * We need smp_rmb(), matching the smp_wmb() in folio_migrate_ksm(),
9842c653d0eSAndrea Arcangeli 	 * before checking whether node->kpfn has been changed.
9852c653d0eSAndrea Arcangeli 	 */
9862c653d0eSAndrea Arcangeli 	smp_rmb();
9872c653d0eSAndrea Arcangeli 	if (READ_ONCE(stable_node->kpfn) != kpfn)
9882c653d0eSAndrea Arcangeli 		goto again;
9892c653d0eSAndrea Arcangeli 	remove_node_from_stable_tree(stable_node);
9902c653d0eSAndrea Arcangeli 	return NULL;
9912c653d0eSAndrea Arcangeli }
9922c653d0eSAndrea Arcangeli 
9932c653d0eSAndrea Arcangeli /*
9942c653d0eSAndrea Arcangeli  * Removing rmap_item from stable or unstable tree.
9952c653d0eSAndrea Arcangeli  * This function will clean the information from the stable/unstable tree.
9962c653d0eSAndrea Arcangeli  */
9972c653d0eSAndrea Arcangeli static void remove_rmap_item_from_tree(struct ksm_rmap_item *rmap_item)
9982c653d0eSAndrea Arcangeli {
9992c653d0eSAndrea Arcangeli 	if (rmap_item->address & STABLE_FLAG) {
10002c653d0eSAndrea Arcangeli 		struct ksm_stable_node *stable_node;
1001cbf86cfeSHugh Dickins 		struct page *page;
1002cbf86cfeSHugh Dickins 
100321fbd591SQi Zheng 		stable_node = rmap_item->head;
1004cbf86cfeSHugh Dickins 		page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
1005cbf86cfeSHugh Dickins 		if (!page)
1006cbf86cfeSHugh Dickins 			goto out;
1007ef53d16cSHugh Dickins 
1008cbf86cfeSHugh Dickins 		hlist_del(&rmap_item->hlist);
1009cbf86cfeSHugh Dickins 		unlock_page(page);
101021fbd591SQi Zheng 		put_page(page);
10112c653d0eSAndrea Arcangeli 
10122c653d0eSAndrea Arcangeli 		if (!hlist_empty(&stable_node->hlist))
1013cbf86cfeSHugh Dickins 			ksm_pages_sharing--;
1014cbf86cfeSHugh Dickins 		else
1015cbf86cfeSHugh Dickins 			ksm_pages_shared--;
1016cbf86cfeSHugh Dickins 
1017cbf86cfeSHugh Dickins 		rmap_item->mm->ksm_merging_pages--;
1018cbf86cfeSHugh Dickins 
101903640418SGeliang Tang 		VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
10204146d2d6SHugh Dickins 		stable_node->rmap_hlist_len--;
10214146d2d6SHugh Dickins 
10224146d2d6SHugh Dickins 		put_anon_vma(rmap_item->anon_vma);
10234146d2d6SHugh Dickins 		rmap_item->head = NULL;
1024cbf86cfeSHugh Dickins 		rmap_item->address &= PAGE_MASK;
1025cbf86cfeSHugh Dickins 
1026cbf86cfeSHugh Dickins 	} else if (rmap_item->address & UNSTABLE_FLAG) {
1027d952b791SHugh Dickins 		unsigned char age;
102831dbd01fSIzik Eidus 		/*
102921fbd591SQi Zheng 		 * Usually ksmd can and must skip the rb_erase, because
103058730ab6SQi Zheng 		 * root_unstable_tree was already reset to RB_ROOT.
103131dbd01fSIzik Eidus 		 * But be careful when an mm is exiting: do the rb_erase
103231dbd01fSIzik Eidus 		 * if this rmap_item was inserted by this scan, rather
1033d952b791SHugh Dickins 		 * than left over from before.
103431dbd01fSIzik Eidus 		 */
1035d952b791SHugh Dickins 		age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
103658730ab6SQi Zheng 		BUG_ON(age > 1);
103758730ab6SQi Zheng 		if (!age)
103858730ab6SQi Zheng 			rb_erase(&rmap_item->node,
1039d952b791SHugh Dickins 				 root_unstable_tree + NUMA(rmap_item->nid));
1040d952b791SHugh Dickins 		ksm_pages_unshared--;
1041a5f18ba0SMatthew Wilcox (Oracle) 		rmap_item->address &= PAGE_MASK;
1042a5f18ba0SMatthew Wilcox (Oracle) 	}
104358730ab6SQi Zheng out:
1044a5f18ba0SMatthew Wilcox (Oracle) 	cond_resched();		/* we're called from many long loops */
104558730ab6SQi Zheng }
1046d8ed45c5SMichel Lespinasse 
10476db504ceSLiam R. Howlett static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list)
10486db504ceSLiam R. Howlett {
10496db504ceSLiam R. Howlett 	while (*rmap_list) {
10506db504ceSLiam R. Howlett 		struct ksm_rmap_item *rmap_item = *rmap_list;
10516db504ceSLiam R. Howlett 		*rmap_list = rmap_item->rmap_list;
10529ba69294SHugh Dickins 		remove_rmap_item_from_tree(rmap_item);
10536db504ceSLiam R. Howlett 		free_rmap_item(rmap_item);
10546db504ceSLiam R. Howlett 	}
10556db504ceSLiam R. Howlett }
105631dbd01fSIzik Eidus 
105731dbd01fSIzik Eidus /*
1058d952b791SHugh Dickins  * Though it's very tempting to unmerge rmap_items from stable tree rather
105949b06385SSuren Baghdasaryan  * than check every pte of a given vma, the locking doesn't quite work for
10609ba69294SHugh Dickins  * that - an rmap_item is assigned to the stable tree after inserting ksm
10619ba69294SHugh Dickins  * page and upping mmap_lock.  Nor does it fit with the way we skip dup'ing
1062d952b791SHugh Dickins  * rmap_items from parent to child at fork time (so as not to waste time
10639ba69294SHugh Dickins  * if exit comes before the next scan reaches it).
10646db504ceSLiam R. Howlett  *
1065420be4edSChengyang Fan  * Similarly, although we'd like to remove rmap_items (so updating counts
1066d8ed45c5SMichel Lespinasse  * and freeing memory) when unmerging an area, it's easier to leave that
106731dbd01fSIzik Eidus  * to the next pass of ksmd - consider, for example, how ksmd might be
106831dbd01fSIzik Eidus  * in cmp_and_merge_page on one of the rmap_items we would be removing.
106958730ab6SQi Zheng  */
107058730ab6SQi Zheng static int unmerge_ksm_pages(struct vm_area_struct *vma,
107158730ab6SQi Zheng 			     unsigned long start, unsigned long end, bool lock_vma)
10729ba69294SHugh Dickins {
107358730ab6SQi Zheng 	unsigned long addr;
107458730ab6SQi Zheng 	int err = 0;
107531dbd01fSIzik Eidus 
10769ba69294SHugh Dickins 	for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
107758730ab6SQi Zheng 		if (ksm_test_exit(vma->vm_mm))
10789ba69294SHugh Dickins 			break;
1079d7597f59SStefan Roesch 		if (signal_pending(current))
10809ba69294SHugh Dickins 			err = -ERESTARTSYS;
10817496fea9SZhou Chengming 		else
10829ba69294SHugh Dickins 			err = break_ksm(vma, addr, lock_vma);
108331dbd01fSIzik Eidus 	}
108431dbd01fSIzik Eidus 	return err;
1085cbf86cfeSHugh Dickins }
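/*
 * The loop above issues one break_ksm() per page in the [start, end)
 * range, bailing out early if the mm is already exiting or a signal is
 * pending; callers such as unmerge_and_remove_all_rmap_items() below
 * pass a whole vma at a time as [vma->vm_start, vma->vm_end).
 */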
1086cbf86cfeSHugh Dickins 
1087d952b791SHugh Dickins static inline struct ksm_stable_node *folio_stable_node(struct folio *folio)
10889ba69294SHugh Dickins {
10899ba69294SHugh Dickins 	return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL;
10909ba69294SHugh Dickins }
1091d8ed45c5SMichel Lespinasse 
1092d952b791SHugh Dickins static inline struct ksm_stable_node *page_stable_node(struct page *page)
1093d952b791SHugh Dickins {
1094d952b791SHugh Dickins 	return folio_stable_node(page_folio(page));
1095d952b791SHugh Dickins }
1096d952b791SHugh Dickins 
10972ffd8679SHugh Dickins static inline void set_page_stable_node(struct page *page,
1098d952b791SHugh Dickins 					struct ksm_stable_node *stable_node)
109931dbd01fSIzik Eidus {
110031dbd01fSIzik Eidus 	VM_BUG_ON_PAGE(PageAnon(page) && PageAnonExclusive(page), page);
110131dbd01fSIzik Eidus 	page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
11029b04c5feSCong Wang }
110359e1a2f4STimofey Titovets 
11049b04c5feSCong Wang #ifdef CONFIG_SYSFS
110531dbd01fSIzik Eidus /*
110631dbd01fSIzik Eidus  * Only called through the sysfs control interface:
110731dbd01fSIzik Eidus  */
110831dbd01fSIzik Eidus static int remove_stable_node(struct ksm_stable_node *stable_node)
110931dbd01fSIzik Eidus {
111031dbd01fSIzik Eidus 	struct page *page;
111131dbd01fSIzik Eidus 	int err;
1112eed05e54SMatthew Wilcox (Oracle) 
111331dbd01fSIzik Eidus 	page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
111431dbd01fSIzik Eidus 	if (!page) {
1115ac46d4f3SJérôme Glisse 		/*
11166c287605SDavid Hildenbrand 		 * get_ksm_page did remove_node_from_stable_tree itself.
1117c33c7948SRyan Roberts 		 */
111831dbd01fSIzik Eidus 		return 0;
111936eaff33SKirill A. Shutemov 	}
112036eaff33SKirill A. Shutemov 
112131dbd01fSIzik Eidus 	/*
112231dbd01fSIzik Eidus 	 * The page could still be mapped if this races with __mmput() running in
112329ad768cSAndrea Arcangeli 	 * between ksm_exit() and exit_mmap(). Just refuse to let
11246bdb913fSHaggai Eran 	 * merge_across_nodes/max_page_sharing be switched.
11257d4a8be0SAlistair Popple 	 */
1126ac46d4f3SJérôme Glisse 	err = -EBUSY;
1127ac46d4f3SJérôme Glisse 	if (!page_mapped(page)) {
11286bdb913fSHaggai Eran 		/*
112936eaff33SKirill A. Shutemov 		 * The stable node did not yet appear stale to get_ksm_page(),
11306bdb913fSHaggai Eran 		 * since that allows for an unmapped ksm page to be recognized
113136eaff33SKirill A. Shutemov 		 * right up until it is freed; but the node is safe to remove.
113236eaff33SKirill A. Shutemov 		 * This page might be in an LRU cache waiting to be freed,
113331dbd01fSIzik Eidus 		 * or it might be PageSwapCache (perhaps under writeback),
11346c287605SDavid Hildenbrand 		 * or it might have been removed from swapcache a moment ago.
1135c33c7948SRyan Roberts 		 */
1136c33c7948SRyan Roberts 		set_page_stable_node(page, NULL);
11376c287605SDavid Hildenbrand 		remove_node_from_stable_tree(stable_node);
113831dbd01fSIzik Eidus 		err = 0;
113936eaff33SKirill A. Shutemov 	}
114031dbd01fSIzik Eidus 
114125985edcSLucas De Marchi 	unlock_page(page);
114231dbd01fSIzik Eidus 	put_page(page);
1143f0953a1bSIngo Molnar 	return err;
114431dbd01fSIzik Eidus }
114531dbd01fSIzik Eidus 
114631dbd01fSIzik Eidus static int remove_stable_node_chain(struct ksm_stable_node *stable_node,
114731dbd01fSIzik Eidus 				    struct rb_root *root)
11480f10851eSJérôme Glisse {
11490f10851eSJérôme Glisse 	struct ksm_stable_node *dup;
11500f10851eSJérôme Glisse 	struct hlist_node *hlist_safe;
11510f10851eSJérôme Glisse 
1152ee65728eSMike Rapoport 	if (!is_stable_node_chain(stable_node)) {
115331dbd01fSIzik Eidus 		VM_BUG_ON(is_stable_node_dup(stable_node));
11540f10851eSJérôme Glisse 		if (remove_stable_node(stable_node))
115531dbd01fSIzik Eidus 			return true;
115631dbd01fSIzik Eidus 		else
115731dbd01fSIzik Eidus 			return false;
115831dbd01fSIzik Eidus 	}
115931e855eaSHugh Dickins 
116036eaff33SKirill A. Shutemov 	hlist_for_each_entry_safe(dup, hlist_safe,
116131dbd01fSIzik Eidus 				  &stable_node->hlist, hlist_dup) {
116231dbd01fSIzik Eidus 		VM_BUG_ON(!is_stable_node_dup(dup));
11636c287605SDavid Hildenbrand 		if (remove_stable_node(dup))
1164088b8aa5SDavid Hildenbrand 			return true;
11656c287605SDavid Hildenbrand 	}
11666c287605SDavid Hildenbrand 	BUG_ON(!hlist_empty(&stable_node->hlist));
11676c287605SDavid Hildenbrand 	free_stable_node_chain(stable_node, root);
11686c287605SDavid Hildenbrand 	return false;
11696c287605SDavid Hildenbrand }
11704e31635cSHugh Dickins 
11714e31635cSHugh Dickins static int remove_all_stable_nodes(void)
11726a56ccbcSDavid Hildenbrand {
1173595cd8f2SAneesh Kumar K.V 	struct ksm_stable_node *stable_node, *next;
11746a56ccbcSDavid Hildenbrand 	int nid;
11756a56ccbcSDavid Hildenbrand 	int err = 0;
11766a56ccbcSDavid Hildenbrand 
117736eaff33SKirill A. Shutemov 	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
117831dbd01fSIzik Eidus 		while (root_stable_tree[nid].rb_node) {
1179c33c7948SRyan Roberts 			stable_node = rb_entry(root_stable_tree[nid].rb_node,
118031dbd01fSIzik Eidus 						struct ksm_stable_node, node);
118131dbd01fSIzik Eidus 			if (remove_stable_node_chain(stable_node,
118231dbd01fSIzik Eidus 						     root_stable_tree + nid)) {
118336eaff33SKirill A. Shutemov 				err = -EBUSY;
11846bdb913fSHaggai Eran 				break;	/* proceed to next nid */
1185ac46d4f3SJérôme Glisse 			}
118631dbd01fSIzik Eidus 			cond_resched();
118731dbd01fSIzik Eidus 		}
118831dbd01fSIzik Eidus 	}
118931dbd01fSIzik Eidus 	list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
119031dbd01fSIzik Eidus 		if (remove_stable_node(stable_node))
119131dbd01fSIzik Eidus 			err = -EBUSY;
11928dd3557aSHugh Dickins 		cond_resched();
11938dd3557aSHugh Dickins 	}
11948dd3557aSHugh Dickins 	return err;
119531dbd01fSIzik Eidus }
119631dbd01fSIzik Eidus 
119731dbd01fSIzik Eidus static int unmerge_and_remove_all_rmap_items(void)
119831dbd01fSIzik Eidus {
11998dd3557aSHugh Dickins 	struct ksm_mm_slot *mm_slot;
12008dd3557aSHugh Dickins 	struct mm_slot *slot;
120131dbd01fSIzik Eidus 	struct mm_struct *mm;
120231dbd01fSIzik Eidus 	struct vm_area_struct *vma;
1203b4e6f66eSMatthew Wilcox (Oracle) 	int err = 0;
120431dbd01fSIzik Eidus 
120550722804SZach O'Keefe 	spin_lock(&ksm_mmlist_lock);
120631dbd01fSIzik Eidus 	slot = list_entry(ksm_mm_head.slot.mm_node.next,
1207e86c59b1SClaudio Imbrenda 			  struct mm_slot, mm_node);
120831dbd01fSIzik Eidus 	ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
120931dbd01fSIzik Eidus 	spin_unlock(&ksm_mmlist_lock);
121031dbd01fSIzik Eidus 
1211ac46d4f3SJérôme Glisse 	for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head;
121231dbd01fSIzik Eidus 	     mm_slot = ksm_scan.mm_slot) {
12138dd3557aSHugh Dickins 		VMA_ITERATOR(vmi, mm_slot->slot.mm, 0);
121431dbd01fSIzik Eidus 
121531dbd01fSIzik Eidus 		mm = mm_slot->slot.mm;
121631dbd01fSIzik Eidus 		mmap_read_lock(mm);
12176219049aSBob Liu 
12186219049aSBob Liu 		/*
121931dbd01fSIzik Eidus 		 * Exit right away if the mm is exiting, to avoid a lockdep issue in
122050722804SZach O'Keefe 		 * the maple tree
122150722804SZach O'Keefe 		 */
122250722804SZach O'Keefe 		if (ksm_test_exit(mm))
122350722804SZach O'Keefe 			goto mm_exiting;
122450722804SZach O'Keefe 
122526e1a0c3SHugh Dickins 		for_each_vma(vmi, vma) {
122650722804SZach O'Keefe 			if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
122750722804SZach O'Keefe 				continue;
122831dbd01fSIzik Eidus 			err = unmerge_ksm_pages(vma,
12297d4a8be0SAlistair Popple 						vma->vm_start, vma->vm_end, false);
12306f4f13e8SJérôme Glisse 			if (err)
1231ac46d4f3SJérôme Glisse 				goto error;
12326bdb913fSHaggai Eran 		}
123331dbd01fSIzik Eidus 
123404dee9e8SHugh Dickins mm_exiting:
123504dee9e8SHugh Dickins 		remove_trailing_rmap_items(&mm_slot->rmap_list);
1236c33c7948SRyan Roberts 		mmap_read_unlock(mm);
123731dbd01fSIzik Eidus 
12386bdb913fSHaggai Eran 		spin_lock(&ksm_mmlist_lock);
123931dbd01fSIzik Eidus 		slot = list_entry(mm_slot->slot.mm_node.next,
12406c287605SDavid Hildenbrand 				  struct mm_slot, mm_node);
12416c287605SDavid Hildenbrand 		ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
124231dbd01fSIzik Eidus 		if (ksm_test_exit(mm)) {
1243e86c59b1SClaudio Imbrenda 			hash_del(&mm_slot->slot.hash);
1244e86c59b1SClaudio Imbrenda 			list_del(&mm_slot->slot.mm_node);
1245457aef94SEthon Paul 			spin_unlock(&ksm_mmlist_lock);
1246e86c59b1SClaudio Imbrenda 
1247e86c59b1SClaudio Imbrenda 			mm_slot_free(mm_slot_cache, mm_slot);
12488dd3557aSHugh Dickins 			clear_bit(MMF_VM_MERGEABLE, &mm->flags);
1249f1e2db12SDavid Hildenbrand 			clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
1250e86c59b1SClaudio Imbrenda 			mmdrop(mm);
1251e86c59b1SClaudio Imbrenda 		} else
125279271476Sxu xin 			spin_unlock(&ksm_mmlist_lock);
125379271476Sxu xin 	}
125479271476Sxu xin 
125579271476Sxu xin 	/* Clean up stable nodes, but don't worry if some are still busy */
125679271476Sxu xin 	remove_all_stable_nodes();
125779271476Sxu xin 	ksm_scan.seqnr = 0;
1258e2942062Sxu xin 	return 0;
12596080d19fSxu xin 
1260a38c015fSClaudio Imbrenda error:
1261a38c015fSClaudio Imbrenda 	mmap_read_unlock(mm);
1262a38c015fSClaudio Imbrenda 	spin_lock(&ksm_mmlist_lock);
1263a38c015fSClaudio Imbrenda 	ksm_scan.mm_slot = &ksm_mm_head;
1264a38c015fSClaudio Imbrenda 	spin_unlock(&ksm_mmlist_lock);
1265a38c015fSClaudio Imbrenda 	return err;
1266a38c015fSClaudio Imbrenda }
1267e86c59b1SClaudio Imbrenda #endif /* CONFIG_SYSFS */
126831dbd01fSIzik Eidus 
1269c33c7948SRyan Roberts static u32 calc_checksum(struct page *page)
12700f10851eSJérôme Glisse {
12710f10851eSJérôme Glisse 	u32 checksum;
12720f10851eSJérôme Glisse 	void *addr = kmap_local_page(page);
12730f10851eSJérôme Glisse 	checksum = xxhash(addr, PAGE_SIZE, 0);
1274ee65728eSMike Rapoport 	kunmap_local(addr);
12750f10851eSJérôme Glisse 	return checksum;
12760f10851eSJérôme Glisse }
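/*
 * calc_checksum() is only a cheap fingerprint of the page contents:
 * ksmd uses it to notice pages whose contents are still changing from
 * one scan to the next, which are not worth trying to merge yet.
 */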
1277e86c59b1SClaudio Imbrenda 
127831dbd01fSIzik Eidus static int write_protect_page(struct vm_area_struct *vma, struct page *page,
1279b4e6f66eSMatthew Wilcox (Oracle) 			      pte_t *orig_pte)
1280cea86fe2SHugh Dickins {
1281b4e6f66eSMatthew Wilcox (Oracle) 	struct mm_struct *mm = vma->vm_mm;
1282b4e6f66eSMatthew Wilcox (Oracle) 	DEFINE_PAGE_VMA_WALK(pvmw, page, vma, 0, 0);
1283b4e6f66eSMatthew Wilcox (Oracle) 	int swapped;
128431dbd01fSIzik Eidus 	int err = -EFAULT;
128531dbd01fSIzik Eidus 	struct mmu_notifier_range range;
128631dbd01fSIzik Eidus 	bool anon_exclusive;
12876bdb913fSHaggai Eran 	pte_t entry;
1288ac46d4f3SJérôme Glisse 
128931dbd01fSIzik Eidus 	pvmw.address = page_address_in_vma(page, vma);
129031dbd01fSIzik Eidus 	if (pvmw.address == -EFAULT)
129131dbd01fSIzik Eidus 		goto out;
129231dbd01fSIzik Eidus 
129331dbd01fSIzik Eidus 	BUG_ON(PageTransCompound(page));
129431dbd01fSIzik Eidus 
12958dd3557aSHugh Dickins 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, pvmw.address,
12968dd3557aSHugh Dickins 				pvmw.address + PAGE_SIZE);
129780e14822SHugh Dickins 	mmu_notifier_invalidate_range_start(&range);
129880e14822SHugh Dickins 
129931dbd01fSIzik Eidus 	if (!page_vma_mapped_walk(&pvmw))
130031dbd01fSIzik Eidus 		goto out_mn;
130131dbd01fSIzik Eidus 	if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
130231dbd01fSIzik Eidus 		goto out_unlock;
13038dd3557aSHugh Dickins 
130431dbd01fSIzik Eidus 	anon_exclusive = PageAnonExclusive(page);
130531dbd01fSIzik Eidus 	entry = ptep_get(pvmw.pte);
130631dbd01fSIzik Eidus 	if (pte_write(entry) || pte_dirty(entry) ||
130731dbd01fSIzik Eidus 	    anon_exclusive || mm_tlb_flush_pending(mm)) {
1308db114b83SHugh Dickins 		swapped = PageSwapCache(page);
1309db114b83SHugh Dickins 		flush_cache_page(vma, pvmw.address, page_to_pfn(page));
1310db114b83SHugh Dickins 		/*
13118dd3557aSHugh Dickins 		 * Ok, this is tricky: when get_user_pages_fast() runs it doesn't
131231dbd01fSIzik Eidus 		 * take any lock, therefore the check that we are going to make
131331dbd01fSIzik Eidus 		 * with the pagecount against the mapcount is racy and
131431dbd01fSIzik Eidus 		 * O_DIRECT can happen right after the check.
131531dbd01fSIzik Eidus 		 * So we clear the pte and flush the tlb before the check;
131631dbd01fSIzik Eidus 		 * this assures us that no O_DIRECT can happen after the check
131731dbd01fSIzik Eidus 		 * or in the middle of the check.
131831dbd01fSIzik Eidus 		 *
131931dbd01fSIzik Eidus 		 * No need to notify as we are downgrading page table to read
132031dbd01fSIzik Eidus 		 * only not changing it to point to a new page.
13218dd3557aSHugh Dickins 		 *
132231e855eaSHugh Dickins 		 * See Documentation/mm/mmu_notifier.rst
1323f765f540SKirill A. Shutemov 		 */
1324f765f540SKirill A. Shutemov 		entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
1325a7306c34SAndrea Arcangeli 		/*
1326f765f540SKirill A. Shutemov 		 * Check that no O_DIRECT or similar I/O is in progress on the
1327f765f540SKirill A. Shutemov 		 * page
1328f765f540SKirill A. Shutemov 		 */
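		/*
		 * Expected references at this point: one per mapping
		 * (page_mapcount), one extra reference held by our caller's
		 * scan, and one for the swap cache if the page is in it.
		 * Any reference beyond that (e.g. a concurrent GUP / O_DIRECT
		 * user) makes the page unsafe to share, so restore the pte
		 * and bail out.
		 */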
132931dbd01fSIzik Eidus 		if (page_mapcount(page) + 1 + swapped != page_count(page)) {
133031dbd01fSIzik Eidus 			set_pte_at(mm, pvmw.address, pvmw.pte, entry);
133131dbd01fSIzik Eidus 			goto out_unlock;
133231dbd01fSIzik Eidus 		}
133331dbd01fSIzik Eidus 
133431dbd01fSIzik Eidus 		/* See folio_try_share_anon_rmap_pte(): clear PTE first. */
133580e14822SHugh Dickins 		if (anon_exclusive &&
133680e14822SHugh Dickins 		    folio_try_share_anon_rmap_pte(page_folio(page), page)) {
133780e14822SHugh Dickins 			set_pte_at(mm, pvmw.address, pvmw.pte, entry);
133880e14822SHugh Dickins 			goto out_unlock;
133980e14822SHugh Dickins 		}
134080e14822SHugh Dickins 
134180e14822SHugh Dickins 		if (pte_dirty(entry))
134280e14822SHugh Dickins 			set_page_dirty(page);
134380e14822SHugh Dickins 		entry = pte_mkclean(entry);
1344337ed7ebSMinchan Kim 
1345337ed7ebSMinchan Kim 		if (pte_write(entry))
1346337ed7ebSMinchan Kim 			entry = pte_wrprotect(entry);
1347337ed7ebSMinchan Kim 
1348337ed7ebSMinchan Kim 		set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
1349337ed7ebSMinchan Kim 	}
135080e14822SHugh Dickins 	*orig_pte = entry;
135180e14822SHugh Dickins 	err = 0;
13528dd3557aSHugh Dickins 
135380e14822SHugh Dickins out_unlock:
135431dbd01fSIzik Eidus 	page_vma_mapped_walk_done(&pvmw);
1355f765f540SKirill A. Shutemov out_mn:
13568dd3557aSHugh Dickins 	mmu_notifier_invalidate_range_end(&range);
135731dbd01fSIzik Eidus out:
135831dbd01fSIzik Eidus 	return err;
135931dbd01fSIzik Eidus }
136031dbd01fSIzik Eidus 
136131dbd01fSIzik Eidus /**
136281464e30SHugh Dickins  * replace_page - replace page in vma by new ksm page
136381464e30SHugh Dickins  * @vma:      vma that holds the pte pointing to page
13648dd3557aSHugh Dickins  * @page:     the page we are replacing by kpage
13658dd3557aSHugh Dickins  * @kpage:    the ksm page we replace page by
136681464e30SHugh Dickins  * @orig_pte: the original value of the pte
136721fbd591SQi Zheng  *
13688dd3557aSHugh Dickins  * Returns 0 on success, -EFAULT on failure.
136981464e30SHugh Dickins  */
13708dd3557aSHugh Dickins static int replace_page(struct vm_area_struct *vma, struct page *page,
137181464e30SHugh Dickins 			struct page *kpage, pte_t orig_pte)
137281464e30SHugh Dickins {
137381464e30SHugh Dickins 	struct folio *kfolio = page_folio(kpage);
1374d8ed45c5SMichel Lespinasse 	struct mm_struct *mm = vma->vm_mm;
137585c6e8ddSAndrea Arcangeli 	struct folio *folio;
137685c6e8ddSAndrea Arcangeli 	pmd_t *pmd;
13779ba69294SHugh Dickins 	pmd_t pmde;
13789ba69294SHugh Dickins 	pte_t *ptep;
13798dd3557aSHugh Dickins 	pte_t newpte;
1380db114b83SHugh Dickins 	spinlock_t *ptl;
1381db114b83SHugh Dickins 	unsigned long addr;
1382db114b83SHugh Dickins 	int err = -EFAULT;
1383bc56620bSHugh Dickins 	struct mmu_notifier_range range;
1384bc56620bSHugh Dickins 
1385bc56620bSHugh Dickins 	addr = page_address_in_vma(page, vma);
1386c1e8d7c6SMichel Lespinasse 	if (addr == -EFAULT)
13879e60109fSPeter Zijlstra 		goto out;
13889e60109fSPeter Zijlstra 
138981464e30SHugh Dickins 	pmd = mm_find_pmd(mm, addr);
1390d8ed45c5SMichel Lespinasse 	if (!pmd)
1391739100c8SStefan Roesch 		goto out;
1392739100c8SStefan Roesch 	/*
139381464e30SHugh Dickins 	 * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
139481464e30SHugh Dickins 	 * without holding anon_vma lock for write.  So when looking for a
139581464e30SHugh Dickins 	 * genuine pmde (in which to find pte), test present and !THP together.
139681464e30SHugh Dickins 	 */
139731dbd01fSIzik Eidus 	pmde = pmdp_get_lockless(pmd);
139831dbd01fSIzik Eidus 	if (!pmd_present(pmde) || pmd_trans_huge(pmde))
139931dbd01fSIzik Eidus 		goto out;
14008dd3557aSHugh Dickins 
14018dd3557aSHugh Dickins 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
140231dbd01fSIzik Eidus 				addr + PAGE_SIZE);
140380e14822SHugh Dickins 	mmu_notifier_invalidate_range_start(&range);
140431dbd01fSIzik Eidus 
140531dbd01fSIzik Eidus 	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
140621fbd591SQi Zheng 	if (!ptep)
14078dd3557aSHugh Dickins 		goto out_mn;
140821fbd591SQi Zheng 	if (!pte_same(ptep_get(ptep), orig_pte)) {
14098dd3557aSHugh Dickins 		pte_unmap_unlock(ptep, ptl);
141031dbd01fSIzik Eidus 		goto out_mn;
141180e14822SHugh Dickins 	}
141231dbd01fSIzik Eidus 	VM_BUG_ON_PAGE(PageAnonExclusive(page), page);
141380e14822SHugh Dickins 	VM_BUG_ON_FOLIO(folio_test_anon(kfolio) && PageAnonExclusive(kpage),
141431dbd01fSIzik Eidus 			kfolio);
14158dd3557aSHugh Dickins 
141680e14822SHugh Dickins 	/*
141731dbd01fSIzik Eidus 	 * No need to check ksm_use_zero_pages here: we can only have a
141881464e30SHugh Dickins 	 * zero_page here if ksm_use_zero_pages was enabled already.
141981464e30SHugh Dickins 	 */
142031dbd01fSIzik Eidus 	if (!is_zero_pfn(page_to_pfn(kpage))) {
14214035c07aSHugh Dickins 		folio_get(kfolio);
14228dd3557aSHugh Dickins 		folio_add_anon_rmap_pte(kfolio, kpage, vma, addr, RMAP_NONE);
142331dbd01fSIzik Eidus 		newpte = mk_pte(kpage, vma->vm_page_prot);
142480e14822SHugh Dickins 	} else {
142531dbd01fSIzik Eidus 		/*
142631dbd01fSIzik Eidus 		 * Use pte_mkdirty to mark the zero page mapped by KSM, and then
14272c653d0eSAndrea Arcangeli 		 * we can easily track all KSM-placed zero pages by checking if
142821fbd591SQi Zheng 		 * the dirty bit in zero page's PTE is set.
14292c653d0eSAndrea Arcangeli 		 */
14302c653d0eSAndrea Arcangeli 		newpte = pte_mkdirty(pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot)));
14312c653d0eSAndrea Arcangeli 		ksm_zero_pages++;
14322c653d0eSAndrea Arcangeli 		mm->ksm_zero_pages++;
14332c653d0eSAndrea Arcangeli 		/*
14342c653d0eSAndrea Arcangeli 		 * We're replacing an anonymous page with a zero page, which is
14352c653d0eSAndrea Arcangeli 		 * not anonymous. We need to do proper accounting otherwise we
14362c653d0eSAndrea Arcangeli 		 * will get wrong values in /proc, and a BUG message in dmesg
14372c653d0eSAndrea Arcangeli 		 * when tearing down the mm.
14382c653d0eSAndrea Arcangeli 		 */
14392c653d0eSAndrea Arcangeli 		dec_mm_counter(mm, MM_ANONPAGES);
14402c653d0eSAndrea Arcangeli 	}
14412c653d0eSAndrea Arcangeli 
144221fbd591SQi Zheng 	flush_cache_page(vma, addr, pte_pfn(ptep_get(ptep)));
14432c653d0eSAndrea Arcangeli 	/*
14442c653d0eSAndrea Arcangeli 	 * No need to notify as we are replacing a read only page with another
14452c653d0eSAndrea Arcangeli 	 * read only page with the same content.
14462c653d0eSAndrea Arcangeli 	 *
144721fbd591SQi Zheng 	 * See Documentation/mm/mmu_notifier.rst
144821fbd591SQi Zheng 	 */
14492c653d0eSAndrea Arcangeli 	ptep_clear_flush(vma, addr, ptep);
14502c653d0eSAndrea Arcangeli 	set_pte_at_notify(mm, addr, ptep, newpte);
14512c653d0eSAndrea Arcangeli 
145221fbd591SQi Zheng 	folio = page_folio(page);
14532c653d0eSAndrea Arcangeli 	folio_remove_rmap_pte(folio, page, vma);
14548dc5ffcdSAndrea Arcangeli 	if (!folio_mapped(folio))
14552c653d0eSAndrea Arcangeli 		folio_free_swap(folio);
14562c653d0eSAndrea Arcangeli 	folio_put(folio);
14572c653d0eSAndrea Arcangeli 
14582c653d0eSAndrea Arcangeli 	pte_unmap_unlock(ptep, ptl);
14592c653d0eSAndrea Arcangeli 	err = 0;
14602c653d0eSAndrea Arcangeli out_mn:
14612c653d0eSAndrea Arcangeli 	mmu_notifier_invalidate_range_end(&range);
14622c653d0eSAndrea Arcangeli out:
14632c653d0eSAndrea Arcangeli 	return err;
14642c653d0eSAndrea Arcangeli }
14652c653d0eSAndrea Arcangeli 
14662c653d0eSAndrea Arcangeli /*
14672c653d0eSAndrea Arcangeli  * try_to_merge_one_page - take two pages and merge them into one
14682c653d0eSAndrea Arcangeli  * @vma: the vma that holds the pte pointing to page
14692c653d0eSAndrea Arcangeli  * @page: the PageAnon page that we want to replace with kpage
14702c653d0eSAndrea Arcangeli  * @kpage: the PageKsm page that we want to map instead of page,
14712c653d0eSAndrea Arcangeli  *         or NULL the first time when we want to use page as kpage.
14722c653d0eSAndrea Arcangeli  *
14732c653d0eSAndrea Arcangeli  * This function returns 0 if the pages were merged, -EFAULT otherwise.
14742c653d0eSAndrea Arcangeli  */
14752c653d0eSAndrea Arcangeli static int try_to_merge_one_page(struct vm_area_struct *vma,
14762c653d0eSAndrea Arcangeli 				 struct page *page, struct page *kpage)
14772c653d0eSAndrea Arcangeli {
14782c653d0eSAndrea Arcangeli 	pte_t orig_pte = __pte(0);
14792cee57d1SYang Shi 	int err = -EFAULT;
14802c653d0eSAndrea Arcangeli 
14812c653d0eSAndrea Arcangeli 	if (page == kpage)			/* ksm page forked */
14822c653d0eSAndrea Arcangeli 		return 0;
14832c653d0eSAndrea Arcangeli 
14842c653d0eSAndrea Arcangeli 	if (!PageAnon(page))
14852c653d0eSAndrea Arcangeli 		goto out;
14862c653d0eSAndrea Arcangeli 
14878dc5ffcdSAndrea Arcangeli 	/*
14882c653d0eSAndrea Arcangeli 	 * We need the page lock to read a stable PageSwapCache in
14892c653d0eSAndrea Arcangeli 	 * write_protect_page().  We use trylock_page() instead of
14908dc5ffcdSAndrea Arcangeli 	 * lock_page() because we don't want to wait here - we
14912c653d0eSAndrea Arcangeli 	 * prefer to continue scanning and merging different pages,
14928dc5ffcdSAndrea Arcangeli 	 * then come back to this page when it is unlocked.
14932c653d0eSAndrea Arcangeli 	 */
14942c653d0eSAndrea Arcangeli 	if (!trylock_page(page))
14952c653d0eSAndrea Arcangeli 		goto out;
14962c653d0eSAndrea Arcangeli 
14972c653d0eSAndrea Arcangeli 	if (PageTransCompound(page)) {
14982c653d0eSAndrea Arcangeli 		if (split_huge_page(page))
14992c653d0eSAndrea Arcangeli 			goto out_unlock;
15002c653d0eSAndrea Arcangeli 	}
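	/*
	 * KSM only merges at base-page granularity, so a compound (THP)
	 * page has to be split above before one of its subpages can be
	 * write-protected and merged; if the split fails we just give up
	 * on this page for now.
	 */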
150180b18dfaSAndrea Arcangeli 
15022c653d0eSAndrea Arcangeli 	/*
150380b18dfaSAndrea Arcangeli 	 * If this anonymous page is mapped only here, its pte may need
150480b18dfaSAndrea Arcangeli 	 * to be write-protected.  If it's mapped elsewhere, all of its
150580b18dfaSAndrea Arcangeli 	 * ptes are necessarily already write-protected.  But in either
150680b18dfaSAndrea Arcangeli 	 * case, we need to lock and check page_count is not raised.
15072c653d0eSAndrea Arcangeli 	 */
150880b18dfaSAndrea Arcangeli 	if (write_protect_page(vma, page, &orig_pte) == 0) {
15092c653d0eSAndrea Arcangeli 		if (!kpage) {
15102c653d0eSAndrea Arcangeli 			/*
15112c653d0eSAndrea Arcangeli 			 * While we hold page lock, upgrade page from
15122c653d0eSAndrea Arcangeli 			 * PageAnon+anon_vma to PageKsm+NULL stable_node:
15132c653d0eSAndrea Arcangeli 			 * stable_tree_insert() will update stable_node.
15142c653d0eSAndrea Arcangeli 			 */
15152c653d0eSAndrea Arcangeli 			set_page_stable_node(page, NULL);
15162c653d0eSAndrea Arcangeli 			mark_page_accessed(page);
15172c653d0eSAndrea Arcangeli 			/*
15182c653d0eSAndrea Arcangeli 			 * Page reclaim just frees a clean page with no dirty
15192c653d0eSAndrea Arcangeli 			 * ptes: make sure that the ksm page would be swapped.
15202c653d0eSAndrea Arcangeli 			 */
15212c653d0eSAndrea Arcangeli 			if (!PageDirty(page))
15222c653d0eSAndrea Arcangeli 				SetPageDirty(page);
15232c653d0eSAndrea Arcangeli 			err = 0;
15242c653d0eSAndrea Arcangeli 		} else if (pages_identical(page, kpage))
15252c653d0eSAndrea Arcangeli 			err = replace_page(vma, page, kpage, orig_pte);
1526b4fecc67SAndrea Arcangeli 	}
15270ba1d0f7SAndrea Arcangeli 
15280ba1d0f7SAndrea Arcangeli out_unlock:
15290ba1d0f7SAndrea Arcangeli 	unlock_page(page);
1530b4fecc67SAndrea Arcangeli out:
15310ba1d0f7SAndrea Arcangeli 	return err;
15320ba1d0f7SAndrea Arcangeli }
1533f0953a1bSIngo Molnar 
15340ba1d0f7SAndrea Arcangeli /*
15350ba1d0f7SAndrea Arcangeli  * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
15360ba1d0f7SAndrea Arcangeli  * but no new kernel page is allocated: kpage must already be a ksm page.
15370ba1d0f7SAndrea Arcangeli  *
15380ba1d0f7SAndrea Arcangeli  * This function returns 0 if the pages were merged, -EFAULT otherwise.
153980b18dfaSAndrea Arcangeli  */
154080b18dfaSAndrea Arcangeli static int try_to_merge_with_ksm_page(struct ksm_rmap_item *rmap_item,
15412c653d0eSAndrea Arcangeli 				      struct page *page, struct page *kpage)
154280b18dfaSAndrea Arcangeli {
154380b18dfaSAndrea Arcangeli 	struct mm_struct *mm = rmap_item->mm;
154480b18dfaSAndrea Arcangeli 	struct vm_area_struct *vma;
154580b18dfaSAndrea Arcangeli 	int err = -EFAULT;
154680b18dfaSAndrea Arcangeli 
154780b18dfaSAndrea Arcangeli 	mmap_read_lock(mm);
154880b18dfaSAndrea Arcangeli 	vma = find_mergeable_vma(mm, rmap_item->address);
154980b18dfaSAndrea Arcangeli 	if (!vma)
155080b18dfaSAndrea Arcangeli 		goto out;
155180b18dfaSAndrea Arcangeli 
155280b18dfaSAndrea Arcangeli 	err = try_to_merge_one_page(vma, page, kpage);
155380b18dfaSAndrea Arcangeli 	if (err)
155480b18dfaSAndrea Arcangeli 		goto out;
15552c653d0eSAndrea Arcangeli 
15562c653d0eSAndrea Arcangeli 	/* Unstable nid is in union with stable anon_vma: remove first */
15572c653d0eSAndrea Arcangeli 	remove_rmap_item_from_tree(rmap_item);
15582c653d0eSAndrea Arcangeli 
15592c653d0eSAndrea Arcangeli 	/* Must get reference to anon_vma while still holding mmap_lock */
15602c653d0eSAndrea Arcangeli 	rmap_item->anon_vma = vma->anon_vma;
15612c653d0eSAndrea Arcangeli 	get_anon_vma(vma->anon_vma);
15628dc5ffcdSAndrea Arcangeli out:
15638dc5ffcdSAndrea Arcangeli 	mmap_read_unlock(mm);
15642c653d0eSAndrea Arcangeli 	trace_ksm_merge_with_ksm_page(kpage, page_to_pfn(kpage ? kpage : page),
15652c653d0eSAndrea Arcangeli 				rmap_item, mm, err);
156621fbd591SQi Zheng 	return err;
15672c653d0eSAndrea Arcangeli }
15682c653d0eSAndrea Arcangeli 
15692c653d0eSAndrea Arcangeli /*
15702c653d0eSAndrea Arcangeli  * try_to_merge_two_pages - take two identical pages and prepare them
15712c653d0eSAndrea Arcangeli  * to be merged into one page.
15722c653d0eSAndrea Arcangeli  *
15732c653d0eSAndrea Arcangeli  * This function returns the kpage if we successfully merged two identical
15742c653d0eSAndrea Arcangeli  * pages into one ksm page, NULL otherwise.
15752c653d0eSAndrea Arcangeli  *
15762c653d0eSAndrea Arcangeli  * Note that this function upgrades page to ksm page: if one of the pages
15772c653d0eSAndrea Arcangeli  * is already a ksm page, try_to_merge_with_ksm_page should be used.
15782c653d0eSAndrea Arcangeli  */
15798dc5ffcdSAndrea Arcangeli static struct page *try_to_merge_two_pages(struct ksm_rmap_item *rmap_item,
15808dc5ffcdSAndrea Arcangeli 					   struct page *page,
15818dc5ffcdSAndrea Arcangeli 					   struct ksm_rmap_item *tree_rmap_item,
15828dc5ffcdSAndrea Arcangeli 					   struct page *tree_page)
15838dc5ffcdSAndrea Arcangeli {
15848dc5ffcdSAndrea Arcangeli 	int err;
15858dc5ffcdSAndrea Arcangeli 
15868dc5ffcdSAndrea Arcangeli 	err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
15878dc5ffcdSAndrea Arcangeli 	if (!err) {
15888dc5ffcdSAndrea Arcangeli 		err = try_to_merge_with_ksm_page(tree_rmap_item,
15898dc5ffcdSAndrea Arcangeli 							tree_page, page);
15908dc5ffcdSAndrea Arcangeli 		/*
15918dc5ffcdSAndrea Arcangeli 		 * If that fails, we have a ksm page with only one pte
15928dc5ffcdSAndrea Arcangeli 		 * pointing to it: so break it.
159321fbd591SQi Zheng 		 */
159421fbd591SQi Zheng 		if (err)
15952c653d0eSAndrea Arcangeli 			break_cow(rmap_item);
15962c653d0eSAndrea Arcangeli 	}
15972c653d0eSAndrea Arcangeli 	return err ? NULL : page;
159821fbd591SQi Zheng }
15992c653d0eSAndrea Arcangeli 
16002c653d0eSAndrea Arcangeli static __always_inline
16018dc5ffcdSAndrea Arcangeli bool __is_page_sharing_candidate(struct ksm_stable_node *stable_node, int offset)
16022cee57d1SYang Shi {
16032c653d0eSAndrea Arcangeli 	VM_BUG_ON(stable_node->rmap_hlist_len < 0);
16048dc5ffcdSAndrea Arcangeli 	/*
16058dc5ffcdSAndrea Arcangeli 	 * Check that at least one mapping still exists, otherwise
16068dc5ffcdSAndrea Arcangeli 	 * there's not much point in merging and sharing with this
16078dc5ffcdSAndrea Arcangeli 	 * stable_node, as the underlying tree_page of the other
16088dc5ffcdSAndrea Arcangeli 	 * sharer is going to be freed soon.
16092c653d0eSAndrea Arcangeli 	 */
16102c653d0eSAndrea Arcangeli 	return stable_node->rmap_hlist_len &&
16118dc5ffcdSAndrea Arcangeli 		stable_node->rmap_hlist_len + offset < ksm_max_page_sharing;
16122c653d0eSAndrea Arcangeli }
16132c653d0eSAndrea Arcangeli 
16142c653d0eSAndrea Arcangeli static __always_inline
161521fbd591SQi Zheng bool is_page_sharing_candidate(struct ksm_stable_node *stable_node)
161621fbd591SQi Zheng {
16172c653d0eSAndrea Arcangeli 	return __is_page_sharing_candidate(stable_node, 0);
16182c653d0eSAndrea Arcangeli }
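/*
 * The offset argument of __is_page_sharing_candidate() lets callers ask
 * about future capacity: is_page_sharing_candidate() passes 0 to test
 * whether a dup can take the merge being set up right now, while
 * stable_node_dup() below passes 1 to test whether a dup could accept
 * one further merge in addition to the one already underway.
 */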
16198dc5ffcdSAndrea Arcangeli 
16202c653d0eSAndrea Arcangeli static struct page *stable_node_dup(struct ksm_stable_node **_stable_node_dup,
16212c653d0eSAndrea Arcangeli 				    struct ksm_stable_node **_stable_node,
162221fbd591SQi Zheng 				    struct rb_root *root,
162321fbd591SQi Zheng 				    bool prune_stale_stable_nodes)
16242c653d0eSAndrea Arcangeli {
16252c653d0eSAndrea Arcangeli 	struct ksm_stable_node *dup, *found = NULL, *stable_node = *_stable_node;
162621fbd591SQi Zheng 	struct hlist_node *hlist_safe;
16278dc5ffcdSAndrea Arcangeli 	struct page *_tree_page, *tree_page = NULL;
16288dc5ffcdSAndrea Arcangeli 	int nr = 0;
16298dc5ffcdSAndrea Arcangeli 	int found_rmap_hlist_len;
16308dc5ffcdSAndrea Arcangeli 
16318dc5ffcdSAndrea Arcangeli 	if (!prune_stale_stable_nodes ||
16328dc5ffcdSAndrea Arcangeli 	    time_before(jiffies, stable_node->chain_prune_time +
16332c653d0eSAndrea Arcangeli 			msecs_to_jiffies(
16342c653d0eSAndrea Arcangeli 				ksm_stable_node_chains_prune_millisecs)))
163531dbd01fSIzik Eidus 		prune_stale_stable_nodes = false;
16368dd3557aSHugh Dickins 	else
163731dbd01fSIzik Eidus 		stable_node->chain_prune_time = jiffies;
163831dbd01fSIzik Eidus 
163931dbd01fSIzik Eidus 	hlist_for_each_entry_safe(dup, hlist_safe,
164031dbd01fSIzik Eidus 				  &stable_node->hlist, hlist_dup) {
16417b6ba2c7SHugh Dickins 		cond_resched();
164231dbd01fSIzik Eidus 		/*
164331dbd01fSIzik Eidus 		 * We must walk all stable_node_dup to prune the stale
164462b61f61SHugh Dickins 		 * stable nodes during lookup.
164531dbd01fSIzik Eidus 		 *
164690bd6fd3SPetr Holasek 		 * get_ksm_page can drop the nodes from the
1647ef53d16cSHugh Dickins 		 * stable_node->hlist if they point to freed pages
16484146d2d6SHugh Dickins 		 * (that's why we do a _safe walk). The "dup"
16494146d2d6SHugh Dickins 		 * stable_node parameter itself will be freed from
165021fbd591SQi Zheng 		 * under us if it returns NULL.
165121fbd591SQi Zheng 		 */
165231dbd01fSIzik Eidus 		_tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK);
16534146d2d6SHugh Dickins 		if (!_tree_page)
16544146d2d6SHugh Dickins 			continue;
16554146d2d6SHugh Dickins 		nr += 1;
165608beca44SHugh Dickins 		if (is_page_sharing_candidate(dup)) {
165762b61f61SHugh Dickins 			if (!found ||
165808beca44SHugh Dickins 			    dup->rmap_hlist_len > found_rmap_hlist_len) {
165908beca44SHugh Dickins 				if (found)
166090bd6fd3SPetr Holasek 					put_page(tree_page);
1661ef53d16cSHugh Dickins 				found = dup;
16624146d2d6SHugh Dickins 				found_rmap_hlist_len = found->rmap_hlist_len;
1663ef53d16cSHugh Dickins 				tree_page = _tree_page;
16644146d2d6SHugh Dickins 
166590bd6fd3SPetr Holasek 				/* skip put_page for found dup */
16664146d2d6SHugh Dickins 				if (!prune_stale_stable_nodes)
16674035c07aSHugh Dickins 					break;
166831dbd01fSIzik Eidus 				continue;
166931dbd01fSIzik Eidus 			}
167031dbd01fSIzik Eidus 		}
167121fbd591SQi Zheng 		put_page(_tree_page);
16722c653d0eSAndrea Arcangeli 	}
16738dc5ffcdSAndrea Arcangeli 
1674b4fecc67SAndrea Arcangeli 	if (found) {
1675b4fecc67SAndrea Arcangeli 		/*
1676b4fecc67SAndrea Arcangeli 		 * nr is counting all dups in the chain only if
1677b4fecc67SAndrea Arcangeli 		 * prune_stale_stable_nodes is true, otherwise we may
1678b4fecc67SAndrea Arcangeli 		 * break the loop at nr == 1 even if there are
1679b4fecc67SAndrea Arcangeli 		 * multiple entries.
16800ba1d0f7SAndrea Arcangeli 		 */
16813413b2c8SJulia Lawall 		if (prune_stale_stable_nodes && nr == 1) {
16820ba1d0f7SAndrea Arcangeli 			/*
16830ba1d0f7SAndrea Arcangeli 			 * If there's not just one entry it would
16840ba1d0f7SAndrea Arcangeli 			 * corrupt memory, better BUG_ON. In KSM
1685b4fecc67SAndrea Arcangeli 			 * context with no lock held it's not even
16862c653d0eSAndrea Arcangeli 			 * fatal.
16872c653d0eSAndrea Arcangeli 			 */
16882c653d0eSAndrea Arcangeli 			BUG_ON(stable_node->hlist.first->next);
16892c653d0eSAndrea Arcangeli 
16902c653d0eSAndrea Arcangeli 			/*
16912c653d0eSAndrea Arcangeli 			 * There's just one entry and it is below the
16922c653d0eSAndrea Arcangeli 			 * deduplication limit so drop the chain.
16932c653d0eSAndrea Arcangeli 			 */
16942c653d0eSAndrea Arcangeli 			rb_replace_node(&stable_node->node, &found->node,
16952c653d0eSAndrea Arcangeli 					root);
16962c653d0eSAndrea Arcangeli 			free_stable_node(stable_node);
16972c653d0eSAndrea Arcangeli 			ksm_stable_node_chains--;
16982c653d0eSAndrea Arcangeli 			ksm_stable_node_dups--;
16992c653d0eSAndrea Arcangeli 			/*
17002c653d0eSAndrea Arcangeli 			 * NOTE: the caller depends on the stable_node
17012c653d0eSAndrea Arcangeli 			 * to be equal to stable_node_dup if the chain
17022c653d0eSAndrea Arcangeli 			 * was collapsed.
17032c653d0eSAndrea Arcangeli 			 */
1704457aef94SEthon Paul 			*_stable_node = found;
17052c653d0eSAndrea Arcangeli 			/*
17062c653d0eSAndrea Arcangeli 			 * Just for robustness, as stable_node is
17072cee57d1SYang Shi 			 * otherwise left as a stable pointer, the
17082cee57d1SYang Shi 			 * compiler shall optimize it away at build
17092c653d0eSAndrea Arcangeli 			 * time.
17102c653d0eSAndrea Arcangeli 			 */
1711f2e5ff85SAndrea Arcangeli 			stable_node = NULL;
1712f2e5ff85SAndrea Arcangeli 		} else if (stable_node->hlist.first != &found->hlist_dup &&
1713f2e5ff85SAndrea Arcangeli 			   __is_page_sharing_candidate(found, 1)) {
1714f2e5ff85SAndrea Arcangeli 			/*
1715f2e5ff85SAndrea Arcangeli 			 * If the found stable_node dup can accept one
1716f2e5ff85SAndrea Arcangeli 			 * more future merge (in addition to the one
1717f2e5ff85SAndrea Arcangeli 			 * that is underway) and is not at the head of
1718f2e5ff85SAndrea Arcangeli 			 * the chain, put it there so next search will
1719f2e5ff85SAndrea Arcangeli 			 * be quicker in the !prune_stale_stable_nodes
1720f2e5ff85SAndrea Arcangeli 			 * case.
1721f2e5ff85SAndrea Arcangeli 			 *
1722f2e5ff85SAndrea Arcangeli 			 * NOTE: it would be inaccurate to use nr > 1
172331dbd01fSIzik Eidus 			 * instead of checking the hlist.first pointer
17244035c07aSHugh Dickins 			 * directly, because in the
1725c8d6553bSHugh Dickins 			 * prune_stale_stable_nodes case "nr" isn't
172631dbd01fSIzik Eidus 			 * the position of the found dup in the chain,
17274146d2d6SHugh Dickins 			 * but the total number of dups in the chain.
1728c8d6553bSHugh Dickins 			 */
17294146d2d6SHugh Dickins 			hlist_del(&found->hlist_dup);
1730c8d6553bSHugh Dickins 			hlist_add_head(&found->hlist_dup,
17314146d2d6SHugh Dickins 				       &stable_node->hlist);
1732c8d6553bSHugh Dickins 		}
17332c653d0eSAndrea Arcangeli 	}
17342c653d0eSAndrea Arcangeli 
17352c653d0eSAndrea Arcangeli 	*_stable_node_dup = found;
17362c653d0eSAndrea Arcangeli 	return tree_page;
17372c653d0eSAndrea Arcangeli }
17382c653d0eSAndrea Arcangeli 
17392c653d0eSAndrea Arcangeli static struct ksm_stable_node *stable_node_dup_any(struct ksm_stable_node *stable_node,
17402c653d0eSAndrea Arcangeli 					       struct rb_root *root)
17412c653d0eSAndrea Arcangeli {
17422c653d0eSAndrea Arcangeli 	if (!is_stable_node_chain(stable_node))
17432c653d0eSAndrea Arcangeli 		return stable_node;
17442c653d0eSAndrea Arcangeli 	if (hlist_empty(&stable_node->hlist)) {
17452c653d0eSAndrea Arcangeli 		free_stable_node_chain(stable_node, root);
17462c653d0eSAndrea Arcangeli 		return NULL;
17472c653d0eSAndrea Arcangeli 	}
17482c653d0eSAndrea Arcangeli 	return hlist_entry(stable_node->hlist.first,
17492c653d0eSAndrea Arcangeli 			   typeof(*stable_node), hlist_dup);
17502c653d0eSAndrea Arcangeli }
17512c653d0eSAndrea Arcangeli 
17522c653d0eSAndrea Arcangeli /*
17532c653d0eSAndrea Arcangeli  * Like for get_ksm_page, this function can free the *_stable_node and
17542c653d0eSAndrea Arcangeli  * *_stable_node_dup if the returned tree_page is NULL.
17552c653d0eSAndrea Arcangeli  *
17562c653d0eSAndrea Arcangeli  * It can also free and overwrite *_stable_node with the found
17572c653d0eSAndrea Arcangeli  * stable_node_dup if the chain is collapsed (in which case
17582c653d0eSAndrea Arcangeli  * *_stable_node will be equal to *_stable_node_dup like if the chain
17592c653d0eSAndrea Arcangeli  * never existed). It's up to the caller to verify tree_page is not
17602c653d0eSAndrea Arcangeli  * NULL before dereferencing *_stable_node or *_stable_node_dup.
1761c8d6553bSHugh Dickins  *
1762c8d6553bSHugh Dickins  * *_stable_node_dup is really a second output parameter of this
1763c8d6553bSHugh Dickins  * function and will be overwritten in all cases, the caller doesn't
1764c8d6553bSHugh Dickins  * need to initialize it.
1765c8d6553bSHugh Dickins  */
1766c8d6553bSHugh Dickins static struct page *__stable_node_chain(struct ksm_stable_node **_stable_node_dup,
1767c8d6553bSHugh Dickins 					struct ksm_stable_node **_stable_node,
17682cee57d1SYang Shi 					struct rb_root *root,
17692cee57d1SYang Shi 					bool prune_stale_stable_nodes)
17702cee57d1SYang Shi {
17712cee57d1SYang Shi 	struct ksm_stable_node *stable_node = *_stable_node;
17722cee57d1SYang Shi 	if (!is_stable_node_chain(stable_node)) {
17732cee57d1SYang Shi 		if (is_page_sharing_candidate(stable_node)) {
17742c653d0eSAndrea Arcangeli 			*_stable_node_dup = stable_node;
17752c653d0eSAndrea Arcangeli 			return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK);
17762c653d0eSAndrea Arcangeli 		}
17772c653d0eSAndrea Arcangeli 		/*
17782c653d0eSAndrea Arcangeli 		 * _stable_node_dup set to NULL means the stable_node
17792c653d0eSAndrea Arcangeli 		 * reached the ksm_max_page_sharing limit.
1780c8d6553bSHugh Dickins 		 */
17812c653d0eSAndrea Arcangeli 		*_stable_node_dup = NULL;
17822c653d0eSAndrea Arcangeli 		return NULL;
17832c653d0eSAndrea Arcangeli 	}
17844146d2d6SHugh Dickins 	return stable_node_dup(_stable_node_dup, _stable_node, root,
17854146d2d6SHugh Dickins 			       prune_stale_stable_nodes);
17864146d2d6SHugh Dickins }
178762b61f61SHugh Dickins 
178831dbd01fSIzik Eidus static __always_inline struct page *chain_prune(struct ksm_stable_node **s_n_d,
1789c8d6553bSHugh Dickins 						struct ksm_stable_node **s_n,
179031dbd01fSIzik Eidus 						struct rb_root *root)
17914146d2d6SHugh Dickins {
179231dbd01fSIzik Eidus 	return __stable_node_chain(s_n_d, s_n, root, true);
17934146d2d6SHugh Dickins }
17944146d2d6SHugh Dickins 
17954146d2d6SHugh Dickins static __always_inline struct page *chain(struct ksm_stable_node **s_n_d,
17964146d2d6SHugh Dickins 					  struct ksm_stable_node *s_n,
1797ef53d16cSHugh Dickins 					  struct rb_root *root)
17982c653d0eSAndrea Arcangeli {
17992c653d0eSAndrea Arcangeli 	struct ksm_stable_node *old_stable_node = s_n;
18004146d2d6SHugh Dickins 	struct page *tree_page;
18014146d2d6SHugh Dickins 
18022c653d0eSAndrea Arcangeli 	tree_page = __stable_node_chain(s_n_d, &s_n, root, false);
18032c653d0eSAndrea Arcangeli 	/* not pruning dups so s_n cannot have changed */
18044146d2d6SHugh Dickins 	VM_BUG_ON(s_n != old_stable_node);
18054146d2d6SHugh Dickins 	return tree_page;
1806b4fecc67SAndrea Arcangeli }
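/*
 * The two wrappers above differ only in prune_stale_stable_nodes:
 * chain_prune() may drop stale dups and even collapse the chain, so the
 * stable_node pointer handed in can be rewritten behind the caller's
 * back; chain() never prunes, which is why it can VM_BUG_ON() if the
 * stable_node it was given changed.
 */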
1807b4fecc67SAndrea Arcangeli 
18080ba1d0f7SAndrea Arcangeli /*
18090ba1d0f7SAndrea Arcangeli  * stable_tree_search - search for page inside the stable tree
18100ba1d0f7SAndrea Arcangeli  *
18110ba1d0f7SAndrea Arcangeli  * This function checks if there is a page inside the stable tree
18120ba1d0f7SAndrea Arcangeli  * with identical content to the page that we are scanning right now.
1813b4fecc67SAndrea Arcangeli  *
18140ba1d0f7SAndrea Arcangeli  * This function returns the stable tree node of identical content if found,
1815b4fecc67SAndrea Arcangeli  * NULL otherwise.
1816b4fecc67SAndrea Arcangeli  */
18172c653d0eSAndrea Arcangeli static struct page *stable_tree_search(struct page *page)
18184146d2d6SHugh Dickins {
18192c653d0eSAndrea Arcangeli 	int nid;
18204146d2d6SHugh Dickins 	struct rb_root *root;
18214146d2d6SHugh Dickins 	struct rb_node **new;
1822b4fecc67SAndrea Arcangeli 	struct rb_node *parent;
1823b4fecc67SAndrea Arcangeli 	struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any;
18242c653d0eSAndrea Arcangeli 	struct ksm_stable_node *page_node;
18252c653d0eSAndrea Arcangeli 
18264146d2d6SHugh Dickins 	page_node = page_stable_node(page);
18272c653d0eSAndrea Arcangeli 	if (page_node && page_node->head != &migrate_nodes) {
18282c653d0eSAndrea Arcangeli 		/* ksm page forked */
18294146d2d6SHugh Dickins 		get_page(page);
1830b4fecc67SAndrea Arcangeli 		return page;
18314146d2d6SHugh Dickins 	}
18324146d2d6SHugh Dickins 
18332c653d0eSAndrea Arcangeli 	nid = get_kpfn_nid(page_to_pfn(page));
18342c653d0eSAndrea Arcangeli 	root = root_stable_tree + nid;
18352c653d0eSAndrea Arcangeli again:
18362c653d0eSAndrea Arcangeli 	new = &root->rb_node;
18372c653d0eSAndrea Arcangeli 	parent = NULL;
18382c653d0eSAndrea Arcangeli 
18392c653d0eSAndrea Arcangeli 	while (*new) {
18402c653d0eSAndrea Arcangeli 		struct page *tree_page;
18412c653d0eSAndrea Arcangeli 		int ret;
18422c653d0eSAndrea Arcangeli 
18432c653d0eSAndrea Arcangeli 		cond_resched();
18442c653d0eSAndrea Arcangeli 		stable_node = rb_entry(*new, struct ksm_stable_node, node);
18452c653d0eSAndrea Arcangeli 		stable_node_any = NULL;
18462c653d0eSAndrea Arcangeli 		tree_page = chain_prune(&stable_node_dup, &stable_node,	root);
18472c653d0eSAndrea Arcangeli 		/*
18482c653d0eSAndrea Arcangeli 		 * NOTE: stable_node may have been freed by
18492c653d0eSAndrea Arcangeli 		 * chain_prune() if the returned stable_node_dup is
18502c653d0eSAndrea Arcangeli 		 * not NULL. stable_node_dup may have been inserted in
18514146d2d6SHugh Dickins 		 * the rbtree instead as a regular stable_node (in
18522c653d0eSAndrea Arcangeli 		 * order to collapse the stable_node chain if a single
18532c653d0eSAndrea Arcangeli 		 * stable_node dup was found in it). In such case the
18542c653d0eSAndrea Arcangeli 		 * stable_node is overwritten by the callee to point
18552c653d0eSAndrea Arcangeli 		 * to the stable_node_dup that was collapsed in the
18562c653d0eSAndrea Arcangeli 		 * stable rbtree and stable_node will be equal to
1857b4fecc67SAndrea Arcangeli 		 * stable_node_dup like if the chain never existed.
1858b4fecc67SAndrea Arcangeli 		 */
18590ba1d0f7SAndrea Arcangeli 		if (!stable_node_dup) {
18600ba1d0f7SAndrea Arcangeli 			/*
18610ba1d0f7SAndrea Arcangeli 			 * Either all stable_node dups were full in
18620ba1d0f7SAndrea Arcangeli 			 * this stable_node chain, or this chain was
18630ba1d0f7SAndrea Arcangeli 			 * empty and should be rb_erased.
1864b4fecc67SAndrea Arcangeli 			 */
18650ba1d0f7SAndrea Arcangeli 			stable_node_any = stable_node_dup_any(stable_node,
1866b4fecc67SAndrea Arcangeli 							      root);
18672c653d0eSAndrea Arcangeli 			if (!stable_node_any) {
18682c653d0eSAndrea Arcangeli 				/* rb_erase just run */
18692c653d0eSAndrea Arcangeli 				goto again;
18702c653d0eSAndrea Arcangeli 			}
18712c653d0eSAndrea Arcangeli 			/*
18722c653d0eSAndrea Arcangeli 			 * Take any of the stable_node dups page of
18732c653d0eSAndrea Arcangeli 			 * Take the page of any of the stable_node dups in
18742c653d0eSAndrea Arcangeli 			 * continue. All KSM pages belonging to the
18752c653d0eSAndrea Arcangeli 			 * stable_node dups in a stable_node chain
18762c653d0eSAndrea Arcangeli 			 * have the same content and they're
18772c653d0eSAndrea Arcangeli 			 * write protected at all times. Any will work
18782c653d0eSAndrea Arcangeli 			 * fine to continue the walk.
1879b4fecc67SAndrea Arcangeli 			 */
18802c653d0eSAndrea Arcangeli 			tree_page = get_ksm_page(stable_node_any,
18812c653d0eSAndrea Arcangeli 						 GET_KSM_PAGE_NOLOCK);
18822c653d0eSAndrea Arcangeli 		}
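		/*
		 * Exactly one of stable_node_dup and stable_node_any is set
		 * here: either a dup with spare capacity was found, or we
		 * fell back to an arbitrary dup just to keep walking the
		 * tree, which is what the VM_BUG_ON below asserts.
		 */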
18832c653d0eSAndrea Arcangeli 		VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
18842c653d0eSAndrea Arcangeli 		if (!tree_page) {
188531dbd01fSIzik Eidus 			/*
188631dbd01fSIzik Eidus 			 * If we walked over a stale stable_node,
188731dbd01fSIzik Eidus 			 * get_ksm_page() will call rb_erase() and it
1888e850dcf5SHugh Dickins 			 * may rebalance the tree from under us. So
188931dbd01fSIzik Eidus 			 * restart the search from scratch. Returning
189031dbd01fSIzik Eidus 			 * NULL would be safe too, but we'd generate
18917b6ba2c7SHugh Dickins 			 * false negative insertions just because some
18927b6ba2c7SHugh Dickins 			 * stable_node was stale.
189331dbd01fSIzik Eidus 			 */
189421fbd591SQi Zheng 			goto again;
189531dbd01fSIzik Eidus 		}
189690bd6fd3SPetr Holasek 
189790bd6fd3SPetr Holasek 		ret = memcmp_pages(page, tree_page);
1898ef53d16cSHugh Dickins 		put_page(tree_page);
189990bd6fd3SPetr Holasek 
1900f2e5ff85SAndrea Arcangeli 		parent = *new;
190121fbd591SQi Zheng 		if (ret < 0)
19022c653d0eSAndrea Arcangeli 			new = &parent->rb_left;
190331dbd01fSIzik Eidus 		else if (ret > 0)
190490bd6fd3SPetr Holasek 			new = &parent->rb_right;
190590bd6fd3SPetr Holasek 		else {
1906ef53d16cSHugh Dickins 			if (page_node) {
1907f2e5ff85SAndrea Arcangeli 				VM_BUG_ON(page_node->head != &migrate_nodes);
1908f2e5ff85SAndrea Arcangeli 				/*
1909ef53d16cSHugh Dickins 				 * Test if the migrated page should be merged
191090bd6fd3SPetr Holasek 				 * into a stable node dup. If the mapcount is
191131dbd01fSIzik Eidus 				 * 1 we can migrate it with another KSM page
19124035c07aSHugh Dickins 				 * without adding it to the chain.
191331dbd01fSIzik Eidus 				 */
191431dbd01fSIzik Eidus 				if (page_mapcount(page) > 1)
191531dbd01fSIzik Eidus 					goto chain_append;
191621fbd591SQi Zheng 			}
19172c653d0eSAndrea Arcangeli 
19188dc5ffcdSAndrea Arcangeli 			if (!stable_node_dup) {
19192c653d0eSAndrea Arcangeli 				/*
19202c653d0eSAndrea Arcangeli 				 * If the stable_node is a chain and
19212c653d0eSAndrea Arcangeli 				 * we got a payload match in memcmp
19222c653d0eSAndrea Arcangeli 				 * but we cannot merge the scanned
19232c653d0eSAndrea Arcangeli 				 * page in any of the existing
19242c653d0eSAndrea Arcangeli 				 * stable_node dups because they're
19252c653d0eSAndrea Arcangeli 				 * all full, we need to wait the
19262c653d0eSAndrea Arcangeli 				 * scanned page to find itself a match
19272c653d0eSAndrea Arcangeli 				 * in the unstable tree to create a
19282c653d0eSAndrea Arcangeli 				 * brand new KSM page to add later to
19292c653d0eSAndrea Arcangeli 				 * the dups of this stable_node.
19302c653d0eSAndrea Arcangeli 				 */
19312c653d0eSAndrea Arcangeli 				return NULL;
19322c653d0eSAndrea Arcangeli 			}
19332c653d0eSAndrea Arcangeli 
19342c653d0eSAndrea Arcangeli 			/*
19352c653d0eSAndrea Arcangeli 			 * Lock and unlock the stable_node's page (which
19362c653d0eSAndrea Arcangeli 			 * might already have been migrated) so that page
1937457aef94SEthon Paul 			 * migration is sure to notice its raised count.
19382c653d0eSAndrea Arcangeli 			 * It would be more elegant to return stable_node
19392c653d0eSAndrea Arcangeli 			 * than kpage, but that involves more changes.
19402cee57d1SYang Shi 			 */
19412cee57d1SYang Shi 			tree_page = get_ksm_page(stable_node_dup,
19422c653d0eSAndrea Arcangeli 						 GET_KSM_PAGE_TRYLOCK);
19432c653d0eSAndrea Arcangeli 
1944f2e5ff85SAndrea Arcangeli 			if (PTR_ERR(tree_page) == -EBUSY)
1945f2e5ff85SAndrea Arcangeli 				return ERR_PTR(-EBUSY);
1946f2e5ff85SAndrea Arcangeli 
1947f2e5ff85SAndrea Arcangeli 			if (unlikely(!tree_page))
1948f2e5ff85SAndrea Arcangeli 				/*
1949f2e5ff85SAndrea Arcangeli 				 * The tree may have been rebalanced,
1950f2e5ff85SAndrea Arcangeli 				 * so re-evaluate parent and new.
1951f2e5ff85SAndrea Arcangeli 				 */
1952f2e5ff85SAndrea Arcangeli 				goto again;
1953f2e5ff85SAndrea Arcangeli 			unlock_page(tree_page);
1954f2e5ff85SAndrea Arcangeli 
1955f2e5ff85SAndrea Arcangeli 			if (get_kpfn_nid(stable_node_dup->kpfn) !=
195631dbd01fSIzik Eidus 			    NUMA(stable_node_dup->nid)) {
19574035c07aSHugh Dickins 				put_page(tree_page);
19584035c07aSHugh Dickins 				goto replace;
195931dbd01fSIzik Eidus 			}
196031dbd01fSIzik Eidus 			return tree_page;
196131dbd01fSIzik Eidus 		}
196231dbd01fSIzik Eidus 	}
196331dbd01fSIzik Eidus 
196431dbd01fSIzik Eidus 	if (!page_node)
196531dbd01fSIzik Eidus 		return NULL;
19662c653d0eSAndrea Arcangeli 
19672c653d0eSAndrea Arcangeli 	list_del(&page_node->list);
196831dbd01fSIzik Eidus 	DO_NUMA(page_node->nid = nid);
196931dbd01fSIzik Eidus 	rb_link_node(&page_node->node, parent, new);
197031dbd01fSIzik Eidus 	rb_insert_color(&page_node->node, root);
19712c653d0eSAndrea Arcangeli out:
19722c653d0eSAndrea Arcangeli 	if (is_page_sharing_candidate(page_node)) {
19737b6ba2c7SHugh Dickins 		get_page(page);
197431dbd01fSIzik Eidus 		return page;
19752c653d0eSAndrea Arcangeli 	} else
19762c653d0eSAndrea Arcangeli 		return NULL;
19772c653d0eSAndrea Arcangeli 
19782c653d0eSAndrea Arcangeli replace:
19792c653d0eSAndrea Arcangeli 	/*
19802c653d0eSAndrea Arcangeli 	 * If stable_node was a chain and chain_prune collapsed it,
19812c653d0eSAndrea Arcangeli 	 * stable_node has been updated to be the new regular
19822c653d0eSAndrea Arcangeli 	 * stable_node. A collapse of the chain is indistinguishable
19832c653d0eSAndrea Arcangeli 	 * from the case there was no chain in the stable
19842c653d0eSAndrea Arcangeli 	 * rbtree. Otherwise stable_node is the chain and
198521fbd591SQi Zheng 	 * stable_node_dup is the dup to replace.
19862c653d0eSAndrea Arcangeli 	 */
19872c653d0eSAndrea Arcangeli 	if (stable_node_dup == stable_node) {
19882c653d0eSAndrea Arcangeli 		VM_BUG_ON(is_stable_node_chain(stable_node_dup));
19892c653d0eSAndrea Arcangeli 		VM_BUG_ON(is_stable_node_dup(stable_node_dup));
19902c653d0eSAndrea Arcangeli 		/* there is no chain */
19912c653d0eSAndrea Arcangeli 		if (page_node) {
19922c653d0eSAndrea Arcangeli 			VM_BUG_ON(page_node->head != &migrate_nodes);
19932c653d0eSAndrea Arcangeli 			list_del(&page_node->list);
19942c653d0eSAndrea Arcangeli 			DO_NUMA(page_node->nid = nid);
199508beca44SHugh Dickins 			rb_replace_node(&stable_node_dup->node,
19962c653d0eSAndrea Arcangeli 					&page_node->node,
199731dbd01fSIzik Eidus 					root);
199831dbd01fSIzik Eidus 			if (is_page_sharing_candidate(page_node))
199931dbd01fSIzik Eidus 				get_page(page);
20008dd3557aSHugh Dickins 			else
20018dd3557aSHugh Dickins 				page = NULL;
200231dbd01fSIzik Eidus 		} else {
200331dbd01fSIzik Eidus 			rb_erase(&stable_node_dup->node, root);
200431dbd01fSIzik Eidus 			page = NULL;
200531dbd01fSIzik Eidus 		}
200631dbd01fSIzik Eidus 	} else {
200731dbd01fSIzik Eidus 		VM_BUG_ON(!is_stable_node_chain(stable_node));
200831dbd01fSIzik Eidus 		__stable_node_dup_del(stable_node_dup);
200931dbd01fSIzik Eidus 		if (page_node) {
201031dbd01fSIzik Eidus 			VM_BUG_ON(page_node->head != &migrate_nodes);
201131dbd01fSIzik Eidus 			list_del(&page_node->list);
201231dbd01fSIzik Eidus 			DO_NUMA(page_node->nid = nid);
20138dd3557aSHugh Dickins 			stable_node_chain_add_dup(page_node, stable_node);
201421fbd591SQi Zheng 			if (is_page_sharing_candidate(page_node))
20158dd3557aSHugh Dickins 				get_page(page);
20168dd3557aSHugh Dickins 			else
201731dbd01fSIzik Eidus 				page = NULL;
201890bd6fd3SPetr Holasek 		} else {
201990bd6fd3SPetr Holasek 			page = NULL;
202031dbd01fSIzik Eidus 		}
202190bd6fd3SPetr Holasek 	}
202290bd6fd3SPetr Holasek 	stable_node_dup->head = &migrate_nodes;
202390bd6fd3SPetr Holasek 	list_add(&stable_node_dup->list, stable_node_dup->head);
2024ef53d16cSHugh Dickins 	return page;
202590bd6fd3SPetr Holasek 
202631dbd01fSIzik Eidus chain_append:
202731dbd01fSIzik Eidus 	/* stable_node_dup could be null if it reached the limit */
202821fbd591SQi Zheng 	if (!stable_node_dup)
20298dd3557aSHugh Dickins 		stable_node_dup = stable_node_any;
203031dbd01fSIzik Eidus 	/*
203131dbd01fSIzik Eidus 	 * If stable_node was a chain and chain_prune collapsed it,
2032d178f27fSHugh Dickins 	 * stable_node has been updated to be the new regular
203321fbd591SQi Zheng 	 * stable_node. A collapse of the chain is indistinguishable
20348dd3557aSHugh Dickins 	 * from the case there was no chain in the stable
2035c8f95ed1SAndrea Arcangeli 	 * rbtree. Otherwise stable_node is the chain and
203631dbd01fSIzik Eidus 	 * stable_node_dup is the dup to replace.
203731dbd01fSIzik Eidus 	 */
203831dbd01fSIzik Eidus 	if (stable_node_dup == stable_node) {
20398dd3557aSHugh Dickins 		VM_BUG_ON(is_stable_node_dup(stable_node_dup));
204031dbd01fSIzik Eidus 		/* chain is missing so create it */
20418dd3557aSHugh Dickins 		stable_node = alloc_stable_node_chain(stable_node_dup,
20428dd3557aSHugh Dickins 						      root);
204331dbd01fSIzik Eidus 		if (!stable_node)
204431dbd01fSIzik Eidus 			return NULL;
204531dbd01fSIzik Eidus 	}
20468dd3557aSHugh Dickins 	/*
204731dbd01fSIzik Eidus 	 * Add this stable_node dup that was
204831dbd01fSIzik Eidus 	 * migrated to the stable_node chain
204931dbd01fSIzik Eidus 	 * of the current nid for this page
20508dd3557aSHugh Dickins 	 * content.
205131dbd01fSIzik Eidus 	 */
205231dbd01fSIzik Eidus 	VM_BUG_ON(!is_stable_node_dup(stable_node_dup));
20538dd3557aSHugh Dickins 	VM_BUG_ON(page_node->head != &migrate_nodes);
205431dbd01fSIzik Eidus 	list_del(&page_node->list);
2055b599cbdfSHugh Dickins 	DO_NUMA(page_node->nid = nid);
2056b599cbdfSHugh Dickins 	stable_node_chain_add_dup(page_node, stable_node);
2057b599cbdfSHugh Dickins 	goto out;
2058b599cbdfSHugh Dickins }
2059b599cbdfSHugh Dickins 
2060b599cbdfSHugh Dickins /*
2061b599cbdfSHugh Dickins  * stable_tree_insert - insert stable tree node pointing to new ksm page
2062b599cbdfSHugh Dickins  * into the stable tree.
2063b599cbdfSHugh Dickins  *
206431dbd01fSIzik Eidus  * This function returns the stable tree node just allocated on success,
20658dd3557aSHugh Dickins  * NULL otherwise.
206631dbd01fSIzik Eidus  */
206731dbd01fSIzik Eidus static struct ksm_stable_node *stable_tree_insert(struct page *kpage)
206831dbd01fSIzik Eidus {
206931dbd01fSIzik Eidus 	int nid;
20707b6ba2c7SHugh Dickins 	unsigned long kpfn;
207131dbd01fSIzik Eidus 	struct rb_root *root;
2072e850dcf5SHugh Dickins 	struct rb_node **new;
207331dbd01fSIzik Eidus 	struct rb_node *parent;
207490bd6fd3SPetr Holasek 	struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any;
207531dbd01fSIzik Eidus 	bool need_chain = false;
2076473b0ce4SHugh Dickins 
207731dbd01fSIzik Eidus 	kpfn = page_to_pfn(kpage);
207831dbd01fSIzik Eidus 	nid = get_kpfn_nid(kpfn);
207931dbd01fSIzik Eidus 	root = root_stable_tree + nid;
208031dbd01fSIzik Eidus again:
208131dbd01fSIzik Eidus 	parent = NULL;
208231dbd01fSIzik Eidus 	new = &root->rb_node;
208331dbd01fSIzik Eidus 
208431dbd01fSIzik Eidus 	while (*new) {
208521fbd591SQi Zheng 		struct page *tree_page;
208621fbd591SQi Zheng 		int ret;
20872c653d0eSAndrea Arcangeli 
208831dbd01fSIzik Eidus 		cond_resched();
20892c653d0eSAndrea Arcangeli 		stable_node = rb_entry(*new, struct ksm_stable_node, node);
20902c653d0eSAndrea Arcangeli 		stable_node_any = NULL;
20912c653d0eSAndrea Arcangeli 		tree_page = chain(&stable_node_dup, stable_node, root);
20922c653d0eSAndrea Arcangeli 		if (!stable_node_dup) {
20932c653d0eSAndrea Arcangeli 			/*
20942c653d0eSAndrea Arcangeli 			 * Either all stable_node dups were full in
2095457aef94SEthon Paul 			 * this stable_node chain, or this chain was
20962c653d0eSAndrea Arcangeli 			 * empty and should be rb_erased.
20972c653d0eSAndrea Arcangeli 			 */
20982c653d0eSAndrea Arcangeli 			stable_node_any = stable_node_dup_any(stable_node,
20992c653d0eSAndrea Arcangeli 							      root);
21002c653d0eSAndrea Arcangeli 			if (!stable_node_any) {
21012c653d0eSAndrea Arcangeli 				/* rb_erase just run */
21022c653d0eSAndrea Arcangeli 				goto again;
21032c653d0eSAndrea Arcangeli 			}
21042c653d0eSAndrea Arcangeli 			/*
21052c653d0eSAndrea Arcangeli 			 * Take any of the stable_node dups page of
21062c653d0eSAndrea Arcangeli 			 * this stable_node chain to let the tree walk
21077b6ba2c7SHugh Dickins 			 * continue. All KSM pages belonging to the
210831dbd01fSIzik Eidus 			 * stable_node dups in a stable_node chain
21097b6ba2c7SHugh Dickins 			 * have the same content and they're
2110e178dfdeSHugh Dickins 			 * write protected at all times. Any will work
21117b6ba2c7SHugh Dickins 			 * fine to continue the walk.
2112e178dfdeSHugh Dickins 			 */
21137b6ba2c7SHugh Dickins 			tree_page = get_ksm_page(stable_node_any,
21147b6ba2c7SHugh Dickins 						 GET_KSM_PAGE_NOLOCK);
211576093853Sxu xin 		}
211676093853Sxu xin 		VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
211731dbd01fSIzik Eidus 		if (!tree_page) {
211831dbd01fSIzik Eidus 			/*
211931dbd01fSIzik Eidus 			 * If we walked over a stale stable_node,
212081464e30SHugh Dickins 			 * get_ksm_page() will call rb_erase() and it
212181464e30SHugh Dickins 			 * may rebalance the tree from under us. So
212281464e30SHugh Dickins 			 * restart the search from scratch. Returning
212381464e30SHugh Dickins 			 * NULL would be safe too, but we'd generate
212431dbd01fSIzik Eidus 			 * false negative insertions just because some
212531dbd01fSIzik Eidus 			 * stable_node was stale.
212631dbd01fSIzik Eidus 			 */
212731dbd01fSIzik Eidus 			goto again;
212821fbd591SQi Zheng 		}
212931dbd01fSIzik Eidus 
21304b22927fSKirill Tkhai 		ret = memcmp_pages(kpage, tree_page);
213121fbd591SQi Zheng 		put_page(tree_page);
21328dd3557aSHugh Dickins 
213321fbd591SQi Zheng 		parent = *new;
21348dd3557aSHugh Dickins 		if (ret < 0)
213531dbd01fSIzik Eidus 			new = &parent->rb_left;
213631dbd01fSIzik Eidus 		else if (ret > 0)
21372c653d0eSAndrea Arcangeli 			new = &parent->rb_right;
213831dbd01fSIzik Eidus 		else {
21394146d2d6SHugh Dickins 			need_chain = true;
21404146d2d6SHugh Dickins 			break;
21414146d2d6SHugh Dickins 		}
21422c653d0eSAndrea Arcangeli 	}
21432c653d0eSAndrea Arcangeli 
21442c653d0eSAndrea Arcangeli 	stable_node_dup = alloc_stable_node();
21454146d2d6SHugh Dickins 	if (!stable_node_dup)
21464146d2d6SHugh Dickins 		return NULL;
21474146d2d6SHugh Dickins 
21484146d2d6SHugh Dickins 	INIT_HLIST_HEAD(&stable_node_dup->hlist);
21494146d2d6SHugh Dickins 	stable_node_dup->kpfn = kpfn;
21504146d2d6SHugh Dickins 	set_page_stable_node(kpage, stable_node_dup);
21512c653d0eSAndrea Arcangeli 	stable_node_dup->rmap_hlist_len = 0;
21522c653d0eSAndrea Arcangeli 	DO_NUMA(stable_node_dup->nid = nid);
21532c653d0eSAndrea Arcangeli 	if (!need_chain) {
21542c653d0eSAndrea Arcangeli 		rb_link_node(&stable_node_dup->node, parent, new);
21552c653d0eSAndrea Arcangeli 		rb_insert_color(&stable_node_dup->node, root);
21562c653d0eSAndrea Arcangeli 	} else {
21574146d2d6SHugh Dickins 		if (!is_stable_node_chain(stable_node)) {
215831dbd01fSIzik Eidus 			struct ksm_stable_node *orig = stable_node;
215931dbd01fSIzik Eidus 			/* chain is missing so create it */
216062b61f61SHugh Dickins 			stable_node = alloc_stable_node_chain(orig, root);
21614146d2d6SHugh Dickins 			if (!stable_node) {
21624146d2d6SHugh Dickins 				free_stable_node(stable_node_dup);
21634146d2d6SHugh Dickins 				return NULL;
21644146d2d6SHugh Dickins 			}
21654146d2d6SHugh Dickins 		}
21664146d2d6SHugh Dickins 		stable_node_chain_add_dup(stable_node_dup, stable_node);
21674146d2d6SHugh Dickins 	}
216862b61f61SHugh Dickins 
21692cee57d1SYang Shi 	return stable_node_dup;
21702cee57d1SYang Shi }
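/*
 * Note on the layout built by stable_tree_insert(): a regular stable_node
 * ("dup") sits directly in the stable rbtree and carries the rmap_items of
 * one KSM page.  When memcmp finds existing content that the scanned page
 * could not be merged into (typically because every dup already sits at the
 * ksm_max_page_sharing limit), the new dup is instead parked under a
 * stable_node chain: the chain head takes the rbtree slot for that content
 * and the dups hang off its hlist.
 */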
21712cee57d1SYang Shi 
217208beca44SHugh Dickins /*
217331dbd01fSIzik Eidus  * unstable_tree_search_insert - search for identical page,
217431dbd01fSIzik Eidus  * else insert rmap_item into the unstable tree.
217531dbd01fSIzik Eidus  *
217631dbd01fSIzik Eidus  * This function searches for a page in the unstable tree identical to the
217731dbd01fSIzik Eidus  * page currently being scanned; and if no identical page is found in the
21785ad64688SHugh Dickins  * tree, we insert rmap_item as a new object into the unstable tree.
21792c653d0eSAndrea Arcangeli  *
21802c653d0eSAndrea Arcangeli  * This function returns pointer to rmap_item found to be identical
21815ad64688SHugh Dickins  * to the currently scanned page, NULL otherwise.
218231dbd01fSIzik Eidus  *
21838dd3557aSHugh Dickins  * This function does both searching and inserting, because they share
218431dbd01fSIzik Eidus  * the same walking algorithm in an rbtree.
218531dbd01fSIzik Eidus  */
218631dbd01fSIzik Eidus static
218731dbd01fSIzik Eidus struct ksm_rmap_item *unstable_tree_search_insert(struct ksm_rmap_item *rmap_item,
21884035c07aSHugh Dickins 					      struct page *page,
21894035c07aSHugh Dickins 					      struct page **tree_pagep)
21904035c07aSHugh Dickins {
21914035c07aSHugh Dickins 	struct rb_node **new;
219231dbd01fSIzik Eidus 	struct rb_root *root;
219331dbd01fSIzik Eidus 	struct rb_node *parent = NULL;
219431dbd01fSIzik Eidus 	int nid;
219531dbd01fSIzik Eidus 
219631dbd01fSIzik Eidus 	nid = get_kpfn_nid(page_to_pfn(page));
219731dbd01fSIzik Eidus 	root = root_unstable_tree + nid;
219831dbd01fSIzik Eidus 	new = &root->rb_node;
2199e86c59b1SClaudio Imbrenda 
2200e86c59b1SClaudio Imbrenda 	while (*new) {
2201e86c59b1SClaudio Imbrenda 		struct ksm_rmap_item *tree_rmap_item;
2202e86c59b1SClaudio Imbrenda 		struct page *tree_page;
2203e86c59b1SClaudio Imbrenda 		int ret;
2204e86c59b1SClaudio Imbrenda 
2205e86c59b1SClaudio Imbrenda 		cond_resched();
2206d8ed45c5SMichel Lespinasse 		tree_rmap_item = rb_entry(*new, struct ksm_rmap_item, node);
22074b22927fSKirill Tkhai 		tree_page = get_mergeable_page(tree_rmap_item);
220856df70a6SMuchun Song 		if (!tree_page)
2209e86c59b1SClaudio Imbrenda 			return NULL;
2210e86c59b1SClaudio Imbrenda 
2211739100c8SStefan Roesch 		/*
2212739100c8SStefan Roesch 		 * Don't substitute a ksm page for a forked page.
2213739100c8SStefan Roesch 		 */
221456df70a6SMuchun Song 		if (page == tree_page) {
221556df70a6SMuchun Song 			put_page(tree_page);
221656df70a6SMuchun Song 			return NULL;
221756df70a6SMuchun Song 		}
221856df70a6SMuchun Song 
221956df70a6SMuchun Song 		ret = memcmp_pages(page, tree_page);
222056df70a6SMuchun Song 
2221d8ed45c5SMichel Lespinasse 		parent = *new;
2222e86c59b1SClaudio Imbrenda 		if (ret < 0) {
2223e86c59b1SClaudio Imbrenda 			put_page(tree_page);
2224e86c59b1SClaudio Imbrenda 			new = &parent->rb_left;
2225e86c59b1SClaudio Imbrenda 		} else if (ret > 0) {
2226e86c59b1SClaudio Imbrenda 			put_page(tree_page);
2227e86c59b1SClaudio Imbrenda 			new = &parent->rb_right;
2228e86c59b1SClaudio Imbrenda 		} else if (!ksm_merge_across_nodes &&
22298dd3557aSHugh Dickins 			   page_to_nid(tree_page) != nid) {
22308dd3557aSHugh Dickins 			/*
223131dbd01fSIzik Eidus 			 * If tree_page has been migrated to another NUMA node,
223277da2ba0SClaudio Imbrenda 			 * it will be flushed out and put in the right unstable
223377da2ba0SClaudio Imbrenda 			 * tree next time: only merge with it when across_nodes.
22348dd3557aSHugh Dickins 			 */
22358dd3557aSHugh Dickins 			put_page(tree_page);
223677da2ba0SClaudio Imbrenda 			return NULL;
223777da2ba0SClaudio Imbrenda 		} else {
223877da2ba0SClaudio Imbrenda 			*tree_pagep = tree_page;
223977da2ba0SClaudio Imbrenda 			return tree_rmap_item;
224077da2ba0SClaudio Imbrenda 		}
224177da2ba0SClaudio Imbrenda 	}
224277da2ba0SClaudio Imbrenda 
224377da2ba0SClaudio Imbrenda 	rmap_item->address |= UNSTABLE_FLAG;
224477da2ba0SClaudio Imbrenda 	rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
224577da2ba0SClaudio Imbrenda 	DO_NUMA(rmap_item->nid = nid);
224677da2ba0SClaudio Imbrenda 	rb_link_node(&rmap_item->node, parent, new);
224777da2ba0SClaudio Imbrenda 	rb_insert_color(&rmap_item->node, root);
22488dd3557aSHugh Dickins 
22498dd3557aSHugh Dickins 	ksm_pages_unshared++;
2250bc56620bSHugh Dickins 	return NULL;
2251bc56620bSHugh Dickins }
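/*
 * Unlike the stable tree, the unstable tree is allowed to degrade: page
 * contents may change while an rmap_item is linked here, so a lookup can go
 * astray without harm.  The tree is simply rebuilt from scratch on every
 * full scan (root_unstable_tree[] is reset to RB_ROOT at the start of each
 * pass in scan_get_next_rmap_item()), which bounds how stale it can become.
 */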
2252bc56620bSHugh Dickins 
2253bc56620bSHugh Dickins /*
22545ad64688SHugh Dickins  * stable_tree_append - add another rmap_item to the linked list of
22557b6ba2c7SHugh Dickins  * rmap_items hanging off a given node of the stable tree, all sharing
22567b6ba2c7SHugh Dickins  * the same ksm page.
22572c653d0eSAndrea Arcangeli  */
22582c653d0eSAndrea Arcangeli static void stable_tree_append(struct ksm_rmap_item *rmap_item,
22592c653d0eSAndrea Arcangeli 			       struct ksm_stable_node *stable_node,
22602c653d0eSAndrea Arcangeli 			       bool max_page_sharing_bypass)
22617b6ba2c7SHugh Dickins {
22625ad64688SHugh Dickins 	/*
22637b6ba2c7SHugh Dickins 	 * rmap won't find this mapping if we don't insert the
226431dbd01fSIzik Eidus 	 * rmap_item in the right stable_node
226531dbd01fSIzik Eidus 	 * duplicate. page_migration could break later if rmap breaks,
226631dbd01fSIzik Eidus 	 * so we can as well crash here. We really need to check for
226731dbd01fSIzik Eidus 	 * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check
226831dbd01fSIzik Eidus 	 * for other negative values: an underflow, if detected here
226931dbd01fSIzik Eidus 	 * for the first time (and not when decreasing rmap_hlist_len),
22707b6ba2c7SHugh Dickins 	 * would be a sign of memory corruption in the stable_node.
22718dd3557aSHugh Dickins 	 */
22728dd3557aSHugh Dickins 	BUG_ON(stable_node->rmap_hlist_len < 0);
227331dbd01fSIzik Eidus 
227477da2ba0SClaudio Imbrenda 	stable_node->rmap_hlist_len++;
227577da2ba0SClaudio Imbrenda 	if (!max_page_sharing_bypass)
227677da2ba0SClaudio Imbrenda 		/* possibly non-fatal but unexpected overflow, only warn */
227777da2ba0SClaudio Imbrenda 		WARN_ON_ONCE(stable_node->rmap_hlist_len >
227877da2ba0SClaudio Imbrenda 			     ksm_max_page_sharing);
227977da2ba0SClaudio Imbrenda 
228077da2ba0SClaudio Imbrenda 	rmap_item->head = stable_node;
228177da2ba0SClaudio Imbrenda 	rmap_item->address |= STABLE_FLAG;
228277da2ba0SClaudio Imbrenda 	hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
228377da2ba0SClaudio Imbrenda 
228477da2ba0SClaudio Imbrenda 	if (rmap_item->hlist.next)
228577da2ba0SClaudio Imbrenda 		ksm_pages_sharing++;
228677da2ba0SClaudio Imbrenda 	else
228777da2ba0SClaudio Imbrenda 		ksm_pages_shared++;
228831dbd01fSIzik Eidus 
228931dbd01fSIzik Eidus 	rmap_item->mm->ksm_merging_pages++;
229031dbd01fSIzik Eidus }
229131dbd01fSIzik Eidus 
229221fbd591SQi Zheng /*
229321fbd591SQi Zheng  * cmp_and_merge_page - first see if page can be merged into the stable tree;
229431dbd01fSIzik Eidus  * if not, compare checksum to previous and if it's the same, see if page can
229531dbd01fSIzik Eidus  * be inserted into the unstable tree, or merged with a page already there and
229621fbd591SQi Zheng  * both transferred to the stable tree.
229731dbd01fSIzik Eidus  *
22986514d511SHugh Dickins  * @page: the page that we are searching an identical page for.
22996514d511SHugh Dickins  * @rmap_item: the reverse mapping into the virtual address of this page
230093d17715SHugh Dickins  */
230131dbd01fSIzik Eidus static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item)
230231dbd01fSIzik Eidus {
230331dbd01fSIzik Eidus 	struct mm_struct *mm = rmap_item->mm;
23046514d511SHugh Dickins 	struct ksm_rmap_item *tree_rmap_item;
230531dbd01fSIzik Eidus 	struct page *tree_page = NULL;
230631dbd01fSIzik Eidus 	struct ksm_stable_node *stable_node;
230731dbd01fSIzik Eidus 	struct page *kpage;
230831dbd01fSIzik Eidus 	unsigned int checksum;
230931dbd01fSIzik Eidus 	int err;
231031dbd01fSIzik Eidus 	bool max_page_sharing_bypass = false;
231131dbd01fSIzik Eidus 
231258730ab6SQi Zheng 	stable_node = page_stable_node(page);
2313cb4df4caSxu xin 	if (stable_node) {
231431dbd01fSIzik Eidus 		if (stable_node->head != &migrate_nodes &&
23156514d511SHugh Dickins 		    get_kpfn_nid(READ_ONCE(stable_node->kpfn)) !=
23166514d511SHugh Dickins 		    NUMA(stable_node->nid)) {
231731dbd01fSIzik Eidus 			stable_node_dup_del(stable_node);
231831dbd01fSIzik Eidus 			stable_node->head = &migrate_nodes;
231931dbd01fSIzik Eidus 			list_add(&stable_node->list, stable_node->head);
232031dbd01fSIzik Eidus 		}
23215e924ff5SStefan Roesch 		if (stable_node->head != &migrate_nodes &&
23225e924ff5SStefan Roesch 		    rmap_item->head == stable_node)
23235e924ff5SStefan Roesch 			return;
23245e924ff5SStefan Roesch 		/*
23255e924ff5SStefan Roesch 		 * If it's a KSM fork, allow it to go over the sharing limit
23265e924ff5SStefan Roesch 		 * without warnings.
23275e924ff5SStefan Roesch 		 */
23285e924ff5SStefan Roesch 		if (!is_page_sharing_candidate(stable_node))
23295e924ff5SStefan Roesch 			max_page_sharing_bypass = true;
23305e924ff5SStefan Roesch 	}
23315e924ff5SStefan Roesch 
23325e924ff5SStefan Roesch 	/* We first start with searching the page inside the stable tree */
23335e924ff5SStefan Roesch 	kpage = stable_tree_search(page);
23345e924ff5SStefan Roesch 	if (kpage == page && rmap_item->head == stable_node) {
23355e924ff5SStefan Roesch 		put_page(kpage);
23365e924ff5SStefan Roesch 		return;
23375e924ff5SStefan Roesch 	}
23385e924ff5SStefan Roesch 
23395e924ff5SStefan Roesch 	remove_rmap_item_from_tree(rmap_item);
23405e924ff5SStefan Roesch 
23415e924ff5SStefan Roesch 	if (kpage) {
23425e924ff5SStefan Roesch 		if (PTR_ERR(kpage) == -EBUSY)
23435e924ff5SStefan Roesch 			return;
23445e924ff5SStefan Roesch 
23455e924ff5SStefan Roesch 		err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
23465e924ff5SStefan Roesch 		if (!err) {
23475e924ff5SStefan Roesch 			/*
23485e924ff5SStefan Roesch 			 * The page was successfully merged:
23495e924ff5SStefan Roesch 			 * add its rmap_item to the stable tree.
23505e924ff5SStefan Roesch 			 */
23515e924ff5SStefan Roesch 			lock_page(kpage);
23525e924ff5SStefan Roesch 			stable_tree_append(rmap_item, page_stable_node(kpage),
23535e924ff5SStefan Roesch 					   max_page_sharing_bypass);
23545e924ff5SStefan Roesch 			unlock_page(kpage);
23555e924ff5SStefan Roesch 		}
23565e924ff5SStefan Roesch 		put_page(kpage);
23575e924ff5SStefan Roesch 		return;
23585e924ff5SStefan Roesch 	}
23595e924ff5SStefan Roesch 
23605e924ff5SStefan Roesch 	/*
23615e924ff5SStefan Roesch 	 * If the hash value of the page has changed from the last time
23625e924ff5SStefan Roesch 	 * we calculated it, this page is changing frequently: therefore we
23635e924ff5SStefan Roesch 	 * don't want to insert it in the unstable tree, and we don't want
23645e924ff5SStefan Roesch 	 * to waste our time searching for something identical to it there.
23655e924ff5SStefan Roesch 	 */
23665e924ff5SStefan Roesch 	checksum = calc_checksum(page);
23675e924ff5SStefan Roesch 	if (rmap_item->oldchecksum != checksum) {
23685e924ff5SStefan Roesch 		rmap_item->oldchecksum = checksum;
23695e924ff5SStefan Roesch 		return;
23705e924ff5SStefan Roesch 	}
23715e924ff5SStefan Roesch 
23725e924ff5SStefan Roesch 	/*
23735e924ff5SStefan Roesch 	 * Same checksum as an empty page. We attempt to merge it with the
23745e924ff5SStefan Roesch 	 * appropriate zero page if the user enabled this via sysfs.
23755e924ff5SStefan Roesch 	 */
23765e924ff5SStefan Roesch 	if (ksm_use_zero_pages && (checksum == zero_checksum)) {
23775e924ff5SStefan Roesch 		struct vm_area_struct *vma;
23785e924ff5SStefan Roesch 
23795e924ff5SStefan Roesch 		mmap_read_lock(mm);
23805e924ff5SStefan Roesch 		vma = find_mergeable_vma(mm, rmap_item->address);
23815e924ff5SStefan Roesch 		if (vma) {
23825e924ff5SStefan Roesch 			err = try_to_merge_one_page(vma, page,
2383e5a68991SStefan Roesch 					ZERO_PAGE(rmap_item->address));
23845e924ff5SStefan Roesch 			trace_ksm_merge_one_page(
23855e924ff5SStefan Roesch 				page_to_pfn(ZERO_PAGE(rmap_item->address)),
23865e924ff5SStefan Roesch 				rmap_item, mm, err);
23875e924ff5SStefan Roesch 		} else {
23885e924ff5SStefan Roesch 			/*
238921fbd591SQi Zheng 			 * If the vma is out of date, we do not need to
239031dbd01fSIzik Eidus 			 * continue.
239131dbd01fSIzik Eidus 			 */
239258730ab6SQi Zheng 			err = 0;
239358730ab6SQi Zheng 		}
239431dbd01fSIzik Eidus 		mmap_read_unlock(mm);
239521fbd591SQi Zheng 		/*
2396a5f18ba0SMatthew Wilcox (Oracle) 		 * In case of failure, the page was not really empty, so we
239790bd6fd3SPetr Holasek 		 * need to continue. Otherwise we're done.
239831dbd01fSIzik Eidus 		 */
239958730ab6SQi Zheng 		if (!err)
240031dbd01fSIzik Eidus 			return;
240131dbd01fSIzik Eidus 	}
240258730ab6SQi Zheng 	tree_rmap_item =
240358730ab6SQi Zheng 		unstable_tree_search_insert(rmap_item, page, &tree_page);
2404739100c8SStefan Roesch 	if (tree_rmap_item) {
2405739100c8SStefan Roesch 		bool split;
24062919bfd0SHugh Dickins 
24071fec6890SMatthew Wilcox (Oracle) 		kpage = try_to_merge_two_pages(rmap_item, page,
24081fec6890SMatthew Wilcox (Oracle) 						tree_rmap_item, tree_page);
24092919bfd0SHugh Dickins 		/*
24102919bfd0SHugh Dickins 		 * If both pages we tried to merge belong to the same compound
24112919bfd0SHugh Dickins 		 * page, then we actually ended up increasing the reference
24122919bfd0SHugh Dickins 		 * count of the same compound page twice, and split_huge_page
24132919bfd0SHugh Dickins 		 * failed.
24142919bfd0SHugh Dickins 		 * Here we set a flag if that happened, and we use it later to
24152919bfd0SHugh Dickins 		 * try split_huge_page again. Since we call put_page right
24162919bfd0SHugh Dickins 		 * afterwards, the reference count will be correct and
24172919bfd0SHugh Dickins 		 * split_huge_page should succeed.
24184146d2d6SHugh Dickins 		 */
24194146d2d6SHugh Dickins 		split = PageTransCompound(page)
24204146d2d6SHugh Dickins 			&& compound_head(page) == compound_head(tree_page);
24214146d2d6SHugh Dickins 		put_page(tree_page);
24224146d2d6SHugh Dickins 		if (kpage) {
24234146d2d6SHugh Dickins 			/*
24244146d2d6SHugh Dickins 			 * The pages were successfully merged: insert new
242521fbd591SQi Zheng 			 * node in the stable tree and add both rmap_items.
24264146d2d6SHugh Dickins 			 */
24274146d2d6SHugh Dickins 			lock_page(kpage);
242803640418SGeliang Tang 			stable_node = stable_tree_insert(kpage);
242903640418SGeliang Tang 			if (stable_node) {
24302cee57d1SYang Shi 				stable_tree_append(tree_rmap_item, stable_node,
24312cee57d1SYang Shi 						   false);
24324146d2d6SHugh Dickins 				stable_tree_append(rmap_item, stable_node,
24334146d2d6SHugh Dickins 						   false);
24344146d2d6SHugh Dickins 			}
24354146d2d6SHugh Dickins 			unlock_page(kpage);
24364146d2d6SHugh Dickins 
24374146d2d6SHugh Dickins 			/*
2438ef53d16cSHugh Dickins 			 * If we fail to insert the page into the stable tree,
243990bd6fd3SPetr Holasek 			 * we will have 2 virtual addresses that are pointing
244031dbd01fSIzik Eidus 			 * to a ksm page left outside the stable tree,
244131dbd01fSIzik Eidus 			 * in which case we need to break_cow on both.
244258730ab6SQi Zheng 			 */
244358730ab6SQi Zheng 			if (!stable_node) {
244458730ab6SQi Zheng 				break_cow(tree_rmap_item);
244558730ab6SQi Zheng 				break_cow(rmap_item);
244631dbd01fSIzik Eidus 			}
24472b472611SHugh Dickins 		} else if (split) {
24482b472611SHugh Dickins 			/*
24492b472611SHugh Dickins 			 * We are here if we tried to merge two pages and
24502b472611SHugh Dickins 			 * failed because they both belonged to the same
245158730ab6SQi Zheng 			 * compound page. We will split the page now, but no
24522b472611SHugh Dickins 			 * merging will take place.
245331dbd01fSIzik Eidus 			 * We do not want to add the cost of a full lock; if
245431dbd01fSIzik Eidus 			 * the page is locked, it is better to skip it and
245558730ab6SQi Zheng 			 * perhaps try again later.
245631dbd01fSIzik Eidus 			 */
245731dbd01fSIzik Eidus 			if (!trylock_page(page))
245858730ab6SQi Zheng 				return;
245931dbd01fSIzik Eidus 			split_huge_page(page);
2460a5f18ba0SMatthew Wilcox (Oracle) 			unlock_page(page);
2461a5f18ba0SMatthew Wilcox (Oracle) 		}
2462d8ed45c5SMichel Lespinasse 	}
24639ba69294SHugh Dickins }
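/*
 * In short, cmp_and_merge_page() tries the stable tree first; on a miss it
 * only proceeds if the checksum is unchanged since the previous scan (a
 * changing page is not worth tree insertion); a checksum equal to
 * zero_checksum may be merged with the zero page when ksm_use_zero_pages is
 * set; otherwise the unstable tree is searched, and a match there promotes
 * both pages into a freshly inserted stable tree node.
 */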
2464a5f18ba0SMatthew Wilcox (Oracle) 
24659ba69294SHugh Dickins static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot,
2466a5f18ba0SMatthew Wilcox (Oracle) 					    struct ksm_rmap_item **rmap_list,
246731dbd01fSIzik Eidus 					    unsigned long addr)
246831dbd01fSIzik Eidus {
246931dbd01fSIzik Eidus 	struct ksm_rmap_item *rmap_item;
247031dbd01fSIzik Eidus 
247131dbd01fSIzik Eidus 	while (*rmap_list) {
247231dbd01fSIzik Eidus 		rmap_item = *rmap_list;
247331dbd01fSIzik Eidus 		if ((rmap_item->address & PAGE_MASK) == addr)
247431dbd01fSIzik Eidus 			return rmap_item;
24759ba69294SHugh Dickins 		if (rmap_item->address > addr)
24769ba69294SHugh Dickins 			break;
247731dbd01fSIzik Eidus 		*rmap_list = rmap_item->rmap_list;
2478f7091ed6SHaiyue Wang 		remove_rmap_item_from_tree(rmap_item);
247921ae5b01SAndrea Arcangeli 		free_rmap_item(rmap_item);
248021ae5b01SAndrea Arcangeli 	}
248121ae5b01SAndrea Arcangeli 
248221ae5b01SAndrea Arcangeli 	rmap_item = alloc_rmap_item();
2483f7091ed6SHaiyue Wang 	if (rmap_item) {
2484f7091ed6SHaiyue Wang 		/* It has already been zeroed */
2485f765f540SKirill A. Shutemov 		rmap_item->mm = mm_slot->slot.mm;
248631dbd01fSIzik Eidus 		rmap_item->mm->ksm_rmap_items++;
248731dbd01fSIzik Eidus 		rmap_item->address = addr;
248858730ab6SQi Zheng 		rmap_item->rmap_list = *rmap_list;
24896514d511SHugh Dickins 		*rmap_list = rmap_item;
249031dbd01fSIzik Eidus 	}
24916514d511SHugh Dickins 	return rmap_item;
24926514d511SHugh Dickins }
24935e924ff5SStefan Roesch 
24945e924ff5SStefan Roesch /*
24955e924ff5SStefan Roesch  * Calculate the skip "age" for a KSM page. The age reflects how many times
24965e924ff5SStefan Roesch  * de-duplication of this page has already been tried unsuccessfully. The
249731dbd01fSIzik Eidus  * smaller the age, the fewer scans this page is skipped for.
249831dbd01fSIzik Eidus  *
249931dbd01fSIzik Eidus  * @age: rmap_item age of page
2500d8ed45c5SMichel Lespinasse  */
250131dbd01fSIzik Eidus static unsigned int skip_age(rmap_age_t age)
250231dbd01fSIzik Eidus {
2503f7091ed6SHaiyue Wang 	if (age <= 3)
250431dbd01fSIzik Eidus 		return 1;
250531dbd01fSIzik Eidus 	if (age <= 5)
250631dbd01fSIzik Eidus 		return 2;
250731dbd01fSIzik Eidus 	if (age <= 8)
250831dbd01fSIzik Eidus 		return 4;
250931dbd01fSIzik Eidus 
25109ba69294SHugh Dickins 	return 8;
2511a5f18ba0SMatthew Wilcox (Oracle) }
25129ba69294SHugh Dickins 
251358730ab6SQi Zheng /*
25149ba69294SHugh Dickins  * Determines if a page should be skipped for the current scan.
251531dbd01fSIzik Eidus  *
251631dbd01fSIzik Eidus  * @page: page to check
251731dbd01fSIzik Eidus  * @rmap_item: associated rmap_item of page
251831dbd01fSIzik Eidus  */
2519420be4edSChengyang Fan static bool should_skip_rmap_item(struct page *page,
252031dbd01fSIzik Eidus 				  struct ksm_rmap_item *rmap_item)
252131dbd01fSIzik Eidus {
252258730ab6SQi Zheng 	rmap_age_t age;
252358730ab6SQi Zheng 
252458730ab6SQi Zheng 	if (!ksm_smart_scan)
2525cd551f97SHugh Dickins 		return false;
2526cd551f97SHugh Dickins 
2527c1e8d7c6SMichel Lespinasse 	/*
2528cd551f97SHugh Dickins 	 * Never skip pages that are already KSM; pages cmp_and_merge_page()
2529cd551f97SHugh Dickins 	 * will essentially ignore them, but we still have to process them
25309ba69294SHugh Dickins 	 * properly.
25319ba69294SHugh Dickins 	 */
25329ba69294SHugh Dickins 	if (PageKsm(page))
2533c1e8d7c6SMichel Lespinasse 		return false;
2534cd551f97SHugh Dickins 
253558730ab6SQi Zheng 	age = rmap_item->age;
253658730ab6SQi Zheng 	if (age != U8_MAX)
25379ba69294SHugh Dickins 		rmap_item->age++;
25389ba69294SHugh Dickins 
253958730ab6SQi Zheng 	/*
2540cd551f97SHugh Dickins 	 * Smaller ages are not skipped; they need to get a chance to go
2541d7597f59SStefan Roesch 	 * through the different phases of KSM merging.
2542d8ed45c5SMichel Lespinasse 	 */
25439ba69294SHugh Dickins 	if (age < 3)
25449ba69294SHugh Dickins 		return false;
2545d8ed45c5SMichel Lespinasse 
25467496fea9SZhou Chengming 	/*
25473e4e28c5SMichel Lespinasse 	 * Are we still allowed to skip? If not, then don't skip it
25487496fea9SZhou Chengming 	 * and determine how much more often we are allowed to skip next.
25497496fea9SZhou Chengming 	 */
25507496fea9SZhou Chengming 	if (!rmap_item->remaining_skips) {
25517496fea9SZhou Chengming 		rmap_item->remaining_skips = skip_age(age);
25527496fea9SZhou Chengming 		return false;
25537496fea9SZhou Chengming 	}
25549ba69294SHugh Dickins 
255531dbd01fSIzik Eidus 	/* Skip this page */
255631dbd01fSIzik Eidus 	ksm_pages_skipped++;
255758730ab6SQi Zheng 	rmap_item->remaining_skips--;
255858730ab6SQi Zheng 	remove_rmap_item_from_tree(rmap_item);
255931dbd01fSIzik Eidus 	return true;
256031dbd01fSIzik Eidus }
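/*
 * Worked example of the smart-scan back-off above, for a page that never
 * finds a match: ages 0-2 are always scanned; from age 3 onwards the page is
 * scanned once and then skipped skip_age(age) times, so the interval grows
 * from every 2nd scan (1 skip) to every 9th scan (8 skips) once the age
 * passes 8.  Every skipped visit is accounted in ksm_pages_skipped.
 */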
2561739100c8SStefan Roesch 
256231dbd01fSIzik Eidus static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
256331dbd01fSIzik Eidus {
256431dbd01fSIzik Eidus 	struct mm_struct *mm;
256531dbd01fSIzik Eidus 	struct ksm_mm_slot *mm_slot;
256631dbd01fSIzik Eidus 	struct mm_slot *slot;
256731dbd01fSIzik Eidus 	struct vm_area_struct *vma;
2568b7701a5fSMike Rapoport 	struct ksm_rmap_item *rmap_item;
256931dbd01fSIzik Eidus 	struct vma_iterator vmi;
257031dbd01fSIzik Eidus 	int nid;
257131dbd01fSIzik Eidus 
257221fbd591SQi Zheng 	if (list_empty(&ksm_mm_head.slot.mm_node))
25733f649ab7SKees Cook 		return NULL;
2574b348b5feSStefan Roesch 
257531dbd01fSIzik Eidus 	mm_slot = ksm_scan.mm_slot;
2576b348b5feSStefan Roesch 	if (mm_slot == &ksm_mm_head) {
257731dbd01fSIzik Eidus 		advisor_start_scan();
257831dbd01fSIzik Eidus 		trace_ksm_start_scan(ksm_scan.seqnr, ksm_rmap_items);
257931dbd01fSIzik Eidus 
258031dbd01fSIzik Eidus 		/*
258131dbd01fSIzik Eidus 		 * A number of pages can hang around indefinitely in per-cpu
258231dbd01fSIzik Eidus 		 * LRU cache, raised page count preventing write_protect_page
258331dbd01fSIzik Eidus 		 * from merging them.  Though it doesn't really matter much,
2584b348b5feSStefan Roesch 		 * it is puzzling to see some stuck in pages_volatile until
2585b348b5feSStefan Roesch 		 * other activity jostles them out, and they also prevented
258631dbd01fSIzik Eidus 		 * LTP's KSM test from succeeding deterministically; so drain
258731dbd01fSIzik Eidus 		 * them here (here rather than on entry to ksm_do_scan(),
25886e158384SHugh Dickins 		 * so we don't IPI too often when pages_to_scan is set low).
25896e158384SHugh Dickins 		 */
259058730ab6SQi Zheng 		lru_add_drain_all();
25916e158384SHugh Dickins 
25926e158384SHugh Dickins 		/*
259331dbd01fSIzik Eidus 		 * Whereas stale stable_nodes on the stable_tree itself
259431dbd01fSIzik Eidus 		 * get pruned in the regular course of stable_tree_search(),
2595fcf9a0efSKirill Tkhai 		 * those moved out to the migrate_nodes list can accumulate:
2596fcf9a0efSKirill Tkhai 		 * so prune them once before each full scan.
2597878aee7dSAndrea Arcangeli 		 */
2598339aa624SIzik Eidus 		if (!ksm_merge_across_nodes) {
259931dbd01fSIzik Eidus 			struct ksm_stable_node *stable_node, *next;
260031dbd01fSIzik Eidus 			struct page *page;
260131dbd01fSIzik Eidus 
2602ef4d43a8SHugh Dickins 			list_for_each_entry_safe(stable_node, next,
26036e158384SHugh Dickins 						 &migrate_nodes, list) {
260431dbd01fSIzik Eidus 				page = get_ksm_page(stable_node,
260531dbd01fSIzik Eidus 						    GET_KSM_PAGE_NOLOCK);
26066e158384SHugh Dickins 				if (page)
2607878aee7dSAndrea Arcangeli 					put_page(page);
2608878aee7dSAndrea Arcangeli 				cond_resched();
26096e158384SHugh Dickins 			}
2610fcf9a0efSKirill Tkhai 		}
2611fcf9a0efSKirill Tkhai 
2612fcf9a0efSKirill Tkhai 		for (nid = 0; nid < ksm_nr_node_ids; nid++)
2613fcf9a0efSKirill Tkhai 			root_unstable_tree[nid] = RB_ROOT;
261431dbd01fSIzik Eidus 
2615878aee7dSAndrea Arcangeli 		spin_lock(&ksm_mmlist_lock);
26166e158384SHugh Dickins 		slot = list_entry(mm_slot->slot.mm_node.next,
261731dbd01fSIzik Eidus 				  struct mm_slot, mm_node);
261831dbd01fSIzik Eidus 		mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
261931dbd01fSIzik Eidus 		ksm_scan.mm_slot = mm_slot;
262031dbd01fSIzik Eidus 		spin_unlock(&ksm_mmlist_lock);
262131dbd01fSIzik Eidus 		/*
2622d7597f59SStefan Roesch 		 * Although we tested list_empty() above, a racing __ksm_exit
2623d7597f59SStefan Roesch 		 * of the last mm on the list may have removed it since then.
2624d7597f59SStefan Roesch 		 */
2625d7597f59SStefan Roesch 		if (mm_slot == &ksm_mm_head)
2626d7597f59SStefan Roesch 			return NULL;
2627d7597f59SStefan Roesch next_mm:
2628d7597f59SStefan Roesch 		ksm_scan.address = 0;
2629d7597f59SStefan Roesch 		ksm_scan.rmap_list = &mm_slot->rmap_list;
2630d7597f59SStefan Roesch 	}
2631d7597f59SStefan Roesch 
2632d7597f59SStefan Roesch 	slot = &mm_slot->slot;
263324139c07SDavid Hildenbrand 	mm = slot->mm;
263424139c07SDavid Hildenbrand 	vma_iter_init(&vmi, mm, ksm_scan.address);
263524139c07SDavid Hildenbrand 
263624139c07SDavid Hildenbrand 	mmap_read_lock(mm);
263724139c07SDavid Hildenbrand 	if (ksm_test_exit(mm))
263824139c07SDavid Hildenbrand 		goto no_vmas;
263924139c07SDavid Hildenbrand 
264024139c07SDavid Hildenbrand 	for_each_vma(vmi, vma) {
264149b06385SSuren Baghdasaryan 		if (!(vma->vm_flags & VM_MERGEABLE))
264224139c07SDavid Hildenbrand 			continue;
264324139c07SDavid Hildenbrand 		if (ksm_scan.address < vma->vm_start)
264424139c07SDavid Hildenbrand 			ksm_scan.address = vma->vm_start;
264524139c07SDavid Hildenbrand 		if (!vma->anon_vma)
264624139c07SDavid Hildenbrand 			ksm_scan.address = vma->vm_end;
264724139c07SDavid Hildenbrand 
264824139c07SDavid Hildenbrand 		while (ksm_scan.address < vma->vm_end) {
2649d7597f59SStefan Roesch 			if (ksm_test_exit(mm))
2650d7597f59SStefan Roesch 				break;
2651d7597f59SStefan Roesch 			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
2652d7597f59SStefan Roesch 			if (IS_ERR_OR_NULL(*page)) {
2653d7597f59SStefan Roesch 				ksm_scan.address += PAGE_SIZE;
2654d7597f59SStefan Roesch 				cond_resched();
2655d7597f59SStefan Roesch 				continue;
2656d7597f59SStefan Roesch 			}
2657d7597f59SStefan Roesch 			if (is_zone_device_page(*page))
2658d7597f59SStefan Roesch 				goto next_page;
2659d7597f59SStefan Roesch 			if (PageAnon(*page)) {
2660d7597f59SStefan Roesch 				flush_anon_page(vma, *page, ksm_scan.address);
2661d7597f59SStefan Roesch 				flush_dcache_page(*page);
2662d7597f59SStefan Roesch 				rmap_item = get_next_rmap_item(mm_slot,
2663d7597f59SStefan Roesch 					ksm_scan.rmap_list, ksm_scan.address);
2664d7597f59SStefan Roesch 				if (rmap_item) {
2665d7597f59SStefan Roesch 					ksm_scan.rmap_list =
2666d7597f59SStefan Roesch 							&rmap_item->rmap_list;
2667d7597f59SStefan Roesch 
2668d7597f59SStefan Roesch 					if (should_skip_rmap_item(*page, rmap_item))
2669d7597f59SStefan Roesch 						goto next_page;
2670d7597f59SStefan Roesch 
267124139c07SDavid Hildenbrand 					ksm_scan.address += PAGE_SIZE;
267224139c07SDavid Hildenbrand 				} else
267324139c07SDavid Hildenbrand 					put_page(*page);
267424139c07SDavid Hildenbrand 				mmap_read_unlock(mm);
267524139c07SDavid Hildenbrand 				return rmap_item;
267624139c07SDavid Hildenbrand 			}
267724139c07SDavid Hildenbrand next_page:
267824139c07SDavid Hildenbrand 			put_page(*page);
267924139c07SDavid Hildenbrand 			ksm_scan.address += PAGE_SIZE;
268024139c07SDavid Hildenbrand 			cond_resched();
268124139c07SDavid Hildenbrand 		}
268224139c07SDavid Hildenbrand 	}
268324139c07SDavid Hildenbrand 
268424139c07SDavid Hildenbrand 	if (ksm_test_exit(mm)) {
2685d7597f59SStefan Roesch no_vmas:
2686d7597f59SStefan Roesch 		ksm_scan.address = 0;
2687d7597f59SStefan Roesch 		ksm_scan.rmap_list = &mm_slot->rmap_list;
2688d7597f59SStefan Roesch 	}
2689d7597f59SStefan Roesch 	/*
2690d7597f59SStefan Roesch 	 * Nuke all the rmap_items that are above this current rmap:
2691d7597f59SStefan Roesch 	 * because there were no VM_MERGEABLE vmas with such addresses.
2692d7597f59SStefan Roesch 	 */
2693d7597f59SStefan Roesch 	remove_trailing_rmap_items(ksm_scan.rmap_list);
2694d7597f59SStefan Roesch 
2695d7597f59SStefan Roesch 	spin_lock(&ksm_mmlist_lock);
2696d7597f59SStefan Roesch 	slot = list_entry(mm_slot->slot.mm_node.next,
2697d7597f59SStefan Roesch 			  struct mm_slot, mm_node);
2698d7597f59SStefan Roesch 	ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
2699d7597f59SStefan Roesch 	if (ksm_scan.address == 0) {
2700d7597f59SStefan Roesch 		/*
2701d7597f59SStefan Roesch 		 * We've completed a full scan of all vmas, holding mmap_lock
2702d7597f59SStefan Roesch 		 * throughout, and found no VM_MERGEABLE: so do the same as
2703d7597f59SStefan Roesch 		 * __ksm_exit does to remove this mm from all our lists now.
2704d7597f59SStefan Roesch 		 * This applies either when cleaning up after __ksm_exit
2705d7597f59SStefan Roesch 		 * (but beware: we can reach here even before __ksm_exit),
2706d7597f59SStefan Roesch 		 * or when all VM_MERGEABLE areas have been unmapped (and
2707d7597f59SStefan Roesch 		 * mmap_lock then protects against race with MADV_MERGEABLE).
2708d7597f59SStefan Roesch 		 */
2709d7597f59SStefan Roesch 		hash_del(&mm_slot->slot.hash);
2710d7597f59SStefan Roesch 		list_del(&mm_slot->slot.mm_node);
2711d7597f59SStefan Roesch 		spin_unlock(&ksm_mmlist_lock);
271224139c07SDavid Hildenbrand 
271324139c07SDavid Hildenbrand 		mm_slot_free(mm_slot_cache, mm_slot);
271424139c07SDavid Hildenbrand 		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
271524139c07SDavid Hildenbrand 		clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
271624139c07SDavid Hildenbrand 		mmap_read_unlock(mm);
271724139c07SDavid Hildenbrand 		mmdrop(mm);
271824139c07SDavid Hildenbrand 	} else {
271924139c07SDavid Hildenbrand 		mmap_read_unlock(mm);
272024139c07SDavid Hildenbrand 		/*
272124139c07SDavid Hildenbrand 		 * mmap_read_unlock(mm) first because after
272224139c07SDavid Hildenbrand 		 * spin_unlock(&ksm_mmlist_lock) run, the "mm" may
272324139c07SDavid Hildenbrand 		 * already have been freed under us by __ksm_exit()
272424139c07SDavid Hildenbrand 		 * because the "mm_slot" is still hashed and
272524139c07SDavid Hildenbrand 		 * ksm_scan.mm_slot doesn't point to it anymore.
272624139c07SDavid Hildenbrand 		 */
272724139c07SDavid Hildenbrand 		spin_unlock(&ksm_mmlist_lock);
272824139c07SDavid Hildenbrand 	}
272924139c07SDavid Hildenbrand 
273024139c07SDavid Hildenbrand 	/* Repeat until we've completed scanning the whole list */
273124139c07SDavid Hildenbrand 	mm_slot = ksm_scan.mm_slot;
273224139c07SDavid Hildenbrand 	if (mm_slot != &ksm_mm_head)
273324139c07SDavid Hildenbrand 		goto next_mm;
273424139c07SDavid Hildenbrand 
273524139c07SDavid Hildenbrand 	advisor_stop_scan();
273624139c07SDavid Hildenbrand 
273724139c07SDavid Hildenbrand 	trace_ksm_stop_scan(ksm_scan.seqnr, ksm_rmap_items);
273824139c07SDavid Hildenbrand 	ksm_scan.seqnr++;
273924139c07SDavid Hildenbrand 	return NULL;
274024139c07SDavid Hildenbrand }
27412c281f54SDavid Hildenbrand 
27422c281f54SDavid Hildenbrand /**
27432c281f54SDavid Hildenbrand  * ksm_do_scan  - the ksm scanner main worker function.
27442c281f54SDavid Hildenbrand  * @scan_npages:  number of pages we want to scan before we return.
27452c281f54SDavid Hildenbrand  */
27462c281f54SDavid Hildenbrand static void ksm_do_scan(unsigned int scan_npages)
27472c281f54SDavid Hildenbrand {
27482c281f54SDavid Hildenbrand 	struct ksm_rmap_item *rmap_item;
27492c281f54SDavid Hildenbrand 	struct page *page;
27502c281f54SDavid Hildenbrand 	unsigned int npages = scan_npages;
27512c281f54SDavid Hildenbrand 
2752f8af4da3SHugh Dickins 	while (npages-- && likely(!freezing(current))) {
2753f8af4da3SHugh Dickins 		cond_resched();
2754f8af4da3SHugh Dickins 		rmap_item = scan_get_next_rmap_item(&page);
2755f8af4da3SHugh Dickins 		if (!rmap_item)
2756d952b791SHugh Dickins 			return;
2757f8af4da3SHugh Dickins 		cmp_and_merge_page(page, rmap_item);
2758f8af4da3SHugh Dickins 		put_page(page);
2759f8af4da3SHugh Dickins 	}
2760d7597f59SStefan Roesch 
2761e1fb4a08SDave Jiang 	ksm_pages_scanned += scan_npages - npages;
2762d7597f59SStefan Roesch }
276312564485SShawn Anastasio 
2764cc2383ecSKonstantin Khlebnikov static int ksmd_should_run(void)
2765d952b791SHugh Dickins {
2766d952b791SHugh Dickins 	return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.slot.mm_node);
2767d952b791SHugh Dickins }
2768d952b791SHugh Dickins 
2769d952b791SHugh Dickins static int ksm_scan_thread(void *nothing)
2770f8af4da3SHugh Dickins {
2771f8af4da3SHugh Dickins 	unsigned int sleep_ms;
2772f8af4da3SHugh Dickins 
2773f8af4da3SHugh Dickins 	set_freezable();
2774f8af4da3SHugh Dickins 	set_user_nice(current, 5);
2775f8af4da3SHugh Dickins 
2776f8af4da3SHugh Dickins 	while (!kthread_should_stop()) {
2777f8af4da3SHugh Dickins 		mutex_lock(&ksm_thread_mutex);
2778d952b791SHugh Dickins 		wait_while_offlining();
277949b06385SSuren Baghdasaryan 		if (ksmd_should_run())
2780d952b791SHugh Dickins 			ksm_do_scan(ksm_thread_pages_to_scan);
2781d952b791SHugh Dickins 		mutex_unlock(&ksm_thread_mutex);
2782d952b791SHugh Dickins 
2783f8af4da3SHugh Dickins 		if (ksmd_should_run()) {
2784f8af4da3SHugh Dickins 			sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs);
2785f8af4da3SHugh Dickins 			wait_event_freezable_timeout(ksm_iter_wait,
2786f8af4da3SHugh Dickins 				sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
2787f8af4da3SHugh Dickins 				msecs_to_jiffies(sleep_ms));
2788f8af4da3SHugh Dickins 		} else {
2789f8af4da3SHugh Dickins 			wait_event_freezable(ksm_thread_wait,
279033cf1707SBharata B Rao 				ksmd_should_run() || kthread_should_stop());
2791f8af4da3SHugh Dickins 		}
2792f8af4da3SHugh Dickins 	}
2793f8af4da3SHugh Dickins 	return 0;
279421fbd591SQi Zheng }
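/*
 * The loop in ksm_scan_thread() is driven by the sysfs knobs defined later
 * in this file: /sys/kernel/mm/ksm/run toggles ksm_run (and with it
 * ksmd_should_run()), pages_to_scan sets ksm_thread_pages_to_scan consumed
 * per ksm_do_scan() batch, and sleep_millisecs updates
 * ksm_thread_sleep_millisecs, which the wait_event_freezable_timeout()
 * condition above notices without waiting for the old timeout to expire.
 */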
279558730ab6SQi Zheng 
27966e158384SHugh Dickins static void __ksm_add_vma(struct vm_area_struct *vma)
27976e158384SHugh Dickins {
279858730ab6SQi Zheng 	unsigned long vm_flags = vma->vm_flags;
279931dbd01fSIzik Eidus 
280031dbd01fSIzik Eidus 	if (vm_flags & VM_MERGEABLE)
280131dbd01fSIzik Eidus 		return;
280258730ab6SQi Zheng 
280358730ab6SQi Zheng 	if (vma_ksm_compatible(vma))
28046e158384SHugh Dickins 		vm_flags_set(vma, VM_MERGEABLE);
280558730ab6SQi Zheng }
28066e158384SHugh Dickins 
280731dbd01fSIzik Eidus static int __ksm_del_vma(struct vm_area_struct *vma)
280858730ab6SQi Zheng {
280931dbd01fSIzik Eidus 	int err;
2810cbf86cfeSHugh Dickins 
2811cbf86cfeSHugh Dickins 	if (!(vma->vm_flags & VM_MERGEABLE))
281231dbd01fSIzik Eidus 		return 0;
281331dbd01fSIzik Eidus 
2814cbf86cfeSHugh Dickins 	if (vma->anon_vma) {
2815cbf86cfeSHugh Dickins 		err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end, true);
2816cbf86cfeSHugh Dickins 		if (err)
2817cbf86cfeSHugh Dickins 			return err;
281831dbd01fSIzik Eidus 	}
2819cbf86cfeSHugh Dickins 
282058730ab6SQi Zheng 	vm_flags_clear(vma, VM_MERGEABLE);
2821cbf86cfeSHugh Dickins 	return 0;
282258730ab6SQi Zheng }
282331dbd01fSIzik Eidus /**
282431dbd01fSIzik Eidus  * ksm_add_vma - Mark vma as mergeable if compatible
2825f8af4da3SHugh Dickins  *
2826f1f10076SVegard Nossum  * @vma:  Pointer to vma
28276e158384SHugh Dickins  */
28286e158384SHugh Dickins void ksm_add_vma(struct vm_area_struct *vma)
28296e158384SHugh Dickins {
28306e158384SHugh Dickins 	struct mm_struct *mm = vma->vm_mm;
2831739100c8SStefan Roesch 
2832f8af4da3SHugh Dickins 	if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
2833f8af4da3SHugh Dickins 		__ksm_add_vma(vma);
2834f8af4da3SHugh Dickins }
28351c2fb7a4SAndrea Arcangeli 
2836f8af4da3SHugh Dickins static void ksm_add_vmas(struct mm_struct *mm)
283721fbd591SQi Zheng {
283858730ab6SQi Zheng 	struct vm_area_struct *vma;
28399ba69294SHugh Dickins 
2840cd551f97SHugh Dickins 	VMA_ITERATOR(vmi, mm, 0);
284131dbd01fSIzik Eidus 	for_each_vma(vmi, vma)
28429ba69294SHugh Dickins 		__ksm_add_vma(vma);
28439ba69294SHugh Dickins }
28449ba69294SHugh Dickins 
2845c1e8d7c6SMichel Lespinasse static int ksm_del_vmas(struct mm_struct *mm)
28469ba69294SHugh Dickins {
28479ba69294SHugh Dickins 	struct vm_area_struct *vma;
284831dbd01fSIzik Eidus 	int err;
28499ba69294SHugh Dickins 
2850cd551f97SHugh Dickins 	VMA_ITERATOR(vmi, mm, 0);
285158730ab6SQi Zheng 	for_each_vma(vmi, vma) {
285258730ab6SQi Zheng 		err = __ksm_del_vma(vma);
28539ba69294SHugh Dickins 		if (err)
28546514d511SHugh Dickins 			return err;
285558730ab6SQi Zheng 	}
285658730ab6SQi Zheng 	return 0;
28579ba69294SHugh Dickins }
28589ba69294SHugh Dickins 
285958730ab6SQi Zheng /**
286058730ab6SQi Zheng  * ksm_enable_merge_any - Add mm to mm ksm list and enable merging on all
28619ba69294SHugh Dickins  *                        compatible VMA's
28629ba69294SHugh Dickins  *
2863cd551f97SHugh Dickins  * @mm:  Pointer to mm
2864cd551f97SHugh Dickins  *
28659ba69294SHugh Dickins  * Returns 0 on success, otherwise error code
286658730ab6SQi Zheng  */
2867d7597f59SStefan Roesch int ksm_enable_merge_any(struct mm_struct *mm)
2868cd551f97SHugh Dickins {
28699ba69294SHugh Dickins 	int err;
28709ba69294SHugh Dickins 
2871d8ed45c5SMichel Lespinasse 	if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
2872d8ed45c5SMichel Lespinasse 		return 0;
28739ba69294SHugh Dickins 
2874739100c8SStefan Roesch 	if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
2875739100c8SStefan Roesch 		err = __ksm_enter(mm);
2876f8af4da3SHugh Dickins 		if (err)
287731dbd01fSIzik Eidus 			return err;
2878cbf86cfeSHugh Dickins 	}
28795ad64688SHugh Dickins 
28805ad64688SHugh Dickins 	set_bit(MMF_VM_MERGE_ANY, &mm->flags);
2881e05b3453SMatthew Wilcox (Oracle) 	ksm_add_vmas(mm);
2882e05b3453SMatthew Wilcox (Oracle) 
28835ad64688SHugh Dickins 	return 0;
28845ad64688SHugh Dickins }
2885cbf86cfeSHugh Dickins 
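/*
 * Illustrative userspace sketch, not part of the kernel sources proper:
 * ksm_enable_merge_any() above is normally reached via
 * prctl(PR_SET_MEMORY_MERGE).  The fallback define below assumes the value
 * from include/uapi/linux/prctl.h; error handling is kept minimal.
 */
#if 0
#include <sys/prctl.h>
#include <stdio.h>

#ifndef PR_SET_MEMORY_MERGE
#define PR_SET_MEMORY_MERGE	67
#endif

int main(void)
{
	/*
	 * Opt the whole process into KSM: all currently compatible VMAs get
	 * VM_MERGEABLE, and MMF_VM_MERGE_ANY makes future VMAs eligible too.
	 */
	if (prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0))
		perror("PR_SET_MEMORY_MERGE");
	return 0;
}
#endif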
2886cbf86cfeSHugh Dickins /**
2887cbf86cfeSHugh Dickins  * ksm_disable_merge_any - Disable merging on all compatible VMA's of the mm,
2888cbf86cfeSHugh Dickins  *			   previously enabled via ksm_enable_merge_any().
2889cbf86cfeSHugh Dickins  *
2890cbf86cfeSHugh Dickins  * Disabling merging implies unmerging any merged pages, like setting
2891e1c63e11SNanyong Sun  * MADV_UNMERGEABLE would. If unmerging fails, the whole operation fails and
2892e1c63e11SNanyong Sun  * merging on all compatible VMA's remains enabled.
2893cbf86cfeSHugh Dickins  *
2894cbf86cfeSHugh Dickins  * @mm: Pointer to mm
2895f985fc32SMiaohe Lin  *
2896f985fc32SMiaohe Lin  * Returns 0 on success, otherwise error code
2897cbf86cfeSHugh Dickins  */
2898cbf86cfeSHugh Dickins int ksm_disable_merge_any(struct mm_struct *mm)
2899cbf86cfeSHugh Dickins {
29005ad64688SHugh Dickins 	int err;
29018f425e4eSMatthew Wilcox (Oracle) 
29028f425e4eSMatthew Wilcox (Oracle) 	if (!test_bit(MMF_VM_MERGE_ANY, &mm->flags))
290362fdb163SHugh Dickins 		return 0;
290462fdb163SHugh Dickins 
290562fdb163SHugh Dickins 	err = ksm_del_vmas(mm);
29065ad64688SHugh Dickins 	if (err) {
29076b970599SKefeng Wang 		ksm_add_vmas(mm);
29086b970599SKefeng Wang 		return err;
29096b970599SKefeng Wang 	}
29106b970599SKefeng Wang 
29116b970599SKefeng Wang 	clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
29125ad64688SHugh Dickins 	return 0;
29135ad64688SHugh Dickins }
291448c935adSKirill A. Shutemov 
29154d45c3afSYang Yang int ksm_disable(struct mm_struct *mm)
29164d45c3afSYang Yang {
29174d45c3afSYang Yang 	mmap_assert_write_locked(mm);
29185ad64688SHugh Dickins 
29195ad64688SHugh Dickins 	if (!test_bit(MMF_VM_MERGEABLE, &mm->flags))
29205ad64688SHugh Dickins 		return 0;
29215ad64688SHugh Dickins 	if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
29225ad64688SHugh Dickins 		return ksm_disable_merge_any(mm);
29236d4675e6SMinchan Kim 	return ksm_del_vmas(mm);
2924e9995ef9SHugh Dickins }
292521fbd591SQi Zheng 
292621fbd591SQi Zheng int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
2927e9995ef9SHugh Dickins 		unsigned long end, int advice, unsigned long *vm_flags)
2928e9995ef9SHugh Dickins {
29292f031c6fSMatthew Wilcox (Oracle) 	struct mm_struct *mm = vma->vm_mm;
29309f32624bSJoonsoo Kim 	int err;
29319f32624bSJoonsoo Kim 
29329f32624bSJoonsoo Kim 	switch (advice) {
29339f32624bSJoonsoo Kim 	case MADV_MERGEABLE:
29349f32624bSJoonsoo Kim 		if (vma->vm_flags & VM_MERGEABLE)
29352f031c6fSMatthew Wilcox (Oracle) 			return 0;
2936e9995ef9SHugh Dickins 		if (!vma_ksm_compatible(vma))
29372f031c6fSMatthew Wilcox (Oracle) 			return 0;
2938e9995ef9SHugh Dickins 
29391df631aeSMinchan Kim 		if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
2940e9995ef9SHugh Dickins 			err = __ksm_enter(mm);
2941b67bfe0dSSasha Levin 			if (err)
2942e9995ef9SHugh Dickins 				return err;
29435beb4930SRik van Riel 		}
2944e9995ef9SHugh Dickins 
2945e9995ef9SHugh Dickins 		*vm_flags |= VM_MERGEABLE;
2946ad12695fSAndrea Arcangeli 		break;
29476d4675e6SMinchan Kim 
29486d4675e6SMinchan Kim 	case MADV_UNMERGEABLE:
29496d4675e6SMinchan Kim 		if (!(*vm_flags & VM_MERGEABLE))
29506d4675e6SMinchan Kim 			return 0;		/* just ignore the advice */
29516d4675e6SMinchan Kim 
2952b6b19f25SHugh Dickins 		if (vma->anon_vma) {
29536d4675e6SMinchan Kim 			err = unmerge_ksm_pages(vma, start, end, true);
2954bf181b9fSMichel Lespinasse 			if (err)
2955bf181b9fSMichel Lespinasse 				return err;
29561105a2fcSJia He 		}
29571105a2fcSJia He 
2958ad12695fSAndrea Arcangeli 		*vm_flags &= ~VM_MERGEABLE;
29595beb4930SRik van Riel 		break;
29601105a2fcSJia He 	}
29611105a2fcSJia He 
2962cd7fae26SMiaohe Lin 	return 0;
29631105a2fcSJia He }
29641105a2fcSJia He EXPORT_SYMBOL_GPL(ksm_madvise);
2965e9995ef9SHugh Dickins 
2966e9995ef9SHugh Dickins int __ksm_enter(struct mm_struct *mm)
2967e9995ef9SHugh Dickins {
2968e9995ef9SHugh Dickins 	struct ksm_mm_slot *mm_slot;
2969e9995ef9SHugh Dickins 	struct mm_slot *slot;
2970e9995ef9SHugh Dickins 	int needs_wakeup;
2971e9995ef9SHugh Dickins 
2972e9995ef9SHugh Dickins 	mm_slot = mm_slot_alloc(mm_slot_cache);
2973e9995ef9SHugh Dickins 	if (!mm_slot)
2974e9995ef9SHugh Dickins 		return -ENOMEM;
29750dd1c7bbSJoonsoo Kim 
29760dd1c7bbSJoonsoo Kim 	slot = &mm_slot->slot;
29770dd1c7bbSJoonsoo Kim 
29782f031c6fSMatthew Wilcox (Oracle) 	/* Check ksm_run too?  Would need tighter locking */
2979b6b19f25SHugh Dickins 	needs_wakeup = list_empty(&ksm_mm_head.slot.mm_node);
29801df631aeSMinchan Kim 
2981e9995ef9SHugh Dickins 	spin_lock(&ksm_mmlist_lock);
29822f031c6fSMatthew Wilcox (Oracle) 	mm_slot_insert(mm_slots_hash, mm, slot);
29830dd1c7bbSJoonsoo Kim 	/*
29841df631aeSMinchan Kim 	 * When KSM_RUN_MERGE (or KSM_RUN_STOP),
29850dd1c7bbSJoonsoo Kim 	 * insert just behind the scanning cursor, to let the area settle
2986e9995ef9SHugh Dickins 	 * down a little; when fork is followed by immediate exec, we don't
2987b6b19f25SHugh Dickins 	 * want ksmd to waste time setting up and tearing down an rmap_list.
2988e9995ef9SHugh Dickins 	 *
2989e9995ef9SHugh Dickins 	 * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
2990e9995ef9SHugh Dickins 	 * scanning cursor, otherwise KSM pages in newly forked mms will be
2991e9995ef9SHugh Dickins 	 * missed: then we might as well insert at the end of the list.
2992e9995ef9SHugh Dickins 	 */
29934248d008SLonglong Xia 	if (ksm_run & KSM_RUN_UNMERGE)
29944248d008SLonglong Xia 		list_add_tail(&slot->mm_node, &ksm_mm_head.slot.mm_node);
29954248d008SLonglong Xia 	else
29964248d008SLonglong Xia 		list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node);
29974248d008SLonglong Xia 	spin_unlock(&ksm_mmlist_lock);
29984248d008SLonglong Xia 
29994248d008SLonglong Xia 	set_bit(MMF_VM_MERGEABLE, &mm->flags);
30004248d008SLonglong Xia 	mmgrab(mm);
30014248d008SLonglong Xia 
30024248d008SLonglong Xia 	if (needs_wakeup)
30034248d008SLonglong Xia 		wake_up_interruptible(&ksm_thread_wait);
30044248d008SLonglong Xia 
30054248d008SLonglong Xia 	trace_ksm_enter(mm);
30064248d008SLonglong Xia 	return 0;
30074248d008SLonglong Xia }
30084248d008SLonglong Xia 
30094248d008SLonglong Xia void __ksm_exit(struct mm_struct *mm)
30104248d008SLonglong Xia {
30114248d008SLonglong Xia 	struct ksm_mm_slot *mm_slot;
30124248d008SLonglong Xia 	struct mm_slot *slot;
3013d256d1cdSTong Tiangen 	int easy_to_free = 0;
30144248d008SLonglong Xia 
30154248d008SLonglong Xia 	/*
30164248d008SLonglong Xia 	 * This process is exiting: if it's straightforward (as is the
30174248d008SLonglong Xia 	 * case when ksmd was never running), free mm_slot immediately.
30184248d008SLonglong Xia 	 * But if it's at the cursor or has rmap_items linked to it, use
30194248d008SLonglong Xia 	 * mmap_lock to synchronize with any break_cows before pagetables
30204248d008SLonglong Xia 	 * are freed, and leave the mm_slot on the list for ksmd to free.
30214248d008SLonglong Xia 	 * Beware: ksm may already have noticed it exiting and freed the slot.
30224248d008SLonglong Xia 	 */
30234248d008SLonglong Xia 
30244248d008SLonglong Xia 	spin_lock(&ksm_mmlist_lock);
30254248d008SLonglong Xia 	slot = mm_slot_lookup(mm_slots_hash, mm);
30264248d008SLonglong Xia 	mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
30274248d008SLonglong Xia 	if (mm_slot && ksm_scan.mm_slot != mm_slot) {
30284248d008SLonglong Xia 		if (!mm_slot->rmap_list) {
30294248d008SLonglong Xia 			hash_del(&slot->hash);
30304248d008SLonglong Xia 			list_del(&slot->mm_node);
30314248d008SLonglong Xia 			easy_to_free = 1;
3032d256d1cdSTong Tiangen 		} else {
30334248d008SLonglong Xia 			list_move(&slot->mm_node,
30344248d008SLonglong Xia 				  &ksm_scan.mm_slot->slot.mm_node);
30354248d008SLonglong Xia 		}
30364248d008SLonglong Xia 	}
30374248d008SLonglong Xia 	spin_unlock(&ksm_mmlist_lock);
303852629506SJoonsoo Kim 
303919138349SMatthew Wilcox (Oracle) 	if (easy_to_free) {
3040e9995ef9SHugh Dickins 		mm_slot_free(mm_slot_cache, mm_slot);
304121fbd591SQi Zheng 		clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
3042e9995ef9SHugh Dickins 		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
304319138349SMatthew Wilcox (Oracle) 		mmdrop(mm);
304419138349SMatthew Wilcox (Oracle) 	} else if (mm_slot) {
304519138349SMatthew Wilcox (Oracle) 		mmap_write_lock(mm);
3046e9995ef9SHugh Dickins 		mmap_write_unlock(mm);
304719138349SMatthew Wilcox (Oracle) 	}
3048e9995ef9SHugh Dickins 
304919138349SMatthew Wilcox (Oracle) 	trace_ksm_exit(mm);
305019138349SMatthew Wilcox (Oracle) }
3051c8d6553bSHugh Dickins 
305219138349SMatthew Wilcox (Oracle) struct folio *ksm_might_need_to_copy(struct folio *folio,
3053c8d6553bSHugh Dickins 			struct vm_area_struct *vma, unsigned long addr)
305419138349SMatthew Wilcox (Oracle) {
305519138349SMatthew Wilcox (Oracle) 	struct page *page = folio_page(folio, 0);
3056c8d6553bSHugh Dickins 	struct anon_vma *anon_vma = folio_anon_vma(folio);
3057c8d6553bSHugh Dickins 	struct folio *new_folio;
305819138349SMatthew Wilcox (Oracle) 
3059e9995ef9SHugh Dickins 	if (folio_test_large(folio))
3060e9995ef9SHugh Dickins 		return folio;
3061e9995ef9SHugh Dickins 
3062e9995ef9SHugh Dickins 	if (folio_test_ksm(folio)) {
306362b61f61SHugh Dickins 		if (folio_stable_node(folio) &&
3064ef4d43a8SHugh Dickins 		    !(ksm_run & KSM_RUN_UNMERGE))
3065ef4d43a8SHugh Dickins 			return folio;	/* no need to copy it */
3066ef4d43a8SHugh Dickins 	} else if (!anon_vma) {
3067ef4d43a8SHugh Dickins 		return folio;		/* no need to copy it */
3068ef4d43a8SHugh Dickins 	} else if (folio->index == linear_page_index(vma, addr) &&
306974316201SNeilBrown 			anon_vma->root == vma->anon_vma->root) {
3070ef4d43a8SHugh Dickins 		return folio;		/* still no need to copy it */
3071ef4d43a8SHugh Dickins 	}
3072ef4d43a8SHugh Dickins 	if (PageHWPoison(page))
3073ef4d43a8SHugh Dickins 		return ERR_PTR(-EHWPOISON);
307421fbd591SQi Zheng 	if (!folio_test_uptodate(folio))
30752c653d0eSAndrea Arcangeli 		return folio;		/* let do_swap_page report the error */
30762c653d0eSAndrea Arcangeli 
30772c653d0eSAndrea Arcangeli 	new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false);
30782c653d0eSAndrea Arcangeli 	if (new_folio &&
30792c653d0eSAndrea Arcangeli 	    mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL)) {
30802c653d0eSAndrea Arcangeli 		folio_put(new_folio);
30812c653d0eSAndrea Arcangeli 		new_folio = NULL;
30822c653d0eSAndrea Arcangeli 	}
30832c653d0eSAndrea Arcangeli 	if (new_folio) {
30842c653d0eSAndrea Arcangeli 		if (copy_mc_user_highpage(folio_page(new_folio, 0), page,
30852c653d0eSAndrea Arcangeli 								addr, vma)) {
30862c653d0eSAndrea Arcangeli 			folio_put(new_folio);
30872c653d0eSAndrea Arcangeli 			memory_failure_queue(folio_pfn(folio), 0);
30882c653d0eSAndrea Arcangeli 			return ERR_PTR(-EHWPOISON);
30892c653d0eSAndrea Arcangeli 		}
309021fbd591SQi Zheng 		folio_set_dirty(new_folio);
30912c653d0eSAndrea Arcangeli 		__folio_mark_uptodate(new_folio);
30922c653d0eSAndrea Arcangeli 		__folio_set_locked(new_folio);
30932c653d0eSAndrea Arcangeli #ifdef CONFIG_SWAP
30942c653d0eSAndrea Arcangeli 		count_vm_event(KSM_SWPIN_COPY);
309521fbd591SQi Zheng #endif
30962c653d0eSAndrea Arcangeli 	}
30972c653d0eSAndrea Arcangeli 
30982c653d0eSAndrea Arcangeli 	return new_folio;
30992c653d0eSAndrea Arcangeli }
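/*
 * Caller sketch (hypothetical, simplified from the swap-in path): users of
 * ksm_might_need_to_copy() must handle all three outcomes - the same folio
 * returned, a freshly allocated private copy, or an error pointer.
 * "old_folio" and "ret" are made-up names for illustration only.
 *
 *	folio = ksm_might_need_to_copy(folio, vma, addr);
 *	if (!folio)
 *		ret = VM_FAULT_OOM;		copy allocation failed
 *	else if (folio == ERR_PTR(-EHWPOISON))
 *		ret = VM_FAULT_HWPOISON;	source page was poisoned
 *	else if (folio != old_folio)
 *		... map the private copy instead of the shared KSM page ...
 */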
31002c653d0eSAndrea Arcangeli 
31012c653d0eSAndrea Arcangeli void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc)
31022c653d0eSAndrea Arcangeli {
31032c653d0eSAndrea Arcangeli 	struct ksm_stable_node *stable_node;
31042c653d0eSAndrea Arcangeli 	struct ksm_rmap_item *rmap_item;
31052c653d0eSAndrea Arcangeli 	int search_new_forks = 0;
31062c653d0eSAndrea Arcangeli 
31072c653d0eSAndrea Arcangeli 	VM_BUG_ON_FOLIO(!folio_test_ksm(folio), folio);
31082c653d0eSAndrea Arcangeli 
31092c653d0eSAndrea Arcangeli 	/*
31102c653d0eSAndrea Arcangeli 	 * Rely on the page lock to protect against concurrent modifications
31112c653d0eSAndrea Arcangeli 	 * to that page's node of the stable tree.
31122c653d0eSAndrea Arcangeli 	 */
31132c653d0eSAndrea Arcangeli 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
31142c653d0eSAndrea Arcangeli 
31152c653d0eSAndrea Arcangeli 	stable_node = folio_stable_node(folio);
3116ee0ea59cSHugh Dickins 	if (!stable_node)
311762b61f61SHugh Dickins 		return;
311862b61f61SHugh Dickins again:
311921fbd591SQi Zheng 	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
312062b61f61SHugh Dickins 		struct anon_vma *anon_vma = rmap_item->anon_vma;
312190bd6fd3SPetr Holasek 		struct anon_vma_chain *vmac;
312262b61f61SHugh Dickins 		struct vm_area_struct *vma;
3123ef53d16cSHugh Dickins 
3124ef53d16cSHugh Dickins 		cond_resched();
3125ee0ea59cSHugh Dickins 		if (!anon_vma_trylock_read(anon_vma)) {
312621fbd591SQi Zheng 			if (rwc->try_lock) {
31272c653d0eSAndrea Arcangeli 				rwc->contended = true;
31282c653d0eSAndrea Arcangeli 				return;
31292c653d0eSAndrea Arcangeli 			}
31302c653d0eSAndrea Arcangeli 			anon_vma_lock_read(anon_vma);
3131ef53d16cSHugh Dickins 		}
31322c653d0eSAndrea Arcangeli 		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
3133ee0ea59cSHugh Dickins 					       0, ULONG_MAX) {
3134ee0ea59cSHugh Dickins 			unsigned long addr;
313562b61f61SHugh Dickins 
3136ee0ea59cSHugh Dickins 			cond_resched();
313703640418SGeliang Tang 			vma = vmac->vma;
31384146d2d6SHugh Dickins 
31394146d2d6SHugh Dickins 			/* Ignore the stable/unstable/sqnr flags */
31404146d2d6SHugh Dickins 			addr = rmap_item->address & PAGE_MASK;
31414146d2d6SHugh Dickins 
31424146d2d6SHugh Dickins 			if (addr < vma->vm_start || addr >= vma->vm_end)
314362b61f61SHugh Dickins 				continue;
314462b61f61SHugh Dickins 			/*
314562b61f61SHugh Dickins 			 * Initially we examine only the vma which covers this
314662b61f61SHugh Dickins 			 * rmap_item; but later, if there is still work to do,
314762b61f61SHugh Dickins 			 * we examine covering vmas in other mms: in case they
314862b61f61SHugh Dickins 			 * were forked from the original since ksmd passed.
314962b61f61SHugh Dickins 			 */
315062b61f61SHugh Dickins 			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
315162b61f61SHugh Dickins 				continue;
315262b61f61SHugh Dickins 
3153ef4d43a8SHugh Dickins 			if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
3154ef4d43a8SHugh Dickins 				continue;
3155ef4d43a8SHugh Dickins 
3156ef4d43a8SHugh Dickins 			if (!rwc->rmap_one(folio, vma, addr, rwc->arg)) {
3157ef4d43a8SHugh Dickins 				anon_vma_unlock_read(anon_vma);
315862b61f61SHugh Dickins 				return;
3159ef4d43a8SHugh Dickins 			}
3160ef4d43a8SHugh Dickins 			if (rwc->done && rwc->done(folio)) {
3161ef4d43a8SHugh Dickins 				anon_vma_unlock_read(anon_vma);
316262b61f61SHugh Dickins 				return;
316362b61f61SHugh Dickins 			}
316462b61f61SHugh Dickins 		}
316562b61f61SHugh Dickins 		anon_vma_unlock_read(anon_vma);
316662b61f61SHugh Dickins 	}
316762b61f61SHugh Dickins 	if (!search_new_forks++)
3168ee0ea59cSHugh Dickins 		goto again;
3169ee0ea59cSHugh Dickins }
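/*
 * Minimal caller sketch: rmap_walk() dispatches KSM folios to
 * rmap_walk_ksm() above.  The member signatures follow struct
 * rmap_walk_control in include/linux/rmap.h; "nr_mapped" and
 * "count_one_mapping" are made-up names for illustration only.
 *
 *	static bool count_one_mapping(struct folio *folio,
 *			struct vm_area_struct *vma, unsigned long addr,
 *			void *arg)
 *	{
 *		(*(unsigned long *)arg)++;
 *		return true;			keep walking
 *	}
 *
 *	unsigned long nr_mapped = 0;
 *	struct rmap_walk_control rwc = {
 *		.rmap_one = count_one_mapping,
 *		.arg = &nr_mapped,
 *	};
 *	rmap_walk(folio, &rwc);			folio must be locked
 */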
3170ee0ea59cSHugh Dickins 
317162b61f61SHugh Dickins #ifdef CONFIG_MEMORY_FAILURE
3172ee0ea59cSHugh Dickins /*
3173ee0ea59cSHugh Dickins  * Collect processes when the error hit a KSM page.
3174e4a9bc58SJoe Perches  */
317562b61f61SHugh Dickins void collect_procs_ksm(struct page *page, struct list_head *to_kill,
3176ef4d43a8SHugh Dickins 		       int force_early)
3177ef4d43a8SHugh Dickins {
317862b61f61SHugh Dickins 	struct ksm_stable_node *stable_node;
3179ef4d43a8SHugh Dickins 	struct ksm_rmap_item *rmap_item;
3180ef4d43a8SHugh Dickins 	struct folio *folio = page_folio(page);
3181ef4d43a8SHugh Dickins 	struct vm_area_struct *vma;
318262b61f61SHugh Dickins 	struct task_struct *tsk;
318362b61f61SHugh Dickins 
318462b61f61SHugh Dickins 	stable_node = folio_stable_node(folio);
318562b61f61SHugh Dickins 	if (!stable_node)
3186ef4d43a8SHugh Dickins 		return;
3187ef4d43a8SHugh Dickins 	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
3188ef4d43a8SHugh Dickins 		struct anon_vma *av = rmap_item->anon_vma;
3189ef4d43a8SHugh Dickins 
319062b61f61SHugh Dickins 		anon_vma_lock_read(av);
319162b61f61SHugh Dickins 		rcu_read_lock();
3192d21077fbSStefan Roesch 		for_each_process(tsk) {
3193d21077fbSStefan Roesch 			struct anon_vma_chain *vmac;
3194d21077fbSStefan Roesch 			unsigned long addr;
31951a8e8430Sxu xin 			struct task_struct *t =
3196d21077fbSStefan Roesch 				task_early_kill(tsk, force_early);
3197d21077fbSStefan Roesch 			if (!t)
3198d21077fbSStefan Roesch 				continue;
3199d21077fbSStefan Roesch 			anon_vma_interval_tree_foreach(vmac, &av->rb_root, 0,
32002ffd8679SHugh Dickins 						       ULONG_MAX)
32012ffd8679SHugh Dickins 			{
32022ffd8679SHugh Dickins 				vma = vmac->vma;
32032ffd8679SHugh Dickins 				if (vma->vm_mm == t->mm) {
32042ffd8679SHugh Dickins 					addr = rmap_item->address & PAGE_MASK;
320531dbd01fSIzik Eidus 					add_to_kill_ksm(t, page, vma, to_kill,
320631dbd01fSIzik Eidus 							addr);
320731dbd01fSIzik Eidus 				}
32081bad2e5cSMiaohe Lin 			}
320931dbd01fSIzik Eidus 		}
321031dbd01fSIzik Eidus 		rcu_read_unlock();
321131dbd01fSIzik Eidus 		anon_vma_unlock_read(av);
321231dbd01fSIzik Eidus 	}
3213ae7a927dSJoe Perches }
321431dbd01fSIzik Eidus #endif
321531dbd01fSIzik Eidus 
321631dbd01fSIzik Eidus #ifdef CONFIG_MIGRATION
321731dbd01fSIzik Eidus void folio_migrate_ksm(struct folio *newfolio, struct folio *folio)
321831dbd01fSIzik Eidus {
321931dbd01fSIzik Eidus 	struct ksm_stable_node *stable_node;
3220dfefd226SAlexey Dobriyan 
322131dbd01fSIzik Eidus 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
322231dbd01fSIzik Eidus 	VM_BUG_ON_FOLIO(!folio_test_locked(newfolio), newfolio);
3223dfefd226SAlexey Dobriyan 	VM_BUG_ON_FOLIO(newfolio->mapping != folio->mapping, newfolio);
3224dfefd226SAlexey Dobriyan 
322531dbd01fSIzik Eidus 	stable_node = folio_stable_node(folio);
322631dbd01fSIzik Eidus 	if (stable_node) {
322731dbd01fSIzik Eidus 		VM_BUG_ON_FOLIO(stable_node->kpfn != folio_pfn(folio), folio);
3228fcf9a0efSKirill Tkhai 		stable_node->kpfn = folio_pfn(newfolio);
322931dbd01fSIzik Eidus 		/*
323031dbd01fSIzik Eidus 		 * newfolio->mapping was set in advance; now we need smp_wmb()
323131dbd01fSIzik Eidus 		 * to make sure that the new stable_node->kpfn is visible
323231dbd01fSIzik Eidus 		 * to get_ksm_page() before it can see that folio->mapping
323331dbd01fSIzik Eidus 		 * has gone stale (or that folio_test_swapcache has been cleared).
323431dbd01fSIzik Eidus 		 */
323531dbd01fSIzik Eidus 		smp_wmb();
323631dbd01fSIzik Eidus 		set_page_stable_node(&folio->page, NULL);
3237ae7a927dSJoe Perches 	}
323831dbd01fSIzik Eidus }
323931dbd01fSIzik Eidus #endif /* CONFIG_MIGRATION */
324031dbd01fSIzik Eidus 
324131dbd01fSIzik Eidus #ifdef CONFIG_MEMORY_HOTREMOVE
324231dbd01fSIzik Eidus static void wait_while_offlining(void)
324331dbd01fSIzik Eidus {
3244dfefd226SAlexey Dobriyan 	while (ksm_run & KSM_RUN_OFFLINE) {
324531dbd01fSIzik Eidus 		mutex_unlock(&ksm_thread_mutex);
324631dbd01fSIzik Eidus 		wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
3247dfefd226SAlexey Dobriyan 			    TASK_UNINTERRUPTIBLE);
3248dfefd226SAlexey Dobriyan 		mutex_lock(&ksm_thread_mutex);
324931dbd01fSIzik Eidus 	}
325031dbd01fSIzik Eidus }
325131dbd01fSIzik Eidus 
325231dbd01fSIzik Eidus static bool stable_node_dup_remove_range(struct ksm_stable_node *stable_node,
325331dbd01fSIzik Eidus 					 unsigned long start_pfn,
325431dbd01fSIzik Eidus 					 unsigned long end_pfn)
325531dbd01fSIzik Eidus {
325631dbd01fSIzik Eidus 	if (stable_node->kpfn >= start_pfn &&
325731dbd01fSIzik Eidus 	    stable_node->kpfn < end_pfn) {
325831dbd01fSIzik Eidus 		/*
325931dbd01fSIzik Eidus 		 * Don't get_ksm_page, page has already gone:
3260ae7a927dSJoe Perches 		 * which is why we keep kpfn instead of page*
326131dbd01fSIzik Eidus 		 */
326231dbd01fSIzik Eidus 		remove_node_from_stable_tree(stable_node);
326331dbd01fSIzik Eidus 		return true;
326431dbd01fSIzik Eidus 	}
326531dbd01fSIzik Eidus 	return false;
3266dfefd226SAlexey Dobriyan }
326731dbd01fSIzik Eidus 
326831dbd01fSIzik Eidus static bool stable_node_chain_remove_range(struct ksm_stable_node *stable_node,
3269dfefd226SAlexey Dobriyan 					   unsigned long start_pfn,
3270dfefd226SAlexey Dobriyan 					   unsigned long end_pfn,
327131dbd01fSIzik Eidus 					   struct rb_root *root)
327231dbd01fSIzik Eidus {
327331dbd01fSIzik Eidus 	struct ksm_stable_node *dup;
327431dbd01fSIzik Eidus 	struct hlist_node *hlist_safe;
327531dbd01fSIzik Eidus 
327631dbd01fSIzik Eidus 	if (!is_stable_node_chain(stable_node)) {
327731dbd01fSIzik Eidus 		VM_BUG_ON(is_stable_node_dup(stable_node));
3278d0f209f6SHugh Dickins 		return stable_node_dup_remove_range(stable_node, start_pfn,
3279d0f209f6SHugh Dickins 						    end_pfn);
328031dbd01fSIzik Eidus 	}
328131dbd01fSIzik Eidus 
328231dbd01fSIzik Eidus 	hlist_for_each_entry_safe(dup, hlist_safe,
3283ef4d43a8SHugh Dickins 				  &stable_node->hlist, hlist_dup) {
328431dbd01fSIzik Eidus 		VM_BUG_ON(!is_stable_node_dup(dup));
328531dbd01fSIzik Eidus 		stable_node_dup_remove_range(dup, start_pfn, end_pfn);
3286d952b791SHugh Dickins 	}
3287e1e12d2fSDavid Rientjes 	if (hlist_empty(&stable_node->hlist)) {
3288d952b791SHugh Dickins 		free_stable_node_chain(stable_node, root);
3289e1e12d2fSDavid Rientjes 		return true; /* notify caller that tree was rebalanced */
3290d952b791SHugh Dickins 	} else
3291d952b791SHugh Dickins 		return false;
3292d952b791SHugh Dickins }
3293d952b791SHugh Dickins 
3294d952b791SHugh Dickins static void ksm_check_stable_tree(unsigned long start_pfn,
329531dbd01fSIzik Eidus 				  unsigned long end_pfn)
329631dbd01fSIzik Eidus {
329731dbd01fSIzik Eidus 	struct ksm_stable_node *stable_node, *next;
329831dbd01fSIzik Eidus 	struct rb_node *node;
329931dbd01fSIzik Eidus 	int nid;
330031dbd01fSIzik Eidus 
330131dbd01fSIzik Eidus 	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
330231dbd01fSIzik Eidus 		node = rb_first(root_stable_tree + nid);
330331dbd01fSIzik Eidus 		while (node) {
330431dbd01fSIzik Eidus 			stable_node = rb_entry(node, struct ksm_stable_node, node);
330590bd6fd3SPetr Holasek 			if (stable_node_chain_remove_range(stable_node,
330690bd6fd3SPetr Holasek 							   start_pfn, end_pfn,
330790bd6fd3SPetr Holasek 							   root_stable_tree +
330890bd6fd3SPetr Holasek 							   nid))
3309ae7a927dSJoe Perches 				node = rb_first(root_stable_tree + nid);
331090bd6fd3SPetr Holasek 			else
331190bd6fd3SPetr Holasek 				node = rb_next(node);
331290bd6fd3SPetr Holasek 			cond_resched();
331390bd6fd3SPetr Holasek 		}
331490bd6fd3SPetr Holasek 	}
331590bd6fd3SPetr Holasek 	list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
331690bd6fd3SPetr Holasek 		if (stable_node->kpfn >= start_pfn &&
331790bd6fd3SPetr Holasek 		    stable_node->kpfn < end_pfn)
331890bd6fd3SPetr Holasek 			remove_node_from_stable_tree(stable_node);
331990bd6fd3SPetr Holasek 		cond_resched();
332090bd6fd3SPetr Holasek 	}
332190bd6fd3SPetr Holasek }
332290bd6fd3SPetr Holasek 
332390bd6fd3SPetr Holasek static int ksm_memory_callback(struct notifier_block *self,
332490bd6fd3SPetr Holasek 			       unsigned long action, void *arg)
332590bd6fd3SPetr Holasek {
3326ef4d43a8SHugh Dickins 	struct memory_notify *mn = arg;
332790bd6fd3SPetr Holasek 
3328cbf86cfeSHugh Dickins 	switch (action) {
332990bd6fd3SPetr Holasek 	case MEM_GOING_OFFLINE:
3330ef53d16cSHugh Dickins 		/*
3331ef53d16cSHugh Dickins 		 * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
3332ef53d16cSHugh Dickins 		 * and remove_all_stable_nodes() while memory is going offline:
3333ef53d16cSHugh Dickins 		 * it is unsafe for them to touch the stable tree at this time.
3334ef53d16cSHugh Dickins 		 * But unmerge_ksm_pages(), rmap lookups and other entry points
3335ef53d16cSHugh Dickins 		 * which do not need the ksm_thread_mutex are all safe.
3336ef53d16cSHugh Dickins 		 */
3337ef53d16cSHugh Dickins 		mutex_lock(&ksm_thread_mutex);
3338ef53d16cSHugh Dickins 		ksm_run |= KSM_RUN_OFFLINE;
3339bafe1e14SJoe Perches 		mutex_unlock(&ksm_thread_mutex);
3340bafe1e14SJoe Perches 		break;
3341ef53d16cSHugh Dickins 
3342ef53d16cSHugh Dickins 	case MEM_OFFLINE:
3343ef53d16cSHugh Dickins 		/*
3344ef53d16cSHugh Dickins 		 * Most of the work is done by page migration; but there might
3345ef53d16cSHugh Dickins 		 * be a few stable_nodes left over, still pointing to struct
3346ef53d16cSHugh Dickins 		 * pages which have been offlined: prune those from the tree,
3347ef53d16cSHugh Dickins 		 * otherwise get_ksm_page() might later try to access a
3348ef53d16cSHugh Dickins 		 * non-existent struct page.
3349ef53d16cSHugh Dickins 		 */
3350ef53d16cSHugh Dickins 		ksm_check_stable_tree(mn->start_pfn,
3351ef53d16cSHugh Dickins 				      mn->start_pfn + mn->nr_pages);
335290bd6fd3SPetr Holasek 		fallthrough;
3353ef53d16cSHugh Dickins 	case MEM_CANCEL_OFFLINE:
3354ef53d16cSHugh Dickins 		mutex_lock(&ksm_thread_mutex);
335590bd6fd3SPetr Holasek 		ksm_run &= ~KSM_RUN_OFFLINE;
335690bd6fd3SPetr Holasek 		mutex_unlock(&ksm_thread_mutex);
335790bd6fd3SPetr Holasek 
335890bd6fd3SPetr Holasek 		smp_mb();	/* wake_up_bit advises this */
335990bd6fd3SPetr Holasek 		wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
336090bd6fd3SPetr Holasek 		break;
336190bd6fd3SPetr Holasek 	}
336290bd6fd3SPetr Holasek 	return NOTIFY_OK;
3363e86c59b1SClaudio Imbrenda }
3364e86c59b1SClaudio Imbrenda #else
3365e86c59b1SClaudio Imbrenda static void wait_while_offlining(void)
3366ae7a927dSJoe Perches {
3367e86c59b1SClaudio Imbrenda }
3368e86c59b1SClaudio Imbrenda #endif /* CONFIG_MEMORY_HOTREMOVE */
3369e86c59b1SClaudio Imbrenda 
3370e86c59b1SClaudio Imbrenda #ifdef CONFIG_PROC_FS
3371e86c59b1SClaudio Imbrenda long ksm_process_profit(struct mm_struct *mm)
3372e86c59b1SClaudio Imbrenda {
3373e86c59b1SClaudio Imbrenda 	return (long)(mm->ksm_merging_pages + mm->ksm_zero_pages) * PAGE_SIZE -
3374e86c59b1SClaudio Imbrenda 		mm->ksm_rmap_items * sizeof(struct ksm_rmap_item);
3375e86c59b1SClaudio Imbrenda }
3376e86c59b1SClaudio Imbrenda #endif /* CONFIG_PROC_FS */
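/*
 * Worked example of the per-mm profit formula above (hypothetical numbers;
 * sizeof(struct ksm_rmap_item) is roughly 64 bytes on 64-bit builds):
 * 1000 merged pages + 200 shared zero pages with 4 KiB pages and 5000
 * rmap_items gives
 *	(1000 + 200) * 4096 - 5000 * 64 = 4915200 - 320000 = 4595200 bytes,
 * i.e. about 4.4 MiB of net saving for that process.
 */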
3377e86c59b1SClaudio Imbrenda 
3378e86c59b1SClaudio Imbrenda #ifdef CONFIG_SYSFS
3379e86c59b1SClaudio Imbrenda /*
3380e86c59b1SClaudio Imbrenda  * This all compiles without CONFIG_SYSFS, but is a waste of space.
3381e86c59b1SClaudio Imbrenda  */
3382e86c59b1SClaudio Imbrenda 
3383e86c59b1SClaudio Imbrenda #define KSM_ATTR_RO(_name) \
3384e86c59b1SClaudio Imbrenda 	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
33852c653d0eSAndrea Arcangeli #define KSM_ATTR(_name) \
33862c653d0eSAndrea Arcangeli 	static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
33872c653d0eSAndrea Arcangeli 
3388ae7a927dSJoe Perches static ssize_t sleep_millisecs_show(struct kobject *kobj,
33892c653d0eSAndrea Arcangeli 				    struct kobj_attribute *attr, char *buf)
33902c653d0eSAndrea Arcangeli {
33912c653d0eSAndrea Arcangeli 	return sysfs_emit(buf, "%u\n", ksm_thread_sleep_millisecs);
33922c653d0eSAndrea Arcangeli }
33932c653d0eSAndrea Arcangeli 
33942c653d0eSAndrea Arcangeli static ssize_t sleep_millisecs_store(struct kobject *kobj,
33952c653d0eSAndrea Arcangeli 				     struct kobj_attribute *attr,
33962c653d0eSAndrea Arcangeli 				     const char *buf, size_t count)
33972c653d0eSAndrea Arcangeli {
33982c653d0eSAndrea Arcangeli 	unsigned int msecs;
33992c653d0eSAndrea Arcangeli 	int err;
34002c653d0eSAndrea Arcangeli 
34012c653d0eSAndrea Arcangeli 	err = kstrtouint(buf, 10, &msecs);
34022c653d0eSAndrea Arcangeli 	if (err)
34032c653d0eSAndrea Arcangeli 		return -EINVAL;
34042c653d0eSAndrea Arcangeli 
34052c653d0eSAndrea Arcangeli 	ksm_thread_sleep_millisecs = msecs;
34062c653d0eSAndrea Arcangeli 	wake_up_interruptible(&ksm_iter_wait);
34072c653d0eSAndrea Arcangeli 
34082c653d0eSAndrea Arcangeli 	return count;
34092c653d0eSAndrea Arcangeli }
34102c653d0eSAndrea Arcangeli KSM_ATTR(sleep_millisecs);
34112c653d0eSAndrea Arcangeli 
34122c653d0eSAndrea Arcangeli static ssize_t pages_to_scan_show(struct kobject *kobj,
34132c653d0eSAndrea Arcangeli 				  struct kobj_attribute *attr, char *buf)
34142c653d0eSAndrea Arcangeli {
34152c653d0eSAndrea Arcangeli 	return sysfs_emit(buf, "%u\n", ksm_thread_pages_to_scan);
34162c653d0eSAndrea Arcangeli }
34172c653d0eSAndrea Arcangeli 
34182c653d0eSAndrea Arcangeli static ssize_t pages_to_scan_store(struct kobject *kobj,
34192c653d0eSAndrea Arcangeli 				   struct kobj_attribute *attr,
34202c653d0eSAndrea Arcangeli 				   const char *buf, size_t count)
34212c653d0eSAndrea Arcangeli {
34222c653d0eSAndrea Arcangeli 	unsigned int nr_pages;
34232c653d0eSAndrea Arcangeli 	int err;
34242c653d0eSAndrea Arcangeli 
34252c653d0eSAndrea Arcangeli 	if (ksm_advisor != KSM_ADVISOR_NONE)
3426b348b5feSStefan Roesch 		return -EINVAL;
3427b348b5feSStefan Roesch 
3428b348b5feSStefan Roesch 	err = kstrtouint(buf, 10, &nr_pages);
3429b348b5feSStefan Roesch 	if (err)
3430b348b5feSStefan Roesch 		return -EINVAL;
3431b348b5feSStefan Roesch 
3432b348b5feSStefan Roesch 	ksm_thread_pages_to_scan = nr_pages;
3433b4028260SHugh Dickins 
3434b4028260SHugh Dickins 	return count;
3435b4028260SHugh Dickins }
3436ae7a927dSJoe Perches KSM_ATTR(pages_to_scan);
3437b4028260SHugh Dickins 
3438b4028260SHugh Dickins static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
3439b4028260SHugh Dickins 			char *buf)
3440b4028260SHugh Dickins {
3441b4028260SHugh Dickins 	return sysfs_emit(buf, "%lu\n", ksm_run);
3442b4028260SHugh Dickins }
3443ae7a927dSJoe Perches 
3444b4028260SHugh Dickins static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
3445b4028260SHugh Dickins 			 const char *buf, size_t count)
3446b4028260SHugh Dickins {
3447473b0ce4SHugh Dickins 	unsigned int flags;
3448473b0ce4SHugh Dickins 	int err;
3449473b0ce4SHugh Dickins 
3450ae7a927dSJoe Perches 	err = kstrtouint(buf, 10, &flags);
3451473b0ce4SHugh Dickins 	if (err)
3452473b0ce4SHugh Dickins 		return -EINVAL;
3453473b0ce4SHugh Dickins 	if (flags > KSM_RUN_UNMERGE)
3454473b0ce4SHugh Dickins 		return -EINVAL;
3455473b0ce4SHugh Dickins 
3456473b0ce4SHugh Dickins 	/*
3457473b0ce4SHugh Dickins 	 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
3458473b0ce4SHugh Dickins 	 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
3459473b0ce4SHugh Dickins 	 * breaking COW to free the pages_shared (but leaves mm_slots
3460473b0ce4SHugh Dickins 	 * on the list for when ksmd may be set running again).
3461473b0ce4SHugh Dickins 	 */
3462473b0ce4SHugh Dickins 
3463473b0ce4SHugh Dickins 	mutex_lock(&ksm_thread_mutex);
3464473b0ce4SHugh Dickins 	wait_while_offlining();
3465473b0ce4SHugh Dickins 	if (ksm_run != flags) {
3466473b0ce4SHugh Dickins 		ksm_run = flags;
3467ae7a927dSJoe Perches 		if (flags & KSM_RUN_UNMERGE) {
3468473b0ce4SHugh Dickins 			set_current_oom_origin();
3469473b0ce4SHugh Dickins 			err = unmerge_and_remove_all_rmap_items();
3470473b0ce4SHugh Dickins 			clear_current_oom_origin();
3471e5a68991SStefan Roesch 			if (err) {
3472e5a68991SStefan Roesch 				ksm_run = KSM_RUN_STOP;
3473e5a68991SStefan Roesch 				count = err;
3474e5a68991SStefan Roesch 			}
3475e5a68991SStefan Roesch 		}
3476e5a68991SStefan Roesch 	}
3477e5a68991SStefan Roesch 	mutex_unlock(&ksm_thread_mutex);
3478e2942062Sxu xin 
3479e2942062Sxu xin 	if (flags & KSM_RUN_MERGE)
3480e2942062Sxu xin 		wake_up_interruptible(&ksm_thread_wait);
3481e2942062Sxu xin 
3482e2942062Sxu xin 	return count;
3483e2942062Sxu xin }
3484e2942062Sxu xin KSM_ATTR(run);
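/*
 * Usage sketch for the run knob defined above, assuming the usual sysfs
 * path /sys/kernel/mm/ksm/, mirroring KSM_RUN_STOP/MERGE/UNMERGE:
 *
 *	echo 1 > /sys/kernel/mm/ksm/run		start ksmd
 *	echo 0 > /sys/kernel/mm/ksm/run		stop ksmd, keep merged pages
 *	echo 2 > /sys/kernel/mm/ksm/run		stop ksmd and unmerge everything
 */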
3485d21077fbSStefan Roesch 
3486d21077fbSStefan Roesch #ifdef CONFIG_NUMA
3487d21077fbSStefan Roesch static ssize_t merge_across_nodes_show(struct kobject *kobj,
3488d21077fbSStefan Roesch 				       struct kobj_attribute *attr, char *buf)
3489d21077fbSStefan Roesch {
34901a8e8430Sxu xin 	return sysfs_emit(buf, "%u\n", ksm_merge_across_nodes);
3491d21077fbSStefan Roesch }
3492d21077fbSStefan Roesch 
3493d21077fbSStefan Roesch static ssize_t merge_across_nodes_store(struct kobject *kobj,
3494d21077fbSStefan Roesch 				   struct kobj_attribute *attr,
3495d21077fbSStefan Roesch 				   const char *buf, size_t count)
3496d21077fbSStefan Roesch {
34972c653d0eSAndrea Arcangeli 	int err;
34982c653d0eSAndrea Arcangeli 	unsigned long knob;
34992c653d0eSAndrea Arcangeli 
3500ae7a927dSJoe Perches 	err = kstrtoul(buf, 10, &knob);
35012c653d0eSAndrea Arcangeli 	if (err)
35022c653d0eSAndrea Arcangeli 		return err;
35032c653d0eSAndrea Arcangeli 	if (knob > 1)
35042c653d0eSAndrea Arcangeli 		return -EINVAL;
35052c653d0eSAndrea Arcangeli 
35062c653d0eSAndrea Arcangeli 	mutex_lock(&ksm_thread_mutex);
3507ae7a927dSJoe Perches 	wait_while_offlining();
35082c653d0eSAndrea Arcangeli 	if (ksm_merge_across_nodes != knob) {
35092c653d0eSAndrea Arcangeli 		if (ksm_pages_shared || remove_all_stable_nodes())
35102c653d0eSAndrea Arcangeli 			err = -EBUSY;
35112c653d0eSAndrea Arcangeli 		else if (root_stable_tree == one_stable_tree) {
35122c653d0eSAndrea Arcangeli 			struct rb_root *buf;
35132c653d0eSAndrea Arcangeli 			/*
35142c653d0eSAndrea Arcangeli 			 * This is the first time that we switch away from the
35152c653d0eSAndrea Arcangeli 			 * default of merging across nodes: must now allocate
3516ae7a927dSJoe Perches 			 * a buffer to hold as many roots as may be needed.
35172c653d0eSAndrea Arcangeli 			 * Allocate stable and unstable together:
35182c653d0eSAndrea Arcangeli 			 * MAXSMP NODES_SHIFT 10 will use 16kB.
35192c653d0eSAndrea Arcangeli 			 */
35202c653d0eSAndrea Arcangeli 			buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
35212c653d0eSAndrea Arcangeli 				      GFP_KERNEL);
35222c653d0eSAndrea Arcangeli 			/* Let us assume that RB_ROOT is NULL, i.e. zero */
35232c653d0eSAndrea Arcangeli 			if (!buf)
3524584ff0dfSZhansaya Bagdauletkyzy 				err = -ENOMEM;
35252c653d0eSAndrea Arcangeli 			else {
35262c653d0eSAndrea Arcangeli 				root_stable_tree = buf;
3527584ff0dfSZhansaya Bagdauletkyzy 				root_unstable_tree = buf + nr_node_ids;
3528584ff0dfSZhansaya Bagdauletkyzy 				/* Stable tree is empty but not the unstable */
35292c653d0eSAndrea Arcangeli 				root_unstable_tree[0] = one_unstable_tree[0];
35302c653d0eSAndrea Arcangeli 			}
35312c653d0eSAndrea Arcangeli 		}
35322c653d0eSAndrea Arcangeli 		if (!err) {
35332c653d0eSAndrea Arcangeli 			ksm_merge_across_nodes = knob;
35342c653d0eSAndrea Arcangeli 			ksm_nr_node_ids = knob ? 1 : nr_node_ids;
35352c653d0eSAndrea Arcangeli 		}
35362c653d0eSAndrea Arcangeli 	}
3537473b0ce4SHugh Dickins 	mutex_unlock(&ksm_thread_mutex);
3538473b0ce4SHugh Dickins 
3539473b0ce4SHugh Dickins 	return err ? err : count;
3540ae7a927dSJoe Perches }
3541473b0ce4SHugh Dickins KSM_ATTR(merge_across_nodes);
3542473b0ce4SHugh Dickins #endif
3543473b0ce4SHugh Dickins 
35445e924ff5SStefan Roesch static ssize_t use_zero_pages_show(struct kobject *kobj,
35455e924ff5SStefan Roesch 				   struct kobj_attribute *attr, char *buf)
35465e924ff5SStefan Roesch {
35475e924ff5SStefan Roesch 	return sysfs_emit(buf, "%u\n", ksm_use_zero_pages);
35485e924ff5SStefan Roesch }
35495e924ff5SStefan Roesch static ssize_t use_zero_pages_store(struct kobject *kobj,
35505e924ff5SStefan Roesch 				   struct kobj_attribute *attr,
35515e924ff5SStefan Roesch 				   const char *buf, size_t count)
35525e924ff5SStefan Roesch {
35535e924ff5SStefan Roesch 	int err;
35545e924ff5SStefan Roesch 	bool value;
35555e924ff5SStefan Roesch 
35565e924ff5SStefan Roesch 	err = kstrtobool(buf, &value);
35575e924ff5SStefan Roesch 	if (err)
35585e924ff5SStefan Roesch 		return -EINVAL;
35595e924ff5SStefan Roesch 
35605e924ff5SStefan Roesch 	ksm_use_zero_pages = value;
35615e924ff5SStefan Roesch 
35625e924ff5SStefan Roesch 	return count;
35635e924ff5SStefan Roesch }
35645e924ff5SStefan Roesch KSM_ATTR(use_zero_pages);
35655e924ff5SStefan Roesch 
356631dbd01fSIzik Eidus static ssize_t max_page_sharing_show(struct kobject *kobj,
356731dbd01fSIzik Eidus 				     struct kobj_attribute *attr, char *buf)
356831dbd01fSIzik Eidus {
356931dbd01fSIzik Eidus 	return sysfs_emit(buf, "%u\n", ksm_max_page_sharing);
3570b348b5feSStefan Roesch }
3571b4028260SHugh Dickins 
3572b4028260SHugh Dickins static ssize_t max_page_sharing_store(struct kobject *kobj,
3573473b0ce4SHugh Dickins 				      struct kobj_attribute *attr,
3574473b0ce4SHugh Dickins 				      const char *buf, size_t count)
3575e5a68991SStefan Roesch {
3576e2942062Sxu xin 	int err;
3577473b0ce4SHugh Dickins 	int knob;
357890bd6fd3SPetr Holasek 
357990bd6fd3SPetr Holasek 	err = kstrtoint(buf, 10, &knob);
358090bd6fd3SPetr Holasek 	if (err)
35812c653d0eSAndrea Arcangeli 		return err;
35822c653d0eSAndrea Arcangeli 	/*
35832c653d0eSAndrea Arcangeli 	 * When a KSM page is created it is shared by 2 mappings. This
35842c653d0eSAndrea Arcangeli 	 * being a signed comparison, it implicitly verifies it's not
3585e86c59b1SClaudio Imbrenda 	 * negative.
3586d21077fbSStefan Roesch 	 */
35875e924ff5SStefan Roesch 	if (knob < 2)
358831dbd01fSIzik Eidus 		return -EINVAL;
358931dbd01fSIzik Eidus 
359031dbd01fSIzik Eidus 	if (READ_ONCE(ksm_max_page_sharing) == knob)
3591f907c26aSArvind Yadav 		return count;
359231dbd01fSIzik Eidus 
359331dbd01fSIzik Eidus 	mutex_lock(&ksm_thread_mutex);
359431dbd01fSIzik Eidus 	wait_while_offlining();
35952ffd8679SHugh Dickins 	if (ksm_max_page_sharing != knob) {
359631dbd01fSIzik Eidus 		if (ksm_pages_shared || remove_all_stable_nodes())
359731dbd01fSIzik Eidus 			err = -EBUSY;
359831dbd01fSIzik Eidus 		else
359931dbd01fSIzik Eidus 			ksm_max_page_sharing = knob;
360031dbd01fSIzik Eidus 	}
360131dbd01fSIzik Eidus 	mutex_unlock(&ksm_thread_mutex);
3602e86c59b1SClaudio Imbrenda 
3603e86c59b1SClaudio Imbrenda 	return err ? err : count;
3604e86c59b1SClaudio Imbrenda }
3605e86c59b1SClaudio Imbrenda KSM_ATTR(max_page_sharing);
3606e86c59b1SClaudio Imbrenda 
360731dbd01fSIzik Eidus static ssize_t pages_scanned_show(struct kobject *kobj,
360831dbd01fSIzik Eidus 				  struct kobj_attribute *attr, char *buf)
360931dbd01fSIzik Eidus {
361031dbd01fSIzik Eidus 	return sysfs_emit(buf, "%lu\n", ksm_pages_scanned);
361131dbd01fSIzik Eidus }
361231dbd01fSIzik Eidus KSM_ATTR_RO(pages_scanned);
361325acde31SPaul McQuade 
361431dbd01fSIzik Eidus static ssize_t pages_shared_show(struct kobject *kobj,
3615d9f8984cSLai Jiangshan 				 struct kobj_attribute *attr, char *buf)
361631dbd01fSIzik Eidus {
361731dbd01fSIzik Eidus 	return sysfs_emit(buf, "%lu\n", ksm_pages_shared);
36182ffd8679SHugh Dickins }
361931dbd01fSIzik Eidus KSM_ATTR_RO(pages_shared);
362031dbd01fSIzik Eidus 
362125acde31SPaul McQuade static ssize_t pages_sharing_show(struct kobject *kobj,
36222ffd8679SHugh Dickins 				  struct kobj_attribute *attr, char *buf)
3623d9f8984cSLai Jiangshan {
362431dbd01fSIzik Eidus 	return sysfs_emit(buf, "%lu\n", ksm_pages_sharing);
3625c73602adSHugh Dickins }
3626c73602adSHugh Dickins KSM_ATTR_RO(pages_sharing);
3627c73602adSHugh Dickins 
36282ffd8679SHugh Dickins static ssize_t pages_unshared_show(struct kobject *kobj,
362931dbd01fSIzik Eidus 				   struct kobj_attribute *attr, char *buf)
363062b61f61SHugh Dickins {
3631ef4d43a8SHugh Dickins 	return sysfs_emit(buf, "%lu\n", ksm_pages_unshared);
36321eeaa4fdSLiu Shixin }
363362b61f61SHugh Dickins KSM_ATTR_RO(pages_unshared);
363431dbd01fSIzik Eidus 
363531dbd01fSIzik Eidus static ssize_t pages_volatile_show(struct kobject *kobj,
3636d9f8984cSLai Jiangshan 				   struct kobj_attribute *attr, char *buf)
363731dbd01fSIzik Eidus {
363831dbd01fSIzik Eidus 	long ksm_pages_volatile;
363931dbd01fSIzik Eidus 
364031dbd01fSIzik Eidus 	ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
3641a64fb3cdSPaul Gortmaker 				- ksm_pages_sharing - ksm_pages_unshared;
3642 	/*
3643 	 * It was not worth any locking to calculate that statistic,
3644 	 * but it might therefore sometimes be negative: conceal that.
3645 	 */
3646 	if (ksm_pages_volatile < 0)
3647 		ksm_pages_volatile = 0;
3648 	return sysfs_emit(buf, "%ld\n", ksm_pages_volatile);
3649 }
3650 KSM_ATTR_RO(pages_volatile);
3651 
3652 static ssize_t pages_skipped_show(struct kobject *kobj,
3653 				  struct kobj_attribute *attr, char *buf)
3654 {
3655 	return sysfs_emit(buf, "%lu\n", ksm_pages_skipped);
3656 }
3657 KSM_ATTR_RO(pages_skipped);
3658 
3659 static ssize_t ksm_zero_pages_show(struct kobject *kobj,
3660 				struct kobj_attribute *attr, char *buf)
3661 {
3662 	return sysfs_emit(buf, "%ld\n", ksm_zero_pages);
3663 }
3664 KSM_ATTR_RO(ksm_zero_pages);
3665 
3666 static ssize_t general_profit_show(struct kobject *kobj,
3667 				   struct kobj_attribute *attr, char *buf)
3668 {
3669 	long general_profit;
3670 
3671 	general_profit = (ksm_pages_sharing + ksm_zero_pages) * PAGE_SIZE -
3672 				ksm_rmap_items * sizeof(struct ksm_rmap_item);
3673 
3674 	return sysfs_emit(buf, "%ld\n", general_profit);
3675 }
3676 KSM_ATTR_RO(general_profit);
3677 
3678 static ssize_t stable_node_dups_show(struct kobject *kobj,
3679 				     struct kobj_attribute *attr, char *buf)
3680 {
3681 	return sysfs_emit(buf, "%lu\n", ksm_stable_node_dups);
3682 }
3683 KSM_ATTR_RO(stable_node_dups);
3684 
3685 static ssize_t stable_node_chains_show(struct kobject *kobj,
3686 				       struct kobj_attribute *attr, char *buf)
3687 {
3688 	return sysfs_emit(buf, "%lu\n", ksm_stable_node_chains);
3689 }
3690 KSM_ATTR_RO(stable_node_chains);
3691 
3692 static ssize_t
3693 stable_node_chains_prune_millisecs_show(struct kobject *kobj,
3694 					struct kobj_attribute *attr,
3695 					char *buf)
3696 {
3697 	return sysfs_emit(buf, "%u\n", ksm_stable_node_chains_prune_millisecs);
3698 }
3699 
3700 static ssize_t
3701 stable_node_chains_prune_millisecs_store(struct kobject *kobj,
3702 					 struct kobj_attribute *attr,
3703 					 const char *buf, size_t count)
3704 {
3705 	unsigned int msecs;
3706 	int err;
3707 
3708 	err = kstrtouint(buf, 10, &msecs);
3709 	if (err)
3710 		return -EINVAL;
3711 
3712 	ksm_stable_node_chains_prune_millisecs = msecs;
3713 
3714 	return count;
3715 }
3716 KSM_ATTR(stable_node_chains_prune_millisecs);
3717 
3718 static ssize_t full_scans_show(struct kobject *kobj,
3719 			       struct kobj_attribute *attr, char *buf)
3720 {
3721 	return sysfs_emit(buf, "%lu\n", ksm_scan.seqnr);
3722 }
3723 KSM_ATTR_RO(full_scans);
3724 
3725 static ssize_t smart_scan_show(struct kobject *kobj,
3726 			       struct kobj_attribute *attr, char *buf)
3727 {
3728 	return sysfs_emit(buf, "%u\n", ksm_smart_scan);
3729 }
3730 
3731 static ssize_t smart_scan_store(struct kobject *kobj,
3732 				struct kobj_attribute *attr,
3733 				const char *buf, size_t count)
3734 {
3735 	int err;
3736 	bool value;
3737 
3738 	err = kstrtobool(buf, &value);
3739 	if (err)
3740 		return -EINVAL;
3741 
3742 	ksm_smart_scan = value;
3743 	return count;
3744 }
3745 KSM_ATTR(smart_scan);
3746 
3747 static ssize_t advisor_mode_show(struct kobject *kobj,
3748 				 struct kobj_attribute *attr, char *buf)
3749 {
3750 	const char *output;
3751 
3752 	if (ksm_advisor == KSM_ADVISOR_NONE)
3753 		output = "[none] scan-time";
3754 	else if (ksm_advisor == KSM_ADVISOR_SCAN_TIME)
3755 		output = "none [scan-time]";
3756 
3757 	return sysfs_emit(buf, "%s\n", output);
3758 }
3759 
3760 static ssize_t advisor_mode_store(struct kobject *kobj,
3761 				  struct kobj_attribute *attr, const char *buf,
3762 				  size_t count)
3763 {
3764 	enum ksm_advisor_type curr_advisor = ksm_advisor;
3765 
3766 	if (sysfs_streq("scan-time", buf))
3767 		ksm_advisor = KSM_ADVISOR_SCAN_TIME;
3768 	else if (sysfs_streq("none", buf))
3769 		ksm_advisor = KSM_ADVISOR_NONE;
3770 	else
3771 		return -EINVAL;
3772 
3773 	/* Set advisor default values */
3774 	if (curr_advisor != ksm_advisor)
3775 		set_advisor_defaults();
3776 
3777 	return count;
3778 }
3779 KSM_ATTR(advisor_mode);
3780 
3781 static ssize_t advisor_max_cpu_show(struct kobject *kobj,
3782 				    struct kobj_attribute *attr, char *buf)
3783 {
3784 	return sysfs_emit(buf, "%u\n", ksm_advisor_max_cpu);
3785 }
3786 
3787 static ssize_t advisor_max_cpu_store(struct kobject *kobj,
3788 				     struct kobj_attribute *attr,
3789 				     const char *buf, size_t count)
3790 {
3791 	int err;
3792 	unsigned long value;
3793 
3794 	err = kstrtoul(buf, 10, &value);
3795 	if (err)
3796 		return -EINVAL;
3797 
3798 	ksm_advisor_max_cpu = value;
3799 	return count;
3800 }
3801 KSM_ATTR(advisor_max_cpu);
3802 
3803 static ssize_t advisor_min_pages_to_scan_show(struct kobject *kobj,
3804 					struct kobj_attribute *attr, char *buf)
3805 {
3806 	return sysfs_emit(buf, "%lu\n", ksm_advisor_min_pages_to_scan);
3807 }
3808 
3809 static ssize_t advisor_min_pages_to_scan_store(struct kobject *kobj,
3810 					struct kobj_attribute *attr,
3811 					const char *buf, size_t count)
3812 {
3813 	int err;
3814 	unsigned long value;
3815 
3816 	err = kstrtoul(buf, 10, &value);
3817 	if (err)
3818 		return -EINVAL;
3819 
3820 	ksm_advisor_min_pages_to_scan = value;
3821 	return count;
3822 }
3823 KSM_ATTR(advisor_min_pages_to_scan);
3824 
3825 static ssize_t advisor_max_pages_to_scan_show(struct kobject *kobj,
3826 					struct kobj_attribute *attr, char *buf)
3827 {
3828 	return sysfs_emit(buf, "%lu\n", ksm_advisor_max_pages_to_scan);
3829 }
3830 
3831 static ssize_t advisor_max_pages_to_scan_store(struct kobject *kobj,
3832 					struct kobj_attribute *attr,
3833 					const char *buf, size_t count)
3834 {
3835 	int err;
3836 	unsigned long value;
3837 
3838 	err = kstrtoul(buf, 10, &value);
3839 	if (err)
3840 		return -EINVAL;
3841 
3842 	ksm_advisor_max_pages_to_scan = value;
3843 	return count;
3844 }
3845 KSM_ATTR(advisor_max_pages_to_scan);
3846 
3847 static ssize_t advisor_target_scan_time_show(struct kobject *kobj,
3848 					     struct kobj_attribute *attr, char *buf)
3849 {
3850 	return sysfs_emit(buf, "%lu\n", ksm_advisor_target_scan_time);
3851 }
3852 
3853 static ssize_t advisor_target_scan_time_store(struct kobject *kobj,
3854 					      struct kobj_attribute *attr,
3855 					      const char *buf, size_t count)
3856 {
3857 	int err;
3858 	unsigned long value;
3859 
3860 	err = kstrtoul(buf, 10, &value);
3861 	if (err)
3862 		return -EINVAL;
3863 	if (value < 1)
3864 		return -EINVAL;
3865 
3866 	ksm_advisor_target_scan_time = value;
3867 	return count;
3868 }
3869 KSM_ATTR(advisor_target_scan_time);
3870 
3871 static struct attribute *ksm_attrs[] = {
3872 	&sleep_millisecs_attr.attr,
3873 	&pages_to_scan_attr.attr,
3874 	&run_attr.attr,
3875 	&pages_scanned_attr.attr,
3876 	&pages_shared_attr.attr,
3877 	&pages_sharing_attr.attr,
3878 	&pages_unshared_attr.attr,
3879 	&pages_volatile_attr.attr,
3880 	&pages_skipped_attr.attr,
3881 	&ksm_zero_pages_attr.attr,
3882 	&full_scans_attr.attr,
3883 #ifdef CONFIG_NUMA
3884 	&merge_across_nodes_attr.attr,
3885 #endif
3886 	&max_page_sharing_attr.attr,
3887 	&stable_node_chains_attr.attr,
3888 	&stable_node_dups_attr.attr,
3889 	&stable_node_chains_prune_millisecs_attr.attr,
3890 	&use_zero_pages_attr.attr,
3891 	&general_profit_attr.attr,
3892 	&smart_scan_attr.attr,
3893 	&advisor_mode_attr.attr,
3894 	&advisor_max_cpu_attr.attr,
3895 	&advisor_min_pages_to_scan_attr.attr,
3896 	&advisor_max_pages_to_scan_attr.attr,
3897 	&advisor_target_scan_time_attr.attr,
3898 	NULL,
3899 };
3900 
3901 static const struct attribute_group ksm_attr_group = {
3902 	.attrs = ksm_attrs,
3903 	.name = "ksm",
3904 };
3905 #endif /* CONFIG_SYSFS */
3906 
3907 static int __init ksm_init(void)
3908 {
3909 	struct task_struct *ksm_thread;
3910 	int err;
3911 
3912 	/* The correct value depends on page size and endianness */
3913 	zero_checksum = calc_checksum(ZERO_PAGE(0));
3914 	/* Default to false for backwards compatibility */
3915 	ksm_use_zero_pages = false;
3916 
3917 	err = ksm_slab_init();
3918 	if (err)
3919 		goto out;
3920 
3921 	ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
3922 	if (IS_ERR(ksm_thread)) {
3923 		pr_err("ksm: creating kthread failed\n");
3924 		err = PTR_ERR(ksm_thread);
3925 		goto out_free;
3926 	}
3927 
3928 #ifdef CONFIG_SYSFS
3929 	err = sysfs_create_group(mm_kobj, &ksm_attr_group);
3930 	if (err) {
3931 		pr_err("ksm: register sysfs failed\n");
3932 		kthread_stop(ksm_thread);
3933 		goto out_free;
3934 	}
3935 #else
3936 	ksm_run = KSM_RUN_MERGE;	/* no way for user to start it */
3937 
3938 #endif /* CONFIG_SYSFS */
3939 
3940 #ifdef CONFIG_MEMORY_HOTREMOVE
3941 	/* There is no particular significance to this notifier priority */
3942 	hotplug_memory_notifier(ksm_memory_callback, KSM_CALLBACK_PRI);
3943 #endif
3944 	return 0;
3945 
3946 out_free:
3947 	ksm_slab_free();
3948 out:
3949 	return err;
3950 }
3951 subsys_initcall(ksm_init);
3952