17a338472SThomas Gleixner // SPDX-License-Identifier: GPL-2.0-only
2f8af4da3SHugh Dickins /*
331dbd01fSIzik Eidus * Memory merging support.
431dbd01fSIzik Eidus *
531dbd01fSIzik Eidus * This code enables dynamic sharing of identical pages found in different
631dbd01fSIzik Eidus * memory areas, even if they are not shared by fork()
731dbd01fSIzik Eidus *
836b2528dSIzik Eidus * Copyright (C) 2008-2009 Red Hat, Inc.
931dbd01fSIzik Eidus * Authors:
1031dbd01fSIzik Eidus * Izik Eidus
1131dbd01fSIzik Eidus * Andrea Arcangeli
1231dbd01fSIzik Eidus * Chris Wright
1336b2528dSIzik Eidus * Hugh Dickins
14f8af4da3SHugh Dickins */
15f8af4da3SHugh Dickins
16f8af4da3SHugh Dickins #include <linux/errno.h>
1731dbd01fSIzik Eidus #include <linux/mm.h>
1836090defSArnd Bergmann #include <linux/mm_inline.h>
1931dbd01fSIzik Eidus #include <linux/fs.h>
20f8af4da3SHugh Dickins #include <linux/mman.h>
2131dbd01fSIzik Eidus #include <linux/sched.h>
226e84f315SIngo Molnar #include <linux/sched/mm.h>
23f7ccbae4SIngo Molnar #include <linux/sched/coredump.h>
2431dbd01fSIzik Eidus #include <linux/sched/cputime.h>
2531dbd01fSIzik Eidus #include <linux/rwsem.h>
2631dbd01fSIzik Eidus #include <linux/pagemap.h>
2731dbd01fSIzik Eidus #include <linux/rmap.h>
2859e1a2f4STimofey Titovets #include <linux/spinlock.h>
2931dbd01fSIzik Eidus #include <linux/xxhash.h>
3031dbd01fSIzik Eidus #include <linux/delay.h>
3131dbd01fSIzik Eidus #include <linux/kthread.h>
3231dbd01fSIzik Eidus #include <linux/wait.h>
3331dbd01fSIzik Eidus #include <linux/slab.h>
3462b61f61SHugh Dickins #include <linux/rbtree.h>
3531dbd01fSIzik Eidus #include <linux/memory.h>
362c6854fdSIzik Eidus #include <linux/mmu_notifier.h>
37f8af4da3SHugh Dickins #include <linux/swap.h>
384ca3a69bSSasha Levin #include <linux/ksm.h>
39878aee7dSAndrea Arcangeli #include <linux/hashtable.h>
4072788c38SDavid Rientjes #include <linux/freezer.h>
4190bd6fd3SPetr Holasek #include <linux/oom.h>
42d7c0e68dSDavid Hildenbrand #include <linux/numa.h>
43f8af4da3SHugh Dickins #include <linux/pagewalk.h>
4431dbd01fSIzik Eidus
4573848b46SHugh Dickins #include <asm/tlbflush.h>
4658730ab6SQi Zheng #include "internal.h"
4731dbd01fSIzik Eidus #include "mm_slot.h"
48739100c8SStefan Roesch
49739100c8SStefan Roesch #define CREATE_TRACE_POINTS
50739100c8SStefan Roesch #include <trace/events/ksm.h>
51e850dcf5SHugh Dickins
#ifdef CONFIG_NUMA
/* Pass through NUMA-specific expressions only when NUMA is configured. */
#define NUMA(x)		(x)
#define DO_NUMA(x)	do { (x); } while (0)
#else
/* Without NUMA there is a single node: ids collapse to 0, statements vanish. */
#define NUMA(x)		(0)
#define DO_NUMA(x)	do { } while (0)
#endif

/* Small saturating counter type for per-rmap_item scan ages/skips. */
typedef u8 rmap_age_t;
615a2ca3efSMike Rapoport
625a2ca3efSMike Rapoport /**
635a2ca3efSMike Rapoport * DOC: Overview
6431dbd01fSIzik Eidus *
6531dbd01fSIzik Eidus * A few notes about the KSM scanning process,
6631dbd01fSIzik Eidus * to make it easier to understand the data structures below:
6731dbd01fSIzik Eidus *
6831dbd01fSIzik Eidus * In order to reduce excessive scanning, KSM sorts the memory pages by their
6931dbd01fSIzik Eidus * contents into a data structure that holds pointers to the pages' locations.
7031dbd01fSIzik Eidus *
7131dbd01fSIzik Eidus * Since the contents of the pages may change at any moment, KSM cannot just
7231dbd01fSIzik Eidus * insert the pages into a normal sorted tree and expect it to find anything.
7331dbd01fSIzik Eidus * Therefore KSM uses two data structures - the stable and the unstable tree.
7431dbd01fSIzik Eidus *
7531dbd01fSIzik Eidus * The stable tree holds pointers to all the merged pages (ksm pages), sorted
7631dbd01fSIzik Eidus * by their contents. Because each such page is write-protected, searching on
7731dbd01fSIzik Eidus * this tree is fully assured to be working (except when pages are unmapped),
7831dbd01fSIzik Eidus * and therefore this tree is called the stable tree.
795a2ca3efSMike Rapoport *
805a2ca3efSMike Rapoport * The stable tree node includes information required for reverse
815a2ca3efSMike Rapoport * mapping from a KSM page to virtual addresses that map this page.
825a2ca3efSMike Rapoport *
835a2ca3efSMike Rapoport * In order to avoid large latencies of the rmap walks on KSM pages,
845a2ca3efSMike Rapoport * KSM maintains two types of nodes in the stable tree:
855a2ca3efSMike Rapoport *
865a2ca3efSMike Rapoport * * the regular nodes that keep the reverse mapping structures in a
875a2ca3efSMike Rapoport * linked list
885a2ca3efSMike Rapoport * * the "chains" that link nodes ("dups") that represent the same
895a2ca3efSMike Rapoport * write protected memory content, but each "dup" corresponds to a
905a2ca3efSMike Rapoport * different KSM page copy of that content
915a2ca3efSMike Rapoport *
9221fbd591SQi Zheng * Internally, the regular nodes, "dups" and "chains" are represented
935a2ca3efSMike Rapoport * using the same struct ksm_stable_node structure.
9431dbd01fSIzik Eidus *
9531dbd01fSIzik Eidus * In addition to the stable tree, KSM uses a second data structure called the
9631dbd01fSIzik Eidus * unstable tree: this tree holds pointers to pages which have been found to
9731dbd01fSIzik Eidus * be "unchanged for a period of time". The unstable tree sorts these pages
9831dbd01fSIzik Eidus * by their contents, but since they are not write-protected, KSM cannot rely
9931dbd01fSIzik Eidus * upon the unstable tree to work correctly - the unstable tree is liable to
10031dbd01fSIzik Eidus * be corrupted as its contents are modified, and so it is called unstable.
10131dbd01fSIzik Eidus *
10231dbd01fSIzik Eidus * KSM solves this problem by several techniques:
10331dbd01fSIzik Eidus *
10431dbd01fSIzik Eidus * 1) The unstable tree is flushed every time KSM completes scanning all
10531dbd01fSIzik Eidus * memory areas, and then the tree is rebuilt again from the beginning.
10631dbd01fSIzik Eidus * 2) KSM will only insert into the unstable tree, pages whose hash value
10731dbd01fSIzik Eidus * has not changed since the previous scan of all memory areas.
10831dbd01fSIzik Eidus * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
10931dbd01fSIzik Eidus * colors of the nodes and not on their contents, assuring that even when
11031dbd01fSIzik Eidus * the tree gets "corrupted" it won't get out of balance, so scanning time
11131dbd01fSIzik Eidus * remains the same (also, searching and inserting nodes in an rbtree uses
11231dbd01fSIzik Eidus * the same algorithm, so we have no overhead when we flush and rebuild).
11331dbd01fSIzik Eidus * 4) KSM never flushes the stable tree, which means that even if it were to
11431dbd01fSIzik Eidus * take 10 attempts to find a page in the unstable tree, once it is found,
11531dbd01fSIzik Eidus * it is secured in the stable tree. (When we scan a new page, we first
1168fdb3dbfSHugh Dickins * compare it against the stable tree, and then against the unstable tree.)
1178fdb3dbfSHugh Dickins *
1188fdb3dbfSHugh Dickins * If the merge_across_nodes tunable is unset, then KSM maintains multiple
11931dbd01fSIzik Eidus * stable trees and multiple unstable trees: one of each for each NUMA node.
12031dbd01fSIzik Eidus */
12131dbd01fSIzik Eidus
/**
 * struct ksm_mm_slot - ksm information per mm that is being scanned
 * @slot: hash lookup from mm to mm_slot
 * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
 */
struct ksm_mm_slot {
	struct mm_slot slot;		/* embeds the mm pointer + hash/list links */
	struct ksm_rmap_item *rmap_list;
};
13131dbd01fSIzik Eidus
/**
 * struct ksm_scan - cursor for scanning
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 * @rmap_list: link to the next rmap to be scanned in the rmap_list
 * @seqnr: count of completed full scans (needed when removing unstable node)
 *
 * There is only the one ksm_scan instance of this cursor structure.
 */
struct ksm_scan {
	struct ksm_mm_slot *mm_slot;
	unsigned long address;
	struct ksm_rmap_item **rmap_list;
	unsigned long seqnr;
};
14731dbd01fSIzik Eidus
/**
 * struct ksm_stable_node - node of the stable rbtree
 * @node: rb node of this ksm page in the stable tree
 * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
 * @hlist_dup: linked into the stable_node->hlist with a stable_node chain
 * @list: linked into migrate_nodes, pending placement in the proper node tree
 * @hlist: hlist head of rmap_items using this ksm page
 * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
 * @chain_prune_time: time of the last full garbage collection
 * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN
 * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
 */
struct ksm_stable_node {
	union {
		struct rb_node node;	/* when node of stable tree */
		struct {		/* when listed for migration */
			struct list_head *head;
			struct {
				struct hlist_node hlist_dup;
				struct list_head list;
			};
		};
	};
	struct hlist_head hlist;
	union {
		unsigned long kpfn;		/* when a regular node or dup */
		unsigned long chain_prune_time;	/* when heading a chain */
	};
	/*
	 * STABLE_NODE_CHAIN can be any negative number in
	 * rmap_hlist_len negative range, but better not -1 to be able
	 * to reliably detect underflows.
	 */
#define STABLE_NODE_CHAIN -1024
	int rmap_hlist_len;
#ifdef CONFIG_NUMA
	int nid;
#endif
};
1877b6ba2c7SHugh Dickins
/**
 * struct ksm_rmap_item - reverse mapping item for virtual addresses
 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
 * @nid: NUMA node id of unstable tree in which linked (may not match page)
 * @mm: the memory structure this rmap_item is pointing into
 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
 * @oldchecksum: previous checksum of the page at that virtual address
 * @node: rb node of this rmap_item in the unstable tree
 * @head: pointer to stable_node heading this list in the stable tree
 * @hlist: link into hlist of rmap_items hanging off that stable_node
 * @age: number of scan iterations since creation
 * @remaining_skips: how many scans to skip
 */
struct ksm_rmap_item {
	struct ksm_rmap_item *rmap_list;
	union {
		struct anon_vma *anon_vma;	/* when stable */
#ifdef CONFIG_NUMA
		int nid;		/* when node of unstable tree */
#endif
	};
	struct mm_struct *mm;
	unsigned long address;		/* + low bits used for flags below */
	unsigned int oldchecksum;	/* when unstable */
	rmap_age_t age;
	rmap_age_t remaining_skips;
	union {
		struct rb_node node;	/* when node of unstable tree */
		struct {		/* when listed from stable tree */
			struct ksm_stable_node *head;
			struct hlist_node hlist;
		};
	};
};
22331dbd01fSIzik Eidus
/* Flags stored in the low bits of ksm_rmap_item->address */
#define SEQNR_MASK	0x0ff	/* low bits of unstable tree seqnr */
#define UNSTABLE_FLAG	0x100	/* is a node of the unstable tree */
#define STABLE_FLAG	0x200	/* is listed from the stable tree */

/*
 * The stable and unstable tree heads.  By default there is a single
 * tree of each kind; the root_* pointers allow switching to per-NUMA-node
 * tree arrays when merging across nodes is disabled.
 */
static struct rb_root one_stable_tree[1] = { RB_ROOT };
static struct rb_root one_unstable_tree[1] = { RB_ROOT };
static struct rb_root *root_stable_tree = one_stable_tree;
static struct rb_root *root_unstable_tree = one_unstable_tree;

/* Recently migrated nodes of stable tree, pending proper placement */
static LIST_HEAD(migrate_nodes);
/*
 * Sentinel ->head value marking a stable_node as a "dup" on a chain:
 * &migrate_nodes.prev can never be a real list head (see is_stable_node_dup()).
 */
#define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev)

#define MM_SLOTS_HASH_BITS 10
static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

/* Dummy head of the circular list of mm_slots being scanned */
static struct ksm_mm_slot ksm_mm_head = {
	.slot.mm_node = LIST_HEAD_INIT(ksm_mm_head.slot.mm_node),
};
/* The single scan cursor (see struct ksm_scan above) */
static struct ksm_scan ksm_scan = {
	.mm_slot = &ksm_mm_head,
};

/* Slab caches for the three KSM metadata structures */
static struct kmem_cache *rmap_item_cache;
static struct kmem_cache *stable_node_cache;
static struct kmem_cache *mm_slot_cache;

/* Default number of pages to scan per batch */
#define DEFAULT_PAGES_TO_SCAN 100
25431dbd01fSIzik Eidus
/* The number of pages scanned */
static unsigned long ksm_pages_scanned;

/* The number of nodes in the stable tree */
static unsigned long ksm_pages_shared;

/* The number of page slots additionally sharing those nodes */
static unsigned long ksm_pages_sharing;

/* The number of nodes in the unstable tree */
static unsigned long ksm_pages_unshared;

/* The number of rmap_items in use: to calculate pages_volatile */
static unsigned long ksm_rmap_items;

/* The number of stable_node chains */
static unsigned long ksm_stable_node_chains;

/* The number of stable_node dups linked to the stable_node chains */
static unsigned long ksm_stable_node_dups;

/* Delay in pruning stale stable_node_dups in the stable_node_chains */
static unsigned int ksm_stable_node_chains_prune_millisecs = 2000;

/* Maximum number of page slots sharing a stable node */
static int ksm_max_page_sharing = 256;

/* Number of pages ksmd should scan in one batch */
static unsigned int ksm_thread_pages_to_scan = DEFAULT_PAGES_TO_SCAN;

/* Milliseconds ksmd should sleep between batches */
static unsigned int ksm_thread_sleep_millisecs = 20;

/* Checksum of an empty (zeroed) page */
static unsigned int zero_checksum __read_mostly;

/* Whether to merge empty (zeroed) pages with actual zero pages */
static bool ksm_use_zero_pages __read_mostly;

/* Skip pages that couldn't be de-duplicated previously */
/* Default to true at least temporarily, for testing */
static bool ksm_smart_scan = true;

/* The number of zero pages which is placed by KSM */
unsigned long ksm_zero_pages;

/* The number of pages that have been skipped due to "smart scanning" */
static unsigned long ksm_pages_skipped;

/* Advisor upper bound: don't scan more than this many pages per batch. */
static unsigned long ksm_advisor_max_pages_to_scan = 30000;

/* Min CPU percentage budget for the scan-time advisor */
#define KSM_ADVISOR_MIN_CPU 10

/* Max CPU percentage budget for the scan-time advisor */
static unsigned int ksm_advisor_max_cpu = 70;

/* Target scan time in seconds to analyze all KSM candidate pages. */
static unsigned long ksm_advisor_target_scan_time = 200;

/* Weight (percent) of the newest sample in the exponentially weighted moving average. */
#define EWMA_WEIGHT 30
31831dbd01fSIzik Eidus
/**
 * struct advisor_ctx - metadata for KSM advisor
 * @start_scan: start time of the current scan
 * @scan_time: scan time of previous scan
 * @change: change in percent to pages_to_scan parameter
 * @cpu_time: cpu time consumed by the ksmd thread in the previous scan
 */
struct advisor_ctx {
	ktime_t start_scan;
	unsigned long scan_time;
	unsigned long change;
	unsigned long long cpu_time;
};
static struct advisor_ctx advisor_ctx;

/* The different advisors selectable at runtime */
enum ksm_advisor_type {
	KSM_ADVISOR_NONE,
	KSM_ADVISOR_SCAN_TIME,
};
static enum ksm_advisor_type ksm_advisor;

#ifdef CONFIG_SYSFS
/*
 * Only called through the sysfs control interface:
 */

/* At least scan this many pages per batch. */
static unsigned long ksm_advisor_min_pages_to_scan = 500;
34831dbd01fSIzik Eidus
set_advisor_defaults(void)34931dbd01fSIzik Eidus static void set_advisor_defaults(void)
35031dbd01fSIzik Eidus {
35131dbd01fSIzik Eidus if (ksm_advisor == KSM_ADVISOR_NONE) {
3527b6ba2c7SHugh Dickins ksm_thread_pages_to_scan = DEFAULT_PAGES_TO_SCAN;
35331dbd01fSIzik Eidus } else if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) {
35431dbd01fSIzik Eidus advisor_ctx = (const struct advisor_ctx){ 0 };
35531dbd01fSIzik Eidus ksm_thread_pages_to_scan = ksm_advisor_min_pages_to_scan;
35631dbd01fSIzik Eidus }
35721fbd591SQi Zheng }
3582c653d0eSAndrea Arcangeli #endif /* CONFIG_SYSFS */
3592c653d0eSAndrea Arcangeli
advisor_start_scan(void)3602c653d0eSAndrea Arcangeli static inline void advisor_start_scan(void)
3612c653d0eSAndrea Arcangeli {
36221fbd591SQi Zheng if (ksm_advisor == KSM_ADVISOR_SCAN_TIME)
3632c653d0eSAndrea Arcangeli advisor_ctx.start_scan = ktime_get();
3642c653d0eSAndrea Arcangeli }
3652c653d0eSAndrea Arcangeli
3662c653d0eSAndrea Arcangeli /*
36721fbd591SQi Zheng * Use previous scan time if available, otherwise use current scan time as an
36821fbd591SQi Zheng * approximation for the previous scan time.
3692c653d0eSAndrea Arcangeli */
prev_scan_time(struct advisor_ctx * ctx,unsigned long scan_time)3702c653d0eSAndrea Arcangeli static inline unsigned long prev_scan_time(struct advisor_ctx *ctx,
3712c653d0eSAndrea Arcangeli unsigned long scan_time)
3722c653d0eSAndrea Arcangeli {
3732c653d0eSAndrea Arcangeli return ctx->scan_time ? ctx->scan_time : scan_time;
3742c653d0eSAndrea Arcangeli }
3752c653d0eSAndrea Arcangeli
3762c653d0eSAndrea Arcangeli /* Calculate exponential weighted moving average */
ewma(unsigned long prev,unsigned long curr)37721fbd591SQi Zheng static unsigned long ewma(unsigned long prev, unsigned long curr)
3782c653d0eSAndrea Arcangeli {
379b4fecc67SAndrea Arcangeli return ((100 - EWMA_WEIGHT) * prev + EWMA_WEIGHT * curr) / 100;
3802c653d0eSAndrea Arcangeli }
3812c653d0eSAndrea Arcangeli
3822c653d0eSAndrea Arcangeli /*
3832c653d0eSAndrea Arcangeli * The scan time advisor is based on the current scan rate and the target
38421fbd591SQi Zheng * scan rate.
3852c653d0eSAndrea Arcangeli *
3862c653d0eSAndrea Arcangeli * new_pages_to_scan = pages_to_scan * (scan_time / target_scan_time)
3872c653d0eSAndrea Arcangeli *
3882c653d0eSAndrea Arcangeli * To avoid perturbations it calculates a change factor of previous changes.
3892c653d0eSAndrea Arcangeli * A new change factor is calculated for each iteration and it uses an
3902c653d0eSAndrea Arcangeli * exponentially weighted moving average. The new pages_to_scan value is
3912c653d0eSAndrea Arcangeli * multiplied with that change factor:
3922c653d0eSAndrea Arcangeli *
 * new_pages_to_scan *= change factor
3942c653d0eSAndrea Arcangeli *
3952c653d0eSAndrea Arcangeli * The new_pages_to_scan value is limited by the cpu min and max values. It
39621fbd591SQi Zheng * calculates the cpu percent for the last scan and calculates the new
39731dbd01fSIzik Eidus * estimated cpu percent cost for the next scan. That value is capped by the
39821fbd591SQi Zheng * cpu min and max setting.
399473b0ce4SHugh Dickins *
4005b398e41Szhong jiang * In addition the new pages_to_scan value is capped by the max and min
4015b398e41Szhong jiang * limits.
402473b0ce4SHugh Dickins */
/*
 * scan_time_advisor() - retune ksm_thread_pages_to_scan after a full scan.
 *
 * Scales the batch size so a full pass over all candidate pages approaches
 * ksm_advisor_target_scan_time, smoothing the adjustment with an EWMA and
 * capping the result by the CPU and page limits (see the derivation in the
 * comment block above).  Called once per completed scan.
 */
static void scan_time_advisor(void)
{
	unsigned int cpu_percent;
	unsigned long cpu_time;
	unsigned long cpu_time_diff;
	unsigned long cpu_time_diff_ms;
	unsigned long pages;
	unsigned long per_page_cost;
	unsigned long factor;
	unsigned long change;
	unsigned long last_scan_time;
	unsigned long scan_time;

	/* Convert scan time to seconds (clamped to at least 1 to avoid /0) */
	scan_time = div_s64(ktime_ms_delta(ktime_get(), advisor_ctx.start_scan),
			    MSEC_PER_SEC);
	scan_time = scan_time ? scan_time : 1;

	/* Calculate CPU consumption of ksmd background thread */
	cpu_time = task_sched_runtime(current);
	cpu_time_diff = cpu_time - advisor_ctx.cpu_time;
	cpu_time_diff_ms = cpu_time_diff / 1000 / 1000;	/* ns -> ms */

	/* CPU percentage used by ksmd during this scan (at least 1) */
	cpu_percent = (cpu_time_diff_ms * 100) / (scan_time * 1000);
	cpu_percent = cpu_percent ? cpu_percent : 1;
	last_scan_time = prev_scan_time(&advisor_ctx, scan_time);

	/* Calculate scan time as percentage of target scan time */
	factor = ksm_advisor_target_scan_time * 100 / scan_time;
	factor = factor ? factor : 1;

	/*
	 * Calculate scan time as percentage of last scan time and use
	 * exponentially weighted average to smooth it
	 */
	change = scan_time * 100 / last_scan_time;
	change = change ? change : 1;
	change = ewma(advisor_ctx.change, change);

	/* Calculate new scan rate based on target scan rate. */
	pages = ksm_thread_pages_to_scan * 100 / factor;
	/* Update pages_to_scan by weighted change percentage. */
	pages = pages * change / 100;

	/* Cap new pages_to_scan value by the CPU budget (min/max percent) */
	per_page_cost = ksm_thread_pages_to_scan / cpu_percent;
	per_page_cost = per_page_cost ? per_page_cost : 1;

	pages = min(pages, per_page_cost * ksm_advisor_max_cpu);
	pages = max(pages, per_page_cost * KSM_ADVISOR_MIN_CPU);
	pages = min(pages, ksm_advisor_max_pages_to_scan);

	/* Update advisor context for the next iteration */
	advisor_ctx.change = change;
	advisor_ctx.scan_time = scan_time;
	advisor_ctx.cpu_time = cpu_time;

	ksm_thread_pages_to_scan = pages;
	trace_ksm_advisor(scan_time, pages, cpu_percent);
}
463d7c0e68dSDavid Hildenbrand
advisor_stop_scan(void)464d7c0e68dSDavid Hildenbrand static void advisor_stop_scan(void)
465d7c0e68dSDavid Hildenbrand {
466d7c0e68dSDavid Hildenbrand if (ksm_advisor == KSM_ADVISOR_SCAN_TIME)
467d7c0e68dSDavid Hildenbrand scan_time_advisor();
468d7c0e68dSDavid Hildenbrand }
469d7c0e68dSDavid Hildenbrand
#ifdef CONFIG_NUMA
/* Zeroed when merging across nodes is not allowed */
static unsigned int ksm_merge_across_nodes = 1;
static int ksm_nr_node_ids = 1;
#else
/* Without NUMA, merging is always "across nodes" and there is one node. */
#define ksm_merge_across_nodes	1U
#define ksm_nr_node_ids		1
#endif

/* Values for ksm_run (KSM_RUN_OFFLINE is a flag, set during memory hotremove) */
#define KSM_RUN_STOP	0
#define KSM_RUN_MERGE	1
#define KSM_RUN_UNMERGE	2
#define KSM_RUN_OFFLINE	4
static unsigned long ksm_run = KSM_RUN_STOP;
static void wait_while_offlining(void);

static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock);

/* Create a slab cache named and sized after the given struct. */
#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create(#__struct,\
		sizeof(struct __struct), __alignof__(struct __struct),\
		(__flags), NULL)
4941b2ee126SDave Hansen
ksm_slab_init(void)4956cce3314SDavid Hildenbrand static int __init ksm_slab_init(void)
4961b2ee126SDave Hansen {
4971b2ee126SDave Hansen rmap_item_cache = KSM_KMEM_CACHE(ksm_rmap_item, 0);
49831dbd01fSIzik Eidus if (!rmap_item_cache)
49949b06385SSuren Baghdasaryan goto out;
50031dbd01fSIzik Eidus
50150a7ca3cSSouptick Joarder stable_node_cache = KSM_KMEM_CACHE(ksm_stable_node, 0);
50249b06385SSuren Baghdasaryan if (!stable_node_cache)
50349b06385SSuren Baghdasaryan goto out_free1;
50431dbd01fSIzik Eidus
50531dbd01fSIzik Eidus mm_slot_cache = KSM_KMEM_CACHE(ksm_mm_slot, 0);
506d7c0e68dSDavid Hildenbrand if (!mm_slot_cache)
50758f595c6SDavid Hildenbrand goto out_free2;
50831dbd01fSIzik Eidus
50949b06385SSuren Baghdasaryan return 0;
510d7c0e68dSDavid Hildenbrand
511d7c0e68dSDavid Hildenbrand out_free2:
51258f595c6SDavid Hildenbrand kmem_cache_destroy(stable_node_cache);
51358f595c6SDavid Hildenbrand out_free1:
514dcddffd4SKirill A. Shutemov kmem_cache_destroy(rmap_item_cache);
5156cce3314SDavid Hildenbrand out:
516bce617edSPeter Xu return -ENOMEM;
51758f595c6SDavid Hildenbrand }
518d952b791SHugh Dickins
/*
 * Destroy the KSM slab caches (init-failure path).  mm_slot_cache is
 * cleared so later code can detect that KSM was never fully set up.
 */
static void __init ksm_slab_free(void)
{
	kmem_cache_destroy(mm_slot_cache);
	kmem_cache_destroy(stable_node_cache);
	kmem_cache_destroy(rmap_item_cache);
	mm_slot_cache = NULL;
}
526d952b791SHugh Dickins
is_stable_node_chain(struct ksm_stable_node * chain)527d952b791SHugh Dickins static __always_inline bool is_stable_node_chain(struct ksm_stable_node *chain)
528d952b791SHugh Dickins {
529d952b791SHugh Dickins return chain->rmap_hlist_len == STABLE_NODE_CHAIN;
530d952b791SHugh Dickins }
531d952b791SHugh Dickins
is_stable_node_dup(struct ksm_stable_node * dup)532d952b791SHugh Dickins static __always_inline bool is_stable_node_dup(struct ksm_stable_node *dup)
533d952b791SHugh Dickins {
534d952b791SHugh Dickins return dup->head == STABLE_NODE_DUP_HEAD;
535d952b791SHugh Dickins }
536d952b791SHugh Dickins
stable_node_chain_add_dup(struct ksm_stable_node * dup,struct ksm_stable_node * chain)537d952b791SHugh Dickins static inline void stable_node_chain_add_dup(struct ksm_stable_node *dup,
538d952b791SHugh Dickins struct ksm_stable_node *chain)
539d952b791SHugh Dickins {
540d952b791SHugh Dickins VM_BUG_ON(is_stable_node_dup(dup));
541d952b791SHugh Dickins dup->head = STABLE_NODE_DUP_HEAD;
542d952b791SHugh Dickins VM_BUG_ON(!is_stable_node_chain(chain));
543d952b791SHugh Dickins hlist_add_head(&dup->hlist_dup, &chain->hlist);
54431dbd01fSIzik Eidus ksm_stable_node_dups++;
54531dbd01fSIzik Eidus }
546d7597f59SStefan Roesch
__stable_node_dup_del(struct ksm_stable_node * dup)547d7597f59SStefan Roesch static inline void __stable_node_dup_del(struct ksm_stable_node *dup)
548d7597f59SStefan Roesch {
549d7597f59SStefan Roesch VM_BUG_ON(!is_stable_node_dup(dup));
550d7597f59SStefan Roesch hlist_del(&dup->hlist_dup);
551d7597f59SStefan Roesch ksm_stable_node_dups--;
552d7597f59SStefan Roesch }
553d7597f59SStefan Roesch
/*
 * Remove a stable_node from wherever it is linked: off its chain's hlist
 * if it is a dup, otherwise out of its NUMA node's stable rbtree.
 * Must not be called on a chain head (those are removed differently).
 */
static inline void stable_node_dup_del(struct ksm_stable_node *dup)
{
	VM_BUG_ON(is_stable_node_chain(dup));
	if (is_stable_node_dup(dup))
		__stable_node_dup_del(dup);
	else
		rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid));
#ifdef CONFIG_DEBUG_VM
	/* Poison ->head so a stale dup is caught by the VM_BUG_ONs above. */
	dup->head = NULL;
#endif
}
565d7597f59SStefan Roesch
alloc_rmap_item(void)566d7597f59SStefan Roesch static inline struct ksm_rmap_item *alloc_rmap_item(void)
567d7597f59SStefan Roesch {
568ef694222SBob Liu struct ksm_rmap_item *rmap_item;
569ef694222SBob Liu
570ef694222SBob Liu rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
571ef694222SBob Liu __GFP_NORETRY | __GFP_NOWARN);
572ef694222SBob Liu if (rmap_item)
573ef694222SBob Liu ksm_rmap_items++;
574ff69fb81SLiam Howlett return rmap_item;
575ff69fb81SLiam Howlett }
576ef694222SBob Liu
free_rmap_item(struct ksm_rmap_item * rmap_item)577ef694222SBob Liu static inline void free_rmap_item(struct ksm_rmap_item *rmap_item)
578ef694222SBob Liu {
579ef694222SBob Liu ksm_rmap_items--;
58021fbd591SQi Zheng rmap_item->mm->ksm_rmap_items--;
58131dbd01fSIzik Eidus rmap_item->mm = NULL; /* debug safety */
5828dd3557aSHugh Dickins kmem_cache_free(rmap_item_cache, rmap_item);
5838dd3557aSHugh Dickins }
58431dbd01fSIzik Eidus
/* Allocate a stable node (used for both regular, dup and chain nodes). */
static inline struct ksm_stable_node *alloc_stable_node(void)
{
	/*
	 * The allocation can take too long with GFP_KERNEL when memory is under
	 * pressure, which may lead to hung task warnings. Adding __GFP_HIGH
	 * grants access to memory reserves, helping to avoid this problem.
	 */
	return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);
}
594ef694222SBob Liu
/*
 * Free a stable node.  A non-empty rmap_hlist_len is only legal for a
 * chain node (whose length field is the STABLE_NODE_CHAIN marker).
 */
static inline void free_stable_node(struct ksm_stable_node *stable_node)
{
	VM_BUG_ON(stable_node->rmap_hlist_len &&
		  !is_stable_node_chain(stable_node));
	kmem_cache_free(stable_node_cache, stable_node);
}
60131dbd01fSIzik Eidus
60231dbd01fSIzik Eidus /*
60331dbd01fSIzik Eidus * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
60431dbd01fSIzik Eidus * page tables after it has passed through ksm_exit() - which, if necessary,
60531dbd01fSIzik Eidus * takes mmap_lock briefly to serialize against them. ksm_exit() does not set
606d8ed45c5SMichel Lespinasse * a special flag: they can just back out as soon as mm_users goes to zero.
607ef694222SBob Liu * ksm_test_exit() is used throughout to make this test for exit: in some
608ef694222SBob Liu * places for correctness, in some places just to avoid unnecessary work.
60931dbd01fSIzik Eidus */
ksm_test_exit(struct mm_struct * mm)61031dbd01fSIzik Eidus static inline bool ksm_test_exit(struct mm_struct *mm)
61131dbd01fSIzik Eidus {
612f7091ed6SHaiyue Wang return atomic_read(&mm->mm_users) == 0;
61331dbd01fSIzik Eidus }
614f7091ed6SHaiyue Wang
/*
 * Per-PMD walk callback for break_ksm(): report whether the pte at @addr
 * currently maps a KSM page (or the KSM-placed zero page), so the caller
 * knows whether it still needs to trigger an unsharing fault there.
 * Returning non-zero also stops the page walk.
 */
static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
			struct mm_walk *walk)
{
	struct page *page = NULL;
	spinlock_t *ptl;
	pte_t *pte;
	pte_t ptent;
	int ret;

	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	if (!pte)
		return 0;
	ptent = ptep_get(pte);
	if (pte_present(ptent)) {
		page = vm_normal_page(walk->vma, addr, ptent);
	} else if (!pte_none(ptent)) {
		swp_entry_t entry = pte_to_swp_entry(ptent);

		/*
		 * As KSM pages remain KSM pages until freed, no need to wait
		 * here for migration to end.
		 */
		if (is_migration_entry(entry))
			page = pfn_swap_entry_to_page(entry);
	}
	/* return 1 if the page is a normal ksm page or KSM-placed zero page */
	ret = (page && PageKsm(page)) || is_ksm_zero_pte(ptent);
	pte_unmap_unlock(pte, ptl);
	return ret;
}
6452c653d0eSAndrea Arcangeli
/* Walk variant for callers holding mmap_lock for read (the common case). */
static const struct mm_walk_ops break_ksm_ops = {
	.pmd_entry = break_ksm_pmd_entry,
	.walk_lock = PGWALK_RDLOCK,
};

/* Same walk, but for callers that hold mmap_lock for write. */
static const struct mm_walk_ops break_ksm_lock_vma_ops = {
	.pmd_entry = break_ksm_pmd_entry,
	.walk_lock = PGWALK_WRLOCK,
};
6552c653d0eSAndrea Arcangeli
6562c653d0eSAndrea Arcangeli /*
6572c653d0eSAndrea Arcangeli * We use break_ksm to break COW on a ksm page by triggering unsharing,
6582c653d0eSAndrea Arcangeli * such that the ksm page will get replaced by an exclusive anonymous page.
6592c653d0eSAndrea Arcangeli *
6602c653d0eSAndrea Arcangeli * We take great care only to touch a ksm page, in a VM_MERGEABLE vma,
6612c653d0eSAndrea Arcangeli * in case the application has unmapped and remapped mm,addr meanwhile.
6622c653d0eSAndrea Arcangeli * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
6632c653d0eSAndrea Arcangeli * mmap of /dev/mem, where we would not want to touch it.
6642c653d0eSAndrea Arcangeli *
665457aef94SEthon Paul * FAULT_FLAG_REMOTE/FOLL_REMOTE are because we do this outside the context
6662c653d0eSAndrea Arcangeli * of the process that owns 'vma'. We also do not want to enforce
6672c653d0eSAndrea Arcangeli * protection keys here anyway.
6682c653d0eSAndrea Arcangeli */
/*
 * @lock_vma selects the walk_lock expectation: write-locked mmap_lock
 * (break_ksm_lock_vma_ops) versus read-locked (break_ksm_ops).
 * Returns 0 on success or -ENOMEM if the unsharing fault hit OOM.
 */
static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma)
{
	vm_fault_t ret = 0;
	const struct mm_walk_ops *ops = lock_vma ?
				&break_ksm_lock_vma_ops : &break_ksm_ops;

	do {
		int ksm_page;

		cond_resched();
		/* Probe a single page: does addr still map a KSM page? */
		ksm_page = walk_page_range_vma(vma, addr, addr + 1, ops, NULL);
		if (WARN_ON_ONCE(ksm_page < 0))
			return ksm_page;
		if (!ksm_page)
			return 0;
		ret = handle_mm_fault(vma, addr,
				      FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
				      NULL);
	} while (!(ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
	/*
	 * We must loop until we no longer find a KSM page because
	 * handle_mm_fault() may back out if there's any difficulty e.g. if
	 * pte accessed bit gets updated concurrently.
	 *
	 * VM_FAULT_SIGBUS could occur if we race with truncation of the
	 * backing file, which also invalidates anonymous pages: that's
	 * okay, that truncation will have unmapped the PageKsm for us.
	 *
	 * VM_FAULT_OOM: at the time of writing (late July 2009), setting
	 * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
	 * current task has TIF_MEMDIE set, and will be OOM killed on return
	 * to user; and ksmd, having no mm, would never be chosen for that.
	 *
	 * But if the mm is in a limited mem_cgroup, then the fault may fail
	 * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
	 * even ksmd can fail in this way - though it's usually breaking ksm
	 * just to undo a merge it made a moment before, so unlikely to oom.
	 *
	 * That's a pity: we might therefore have more kernel pages allocated
	 * than we're counting as nodes in the stable tree; but ksm_do_scan
	 * will retry to break_cow on each pass, so should recover the page
	 * in due course. The important thing is to not let VM_MERGEABLE
	 * be cleared while any such pages might remain in the area.
	 */
	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
}
715739100c8SStefan Roesch
vma_ksm_compatible(struct vm_area_struct * vma)7164146d2d6SHugh Dickins static bool vma_ksm_compatible(struct vm_area_struct *vma)
7174146d2d6SHugh Dickins {
7184146d2d6SHugh Dickins if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE | VM_PFNMAP |
7192c653d0eSAndrea Arcangeli VM_IO | VM_DONTEXPAND | VM_HUGETLB |
7204035c07aSHugh Dickins VM_MIXEDMAP))
7214035c07aSHugh Dickins return false; /* just ignore the advice */
7224035c07aSHugh Dickins
7232cee57d1SYang Shi if (vma_is_dax(vma))
7242cee57d1SYang Shi return false;
7252cee57d1SYang Shi
7262cee57d1SYang Shi #ifdef VM_SAO
7272cee57d1SYang Shi if (vma->vm_flags & VM_SAO)
7282cee57d1SYang Shi return false;
7294035c07aSHugh Dickins #endif
7304035c07aSHugh Dickins #ifdef VM_SPARC_ADI
7314035c07aSHugh Dickins if (vma->vm_flags & VM_SPARC_ADI)
7324035c07aSHugh Dickins return false;
7334035c07aSHugh Dickins #endif
7344035c07aSHugh Dickins
735c8d6553bSHugh Dickins return true;
7364035c07aSHugh Dickins }
7374035c07aSHugh Dickins
find_mergeable_vma(struct mm_struct * mm,unsigned long addr)7384035c07aSHugh Dickins static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
7394035c07aSHugh Dickins unsigned long addr)
7404035c07aSHugh Dickins {
7414035c07aSHugh Dickins struct vm_area_struct *vma;
7424035c07aSHugh Dickins if (ksm_test_exit(mm))
7434035c07aSHugh Dickins return NULL;
7444035c07aSHugh Dickins vma = vma_lookup(mm, addr);
7454035c07aSHugh Dickins if (!vma || !(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
7464035c07aSHugh Dickins return NULL;
7474035c07aSHugh Dickins return vma;
74821fbd591SQi Zheng }
7492cee57d1SYang Shi
/*
 * Undo a merge at rmap_item's address: drop the anon_vma reference the
 * merge took, then (if the vma is still mergeable) fault in a fresh
 * anonymous copy via break_ksm().
 */
static void break_cow(struct ksm_rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	unsigned long addr = rmap_item->address;
	struct vm_area_struct *vma;

	/*
	 * It is not an accident that whenever we want to break COW
	 * to undo, we also need to drop a reference to the anon_vma.
	 */
	put_anon_vma(rmap_item->anon_vma);

	mmap_read_lock(mm);
	vma = find_mergeable_vma(mm, addr);
	if (vma)
		break_ksm(vma, addr, false);
	mmap_read_unlock(mm);
}
768c8d6553bSHugh Dickins
/*
 * Look up the page currently mapped at rmap_item's address and return it
 * with a reference held (caches flushed), provided it is still an
 * anonymous page in a mergeable VMA; otherwise return NULL.
 */
static struct page *get_mergeable_page(struct ksm_rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	unsigned long addr = rmap_item->address;
	struct vm_area_struct *vma;
	struct page *page;

	mmap_read_lock(mm);
	vma = find_mergeable_vma(mm, addr);
	if (!vma)
		goto out;

	page = follow_page(vma, addr, FOLL_GET);
	if (IS_ERR_OR_NULL(page))
		goto out;
	if (is_zone_device_page(page))
		goto out_putpage;
	if (PageAnon(page)) {
		flush_anon_page(vma, page, addr);
		flush_dcache_page(page);
	} else {
out_putpage:
		/* not usable: drop the reference follow_page() took */
		put_page(page);
out:
		page = NULL;
	}
	mmap_read_unlock(mm);
	return page;
}
7988aafa6a4SHugh Dickins
7992cee57d1SYang Shi /*
8002cee57d1SYang Shi * This helper is used for getting right index into array of tree roots.
8014db0c3c2SJason Low * When merge_across_nodes knob is set to 1, there are only two rb-trees for
8028aafa6a4SHugh Dickins * stable and unstable pages from all nodes with roots in index 0. Otherwise,
8038aafa6a4SHugh Dickins * every node has its own stable and unstable tree.
8048aafa6a4SHugh Dickins */
get_kpfn_nid(unsigned long kpfn)8058aafa6a4SHugh Dickins static inline int get_kpfn_nid(unsigned long kpfn)
8068aafa6a4SHugh Dickins {
8074035c07aSHugh Dickins return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
808c8d6553bSHugh Dickins }
8094035c07aSHugh Dickins
/*
 * Convert the stable node @dup, currently linked in the stable rbtree
 * @root, into the first dup of a newly allocated chain node that takes
 * over @dup's position in the rbtree.  Returns the chain node, or NULL
 * if allocation failed (leaving @dup untouched).
 */
static struct ksm_stable_node *alloc_stable_node_chain(struct ksm_stable_node *dup,
		struct rb_root *root)
{
	struct ksm_stable_node *chain = alloc_stable_node();
	VM_BUG_ON(is_stable_node_chain(dup));
	if (likely(chain)) {
		INIT_HLIST_HEAD(&chain->hlist);
		chain->chain_prune_time = jiffies;
		chain->rmap_hlist_len = STABLE_NODE_CHAIN;
#if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
		chain->nid = NUMA_NO_NODE;	/* debug */
#endif
		ksm_stable_node_chains++;

		/*
		 * Put the stable node chain in the first dimension of
		 * the stable tree and at the same time remove the old
		 * stable node.
		 */
		rb_replace_node(&dup->node, &chain->node, root);

		/*
		 * Move the old stable node to the second dimension
		 * queued in the hlist_dup. The invariant is that all
		 * dup stable_nodes in the chain->hlist point to pages
		 * that are write protected and have the exact same
		 * content.
		 */
		stable_node_chain_add_dup(dup, chain);
	}
	return chain;
}
84298666f8aSAndrea Arcangeli
free_stable_node_chain(struct ksm_stable_node * chain,struct rb_root * root)8434035c07aSHugh Dickins static inline void free_stable_node_chain(struct ksm_stable_node *chain,
8444035c07aSHugh Dickins struct rb_root *root)
845b4028260SHugh Dickins {
84676093853Sxu xin rb_erase(&chain->node, root);
84776093853Sxu xin free_stable_node(chain);
84876093853Sxu xin ksm_stable_node_chains--;
8492c653d0eSAndrea Arcangeli }
8502c653d0eSAndrea Arcangeli
/*
 * Tear down @stable_node: unhook every rmap_item still hanging off it
 * (fixing up the pages_sharing/pages_shared and per-mm accounting),
 * unlink the node from the rbtree / chain / migrate_nodes list it lives
 * on, and free it.
 */
static void remove_node_from_stable_tree(struct ksm_stable_node *stable_node)
{
	struct ksm_rmap_item *rmap_item;

	/* check it's not STABLE_NODE_CHAIN or negative */
	BUG_ON(stable_node->rmap_hlist_len < 0);

	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
		/* only the last item on the hlist counted as "shared" */
		if (rmap_item->hlist.next) {
			ksm_pages_sharing--;
			trace_ksm_remove_rmap_item(stable_node->kpfn, rmap_item, rmap_item->mm);
		} else {
			ksm_pages_shared--;
		}

		rmap_item->mm->ksm_merging_pages--;

		VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
		stable_node->rmap_hlist_len--;
		put_anon_vma(rmap_item->anon_vma);
		rmap_item->address &= PAGE_MASK;	/* clear STABLE_FLAG etc. */
		cond_resched();
	}

	/*
	 * We need the second aligned pointer of the migrate_nodes
	 * list_head to stay clear from the rb_parent_color union
	 * (aligned and different than any node) and also different
	 * from &migrate_nodes. This will verify that future list.h changes
	 * don't break STABLE_NODE_DUP_HEAD. Only recent gcc can handle it.
	 */
	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);

	trace_ksm_remove_ksm_page(stable_node->kpfn);
	if (stable_node->head == &migrate_nodes)
		list_del(&stable_node->list);
	else
		stable_node_dup_del(stable_node);
	free_stable_node(stable_node);
}
89231dbd01fSIzik Eidus
/* How get_ksm_page() should lock the page it returns, if at all. */
enum get_ksm_page_flags {
	GET_KSM_PAGE_NOLOCK,	/* return the page unlocked */
	GET_KSM_PAGE_LOCK,	/* return with the page lock held */
	GET_KSM_PAGE_TRYLOCK	/* trylock; return ERR_PTR(-EBUSY) on contention */
};
89881464e30SHugh Dickins
89931dbd01fSIzik Eidus /*
900d952b791SHugh Dickins * get_ksm_page: checks if the page indicated by the stable node
90149b06385SSuren Baghdasaryan * is still its ksm page, despite having held no reference to it.
90231dbd01fSIzik Eidus * In which case we can trust the content of the page, and it
90331dbd01fSIzik Eidus * returns the gotten page; but if the page has now been zapped,
904d952b791SHugh Dickins * remove the stale node from the stable tree and return NULL.
90531dbd01fSIzik Eidus * But beware, the stable node's page might be being migrated.
906d952b791SHugh Dickins *
9079ba69294SHugh Dickins * You would expect the stable_node to hold a reference to the ksm page.
9089ba69294SHugh Dickins * But if it increments the page's count, swapping out has to wait for
909d952b791SHugh Dickins * ksmd to come around again before it can free the page, which may take
910d952b791SHugh Dickins * seconds or even minutes: much too unresponsive. So instead we use a
911d952b791SHugh Dickins * "keyhole reference": access to the ksm page from the stable node peeps
91249b06385SSuren Baghdasaryan * out through its keyhole to see if that page still holds the right key,
913d952b791SHugh Dickins * pointing back to this stable node. This relies on freeing a PageAnon
914d952b791SHugh Dickins * page to reset its page->mapping to NULL, and relies on no other use of
 * a page to put something that might look like our key in page->mapping.
 * A page found here may merely be on its way to being freed; but that
 * is an anomaly to bear in mind.
91721fbd591SQi Zheng */
/*
 * @flags selects whether/how the returned page is locked; see
 * enum get_ksm_page_flags.  See the comment above for the
 * "keyhole reference" scheme this relies on.
 */
static struct page *get_ksm_page(struct ksm_stable_node *stable_node,
				 enum get_ksm_page_flags flags)
{
	struct page *page;
	void *expected_mapping;
	unsigned long kpfn;

	/* A live ksm page's ->mapping points back at us, tagged KSM. */
	expected_mapping = (void *)((unsigned long)stable_node |
					PAGE_MAPPING_KSM);
again:
	kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */
	page = pfn_to_page(kpfn);
	if (READ_ONCE(page->mapping) != expected_mapping)
		goto stale;

	/*
	 * We cannot do anything with the page while its refcount is 0.
	 * Usually 0 means free, or tail of a higher-order page: in which
	 * case this node is no longer referenced, and should be freed;
	 * however, it might mean that the page is under page_ref_freeze().
	 * The __remove_mapping() case is easy, again the node is now stale;
	 * the same is in reuse_ksm_page() case; but if page is swapcache
	 * in folio_migrate_mapping(), it might still be our page,
	 * in which case it's essential to keep the node.
	 */
	while (!get_page_unless_zero(page)) {
		/*
		 * Another check for page->mapping != expected_mapping would
		 * work here too. We have chosen the !PageSwapCache test to
		 * optimize the common case, when the page is or is about to
		 * be freed: PageSwapCache is cleared (under spin_lock_irq)
		 * in the ref_freeze section of __remove_mapping(); but Anon
		 * page->mapping reset to NULL later, in free_pages_prepare().
		 */
		if (!PageSwapCache(page))
			goto stale;
		cpu_relax();
	}

	/* Re-check: the mapping may have been torn down while we spun. */
	if (READ_ONCE(page->mapping) != expected_mapping) {
		put_page(page);
		goto stale;
	}

	if (flags == GET_KSM_PAGE_TRYLOCK) {
		if (!trylock_page(page)) {
			put_page(page);
			return ERR_PTR(-EBUSY);
		}
	} else if (flags == GET_KSM_PAGE_LOCK)
		lock_page(page);

	/* With the page lock held, check the mapping one final time. */
	if (flags != GET_KSM_PAGE_NOLOCK) {
		if (READ_ONCE(page->mapping) != expected_mapping) {
			unlock_page(page);
			put_page(page);
			goto stale;
		}
	}
	return page;

stale:
	/*
	 * We come here from above when page->mapping or !PageSwapCache
	 * suggests that the node is stale; but it might be under migration.
	 * We need smp_rmb(), matching the smp_wmb() in folio_migrate_ksm(),
	 * before checking whether node->kpfn has been changed.
	 */
	smp_rmb();
	if (READ_ONCE(stable_node->kpfn) != kpfn)
		goto again;
	remove_node_from_stable_tree(stable_node);
	return NULL;
}
9922c653d0eSAndrea Arcangeli
9932c653d0eSAndrea Arcangeli /*
9942c653d0eSAndrea Arcangeli * Removing rmap_item from stable or unstable tree.
9952c653d0eSAndrea Arcangeli * This function will clean the information from the stable/unstable tree.
9962c653d0eSAndrea Arcangeli */
/*
 * Disconnect @rmap_item from whichever tree it is on: unhook it from its
 * stable node (fixing the shared/sharing counters and dropping the
 * anon_vma reference), or erase it from this scan's unstable rbtree.
 * The STABLE/UNSTABLE flag bits in ->address are cleared either way.
 */
static void remove_rmap_item_from_tree(struct ksm_rmap_item *rmap_item)
{
	if (rmap_item->address & STABLE_FLAG) {
		struct ksm_stable_node *stable_node;
		struct page *page;

		stable_node = rmap_item->head;
		page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
		if (!page)
			goto out;	/* node already torn down for us */

		hlist_del(&rmap_item->hlist);
		unlock_page(page);
		put_page(page);

		/* any items left on the hlist still count as "sharing" */
		if (!hlist_empty(&stable_node->hlist))
			ksm_pages_sharing--;
		else
			ksm_pages_shared--;

		rmap_item->mm->ksm_merging_pages--;

		VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
		stable_node->rmap_hlist_len--;

		put_anon_vma(rmap_item->anon_vma);
		rmap_item->head = NULL;
		rmap_item->address &= PAGE_MASK;

	} else if (rmap_item->address & UNSTABLE_FLAG) {
		unsigned char age;
		/*
		 * Usually ksmd can and must skip the rb_erase, because
		 * root_unstable_tree was already reset to RB_ROOT.
		 * But be careful when an mm is exiting: do the rb_erase
		 * if this rmap_item was inserted by this scan, rather
		 * than left over from before.
		 */
		age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
		BUG_ON(age > 1);
		if (!age)
			rb_erase(&rmap_item->node,
				 root_unstable_tree + NUMA(rmap_item->nid));
		ksm_pages_unshared--;
		rmap_item->address &= PAGE_MASK;
	}
out:
	cond_resched();		/* we're called from many long loops */
}
1046d8ed45c5SMichel Lespinasse
remove_trailing_rmap_items(struct ksm_rmap_item ** rmap_list)10476db504ceSLiam R. Howlett static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list)
10486db504ceSLiam R. Howlett {
10496db504ceSLiam R. Howlett while (*rmap_list) {
10506db504ceSLiam R. Howlett struct ksm_rmap_item *rmap_item = *rmap_list;
10516db504ceSLiam R. Howlett *rmap_list = rmap_item->rmap_list;
10529ba69294SHugh Dickins remove_rmap_item_from_tree(rmap_item);
10536db504ceSLiam R. Howlett free_rmap_item(rmap_item);
10546db504ceSLiam R. Howlett }
10556db504ceSLiam R. Howlett }
105631dbd01fSIzik Eidus
105731dbd01fSIzik Eidus /*
1058d952b791SHugh Dickins * Though it's very tempting to unmerge rmap_items from stable tree rather
105949b06385SSuren Baghdasaryan * than check every pte of a given vma, the locking doesn't quite work for
10609ba69294SHugh Dickins * that - an rmap_item is assigned to the stable tree after inserting ksm
10619ba69294SHugh Dickins * page and upping mmap_lock. Nor does it fit with the way we skip dup'ing
1062d952b791SHugh Dickins * rmap_items from parent to child at fork time (so as not to waste time
10639ba69294SHugh Dickins * if exit comes before the next scan reaches it).
10646db504ceSLiam R. Howlett *
1065420be4edSChengyang Fan * Similarly, although we'd like to remove rmap_items (so updating counts
1066d8ed45c5SMichel Lespinasse * and freeing memory) when unmerging an area, it's easier to leave that
106731dbd01fSIzik Eidus * to the next pass of ksmd - consider, for example, how ksmd might be
106831dbd01fSIzik Eidus * in cmp_and_merge_page on one of the rmap_items we would be removing.
106958730ab6SQi Zheng */
/*
 * Break COW on every KSM page mapped in [start, end) of @vma.
 * Returns 0 on success, -ERESTARTSYS if a signal is pending, or the
 * error from break_ksm(); bails out quietly if the mm is exiting.
 */
static int unmerge_ksm_pages(struct vm_area_struct *vma,
		unsigned long start, unsigned long end, bool lock_vma)
{
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		int err;

		if (ksm_test_exit(vma->vm_mm))
			break;
		if (signal_pending(current))
			return -ERESTARTSYS;
		err = break_ksm(vma, addr, lock_vma);
		if (err)
			return err;
	}
	return 0;
}
1086cbf86cfeSHugh Dickins
folio_stable_node(struct folio * folio)1087d952b791SHugh Dickins static inline struct ksm_stable_node *folio_stable_node(struct folio *folio)
10889ba69294SHugh Dickins {
10899ba69294SHugh Dickins return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL;
10909ba69294SHugh Dickins }
1091d8ed45c5SMichel Lespinasse
/* Page-based convenience wrapper around folio_stable_node(). */
static inline struct ksm_stable_node *page_stable_node(struct page *page)
{
	struct folio *folio = page_folio(page);

	return folio_stable_node(folio);
}
1096d952b791SHugh Dickins
/*
 * Point page->mapping at @stable_node, tagged with PAGE_MAPPING_KSM so
 * the page is recognized as a KSM page; pass NULL to clear the binding.
 */
static inline void set_page_stable_node(struct page *page,
					struct ksm_stable_node *stable_node)
{
	VM_BUG_ON_PAGE(PageAnon(page) && PageAnonExclusive(page), page);
	page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
}
110359e1a2f4STimofey Titovets
11049b04c5feSCong Wang #ifdef CONFIG_SYSFS
110531dbd01fSIzik Eidus /*
110631dbd01fSIzik Eidus * Only called through the sysfs control interface:
110731dbd01fSIzik Eidus */
/*
 * Try to remove @stable_node so that merge_across_nodes/max_page_sharing
 * may be switched.  Returns 0 on success, or -EBUSY if the node's page
 * is still mapped somewhere.
 */
static int remove_stable_node(struct ksm_stable_node *stable_node)
{
	struct page *page;
	int err;

	page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
	if (!page) {
		/*
		 * get_ksm_page did remove_node_from_stable_tree itself.
		 */
		return 0;
	}

	/*
	 * Page could be still mapped if this races with __mmput() running in
	 * between ksm_exit() and exit_mmap(). Just refuse to let
	 * merge_across_nodes/max_page_sharing be switched.
	 */
	err = -EBUSY;
	if (!page_mapped(page)) {
		/*
		 * The stable node did not yet appear stale to get_ksm_page(),
		 * since that allows for an unmapped ksm page to be recognized
		 * right up until it is freed; but the node is safe to remove.
		 * This page might be in an LRU cache waiting to be freed,
		 * or it might be PageSwapCache (perhaps under writeback),
		 * or it might have been removed from swapcache a moment ago.
		 */
		set_page_stable_node(page, NULL);
		remove_node_from_stable_tree(stable_node);
		err = 0;
	}

	unlock_page(page);
	put_page(page);
	return err;
}
114531dbd01fSIzik Eidus
/*
 * Remove a stable node which may be a chain head.  A plain (non-chain)
 * node is removed directly; for a chain, every dup is removed and then
 * the chain head itself is freed.  Returns true if any removal failed
 * (page still mapped), false once everything was freed.
 */
static int remove_stable_node_chain(struct ksm_stable_node *stable_node,
				    struct rb_root *root)
{
	struct ksm_stable_node *dup;
	struct hlist_node *hlist_safe;

	if (!is_stable_node_chain(stable_node)) {
		/* A non-chain node must not be marked as a dup. */
		VM_BUG_ON(is_stable_node_dup(stable_node));
		if (remove_stable_node(stable_node))
			return true;
		else
			return false;
	}

	/* _safe walk: remove_stable_node() unlinks dup from this hlist. */
	hlist_for_each_entry_safe(dup, hlist_safe,
				  &stable_node->hlist, hlist_dup) {
		VM_BUG_ON(!is_stable_node_dup(dup));
		if (remove_stable_node(dup))
			return true;
	}
	/* All dups gone: the chain head must be empty before freeing it. */
	BUG_ON(!hlist_empty(&stable_node->hlist));
	free_stable_node_chain(stable_node, root);
	return false;
}
11704e31635cSHugh Dickins
/*
 * Drain every per-NUMA-node stable tree and the migrate_nodes list.
 * Returns 0 if all nodes were removed, -EBUSY if any node's page was
 * still mapped; a busy node stops work on its tree but the remaining
 * trees and the migrate list are still processed.
 */
static int remove_all_stable_nodes(void)
{
	struct ksm_stable_node *stable_node, *next;
	int nid;
	int err = 0;

	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
		/* Repeatedly remove whatever is at the root until the tree is empty. */
		while (root_stable_tree[nid].rb_node) {
			stable_node = rb_entry(root_stable_tree[nid].rb_node,
					       struct ksm_stable_node, node);
			if (remove_stable_node_chain(stable_node,
						     root_stable_tree + nid)) {
				err = -EBUSY;
				break;	/* proceed to next nid */
			}
			cond_resched();
		}
	}
	/* _safe walk: remove_stable_node() unlinks the node from this list. */
	list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
		if (remove_stable_node(stable_node))
			err = -EBUSY;
		cond_resched();
	}
	return err;
}
119631dbd01fSIzik Eidus
/*
 * Unmerge every KSM page in every registered mm, free all rmap_items,
 * and then drain the stable trees.  Walks the ksm_mm_head list with
 * ksm_mmlist_lock, taking mmap_read_lock per mm; mms that are exiting
 * skip the unmerge pass and are torn down directly.  Returns 0 on
 * success, or the first unmerge error (the scan position is reset to
 * the list head in that case).
 */
static int unmerge_and_remove_all_rmap_items(void)
{
	struct ksm_mm_slot *mm_slot;
	struct mm_slot *slot;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	int err = 0;

	/* Start the scan cursor at the first real slot after the list head. */
	spin_lock(&ksm_mmlist_lock);
	slot = list_entry(ksm_mm_head.slot.mm_node.next,
			  struct mm_slot, mm_node);
	ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
	spin_unlock(&ksm_mmlist_lock);

	for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head;
	     mm_slot = ksm_scan.mm_slot) {
		VMA_ITERATOR(vmi, mm_slot->slot.mm, 0);

		mm = mm_slot->slot.mm;
		mmap_read_lock(mm);

		/*
		 * Exit right away if mm is exiting to avoid lockdep issue in
		 * the maple tree
		 */
		if (ksm_test_exit(mm))
			goto mm_exiting;

		for_each_vma(vmi, vma) {
			/* Only VMAs KSM actually merged into need unmerging. */
			if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
				continue;
			err = unmerge_ksm_pages(vma,
						vma->vm_start, vma->vm_end, false);
			if (err)
				goto error;
		}

mm_exiting:
		remove_trailing_rmap_items(&mm_slot->rmap_list);
		mmap_read_unlock(mm);

		/* Advance the cursor before possibly freeing this slot. */
		spin_lock(&ksm_mmlist_lock);
		slot = list_entry(mm_slot->slot.mm_node.next,
				  struct mm_slot, mm_node);
		ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
		if (ksm_test_exit(mm)) {
			/* mm is going away: unhash, unlink and drop our mm reference. */
			hash_del(&mm_slot->slot.hash);
			list_del(&mm_slot->slot.mm_node);
			spin_unlock(&ksm_mmlist_lock);

			mm_slot_free(mm_slot_cache, mm_slot);
			clear_bit(MMF_VM_MERGEABLE, &mm->flags);
			clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
			mmdrop(mm);
		} else
			spin_unlock(&ksm_mmlist_lock);
	}

	/* Clean up stable nodes, but don't worry if some are still busy */
	remove_all_stable_nodes();
	ksm_scan.seqnr = 0;
	return 0;

error:
	mmap_read_unlock(mm);
	/* Reset the scan so a later run starts from the beginning. */
	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = &ksm_mm_head;
	spin_unlock(&ksm_mmlist_lock);
	return err;
}
1267e86c59b1SClaudio Imbrenda #endif /* CONFIG_SYSFS */
126831dbd01fSIzik Eidus
calc_checksum(struct page * page)1269c33c7948SRyan Roberts static u32 calc_checksum(struct page *page)
12700f10851eSJérôme Glisse {
12710f10851eSJérôme Glisse u32 checksum;
12720f10851eSJérôme Glisse void *addr = kmap_local_page(page);
12730f10851eSJérôme Glisse checksum = xxhash(addr, PAGE_SIZE, 0);
1274ee65728eSMike Rapoport kunmap_local(addr);
12750f10851eSJérôme Glisse return checksum;
12760f10851eSJérôme Glisse }
1277e86c59b1SClaudio Imbrenda
/*
 * Write-protect and clean the pte mapping @page in @vma, so the page's
 * contents can no longer change underneath KSM.  On success, *orig_pte
 * receives the pte value now installed, for a later pte_same() check in
 * replace_page().  Returns 0 on success; -EFAULT if the page is not
 * mapped where expected, the walk fails, extra references indicate
 * concurrent I/O, or the anon-exclusive flag cannot be cleared.
 */
static int write_protect_page(struct vm_area_struct *vma, struct page *page,
			      pte_t *orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	DEFINE_PAGE_VMA_WALK(pvmw, page, vma, 0, 0);
	int swapped;
	int err = -EFAULT;
	struct mmu_notifier_range range;
	bool anon_exclusive;
	pte_t entry;

	pvmw.address = page_address_in_vma(page, vma);
	if (pvmw.address == -EFAULT)
		goto out;

	/* KSM only operates on small pages; THPs are split beforehand. */
	BUG_ON(PageTransCompound(page));

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, pvmw.address,
				pvmw.address + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	if (!page_vma_mapped_walk(&pvmw))
		goto out_mn;
	if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
		goto out_unlock;

	anon_exclusive = PageAnonExclusive(page);
	entry = ptep_get(pvmw.pte);
	if (pte_write(entry) || pte_dirty(entry) ||
	    anon_exclusive || mm_tlb_flush_pending(mm)) {
		swapped = PageSwapCache(page);
		flush_cache_page(vma, pvmw.address, page_to_pfn(page));
		/*
		 * Ok this is tricky, when get_user_pages_fast() run it doesn't
		 * take any lock, therefore the check that we are going to make
		 * with the pagecount against the mapcount is racy and
		 * O_DIRECT can happen right after the check.
		 * So we clear the pte and flush the tlb before the check
		 * this assure us that no O_DIRECT can happen after the check
		 * or in the middle of the check.
		 *
		 * No need to notify as we are downgrading page table to read
		 * only not changing it to point to a new page.
		 *
		 * See Documentation/mm/mmu_notifier.rst
		 */
		entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
		/*
		 * Check that no O_DIRECT or similar I/O is in progress on the
		 * page
		 */
		/* NOTE(review): the "+ 1" presumably accounts for the reference our caller holds — confirm against callers. */
		if (page_mapcount(page) + 1 + swapped != page_count(page)) {
			/* Raced with I/O: restore the pte and give up. */
			set_pte_at(mm, pvmw.address, pvmw.pte, entry);
			goto out_unlock;
		}

		/* See folio_try_share_anon_rmap_pte(): clear PTE first. */
		if (anon_exclusive &&
		    folio_try_share_anon_rmap_pte(page_folio(page), page)) {
			set_pte_at(mm, pvmw.address, pvmw.pte, entry);
			goto out_unlock;
		}

		/* Transfer dirtiness to the struct page before cleaning the pte. */
		if (pte_dirty(entry))
			set_page_dirty(page);
		entry = pte_mkclean(entry);

		if (pte_write(entry))
			entry = pte_wrprotect(entry);

		set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
	}
	*orig_pte = entry;
	err = 0;

out_unlock:
	page_vma_mapped_walk_done(&pvmw);
out_mn:
	mmu_notifier_invalidate_range_end(&range);
out:
	return err;
}
136031dbd01fSIzik Eidus
136131dbd01fSIzik Eidus /**
136281464e30SHugh Dickins * replace_page - replace page in vma by new ksm page
136381464e30SHugh Dickins * @vma: vma that holds the pte pointing to page
13648dd3557aSHugh Dickins * @page: the page we are replacing by kpage
13658dd3557aSHugh Dickins * @kpage: the ksm page we replace page by
136681464e30SHugh Dickins * @orig_pte: the original value of the pte
136721fbd591SQi Zheng *
13688dd3557aSHugh Dickins * Returns 0 on success, -EFAULT on failure.
136981464e30SHugh Dickins */
static int replace_page(struct vm_area_struct *vma, struct page *page,
			struct page *kpage, pte_t orig_pte)
{
	struct folio *kfolio = page_folio(kpage);
	struct mm_struct *mm = vma->vm_mm;
	struct folio *folio;
	pmd_t *pmd;
	pmd_t pmde;
	pte_t *ptep;
	pte_t newpte;
	spinlock_t *ptl;
	unsigned long addr;
	int err = -EFAULT;
	struct mmu_notifier_range range;

	addr = page_address_in_vma(page, vma);
	if (addr == -EFAULT)
		goto out;

	pmd = mm_find_pmd(mm, addr);
	if (!pmd)
		goto out;
	/*
	 * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
	 * without holding anon_vma lock for write.  So when looking for a
	 * genuine pmde (in which to find pte), test present and !THP together.
	 */
	pmde = pmdp_get_lockless(pmd);
	if (!pmd_present(pmde) || pmd_trans_huge(pmde))
		goto out;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
				addr + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!ptep)
		goto out_mn;
	/* The pte changed since write_protect_page() sampled it: give up. */
	if (!pte_same(ptep_get(ptep), orig_pte)) {
		pte_unmap_unlock(ptep, ptl);
		goto out_mn;
	}
	VM_BUG_ON_PAGE(PageAnonExclusive(page), page);
	VM_BUG_ON_FOLIO(folio_test_anon(kfolio) && PageAnonExclusive(kpage),
			kfolio);

	/*
	 * No need to check ksm_use_zero_pages here: we can only have a
	 * zero_page here if ksm_use_zero_pages was enabled already.
	 */
	if (!is_zero_pfn(page_to_pfn(kpage))) {
		/* Take a reference and an rmap mapping on the ksm page first. */
		folio_get(kfolio);
		folio_add_anon_rmap_pte(kfolio, kpage, vma, addr, RMAP_NONE);
		newpte = mk_pte(kpage, vma->vm_page_prot);
	} else {
		/*
		 * Use pte_mkdirty to mark the zero page mapped by KSM, and then
		 * we can easily track all KSM-placed zero pages by checking if
		 * the dirty bit in zero page's PTE is set.
		 */
		newpte = pte_mkdirty(pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot)));
		ksm_zero_pages++;
		mm->ksm_zero_pages++;
		/*
		 * We're replacing an anonymous page with a zero page, which is
		 * not anonymous. We need to do proper accounting otherwise we
		 * will get wrong values in /proc, and a BUG message in dmesg
		 * when tearing down the mm.
		 */
		dec_mm_counter(mm, MM_ANONPAGES);
	}

	flush_cache_page(vma, addr, pte_pfn(ptep_get(ptep)));
	/*
	 * No need to notify as we are replacing a read only page with another
	 * read only page with the same content.
	 *
	 * See Documentation/mm/mmu_notifier.rst
	 */
	ptep_clear_flush(vma, addr, ptep);
	set_pte_at_notify(mm, addr, ptep, newpte);

	/* Unmap the old page and drop the reference this mapping held. */
	folio = page_folio(page);
	folio_remove_rmap_pte(folio, page, vma);
	if (!folio_mapped(folio))
		folio_free_swap(folio);
	folio_put(folio);

	pte_unmap_unlock(ptep, ptl);
	err = 0;
out_mn:
	mmu_notifier_invalidate_range_end(&range);
out:
	return err;
}
14652c653d0eSAndrea Arcangeli
14662c653d0eSAndrea Arcangeli /*
14672c653d0eSAndrea Arcangeli * try_to_merge_one_page - take two pages and merge them into one
14682c653d0eSAndrea Arcangeli * @vma: the vma that holds the pte pointing to page
14692c653d0eSAndrea Arcangeli * @page: the PageAnon page that we want to replace with kpage
14702c653d0eSAndrea Arcangeli * @kpage: the PageKsm page that we want to map instead of page,
14712c653d0eSAndrea Arcangeli * or NULL the first time when we want to use page as kpage.
14722c653d0eSAndrea Arcangeli *
14732c653d0eSAndrea Arcangeli * This function returns 0 if the pages were merged, -EFAULT otherwise.
14742c653d0eSAndrea Arcangeli */
static int try_to_merge_one_page(struct vm_area_struct *vma,
				 struct page *page, struct page *kpage)
{
	pte_t orig_pte = __pte(0);
	int err = -EFAULT;

	if (page == kpage)			/* ksm page forked */
		return 0;

	/* Only anonymous pages can be merged by KSM. */
	if (!PageAnon(page))
		goto out;

	/*
	 * We need the page lock to read a stable PageSwapCache in
	 * write_protect_page().  We use trylock_page() instead of
	 * lock_page() because we don't want to wait here - we
	 * prefer to continue scanning and merging different pages,
	 * then come back to this page when it is unlocked.
	 */
	if (!trylock_page(page))
		goto out;

	/* KSM works on small pages only: split a THP or bail out. */
	if (PageTransCompound(page)) {
		if (split_huge_page(page))
			goto out_unlock;
	}

	/*
	 * If this anonymous page is mapped only here, its pte may need
	 * to be write-protected.  If it's mapped elsewhere, all of its
	 * ptes are necessarily already write-protected.  But in either
	 * case, we need to lock and check page_count is not raised.
	 */
	if (write_protect_page(vma, page, &orig_pte) == 0) {
		if (!kpage) {
			/*
			 * While we hold page lock, upgrade page from
			 * PageAnon+anon_vma to PageKsm+NULL stable_node:
			 * stable_tree_insert() will update stable_node.
			 */
			set_page_stable_node(page, NULL);
			mark_page_accessed(page);
			/*
			 * Page reclaim just frees a clean page with no dirty
			 * ptes: make sure that the ksm page would be swapped.
			 */
			if (!PageDirty(page))
				SetPageDirty(page);
			err = 0;
		} else if (pages_identical(page, kpage))
			/* Contents match: point the pte at kpage instead. */
			err = replace_page(vma, page, kpage, orig_pte);
	}

out_unlock:
	unlock_page(page);
out:
	return err;
}
1533f0953a1bSIngo Molnar
15340ba1d0f7SAndrea Arcangeli /*
15350ba1d0f7SAndrea Arcangeli * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
15360ba1d0f7SAndrea Arcangeli * but no new kernel page is allocated: kpage must already be a ksm page.
15370ba1d0f7SAndrea Arcangeli *
15380ba1d0f7SAndrea Arcangeli * This function returns 0 if the pages were merged, -EFAULT otherwise.
153980b18dfaSAndrea Arcangeli */
static int try_to_merge_with_ksm_page(struct ksm_rmap_item *rmap_item,
				      struct page *page, struct page *kpage)
{
	struct mm_struct *mm = rmap_item->mm;
	struct vm_area_struct *vma;
	int err = -EFAULT;

	mmap_read_lock(mm);
	/* The vma may have been unmapped or made unmergeable since the scan. */
	vma = find_mergeable_vma(mm, rmap_item->address);
	if (!vma)
		goto out;

	err = try_to_merge_one_page(vma, page, kpage);
	if (err)
		goto out;

	/* Unstable nid is in union with stable anon_vma: remove first */
	remove_rmap_item_from_tree(rmap_item);

	/* Must get reference to anon_vma while still holding mmap_lock */
	rmap_item->anon_vma = vma->anon_vma;
	get_anon_vma(vma->anon_vma);
out:
	mmap_read_unlock(mm);
	/* Tracepoint fires on both success and failure (err carries the result). */
	trace_ksm_merge_with_ksm_page(kpage, page_to_pfn(kpage ? kpage : page),
				      rmap_item, mm, err);
	return err;
}
15682c653d0eSAndrea Arcangeli
15692c653d0eSAndrea Arcangeli /*
15702c653d0eSAndrea Arcangeli * try_to_merge_two_pages - take two identical pages and prepare them
15712c653d0eSAndrea Arcangeli * to be merged into one page.
15722c653d0eSAndrea Arcangeli *
15732c653d0eSAndrea Arcangeli * This function returns the kpage if we successfully merged two identical
15742c653d0eSAndrea Arcangeli * pages into one ksm page, NULL otherwise.
15752c653d0eSAndrea Arcangeli *
15762c653d0eSAndrea Arcangeli * Note that this function upgrades page to ksm page: if one of the pages
15772c653d0eSAndrea Arcangeli * is already a ksm page, try_to_merge_with_ksm_page should be used.
15782c653d0eSAndrea Arcangeli */
try_to_merge_two_pages(struct ksm_rmap_item * rmap_item,struct page * page,struct ksm_rmap_item * tree_rmap_item,struct page * tree_page)15798dc5ffcdSAndrea Arcangeli static struct page *try_to_merge_two_pages(struct ksm_rmap_item *rmap_item,
15808dc5ffcdSAndrea Arcangeli struct page *page,
15818dc5ffcdSAndrea Arcangeli struct ksm_rmap_item *tree_rmap_item,
15828dc5ffcdSAndrea Arcangeli struct page *tree_page)
15838dc5ffcdSAndrea Arcangeli {
15848dc5ffcdSAndrea Arcangeli int err;
15858dc5ffcdSAndrea Arcangeli
15868dc5ffcdSAndrea Arcangeli err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
15878dc5ffcdSAndrea Arcangeli if (!err) {
15888dc5ffcdSAndrea Arcangeli err = try_to_merge_with_ksm_page(tree_rmap_item,
15898dc5ffcdSAndrea Arcangeli tree_page, page);
15908dc5ffcdSAndrea Arcangeli /*
15918dc5ffcdSAndrea Arcangeli * If that fails, we have a ksm page with only one pte
15928dc5ffcdSAndrea Arcangeli * pointing to it: so break it.
159321fbd591SQi Zheng */
159421fbd591SQi Zheng if (err)
15952c653d0eSAndrea Arcangeli break_cow(rmap_item);
15962c653d0eSAndrea Arcangeli }
15972c653d0eSAndrea Arcangeli return err ? NULL : page;
159821fbd591SQi Zheng }
15992c653d0eSAndrea Arcangeli
16002c653d0eSAndrea Arcangeli static __always_inline
__is_page_sharing_candidate(struct ksm_stable_node * stable_node,int offset)16018dc5ffcdSAndrea Arcangeli bool __is_page_sharing_candidate(struct ksm_stable_node *stable_node, int offset)
16022cee57d1SYang Shi {
16032c653d0eSAndrea Arcangeli VM_BUG_ON(stable_node->rmap_hlist_len < 0);
16048dc5ffcdSAndrea Arcangeli /*
16058dc5ffcdSAndrea Arcangeli * Check that at least one mapping still exists, otherwise
16068dc5ffcdSAndrea Arcangeli * there's no much point to merge and share with this
16078dc5ffcdSAndrea Arcangeli * stable_node, as the underlying tree_page of the other
16088dc5ffcdSAndrea Arcangeli * sharer is going to be freed soon.
16092c653d0eSAndrea Arcangeli */
16102c653d0eSAndrea Arcangeli return stable_node->rmap_hlist_len &&
16118dc5ffcdSAndrea Arcangeli stable_node->rmap_hlist_len + offset < ksm_max_page_sharing;
16122c653d0eSAndrea Arcangeli }
16132c653d0eSAndrea Arcangeli
16142c653d0eSAndrea Arcangeli static __always_inline
is_page_sharing_candidate(struct ksm_stable_node * stable_node)161521fbd591SQi Zheng bool is_page_sharing_candidate(struct ksm_stable_node *stable_node)
161621fbd591SQi Zheng {
16172c653d0eSAndrea Arcangeli return __is_page_sharing_candidate(stable_node, 0);
16182c653d0eSAndrea Arcangeli }
16198dc5ffcdSAndrea Arcangeli
/*
 * Walk a stable_node chain and pick the best dup to merge with: the one
 * with the most sharers that can still accept another merge.  Returns
 * the chosen dup's page (with a reference held) or NULL, and stores the
 * dup in *_stable_node_dup.  When prune_stale_stable_nodes is set and
 * the prune period has elapsed, stale dups are dropped along the way and
 * a chain that collapses to a single entry is replaced by that entry in
 * the rbtree (updating *_stable_node for the caller).
 */
static struct page *stable_node_dup(struct ksm_stable_node **_stable_node_dup,
				    struct ksm_stable_node **_stable_node,
				    struct rb_root *root,
				    bool prune_stale_stable_nodes)
{
	struct ksm_stable_node *dup, *found = NULL, *stable_node = *_stable_node;
	struct hlist_node *hlist_safe;
	struct page *_tree_page, *tree_page = NULL;
	int nr = 0;
	int found_rmap_hlist_len;

	/* Rate-limit full pruning walks to the configured prune period. */
	if (!prune_stale_stable_nodes ||
	    time_before(jiffies, stable_node->chain_prune_time +
			msecs_to_jiffies(
				ksm_stable_node_chains_prune_millisecs)))
		prune_stale_stable_nodes = false;
	else
		stable_node->chain_prune_time = jiffies;

	hlist_for_each_entry_safe(dup, hlist_safe,
				  &stable_node->hlist, hlist_dup) {
		cond_resched();
		/*
		 * We must walk all stable_node_dup to prune the stale
		 * stable nodes during lookup.
		 *
		 * get_ksm_page can drop the nodes from the
		 * stable_node->hlist if they point to freed pages
		 * (that's why we do a _safe walk). The "dup"
		 * stable_node parameter itself will be freed from
		 * under us if it returns NULL.
		 */
		_tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK);
		if (!_tree_page)
			continue;
		nr += 1;
		if (is_page_sharing_candidate(dup)) {
			/* Prefer the dup with the most existing sharers. */
			if (!found ||
			    dup->rmap_hlist_len > found_rmap_hlist_len) {
				if (found)
					put_page(tree_page);	/* drop previous best */
				found = dup;
				found_rmap_hlist_len = found->rmap_hlist_len;
				tree_page = _tree_page;

				/* skip put_page for found dup */
				if (!prune_stale_stable_nodes)
					break;
				continue;
			}
		}
		put_page(_tree_page);
	}

	if (found) {
		/*
		 * nr is counting all dups in the chain only if
		 * prune_stale_stable_nodes is true, otherwise we may
		 * break the loop at nr == 1 even if there are
		 * multiple entries.
		 */
		if (prune_stale_stable_nodes && nr == 1) {
			/*
			 * If there's not just one entry it would
			 * corrupt memory, better BUG_ON. In KSM
			 * context with no lock held it's not even
			 * fatal.
			 */
			BUG_ON(stable_node->hlist.first->next);

			/*
			 * There's just one entry and it is below the
			 * deduplication limit so drop the chain.
			 */
			rb_replace_node(&stable_node->node, &found->node,
					root);
			free_stable_node(stable_node);
			ksm_stable_node_chains--;
			ksm_stable_node_dups--;
			/*
			 * NOTE: the caller depends on the stable_node
			 * to be equal to stable_node_dup if the chain
			 * was collapsed.
			 */
			*_stable_node = found;
			/*
			 * Just for robustness, as stable_node is
			 * otherwise left as a stable pointer, the
			 * compiler shall optimize it away at build
			 * time.
			 */
			stable_node = NULL;
		} else if (stable_node->hlist.first != &found->hlist_dup &&
			   __is_page_sharing_candidate(found, 1)) {
			/*
			 * If the found stable_node dup can accept one
			 * more future merge (in addition to the one
			 * that is underway) and is not at the head of
			 * the chain, put it there so next search will
			 * be quicker in the !prune_stale_stable_nodes
			 * case.
			 *
			 * NOTE: it would be inaccurate to use nr > 1
			 * instead of checking the hlist.first pointer
			 * directly, because in the
			 * prune_stale_stable_nodes case "nr" isn't
			 * the position of the found dup in the chain,
			 * but the total number of dups in the chain.
			 */
			hlist_del(&found->hlist_dup);
			hlist_add_head(&found->hlist_dup,
				       &stable_node->hlist);
		}
	}

	*_stable_node_dup = found;
	return tree_page;
}
17382c653d0eSAndrea Arcangeli
stable_node_dup_any(struct ksm_stable_node * stable_node,struct rb_root * root)17392c653d0eSAndrea Arcangeli static struct ksm_stable_node *stable_node_dup_any(struct ksm_stable_node *stable_node,
17402c653d0eSAndrea Arcangeli struct rb_root *root)
17412c653d0eSAndrea Arcangeli {
17422c653d0eSAndrea Arcangeli if (!is_stable_node_chain(stable_node))
17432c653d0eSAndrea Arcangeli return stable_node;
17442c653d0eSAndrea Arcangeli if (hlist_empty(&stable_node->hlist)) {
17452c653d0eSAndrea Arcangeli free_stable_node_chain(stable_node, root);
17462c653d0eSAndrea Arcangeli return NULL;
17472c653d0eSAndrea Arcangeli }
17482c653d0eSAndrea Arcangeli return hlist_entry(stable_node->hlist.first,
17492c653d0eSAndrea Arcangeli typeof(*stable_node), hlist_dup);
17502c653d0eSAndrea Arcangeli }
17512c653d0eSAndrea Arcangeli
17522c653d0eSAndrea Arcangeli /*
17532c653d0eSAndrea Arcangeli * Like for get_ksm_page, this function can free the *_stable_node and
17542c653d0eSAndrea Arcangeli * *_stable_node_dup if the returned tree_page is NULL.
17552c653d0eSAndrea Arcangeli *
17562c653d0eSAndrea Arcangeli * It can also free and overwrite *_stable_node with the found
17572c653d0eSAndrea Arcangeli * stable_node_dup if the chain is collapsed (in which case
17582c653d0eSAndrea Arcangeli * *_stable_node will be equal to *_stable_node_dup like if the chain
17592c653d0eSAndrea Arcangeli * never existed). It's up to the caller to verify tree_page is not
17602c653d0eSAndrea Arcangeli * NULL before dereferencing *_stable_node or *_stable_node_dup.
1761c8d6553bSHugh Dickins *
1762c8d6553bSHugh Dickins * *_stable_node_dup is really a second output parameter of this
1763c8d6553bSHugh Dickins * function and will be overwritten in all cases, the caller doesn't
1764c8d6553bSHugh Dickins * need to initialize it.
1765c8d6553bSHugh Dickins */
__stable_node_chain(struct ksm_stable_node ** _stable_node_dup,struct ksm_stable_node ** _stable_node,struct rb_root * root,bool prune_stale_stable_nodes)1766c8d6553bSHugh Dickins static struct page *__stable_node_chain(struct ksm_stable_node **_stable_node_dup,
1767c8d6553bSHugh Dickins struct ksm_stable_node **_stable_node,
17682cee57d1SYang Shi struct rb_root *root,
17692cee57d1SYang Shi bool prune_stale_stable_nodes)
17702cee57d1SYang Shi {
17712cee57d1SYang Shi struct ksm_stable_node *stable_node = *_stable_node;
17722cee57d1SYang Shi if (!is_stable_node_chain(stable_node)) {
17732cee57d1SYang Shi if (is_page_sharing_candidate(stable_node)) {
17742c653d0eSAndrea Arcangeli *_stable_node_dup = stable_node;
17752c653d0eSAndrea Arcangeli return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK);
17762c653d0eSAndrea Arcangeli }
17772c653d0eSAndrea Arcangeli /*
17782c653d0eSAndrea Arcangeli * _stable_node_dup set to NULL means the stable_node
17792c653d0eSAndrea Arcangeli * reached the ksm_max_page_sharing limit.
1780c8d6553bSHugh Dickins */
17812c653d0eSAndrea Arcangeli *_stable_node_dup = NULL;
17822c653d0eSAndrea Arcangeli return NULL;
17832c653d0eSAndrea Arcangeli }
17844146d2d6SHugh Dickins return stable_node_dup(_stable_node_dup, _stable_node, root,
17854146d2d6SHugh Dickins prune_stale_stable_nodes);
17864146d2d6SHugh Dickins }
178762b61f61SHugh Dickins
chain_prune(struct ksm_stable_node ** s_n_d,struct ksm_stable_node ** s_n,struct rb_root * root)178831dbd01fSIzik Eidus static __always_inline struct page *chain_prune(struct ksm_stable_node **s_n_d,
1789c8d6553bSHugh Dickins struct ksm_stable_node **s_n,
179031dbd01fSIzik Eidus struct rb_root *root)
17914146d2d6SHugh Dickins {
179231dbd01fSIzik Eidus return __stable_node_chain(s_n_d, s_n, root, true);
17934146d2d6SHugh Dickins }
17944146d2d6SHugh Dickins
chain(struct ksm_stable_node ** s_n_d,struct ksm_stable_node * s_n,struct rb_root * root)17954146d2d6SHugh Dickins static __always_inline struct page *chain(struct ksm_stable_node **s_n_d,
17964146d2d6SHugh Dickins struct ksm_stable_node *s_n,
1797ef53d16cSHugh Dickins struct rb_root *root)
17982c653d0eSAndrea Arcangeli {
17992c653d0eSAndrea Arcangeli struct ksm_stable_node *old_stable_node = s_n;
18004146d2d6SHugh Dickins struct page *tree_page;
18014146d2d6SHugh Dickins
18022c653d0eSAndrea Arcangeli tree_page = __stable_node_chain(s_n_d, &s_n, root, false);
18032c653d0eSAndrea Arcangeli /* not pruning dups so s_n cannot have changed */
18044146d2d6SHugh Dickins VM_BUG_ON(s_n != old_stable_node);
18054146d2d6SHugh Dickins return tree_page;
1806b4fecc67SAndrea Arcangeli }
1807b4fecc67SAndrea Arcangeli
/*
 * stable_tree_search - search for page inside the stable tree
 *
 * This function checks if there is a page inside the stable tree
 * with identical content to the page that we are scanning right now.
 *
 * This function returns the stable tree node of identical content if found,
 * NULL otherwise.
 *
 * @page: the page being scanned.  May itself already be a KSM page
 * (forked mapping), or a migrated stable node parked on migrate_nodes.
 */
static struct page *stable_tree_search(struct page *page)
{
	int nid;
	struct rb_root *root;
	struct rb_node **new;
	struct rb_node *parent;
	struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any;
	struct ksm_stable_node *page_node;

	page_node = page_stable_node(page);
	if (page_node && page_node->head != &migrate_nodes) {
		/* ksm page forked */
		get_page(page);
		return page;
	}

	nid = get_kpfn_nid(page_to_pfn(page));
	root = root_stable_tree + nid;
again:
	/* Restart point: the rbtree may be rebalanced under us by rb_erase. */
	new = &root->rb_node;
	parent = NULL;

	while (*new) {
		struct page *tree_page;
		int ret;

		cond_resched();
		stable_node = rb_entry(*new, struct ksm_stable_node, node);
		stable_node_any = NULL;
		tree_page = chain_prune(&stable_node_dup, &stable_node, root);
		/*
		 * NOTE: stable_node may have been freed by
		 * chain_prune() if the returned stable_node_dup is
		 * not NULL. stable_node_dup may have been inserted in
		 * the rbtree instead as a regular stable_node (in
		 * order to collapse the stable_node chain if a single
		 * stable_node dup was found in it). In such case the
		 * stable_node is overwritten by the callee to point
		 * to the stable_node_dup that was collapsed in the
		 * stable rbtree and stable_node will be equal to
		 * stable_node_dup like if the chain never existed.
		 */
		if (!stable_node_dup) {
			/*
			 * Either all stable_node dups were full in
			 * this stable_node chain, or this chain was
			 * empty and should be rb_erased.
			 */
			stable_node_any = stable_node_dup_any(stable_node,
							      root);
			if (!stable_node_any) {
				/* rb_erase just run */
				goto again;
			}
			/*
			 * Take any of the stable_node dups page of
			 * this stable_node chain to let the tree walk
			 * continue. All KSM pages belonging to the
			 * stable_node dups in a stable_node chain
			 * have the same content and they're
			 * write protected at all times. Any will work
			 * fine to continue the walk.
			 */
			tree_page = get_ksm_page(stable_node_any,
						 GET_KSM_PAGE_NOLOCK);
		}
		/* Exactly one of stable_node_dup / stable_node_any is set. */
		VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
		if (!tree_page) {
			/*
			 * If we walked over a stale stable_node,
			 * get_ksm_page() will call rb_erase() and it
			 * may rebalance the tree from under us. So
			 * restart the search from scratch. Returning
			 * NULL would be safe too, but we'd generate
			 * false negative insertions just because some
			 * stable_node was stale.
			 */
			goto again;
		}

		ret = memcmp_pages(page, tree_page);
		put_page(tree_page);

		parent = *new;
		if (ret < 0)
			new = &parent->rb_left;
		else if (ret > 0)
			new = &parent->rb_right;
		else {
			/* Content match found at this tree position. */
			if (page_node) {
				VM_BUG_ON(page_node->head != &migrate_nodes);
				/*
				 * Test if the migrated page should be merged
				 * into a stable node dup. If the mapcount is
				 * 1 we can migrate it with another KSM page
				 * without adding it to the chain.
				 */
				if (page_mapcount(page) > 1)
					goto chain_append;
			}

			if (!stable_node_dup) {
				/*
				 * If the stable_node is a chain and
				 * we got a payload match in memcmp
				 * but we cannot merge the scanned
				 * page in any of the existing
				 * stable_node dups because they're
				 * all full, we need to wait the
				 * scanned page to find itself a match
				 * in the unstable tree to create a
				 * brand new KSM page to add later to
				 * the dups of this stable_node.
				 */
				return NULL;
			}

			/*
			 * Lock and unlock the stable_node's page (which
			 * might already have been migrated) so that page
			 * migration is sure to notice its raised count.
			 * It would be more elegant to return stable_node
			 * than kpage, but that involves more changes.
			 */
			tree_page = get_ksm_page(stable_node_dup,
						 GET_KSM_PAGE_TRYLOCK);

			if (PTR_ERR(tree_page) == -EBUSY)
				return ERR_PTR(-EBUSY);

			if (unlikely(!tree_page))
				/*
				 * The tree may have been rebalanced,
				 * so re-evaluate parent and new.
				 */
				goto again;
			unlock_page(tree_page);

			/* A dup that migrated cross-node must be moved. */
			if (get_kpfn_nid(stable_node_dup->kpfn) !=
			    NUMA(stable_node_dup->nid)) {
				put_page(tree_page);
				goto replace;
			}
			return tree_page;
		}
	}

	/* No content match: nothing to do unless a migrated node waits. */
	if (!page_node)
		return NULL;

	/* Re-insert the migrated stable node at the position just found. */
	list_del(&page_node->list);
	DO_NUMA(page_node->nid = nid);
	rb_link_node(&page_node->node, parent, new);
	rb_insert_color(&page_node->node, root);
out:
	if (is_page_sharing_candidate(page_node)) {
		get_page(page);
		return page;
	} else
		return NULL;

replace:
	/*
	 * If stable_node was a chain and chain_prune collapsed it,
	 * stable_node has been updated to be the new regular
	 * stable_node. A collapse of the chain is indistinguishable
	 * from the case there was no chain in the stable
	 * rbtree. Otherwise stable_node is the chain and
	 * stable_node_dup is the dup to replace.
	 */
	if (stable_node_dup == stable_node) {
		VM_BUG_ON(is_stable_node_chain(stable_node_dup));
		VM_BUG_ON(is_stable_node_dup(stable_node_dup));
		/* there is no chain */
		if (page_node) {
			VM_BUG_ON(page_node->head != &migrate_nodes);
			list_del(&page_node->list);
			DO_NUMA(page_node->nid = nid);
			rb_replace_node(&stable_node_dup->node,
					&page_node->node,
					root);
			if (is_page_sharing_candidate(page_node))
				get_page(page);
			else
				page = NULL;
		} else {
			rb_erase(&stable_node_dup->node, root);
			page = NULL;
		}
	} else {
		VM_BUG_ON(!is_stable_node_chain(stable_node));
		__stable_node_dup_del(stable_node_dup);
		if (page_node) {
			VM_BUG_ON(page_node->head != &migrate_nodes);
			list_del(&page_node->list);
			DO_NUMA(page_node->nid = nid);
			stable_node_chain_add_dup(page_node, stable_node);
			if (is_page_sharing_candidate(page_node))
				get_page(page);
			else
				page = NULL;
		} else {
			page = NULL;
		}
	}
	/* Park the replaced dup on migrate_nodes for later reprocessing. */
	stable_node_dup->head = &migrate_nodes;
	list_add(&stable_node_dup->list, stable_node_dup->head);
	return page;

chain_append:
	/* stable_node_dup could be null if it reached the limit */
	if (!stable_node_dup)
		stable_node_dup = stable_node_any;
	/*
	 * If stable_node was a chain and chain_prune collapsed it,
	 * stable_node has been updated to be the new regular
	 * stable_node. A collapse of the chain is indistinguishable
	 * from the case there was no chain in the stable
	 * rbtree. Otherwise stable_node is the chain and
	 * stable_node_dup is the dup to replace.
	 */
	if (stable_node_dup == stable_node) {
		VM_BUG_ON(is_stable_node_dup(stable_node_dup));
		/* chain is missing so create it */
		stable_node = alloc_stable_node_chain(stable_node_dup,
						      root);
		if (!stable_node)
			return NULL;
	}
	/*
	 * Add this stable_node dup that was
	 * migrated to the stable_node chain
	 * of the current nid for this page
	 * content.
	 */
	VM_BUG_ON(!is_stable_node_dup(stable_node_dup));
	VM_BUG_ON(page_node->head != &migrate_nodes);
	list_del(&page_node->list);
	DO_NUMA(page_node->nid = nid);
	stable_node_chain_add_dup(page_node, stable_node);
	goto out;
}
2059b599cbdfSHugh Dickins
/*
 * stable_tree_insert - insert stable tree node pointing to new ksm page
 * into the stable tree.
 *
 * This function returns the stable tree node just allocated on success,
 * NULL otherwise.
 *
 * @kpage: the freshly merged, write-protected KSM page to index.
 *
 * If a node of identical content already exists, the new node is added
 * as a dup on that node's chain (creating the chain if necessary)
 * instead of being linked into the rbtree directly.
 */
static struct ksm_stable_node *stable_tree_insert(struct page *kpage)
{
	int nid;
	unsigned long kpfn;
	struct rb_root *root;
	struct rb_node **new;
	struct rb_node *parent;
	struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any;
	bool need_chain = false;

	kpfn = page_to_pfn(kpage);
	nid = get_kpfn_nid(kpfn);
	root = root_stable_tree + nid;
again:
	/* Restart point: stale-node removal may rebalance the rbtree. */
	parent = NULL;
	new = &root->rb_node;

	while (*new) {
		struct page *tree_page;
		int ret;

		cond_resched();
		stable_node = rb_entry(*new, struct ksm_stable_node, node);
		stable_node_any = NULL;
		tree_page = chain(&stable_node_dup, stable_node, root);
		if (!stable_node_dup) {
			/*
			 * Either all stable_node dups were full in
			 * this stable_node chain, or this chain was
			 * empty and should be rb_erased.
			 */
			stable_node_any = stable_node_dup_any(stable_node,
							      root);
			if (!stable_node_any) {
				/* rb_erase just run */
				goto again;
			}
			/*
			 * Take any of the stable_node dups page of
			 * this stable_node chain to let the tree walk
			 * continue. All KSM pages belonging to the
			 * stable_node dups in a stable_node chain
			 * have the same content and they're
			 * write protected at all times. Any will work
			 * fine to continue the walk.
			 */
			tree_page = get_ksm_page(stable_node_any,
						 GET_KSM_PAGE_NOLOCK);
		}
		/* Exactly one of stable_node_dup / stable_node_any is set. */
		VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
		if (!tree_page) {
			/*
			 * If we walked over a stale stable_node,
			 * get_ksm_page() will call rb_erase() and it
			 * may rebalance the tree from under us. So
			 * restart the search from scratch. Returning
			 * NULL would be safe too, but we'd generate
			 * false negative insertions just because some
			 * stable_node was stale.
			 */
			goto again;
		}

		ret = memcmp_pages(kpage, tree_page);
		put_page(tree_page);

		parent = *new;
		if (ret < 0)
			new = &parent->rb_left;
		else if (ret > 0)
			new = &parent->rb_right;
		else {
			/* Identical content already indexed: chain onto it. */
			need_chain = true;
			break;
		}
	}

	stable_node_dup = alloc_stable_node();
	if (!stable_node_dup)
		return NULL;

	INIT_HLIST_HEAD(&stable_node_dup->hlist);
	stable_node_dup->kpfn = kpfn;
	set_page_stable_node(kpage, stable_node_dup);
	stable_node_dup->rmap_hlist_len = 0;
	DO_NUMA(stable_node_dup->nid = nid);
	if (!need_chain) {
		/* Unique content: link the new node into the rbtree. */
		rb_link_node(&stable_node_dup->node, parent, new);
		rb_insert_color(&stable_node_dup->node, root);
	} else {
		if (!is_stable_node_chain(stable_node)) {
			struct ksm_stable_node *orig = stable_node;
			/* chain is missing so create it */
			stable_node = alloc_stable_node_chain(orig, root);
			if (!stable_node) {
				free_stable_node(stable_node_dup);
				return NULL;
			}
		}
		stable_node_chain_add_dup(stable_node_dup, stable_node);
	}

	return stable_node_dup;
}
21712cee57d1SYang Shi
217208beca44SHugh Dickins /*
217331dbd01fSIzik Eidus * unstable_tree_search_insert - search for identical page,
217431dbd01fSIzik Eidus * else insert rmap_item into the unstable tree.
217531dbd01fSIzik Eidus *
217631dbd01fSIzik Eidus * This function searches for a page in the unstable tree identical to the
217731dbd01fSIzik Eidus * page currently being scanned; and if no identical page is found in the
21785ad64688SHugh Dickins * tree, we insert rmap_item as a new object into the unstable tree.
21792c653d0eSAndrea Arcangeli *
21802c653d0eSAndrea Arcangeli * This function returns pointer to rmap_item found to be identical
21815ad64688SHugh Dickins * to the currently scanned page, NULL otherwise.
218231dbd01fSIzik Eidus *
21838dd3557aSHugh Dickins * This function does both searching and inserting, because they share
218431dbd01fSIzik Eidus * the same walking algorithm in an rbtree.
218531dbd01fSIzik Eidus */
218631dbd01fSIzik Eidus static
unstable_tree_search_insert(struct ksm_rmap_item * rmap_item,struct page * page,struct page ** tree_pagep)218731dbd01fSIzik Eidus struct ksm_rmap_item *unstable_tree_search_insert(struct ksm_rmap_item *rmap_item,
21884035c07aSHugh Dickins struct page *page,
21894035c07aSHugh Dickins struct page **tree_pagep)
21904035c07aSHugh Dickins {
21914035c07aSHugh Dickins struct rb_node **new;
219231dbd01fSIzik Eidus struct rb_root *root;
219331dbd01fSIzik Eidus struct rb_node *parent = NULL;
219431dbd01fSIzik Eidus int nid;
219531dbd01fSIzik Eidus
219631dbd01fSIzik Eidus nid = get_kpfn_nid(page_to_pfn(page));
219731dbd01fSIzik Eidus root = root_unstable_tree + nid;
219831dbd01fSIzik Eidus new = &root->rb_node;
2199e86c59b1SClaudio Imbrenda
2200e86c59b1SClaudio Imbrenda while (*new) {
2201e86c59b1SClaudio Imbrenda struct ksm_rmap_item *tree_rmap_item;
2202e86c59b1SClaudio Imbrenda struct page *tree_page;
2203e86c59b1SClaudio Imbrenda int ret;
2204e86c59b1SClaudio Imbrenda
2205e86c59b1SClaudio Imbrenda cond_resched();
2206d8ed45c5SMichel Lespinasse tree_rmap_item = rb_entry(*new, struct ksm_rmap_item, node);
22074b22927fSKirill Tkhai tree_page = get_mergeable_page(tree_rmap_item);
220856df70a6SMuchun Song if (!tree_page)
2209e86c59b1SClaudio Imbrenda return NULL;
2210e86c59b1SClaudio Imbrenda
2211739100c8SStefan Roesch /*
2212739100c8SStefan Roesch * Don't substitute a ksm page for a forked page.
2213739100c8SStefan Roesch */
221456df70a6SMuchun Song if (page == tree_page) {
221556df70a6SMuchun Song put_page(tree_page);
221656df70a6SMuchun Song return NULL;
221756df70a6SMuchun Song }
221856df70a6SMuchun Song
221956df70a6SMuchun Song ret = memcmp_pages(page, tree_page);
222056df70a6SMuchun Song
2221d8ed45c5SMichel Lespinasse parent = *new;
2222e86c59b1SClaudio Imbrenda if (ret < 0) {
2223e86c59b1SClaudio Imbrenda put_page(tree_page);
2224e86c59b1SClaudio Imbrenda new = &parent->rb_left;
2225e86c59b1SClaudio Imbrenda } else if (ret > 0) {
2226e86c59b1SClaudio Imbrenda put_page(tree_page);
2227e86c59b1SClaudio Imbrenda new = &parent->rb_right;
2228e86c59b1SClaudio Imbrenda } else if (!ksm_merge_across_nodes &&
22298dd3557aSHugh Dickins page_to_nid(tree_page) != nid) {
22308dd3557aSHugh Dickins /*
223131dbd01fSIzik Eidus * If tree_page has been migrated to another NUMA node,
223277da2ba0SClaudio Imbrenda * it will be flushed out and put in the right unstable
223377da2ba0SClaudio Imbrenda * tree next time: only merge with it when across_nodes.
22348dd3557aSHugh Dickins */
22358dd3557aSHugh Dickins put_page(tree_page);
223677da2ba0SClaudio Imbrenda return NULL;
223777da2ba0SClaudio Imbrenda } else {
223877da2ba0SClaudio Imbrenda *tree_pagep = tree_page;
223977da2ba0SClaudio Imbrenda return tree_rmap_item;
224077da2ba0SClaudio Imbrenda }
224177da2ba0SClaudio Imbrenda }
224277da2ba0SClaudio Imbrenda
224377da2ba0SClaudio Imbrenda rmap_item->address |= UNSTABLE_FLAG;
224477da2ba0SClaudio Imbrenda rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
224577da2ba0SClaudio Imbrenda DO_NUMA(rmap_item->nid = nid);
224677da2ba0SClaudio Imbrenda rb_link_node(&rmap_item->node, parent, new);
224777da2ba0SClaudio Imbrenda rb_insert_color(&rmap_item->node, root);
22488dd3557aSHugh Dickins
22498dd3557aSHugh Dickins ksm_pages_unshared++;
2250bc56620bSHugh Dickins return NULL;
2251bc56620bSHugh Dickins }
2252bc56620bSHugh Dickins
/*
 * stable_tree_append - add another rmap_item to the linked list of
 * rmap_items hanging off a given node of the stable tree, all sharing
 * the same ksm page.
 *
 * @rmap_item: the rmap_item to hang off @stable_node
 * @stable_node: the stable-tree node (or dup) representing the ksm page
 * @max_page_sharing_bypass: true to suppress the over-sharing warning
 *	(used when a KSM fork legitimately exceeds ksm_max_page_sharing)
 */
static void stable_tree_append(struct ksm_rmap_item *rmap_item,
			       struct ksm_stable_node *stable_node,
			       bool max_page_sharing_bypass)
{
	/*
	 * rmap won't find this mapping if we don't insert the
	 * rmap_item in the right stable_node
	 * duplicate. page_migration could break later if rmap breaks,
	 * so we can as well crash here. We really need to check for
	 * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check
	 * for other negative values as an underflow if detected here
	 * for the first time (and not when decreasing rmap_hlist_len)
	 * would be sign of memory corruption in the stable_node.
	 */
	BUG_ON(stable_node->rmap_hlist_len < 0);

	stable_node->rmap_hlist_len++;
	if (!max_page_sharing_bypass)
		/* possibly non fatal but unexpected overflow, only warn */
		WARN_ON_ONCE(stable_node->rmap_hlist_len >
			     ksm_max_page_sharing);

	rmap_item->head = stable_node;
	rmap_item->address |= STABLE_FLAG;
	hlist_add_head(&rmap_item->hlist, &stable_node->hlist);

	/*
	 * If there was already an rmap_item on the list, this one is an
	 * additional sharer of the page; otherwise it is the first (and
	 * only) mapping, so the page counts as "shared" but not "sharing".
	 */
	if (rmap_item->hlist.next)
		ksm_pages_sharing++;
	else
		ksm_pages_shared++;

	rmap_item->mm->ksm_merging_pages++;
}
229131dbd01fSIzik Eidus
/*
 * cmp_and_merge_page - first see if page can be merged into the stable tree;
 * if not, compare checksum to previous and if it's the same, see if page can
 * be inserted into the unstable tree, or merged with a page already there and
 * both transferred to the stable tree.
 *
 * @page: the page that we are searching identical page to.
 * @rmap_item: the reverse mapping into the virtual address of this page
 */
static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	struct ksm_rmap_item *tree_rmap_item;
	struct page *tree_page = NULL;
	struct ksm_stable_node *stable_node;
	struct page *kpage;
	unsigned int checksum;
	int err;
	bool max_page_sharing_bypass = false;

	stable_node = page_stable_node(page);
	if (stable_node) {
		/*
		 * If the stable node's recorded pfn now lives on a different
		 * NUMA node than the node it is filed under, park it on the
		 * migrate_nodes list so it gets re-filed later.
		 */
		if (stable_node->head != &migrate_nodes &&
		    get_kpfn_nid(READ_ONCE(stable_node->kpfn)) !=
		    NUMA(stable_node->nid)) {
			stable_node_dup_del(stable_node);
			stable_node->head = &migrate_nodes;
			list_add(&stable_node->list, stable_node->head);
		}
		/* Already correctly attached to this stable node: nothing to do. */
		if (stable_node->head != &migrate_nodes &&
		    rmap_item->head == stable_node)
			return;
		/*
		 * If it's a KSM fork, allow it to go over the sharing limit
		 * without warnings.
		 */
		if (!is_page_sharing_candidate(stable_node))
			max_page_sharing_bypass = true;
	}

	/* We first start with searching the page inside the stable tree */
	kpage = stable_tree_search(page);
	if (kpage == page && rmap_item->head == stable_node) {
		put_page(kpage);
		return;
	}

	remove_rmap_item_from_tree(rmap_item);

	if (kpage) {
		/* stable_tree_search() signalled a transient condition: retry later */
		if (PTR_ERR(kpage) == -EBUSY)
			return;

		err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
		if (!err) {
			/*
			 * The page was successfully merged:
			 * add its rmap_item to the stable tree.
			 */
			lock_page(kpage);
			stable_tree_append(rmap_item, page_stable_node(kpage),
					   max_page_sharing_bypass);
			unlock_page(kpage);
		}
		put_page(kpage);
		return;
	}

	/*
	 * If the hash value of the page has changed from the last time
	 * we calculated it, this page is changing frequently: therefore we
	 * don't want to insert it in the unstable tree, and we don't want
	 * to waste our time searching for something identical to it there.
	 */
	checksum = calc_checksum(page);
	if (rmap_item->oldchecksum != checksum) {
		rmap_item->oldchecksum = checksum;
		return;
	}

	/*
	 * Same checksum as an empty page. We attempt to merge it with the
	 * appropriate zero page if the user enabled this via sysfs.
	 */
	if (ksm_use_zero_pages && (checksum == zero_checksum)) {
		struct vm_area_struct *vma;

		mmap_read_lock(mm);
		vma = find_mergeable_vma(mm, rmap_item->address);
		if (vma) {
			err = try_to_merge_one_page(vma, page,
					ZERO_PAGE(rmap_item->address));
			trace_ksm_merge_one_page(
				page_to_pfn(ZERO_PAGE(rmap_item->address)),
				rmap_item, mm, err);
		} else {
			/*
			 * If the vma is out of date, we do not need to
			 * continue.
			 */
			err = 0;
		}
		mmap_read_unlock(mm);
		/*
		 * In case of failure, the page was not really empty, so we
		 * need to continue. Otherwise we're done.
		 */
		if (!err)
			return;
	}
	tree_rmap_item =
		unstable_tree_search_insert(rmap_item, page, &tree_page);
	if (tree_rmap_item) {
		bool split;

		kpage = try_to_merge_two_pages(rmap_item, page,
						tree_rmap_item, tree_page);
		/*
		 * If both pages we tried to merge belong to the same compound
		 * page, then we actually ended up increasing the reference
		 * count of the same compound page twice, and split_huge_page
		 * failed.
		 * Here we set a flag if that happened, and we use it later to
		 * try split_huge_page again. Since we call put_page right
		 * afterwards, the reference count will be correct and
		 * split_huge_page should succeed.
		 */
		split = PageTransCompound(page)
			&& compound_head(page) == compound_head(tree_page);
		put_page(tree_page);
		if (kpage) {
			/*
			 * The pages were successfully merged: insert new
			 * node in the stable tree and add both rmap_items.
			 */
			lock_page(kpage);
			stable_node = stable_tree_insert(kpage);
			if (stable_node) {
				stable_tree_append(tree_rmap_item, stable_node,
						   false);
				stable_tree_append(rmap_item, stable_node,
						   false);
			}
			unlock_page(kpage);

			/*
			 * If we fail to insert the page into the stable tree,
			 * we will have 2 virtual addresses that are pointing
			 * to a ksm page left outside the stable tree,
			 * in which case we need to break_cow on both.
			 */
			if (!stable_node) {
				break_cow(tree_rmap_item);
				break_cow(rmap_item);
			}
		} else if (split) {
			/*
			 * We are here if we tried to merge two pages and
			 * failed because they both belonged to the same
			 * compound page. We will split the page now, but no
			 * merging will take place.
			 * We do not want to add the cost of a full lock; if
			 * the page is locked, it is better to skip it and
			 * perhaps try again later.
			 */
			if (!trylock_page(page))
				return;
			split_huge_page(page);
			unlock_page(page);
		}
	}
}
2464a5f18ba0SMatthew Wilcox (Oracle)
/*
 * get_next_rmap_item - find or create the rmap_item for address @addr.
 *
 * @mm_slot: the KSM slot for the mm being scanned
 * @rmap_list: cursor into the mm's sorted rmap_item list (advanced in place)
 * @addr: page-aligned virtual address being scanned
 *
 * Walks the (address-ordered) rmap_list: reuses an existing rmap_item that
 * matches @addr, frees stale items for lower addresses that were passed
 * over, and otherwise allocates a fresh item linked in at the cursor.
 * Returns NULL only if allocation fails.
 */
static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot,
					    struct ksm_rmap_item **rmap_list,
					    unsigned long addr)
{
	struct ksm_rmap_item *rmap_item;

	while (*rmap_list) {
		rmap_item = *rmap_list;
		if ((rmap_item->address & PAGE_MASK) == addr)
			return rmap_item;
		if (rmap_item->address > addr)
			break;
		/*
		 * Item is for an address below @addr that no longer gets
		 * scanned: unlink it and release it.
		 */
		*rmap_list = rmap_item->rmap_list;
		remove_rmap_item_from_tree(rmap_item);
		free_rmap_item(rmap_item);
	}

	rmap_item = alloc_rmap_item();
	if (rmap_item) {
		/* It has already been zeroed */
		rmap_item->mm = mm_slot->slot.mm;
		rmap_item->mm->ksm_rmap_items++;
		rmap_item->address = addr;
		rmap_item->rmap_list = *rmap_list;
		*rmap_list = rmap_item;
	}
	return rmap_item;
}
24935e924ff5SStefan Roesch
24945e924ff5SStefan Roesch /*
24955e924ff5SStefan Roesch * Calculate skip age for the ksm page age. The age determines how often
24965e924ff5SStefan Roesch * de-duplicating has already been tried unsuccessfully. If the age is
249731dbd01fSIzik Eidus * smaller, the scanning of this page is skipped for less scans.
249831dbd01fSIzik Eidus *
249931dbd01fSIzik Eidus * @age: rmap_item age of page
2500d8ed45c5SMichel Lespinasse */
skip_age(rmap_age_t age)250131dbd01fSIzik Eidus static unsigned int skip_age(rmap_age_t age)
250231dbd01fSIzik Eidus {
2503f7091ed6SHaiyue Wang if (age <= 3)
250431dbd01fSIzik Eidus return 1;
250531dbd01fSIzik Eidus if (age <= 5)
250631dbd01fSIzik Eidus return 2;
250731dbd01fSIzik Eidus if (age <= 8)
250831dbd01fSIzik Eidus return 4;
250931dbd01fSIzik Eidus
25109ba69294SHugh Dickins return 8;
2511a5f18ba0SMatthew Wilcox (Oracle) }
25129ba69294SHugh Dickins
/*
 * Determines if a page should be skipped for the current scan.
 *
 * @page: page to check
 * @rmap_item: associated rmap_item of page
 *
 * Only active when the ksm_smart_scan knob is set.  Ages the rmap_item on
 * every call (saturating at U8_MAX) and, once the page is old enough,
 * skips it for skip_age(age) consecutive scans, counting each skip in
 * ksm_pages_skipped.  Returns true when the caller should skip this page.
 */
static bool should_skip_rmap_item(struct page *page,
				  struct ksm_rmap_item *rmap_item)
{
	rmap_age_t age;

	if (!ksm_smart_scan)
		return false;

	/*
	 * Never skip pages that are already KSM; pages cmp_and_merge_page()
	 * will essentially ignore them, but we still have to process them
	 * properly.
	 */
	if (PageKsm(page))
		return false;

	age = rmap_item->age;
	if (age != U8_MAX)
		rmap_item->age++;

	/*
	 * Smaller ages are not skipped, they need to get a chance to go
	 * through the different phases of the KSM merging.
	 */
	if (age < 3)
		return false;

	/*
	 * Are we still allowed to skip? If not, then don't skip it
	 * and determine how much more often we are allowed to skip next.
	 */
	if (!rmap_item->remaining_skips) {
		rmap_item->remaining_skips = skip_age(age);
		return false;
	}

	/* Skip this page */
	ksm_pages_skipped++;
	rmap_item->remaining_skips--;
	remove_rmap_item_from_tree(rmap_item);
	return true;
}
2561739100c8SStefan Roesch
/*
 * scan_get_next_rmap_item - advance the global KSM scan cursor by one page.
 *
 * @page: output; on success holds a reference (FOLL_GET) to the next
 *	anonymous page to examine, which the caller must put_page().
 *
 * Walks mm after mm on the ksm_mm_head list, and within each mm walks the
 * VM_MERGEABLE vmas page by page, recording progress in the global
 * ksm_scan cursor.  Returns the rmap_item for the next page to process
 * (with mmap_read_lock dropped), or NULL when a full pass over the list
 * has completed or the list is empty.
 */
static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
{
	struct mm_struct *mm;
	struct ksm_mm_slot *mm_slot;
	struct mm_slot *slot;
	struct vm_area_struct *vma;
	struct ksm_rmap_item *rmap_item;
	struct vma_iterator vmi;
	int nid;

	if (list_empty(&ksm_mm_head.slot.mm_node))
		return NULL;

	mm_slot = ksm_scan.mm_slot;
	if (mm_slot == &ksm_mm_head) {
		/* Starting a brand-new pass over all mms. */
		advisor_start_scan();
		trace_ksm_start_scan(ksm_scan.seqnr, ksm_rmap_items);

		/*
		 * A number of pages can hang around indefinitely in per-cpu
		 * LRU cache, raised page count preventing write_protect_page
		 * from merging them.  Though it doesn't really matter much,
		 * it is puzzling to see some stuck in pages_volatile until
		 * other activity jostles them out, and they also prevented
		 * LTP's KSM test from succeeding deterministically; so drain
		 * them here (here rather than on entry to ksm_do_scan(),
		 * so we don't IPI too often when pages_to_scan is set low).
		 */
		lru_add_drain_all();

		/*
		 * Whereas stale stable_nodes on the stable_tree itself
		 * get pruned in the regular course of stable_tree_search(),
		 * those moved out to the migrate_nodes list can accumulate:
		 * so prune them once before each full scan.
		 */
		if (!ksm_merge_across_nodes) {
			struct ksm_stable_node *stable_node, *next;
			struct page *page;

			list_for_each_entry_safe(stable_node, next,
						 &migrate_nodes, list) {
				page = get_ksm_page(stable_node,
						    GET_KSM_PAGE_NOLOCK);
				if (page)
					put_page(page);
				cond_resched();
			}
		}

		/* The unstable tree is rebuilt from scratch on every pass. */
		for (nid = 0; nid < ksm_nr_node_ids; nid++)
			root_unstable_tree[nid] = RB_ROOT;

		spin_lock(&ksm_mmlist_lock);
		slot = list_entry(mm_slot->slot.mm_node.next,
				  struct mm_slot, mm_node);
		mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
		ksm_scan.mm_slot = mm_slot;
		spin_unlock(&ksm_mmlist_lock);
		/*
		 * Although we tested list_empty() above, a racing __ksm_exit
		 * of the last mm on the list may have removed it since then.
		 */
		if (mm_slot == &ksm_mm_head)
			return NULL;
next_mm:
		ksm_scan.address = 0;
		ksm_scan.rmap_list = &mm_slot->rmap_list;
	}

	slot = &mm_slot->slot;
	mm = slot->mm;
	vma_iter_init(&vmi, mm, ksm_scan.address);

	mmap_read_lock(mm);
	if (ksm_test_exit(mm))
		goto no_vmas;

	for_each_vma(vmi, vma) {
		if (!(vma->vm_flags & VM_MERGEABLE))
			continue;
		if (ksm_scan.address < vma->vm_start)
			ksm_scan.address = vma->vm_start;
		if (!vma->anon_vma)
			/* No anonymous pages mapped here yet: skip whole vma. */
			ksm_scan.address = vma->vm_end;

		while (ksm_scan.address < vma->vm_end) {
			if (ksm_test_exit(mm))
				break;
			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
			if (IS_ERR_OR_NULL(*page)) {
				ksm_scan.address += PAGE_SIZE;
				cond_resched();
				continue;
			}
			if (is_zone_device_page(*page))
				goto next_page;
			if (PageAnon(*page)) {
				flush_anon_page(vma, *page, ksm_scan.address);
				flush_dcache_page(*page);
				rmap_item = get_next_rmap_item(mm_slot,
					ksm_scan.rmap_list, ksm_scan.address);
				if (rmap_item) {
					ksm_scan.rmap_list =
							&rmap_item->rmap_list;

					if (should_skip_rmap_item(*page, rmap_item))
						goto next_page;

					ksm_scan.address += PAGE_SIZE;
				} else
					put_page(*page);
				/* Hand the page ref (if any) to the caller. */
				mmap_read_unlock(mm);
				return rmap_item;
			}
next_page:
			put_page(*page);
			ksm_scan.address += PAGE_SIZE;
			cond_resched();
		}
	}

	if (ksm_test_exit(mm)) {
no_vmas:
		ksm_scan.address = 0;
		ksm_scan.rmap_list = &mm_slot->rmap_list;
	}
	/*
	 * Nuke all the rmap_items that are above this current rmap:
	 * because there were no VM_MERGEABLE vmas with such addresses.
	 */
	remove_trailing_rmap_items(ksm_scan.rmap_list);

	spin_lock(&ksm_mmlist_lock);
	slot = list_entry(mm_slot->slot.mm_node.next,
			  struct mm_slot, mm_node);
	ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
	if (ksm_scan.address == 0) {
		/*
		 * We've completed a full scan of all vmas, holding mmap_lock
		 * throughout, and found no VM_MERGEABLE: so do the same as
		 * __ksm_exit does to remove this mm from all our lists now.
		 * This applies either when cleaning up after __ksm_exit
		 * (but beware: we can reach here even before __ksm_exit),
		 * or when all VM_MERGEABLE areas have been unmapped (and
		 * mmap_lock then protects against race with MADV_MERGEABLE).
		 */
		hash_del(&mm_slot->slot.hash);
		list_del(&mm_slot->slot.mm_node);
		spin_unlock(&ksm_mmlist_lock);

		mm_slot_free(mm_slot_cache, mm_slot);
		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
		clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
		mmap_read_unlock(mm);
		mmdrop(mm);
	} else {
		mmap_read_unlock(mm);
		/*
		 * mmap_read_unlock(mm) first because after
		 * spin_unlock(&ksm_mmlist_lock) run, the "mm" may
		 * already have been freed under us by __ksm_exit()
		 * because the "mm_slot" is still hashed and
		 * ksm_scan.mm_slot doesn't point to it anymore.
		 */
		spin_unlock(&ksm_mmlist_lock);
	}

	/* Repeat until we've completed scanning the whole list */
	mm_slot = ksm_scan.mm_slot;
	if (mm_slot != &ksm_mm_head)
		goto next_mm;

	advisor_stop_scan();

	trace_ksm_stop_scan(ksm_scan.seqnr, ksm_rmap_items);
	ksm_scan.seqnr++;
	return NULL;
}
27412c281f54SDavid Hildenbrand
27422c281f54SDavid Hildenbrand /**
27432c281f54SDavid Hildenbrand * ksm_do_scan - the ksm scanner main worker function.
27442c281f54SDavid Hildenbrand * @scan_npages: number of pages we want to scan before we return.
27452c281f54SDavid Hildenbrand */
ksm_do_scan(unsigned int scan_npages)27462c281f54SDavid Hildenbrand static void ksm_do_scan(unsigned int scan_npages)
27472c281f54SDavid Hildenbrand {
27482c281f54SDavid Hildenbrand struct ksm_rmap_item *rmap_item;
27492c281f54SDavid Hildenbrand struct page *page;
27502c281f54SDavid Hildenbrand unsigned int npages = scan_npages;
27512c281f54SDavid Hildenbrand
2752f8af4da3SHugh Dickins while (npages-- && likely(!freezing(current))) {
2753f8af4da3SHugh Dickins cond_resched();
2754f8af4da3SHugh Dickins rmap_item = scan_get_next_rmap_item(&page);
2755f8af4da3SHugh Dickins if (!rmap_item)
2756d952b791SHugh Dickins return;
2757f8af4da3SHugh Dickins cmp_and_merge_page(page, rmap_item);
2758f8af4da3SHugh Dickins put_page(page);
2759f8af4da3SHugh Dickins }
2760d7597f59SStefan Roesch
2761e1fb4a08SDave Jiang ksm_pages_scanned += scan_npages - npages;
2762d7597f59SStefan Roesch }
276312564485SShawn Anastasio
ksmd_should_run(void)2764cc2383ecSKonstantin Khlebnikov static int ksmd_should_run(void)
2765d952b791SHugh Dickins {
2766d952b791SHugh Dickins return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.slot.mm_node);
2767d952b791SHugh Dickins }
2768d952b791SHugh Dickins
/*
 * ksm_scan_thread - main loop of the ksmd kernel thread.
 *
 * Repeatedly scans a batch of pages under ksm_thread_mutex (also waiting
 * out memory hotplug via wait_while_offlining()), then sleeps: for the
 * configured interval while there is work, or indefinitely until woken
 * when merging is disabled or no mm is registered.  The timed wait also
 * wakes early if ksm_thread_sleep_millisecs is changed via sysfs.
 */
static int ksm_scan_thread(void *nothing)
{
	unsigned int sleep_ms;

	set_freezable();
	set_user_nice(current, 5);

	while (!kthread_should_stop()) {
		mutex_lock(&ksm_thread_mutex);
		wait_while_offlining();
		if (ksmd_should_run())
			ksm_do_scan(ksm_thread_pages_to_scan);
		mutex_unlock(&ksm_thread_mutex);

		if (ksmd_should_run()) {
			sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs);
			wait_event_freezable_timeout(ksm_iter_wait,
				sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
				msecs_to_jiffies(sleep_ms));
		} else {
			wait_event_freezable(ksm_thread_wait,
				ksmd_should_run() || kthread_should_stop());
		}
	}
	return 0;
}
279558730ab6SQi Zheng
__ksm_add_vma(struct vm_area_struct * vma)27966e158384SHugh Dickins static void __ksm_add_vma(struct vm_area_struct *vma)
27976e158384SHugh Dickins {
279858730ab6SQi Zheng unsigned long vm_flags = vma->vm_flags;
279931dbd01fSIzik Eidus
280031dbd01fSIzik Eidus if (vm_flags & VM_MERGEABLE)
280131dbd01fSIzik Eidus return;
280258730ab6SQi Zheng
280358730ab6SQi Zheng if (vma_ksm_compatible(vma))
28046e158384SHugh Dickins vm_flags_set(vma, VM_MERGEABLE);
280558730ab6SQi Zheng }
28066e158384SHugh Dickins
__ksm_del_vma(struct vm_area_struct * vma)280731dbd01fSIzik Eidus static int __ksm_del_vma(struct vm_area_struct *vma)
280858730ab6SQi Zheng {
280931dbd01fSIzik Eidus int err;
2810cbf86cfeSHugh Dickins
2811cbf86cfeSHugh Dickins if (!(vma->vm_flags & VM_MERGEABLE))
281231dbd01fSIzik Eidus return 0;
281331dbd01fSIzik Eidus
2814cbf86cfeSHugh Dickins if (vma->anon_vma) {
2815cbf86cfeSHugh Dickins err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end, true);
2816cbf86cfeSHugh Dickins if (err)
2817cbf86cfeSHugh Dickins return err;
281831dbd01fSIzik Eidus }
2819cbf86cfeSHugh Dickins
282058730ab6SQi Zheng vm_flags_clear(vma, VM_MERGEABLE);
2821cbf86cfeSHugh Dickins return 0;
282258730ab6SQi Zheng }
/**
 * ksm_add_vma - Mark vma as mergeable if compatible
 *
 * @vma:  Pointer to vma
 *
 * Only acts when the owning mm has MMF_VM_MERGE_ANY set (i.e. process-wide
 * merging was enabled); otherwise the vma is left untouched.
 */
void ksm_add_vma(struct vm_area_struct *vma)
{
	struct mm_struct *mm = vma->vm_mm;

	if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
		__ksm_add_vma(vma);
}
28351c2fb7a4SAndrea Arcangeli
/* Mark every compatible vma of @mm as mergeable. */
static void ksm_add_vmas(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	VMA_ITERATOR(vmi, mm, 0);
	for_each_vma(vmi, vma)
		__ksm_add_vma(vma);
}
28449ba69294SHugh Dickins
ksm_del_vmas(struct mm_struct * mm)2845c1e8d7c6SMichel Lespinasse static int ksm_del_vmas(struct mm_struct *mm)
28469ba69294SHugh Dickins {
28479ba69294SHugh Dickins struct vm_area_struct *vma;
284831dbd01fSIzik Eidus int err;
28499ba69294SHugh Dickins
2850cd551f97SHugh Dickins VMA_ITERATOR(vmi, mm, 0);
285158730ab6SQi Zheng for_each_vma(vmi, vma) {
285258730ab6SQi Zheng err = __ksm_del_vma(vma);
28539ba69294SHugh Dickins if (err)
28546514d511SHugh Dickins return err;
285558730ab6SQi Zheng }
285658730ab6SQi Zheng return 0;
28579ba69294SHugh Dickins }
28589ba69294SHugh Dickins
/**
 * ksm_enable_merge_any - Add mm to mm ksm list and enable merging on all
 *                        compatible VMA's
 *
 * @mm:  Pointer to mm
 *
 * Returns 0 on success, otherwise error code
 */
int ksm_enable_merge_any(struct mm_struct *mm)
{
	int err;

	/* Already in merge-any mode: nothing to do. */
	if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
		return 0;

	/* Register the mm with KSM if this is its first mergeable use. */
	if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
		err = __ksm_enter(mm);
		if (err)
			return err;
	}

	set_bit(MMF_VM_MERGE_ANY, &mm->flags);
	ksm_add_vmas(mm);

	return 0;
}
2885cbf86cfeSHugh Dickins
/**
 * ksm_disable_merge_any - Disable merging on all compatible VMA's of the mm,
 *			   previously enabled via ksm_enable_merge_any().
 *
 * Disabling merging implies unmerging any merged pages, like setting
 * MADV_UNMERGEABLE would. If unmerging fails, the whole operation fails and
 * merging on all compatible VMA's remains enabled.
 *
 * @mm: Pointer to mm
 *
 * Returns 0 on success, otherwise error code
 */
int ksm_disable_merge_any(struct mm_struct *mm)
{
	int err;

	if (!test_bit(MMF_VM_MERGE_ANY, &mm->flags))
		return 0;

	err = ksm_del_vmas(mm);
	if (err) {
		/* Partial failure: re-mark the vmas so state stays consistent. */
		ksm_add_vmas(mm);
		return err;
	}

	clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
	return 0;
}
291448c935adSKirill A. Shutemov
ksm_disable(struct mm_struct * mm)29154d45c3afSYang Yang int ksm_disable(struct mm_struct *mm)
29164d45c3afSYang Yang {
29174d45c3afSYang Yang mmap_assert_write_locked(mm);
29185ad64688SHugh Dickins
29195ad64688SHugh Dickins if (!test_bit(MMF_VM_MERGEABLE, &mm->flags))
29205ad64688SHugh Dickins return 0;
29215ad64688SHugh Dickins if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
29225ad64688SHugh Dickins return ksm_disable_merge_any(mm);
29236d4675e6SMinchan Kim return ksm_del_vmas(mm);
2924e9995ef9SHugh Dickins }
292521fbd591SQi Zheng
ksm_madvise(struct vm_area_struct * vma,unsigned long start,unsigned long end,int advice,unsigned long * vm_flags)292621fbd591SQi Zheng int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
2927e9995ef9SHugh Dickins unsigned long end, int advice, unsigned long *vm_flags)
2928e9995ef9SHugh Dickins {
29292f031c6fSMatthew Wilcox (Oracle) struct mm_struct *mm = vma->vm_mm;
29309f32624bSJoonsoo Kim int err;
29319f32624bSJoonsoo Kim
29329f32624bSJoonsoo Kim switch (advice) {
29339f32624bSJoonsoo Kim case MADV_MERGEABLE:
29349f32624bSJoonsoo Kim if (vma->vm_flags & VM_MERGEABLE)
29352f031c6fSMatthew Wilcox (Oracle) return 0;
2936e9995ef9SHugh Dickins if (!vma_ksm_compatible(vma))
29372f031c6fSMatthew Wilcox (Oracle) return 0;
2938e9995ef9SHugh Dickins
29391df631aeSMinchan Kim if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
2940e9995ef9SHugh Dickins err = __ksm_enter(mm);
2941b67bfe0dSSasha Levin if (err)
2942e9995ef9SHugh Dickins return err;
29435beb4930SRik van Riel }
2944e9995ef9SHugh Dickins
2945e9995ef9SHugh Dickins *vm_flags |= VM_MERGEABLE;
2946ad12695fSAndrea Arcangeli break;
29476d4675e6SMinchan Kim
29486d4675e6SMinchan Kim case MADV_UNMERGEABLE:
29496d4675e6SMinchan Kim if (!(*vm_flags & VM_MERGEABLE))
29506d4675e6SMinchan Kim return 0; /* just ignore the advice */
29516d4675e6SMinchan Kim
2952b6b19f25SHugh Dickins if (vma->anon_vma) {
29536d4675e6SMinchan Kim err = unmerge_ksm_pages(vma, start, end, true);
2954bf181b9fSMichel Lespinasse if (err)
2955bf181b9fSMichel Lespinasse return err;
29561105a2fcSJia He }
29571105a2fcSJia He
2958ad12695fSAndrea Arcangeli *vm_flags &= ~VM_MERGEABLE;
29595beb4930SRik van Riel break;
29601105a2fcSJia He }
29611105a2fcSJia He
2962cd7fae26SMiaohe Lin return 0;
29631105a2fcSJia He }
29641105a2fcSJia He EXPORT_SYMBOL_GPL(ksm_madvise);
2965e9995ef9SHugh Dickins
/*
 * Register @mm with ksmd: allocate and hash an mm_slot, link it into
 * ksmd's scan list, mark the mm MMF_VM_MERGEABLE and pin it with mmgrab().
 *
 * Returns 0 on success, -ENOMEM if the mm_slot cannot be allocated.
 */
int __ksm_enter(struct mm_struct *mm)
{
	struct ksm_mm_slot *mm_slot;
	struct mm_slot *slot;
	int needs_wakeup;

	mm_slot = mm_slot_alloc(mm_slot_cache);
	if (!mm_slot)
		return -ENOMEM;

	slot = &mm_slot->slot;

	/* Check ksm_run too? Would need tighter locking */
	needs_wakeup = list_empty(&ksm_mm_head.slot.mm_node);

	spin_lock(&ksm_mmlist_lock);
	mm_slot_insert(mm_slots_hash, mm, slot);
	/*
	 * When KSM_RUN_MERGE (or KSM_RUN_STOP),
	 * insert just behind the scanning cursor, to let the area settle
	 * down a little; when fork is followed by immediate exec, we don't
	 * want ksmd to waste time setting up and tearing down an rmap_list.
	 *
	 * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
	 * scanning cursor, otherwise KSM pages in newly forked mms will be
	 * missed: then we might as well insert at the end of the list.
	 */
	if (ksm_run & KSM_RUN_UNMERGE)
		list_add_tail(&slot->mm_node, &ksm_mm_head.slot.mm_node);
	else
		list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node);
	spin_unlock(&ksm_mmlist_lock);

	set_bit(MMF_VM_MERGEABLE, &mm->flags);
	mmgrab(mm);	/* keep mm alive while on ksmd's list; see __ksm_exit() */

	/* The list was empty, so ksmd may be sleeping: kick it. */
	if (needs_wakeup)
		wake_up_interruptible(&ksm_thread_wait);

	trace_ksm_enter(mm);
	return 0;
}
30084248d008SLonglong Xia
/*
 * Tear down KSM state for an exiting @mm.  Either frees the mm_slot here
 * (the "easy" case) or leaves/moves it on the list for ksmd to clean up,
 * using a write-lock/unlock of mmap_lock to serialize with break_cow work
 * still in flight.
 */
void __ksm_exit(struct mm_struct *mm)
{
	struct ksm_mm_slot *mm_slot;
	struct mm_slot *slot;
	int easy_to_free = 0;

	/*
	 * This process is exiting: if it's straightforward (as is the
	 * case when ksmd was never running), free mm_slot immediately.
	 * But if it's at the cursor or has rmap_items linked to it, use
	 * mmap_lock to synchronize with any break_cows before pagetables
	 * are freed, and leave the mm_slot on the list for ksmd to free.
	 * Beware: ksm may already have noticed it exiting and freed the slot.
	 */

	spin_lock(&ksm_mmlist_lock);
	slot = mm_slot_lookup(mm_slots_hash, mm);
	mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
	if (mm_slot && ksm_scan.mm_slot != mm_slot) {
		if (!mm_slot->rmap_list) {
			/* No rmap_items: safe to unhash and unlink right now. */
			hash_del(&slot->hash);
			list_del(&slot->mm_node);
			easy_to_free = 1;
		} else {
			/* Move just past ksmd's cursor so it frees the slot soon. */
			list_move(&slot->mm_node,
				  &ksm_scan.mm_slot->slot.mm_node);
		}
	}
	spin_unlock(&ksm_mmlist_lock);

	if (easy_to_free) {
		mm_slot_free(mm_slot_cache, mm_slot);
		clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
		mmdrop(mm);	/* drop the reference taken in __ksm_enter() */
	} else if (mm_slot) {
		/* Wait for any break_cow holding mmap_lock to finish. */
		mmap_write_lock(mm);
		mmap_write_unlock(mm);
	}

	trace_ksm_exit(mm);
}
3051c8d6553bSHugh Dickins
/*
 * Decide whether @folio must be copied before being mapped at @addr in @vma.
 *
 * Returns @folio itself when it can be reused as is; a freshly allocated,
 * locked, uptodate copy when one is needed (NULL if allocation or memcg
 * charge fails); or ERR_PTR(-EHWPOISON) if the source page is
 * hardware-poisoned (either beforehand, or detected during the copy).
 */
struct folio *ksm_might_need_to_copy(struct folio *folio,
			struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page = folio_page(folio, 0);
	struct anon_vma *anon_vma = folio_anon_vma(folio);
	struct folio *new_folio;

	/* Large folios are not KSM candidates: reuse as is. */
	if (folio_test_large(folio))
		return folio;

	if (folio_test_ksm(folio)) {
		if (folio_stable_node(folio) &&
		    !(ksm_run & KSM_RUN_UNMERGE))
			return folio;	/* no need to copy it */
	} else if (!anon_vma) {
		return folio;		/* no need to copy it */
	} else if (folio->index == linear_page_index(vma, addr) &&
			anon_vma->root == vma->anon_vma->root) {
		return folio;		/* still no need to copy it */
	}
	if (PageHWPoison(page))
		return ERR_PTR(-EHWPOISON);
	if (!folio_test_uptodate(folio))
		return folio;		/* let do_swap_page report the error */

	new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false);
	if (new_folio &&
	    mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL)) {
		/* Charge failed: give the folio back and report allocation failure. */
		folio_put(new_folio);
		new_folio = NULL;
	}
	if (new_folio) {
		/* Machine-check-safe copy: poison hit mid-copy is reported, not oopsed. */
		if (copy_mc_user_highpage(folio_page(new_folio, 0), page,
								addr, vma)) {
			folio_put(new_folio);
			memory_failure_queue(folio_pfn(folio), 0);
			return ERR_PTR(-EHWPOISON);
		}
		folio_set_dirty(new_folio);
		__folio_mark_uptodate(new_folio);
		__folio_set_locked(new_folio);
#ifdef CONFIG_SWAP
		count_vm_event(KSM_SWPIN_COPY);
#endif
	}

	return new_folio;
}
31002c653d0eSAndrea Arcangeli
/*
 * rmap_walk_ksm - call rwc->rmap_one() on every mapping of a KSM folio.
 *
 * Walks the rmap_items hanging off the folio's stable node and, for each,
 * the VMAs of the associated anon_vma that cover the mapped address.  The
 * walk runs twice: first only the VMA of the mm each rmap_item was created
 * in, then (search_new_forks) the covering VMAs of other mms, which may
 * have forked from the original since ksmd last scanned.
 *
 * The folio must be locked; that protects its stable-tree node.
 */
void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc)
{
	struct ksm_stable_node *stable_node;
	struct ksm_rmap_item *rmap_item;
	int search_new_forks = 0;

	VM_BUG_ON_FOLIO(!folio_test_ksm(folio), folio);

	/*
	 * Rely on the page lock to protect against concurrent modifications
	 * to that page's node of the stable tree.
	 */
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

	stable_node = folio_stable_node(folio);
	if (!stable_node)
		return;
again:
	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
		struct anon_vma *anon_vma = rmap_item->anon_vma;
		struct anon_vma_chain *vmac;
		struct vm_area_struct *vma;

		cond_resched();
		if (!anon_vma_trylock_read(anon_vma)) {
			/* Caller asked not to block on a contended anon_vma lock. */
			if (rwc->try_lock) {
				rwc->contended = true;
				return;
			}
			anon_vma_lock_read(anon_vma);
		}
		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
					       0, ULONG_MAX) {
			unsigned long addr;

			cond_resched();
			vma = vmac->vma;

			/* Ignore the stable/unstable/sqnr flags */
			addr = rmap_item->address & PAGE_MASK;

			if (addr < vma->vm_start || addr >= vma->vm_end)
				continue;
			/*
			 * Initially we examine only the vma which covers this
			 * rmap_item; but later, if there is still work to do,
			 * we examine covering vmas in other mms: in case they
			 * were forked from the original since ksmd passed.
			 */
			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
				continue;

			if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
				continue;

			if (!rwc->rmap_one(folio, vma, addr, rwc->arg)) {
				anon_vma_unlock_read(anon_vma);
				return;
			}
			if (rwc->done && rwc->done(folio)) {
				anon_vma_unlock_read(anon_vma);
				return;
			}
		}
		anon_vma_unlock_read(anon_vma);
	}
	if (!search_new_forks++)
		goto again;
}
3170ee0ea59cSHugh Dickins
317162b61f61SHugh Dickins #ifdef CONFIG_MEMORY_FAILURE
/*
 * Collect processes when the error hit an ksm page.
 *
 * Every task mapping @page (via any rmap_item of the folio's stable node)
 * is added to @to_kill for the memory-failure code; @force_early is passed
 * through to task_early_kill().
 */
void collect_procs_ksm(struct page *page, struct list_head *to_kill,
		       int force_early)
{
	struct ksm_stable_node *stable_node;
	struct ksm_rmap_item *rmap_item;
	struct folio *folio = page_folio(page);
	struct vm_area_struct *vma;
	struct task_struct *tsk;

	stable_node = folio_stable_node(folio);
	if (!stable_node)
		return;
	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
		struct anon_vma *av = rmap_item->anon_vma;

		anon_vma_lock_read(av);
		rcu_read_lock();
		for_each_process(tsk) {
			struct anon_vma_chain *vmac;
			unsigned long addr;
			struct task_struct *t =
				task_early_kill(tsk, force_early);
			if (!t)
				continue;
			/* Find this task's VMAs covering the poisoned address. */
			anon_vma_interval_tree_foreach(vmac, &av->rb_root, 0,
						       ULONG_MAX)
			{
				vma = vmac->vma;
				if (vma->vm_mm == t->mm) {
					addr = rmap_item->address & PAGE_MASK;
					add_to_kill_ksm(t, page, vma, to_kill,
							addr);
				}
			}
		}
		rcu_read_unlock();
		anon_vma_unlock_read(av);
	}
}
321431dbd01fSIzik Eidus #endif
321531dbd01fSIzik Eidus
321631dbd01fSIzik Eidus #ifdef CONFIG_MIGRATION
/*
 * Transfer KSM stable-tree state from @folio to @newfolio during migration:
 * redirect the stable node's kpfn to the new folio and clear the old
 * folio's stable-node pointer.  Both folios must be locked and share the
 * same mapping.
 */
void folio_migrate_ksm(struct folio *newfolio, struct folio *folio)
{
	struct ksm_stable_node *stable_node;

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_locked(newfolio), newfolio);
	VM_BUG_ON_FOLIO(newfolio->mapping != folio->mapping, newfolio);

	stable_node = folio_stable_node(folio);
	if (stable_node) {
		VM_BUG_ON_FOLIO(stable_node->kpfn != folio_pfn(folio), folio);
		stable_node->kpfn = folio_pfn(newfolio);
		/*
		 * newfolio->mapping was set in advance; now we need smp_wmb()
		 * to make sure that the new stable_node->kpfn is visible
		 * to get_ksm_page() before it can see that folio->mapping
		 * has gone stale (or that folio_test_swapcache has been cleared).
		 */
		smp_wmb();
		set_page_stable_node(&folio->page, NULL);
	}
}
323931dbd01fSIzik Eidus #endif /* CONFIG_MIGRATION */
324031dbd01fSIzik Eidus
324131dbd01fSIzik Eidus #ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * Block until memory hot-remove clears KSM_RUN_OFFLINE.  Drops
 * ksm_thread_mutex while waiting and returns with it held again.
 */
static void wait_while_offlining(void)
{
	while (ksm_run & KSM_RUN_OFFLINE) {
		mutex_unlock(&ksm_thread_mutex);
		wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
			    TASK_UNINTERRUPTIBLE);
		mutex_lock(&ksm_thread_mutex);
	}
}
325131dbd01fSIzik Eidus
stable_node_dup_remove_range(struct ksm_stable_node * stable_node,unsigned long start_pfn,unsigned long end_pfn)325231dbd01fSIzik Eidus static bool stable_node_dup_remove_range(struct ksm_stable_node *stable_node,
325331dbd01fSIzik Eidus unsigned long start_pfn,
325431dbd01fSIzik Eidus unsigned long end_pfn)
325531dbd01fSIzik Eidus {
325631dbd01fSIzik Eidus if (stable_node->kpfn >= start_pfn &&
325731dbd01fSIzik Eidus stable_node->kpfn < end_pfn) {
325831dbd01fSIzik Eidus /*
325931dbd01fSIzik Eidus * Don't get_ksm_page, page has already gone:
3260ae7a927dSJoe Perches * which is why we keep kpfn instead of page*
326131dbd01fSIzik Eidus */
326231dbd01fSIzik Eidus remove_node_from_stable_tree(stable_node);
326331dbd01fSIzik Eidus return true;
326431dbd01fSIzik Eidus }
326531dbd01fSIzik Eidus return false;
3266dfefd226SAlexey Dobriyan }
326731dbd01fSIzik Eidus
stable_node_chain_remove_range(struct ksm_stable_node * stable_node,unsigned long start_pfn,unsigned long end_pfn,struct rb_root * root)326831dbd01fSIzik Eidus static bool stable_node_chain_remove_range(struct ksm_stable_node *stable_node,
3269dfefd226SAlexey Dobriyan unsigned long start_pfn,
3270dfefd226SAlexey Dobriyan unsigned long end_pfn,
327131dbd01fSIzik Eidus struct rb_root *root)
327231dbd01fSIzik Eidus {
327331dbd01fSIzik Eidus struct ksm_stable_node *dup;
327431dbd01fSIzik Eidus struct hlist_node *hlist_safe;
327531dbd01fSIzik Eidus
327631dbd01fSIzik Eidus if (!is_stable_node_chain(stable_node)) {
327731dbd01fSIzik Eidus VM_BUG_ON(is_stable_node_dup(stable_node));
3278d0f209f6SHugh Dickins return stable_node_dup_remove_range(stable_node, start_pfn,
3279d0f209f6SHugh Dickins end_pfn);
328031dbd01fSIzik Eidus }
328131dbd01fSIzik Eidus
328231dbd01fSIzik Eidus hlist_for_each_entry_safe(dup, hlist_safe,
3283ef4d43a8SHugh Dickins &stable_node->hlist, hlist_dup) {
328431dbd01fSIzik Eidus VM_BUG_ON(!is_stable_node_dup(dup));
328531dbd01fSIzik Eidus stable_node_dup_remove_range(dup, start_pfn, end_pfn);
3286d952b791SHugh Dickins }
3287e1e12d2fSDavid Rientjes if (hlist_empty(&stable_node->hlist)) {
3288d952b791SHugh Dickins free_stable_node_chain(stable_node, root);
3289e1e12d2fSDavid Rientjes return true; /* notify caller that tree was rebalanced */
3290d952b791SHugh Dickins } else
3291d952b791SHugh Dickins return false;
3292d952b791SHugh Dickins }
3293d952b791SHugh Dickins
/*
 * Prune from every per-node stable tree all nodes whose kpfn lies in
 * [@start_pfn, @end_pfn), i.e. pages in a range being offlined.  Removing
 * a chain may rebalance its tree, in which case iteration restarts from
 * that tree's first node.  The migrate_nodes list is pruned as well.
 */
static void ksm_check_stable_tree(unsigned long start_pfn,
				  unsigned long end_pfn)
{
	struct ksm_stable_node *stable_node, *next;
	struct rb_node *node;
	int nid;

	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
		node = rb_first(root_stable_tree + nid);
		while (node) {
			stable_node = rb_entry(node, struct ksm_stable_node, node);
			if (stable_node_chain_remove_range(stable_node,
							   start_pfn, end_pfn,
							   root_stable_tree +
							   nid))
				node = rb_first(root_stable_tree + nid);
			else
				node = rb_next(node);
			cond_resched();
		}
	}
	list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
		if (stable_node->kpfn >= start_pfn &&
		    stable_node->kpfn < end_pfn)
			remove_node_from_stable_tree(stable_node);
		cond_resched();
	}
}
332290bd6fd3SPetr Holasek
/*
 * Memory-hotplug notifier: keep ksmd and the stable tree safe across
 * memory offlining.  KSM_RUN_OFFLINE is set (under ksm_thread_mutex)
 * while a range goes offline, and stale stable nodes are pruned once
 * offlining completes.
 */
static int ksm_memory_callback(struct notifier_block *self,
			       unsigned long action, void *arg)
{
	struct memory_notify *mn = arg;

	switch (action) {
	case MEM_GOING_OFFLINE:
		/*
		 * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
		 * and remove_all_stable_nodes() while memory is going offline:
		 * it is unsafe for them to touch the stable tree at this time.
		 * But unmerge_ksm_pages(), rmap lookups and other entry points
		 * which do not need the ksm_thread_mutex are all safe.
		 */
		mutex_lock(&ksm_thread_mutex);
		ksm_run |= KSM_RUN_OFFLINE;
		mutex_unlock(&ksm_thread_mutex);
		break;

	case MEM_OFFLINE:
		/*
		 * Most of the work is done by page migration; but there might
		 * be a few stable_nodes left over, still pointing to struct
		 * pages which have been offlined: prune those from the tree,
		 * otherwise get_ksm_page() might later try to access a
		 * non-existent struct page.
		 */
		ksm_check_stable_tree(mn->start_pfn,
				      mn->start_pfn + mn->nr_pages);
		fallthrough;
	case MEM_CANCEL_OFFLINE:
		/* Offlining finished or aborted: let waiters proceed again. */
		mutex_lock(&ksm_thread_mutex);
		ksm_run &= ~KSM_RUN_OFFLINE;
		mutex_unlock(&ksm_thread_mutex);

		smp_mb();	/* wake_up_bit advises this */
		wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
		break;
	}
	return NOTIFY_OK;
}
3364e86c59b1SClaudio Imbrenda #else
/* Without CONFIG_MEMORY_HOTREMOVE there is no offlining to wait for. */
static void wait_while_offlining(void)
{
}
3368e86c59b1SClaudio Imbrenda #endif /* CONFIG_MEMORY_HOTREMOVE */
3369e86c59b1SClaudio Imbrenda
3370e86c59b1SClaudio Imbrenda #ifdef CONFIG_PROC_FS
ksm_process_profit(struct mm_struct * mm)3371e86c59b1SClaudio Imbrenda long ksm_process_profit(struct mm_struct *mm)
3372e86c59b1SClaudio Imbrenda {
3373e86c59b1SClaudio Imbrenda return (long)(mm->ksm_merging_pages + mm->ksm_zero_pages) * PAGE_SIZE -
3374e86c59b1SClaudio Imbrenda mm->ksm_rmap_items * sizeof(struct ksm_rmap_item);
3375e86c59b1SClaudio Imbrenda }
3376e86c59b1SClaudio Imbrenda #endif /* CONFIG_PROC_FS */
3377e86c59b1SClaudio Imbrenda
3378e86c59b1SClaudio Imbrenda #ifdef CONFIG_SYSFS
3379e86c59b1SClaudio Imbrenda /*
3380e86c59b1SClaudio Imbrenda * This all compiles without CONFIG_SYSFS, but is a waste of space.
3381e86c59b1SClaudio Imbrenda */
3382e86c59b1SClaudio Imbrenda
/* Shorthand for declaring read-only / read-write KSM sysfs attributes. */
#define KSM_ATTR_RO(_name) \
	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
#define KSM_ATTR(_name) \
	static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
33872c653d0eSAndrea Arcangeli
/* Report the current ksmd sleep interval, in milliseconds. */
static ssize_t sleep_millisecs_show(struct kobject *kobj,
				    struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%u\n", ksm_thread_sleep_millisecs);
}
33932c653d0eSAndrea Arcangeli
sleep_millisecs_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)33942c653d0eSAndrea Arcangeli static ssize_t sleep_millisecs_store(struct kobject *kobj,
33952c653d0eSAndrea Arcangeli struct kobj_attribute *attr,
33962c653d0eSAndrea Arcangeli const char *buf, size_t count)
33972c653d0eSAndrea Arcangeli {
33982c653d0eSAndrea Arcangeli unsigned int msecs;
33992c653d0eSAndrea Arcangeli int err;
34002c653d0eSAndrea Arcangeli
34012c653d0eSAndrea Arcangeli err = kstrtouint(buf, 10, &msecs);
34022c653d0eSAndrea Arcangeli if (err)
34032c653d0eSAndrea Arcangeli return -EINVAL;
34042c653d0eSAndrea Arcangeli
34052c653d0eSAndrea Arcangeli ksm_thread_sleep_millisecs = msecs;
34062c653d0eSAndrea Arcangeli wake_up_interruptible(&ksm_iter_wait);
34072c653d0eSAndrea Arcangeli
34082c653d0eSAndrea Arcangeli return count;
34092c653d0eSAndrea Arcangeli }
34102c653d0eSAndrea Arcangeli KSM_ATTR(sleep_millisecs);
34112c653d0eSAndrea Arcangeli
/* Report the current value of the pages_to_scan tunable. */
static ssize_t pages_to_scan_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%u\n", ksm_thread_pages_to_scan);
}
34172c653d0eSAndrea Arcangeli
pages_to_scan_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)34182c653d0eSAndrea Arcangeli static ssize_t pages_to_scan_store(struct kobject *kobj,
34192c653d0eSAndrea Arcangeli struct kobj_attribute *attr,
34202c653d0eSAndrea Arcangeli const char *buf, size_t count)
34212c653d0eSAndrea Arcangeli {
34222c653d0eSAndrea Arcangeli unsigned int nr_pages;
34232c653d0eSAndrea Arcangeli int err;
34242c653d0eSAndrea Arcangeli
34252c653d0eSAndrea Arcangeli if (ksm_advisor != KSM_ADVISOR_NONE)
3426b348b5feSStefan Roesch return -EINVAL;
3427b348b5feSStefan Roesch
3428b348b5feSStefan Roesch err = kstrtouint(buf, 10, &nr_pages);
3429b348b5feSStefan Roesch if (err)
3430b348b5feSStefan Roesch return -EINVAL;
3431b348b5feSStefan Roesch
3432b348b5feSStefan Roesch ksm_thread_pages_to_scan = nr_pages;
3433b4028260SHugh Dickins
3434b4028260SHugh Dickins return count;
3435b4028260SHugh Dickins }
3436ae7a927dSJoe Perches KSM_ATTR(pages_to_scan);
3437b4028260SHugh Dickins
/* Report the current ksm_run flags (see run_store() for accepted values). */
static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
			char *buf)
{
	return sysfs_emit(buf, "%lu\n", ksm_run);
}
3443ae7a927dSJoe Perches
/*
 * Set the KSM run mode from sysfs: KSM_RUN_STOP, KSM_RUN_MERGE or
 * KSM_RUN_UNMERGE (values above KSM_RUN_UNMERGE are rejected).  Unmerging
 * can fail (e.g. under OOM), in which case the mode reverts to
 * KSM_RUN_STOP and the error is returned instead of @count.
 */
static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
			 const char *buf, size_t count)
{
	unsigned int flags;
	int err;

	err = kstrtouint(buf, 10, &flags);
	if (err)
		return -EINVAL;
	if (flags > KSM_RUN_UNMERGE)
		return -EINVAL;

	/*
	 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
	 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
	 * breaking COW to free the pages_shared (but leaves mm_slots
	 * on the list for when ksmd may be set running again).
	 */

	mutex_lock(&ksm_thread_mutex);
	wait_while_offlining();
	if (ksm_run != flags) {
		ksm_run = flags;
		if (flags & KSM_RUN_UNMERGE) {
			set_current_oom_origin();
			err = unmerge_and_remove_all_rmap_items();
			clear_current_oom_origin();
			if (err) {
				/* Unmerge failed: fall back to stopped mode. */
				ksm_run = KSM_RUN_STOP;
				count = err;
			}
		}
	}
	mutex_unlock(&ksm_thread_mutex);

	if (flags & KSM_RUN_MERGE)
		wake_up_interruptible(&ksm_thread_wait);

	return count;
}
KSM_ATTR(run);
3485d21077fbSStefan Roesch
3486d21077fbSStefan Roesch #ifdef CONFIG_NUMA
/* Report whether KSM merges pages across NUMA nodes (1) or per node (0). */
static ssize_t merge_across_nodes_show(struct kobject *kobj,
				       struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%u\n", ksm_merge_across_nodes);
}
3492d21077fbSStefan Roesch
/*
 * Switch between a single stable/unstable tree shared by all NUMA nodes
 * (knob == 1) and per-node trees (knob == 0).  The mode can only change
 * while no pages are shared and all stable nodes can be removed (-EBUSY
 * otherwise); the first switch away from the shared default allocates
 * the per-node root arrays.
 */
static ssize_t merge_across_nodes_store(struct kobject *kobj,
					struct kobj_attribute *attr,
					const char *buf, size_t count)
{
	int err;
	unsigned long knob;

	err = kstrtoul(buf, 10, &knob);
	if (err)
		return err;
	if (knob > 1)
		return -EINVAL;

	mutex_lock(&ksm_thread_mutex);
	wait_while_offlining();
	if (ksm_merge_across_nodes != knob) {
		if (ksm_pages_shared || remove_all_stable_nodes())
			err = -EBUSY;
		else if (root_stable_tree == one_stable_tree) {
			struct rb_root *buf;
			/*
			 * This is the first time that we switch away from the
			 * default of merging across nodes: must now allocate
			 * a buffer to hold as many roots as may be needed.
			 * Allocate stable and unstable together:
			 * MAXSMP NODES_SHIFT 10 will use 16kB.
			 */
			buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
				      GFP_KERNEL);
			/* Let us assume that RB_ROOT is NULL is zero */
			if (!buf)
				err = -ENOMEM;
			else {
				root_stable_tree = buf;
				root_unstable_tree = buf + nr_node_ids;
				/* Stable tree is empty but not the unstable */
				root_unstable_tree[0] = one_unstable_tree[0];
			}
		}
		if (!err) {
			ksm_merge_across_nodes = knob;
			ksm_nr_node_ids = knob ? 1 : nr_node_ids;
		}
	}
	mutex_unlock(&ksm_thread_mutex);

	return err ? err : count;
}
KSM_ATTR(merge_across_nodes);
3542473b0ce4SHugh Dickins #endif
3543473b0ce4SHugh Dickins
use_zero_pages_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)35445e924ff5SStefan Roesch static ssize_t use_zero_pages_show(struct kobject *kobj,
35455e924ff5SStefan Roesch struct kobj_attribute *attr, char *buf)
35465e924ff5SStefan Roesch {
35475e924ff5SStefan Roesch return sysfs_emit(buf, "%u\n", ksm_use_zero_pages);
35485e924ff5SStefan Roesch }
use_zero_pages_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)35495e924ff5SStefan Roesch static ssize_t use_zero_pages_store(struct kobject *kobj,
35505e924ff5SStefan Roesch struct kobj_attribute *attr,
35515e924ff5SStefan Roesch const char *buf, size_t count)
35525e924ff5SStefan Roesch {
35535e924ff5SStefan Roesch int err;
35545e924ff5SStefan Roesch bool value;
35555e924ff5SStefan Roesch
35565e924ff5SStefan Roesch err = kstrtobool(buf, &value);
35575e924ff5SStefan Roesch if (err)
35585e924ff5SStefan Roesch return -EINVAL;
35595e924ff5SStefan Roesch
35605e924ff5SStefan Roesch ksm_use_zero_pages = value;
35615e924ff5SStefan Roesch
35625e924ff5SStefan Roesch return count;
35635e924ff5SStefan Roesch }
35645e924ff5SStefan Roesch KSM_ATTR(use_zero_pages);
35655e924ff5SStefan Roesch
max_page_sharing_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)356631dbd01fSIzik Eidus static ssize_t max_page_sharing_show(struct kobject *kobj,
356731dbd01fSIzik Eidus struct kobj_attribute *attr, char *buf)
356831dbd01fSIzik Eidus {
356931dbd01fSIzik Eidus return sysfs_emit(buf, "%u\n", ksm_max_page_sharing);
3570b348b5feSStefan Roesch }
3571b4028260SHugh Dickins
max_page_sharing_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)3572b4028260SHugh Dickins static ssize_t max_page_sharing_store(struct kobject *kobj,
3573473b0ce4SHugh Dickins struct kobj_attribute *attr,
3574473b0ce4SHugh Dickins const char *buf, size_t count)
3575e5a68991SStefan Roesch {
3576e2942062Sxu xin int err;
3577473b0ce4SHugh Dickins int knob;
357890bd6fd3SPetr Holasek
357990bd6fd3SPetr Holasek err = kstrtoint(buf, 10, &knob);
358090bd6fd3SPetr Holasek if (err)
35812c653d0eSAndrea Arcangeli return err;
35822c653d0eSAndrea Arcangeli /*
35832c653d0eSAndrea Arcangeli * When a KSM page is created it is shared by 2 mappings. This
35842c653d0eSAndrea Arcangeli * being a signed comparison, it implicitly verifies it's not
3585e86c59b1SClaudio Imbrenda * negative.
3586d21077fbSStefan Roesch */
35875e924ff5SStefan Roesch if (knob < 2)
358831dbd01fSIzik Eidus return -EINVAL;
358931dbd01fSIzik Eidus
359031dbd01fSIzik Eidus if (READ_ONCE(ksm_max_page_sharing) == knob)
3591f907c26aSArvind Yadav return count;
359231dbd01fSIzik Eidus
359331dbd01fSIzik Eidus mutex_lock(&ksm_thread_mutex);
359431dbd01fSIzik Eidus wait_while_offlining();
35952ffd8679SHugh Dickins if (ksm_max_page_sharing != knob) {
359631dbd01fSIzik Eidus if (ksm_pages_shared || remove_all_stable_nodes())
359731dbd01fSIzik Eidus err = -EBUSY;
359831dbd01fSIzik Eidus else
359931dbd01fSIzik Eidus ksm_max_page_sharing = knob;
360031dbd01fSIzik Eidus }
360131dbd01fSIzik Eidus mutex_unlock(&ksm_thread_mutex);
3602e86c59b1SClaudio Imbrenda
3603e86c59b1SClaudio Imbrenda return err ? err : count;
3604e86c59b1SClaudio Imbrenda }
3605e86c59b1SClaudio Imbrenda KSM_ATTR(max_page_sharing);
3606e86c59b1SClaudio Imbrenda
/* Report the ksm_pages_scanned counter through sysfs. */
static ssize_t pages_scanned_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lu\n", ksm_pages_scanned);
}
KSM_ATTR_RO(pages_scanned);
361325acde31SPaul McQuade
/* Report the ksm_pages_shared counter through sysfs. */
static ssize_t pages_shared_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lu\n", ksm_pages_shared);
}
KSM_ATTR_RO(pages_shared);
362031dbd01fSIzik Eidus
/* Report the ksm_pages_sharing counter through sysfs. */
static ssize_t pages_sharing_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lu\n", ksm_pages_sharing);
}
KSM_ATTR_RO(pages_sharing);
3627c73602adSHugh Dickins
/* Report the ksm_pages_unshared counter through sysfs. */
static ssize_t pages_unshared_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lu\n", ksm_pages_unshared);
}
KSM_ATTR_RO(pages_unshared);
363431dbd01fSIzik Eidus
pages_volatile_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)363531dbd01fSIzik Eidus static ssize_t pages_volatile_show(struct kobject *kobj,
3636d9f8984cSLai Jiangshan struct kobj_attribute *attr, char *buf)
363731dbd01fSIzik Eidus {
363831dbd01fSIzik Eidus long ksm_pages_volatile;
363931dbd01fSIzik Eidus
364031dbd01fSIzik Eidus ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
3641a64fb3cdSPaul Gortmaker - ksm_pages_sharing - ksm_pages_unshared;
3642 /*
3643 * It was not worth any locking to calculate that statistic,
3644 * but it might therefore sometimes be negative: conceal that.
3645 */
3646 if (ksm_pages_volatile < 0)
3647 ksm_pages_volatile = 0;
3648 return sysfs_emit(buf, "%ld\n", ksm_pages_volatile);
3649 }
3650 KSM_ATTR_RO(pages_volatile);
3651
/* Report the ksm_pages_skipped counter through sysfs. */
static ssize_t pages_skipped_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lu\n", ksm_pages_skipped);
}
KSM_ATTR_RO(pages_skipped);
3658
/* Report the ksm_zero_pages counter through sysfs. */
static ssize_t ksm_zero_pages_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%ld\n", ksm_zero_pages);
}
KSM_ATTR_RO(ksm_zero_pages);
3665
general_profit_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3666 static ssize_t general_profit_show(struct kobject *kobj,
3667 struct kobj_attribute *attr, char *buf)
3668 {
3669 long general_profit;
3670
3671 general_profit = (ksm_pages_sharing + ksm_zero_pages) * PAGE_SIZE -
3672 ksm_rmap_items * sizeof(struct ksm_rmap_item);
3673
3674 return sysfs_emit(buf, "%ld\n", general_profit);
3675 }
3676 KSM_ATTR_RO(general_profit);
3677
/* Report the ksm_stable_node_dups counter through sysfs. */
static ssize_t stable_node_dups_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lu\n", ksm_stable_node_dups);
}
KSM_ATTR_RO(stable_node_dups);
3684
/* Report the ksm_stable_node_chains counter through sysfs. */
static ssize_t stable_node_chains_show(struct kobject *kobj,
				       struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lu\n", ksm_stable_node_chains);
}
KSM_ATTR_RO(stable_node_chains);
3691
3692 static ssize_t
stable_node_chains_prune_millisecs_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3693 stable_node_chains_prune_millisecs_show(struct kobject *kobj,
3694 struct kobj_attribute *attr,
3695 char *buf)
3696 {
3697 return sysfs_emit(buf, "%u\n", ksm_stable_node_chains_prune_millisecs);
3698 }
3699
3700 static ssize_t
stable_node_chains_prune_millisecs_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)3701 stable_node_chains_prune_millisecs_store(struct kobject *kobj,
3702 struct kobj_attribute *attr,
3703 const char *buf, size_t count)
3704 {
3705 unsigned int msecs;
3706 int err;
3707
3708 err = kstrtouint(buf, 10, &msecs);
3709 if (err)
3710 return -EINVAL;
3711
3712 ksm_stable_node_chains_prune_millisecs = msecs;
3713
3714 return count;
3715 }
3716 KSM_ATTR(stable_node_chains_prune_millisecs);
3717
/* Report ksm_scan.seqnr: the number of completed full scans. */
static ssize_t full_scans_show(struct kobject *kobj,
			       struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lu\n", ksm_scan.seqnr);
}
KSM_ATTR_RO(full_scans);
3724
smart_scan_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3725 static ssize_t smart_scan_show(struct kobject *kobj,
3726 struct kobj_attribute *attr, char *buf)
3727 {
3728 return sysfs_emit(buf, "%u\n", ksm_smart_scan);
3729 }
3730
smart_scan_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)3731 static ssize_t smart_scan_store(struct kobject *kobj,
3732 struct kobj_attribute *attr,
3733 const char *buf, size_t count)
3734 {
3735 int err;
3736 bool value;
3737
3738 err = kstrtobool(buf, &value);
3739 if (err)
3740 return -EINVAL;
3741
3742 ksm_smart_scan = value;
3743 return count;
3744 }
3745 KSM_ATTR(smart_scan);
3746
advisor_mode_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3747 static ssize_t advisor_mode_show(struct kobject *kobj,
3748 struct kobj_attribute *attr, char *buf)
3749 {
3750 const char *output;
3751
3752 if (ksm_advisor == KSM_ADVISOR_NONE)
3753 output = "[none] scan-time";
3754 else if (ksm_advisor == KSM_ADVISOR_SCAN_TIME)
3755 output = "none [scan-time]";
3756
3757 return sysfs_emit(buf, "%s\n", output);
3758 }
3759
advisor_mode_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)3760 static ssize_t advisor_mode_store(struct kobject *kobj,
3761 struct kobj_attribute *attr, const char *buf,
3762 size_t count)
3763 {
3764 enum ksm_advisor_type curr_advisor = ksm_advisor;
3765
3766 if (sysfs_streq("scan-time", buf))
3767 ksm_advisor = KSM_ADVISOR_SCAN_TIME;
3768 else if (sysfs_streq("none", buf))
3769 ksm_advisor = KSM_ADVISOR_NONE;
3770 else
3771 return -EINVAL;
3772
3773 /* Set advisor default values */
3774 if (curr_advisor != ksm_advisor)
3775 set_advisor_defaults();
3776
3777 return count;
3778 }
3779 KSM_ATTR(advisor_mode);
3780
advisor_max_cpu_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3781 static ssize_t advisor_max_cpu_show(struct kobject *kobj,
3782 struct kobj_attribute *attr, char *buf)
3783 {
3784 return sysfs_emit(buf, "%u\n", ksm_advisor_max_cpu);
3785 }
3786
advisor_max_cpu_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)3787 static ssize_t advisor_max_cpu_store(struct kobject *kobj,
3788 struct kobj_attribute *attr,
3789 const char *buf, size_t count)
3790 {
3791 int err;
3792 unsigned long value;
3793
3794 err = kstrtoul(buf, 10, &value);
3795 if (err)
3796 return -EINVAL;
3797
3798 ksm_advisor_max_cpu = value;
3799 return count;
3800 }
3801 KSM_ATTR(advisor_max_cpu);
3802
advisor_min_pages_to_scan_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3803 static ssize_t advisor_min_pages_to_scan_show(struct kobject *kobj,
3804 struct kobj_attribute *attr, char *buf)
3805 {
3806 return sysfs_emit(buf, "%lu\n", ksm_advisor_min_pages_to_scan);
3807 }
3808
advisor_min_pages_to_scan_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)3809 static ssize_t advisor_min_pages_to_scan_store(struct kobject *kobj,
3810 struct kobj_attribute *attr,
3811 const char *buf, size_t count)
3812 {
3813 int err;
3814 unsigned long value;
3815
3816 err = kstrtoul(buf, 10, &value);
3817 if (err)
3818 return -EINVAL;
3819
3820 ksm_advisor_min_pages_to_scan = value;
3821 return count;
3822 }
3823 KSM_ATTR(advisor_min_pages_to_scan);
3824
advisor_max_pages_to_scan_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3825 static ssize_t advisor_max_pages_to_scan_show(struct kobject *kobj,
3826 struct kobj_attribute *attr, char *buf)
3827 {
3828 return sysfs_emit(buf, "%lu\n", ksm_advisor_max_pages_to_scan);
3829 }
3830
advisor_max_pages_to_scan_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)3831 static ssize_t advisor_max_pages_to_scan_store(struct kobject *kobj,
3832 struct kobj_attribute *attr,
3833 const char *buf, size_t count)
3834 {
3835 int err;
3836 unsigned long value;
3837
3838 err = kstrtoul(buf, 10, &value);
3839 if (err)
3840 return -EINVAL;
3841
3842 ksm_advisor_max_pages_to_scan = value;
3843 return count;
3844 }
3845 KSM_ATTR(advisor_max_pages_to_scan);
3846
advisor_target_scan_time_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3847 static ssize_t advisor_target_scan_time_show(struct kobject *kobj,
3848 struct kobj_attribute *attr, char *buf)
3849 {
3850 return sysfs_emit(buf, "%lu\n", ksm_advisor_target_scan_time);
3851 }
3852
advisor_target_scan_time_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)3853 static ssize_t advisor_target_scan_time_store(struct kobject *kobj,
3854 struct kobj_attribute *attr,
3855 const char *buf, size_t count)
3856 {
3857 int err;
3858 unsigned long value;
3859
3860 err = kstrtoul(buf, 10, &value);
3861 if (err)
3862 return -EINVAL;
3863 if (value < 1)
3864 return -EINVAL;
3865
3866 ksm_advisor_target_scan_time = value;
3867 return count;
3868 }
3869 KSM_ATTR(advisor_target_scan_time);
3870
/* All KSM sysfs attributes, registered as a group below. */
static struct attribute *ksm_attrs[] = {
	&sleep_millisecs_attr.attr,
	&pages_to_scan_attr.attr,
	&run_attr.attr,
	&pages_scanned_attr.attr,
	&pages_shared_attr.attr,
	&pages_sharing_attr.attr,
	&pages_unshared_attr.attr,
	&pages_volatile_attr.attr,
	&pages_skipped_attr.attr,
	&ksm_zero_pages_attr.attr,
	&full_scans_attr.attr,
#ifdef CONFIG_NUMA
	&merge_across_nodes_attr.attr,
#endif
	&max_page_sharing_attr.attr,
	&stable_node_chains_attr.attr,
	&stable_node_dups_attr.attr,
	&stable_node_chains_prune_millisecs_attr.attr,
	&use_zero_pages_attr.attr,
	&general_profit_attr.attr,
	&smart_scan_attr.attr,
	&advisor_mode_attr.attr,
	&advisor_max_cpu_attr.attr,
	&advisor_min_pages_to_scan_attr.attr,
	&advisor_max_pages_to_scan_attr.attr,
	&advisor_target_scan_time_attr.attr,
	NULL,	/* sentinel: sysfs walks this array until NULL */
};
3900
/* Group the attributes under a "ksm" subdirectory (see sysfs_create_group). */
static const struct attribute_group ksm_attr_group = {
	.attrs = ksm_attrs,
	.name = "ksm",
};
#endif /* CONFIG_SYSFS */
3906
/*
 * KSM initialization: compute the zero-page checksum, set up KSM's
 * allocations via ksm_slab_init(), start the ksmd kernel thread, and
 * register the sysfs interface (when CONFIG_SYSFS is enabled).
 * Returns 0 on success or a negative errno, unwinding via goto labels.
 */
static int __init ksm_init(void)
{
	struct task_struct *ksm_thread;
	int err;

	/* The correct value depends on page size and endianness */
	zero_checksum = calc_checksum(ZERO_PAGE(0));
	/* Default to false for backwards compatibility */
	ksm_use_zero_pages = false;

	err = ksm_slab_init();
	if (err)
		goto out;

	ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
	if (IS_ERR(ksm_thread)) {
		pr_err("ksm: creating kthread failed\n");
		err = PTR_ERR(ksm_thread);
		goto out_free;
	}

#ifdef CONFIG_SYSFS
	err = sysfs_create_group(mm_kobj, &ksm_attr_group);
	if (err) {
		pr_err("ksm: register sysfs failed\n");
		/* The thread was already started: stop it before unwinding */
		kthread_stop(ksm_thread);
		goto out_free;
	}
#else
	ksm_run = KSM_RUN_MERGE;	/* no way for user to start it */

#endif /* CONFIG_SYSFS */

#ifdef CONFIG_MEMORY_HOTREMOVE
	/* There is no significance to this priority 100 */
	hotplug_memory_notifier(ksm_memory_callback, KSM_CALLBACK_PRI);
#endif
	return 0;

out_free:
	ksm_slab_free();
out:
	return err;
}
subsys_initcall(ksm_init);
3952