/*
 * Memory merging support.
 *
 * This code enables dynamic sharing of identical pages found in different
 * memory areas, even if they are not shared by fork()
 *
 * Copyright (C) 2008-2009 Red Hat, Inc.
 * Authors:
 *	Izik Eidus
 *	Andrea Arcangeli
 *	Chris Wright
 *	Hugh Dickins
 *
 * This work is licensed under the terms of the GNU GPL, version 2.
 */

#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>
#include <linux/rwsem.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/spinlock.h>
#include <linux/jhash.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mmu_notifier.h>
#include <linux/swap.h>
#include <linux/ksm.h>

#include <asm/tlbflush.h>

/*
 * A few notes about the KSM scanning process,
 * to make it easier to understand the data structures below:
 *
 * In order to reduce excessive scanning, KSM sorts the memory pages by their
 * contents into a data structure that holds pointers to the pages' locations.
 *
 * Since the contents of the pages may change at any moment, KSM cannot just
 * insert the pages into a normal sorted tree and expect it to find anything.
 * Therefore KSM uses two data structures - the stable and the unstable tree.
 *
 * The stable tree holds pointers to all the merged pages (ksm pages), sorted
 * by their contents.  Because each such page is write-protected, searching on
 * this tree is fully assured to be working (except when pages are unmapped),
 * and therefore this tree is called the stable tree.
 *
 * In addition to the stable tree, KSM uses a second data structure called the
 * unstable tree: this tree holds pointers to pages which have been found to
 * be "unchanged for a period of time".  The unstable tree sorts these pages
 * by their contents, but since they are not write-protected, KSM cannot rely
 * upon the unstable tree to work correctly - the unstable tree is liable to
 * be corrupted as its contents are modified, and so it is called unstable.
 *
 * KSM solves this problem by several techniques:
 *
 * 1) The unstable tree is flushed every time KSM completes scanning all
 *    memory areas, and then the tree is rebuilt again from the beginning.
 * 2) KSM will only insert into the unstable tree, pages whose hash value
 *    has not changed since the previous scan of all memory areas.
 * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
 *    colors of the nodes and not on their contents, assuring that even when
 *    the tree gets "corrupted" it won't get out of balance, so scanning time
 *    remains the same (also, searching and inserting nodes in an rbtree uses
 *    the same algorithm, so we have no overhead when we flush and rebuild).
 * 4) KSM never flushes the stable tree, which means that even if it were to
 *    take 10 attempts to find a page in the unstable tree, once it is found,
 *    it is secured in the stable tree.  (When we scan a new page, we first
 *    compare it against the stable tree, and then against the unstable tree.)
 */

/**
 * struct mm_slot - ksm information per mm that is being scanned
 * @link: link to the mm_slots hash list
 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
 * @rmap_list: head for this mm_slot's list of rmap_items
 * @mm: the mm that this information is valid for
 */
struct mm_slot {
	struct hlist_node link;
	struct list_head mm_list;
	struct list_head rmap_list;
	struct mm_struct *mm;
};

/**
 * struct ksm_scan - cursor for scanning
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 * @rmap_item: the current rmap that we are scanning inside the rmap_list
 * @seqnr: count of completed full scans (needed when removing unstable node)
 *
 * There is only the one ksm_scan instance of this cursor structure.
 */
struct ksm_scan {
	struct mm_slot *mm_slot;
	unsigned long address;
	struct rmap_item *rmap_item;
	unsigned long seqnr;
};

/**
 * struct rmap_item - reverse mapping item for virtual addresses
 * @link: link into mm_slot's rmap_list (rmap_list is per mm)
 * @mm: the memory structure this rmap_item is pointing into
 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
 * @oldchecksum: previous checksum of the page at that virtual address
 * @node: rb_node of this rmap_item in either unstable or stable tree
 * @next: next rmap_item hanging off the same node of the stable tree
 * @prev: previous rmap_item hanging off the same node of the stable tree
 *
 * Which member of each union is live is determined by the flag bits kept
 * in the low bits of @address (see the NODE_FLAG/STABLE_FLAG defines below).
 */
struct rmap_item {
	struct list_head link;
	struct mm_struct *mm;
	unsigned long address;		/* + low bits used for flags below */
	union {
		unsigned int oldchecksum;	/* when unstable */
		struct rmap_item *next;		/* when stable */
	};
	union {
		struct rb_node node;		/* when tree node */
		struct rmap_item *prev;		/* in stable list */
	};
};

#define SEQNR_MASK	0x0ff	/* low bits of unstable tree seqnr */
#define NODE_FLAG	0x100	/* is a node of unstable or stable tree */
#define STABLE_FLAG	0x200	/* is a node or list item of stable tree */

/* The stable and unstable tree heads */
static struct rb_root root_stable_tree = RB_ROOT;
static struct rb_root root_unstable_tree = RB_ROOT;

/* Hash table mapping an mm_struct to its mm_slot */
#define MM_SLOTS_HASH_HEADS 1024
static struct hlist_head *mm_slots_hash;

/* Dummy list head that anchors the circular list of mm_slots to scan */
static struct mm_slot ksm_mm_head = {
	.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
};
/* The single scan cursor, initially parked on the dummy head */
static struct ksm_scan ksm_scan = {
	.mm_slot = &ksm_mm_head,
};

static struct kmem_cache *rmap_item_cache;
static struct kmem_cache *mm_slot_cache;

/* The number of nodes in the stable tree */
static unsigned long ksm_pages_shared;

/* The number of page slots additionally sharing those nodes */
static unsigned long ksm_pages_sharing;

/* The number of nodes in the unstable tree */
static unsigned long ksm_pages_unshared;

/* The number of rmap_items in use: to calculate pages_volatile */
static unsigned long ksm_rmap_items;

/* Limit on the number of unswappable pages used */
static unsigned long ksm_max_kernel_pages;

/* Number of pages ksmd should scan in one batch */
static unsigned int ksm_thread_pages_to_scan = 100;

/* Milliseconds ksmd should sleep between batches */
static unsigned int ksm_thread_sleep_millisecs = 20;

/* Run states for ksm_run, driven from sysfs */
#define KSM_RUN_STOP	0
#define KSM_RUN_MERGE	1
#define KSM_RUN_UNMERGE	2
static unsigned int ksm_run = KSM_RUN_STOP;

static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock);

/* Create a slab cache named "ksm_<struct>" sized/aligned for that struct */
#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
		sizeof(struct __struct), __alignof__(struct __struct),\
		(__flags), NULL)

/*
 * Set up the two slab caches; on failure everything allocated so far is
 * torn down and -ENOMEM returned.
 */
static int __init ksm_slab_init(void)
{
	rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
	if (!rmap_item_cache)
		goto out;

	mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
	if (!mm_slot_cache)
		goto out_free;

	return 0;

out_free:
	kmem_cache_destroy(rmap_item_cache);
out:
	return -ENOMEM;
}

static void __init ksm_slab_free(void)
{
	kmem_cache_destroy(mm_slot_cache);
	kmem_cache_destroy(rmap_item_cache);
	mm_slot_cache = NULL;
}

/* Allocate a zeroed rmap_item, keeping the ksm_rmap_items count in step */
static inline struct rmap_item *alloc_rmap_item(void)
{
	struct rmap_item *rmap_item;

	rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
	if (rmap_item)
		ksm_rmap_items++;
	return rmap_item;
}

static inline void free_rmap_item(struct rmap_item *rmap_item)
{
	ksm_rmap_items--;
	rmap_item->mm = NULL;	/* debug safety */
	kmem_cache_free(rmap_item_cache, rmap_item);
}

static inline struct mm_slot *alloc_mm_slot(void)
{
	if (!mm_slot_cache)	/* initialization failed */
		return NULL;
	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
}

static inline void free_mm_slot(struct mm_slot *mm_slot)
{
	kmem_cache_free(mm_slot_cache, mm_slot);
}

static int __init mm_slots_hash_init(void)
{
	mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
				GFP_KERNEL);
	if (!mm_slots_hash)
		return -ENOMEM;
	return 0;
}

static void __init mm_slots_hash_free(void)
{
	kfree(mm_slots_hash);
}

/* Look up the mm_slot for an mm in mm_slots_hash; NULL if not registered */
static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	struct hlist_head *bucket;
	struct hlist_node *node;

	/* Bucket index is derived from the mm pointer value itself */
	bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
				% MM_SLOTS_HASH_HEADS];
	hlist_for_each_entry(mm_slot, node, bucket, link) {
		if (mm == mm_slot->mm)
			return mm_slot;
	}
	return NULL;
}

/* Initialize mm_slot for mm and hash it in (same bucket scheme as lookup) */
static void insert_to_mm_slots_hash(struct mm_struct *mm,
				    struct mm_slot *mm_slot)
{
	struct hlist_head *bucket;

	bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
				% MM_SLOTS_HASH_HEADS];
	mm_slot->mm = mm;
	INIT_LIST_HEAD(&mm_slot->rmap_list);
	hlist_add_head(&mm_slot->link, bucket);
}

/* Nonzero if this rmap_item is a node or list item of the stable tree */
static inline int in_stable_tree(struct rmap_item *rmap_item)
{
	return rmap_item->address & STABLE_FLAG;
}

/*
 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
 * page tables after it has passed through ksm_exit() - which, if necessary,
 * takes mmap_sem briefly to serialize against them.  ksm_exit() does not set
 * a special flag: they can just back out as soon as mm_users goes to zero.
 * ksm_test_exit() is used throughout to make this test for exit: in some
 * places for correctness, in some places just to avoid unnecessary work.
 */
static inline bool ksm_test_exit(struct mm_struct *mm)
{
	return atomic_read(&mm->mm_users) == 0;
}

/*
 * We use break_ksm to break COW on a ksm page: it's a stripped down
 *
 *	if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
 *		put_page(page);
 *
 * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
 * in case the application has unmapped and remapped mm,addr meanwhile.
 * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
 * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
 *
 * Returns 0 on success, -ENOMEM if the write fault hit OOM.
 */
static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;
	int ret = 0;

	do {
		cond_resched();
		page = follow_page(vma, addr, FOLL_GET);
		if (!page)
			break;
		if (PageKsm(page))
			ret = handle_mm_fault(vma->vm_mm, vma, addr,
							FAULT_FLAG_WRITE);
		else
			ret = VM_FAULT_WRITE;
		put_page(page);
	} while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
	/*
	 * We must loop because handle_mm_fault() may back out if there's
	 * any difficulty e.g. if pte accessed bit gets updated concurrently.
	 *
	 * VM_FAULT_WRITE is what we have been hoping for: it indicates that
	 * COW has been broken, even if the vma does not permit VM_WRITE;
	 * but note that a concurrent fault might break PageKsm for us.
	 *
	 * VM_FAULT_SIGBUS could occur if we race with truncation of the
	 * backing file, which also invalidates anonymous pages: that's
	 * okay, that truncation will have unmapped the PageKsm for us.
	 *
	 * VM_FAULT_OOM: at the time of writing (late July 2009), setting
	 * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
	 * current task has TIF_MEMDIE set, and will be OOM killed on return
	 * to user; and ksmd, having no mm, would never be chosen for that.
	 *
	 * But if the mm is in a limited mem_cgroup, then the fault may fail
	 * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
	 * even ksmd can fail in this way - though it's usually breaking ksm
	 * just to undo a merge it made a moment before, so unlikely to oom.
	 *
	 * That's a pity: we might therefore have more kernel pages allocated
	 * than we're counting as nodes in the stable tree; but ksm_do_scan
	 * will retry to break_cow on each pass, so should recover the page
	 * in due course.  The important thing is to not let VM_MERGEABLE
	 * be cleared while any such pages might remain in the area.
	 */
	return (ret & VM_FAULT_OOM) ?
-ENOMEM : 0;
}

/*
 * Break COW at mm,addr if it still maps a ksm page in a VM_MERGEABLE vma;
 * silently backs out if the mm is exiting or the vma no longer qualifies.
 */
static void break_cow(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma;

	down_read(&mm->mmap_sem);
	if (ksm_test_exit(mm))
		goto out;
	vma = find_vma(mm, addr);
	if (!vma || vma->vm_start > addr)
		goto out;
	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
		goto out;
	break_ksm(vma, addr);
out:
	up_read(&mm->mmap_sem);
}

/*
 * Get the anonymous page at rmap_item's mm,address (with a reference held),
 * or NULL if the area is no longer mergeable or the page has gone.
 */
static struct page *get_mergeable_page(struct rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	unsigned long addr = rmap_item->address;
	struct vm_area_struct *vma;
	struct page *page;

	down_read(&mm->mmap_sem);
	if (ksm_test_exit(mm))
		goto out;
	vma = find_vma(mm, addr);
	if (!vma || vma->vm_start > addr)
		goto out;
	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
		goto out;

	page = follow_page(vma, addr, FOLL_GET);
	if (!page)
		goto out;
	if (PageAnon(page)) {
		flush_anon_page(vma, page, addr);
		flush_dcache_page(page);
	} else {
		put_page(page);
out:		page = NULL;	/* note: out label lands inside this branch */
	}
	up_read(&mm->mmap_sem);
	return page;
}

/*
 * get_ksm_page: checks if the page at the virtual address in rmap_item
 * is still PageKsm, in which case we can trust the content of the page,
 * and it returns the gotten page; but NULL if the page has been zapped.
 */
static struct page *get_ksm_page(struct rmap_item *rmap_item)
{
	struct page *page;

	page = get_mergeable_page(rmap_item);
	if (page && !PageKsm(page)) {
		put_page(page);
		page = NULL;
	}
	return page;
}

/*
 * Removing rmap_item from stable or unstable tree.
 * This function will clean the information from the stable/unstable tree,
 * and keep the ksm_pages_shared/sharing/unshared counts in step.
 */
static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
{
	if (in_stable_tree(rmap_item)) {
		struct rmap_item *next_item = rmap_item->next;

		if (rmap_item->address & NODE_FLAG) {
			/* This item is the stable tree node itself */
			if (next_item) {
				/* Promote the next list item to tree node */
				rb_replace_node(&rmap_item->node,
						&next_item->node,
						&root_stable_tree);
				next_item->address |= NODE_FLAG;
				ksm_pages_sharing--;
			} else {
				rb_erase(&rmap_item->node, &root_stable_tree);
				ksm_pages_shared--;
			}
		} else {
			/* This item hangs off a stable node: unlink it */
			struct rmap_item *prev_item = rmap_item->prev;

			BUG_ON(prev_item->next != rmap_item);
			prev_item->next = next_item;
			if (next_item) {
				BUG_ON(next_item->prev != rmap_item);
				next_item->prev = rmap_item->prev;
			}
			ksm_pages_sharing--;
		}

		rmap_item->next = NULL;

	} else if (rmap_item->address & NODE_FLAG) {
		unsigned char age;
		/*
		 * Usually ksmd can and must skip the rb_erase, because
		 * root_unstable_tree was already reset to RB_ROOT.
		 * But be careful when an mm is exiting: do the rb_erase
		 * if this rmap_item was inserted by this scan, rather
		 * than left over from before.
		 */
		age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
		BUG_ON(age > 1);
		if (!age)
			rb_erase(&rmap_item->node, &root_unstable_tree);
		ksm_pages_unshared--;
	}

	rmap_item->address &= PAGE_MASK;

	cond_resched();		/* we're called from many long loops */
}

/* Remove and free every rmap_item from cur to the end of the rmap_list */
static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
				       struct list_head *cur)
{
	struct rmap_item *rmap_item;

	while (cur != &mm_slot->rmap_list) {
		rmap_item = list_entry(cur, struct rmap_item, link);
		cur = cur->next;
		remove_rmap_item_from_tree(rmap_item);
		list_del(&rmap_item->link);
		free_rmap_item(rmap_item);
	}
}

/*
 * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
 * than check every pte of a given vma, the locking doesn't quite work for
 * that - an rmap_item is assigned to the stable tree after inserting ksm
 * page and upping mmap_sem.  Nor does it fit with the way we skip dup'ing
 * rmap_items from parent to child at fork time (so as not to waste time
 * if exit comes before the next scan reaches it).
 *
 * Similarly, although we'd like to remove rmap_items (so updating counts
 * and freeing memory) when unmerging an area, it's easier to leave that
 * to the next pass of ksmd - consider, for example, how ksmd might be
 * in cmp_and_merge_page on one of the rmap_items we would be removing.
 */
static int unmerge_ksm_pages(struct vm_area_struct *vma,
			     unsigned long start, unsigned long end)
{
	unsigned long addr;
	int err = 0;

	for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
		if (ksm_test_exit(vma->vm_mm))
			break;
		if (signal_pending(current))
			err = -ERESTARTSYS;
		else
			err = break_ksm(vma, addr);
	}
	return err;
}

#ifdef CONFIG_SYSFS
/*
 * Only called through the sysfs control interface:
 *
 * Walks every registered mm, unmerging all its mergeable vmas and removing
 * its rmap_items; mms found to be exiting are unhashed and dropped here.
 * On error the scan cursor is parked back on ksm_mm_head.
 */
static int unmerge_and_remove_all_rmap_items(void)
{
	struct mm_slot *mm_slot;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	int err = 0;

	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
						struct mm_slot, mm_list);
	spin_unlock(&ksm_mmlist_lock);

	for (mm_slot = ksm_scan.mm_slot;
			mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
		mm = mm_slot->mm;
		down_read(&mm->mmap_sem);
		for (vma = mm->mmap; vma; vma = vma->vm_next) {
			if (ksm_test_exit(mm))
				break;
			if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
				continue;
			err = unmerge_ksm_pages(vma,
						vma->vm_start, vma->vm_end);
			if (err)
				goto error;
		}

		remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next);

		spin_lock(&ksm_mmlist_lock);
		/* Advance the cursor before this slot may be freed below */
		ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
						struct mm_slot, mm_list);
		if (ksm_test_exit(mm)) {
			hlist_del(&mm_slot->link);
			list_del(&mm_slot->mm_list);
			spin_unlock(&ksm_mmlist_lock);

			free_mm_slot(mm_slot);
			clear_bit(MMF_VM_MERGEABLE, &mm->flags);
			up_read(&mm->mmap_sem);
			mmdrop(mm);
		} else {
			spin_unlock(&ksm_mmlist_lock);
			up_read(&mm->mmap_sem);
		}
	}

	ksm_scan.seqnr = 0;
	return 0;

error:
	up_read(&mm->mmap_sem);
	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = &ksm_mm_head;
	spin_unlock(&ksm_mmlist_lock);
	return err;
}
#endif /* CONFIG_SYSFS */

/* Hash the page contents with jhash2 (seed 17) for unstable-tree checks */
static u32 calc_checksum(struct page *page)
{
	u32 checksum;
	void *addr = kmap_atomic(page, KM_USER0);
	checksum = jhash2(addr, PAGE_SIZE / 4, 17);
	kunmap_atomic(addr, KM_USER0);
	return checksum;
}

/* memcmp() the full contents of two pages via atomic kmaps */
static int memcmp_pages(struct page *page1, struct page *page2)
{
	char *addr1, *addr2;
	int ret;

	addr1 = kmap_atomic(page1, KM_USER0);
	addr2 = kmap_atomic(page2, KM_USER1);
	ret = memcmp(addr1, addr2, PAGE_SIZE);
	kunmap_atomic(addr2, KM_USER1);
	kunmap_atomic(addr1, KM_USER0);
	return ret;
}

static inline int pages_identical(struct page *page1, struct page *page2)
{
	return !memcmp_pages(page1, page2);
}

/*
 * Write-protect page's pte in vma, saving the prior pte value in *orig_pte.
 * Returns 0 on success, -EFAULT if the page is not (or no longer) mapped
 * there, or if I/O appears to be in progress on the page.
 */
static int write_protect_page(struct vm_area_struct *vma, struct page *page,
			      pte_t *orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long addr;
	pte_t *ptep;
	spinlock_t *ptl;
	int swapped;
	int err = -EFAULT;

	addr = page_address_in_vma(page, vma);
	if (addr == -EFAULT)
		goto out;

	ptep = page_check_address(page, mm, addr, &ptl,
0); 62731dbd01fSIzik Eidus if (!ptep) 62831dbd01fSIzik Eidus goto out; 62931dbd01fSIzik Eidus 63031dbd01fSIzik Eidus if (pte_write(*ptep)) { 63131dbd01fSIzik Eidus pte_t entry; 63231dbd01fSIzik Eidus 63331dbd01fSIzik Eidus swapped = PageSwapCache(page); 63431dbd01fSIzik Eidus flush_cache_page(vma, addr, page_to_pfn(page)); 63531dbd01fSIzik Eidus /* 63631dbd01fSIzik Eidus * Ok this is tricky, when get_user_pages_fast() run it doesnt 63731dbd01fSIzik Eidus * take any lock, therefore the check that we are going to make 63831dbd01fSIzik Eidus * with the pagecount against the mapcount is racey and 63931dbd01fSIzik Eidus * O_DIRECT can happen right after the check. 64031dbd01fSIzik Eidus * So we clear the pte and flush the tlb before the check 64131dbd01fSIzik Eidus * this assure us that no O_DIRECT can happen after the check 64231dbd01fSIzik Eidus * or in the middle of the check. 64331dbd01fSIzik Eidus */ 64431dbd01fSIzik Eidus entry = ptep_clear_flush(vma, addr, ptep); 64531dbd01fSIzik Eidus /* 64631dbd01fSIzik Eidus * Check that no O_DIRECT or similar I/O is in progress on the 64731dbd01fSIzik Eidus * page 64831dbd01fSIzik Eidus */ 64931dbd01fSIzik Eidus if ((page_mapcount(page) + 2 + swapped) != page_count(page)) { 65031dbd01fSIzik Eidus set_pte_at_notify(mm, addr, ptep, entry); 65131dbd01fSIzik Eidus goto out_unlock; 65231dbd01fSIzik Eidus } 65331dbd01fSIzik Eidus entry = pte_wrprotect(entry); 65431dbd01fSIzik Eidus set_pte_at_notify(mm, addr, ptep, entry); 65531dbd01fSIzik Eidus } 65631dbd01fSIzik Eidus *orig_pte = *ptep; 65731dbd01fSIzik Eidus err = 0; 65831dbd01fSIzik Eidus 65931dbd01fSIzik Eidus out_unlock: 66031dbd01fSIzik Eidus pte_unmap_unlock(ptep, ptl); 66131dbd01fSIzik Eidus out: 66231dbd01fSIzik Eidus return err; 66331dbd01fSIzik Eidus } 66431dbd01fSIzik Eidus 66531dbd01fSIzik Eidus /** 66631dbd01fSIzik Eidus * replace_page - replace page in vma by new ksm page 66731dbd01fSIzik Eidus * @vma: vma that holds the pte pointing to oldpage 66831dbd01fSIzik 
 * @oldpage: the page we are replacing by newpage
 * @newpage: the ksm page we replace oldpage by
 * @orig_pte: the original value of the pte
 *
 * Returns 0 on success, -EFAULT on failure.
 */
static int replace_page(struct vm_area_struct *vma, struct page *oldpage,
			struct page *newpage, pte_t orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep;
	spinlock_t *ptl;
	unsigned long addr;
	pgprot_t prot;
	int err = -EFAULT;

	/* The ksm page is always mapped read-only: strip VM_WRITE. */
	prot = vm_get_page_prot(vma->vm_flags & ~VM_WRITE);

	addr = page_address_in_vma(oldpage, vma);
	if (addr == -EFAULT)
		goto out;

	/* Walk the page tables by hand down to the pte for addr. */
	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		goto out;

	pud = pud_offset(pgd, addr);
	if (!pud_present(*pud))
		goto out;

	pmd = pmd_offset(pud, addr);
	if (!pmd_present(*pmd))
		goto out;

	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
	/* Bail out if the pte changed since write_protect_page saved it. */
	if (!pte_same(*ptep, orig_pte)) {
		pte_unmap_unlock(ptep, ptl);
		goto out;
	}

	/* Take our own reference on newpage before mapping it. */
	get_page(newpage);
	page_add_ksm_rmap(newpage);

	flush_cache_page(vma, addr, pte_pfn(*ptep));
	ptep_clear_flush(vma, addr, ptep);
	set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot));

	/* Drop oldpage's rmap and the reference the mapping held. */
	page_remove_rmap(oldpage);
	put_page(oldpage);

	pte_unmap_unlock(ptep, ptl);
	err = 0;
out:
	return err;
}

/*
 * try_to_merge_one_page - take two pages and merge them into one
 * @vma: the vma that holds the pte pointing into oldpage
 * @oldpage: the page that we want to replace with newpage
 * @newpage: the page that we want to map instead of oldpage
 *
 * Note:
 * oldpage should be a PageAnon page, while newpage should be a PageKsm page,
 * or a newly allocated kernel page which page_add_ksm_rmap will make PageKsm.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */
static int try_to_merge_one_page(struct vm_area_struct *vma,
				 struct page *oldpage,
				 struct page *newpage)
{
	pte_t orig_pte = __pte(0);
	int err = -EFAULT;

	if (!(vma->vm_flags & VM_MERGEABLE))
		goto out;

	if (!PageAnon(oldpage))
		goto out;

	/* Hold references on both pages across the merge attempt. */
	get_page(newpage);
	get_page(oldpage);

	/*
	 * We need the page lock to read a stable PageSwapCache in
	 * write_protect_page().  We use trylock_page() instead of
	 * lock_page() because we don't want to wait here - we
	 * prefer to continue scanning and merging different pages,
	 * then come back to this page when it is unlocked.
	 */
	if (!trylock_page(oldpage))
		goto out_putpage;
	/*
	 * If this anonymous page is mapped only here, its pte may need
	 * to be write-protected.  If it's mapped elsewhere, all of its
	 * ptes are necessarily already write-protected.  But in either
	 * case, we need to lock and check page_count is not raised.
	 */
	if (write_protect_page(vma, oldpage, &orig_pte)) {
		unlock_page(oldpage);
		goto out_putpage;
	}
	unlock_page(oldpage);

	/* Only replace if the contents are still byte-identical. */
	if (pages_identical(oldpage, newpage))
		err = replace_page(vma, oldpage, newpage, orig_pte);

out_putpage:
	put_page(oldpage);
	put_page(newpage);
out:
	return err;
}

/*
 * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
 * but no new kernel page is allocated: kpage must already be a ksm page.
 *
 * Returns 0 on success, -EFAULT if the mm is exiting or the address
 * no longer falls inside a vma.
 */
static int try_to_merge_with_ksm_page(struct mm_struct *mm1,
				      unsigned long addr1,
				      struct page *page1,
				      struct page *kpage)
{
	struct vm_area_struct *vma;
	int err = -EFAULT;

	down_read(&mm1->mmap_sem);
	if (ksm_test_exit(mm1))
		goto out;

	vma = find_vma(mm1, addr1);
	if (!vma || vma->vm_start > addr1)
		goto out;

	err = try_to_merge_one_page(vma, page1, kpage);
out:
	up_read(&mm1->mmap_sem);
	return err;
}

/*
 * try_to_merge_two_pages - take two identical pages and prepare them
 * to be merged into one page.
 *
 * This function returns 0 if we successfully mapped two identical pages
 * into one page, -EFAULT otherwise.
 *
 * Note that this function allocates a new kernel page: if one of the pages
 * is already a ksm page, try_to_merge_with_ksm_page should be used.
 */
static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1,
				  struct page *page1, struct mm_struct *mm2,
				  unsigned long addr2, struct page *page2)
{
	struct vm_area_struct *vma;
	struct page *kpage;
	int err = -EFAULT;

	/*
	 * The number of nodes in the stable tree
	 * is the number of kernel pages that we hold.
	 */
	if (ksm_max_kernel_pages &&
	    ksm_max_kernel_pages <= ksm_pages_shared)
		return err;

	kpage = alloc_page(GFP_HIGHUSER);
	if (!kpage)
		return err;

	down_read(&mm1->mmap_sem);
	if (ksm_test_exit(mm1)) {
		up_read(&mm1->mmap_sem);
		goto out;
	}
	vma = find_vma(mm1, addr1);
	if (!vma || vma->vm_start > addr1) {
		up_read(&mm1->mmap_sem);
		goto out;
	}

	/* Snapshot page1 into the new kernel page, then merge against it. */
	copy_user_highpage(kpage, page1, addr1, vma);
	err = try_to_merge_one_page(vma, page1, kpage);
	up_read(&mm1->mmap_sem);

	if (!err) {
		err = try_to_merge_with_ksm_page(mm2, addr2, page2, kpage);
		/*
		 * If that fails, we have a ksm page with only one pte
		 * pointing to it: so break it.
		 */
		if (err)
			break_cow(mm1, addr1);
	}
out:
	put_page(kpage);
	return err;
}

/*
 * stable_tree_search - search page inside the stable tree
 * @page: the page that we are searching identical pages to.
 * @page2: pointer into identical page that we are holding inside the stable
 *	   tree that we have found.
 * @rmap_item: the reverse mapping item
 *
 * This function checks if there is a page inside the stable tree
 * with identical content to the page that we are scanning right now.
 *
 * This function returns rmap_item pointer to the identical item if found,
 * NULL otherwise.  On success, *page2 holds a reference on the found page,
 * which the caller must put_page().
 */
static struct rmap_item *stable_tree_search(struct page *page,
					    struct page **page2,
					    struct rmap_item *rmap_item)
{
	struct rb_node *node = root_stable_tree.rb_node;

	while (node) {
		struct rmap_item *tree_rmap_item, *next_rmap_item;
		int ret;

		tree_rmap_item = rb_entry(node, struct rmap_item, node);
		/*
		 * Walk the item chain at this node until we can pin its ksm
		 * page; items whose page has gone are pruned as we go.
		 */
		while (tree_rmap_item) {
			BUG_ON(!in_stable_tree(tree_rmap_item));
			cond_resched();
			page2[0] = get_ksm_page(tree_rmap_item);
			if (page2[0])
				break;
			next_rmap_item = tree_rmap_item->next;
			remove_rmap_item_from_tree(tree_rmap_item);
			tree_rmap_item = next_rmap_item;
		}
		if (!tree_rmap_item)
			return NULL;

		ret = memcmp_pages(page, page2[0]);

		if (ret < 0) {
			put_page(page2[0]);
			node = node->rb_left;
		} else if (ret > 0) {
			put_page(page2[0]);
			node = node->rb_right;
		} else {
			/* Identical content: keep the reference for caller. */
			return tree_rmap_item;
		}
	}

	return NULL;
}

/*
 * stable_tree_insert - insert rmap_item pointing to new ksm page
 * into the stable tree.
 *
 * @page: the page that we are searching identical page to inside the stable
 *	  tree.
 * @rmap_item: pointer to the reverse mapping item.
 *
 * This function returns rmap_item if success, NULL otherwise.
 */
static struct rmap_item *stable_tree_insert(struct page *page,
					    struct rmap_item *rmap_item)
{
	struct rb_node **new = &root_stable_tree.rb_node;
	struct rb_node *parent = NULL;

	while (*new) {
		struct rmap_item *tree_rmap_item, *next_rmap_item;
		struct page *tree_page;
		int ret;

		tree_rmap_item = rb_entry(*new, struct rmap_item, node);
		/* Same stale-item pruning walk as stable_tree_search(). */
		while (tree_rmap_item) {
			BUG_ON(!in_stable_tree(tree_rmap_item));
			cond_resched();
			tree_page = get_ksm_page(tree_rmap_item);
			if (tree_page)
				break;
			next_rmap_item = tree_rmap_item->next;
			remove_rmap_item_from_tree(tree_rmap_item);
			tree_rmap_item = next_rmap_item;
		}
		if (!tree_rmap_item)
			return NULL;

		ret = memcmp_pages(page, tree_page);
		put_page(tree_page);

		parent = *new;
		if (ret < 0)
			new = &parent->rb_left;
		else if (ret > 0)
			new = &parent->rb_right;
		else {
			/*
			 * It is not a bug that stable_tree_search() didn't
			 * find this node: because at that time our page was
			 * not yet write-protected, so may have changed since.
			 */
			return NULL;
		}
	}

	rmap_item->address |= NODE_FLAG | STABLE_FLAG;
	rmap_item->next = NULL;
	rb_link_node(&rmap_item->node, parent, new);
	rb_insert_color(&rmap_item->node, &root_stable_tree);

	ksm_pages_shared++;
	return rmap_item;
}

/*
 * unstable_tree_search_insert - search and insert items into the unstable tree.
 *
 * @page: the page that we are going to search for identical page or to insert
 *	  into the unstable tree
 * @page2: pointer into identical page that was found inside the unstable tree
 * @rmap_item: the reverse mapping item of page
 *
 * This function searches for a page in the unstable tree identical to the
 * page currently being scanned; and if no identical page is found in the
 * tree, we insert rmap_item as a new object into the unstable tree.
 *
 * This function returns pointer to rmap_item found to be identical
 * to the currently scanned page, NULL otherwise (in which case *page2
 * holds a reference on the found page which the caller must put).
 *
 * This function does both searching and inserting, because they share
 * the same walking algorithm in an rbtree.
 */
static struct rmap_item *unstable_tree_search_insert(struct page *page,
						     struct page **page2,
						     struct rmap_item *rmap_item)
{
	struct rb_node **new = &root_unstable_tree.rb_node;
	struct rb_node *parent = NULL;

	while (*new) {
		struct rmap_item *tree_rmap_item;
		int ret;

		cond_resched();
		tree_rmap_item = rb_entry(*new, struct rmap_item, node);
		page2[0] = get_mergeable_page(tree_rmap_item);
		if (!page2[0])
			return NULL;

		/*
		 * Don't substitute an unswappable ksm page
		 * just for one good swappable forked page.
		 */
		if (page == page2[0]) {
			put_page(page2[0]);
			return NULL;
		}

		ret = memcmp_pages(page, page2[0]);

		parent = *new;
		if (ret < 0) {
			put_page(page2[0]);
			new = &parent->rb_left;
		} else if (ret > 0) {
			put_page(page2[0]);
			new = &parent->rb_right;
		} else {
			/* Identical page found: reference stays with caller. */
			return tree_rmap_item;
		}
	}

	/*
	 * No match: insert this item, tagging it with the current scan
	 * sequence number so a stale entry from an earlier (flushed)
	 * generation of the tree can be recognized.
	 */
	rmap_item->address |= NODE_FLAG;
	rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
	rb_link_node(&rmap_item->node, parent, new);
	rb_insert_color(&rmap_item->node, &root_unstable_tree);

	ksm_pages_unshared++;
	return NULL;
}

/*
 * stable_tree_append - add another rmap_item to the linked list of
 * rmap_items hanging off a given node of the stable tree, all sharing
 * the same ksm page.
 */
static void stable_tree_append(struct rmap_item *rmap_item,
			       struct rmap_item *tree_rmap_item)
{
	/* Insert rmap_item into the doubly-linked list after tree_rmap_item. */
	rmap_item->next = tree_rmap_item->next;
	rmap_item->prev = tree_rmap_item;

	if (tree_rmap_item->next)
		tree_rmap_item->next->prev = rmap_item;

	tree_rmap_item->next = rmap_item;
	rmap_item->address |= STABLE_FLAG;

	ksm_pages_sharing++;
}

/*
 * cmp_and_merge_page - first see if page can be merged into the stable tree;
 * if not, compare checksum to previous and if it's the same, see if page can
 * be inserted into the unstable tree, or merged with a page already there and
 * both transferred to the stable tree.
 *
 * @page: the page that we are searching identical page to.
 * @rmap_item: the reverse mapping into the virtual address of this page
 */
static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
{
	struct page *page2[1];
	struct rmap_item *tree_rmap_item;
	unsigned int checksum;
	int err;

	if (in_stable_tree(rmap_item))
		remove_rmap_item_from_tree(rmap_item);

	/* We first start with searching the page inside the stable tree */
	tree_rmap_item = stable_tree_search(page, page2, rmap_item);
	if (tree_rmap_item) {
		if (page == page2[0])			/* forked */
			err = 0;
		else
			err = try_to_merge_with_ksm_page(rmap_item->mm,
							 rmap_item->address,
							 page, page2[0]);
		put_page(page2[0]);

		if (!err) {
			/*
			 * The page was successfully merged:
			 * add its rmap_item to the stable tree.
			 */
			stable_tree_append(rmap_item, tree_rmap_item);
		}
		return;
	}

	/*
	 * A ksm page might have got here by fork, but its other
	 * references have already been removed from the stable tree.
	 * Or it might be left over from a break_ksm which failed
	 * when the mem_cgroup had reached its limit: try again now.
	 */
	if (PageKsm(page))
		break_cow(rmap_item->mm, rmap_item->address);

	/*
	 * In case the hash value of the page was changed from the last time we
	 * calculated it, this page is changing frequently, therefore we don't
	 * want to insert it into the unstable tree, and we don't want to
	 * waste our time searching if there is something identical to it there.
	 */
	checksum = calc_checksum(page);
	if (rmap_item->oldchecksum != checksum) {
		rmap_item->oldchecksum = checksum;
		return;
	}

	tree_rmap_item = unstable_tree_search_insert(page, page2, rmap_item);
	if (tree_rmap_item) {
		err = try_to_merge_two_pages(rmap_item->mm,
					     rmap_item->address, page,
					     tree_rmap_item->mm,
					     tree_rmap_item->address, page2[0]);
		/*
		 * As soon as we merge this page, we want to remove the
		 * rmap_item of the page we have merged with from the unstable
		 * tree, and insert it instead as new node in the stable tree.
		 */
		if (!err) {
			rb_erase(&tree_rmap_item->node, &root_unstable_tree);
			tree_rmap_item->address &= ~NODE_FLAG;
			ksm_pages_unshared--;

			/*
			 * If we fail to insert the page into the stable tree,
			 * we will have 2 virtual addresses that are pointing
			 * to a ksm page left outside the stable tree,
			 * in which case we need to break_cow on both.
			 */
			if (stable_tree_insert(page2[0], tree_rmap_item))
				stable_tree_append(rmap_item, tree_rmap_item);
			else {
				break_cow(tree_rmap_item->mm,
					  tree_rmap_item->address);
				break_cow(rmap_item->mm, rmap_item->address);
			}
		}

		put_page(page2[0]);
	}
}

/*
 * get_next_rmap_item - return the rmap_item tracking @addr in this mm_slot's
 * sorted list, reusing an existing item when the address matches, freeing
 * items for addresses we have passed, or allocating a fresh one.
 */
static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
					    struct list_head *cur,
					    unsigned long addr)
{
	struct rmap_item *rmap_item;

	while (cur != &mm_slot->rmap_list) {
		rmap_item = list_entry(cur, struct rmap_item, link);
		if ((rmap_item->address & PAGE_MASK) == addr) {
			if (!in_stable_tree(rmap_item))
				remove_rmap_item_from_tree(rmap_item);
			return rmap_item;
		}
		if (rmap_item->address > addr)
			break;
		cur = cur->next;
		/* Stale item for an address we skipped: drop it entirely. */
		remove_rmap_item_from_tree(rmap_item);
		list_del(&rmap_item->link);
		free_rmap_item(rmap_item);
	}

	rmap_item = alloc_rmap_item();
	if (rmap_item) {
		/* It has already been zeroed */
		rmap_item->mm = mm_slot->mm;
		rmap_item->address = addr;
		list_add_tail(&rmap_item->link, cur);
	}
	return rmap_item;
}

/*
 * scan_get_next_rmap_item - advance the global ksm_scan cursor to the next
 * anonymous page in a VM_MERGEABLE vma, across all registered mms.
 * On success returns its rmap_item and sets *page (with a reference the
 * caller must put); returns NULL when a full pass over the list completes.
 */
static struct rmap_item *scan_get_next_rmap_item(struct page **page)
{
	struct mm_struct *mm;
	struct mm_slot *slot;
	struct vm_area_struct *vma;
	struct rmap_item *rmap_item;

	if (list_empty(&ksm_mm_head.mm_list))
		return NULL;

	slot = ksm_scan.mm_slot;
	if (slot == &ksm_mm_head) {
		/* Starting a new full pass: the unstable tree is rebuilt. */
		root_unstable_tree = RB_ROOT;

		spin_lock(&ksm_mmlist_lock);
		slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
		ksm_scan.mm_slot = slot;
		spin_unlock(&ksm_mmlist_lock);
next_mm:
		ksm_scan.address = 0;
		ksm_scan.rmap_item = list_entry(&slot->rmap_list,
						struct rmap_item, link);
	}

	mm = slot->mm;
	down_read(&mm->mmap_sem);
	if (ksm_test_exit(mm))
		vma = NULL;
	else
		vma = find_vma(mm, ksm_scan.address);

	for (; vma; vma = vma->vm_next) {
		if (!(vma->vm_flags & VM_MERGEABLE))
			continue;
		if (ksm_scan.address < vma->vm_start)
			ksm_scan.address = vma->vm_start;
		if (!vma->anon_vma)
			/* No anonymous pages here: skip the whole vma. */
			ksm_scan.address = vma->vm_end;

		while (ksm_scan.address < vma->vm_end) {
			if (ksm_test_exit(mm))
				break;
			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
			if (*page && PageAnon(*page)) {
				flush_anon_page(vma, *page, ksm_scan.address);
				flush_dcache_page(*page);
				rmap_item = get_next_rmap_item(slot,
					ksm_scan.rmap_item->link.next,
					ksm_scan.address);
				if (rmap_item) {
					ksm_scan.rmap_item = rmap_item;
					ksm_scan.address += PAGE_SIZE;
				} else
					put_page(*page);
				up_read(&mm->mmap_sem);
				return rmap_item;
			}
			if (*page)
				put_page(*page);
			ksm_scan.address += PAGE_SIZE;
			cond_resched();
		}
	}

	if (ksm_test_exit(mm)) {
		ksm_scan.address = 0;
		ksm_scan.rmap_item = list_entry(&slot->rmap_list,
						struct rmap_item, link);
	}
	/*
	 * Nuke all the rmap_items that are above this current rmap:
	 * because there were no VM_MERGEABLE vmas with such addresses.
	 */
	remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next);

	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = list_entry(slot->mm_list.next,
				      struct mm_slot, mm_list);
	if (ksm_scan.address == 0) {
		/*
		 * We've completed a full scan of all vmas, holding mmap_sem
		 * throughout, and found no VM_MERGEABLE: so do the same as
		 * __ksm_exit does to remove this mm from all our lists now.
		 * This applies either when cleaning up after __ksm_exit
		 * (but beware: we can reach here even before __ksm_exit),
		 * or when all VM_MERGEABLE areas have been unmapped (and
		 * mmap_sem then protects against race with MADV_MERGEABLE).
		 */
		hlist_del(&slot->link);
		list_del(&slot->mm_list);
		spin_unlock(&ksm_mmlist_lock);

		free_mm_slot(slot);
		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
		up_read(&mm->mmap_sem);
		mmdrop(mm);
	} else {
		spin_unlock(&ksm_mmlist_lock);
		up_read(&mm->mmap_sem);
	}

	/* Repeat until we've completed scanning the whole list */
	slot = ksm_scan.mm_slot;
	if (slot != &ksm_mm_head)
		goto next_mm;

	/* Full pass complete: bump the generation counter. */
	ksm_scan.seqnr++;
	return NULL;
}

/**
 * ksm_do_scan  - the ksm scanner main worker function.
 * @scan_npages: number of pages we want to scan before we return.
 */
static void ksm_do_scan(unsigned int scan_npages)
{
	struct rmap_item *rmap_item;
	struct page *page;

	while (scan_npages--) {
		cond_resched();
		rmap_item = scan_get_next_rmap_item(&page);
		if (!rmap_item)
			return;
		if (!PageKsm(page) || !in_stable_tree(rmap_item))
			cmp_and_merge_page(page, rmap_item);
		else if (page_mapcount(page) == 1) {
			/*
			 * Replace now-unshared ksm page by ordinary page.
132926465d3eSHugh Dickins */ 133026465d3eSHugh Dickins break_cow(rmap_item->mm, rmap_item->address); 133126465d3eSHugh Dickins remove_rmap_item_from_tree(rmap_item); 133226465d3eSHugh Dickins rmap_item->oldchecksum = calc_checksum(page); 133326465d3eSHugh Dickins } 133431dbd01fSIzik Eidus put_page(page); 133531dbd01fSIzik Eidus } 133631dbd01fSIzik Eidus } 133731dbd01fSIzik Eidus 13386e158384SHugh Dickins static int ksmd_should_run(void) 13396e158384SHugh Dickins { 13406e158384SHugh Dickins return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list); 13416e158384SHugh Dickins } 13426e158384SHugh Dickins 134331dbd01fSIzik Eidus static int ksm_scan_thread(void *nothing) 134431dbd01fSIzik Eidus { 1345339aa624SIzik Eidus set_user_nice(current, 5); 134631dbd01fSIzik Eidus 134731dbd01fSIzik Eidus while (!kthread_should_stop()) { 134831dbd01fSIzik Eidus mutex_lock(&ksm_thread_mutex); 13496e158384SHugh Dickins if (ksmd_should_run()) 135031dbd01fSIzik Eidus ksm_do_scan(ksm_thread_pages_to_scan); 135131dbd01fSIzik Eidus mutex_unlock(&ksm_thread_mutex); 13526e158384SHugh Dickins 13536e158384SHugh Dickins if (ksmd_should_run()) { 135431dbd01fSIzik Eidus schedule_timeout_interruptible( 135531dbd01fSIzik Eidus msecs_to_jiffies(ksm_thread_sleep_millisecs)); 135631dbd01fSIzik Eidus } else { 135731dbd01fSIzik Eidus wait_event_interruptible(ksm_thread_wait, 13586e158384SHugh Dickins ksmd_should_run() || kthread_should_stop()); 135931dbd01fSIzik Eidus } 136031dbd01fSIzik Eidus } 136131dbd01fSIzik Eidus return 0; 136231dbd01fSIzik Eidus } 136331dbd01fSIzik Eidus 1364f8af4da3SHugh Dickins int ksm_madvise(struct vm_area_struct *vma, unsigned long start, 1365f8af4da3SHugh Dickins unsigned long end, int advice, unsigned long *vm_flags) 1366f8af4da3SHugh Dickins { 1367f8af4da3SHugh Dickins struct mm_struct *mm = vma->vm_mm; 1368d952b791SHugh Dickins int err; 1369f8af4da3SHugh Dickins 1370f8af4da3SHugh Dickins switch (advice) { 1371f8af4da3SHugh Dickins case MADV_MERGEABLE: 
1372f8af4da3SHugh Dickins /* 1373f8af4da3SHugh Dickins * Be somewhat over-protective for now! 1374f8af4da3SHugh Dickins */ 1375f8af4da3SHugh Dickins if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | 1376f8af4da3SHugh Dickins VM_PFNMAP | VM_IO | VM_DONTEXPAND | 1377f8af4da3SHugh Dickins VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | 1378f8af4da3SHugh Dickins VM_MIXEDMAP | VM_SAO)) 1379f8af4da3SHugh Dickins return 0; /* just ignore the advice */ 1380f8af4da3SHugh Dickins 1381d952b791SHugh Dickins if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { 1382d952b791SHugh Dickins err = __ksm_enter(mm); 1383d952b791SHugh Dickins if (err) 1384d952b791SHugh Dickins return err; 1385d952b791SHugh Dickins } 1386f8af4da3SHugh Dickins 1387f8af4da3SHugh Dickins *vm_flags |= VM_MERGEABLE; 1388f8af4da3SHugh Dickins break; 1389f8af4da3SHugh Dickins 1390f8af4da3SHugh Dickins case MADV_UNMERGEABLE: 1391f8af4da3SHugh Dickins if (!(*vm_flags & VM_MERGEABLE)) 1392f8af4da3SHugh Dickins return 0; /* just ignore the advice */ 1393f8af4da3SHugh Dickins 1394d952b791SHugh Dickins if (vma->anon_vma) { 1395d952b791SHugh Dickins err = unmerge_ksm_pages(vma, start, end); 1396d952b791SHugh Dickins if (err) 1397d952b791SHugh Dickins return err; 1398d952b791SHugh Dickins } 1399f8af4da3SHugh Dickins 1400f8af4da3SHugh Dickins *vm_flags &= ~VM_MERGEABLE; 1401f8af4da3SHugh Dickins break; 1402f8af4da3SHugh Dickins } 1403f8af4da3SHugh Dickins 1404f8af4da3SHugh Dickins return 0; 1405f8af4da3SHugh Dickins } 1406f8af4da3SHugh Dickins 1407f8af4da3SHugh Dickins int __ksm_enter(struct mm_struct *mm) 1408f8af4da3SHugh Dickins { 14096e158384SHugh Dickins struct mm_slot *mm_slot; 14106e158384SHugh Dickins int needs_wakeup; 14116e158384SHugh Dickins 14126e158384SHugh Dickins mm_slot = alloc_mm_slot(); 141331dbd01fSIzik Eidus if (!mm_slot) 141431dbd01fSIzik Eidus return -ENOMEM; 141531dbd01fSIzik Eidus 14166e158384SHugh Dickins /* Check ksm_run too? 
Would need tighter locking */ 14176e158384SHugh Dickins needs_wakeup = list_empty(&ksm_mm_head.mm_list); 14186e158384SHugh Dickins 141931dbd01fSIzik Eidus spin_lock(&ksm_mmlist_lock); 142031dbd01fSIzik Eidus insert_to_mm_slots_hash(mm, mm_slot); 142131dbd01fSIzik Eidus /* 142231dbd01fSIzik Eidus * Insert just behind the scanning cursor, to let the area settle 142331dbd01fSIzik Eidus * down a little; when fork is followed by immediate exec, we don't 142431dbd01fSIzik Eidus * want ksmd to waste time setting up and tearing down an rmap_list. 142531dbd01fSIzik Eidus */ 142631dbd01fSIzik Eidus list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); 142731dbd01fSIzik Eidus spin_unlock(&ksm_mmlist_lock); 142831dbd01fSIzik Eidus 1429f8af4da3SHugh Dickins set_bit(MMF_VM_MERGEABLE, &mm->flags); 14309ba69294SHugh Dickins atomic_inc(&mm->mm_count); 14316e158384SHugh Dickins 14326e158384SHugh Dickins if (needs_wakeup) 14336e158384SHugh Dickins wake_up_interruptible(&ksm_thread_wait); 14346e158384SHugh Dickins 1435f8af4da3SHugh Dickins return 0; 1436f8af4da3SHugh Dickins } 1437f8af4da3SHugh Dickins 14381c2fb7a4SAndrea Arcangeli void __ksm_exit(struct mm_struct *mm) 1439f8af4da3SHugh Dickins { 1440cd551f97SHugh Dickins struct mm_slot *mm_slot; 14419ba69294SHugh Dickins int easy_to_free = 0; 1442cd551f97SHugh Dickins 144331dbd01fSIzik Eidus /* 14449ba69294SHugh Dickins * This process is exiting: if it's straightforward (as is the 14459ba69294SHugh Dickins * case when ksmd was never running), free mm_slot immediately. 14469ba69294SHugh Dickins * But if it's at the cursor or has rmap_items linked to it, use 14479ba69294SHugh Dickins * mmap_sem to synchronize with any break_cows before pagetables 14489ba69294SHugh Dickins * are freed, and leave the mm_slot on the list for ksmd to free. 14499ba69294SHugh Dickins * Beware: ksm may already have noticed it exiting and freed the slot. 
145031dbd01fSIzik Eidus */ 14519ba69294SHugh Dickins 1452cd551f97SHugh Dickins spin_lock(&ksm_mmlist_lock); 1453cd551f97SHugh Dickins mm_slot = get_mm_slot(mm); 14549ba69294SHugh Dickins if (mm_slot && ksm_scan.mm_slot != mm_slot) { 14559ba69294SHugh Dickins if (list_empty(&mm_slot->rmap_list)) { 1456cd551f97SHugh Dickins hlist_del(&mm_slot->link); 1457cd551f97SHugh Dickins list_del(&mm_slot->mm_list); 14589ba69294SHugh Dickins easy_to_free = 1; 14599ba69294SHugh Dickins } else { 14609ba69294SHugh Dickins list_move(&mm_slot->mm_list, 14619ba69294SHugh Dickins &ksm_scan.mm_slot->mm_list); 14629ba69294SHugh Dickins } 14639ba69294SHugh Dickins } 1464cd551f97SHugh Dickins spin_unlock(&ksm_mmlist_lock); 1465cd551f97SHugh Dickins 14669ba69294SHugh Dickins if (easy_to_free) { 1467cd551f97SHugh Dickins free_mm_slot(mm_slot); 1468cd551f97SHugh Dickins clear_bit(MMF_VM_MERGEABLE, &mm->flags); 14699ba69294SHugh Dickins mmdrop(mm); 14709ba69294SHugh Dickins } else if (mm_slot) { 14719ba69294SHugh Dickins down_write(&mm->mmap_sem); 14729ba69294SHugh Dickins up_write(&mm->mmap_sem); 14739ba69294SHugh Dickins } 1474f8af4da3SHugh Dickins } 147531dbd01fSIzik Eidus 14762ffd8679SHugh Dickins #ifdef CONFIG_SYSFS 14772ffd8679SHugh Dickins /* 14782ffd8679SHugh Dickins * This all compiles without CONFIG_SYSFS, but is a waste of space. 
14792ffd8679SHugh Dickins */ 14802ffd8679SHugh Dickins 148131dbd01fSIzik Eidus #define KSM_ATTR_RO(_name) \ 148231dbd01fSIzik Eidus static struct kobj_attribute _name##_attr = __ATTR_RO(_name) 148331dbd01fSIzik Eidus #define KSM_ATTR(_name) \ 148431dbd01fSIzik Eidus static struct kobj_attribute _name##_attr = \ 148531dbd01fSIzik Eidus __ATTR(_name, 0644, _name##_show, _name##_store) 148631dbd01fSIzik Eidus 148731dbd01fSIzik Eidus static ssize_t sleep_millisecs_show(struct kobject *kobj, 148831dbd01fSIzik Eidus struct kobj_attribute *attr, char *buf) 148931dbd01fSIzik Eidus { 149031dbd01fSIzik Eidus return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs); 149131dbd01fSIzik Eidus } 149231dbd01fSIzik Eidus 149331dbd01fSIzik Eidus static ssize_t sleep_millisecs_store(struct kobject *kobj, 149431dbd01fSIzik Eidus struct kobj_attribute *attr, 149531dbd01fSIzik Eidus const char *buf, size_t count) 149631dbd01fSIzik Eidus { 149731dbd01fSIzik Eidus unsigned long msecs; 149831dbd01fSIzik Eidus int err; 149931dbd01fSIzik Eidus 150031dbd01fSIzik Eidus err = strict_strtoul(buf, 10, &msecs); 150131dbd01fSIzik Eidus if (err || msecs > UINT_MAX) 150231dbd01fSIzik Eidus return -EINVAL; 150331dbd01fSIzik Eidus 150431dbd01fSIzik Eidus ksm_thread_sleep_millisecs = msecs; 150531dbd01fSIzik Eidus 150631dbd01fSIzik Eidus return count; 150731dbd01fSIzik Eidus } 150831dbd01fSIzik Eidus KSM_ATTR(sleep_millisecs); 150931dbd01fSIzik Eidus 151031dbd01fSIzik Eidus static ssize_t pages_to_scan_show(struct kobject *kobj, 151131dbd01fSIzik Eidus struct kobj_attribute *attr, char *buf) 151231dbd01fSIzik Eidus { 151331dbd01fSIzik Eidus return sprintf(buf, "%u\n", ksm_thread_pages_to_scan); 151431dbd01fSIzik Eidus } 151531dbd01fSIzik Eidus 151631dbd01fSIzik Eidus static ssize_t pages_to_scan_store(struct kobject *kobj, 151731dbd01fSIzik Eidus struct kobj_attribute *attr, 151831dbd01fSIzik Eidus const char *buf, size_t count) 151931dbd01fSIzik Eidus { 152031dbd01fSIzik Eidus int err; 152131dbd01fSIzik 
Eidus unsigned long nr_pages; 152231dbd01fSIzik Eidus 152331dbd01fSIzik Eidus err = strict_strtoul(buf, 10, &nr_pages); 152431dbd01fSIzik Eidus if (err || nr_pages > UINT_MAX) 152531dbd01fSIzik Eidus return -EINVAL; 152631dbd01fSIzik Eidus 152731dbd01fSIzik Eidus ksm_thread_pages_to_scan = nr_pages; 152831dbd01fSIzik Eidus 152931dbd01fSIzik Eidus return count; 153031dbd01fSIzik Eidus } 153131dbd01fSIzik Eidus KSM_ATTR(pages_to_scan); 153231dbd01fSIzik Eidus 153331dbd01fSIzik Eidus static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, 153431dbd01fSIzik Eidus char *buf) 153531dbd01fSIzik Eidus { 153631dbd01fSIzik Eidus return sprintf(buf, "%u\n", ksm_run); 153731dbd01fSIzik Eidus } 153831dbd01fSIzik Eidus 153931dbd01fSIzik Eidus static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, 154031dbd01fSIzik Eidus const char *buf, size_t count) 154131dbd01fSIzik Eidus { 154231dbd01fSIzik Eidus int err; 154331dbd01fSIzik Eidus unsigned long flags; 154431dbd01fSIzik Eidus 154531dbd01fSIzik Eidus err = strict_strtoul(buf, 10, &flags); 154631dbd01fSIzik Eidus if (err || flags > UINT_MAX) 154731dbd01fSIzik Eidus return -EINVAL; 154831dbd01fSIzik Eidus if (flags > KSM_RUN_UNMERGE) 154931dbd01fSIzik Eidus return -EINVAL; 155031dbd01fSIzik Eidus 155131dbd01fSIzik Eidus /* 155231dbd01fSIzik Eidus * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. 155331dbd01fSIzik Eidus * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, 1554b4028260SHugh Dickins * breaking COW to free the unswappable pages_shared (but leaves 155531dbd01fSIzik Eidus * mm_slots on the list for when ksmd may be set running again). 
155631dbd01fSIzik Eidus */ 155731dbd01fSIzik Eidus 155831dbd01fSIzik Eidus mutex_lock(&ksm_thread_mutex); 155931dbd01fSIzik Eidus if (ksm_run != flags) { 156031dbd01fSIzik Eidus ksm_run = flags; 1561d952b791SHugh Dickins if (flags & KSM_RUN_UNMERGE) { 156235451beeSHugh Dickins current->flags |= PF_OOM_ORIGIN; 1563d952b791SHugh Dickins err = unmerge_and_remove_all_rmap_items(); 156435451beeSHugh Dickins current->flags &= ~PF_OOM_ORIGIN; 1565d952b791SHugh Dickins if (err) { 1566d952b791SHugh Dickins ksm_run = KSM_RUN_STOP; 1567d952b791SHugh Dickins count = err; 1568d952b791SHugh Dickins } 1569d952b791SHugh Dickins } 157031dbd01fSIzik Eidus } 157131dbd01fSIzik Eidus mutex_unlock(&ksm_thread_mutex); 157231dbd01fSIzik Eidus 157331dbd01fSIzik Eidus if (flags & KSM_RUN_MERGE) 157431dbd01fSIzik Eidus wake_up_interruptible(&ksm_thread_wait); 157531dbd01fSIzik Eidus 157631dbd01fSIzik Eidus return count; 157731dbd01fSIzik Eidus } 157831dbd01fSIzik Eidus KSM_ATTR(run); 157931dbd01fSIzik Eidus 158031dbd01fSIzik Eidus static ssize_t max_kernel_pages_store(struct kobject *kobj, 158131dbd01fSIzik Eidus struct kobj_attribute *attr, 158231dbd01fSIzik Eidus const char *buf, size_t count) 158331dbd01fSIzik Eidus { 158431dbd01fSIzik Eidus int err; 158531dbd01fSIzik Eidus unsigned long nr_pages; 158631dbd01fSIzik Eidus 158731dbd01fSIzik Eidus err = strict_strtoul(buf, 10, &nr_pages); 158831dbd01fSIzik Eidus if (err) 158931dbd01fSIzik Eidus return -EINVAL; 159031dbd01fSIzik Eidus 159131dbd01fSIzik Eidus ksm_max_kernel_pages = nr_pages; 159231dbd01fSIzik Eidus 159331dbd01fSIzik Eidus return count; 159431dbd01fSIzik Eidus } 159531dbd01fSIzik Eidus 159631dbd01fSIzik Eidus static ssize_t max_kernel_pages_show(struct kobject *kobj, 159731dbd01fSIzik Eidus struct kobj_attribute *attr, char *buf) 159831dbd01fSIzik Eidus { 159931dbd01fSIzik Eidus return sprintf(buf, "%lu\n", ksm_max_kernel_pages); 160031dbd01fSIzik Eidus } 160131dbd01fSIzik Eidus KSM_ATTR(max_kernel_pages); 160231dbd01fSIzik 
Eidus 1603b4028260SHugh Dickins static ssize_t pages_shared_show(struct kobject *kobj, 1604b4028260SHugh Dickins struct kobj_attribute *attr, char *buf) 1605b4028260SHugh Dickins { 1606b4028260SHugh Dickins return sprintf(buf, "%lu\n", ksm_pages_shared); 1607b4028260SHugh Dickins } 1608b4028260SHugh Dickins KSM_ATTR_RO(pages_shared); 1609b4028260SHugh Dickins 1610b4028260SHugh Dickins static ssize_t pages_sharing_show(struct kobject *kobj, 1611b4028260SHugh Dickins struct kobj_attribute *attr, char *buf) 1612b4028260SHugh Dickins { 1613e178dfdeSHugh Dickins return sprintf(buf, "%lu\n", ksm_pages_sharing); 1614b4028260SHugh Dickins } 1615b4028260SHugh Dickins KSM_ATTR_RO(pages_sharing); 1616b4028260SHugh Dickins 1617473b0ce4SHugh Dickins static ssize_t pages_unshared_show(struct kobject *kobj, 1618473b0ce4SHugh Dickins struct kobj_attribute *attr, char *buf) 1619473b0ce4SHugh Dickins { 1620473b0ce4SHugh Dickins return sprintf(buf, "%lu\n", ksm_pages_unshared); 1621473b0ce4SHugh Dickins } 1622473b0ce4SHugh Dickins KSM_ATTR_RO(pages_unshared); 1623473b0ce4SHugh Dickins 1624473b0ce4SHugh Dickins static ssize_t pages_volatile_show(struct kobject *kobj, 1625473b0ce4SHugh Dickins struct kobj_attribute *attr, char *buf) 1626473b0ce4SHugh Dickins { 1627473b0ce4SHugh Dickins long ksm_pages_volatile; 1628473b0ce4SHugh Dickins 1629473b0ce4SHugh Dickins ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared 1630473b0ce4SHugh Dickins - ksm_pages_sharing - ksm_pages_unshared; 1631473b0ce4SHugh Dickins /* 1632473b0ce4SHugh Dickins * It was not worth any locking to calculate that statistic, 1633473b0ce4SHugh Dickins * but it might therefore sometimes be negative: conceal that. 
1634473b0ce4SHugh Dickins */ 1635473b0ce4SHugh Dickins if (ksm_pages_volatile < 0) 1636473b0ce4SHugh Dickins ksm_pages_volatile = 0; 1637473b0ce4SHugh Dickins return sprintf(buf, "%ld\n", ksm_pages_volatile); 1638473b0ce4SHugh Dickins } 1639473b0ce4SHugh Dickins KSM_ATTR_RO(pages_volatile); 1640473b0ce4SHugh Dickins 1641473b0ce4SHugh Dickins static ssize_t full_scans_show(struct kobject *kobj, 1642473b0ce4SHugh Dickins struct kobj_attribute *attr, char *buf) 1643473b0ce4SHugh Dickins { 1644473b0ce4SHugh Dickins return sprintf(buf, "%lu\n", ksm_scan.seqnr); 1645473b0ce4SHugh Dickins } 1646473b0ce4SHugh Dickins KSM_ATTR_RO(full_scans); 1647473b0ce4SHugh Dickins 164831dbd01fSIzik Eidus static struct attribute *ksm_attrs[] = { 164931dbd01fSIzik Eidus &sleep_millisecs_attr.attr, 165031dbd01fSIzik Eidus &pages_to_scan_attr.attr, 165131dbd01fSIzik Eidus &run_attr.attr, 165231dbd01fSIzik Eidus &max_kernel_pages_attr.attr, 1653b4028260SHugh Dickins &pages_shared_attr.attr, 1654b4028260SHugh Dickins &pages_sharing_attr.attr, 1655473b0ce4SHugh Dickins &pages_unshared_attr.attr, 1656473b0ce4SHugh Dickins &pages_volatile_attr.attr, 1657473b0ce4SHugh Dickins &full_scans_attr.attr, 165831dbd01fSIzik Eidus NULL, 165931dbd01fSIzik Eidus }; 166031dbd01fSIzik Eidus 166131dbd01fSIzik Eidus static struct attribute_group ksm_attr_group = { 166231dbd01fSIzik Eidus .attrs = ksm_attrs, 166331dbd01fSIzik Eidus .name = "ksm", 166431dbd01fSIzik Eidus }; 16652ffd8679SHugh Dickins #endif /* CONFIG_SYSFS */ 166631dbd01fSIzik Eidus 166731dbd01fSIzik Eidus static int __init ksm_init(void) 166831dbd01fSIzik Eidus { 166931dbd01fSIzik Eidus struct task_struct *ksm_thread; 167031dbd01fSIzik Eidus int err; 167131dbd01fSIzik Eidus 1672c73602adSHugh Dickins ksm_max_kernel_pages = totalram_pages / 4; 16732c6854fdSIzik Eidus 167431dbd01fSIzik Eidus err = ksm_slab_init(); 167531dbd01fSIzik Eidus if (err) 167631dbd01fSIzik Eidus goto out; 167731dbd01fSIzik Eidus 167831dbd01fSIzik Eidus err = 
mm_slots_hash_init(); 167931dbd01fSIzik Eidus if (err) 168031dbd01fSIzik Eidus goto out_free1; 168131dbd01fSIzik Eidus 168231dbd01fSIzik Eidus ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); 168331dbd01fSIzik Eidus if (IS_ERR(ksm_thread)) { 168431dbd01fSIzik Eidus printk(KERN_ERR "ksm: creating kthread failed\n"); 168531dbd01fSIzik Eidus err = PTR_ERR(ksm_thread); 168631dbd01fSIzik Eidus goto out_free2; 168731dbd01fSIzik Eidus } 168831dbd01fSIzik Eidus 16892ffd8679SHugh Dickins #ifdef CONFIG_SYSFS 169031dbd01fSIzik Eidus err = sysfs_create_group(mm_kobj, &ksm_attr_group); 169131dbd01fSIzik Eidus if (err) { 169231dbd01fSIzik Eidus printk(KERN_ERR "ksm: register sysfs failed\n"); 16932ffd8679SHugh Dickins kthread_stop(ksm_thread); 16942ffd8679SHugh Dickins goto out_free2; 169531dbd01fSIzik Eidus } 1696c73602adSHugh Dickins #else 1697c73602adSHugh Dickins ksm_run = KSM_RUN_MERGE; /* no way for user to start it */ 1698c73602adSHugh Dickins 16992ffd8679SHugh Dickins #endif /* CONFIG_SYSFS */ 170031dbd01fSIzik Eidus 170131dbd01fSIzik Eidus return 0; 170231dbd01fSIzik Eidus 170331dbd01fSIzik Eidus out_free2: 170431dbd01fSIzik Eidus mm_slots_hash_free(); 170531dbd01fSIzik Eidus out_free1: 170631dbd01fSIzik Eidus ksm_slab_free(); 170731dbd01fSIzik Eidus out: 170831dbd01fSIzik Eidus return err; 170931dbd01fSIzik Eidus } 171031dbd01fSIzik Eidus module_init(ksm_init) 1711