1 /* 2 * Memory merging support. 3 * 4 * This code enables dynamic sharing of identical pages found in different 5 * memory areas, even if they are not shared by fork() 6 * 7 * Copyright (C) 2008-2009 Red Hat, Inc. 8 * Authors: 9 * Izik Eidus 10 * Andrea Arcangeli 11 * Chris Wright 12 * Hugh Dickins 13 * 14 * This work is licensed under the terms of the GNU GPL, version 2. 15 */ 16 17 #include <linux/errno.h> 18 #include <linux/mm.h> 19 #include <linux/fs.h> 20 #include <linux/mman.h> 21 #include <linux/sched.h> 22 #include <linux/rwsem.h> 23 #include <linux/pagemap.h> 24 #include <linux/rmap.h> 25 #include <linux/spinlock.h> 26 #include <linux/jhash.h> 27 #include <linux/delay.h> 28 #include <linux/kthread.h> 29 #include <linux/wait.h> 30 #include <linux/slab.h> 31 #include <linux/rbtree.h> 32 #include <linux/memory.h> 33 #include <linux/mmu_notifier.h> 34 #include <linux/swap.h> 35 #include <linux/ksm.h> 36 37 #include <asm/tlbflush.h> 38 #include "internal.h" 39 40 /* 41 * A few notes about the KSM scanning process, 42 * to make it easier to understand the data structures below: 43 * 44 * In order to reduce excessive scanning, KSM sorts the memory pages by their 45 * contents into a data structure that holds pointers to the pages' locations. 46 * 47 * Since the contents of the pages may change at any moment, KSM cannot just 48 * insert the pages into a normal sorted tree and expect it to find anything. 49 * Therefore KSM uses two data structures - the stable and the unstable tree. 50 * 51 * The stable tree holds pointers to all the merged pages (ksm pages), sorted 52 * by their contents. Because each such page is write-protected, searching on 53 * this tree is fully assured to be working (except when pages are unmapped), 54 * and therefore this tree is called the stable tree. 
55 * 56 * In addition to the stable tree, KSM uses a second data structure called the 57 * unstable tree: this tree holds pointers to pages which have been found to 58 * be "unchanged for a period of time". The unstable tree sorts these pages 59 * by their contents, but since they are not write-protected, KSM cannot rely 60 * upon the unstable tree to work correctly - the unstable tree is liable to 61 * be corrupted as its contents are modified, and so it is called unstable. 62 * 63 * KSM solves this problem by several techniques: 64 * 65 * 1) The unstable tree is flushed every time KSM completes scanning all 66 * memory areas, and then the tree is rebuilt again from the beginning. 67 * 2) KSM will only insert into the unstable tree, pages whose hash value 68 * has not changed since the previous scan of all memory areas. 69 * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the 70 * colors of the nodes and not on their contents, assuring that even when 71 * the tree gets "corrupted" it won't get out of balance, so scanning time 72 * remains the same (also, searching and inserting nodes in an rbtree uses 73 * the same algorithm, so we have no overhead when we flush and rebuild). 74 * 4) KSM never flushes the stable tree, which means that even if it were to 75 * take 10 attempts to find a page in the unstable tree, once it is found, 76 * it is secured in the stable tree. (When we scan a new page, we first 77 * compare it against the stable tree, and then against the unstable tree.) 
 */

/**
 * struct mm_slot - ksm information per mm that is being scanned
 * @link: link to the mm_slots hash list
 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
 * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
 * @mm: the mm that this information is valid for
 */
struct mm_slot {
	struct hlist_node link;
	struct list_head mm_list;
	struct rmap_item *rmap_list;
	struct mm_struct *mm;
};

/**
 * struct ksm_scan - cursor for scanning
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 * @rmap_list: link to the next rmap to be scanned in the rmap_list
 * @seqnr: count of completed full scans (needed when removing unstable node)
 *
 * There is only the one ksm_scan instance of this cursor structure.
 */
struct ksm_scan {
	struct mm_slot *mm_slot;
	unsigned long address;
	struct rmap_item **rmap_list;
	unsigned long seqnr;
};

/**
 * struct stable_node - node of the stable rbtree
 * @node: rb node of this ksm page in the stable tree
 * @hlist: hlist head of rmap_items using this ksm page
 * @kpfn: page frame number of this ksm page
 *
 * Only the pfn is recorded here, not a struct page pointer: users of the
 * node turn @kpfn back into a page with pfn_to_page().
 */
struct stable_node {
	struct rb_node node;
	struct hlist_head hlist;
	unsigned long kpfn;
};

/**
 * struct rmap_item - reverse mapping item for virtual addresses
 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
 * @mm: the memory structure this rmap_item is pointing into
 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
 * @oldchecksum: previous checksum of the page at that virtual address
 * @node: rb node of this rmap_item in the unstable tree
 * @head: pointer to stable_node heading this list in the stable tree
 * @hlist: link into hlist of rmap_items hanging off that stable_node
 *
 * @node shares storage with @head/@hlist (anonymous union), so an item is
 * linked into at most one of the two trees at a time; the flag bits kept in
 * @address say which of the union members is currently meaningful.
 */
struct rmap_item {
	struct rmap_item *rmap_list;
	struct anon_vma *anon_vma;	/* when stable */
	struct mm_struct *mm;
	unsigned long address;		/* + low bits used for flags below */
	unsigned int oldchecksum;	/* when unstable */
	union {
		struct rb_node node;	/* when node of unstable tree */
		struct {		/* when listed from stable tree */
			struct stable_node *head;
			struct hlist_node hlist;
		};
	};
};

/*
 * Bits stored in the low (sub-page) part of rmap_item->address.
 * They are stripped again with "address &= PAGE_MASK" whenever an
 * rmap_item is taken out of its tree.
 */
#define SEQNR_MASK	0x0ff	/* low bits of unstable tree seqnr */
#define UNSTABLE_FLAG	0x100	/* is a node of the unstable tree */
#define STABLE_FLAG	0x200	/* is listed from the stable tree */

/* The stable and unstable tree heads */
static struct rb_root root_stable_tree = RB_ROOT;
static struct rb_root root_unstable_tree = RB_ROOT;

/* Number of buckets in the mm_slots_hash table (mm pointer -> mm_slot) */
#define MM_SLOTS_HASH_HEADS 1024
static struct hlist_head *mm_slots_hash;

/* List head for all mm_slots; it is itself a dummy mm_slot with no mm set */
static struct mm_slot ksm_mm_head = {
	.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
};
/* The single scan cursor, initially parked on the list head */
static struct ksm_scan ksm_scan = {
	.mm_slot = &ksm_mm_head,
};

/* Slab caches backing the three object types allocated by this file */
static struct kmem_cache *rmap_item_cache;
static struct kmem_cache *stable_node_cache;
static struct kmem_cache *mm_slot_cache;

/* The number of nodes in the stable tree */
static unsigned long ksm_pages_shared;

/* The number of page slots additionally sharing those nodes */
static unsigned long ksm_pages_sharing;

/* The number of nodes in the unstable tree */
static unsigned long ksm_pages_unshared;

/* The number of rmap_items in use: to calculate pages_volatile */
static unsigned long ksm_rmap_items;

/* Number of pages ksmd should scan in one batch */
static unsigned int ksm_thread_pages_to_scan = 100;

/* Milliseconds ksmd should sleep between batches */
static unsigned int ksm_thread_sleep_millisecs = 20;

/* Run states; ksm_run (defined just below) starts out as KSM_RUN_STOP */
#define KSM_RUN_STOP	0
#define KSM_RUN_MERGE	1
#define KSM_RUN_UNMERGE	2
static unsigned int ksm_run = KSM_RUN_STOP; 192 193 static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); 194 static DEFINE_MUTEX(ksm_thread_mutex); 195 static DEFINE_SPINLOCK(ksm_mmlist_lock); 196 197 #define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\ 198 sizeof(struct __struct), __alignof__(struct __struct),\ 199 (__flags), NULL) 200 201 static int __init ksm_slab_init(void) 202 { 203 rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0); 204 if (!rmap_item_cache) 205 goto out; 206 207 stable_node_cache = KSM_KMEM_CACHE(stable_node, 0); 208 if (!stable_node_cache) 209 goto out_free1; 210 211 mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0); 212 if (!mm_slot_cache) 213 goto out_free2; 214 215 return 0; 216 217 out_free2: 218 kmem_cache_destroy(stable_node_cache); 219 out_free1: 220 kmem_cache_destroy(rmap_item_cache); 221 out: 222 return -ENOMEM; 223 } 224 225 static void __init ksm_slab_free(void) 226 { 227 kmem_cache_destroy(mm_slot_cache); 228 kmem_cache_destroy(stable_node_cache); 229 kmem_cache_destroy(rmap_item_cache); 230 mm_slot_cache = NULL; 231 } 232 233 static inline struct rmap_item *alloc_rmap_item(void) 234 { 235 struct rmap_item *rmap_item; 236 237 rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL); 238 if (rmap_item) 239 ksm_rmap_items++; 240 return rmap_item; 241 } 242 243 static inline void free_rmap_item(struct rmap_item *rmap_item) 244 { 245 ksm_rmap_items--; 246 rmap_item->mm = NULL; /* debug safety */ 247 kmem_cache_free(rmap_item_cache, rmap_item); 248 } 249 250 static inline struct stable_node *alloc_stable_node(void) 251 { 252 return kmem_cache_alloc(stable_node_cache, GFP_KERNEL); 253 } 254 255 static inline void free_stable_node(struct stable_node *stable_node) 256 { 257 kmem_cache_free(stable_node_cache, stable_node); 258 } 259 260 static inline struct mm_slot *alloc_mm_slot(void) 261 { 262 if (!mm_slot_cache) /* initialization failed */ 263 return NULL; 264 return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); 265 } 266 
267 static inline void free_mm_slot(struct mm_slot *mm_slot) 268 { 269 kmem_cache_free(mm_slot_cache, mm_slot); 270 } 271 272 static int __init mm_slots_hash_init(void) 273 { 274 mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head), 275 GFP_KERNEL); 276 if (!mm_slots_hash) 277 return -ENOMEM; 278 return 0; 279 } 280 281 static void __init mm_slots_hash_free(void) 282 { 283 kfree(mm_slots_hash); 284 } 285 286 static struct mm_slot *get_mm_slot(struct mm_struct *mm) 287 { 288 struct mm_slot *mm_slot; 289 struct hlist_head *bucket; 290 struct hlist_node *node; 291 292 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) 293 % MM_SLOTS_HASH_HEADS]; 294 hlist_for_each_entry(mm_slot, node, bucket, link) { 295 if (mm == mm_slot->mm) 296 return mm_slot; 297 } 298 return NULL; 299 } 300 301 static void insert_to_mm_slots_hash(struct mm_struct *mm, 302 struct mm_slot *mm_slot) 303 { 304 struct hlist_head *bucket; 305 306 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) 307 % MM_SLOTS_HASH_HEADS]; 308 mm_slot->mm = mm; 309 hlist_add_head(&mm_slot->link, bucket); 310 } 311 312 static inline int in_stable_tree(struct rmap_item *rmap_item) 313 { 314 return rmap_item->address & STABLE_FLAG; 315 } 316 317 static void hold_anon_vma(struct rmap_item *rmap_item, 318 struct anon_vma *anon_vma) 319 { 320 rmap_item->anon_vma = anon_vma; 321 atomic_inc(&anon_vma->ksm_refcount); 322 } 323 324 static void drop_anon_vma(struct rmap_item *rmap_item) 325 { 326 struct anon_vma *anon_vma = rmap_item->anon_vma; 327 328 if (atomic_dec_and_lock(&anon_vma->ksm_refcount, &anon_vma->lock)) { 329 int empty = list_empty(&anon_vma->head); 330 spin_unlock(&anon_vma->lock); 331 if (empty) 332 anon_vma_free(anon_vma); 333 } 334 } 335 336 /* 337 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's 338 * page tables after it has passed through ksm_exit() - which, if necessary, 339 * takes mmap_sem briefly to serialize against them. 
ksm_exit() does not set 340 * a special flag: they can just back out as soon as mm_users goes to zero. 341 * ksm_test_exit() is used throughout to make this test for exit: in some 342 * places for correctness, in some places just to avoid unnecessary work. 343 */ 344 static inline bool ksm_test_exit(struct mm_struct *mm) 345 { 346 return atomic_read(&mm->mm_users) == 0; 347 } 348 349 /* 350 * We use break_ksm to break COW on a ksm page: it's a stripped down 351 * 352 * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1) 353 * put_page(page); 354 * 355 * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, 356 * in case the application has unmapped and remapped mm,addr meanwhile. 357 * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP 358 * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. 359 */ 360 static int break_ksm(struct vm_area_struct *vma, unsigned long addr) 361 { 362 struct page *page; 363 int ret = 0; 364 365 do { 366 cond_resched(); 367 page = follow_page(vma, addr, FOLL_GET); 368 if (!page) 369 break; 370 if (PageKsm(page)) 371 ret = handle_mm_fault(vma->vm_mm, vma, addr, 372 FAULT_FLAG_WRITE); 373 else 374 ret = VM_FAULT_WRITE; 375 put_page(page); 376 } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM))); 377 /* 378 * We must loop because handle_mm_fault() may back out if there's 379 * any difficulty e.g. if pte accessed bit gets updated concurrently. 380 * 381 * VM_FAULT_WRITE is what we have been hoping for: it indicates that 382 * COW has been broken, even if the vma does not permit VM_WRITE; 383 * but note that a concurrent fault might break PageKsm for us. 384 * 385 * VM_FAULT_SIGBUS could occur if we race with truncation of the 386 * backing file, which also invalidates anonymous pages: that's 387 * okay, that truncation will have unmapped the PageKsm for us. 
388 * 389 * VM_FAULT_OOM: at the time of writing (late July 2009), setting 390 * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the 391 * current task has TIF_MEMDIE set, and will be OOM killed on return 392 * to user; and ksmd, having no mm, would never be chosen for that. 393 * 394 * But if the mm is in a limited mem_cgroup, then the fault may fail 395 * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and 396 * even ksmd can fail in this way - though it's usually breaking ksm 397 * just to undo a merge it made a moment before, so unlikely to oom. 398 * 399 * That's a pity: we might therefore have more kernel pages allocated 400 * than we're counting as nodes in the stable tree; but ksm_do_scan 401 * will retry to break_cow on each pass, so should recover the page 402 * in due course. The important thing is to not let VM_MERGEABLE 403 * be cleared while any such pages might remain in the area. 404 */ 405 return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; 406 } 407 408 static void break_cow(struct rmap_item *rmap_item) 409 { 410 struct mm_struct *mm = rmap_item->mm; 411 unsigned long addr = rmap_item->address; 412 struct vm_area_struct *vma; 413 414 /* 415 * It is not an accident that whenever we want to break COW 416 * to undo, we also need to drop a reference to the anon_vma. 
417 */ 418 drop_anon_vma(rmap_item); 419 420 down_read(&mm->mmap_sem); 421 if (ksm_test_exit(mm)) 422 goto out; 423 vma = find_vma(mm, addr); 424 if (!vma || vma->vm_start > addr) 425 goto out; 426 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) 427 goto out; 428 break_ksm(vma, addr); 429 out: 430 up_read(&mm->mmap_sem); 431 } 432 433 static struct page *get_mergeable_page(struct rmap_item *rmap_item) 434 { 435 struct mm_struct *mm = rmap_item->mm; 436 unsigned long addr = rmap_item->address; 437 struct vm_area_struct *vma; 438 struct page *page; 439 440 down_read(&mm->mmap_sem); 441 if (ksm_test_exit(mm)) 442 goto out; 443 vma = find_vma(mm, addr); 444 if (!vma || vma->vm_start > addr) 445 goto out; 446 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) 447 goto out; 448 449 page = follow_page(vma, addr, FOLL_GET); 450 if (!page) 451 goto out; 452 if (PageAnon(page)) { 453 flush_anon_page(vma, page, addr); 454 flush_dcache_page(page); 455 } else { 456 put_page(page); 457 out: page = NULL; 458 } 459 up_read(&mm->mmap_sem); 460 return page; 461 } 462 463 static void remove_node_from_stable_tree(struct stable_node *stable_node) 464 { 465 struct rmap_item *rmap_item; 466 struct hlist_node *hlist; 467 468 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { 469 if (rmap_item->hlist.next) 470 ksm_pages_sharing--; 471 else 472 ksm_pages_shared--; 473 drop_anon_vma(rmap_item); 474 rmap_item->address &= PAGE_MASK; 475 cond_resched(); 476 } 477 478 rb_erase(&stable_node->node, &root_stable_tree); 479 free_stable_node(stable_node); 480 } 481 482 /* 483 * get_ksm_page: checks if the page indicated by the stable node 484 * is still its ksm page, despite having held no reference to it. 485 * In which case we can trust the content of the page, and it 486 * returns the gotten page; but if the page has now been zapped, 487 * remove the stale node from the stable tree and return NULL. 
488 * 489 * You would expect the stable_node to hold a reference to the ksm page. 490 * But if it increments the page's count, swapping out has to wait for 491 * ksmd to come around again before it can free the page, which may take 492 * seconds or even minutes: much too unresponsive. So instead we use a 493 * "keyhole reference": access to the ksm page from the stable node peeps 494 * out through its keyhole to see if that page still holds the right key, 495 * pointing back to this stable node. This relies on freeing a PageAnon 496 * page to reset its page->mapping to NULL, and relies on no other use of 497 * a page to put something that might look like our key in page->mapping. 498 * 499 * include/linux/pagemap.h page_cache_get_speculative() is a good reference, 500 * but this is different - made simpler by ksm_thread_mutex being held, but 501 * interesting for assuming that no other use of the struct page could ever 502 * put our expected_mapping into page->mapping (or a field of the union which 503 * coincides with page->mapping). The RCU calls are not for KSM at all, but 504 * to keep the page_count protocol described with page_cache_get_speculative. 505 * 506 * Note: it is possible that get_ksm_page() will return NULL one moment, 507 * then page the next, if the page is in between page_freeze_refs() and 508 * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page 509 * is on its way to being freed; but it is an anomaly to bear in mind. 
 */
static struct page *get_ksm_page(struct stable_node *stable_node)
{
	struct page *page;
	void *expected_mapping;

	page = pfn_to_page(stable_node->kpfn);
	/* the "key": stable_node pointer tagged with the ANON and KSM bits */
	expected_mapping = (void *)stable_node +
				(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
	rcu_read_lock();
	/* cheap first look, before touching the page's refcount */
	if (page->mapping != expected_mapping)
		goto stale;
	/* a count of zero means the page is already on its way out */
	if (!get_page_unless_zero(page))
		goto stale;
	/*
	 * Check the key again now that we hold a reference: the first
	 * check was made without one, so is not conclusive on its own.
	 */
	if (page->mapping != expected_mapping) {
		put_page(page);
		goto stale;
	}
	rcu_read_unlock();
	return page;
stale:
	rcu_read_unlock();
	/* this frees stable_node: the caller must not use it after NULL */
	remove_node_from_stable_tree(stable_node);
	return NULL;
}

/*
 * Removing rmap_item from stable or unstable tree.
 * This function will clean the information from the stable/unstable tree.
 *
 * On return the flag bits have been cleared from rmap_item->address.
 * An rmap_item with neither flag set is left untouched.
 */
static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
{
	if (rmap_item->address & STABLE_FLAG) {
		struct stable_node *stable_node;
		struct page *page;

		stable_node = rmap_item->head;
		page = get_ksm_page(stable_node);
		/*
		 * NULL means the node was stale: get_ksm_page() has already
		 * torn it down via remove_node_from_stable_tree(), which
		 * fixed the counts and cleared our flags for us.
		 */
		if (!page)
			goto out;

		/* unlink from the node's hlist under the ksm page's lock */
		lock_page(page);
		hlist_del(&rmap_item->hlist);
		unlock_page(page);
		put_page(page);

		/* others still listed: we were a sharer; else the last user */
		if (stable_node->hlist.first)
			ksm_pages_sharing--;
		else
			ksm_pages_shared--;

		drop_anon_vma(rmap_item);
		rmap_item->address &= PAGE_MASK;

	} else if (rmap_item->address & UNSTABLE_FLAG) {
		unsigned char age;
		/*
		 * Usually ksmd can and must skip the rb_erase, because
		 * root_unstable_tree was already reset to RB_ROOT.
		 * But be careful when an mm is exiting: do the rb_erase
		 * if this rmap_item was inserted by this scan, rather
		 * than left over from before.
		 */
		/*
		 * The low 8 bits of address hold the seqnr at insertion time
		 * (SEQNR_MASK); truncating the difference to unsigned char
		 * discards the page-aligned part and the flag bits, leaving
		 * the number of scans completed since then.
		 */
		age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
		BUG_ON(age > 1);
		if (!age)
			rb_erase(&rmap_item->node, &root_unstable_tree);

		ksm_pages_unshared--;
		rmap_item->address &= PAGE_MASK;
	}
out:
	cond_resched();		/* we're called from many long loops */
}

/*
 * Unlink and free every rmap_item from *rmap_list to the end of the list,
 * taking each one out of its tree first.  *rmap_list is NULL on return.
 * The mm_slot argument is not used by the body.
 */
static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
				       struct rmap_item **rmap_list)
{
	while (*rmap_list) {
		struct rmap_item *rmap_item = *rmap_list;
		/* advance the list head before the item is freed */
		*rmap_list = rmap_item->rmap_list;
		remove_rmap_item_from_tree(rmap_item);
		free_rmap_item(rmap_item);
	}
}

/*
 * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
 * than check every pte of a given vma, the locking doesn't quite work for
 * that - an rmap_item is assigned to the stable tree after inserting ksm
 * page and upping mmap_sem.  Nor does it fit with the way we skip dup'ing
 * rmap_items from parent to child at fork time (so as not to waste time
 * if exit comes before the next scan reaches it).
 *
 * Similarly, although we'd like to remove rmap_items (so updating counts
 * and freeing memory) when unmerging an area, it's easier to leave that
 * to the next pass of ksmd - consider, for example, how ksmd might be
 * in cmp_and_merge_page on one of the rmap_items we would be removing.
 *
 * unmerge_ksm_pages() returns 0 on success (or if the mm is found to be
 * exiting part way through), -ERESTARTSYS if a signal is pending for the
 * current task, or the error returned by break_ksm().
 */
static int unmerge_ksm_pages(struct vm_area_struct *vma,
			     unsigned long start, unsigned long end)
{
	unsigned long addr;
	int err = 0;

	/* one page at a time; the loop condition stops at the first error */
	for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
		if (ksm_test_exit(vma->vm_mm))
			break;
		if (signal_pending(current))
			err = -ERESTARTSYS;
		else
			err = break_ksm(vma, addr);
	}
	return err;
}

#ifdef CONFIG_SYSFS
/*
 * Only called through the sysfs control interface:
 *
 * Walks every mm_slot on the ksm_mm_head list, breaking COW across each
 * VM_MERGEABLE anonymous vma and then freeing all of that mm's rmap_items.
 * The global scan cursor ksm_scan.mm_slot is used as the iterator.
 *
 * Returns 0 after a full pass (seqnr is then reset to 0), or the error from
 * unmerge_ksm_pages(), in which case the cursor is parked on ksm_mm_head.
 */
static int unmerge_and_remove_all_rmap_items(void)
{
	struct mm_slot *mm_slot;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	int err = 0;

	/* start the cursor at the first real mm_slot */
	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
						struct mm_slot, mm_list);
	spin_unlock(&ksm_mmlist_lock);

	for (mm_slot = ksm_scan.mm_slot;
			mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
		mm = mm_slot->mm;
		down_read(&mm->mmap_sem);
		for (vma = mm->mmap; vma; vma = vma->vm_next) {
			if (ksm_test_exit(mm))
				break;
			if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
				continue;
			err = unmerge_ksm_pages(vma,
						vma->vm_start, vma->vm_end);
			if (err)
				goto error;	/* mmap_sem still held */
		}

		remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);

		/* advance the cursor before this slot can be unlinked */
		spin_lock(&ksm_mmlist_lock);
		ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
						struct mm_slot, mm_list);
		if (ksm_test_exit(mm)) {
			/* the mm is exiting: retire its slot altogether */
			hlist_del(&mm_slot->link);
			list_del(&mm_slot->mm_list);
			spin_unlock(&ksm_mmlist_lock);

			free_mm_slot(mm_slot);
			clear_bit(MMF_VM_MERGEABLE, &mm->flags);
			up_read(&mm->mmap_sem);
			/*
			 * NOTE(review): presumably pairs with a reference
			 * taken when the slot was registered - that code is
			 * not in this part of the file; confirm there.
			 */
			mmdrop(mm);
		} else {
			spin_unlock(&ksm_mmlist_lock);
			up_read(&mm->mmap_sem);
		}
	}

	ksm_scan.seqnr = 0;
	return 0;

error:
	up_read(&mm->mmap_sem);
	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = &ksm_mm_head;
	spin_unlock(&ksm_mmlist_lock);
	return err;
}
#endif /*
 CONFIG_SYSFS */

/*
 * Hash the contents of @page: jhash2() over PAGE_SIZE / 4 32-bit words
 * with an initial value of 17.  The page is mapped only for the duration
 * of the hash.
 */
static u32 calc_checksum(struct page *page)
{
	u32 checksum;
	void *addr = kmap_atomic(page, KM_USER0);
	checksum = jhash2(addr, PAGE_SIZE / 4, 17);
	kunmap_atomic(addr, KM_USER0);
	return checksum;
}

/*
 * Compare the full contents of two pages with memcmp().
 *
 * Returns memcmp()'s result: negative, zero or positive.  This ordering
 * is what the stable and unstable trees are sorted by.
 */
static int memcmp_pages(struct page *page1, struct page *page2)
{
	char *addr1, *addr2;
	int ret;

	/* two atomic mappings at once, so two distinct kmap slots */
	addr1 = kmap_atomic(page1, KM_USER0);
	addr2 = kmap_atomic(page2, KM_USER1);
	ret = memcmp(addr1, addr2, PAGE_SIZE);
	/* unmapped in the reverse order of mapping */
	kunmap_atomic(addr2, KM_USER1);
	kunmap_atomic(addr1, KM_USER0);
	return ret;
}

/* Non-zero when the two pages have byte-for-byte identical contents. */
static inline int pages_identical(struct page *page1, struct page *page2)
{
	return !memcmp_pages(page1, page2);
}

/*
 * write_protect_page - make @vma's pte for @page read-only
 * @vma: vma in which @page is expected to be mapped
 * @page: the page whose pte is to be write-protected
 * @orig_pte: on success, receives the pte value left in place
 *
 * Returns 0 on success.  Returns -EFAULT, leaving *orig_pte untouched, if
 * the page is not found mapped in @vma, or if the pte was writable and the
 * page's reference count does not match what its mappings account for.
 */
static int write_protect_page(struct vm_area_struct *vma, struct page *page,
			      pte_t *orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long addr;
	pte_t *ptep;
	spinlock_t *ptl;
	int swapped;
	int err = -EFAULT;

	addr = page_address_in_vma(page, vma);
	if (addr == -EFAULT)
		goto out;

	/* on success this returns with the pte mapped and ptl held */
	ptep = page_check_address(page, mm, addr, &ptl, 0);
	if (!ptep)
		goto out;

	if (pte_write(*ptep)) {
		pte_t entry;

		swapped = PageSwapCache(page);
		flush_cache_page(vma, addr, page_to_pfn(page));
		/*
		 * Ok this is tricky: when get_user_pages_fast() runs it
		 * doesn't take any lock, therefore the check that we are
		 * going to make with the pagecount against the mapcount is
		 * racy and O_DIRECT can happen right after the check.
		 * So we clear the pte and flush the tlb before the check:
		 * this assures us that no O_DIRECT can happen after the
		 * check or in the middle of the check.
		 */
		entry = ptep_clear_flush(vma, addr, ptep);
		/*
		 * Check that no O_DIRECT or similar I/O is in progress on the
		 * page
		 */
		/*
		 * NOTE(review): the "+ 1" is presumably the reference our
		 * caller holds on the page, and "swapped" the swap cache's
		 * reference - confirm against the callers.
		 */
		if (page_mapcount(page) + 1 + swapped != page_count(page)) {
			/* unexpected extra reference: restore the old pte */
			set_pte_at_notify(mm, addr, ptep, entry);
			goto out_unlock;
		}
		entry = pte_wrprotect(entry);
		set_pte_at_notify(mm, addr, ptep, entry);
	}
	/* report the pte as it now stands, whether or not we changed it */
	*orig_pte = *ptep;
	err = 0;

out_unlock:
	pte_unmap_unlock(ptep, ptl);
out:
	return err;
}

/**
 * replace_page - replace page in vma by new ksm page
 * @vma:      vma that holds the pte pointing to page
 * @page:     the page we are replacing by kpage
 * @kpage:    the ksm page we replace page by
 * @orig_pte: the original value of the pte
 *
 * Returns 0 on success, -EFAULT on failure.
 *
 * Failure means that @page has no address in @vma, that some level of the
 * page table is not present, or that the pte no longer equals @orig_pte.
 */
static int replace_page(struct vm_area_struct *vma, struct page *page,
			struct page *kpage, pte_t orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep;
	spinlock_t *ptl;
	unsigned long addr;
	int err = -EFAULT;

	addr = page_address_in_vma(page, vma);
	if (addr == -EFAULT)
		goto out;

	/* walk the page table by hand, giving up at any missing level */
	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		goto out;

	pud = pud_offset(pgd, addr);
	if (!pud_present(*pud))
		goto out;

	pmd = pmd_offset(pud, addr);
	if (!pmd_present(*pmd))
		goto out;

	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
	/* the pte changed since write_protect_page() sampled it: back off */
	if (!pte_same(*ptep, orig_pte)) {
		pte_unmap_unlock(ptep, ptl);
		goto out;
	}

	/* account the new mapping of kpage before it becomes visible */
	get_page(kpage);
	page_add_anon_rmap(kpage, vma, addr);

	flush_cache_page(vma, addr, pte_pfn(*ptep));
	ptep_clear_flush(vma, addr, ptep);
	set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));

	/* the old page has lost this mapping and the reference it implied */
	page_remove_rmap(page);
	put_page(page);

	pte_unmap_unlock(ptep, ptl);
	err = 0;
out:
	return err;
}

/*
 * try_to_merge_one_page - take two pages and
 * merge them into one
 * @vma: the vma that holds the pte pointing to page
 * @page: the PageAnon page that we want to replace with kpage
 * @kpage: the PageKsm page that we want to map instead of page,
 *         or NULL the first time when we want to use page as kpage.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 *
 * It also returns 0 straight away, doing nothing, when @page and @kpage
 * are the same page.  -EFAULT covers every reason for not merging: vma not
 * VM_MERGEABLE, page not PageAnon, page lock contended, write protection
 * refused, or contents that differ from @kpage.
 */
static int try_to_merge_one_page(struct vm_area_struct *vma,
				 struct page *page, struct page *kpage)
{
	pte_t orig_pte = __pte(0);
	int err = -EFAULT;

	if (page == kpage)			/* ksm page forked */
		return 0;

	if (!(vma->vm_flags & VM_MERGEABLE))
		goto out;
	if (!PageAnon(page))
		goto out;

	/*
	 * We need the page lock to read a stable PageSwapCache in
	 * write_protect_page().  We use trylock_page() instead of
	 * lock_page() because we don't want to wait here - we
	 * prefer to continue scanning and merging different pages,
	 * then come back to this page when it is unlocked.
	 */
	if (!trylock_page(page))
		goto out;
	/*
	 * If this anonymous page is mapped only here, its pte may need
	 * to be write-protected.  If it's mapped elsewhere, all of its
	 * ptes are necessarily already write-protected.  But in either
	 * case, we need to lock and check page_count is not raised.
	 */
	if (write_protect_page(vma, page, &orig_pte) == 0) {
		if (!kpage) {
			/*
			 * While we hold page lock, upgrade page from
			 * PageAnon+anon_vma to PageKsm+NULL stable_node:
			 * stable_tree_insert() will update stable_node.
			 */
			set_page_stable_node(page, NULL);
			mark_page_accessed(page);
			err = 0;
		} else if (pages_identical(page, kpage))
			err = replace_page(vma, page, kpage, orig_pte);
	}

	/*
	 * After a successful replacement in a VM_LOCKED vma, move the
	 * mlock from the old page over to kpage.  That needs kpage's page
	 * lock, so the lock on page is swapped for one on kpage.
	 */
	if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
		munlock_vma_page(page);
		if (!PageMlocked(kpage)) {
			unlock_page(page);
			lock_page(kpage);
			mlock_vma_page(kpage);
			page = kpage;		/* for final unlock */
		}
	}

	unlock_page(page);
out:
	return err;
}

/*
 * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
 * but no new kernel page is allocated: kpage must already be a ksm page.
 * (try_to_merge_two_pages() itself also calls this with a NULL kpage, to
 * have page upgraded to a ksm page in place.)
 *
 * Looks up the vma covering rmap_item's address under mmap_sem, attempts
 * the merge there, and on success makes rmap_item hold a reference on that
 * vma's anon_vma.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */
static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
				      struct page *page, struct page *kpage)
{
	struct mm_struct *mm = rmap_item->mm;
	struct vm_area_struct *vma;
	int err = -EFAULT;

	down_read(&mm->mmap_sem);
	if (ksm_test_exit(mm))
		goto out;
	vma = find_vma(mm, rmap_item->address);
	/* find_vma() may return the next vma above a hole: reject that */
	if (!vma || vma->vm_start > rmap_item->address)
		goto out;

	err = try_to_merge_one_page(vma, page, kpage);
	if (err)
		goto out;

	/* Must get reference to anon_vma while still holding mmap_sem */
	hold_anon_vma(rmap_item, vma->anon_vma);
out:
	up_read(&mm->mmap_sem);
	return err;
}

/*
 * try_to_merge_two_pages - take two identical pages and prepare them
 * to be merged into one page.
 *
 * This function returns the kpage if we successfully merged two identical
 * pages into one ksm page, NULL otherwise.
 *
 * Note that this function upgrades page to ksm page: if one of the pages
 * is already a ksm page, try_to_merge_with_ksm_page should be used.
935 */ 936 static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, 937 struct page *page, 938 struct rmap_item *tree_rmap_item, 939 struct page *tree_page) 940 { 941 int err; 942 943 err = try_to_merge_with_ksm_page(rmap_item, page, NULL); 944 if (!err) { 945 err = try_to_merge_with_ksm_page(tree_rmap_item, 946 tree_page, page); 947 /* 948 * If that fails, we have a ksm page with only one pte 949 * pointing to it: so break it. 950 */ 951 if (err) 952 break_cow(rmap_item); 953 } 954 return err ? NULL : page; 955 } 956 957 /* 958 * stable_tree_search - search for page inside the stable tree 959 * 960 * This function checks if there is a page inside the stable tree 961 * with identical content to the page that we are scanning right now. 962 * 963 * This function returns the stable tree node of identical content if found, 964 * NULL otherwise. 965 */ 966 static struct page *stable_tree_search(struct page *page) 967 { 968 struct rb_node *node = root_stable_tree.rb_node; 969 struct stable_node *stable_node; 970 971 stable_node = page_stable_node(page); 972 if (stable_node) { /* ksm page forked */ 973 get_page(page); 974 return page; 975 } 976 977 while (node) { 978 struct page *tree_page; 979 int ret; 980 981 cond_resched(); 982 stable_node = rb_entry(node, struct stable_node, node); 983 tree_page = get_ksm_page(stable_node); 984 if (!tree_page) 985 return NULL; 986 987 ret = memcmp_pages(page, tree_page); 988 989 if (ret < 0) { 990 put_page(tree_page); 991 node = node->rb_left; 992 } else if (ret > 0) { 993 put_page(tree_page); 994 node = node->rb_right; 995 } else 996 return tree_page; 997 } 998 999 return NULL; 1000 } 1001 1002 /* 1003 * stable_tree_insert - insert rmap_item pointing to new ksm page 1004 * into the stable tree. 1005 * 1006 * This function returns the stable tree node just allocated on success, 1007 * NULL otherwise. 
1008 */ 1009 static struct stable_node *stable_tree_insert(struct page *kpage) 1010 { 1011 struct rb_node **new = &root_stable_tree.rb_node; 1012 struct rb_node *parent = NULL; 1013 struct stable_node *stable_node; 1014 1015 while (*new) { 1016 struct page *tree_page; 1017 int ret; 1018 1019 cond_resched(); 1020 stable_node = rb_entry(*new, struct stable_node, node); 1021 tree_page = get_ksm_page(stable_node); 1022 if (!tree_page) 1023 return NULL; 1024 1025 ret = memcmp_pages(kpage, tree_page); 1026 put_page(tree_page); 1027 1028 parent = *new; 1029 if (ret < 0) 1030 new = &parent->rb_left; 1031 else if (ret > 0) 1032 new = &parent->rb_right; 1033 else { 1034 /* 1035 * It is not a bug that stable_tree_search() didn't 1036 * find this node: because at that time our page was 1037 * not yet write-protected, so may have changed since. 1038 */ 1039 return NULL; 1040 } 1041 } 1042 1043 stable_node = alloc_stable_node(); 1044 if (!stable_node) 1045 return NULL; 1046 1047 rb_link_node(&stable_node->node, parent, new); 1048 rb_insert_color(&stable_node->node, &root_stable_tree); 1049 1050 INIT_HLIST_HEAD(&stable_node->hlist); 1051 1052 stable_node->kpfn = page_to_pfn(kpage); 1053 set_page_stable_node(kpage, stable_node); 1054 1055 return stable_node; 1056 } 1057 1058 /* 1059 * unstable_tree_search_insert - search for identical page, 1060 * else insert rmap_item into the unstable tree. 1061 * 1062 * This function searches for a page in the unstable tree identical to the 1063 * page currently being scanned; and if no identical page is found in the 1064 * tree, we insert rmap_item as a new object into the unstable tree. 1065 * 1066 * This function returns pointer to rmap_item found to be identical 1067 * to the currently scanned page, NULL otherwise. 1068 * 1069 * This function does both searching and inserting, because they share 1070 * the same walking algorithm in an rbtree. 
1071 */ 1072 static 1073 struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, 1074 struct page *page, 1075 struct page **tree_pagep) 1076 1077 { 1078 struct rb_node **new = &root_unstable_tree.rb_node; 1079 struct rb_node *parent = NULL; 1080 1081 while (*new) { 1082 struct rmap_item *tree_rmap_item; 1083 struct page *tree_page; 1084 int ret; 1085 1086 cond_resched(); 1087 tree_rmap_item = rb_entry(*new, struct rmap_item, node); 1088 tree_page = get_mergeable_page(tree_rmap_item); 1089 if (!tree_page) 1090 return NULL; 1091 1092 /* 1093 * Don't substitute a ksm page for a forked page. 1094 */ 1095 if (page == tree_page) { 1096 put_page(tree_page); 1097 return NULL; 1098 } 1099 1100 ret = memcmp_pages(page, tree_page); 1101 1102 parent = *new; 1103 if (ret < 0) { 1104 put_page(tree_page); 1105 new = &parent->rb_left; 1106 } else if (ret > 0) { 1107 put_page(tree_page); 1108 new = &parent->rb_right; 1109 } else { 1110 *tree_pagep = tree_page; 1111 return tree_rmap_item; 1112 } 1113 } 1114 1115 rmap_item->address |= UNSTABLE_FLAG; 1116 rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); 1117 rb_link_node(&rmap_item->node, parent, new); 1118 rb_insert_color(&rmap_item->node, &root_unstable_tree); 1119 1120 ksm_pages_unshared++; 1121 return NULL; 1122 } 1123 1124 /* 1125 * stable_tree_append - add another rmap_item to the linked list of 1126 * rmap_items hanging off a given node of the stable tree, all sharing 1127 * the same ksm page. 
1128 */ 1129 static void stable_tree_append(struct rmap_item *rmap_item, 1130 struct stable_node *stable_node) 1131 { 1132 rmap_item->head = stable_node; 1133 rmap_item->address |= STABLE_FLAG; 1134 hlist_add_head(&rmap_item->hlist, &stable_node->hlist); 1135 1136 if (rmap_item->hlist.next) 1137 ksm_pages_sharing++; 1138 else 1139 ksm_pages_shared++; 1140 } 1141 1142 /* 1143 * cmp_and_merge_page - first see if page can be merged into the stable tree; 1144 * if not, compare checksum to previous and if it's the same, see if page can 1145 * be inserted into the unstable tree, or merged with a page already there and 1146 * both transferred to the stable tree. 1147 * 1148 * @page: the page that we are searching identical page to. 1149 * @rmap_item: the reverse mapping into the virtual address of this page 1150 */ 1151 static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) 1152 { 1153 struct rmap_item *tree_rmap_item; 1154 struct page *tree_page = NULL; 1155 struct stable_node *stable_node; 1156 struct page *kpage; 1157 unsigned int checksum; 1158 int err; 1159 1160 remove_rmap_item_from_tree(rmap_item); 1161 1162 /* We first start with searching the page inside the stable tree */ 1163 kpage = stable_tree_search(page); 1164 if (kpage) { 1165 err = try_to_merge_with_ksm_page(rmap_item, page, kpage); 1166 if (!err) { 1167 /* 1168 * The page was successfully merged: 1169 * add its rmap_item to the stable tree. 1170 */ 1171 lock_page(kpage); 1172 stable_tree_append(rmap_item, page_stable_node(kpage)); 1173 unlock_page(kpage); 1174 } 1175 put_page(kpage); 1176 return; 1177 } 1178 1179 /* 1180 * If the hash value of the page has changed from the last time 1181 * we calculated it, this page is changing frequently: therefore we 1182 * don't want to insert it in the unstable tree, and we don't want 1183 * to waste our time searching for something identical to it there. 
1184 */ 1185 checksum = calc_checksum(page); 1186 if (rmap_item->oldchecksum != checksum) { 1187 rmap_item->oldchecksum = checksum; 1188 return; 1189 } 1190 1191 tree_rmap_item = 1192 unstable_tree_search_insert(rmap_item, page, &tree_page); 1193 if (tree_rmap_item) { 1194 kpage = try_to_merge_two_pages(rmap_item, page, 1195 tree_rmap_item, tree_page); 1196 put_page(tree_page); 1197 /* 1198 * As soon as we merge this page, we want to remove the 1199 * rmap_item of the page we have merged with from the unstable 1200 * tree, and insert it instead as new node in the stable tree. 1201 */ 1202 if (kpage) { 1203 remove_rmap_item_from_tree(tree_rmap_item); 1204 1205 lock_page(kpage); 1206 stable_node = stable_tree_insert(kpage); 1207 if (stable_node) { 1208 stable_tree_append(tree_rmap_item, stable_node); 1209 stable_tree_append(rmap_item, stable_node); 1210 } 1211 unlock_page(kpage); 1212 1213 /* 1214 * If we fail to insert the page into the stable tree, 1215 * we will have 2 virtual addresses that are pointing 1216 * to a ksm page left outside the stable tree, 1217 * in which case we need to break_cow on both. 
1218 */ 1219 if (!stable_node) { 1220 break_cow(tree_rmap_item); 1221 break_cow(rmap_item); 1222 } 1223 } 1224 } 1225 } 1226 1227 static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, 1228 struct rmap_item **rmap_list, 1229 unsigned long addr) 1230 { 1231 struct rmap_item *rmap_item; 1232 1233 while (*rmap_list) { 1234 rmap_item = *rmap_list; 1235 if ((rmap_item->address & PAGE_MASK) == addr) 1236 return rmap_item; 1237 if (rmap_item->address > addr) 1238 break; 1239 *rmap_list = rmap_item->rmap_list; 1240 remove_rmap_item_from_tree(rmap_item); 1241 free_rmap_item(rmap_item); 1242 } 1243 1244 rmap_item = alloc_rmap_item(); 1245 if (rmap_item) { 1246 /* It has already been zeroed */ 1247 rmap_item->mm = mm_slot->mm; 1248 rmap_item->address = addr; 1249 rmap_item->rmap_list = *rmap_list; 1250 *rmap_list = rmap_item; 1251 } 1252 return rmap_item; 1253 } 1254 1255 static struct rmap_item *scan_get_next_rmap_item(struct page **page) 1256 { 1257 struct mm_struct *mm; 1258 struct mm_slot *slot; 1259 struct vm_area_struct *vma; 1260 struct rmap_item *rmap_item; 1261 1262 if (list_empty(&ksm_mm_head.mm_list)) 1263 return NULL; 1264 1265 slot = ksm_scan.mm_slot; 1266 if (slot == &ksm_mm_head) { 1267 root_unstable_tree = RB_ROOT; 1268 1269 spin_lock(&ksm_mmlist_lock); 1270 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); 1271 ksm_scan.mm_slot = slot; 1272 spin_unlock(&ksm_mmlist_lock); 1273 next_mm: 1274 ksm_scan.address = 0; 1275 ksm_scan.rmap_list = &slot->rmap_list; 1276 } 1277 1278 mm = slot->mm; 1279 down_read(&mm->mmap_sem); 1280 if (ksm_test_exit(mm)) 1281 vma = NULL; 1282 else 1283 vma = find_vma(mm, ksm_scan.address); 1284 1285 for (; vma; vma = vma->vm_next) { 1286 if (!(vma->vm_flags & VM_MERGEABLE)) 1287 continue; 1288 if (ksm_scan.address < vma->vm_start) 1289 ksm_scan.address = vma->vm_start; 1290 if (!vma->anon_vma) 1291 ksm_scan.address = vma->vm_end; 1292 1293 while (ksm_scan.address < vma->vm_end) { 1294 if (ksm_test_exit(mm)) 
1295 break; 1296 *page = follow_page(vma, ksm_scan.address, FOLL_GET); 1297 if (*page && PageAnon(*page)) { 1298 flush_anon_page(vma, *page, ksm_scan.address); 1299 flush_dcache_page(*page); 1300 rmap_item = get_next_rmap_item(slot, 1301 ksm_scan.rmap_list, ksm_scan.address); 1302 if (rmap_item) { 1303 ksm_scan.rmap_list = 1304 &rmap_item->rmap_list; 1305 ksm_scan.address += PAGE_SIZE; 1306 } else 1307 put_page(*page); 1308 up_read(&mm->mmap_sem); 1309 return rmap_item; 1310 } 1311 if (*page) 1312 put_page(*page); 1313 ksm_scan.address += PAGE_SIZE; 1314 cond_resched(); 1315 } 1316 } 1317 1318 if (ksm_test_exit(mm)) { 1319 ksm_scan.address = 0; 1320 ksm_scan.rmap_list = &slot->rmap_list; 1321 } 1322 /* 1323 * Nuke all the rmap_items that are above this current rmap: 1324 * because there were no VM_MERGEABLE vmas with such addresses. 1325 */ 1326 remove_trailing_rmap_items(slot, ksm_scan.rmap_list); 1327 1328 spin_lock(&ksm_mmlist_lock); 1329 ksm_scan.mm_slot = list_entry(slot->mm_list.next, 1330 struct mm_slot, mm_list); 1331 if (ksm_scan.address == 0) { 1332 /* 1333 * We've completed a full scan of all vmas, holding mmap_sem 1334 * throughout, and found no VM_MERGEABLE: so do the same as 1335 * __ksm_exit does to remove this mm from all our lists now. 1336 * This applies either when cleaning up after __ksm_exit 1337 * (but beware: we can reach here even before __ksm_exit), 1338 * or when all VM_MERGEABLE areas have been unmapped (and 1339 * mmap_sem then protects against race with MADV_MERGEABLE). 
1340 */ 1341 hlist_del(&slot->link); 1342 list_del(&slot->mm_list); 1343 spin_unlock(&ksm_mmlist_lock); 1344 1345 free_mm_slot(slot); 1346 clear_bit(MMF_VM_MERGEABLE, &mm->flags); 1347 up_read(&mm->mmap_sem); 1348 mmdrop(mm); 1349 } else { 1350 spin_unlock(&ksm_mmlist_lock); 1351 up_read(&mm->mmap_sem); 1352 } 1353 1354 /* Repeat until we've completed scanning the whole list */ 1355 slot = ksm_scan.mm_slot; 1356 if (slot != &ksm_mm_head) 1357 goto next_mm; 1358 1359 ksm_scan.seqnr++; 1360 return NULL; 1361 } 1362 1363 /** 1364 * ksm_do_scan - the ksm scanner main worker function. 1365 * @scan_npages - number of pages we want to scan before we return. 1366 */ 1367 static void ksm_do_scan(unsigned int scan_npages) 1368 { 1369 struct rmap_item *rmap_item; 1370 struct page *page; 1371 1372 while (scan_npages--) { 1373 cond_resched(); 1374 rmap_item = scan_get_next_rmap_item(&page); 1375 if (!rmap_item) 1376 return; 1377 if (!PageKsm(page) || !in_stable_tree(rmap_item)) 1378 cmp_and_merge_page(page, rmap_item); 1379 put_page(page); 1380 } 1381 } 1382 1383 static int ksmd_should_run(void) 1384 { 1385 return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list); 1386 } 1387 1388 static int ksm_scan_thread(void *nothing) 1389 { 1390 set_user_nice(current, 5); 1391 1392 while (!kthread_should_stop()) { 1393 mutex_lock(&ksm_thread_mutex); 1394 if (ksmd_should_run()) 1395 ksm_do_scan(ksm_thread_pages_to_scan); 1396 mutex_unlock(&ksm_thread_mutex); 1397 1398 if (ksmd_should_run()) { 1399 schedule_timeout_interruptible( 1400 msecs_to_jiffies(ksm_thread_sleep_millisecs)); 1401 } else { 1402 wait_event_interruptible(ksm_thread_wait, 1403 ksmd_should_run() || kthread_should_stop()); 1404 } 1405 } 1406 return 0; 1407 } 1408 1409 int ksm_madvise(struct vm_area_struct *vma, unsigned long start, 1410 unsigned long end, int advice, unsigned long *vm_flags) 1411 { 1412 struct mm_struct *mm = vma->vm_mm; 1413 int err; 1414 1415 switch (advice) { 1416 case MADV_MERGEABLE: 1417 /* 
1418 * Be somewhat over-protective for now! 1419 */ 1420 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | 1421 VM_PFNMAP | VM_IO | VM_DONTEXPAND | 1422 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | 1423 VM_NONLINEAR | VM_MIXEDMAP | VM_SAO)) 1424 return 0; /* just ignore the advice */ 1425 1426 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { 1427 err = __ksm_enter(mm); 1428 if (err) 1429 return err; 1430 } 1431 1432 *vm_flags |= VM_MERGEABLE; 1433 break; 1434 1435 case MADV_UNMERGEABLE: 1436 if (!(*vm_flags & VM_MERGEABLE)) 1437 return 0; /* just ignore the advice */ 1438 1439 if (vma->anon_vma) { 1440 err = unmerge_ksm_pages(vma, start, end); 1441 if (err) 1442 return err; 1443 } 1444 1445 *vm_flags &= ~VM_MERGEABLE; 1446 break; 1447 } 1448 1449 return 0; 1450 } 1451 1452 int __ksm_enter(struct mm_struct *mm) 1453 { 1454 struct mm_slot *mm_slot; 1455 int needs_wakeup; 1456 1457 mm_slot = alloc_mm_slot(); 1458 if (!mm_slot) 1459 return -ENOMEM; 1460 1461 /* Check ksm_run too? Would need tighter locking */ 1462 needs_wakeup = list_empty(&ksm_mm_head.mm_list); 1463 1464 spin_lock(&ksm_mmlist_lock); 1465 insert_to_mm_slots_hash(mm, mm_slot); 1466 /* 1467 * Insert just behind the scanning cursor, to let the area settle 1468 * down a little; when fork is followed by immediate exec, we don't 1469 * want ksmd to waste time setting up and tearing down an rmap_list. 1470 */ 1471 list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); 1472 spin_unlock(&ksm_mmlist_lock); 1473 1474 set_bit(MMF_VM_MERGEABLE, &mm->flags); 1475 atomic_inc(&mm->mm_count); 1476 1477 if (needs_wakeup) 1478 wake_up_interruptible(&ksm_thread_wait); 1479 1480 return 0; 1481 } 1482 1483 void __ksm_exit(struct mm_struct *mm) 1484 { 1485 struct mm_slot *mm_slot; 1486 int easy_to_free = 0; 1487 1488 /* 1489 * This process is exiting: if it's straightforward (as is the 1490 * case when ksmd was never running), free mm_slot immediately. 
1491 * But if it's at the cursor or has rmap_items linked to it, use 1492 * mmap_sem to synchronize with any break_cows before pagetables 1493 * are freed, and leave the mm_slot on the list for ksmd to free. 1494 * Beware: ksm may already have noticed it exiting and freed the slot. 1495 */ 1496 1497 spin_lock(&ksm_mmlist_lock); 1498 mm_slot = get_mm_slot(mm); 1499 if (mm_slot && ksm_scan.mm_slot != mm_slot) { 1500 if (!mm_slot->rmap_list) { 1501 hlist_del(&mm_slot->link); 1502 list_del(&mm_slot->mm_list); 1503 easy_to_free = 1; 1504 } else { 1505 list_move(&mm_slot->mm_list, 1506 &ksm_scan.mm_slot->mm_list); 1507 } 1508 } 1509 spin_unlock(&ksm_mmlist_lock); 1510 1511 if (easy_to_free) { 1512 free_mm_slot(mm_slot); 1513 clear_bit(MMF_VM_MERGEABLE, &mm->flags); 1514 mmdrop(mm); 1515 } else if (mm_slot) { 1516 down_write(&mm->mmap_sem); 1517 up_write(&mm->mmap_sem); 1518 } 1519 } 1520 1521 struct page *ksm_does_need_to_copy(struct page *page, 1522 struct vm_area_struct *vma, unsigned long address) 1523 { 1524 struct page *new_page; 1525 1526 unlock_page(page); /* any racers will COW it, not modify it */ 1527 1528 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 1529 if (new_page) { 1530 copy_user_highpage(new_page, page, address, vma); 1531 1532 SetPageDirty(new_page); 1533 __SetPageUptodate(new_page); 1534 SetPageSwapBacked(new_page); 1535 __set_page_locked(new_page); 1536 1537 if (page_evictable(new_page, vma)) 1538 lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); 1539 else 1540 add_page_to_unevictable_list(new_page); 1541 } 1542 1543 page_cache_release(page); 1544 return new_page; 1545 } 1546 1547 int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, 1548 unsigned long *vm_flags) 1549 { 1550 struct stable_node *stable_node; 1551 struct rmap_item *rmap_item; 1552 struct hlist_node *hlist; 1553 unsigned int mapcount = page_mapcount(page); 1554 int referenced = 0; 1555 int search_new_forks = 0; 1556 1557 VM_BUG_ON(!PageKsm(page)); 1558 
VM_BUG_ON(!PageLocked(page)); 1559 1560 stable_node = page_stable_node(page); 1561 if (!stable_node) 1562 return 0; 1563 again: 1564 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { 1565 struct anon_vma *anon_vma = rmap_item->anon_vma; 1566 struct vm_area_struct *vma; 1567 1568 spin_lock(&anon_vma->lock); 1569 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 1570 if (rmap_item->address < vma->vm_start || 1571 rmap_item->address >= vma->vm_end) 1572 continue; 1573 /* 1574 * Initially we examine only the vma which covers this 1575 * rmap_item; but later, if there is still work to do, 1576 * we examine covering vmas in other mms: in case they 1577 * were forked from the original since ksmd passed. 1578 */ 1579 if ((rmap_item->mm == vma->vm_mm) == search_new_forks) 1580 continue; 1581 1582 if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) 1583 continue; 1584 1585 referenced += page_referenced_one(page, vma, 1586 rmap_item->address, &mapcount, vm_flags); 1587 if (!search_new_forks || !mapcount) 1588 break; 1589 } 1590 spin_unlock(&anon_vma->lock); 1591 if (!mapcount) 1592 goto out; 1593 } 1594 if (!search_new_forks++) 1595 goto again; 1596 out: 1597 return referenced; 1598 } 1599 1600 int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) 1601 { 1602 struct stable_node *stable_node; 1603 struct hlist_node *hlist; 1604 struct rmap_item *rmap_item; 1605 int ret = SWAP_AGAIN; 1606 int search_new_forks = 0; 1607 1608 VM_BUG_ON(!PageKsm(page)); 1609 VM_BUG_ON(!PageLocked(page)); 1610 1611 stable_node = page_stable_node(page); 1612 if (!stable_node) 1613 return SWAP_FAIL; 1614 again: 1615 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { 1616 struct anon_vma *anon_vma = rmap_item->anon_vma; 1617 struct vm_area_struct *vma; 1618 1619 spin_lock(&anon_vma->lock); 1620 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 1621 if (rmap_item->address < vma->vm_start || 1622 rmap_item->address >= vma->vm_end) 1623 
continue; 1624 /* 1625 * Initially we examine only the vma which covers this 1626 * rmap_item; but later, if there is still work to do, 1627 * we examine covering vmas in other mms: in case they 1628 * were forked from the original since ksmd passed. 1629 */ 1630 if ((rmap_item->mm == vma->vm_mm) == search_new_forks) 1631 continue; 1632 1633 ret = try_to_unmap_one(page, vma, 1634 rmap_item->address, flags); 1635 if (ret != SWAP_AGAIN || !page_mapped(page)) { 1636 spin_unlock(&anon_vma->lock); 1637 goto out; 1638 } 1639 } 1640 spin_unlock(&anon_vma->lock); 1641 } 1642 if (!search_new_forks++) 1643 goto again; 1644 out: 1645 return ret; 1646 } 1647 1648 #ifdef CONFIG_MIGRATION 1649 int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, 1650 struct vm_area_struct *, unsigned long, void *), void *arg) 1651 { 1652 struct stable_node *stable_node; 1653 struct hlist_node *hlist; 1654 struct rmap_item *rmap_item; 1655 int ret = SWAP_AGAIN; 1656 int search_new_forks = 0; 1657 1658 VM_BUG_ON(!PageKsm(page)); 1659 VM_BUG_ON(!PageLocked(page)); 1660 1661 stable_node = page_stable_node(page); 1662 if (!stable_node) 1663 return ret; 1664 again: 1665 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { 1666 struct anon_vma *anon_vma = rmap_item->anon_vma; 1667 struct vm_area_struct *vma; 1668 1669 spin_lock(&anon_vma->lock); 1670 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 1671 if (rmap_item->address < vma->vm_start || 1672 rmap_item->address >= vma->vm_end) 1673 continue; 1674 /* 1675 * Initially we examine only the vma which covers this 1676 * rmap_item; but later, if there is still work to do, 1677 * we examine covering vmas in other mms: in case they 1678 * were forked from the original since ksmd passed. 
1679 */ 1680 if ((rmap_item->mm == vma->vm_mm) == search_new_forks) 1681 continue; 1682 1683 ret = rmap_one(page, vma, rmap_item->address, arg); 1684 if (ret != SWAP_AGAIN) { 1685 spin_unlock(&anon_vma->lock); 1686 goto out; 1687 } 1688 } 1689 spin_unlock(&anon_vma->lock); 1690 } 1691 if (!search_new_forks++) 1692 goto again; 1693 out: 1694 return ret; 1695 } 1696 1697 void ksm_migrate_page(struct page *newpage, struct page *oldpage) 1698 { 1699 struct stable_node *stable_node; 1700 1701 VM_BUG_ON(!PageLocked(oldpage)); 1702 VM_BUG_ON(!PageLocked(newpage)); 1703 VM_BUG_ON(newpage->mapping != oldpage->mapping); 1704 1705 stable_node = page_stable_node(newpage); 1706 if (stable_node) { 1707 VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); 1708 stable_node->kpfn = page_to_pfn(newpage); 1709 } 1710 } 1711 #endif /* CONFIG_MIGRATION */ 1712 1713 #ifdef CONFIG_MEMORY_HOTREMOVE 1714 static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn, 1715 unsigned long end_pfn) 1716 { 1717 struct rb_node *node; 1718 1719 for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) { 1720 struct stable_node *stable_node; 1721 1722 stable_node = rb_entry(node, struct stable_node, node); 1723 if (stable_node->kpfn >= start_pfn && 1724 stable_node->kpfn < end_pfn) 1725 return stable_node; 1726 } 1727 return NULL; 1728 } 1729 1730 static int ksm_memory_callback(struct notifier_block *self, 1731 unsigned long action, void *arg) 1732 { 1733 struct memory_notify *mn = arg; 1734 struct stable_node *stable_node; 1735 1736 switch (action) { 1737 case MEM_GOING_OFFLINE: 1738 /* 1739 * Keep it very simple for now: just lock out ksmd and 1740 * MADV_UNMERGEABLE while any memory is going offline. 
1741 */ 1742 mutex_lock(&ksm_thread_mutex); 1743 break; 1744 1745 case MEM_OFFLINE: 1746 /* 1747 * Most of the work is done by page migration; but there might 1748 * be a few stable_nodes left over, still pointing to struct 1749 * pages which have been offlined: prune those from the tree. 1750 */ 1751 while ((stable_node = ksm_check_stable_tree(mn->start_pfn, 1752 mn->start_pfn + mn->nr_pages)) != NULL) 1753 remove_node_from_stable_tree(stable_node); 1754 /* fallthrough */ 1755 1756 case MEM_CANCEL_OFFLINE: 1757 mutex_unlock(&ksm_thread_mutex); 1758 break; 1759 } 1760 return NOTIFY_OK; 1761 } 1762 #endif /* CONFIG_MEMORY_HOTREMOVE */ 1763 1764 #ifdef CONFIG_SYSFS 1765 /* 1766 * This all compiles without CONFIG_SYSFS, but is a waste of space. 1767 */ 1768 1769 #define KSM_ATTR_RO(_name) \ 1770 static struct kobj_attribute _name##_attr = __ATTR_RO(_name) 1771 #define KSM_ATTR(_name) \ 1772 static struct kobj_attribute _name##_attr = \ 1773 __ATTR(_name, 0644, _name##_show, _name##_store) 1774 1775 static ssize_t sleep_millisecs_show(struct kobject *kobj, 1776 struct kobj_attribute *attr, char *buf) 1777 { 1778 return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs); 1779 } 1780 1781 static ssize_t sleep_millisecs_store(struct kobject *kobj, 1782 struct kobj_attribute *attr, 1783 const char *buf, size_t count) 1784 { 1785 unsigned long msecs; 1786 int err; 1787 1788 err = strict_strtoul(buf, 10, &msecs); 1789 if (err || msecs > UINT_MAX) 1790 return -EINVAL; 1791 1792 ksm_thread_sleep_millisecs = msecs; 1793 1794 return count; 1795 } 1796 KSM_ATTR(sleep_millisecs); 1797 1798 static ssize_t pages_to_scan_show(struct kobject *kobj, 1799 struct kobj_attribute *attr, char *buf) 1800 { 1801 return sprintf(buf, "%u\n", ksm_thread_pages_to_scan); 1802 } 1803 1804 static ssize_t pages_to_scan_store(struct kobject *kobj, 1805 struct kobj_attribute *attr, 1806 const char *buf, size_t count) 1807 { 1808 int err; 1809 unsigned long nr_pages; 1810 1811 err = strict_strtoul(buf, 
10, &nr_pages); 1812 if (err || nr_pages > UINT_MAX) 1813 return -EINVAL; 1814 1815 ksm_thread_pages_to_scan = nr_pages; 1816 1817 return count; 1818 } 1819 KSM_ATTR(pages_to_scan); 1820 1821 static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, 1822 char *buf) 1823 { 1824 return sprintf(buf, "%u\n", ksm_run); 1825 } 1826 1827 static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, 1828 const char *buf, size_t count) 1829 { 1830 int err; 1831 unsigned long flags; 1832 1833 err = strict_strtoul(buf, 10, &flags); 1834 if (err || flags > UINT_MAX) 1835 return -EINVAL; 1836 if (flags > KSM_RUN_UNMERGE) 1837 return -EINVAL; 1838 1839 /* 1840 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. 1841 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, 1842 * breaking COW to free the pages_shared (but leaves mm_slots 1843 * on the list for when ksmd may be set running again). 1844 */ 1845 1846 mutex_lock(&ksm_thread_mutex); 1847 if (ksm_run != flags) { 1848 ksm_run = flags; 1849 if (flags & KSM_RUN_UNMERGE) { 1850 current->flags |= PF_OOM_ORIGIN; 1851 err = unmerge_and_remove_all_rmap_items(); 1852 current->flags &= ~PF_OOM_ORIGIN; 1853 if (err) { 1854 ksm_run = KSM_RUN_STOP; 1855 count = err; 1856 } 1857 } 1858 } 1859 mutex_unlock(&ksm_thread_mutex); 1860 1861 if (flags & KSM_RUN_MERGE) 1862 wake_up_interruptible(&ksm_thread_wait); 1863 1864 return count; 1865 } 1866 KSM_ATTR(run); 1867 1868 static ssize_t pages_shared_show(struct kobject *kobj, 1869 struct kobj_attribute *attr, char *buf) 1870 { 1871 return sprintf(buf, "%lu\n", ksm_pages_shared); 1872 } 1873 KSM_ATTR_RO(pages_shared); 1874 1875 static ssize_t pages_sharing_show(struct kobject *kobj, 1876 struct kobj_attribute *attr, char *buf) 1877 { 1878 return sprintf(buf, "%lu\n", ksm_pages_sharing); 1879 } 1880 KSM_ATTR_RO(pages_sharing); 1881 1882 static ssize_t pages_unshared_show(struct kobject *kobj, 1883 struct kobj_attribute *attr, char *buf) 1884 { 
1885 return sprintf(buf, "%lu\n", ksm_pages_unshared); 1886 } 1887 KSM_ATTR_RO(pages_unshared); 1888 1889 static ssize_t pages_volatile_show(struct kobject *kobj, 1890 struct kobj_attribute *attr, char *buf) 1891 { 1892 long ksm_pages_volatile; 1893 1894 ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared 1895 - ksm_pages_sharing - ksm_pages_unshared; 1896 /* 1897 * It was not worth any locking to calculate that statistic, 1898 * but it might therefore sometimes be negative: conceal that. 1899 */ 1900 if (ksm_pages_volatile < 0) 1901 ksm_pages_volatile = 0; 1902 return sprintf(buf, "%ld\n", ksm_pages_volatile); 1903 } 1904 KSM_ATTR_RO(pages_volatile); 1905 1906 static ssize_t full_scans_show(struct kobject *kobj, 1907 struct kobj_attribute *attr, char *buf) 1908 { 1909 return sprintf(buf, "%lu\n", ksm_scan.seqnr); 1910 } 1911 KSM_ATTR_RO(full_scans); 1912 1913 static struct attribute *ksm_attrs[] = { 1914 &sleep_millisecs_attr.attr, 1915 &pages_to_scan_attr.attr, 1916 &run_attr.attr, 1917 &pages_shared_attr.attr, 1918 &pages_sharing_attr.attr, 1919 &pages_unshared_attr.attr, 1920 &pages_volatile_attr.attr, 1921 &full_scans_attr.attr, 1922 NULL, 1923 }; 1924 1925 static struct attribute_group ksm_attr_group = { 1926 .attrs = ksm_attrs, 1927 .name = "ksm", 1928 }; 1929 #endif /* CONFIG_SYSFS */ 1930 1931 static int __init ksm_init(void) 1932 { 1933 struct task_struct *ksm_thread; 1934 int err; 1935 1936 err = ksm_slab_init(); 1937 if (err) 1938 goto out; 1939 1940 err = mm_slots_hash_init(); 1941 if (err) 1942 goto out_free1; 1943 1944 ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); 1945 if (IS_ERR(ksm_thread)) { 1946 printk(KERN_ERR "ksm: creating kthread failed\n"); 1947 err = PTR_ERR(ksm_thread); 1948 goto out_free2; 1949 } 1950 1951 #ifdef CONFIG_SYSFS 1952 err = sysfs_create_group(mm_kobj, &ksm_attr_group); 1953 if (err) { 1954 printk(KERN_ERR "ksm: register sysfs failed\n"); 1955 kthread_stop(ksm_thread); 1956 goto out_free2; 1957 } 1958 #else 1959 
ksm_run = KSM_RUN_MERGE; /* no way for user to start it */ 1960 1961 #endif /* CONFIG_SYSFS */ 1962 1963 #ifdef CONFIG_MEMORY_HOTREMOVE 1964 /* 1965 * Choose a high priority since the callback takes ksm_thread_mutex: 1966 * later callbacks could only be taking locks which nest within that. 1967 */ 1968 hotplug_memory_notifier(ksm_memory_callback, 100); 1969 #endif 1970 return 0; 1971 1972 out_free2: 1973 mm_slots_hash_free(); 1974 out_free1: 1975 ksm_slab_free(); 1976 out: 1977 return err; 1978 } 1979 module_init(ksm_init) 1980