/*
 * Memory merging support.
 *
 * This code enables dynamic sharing of identical pages found in different
 * memory areas, even if they are not shared by fork()
 *
 * Copyright (C) 2008-2009 Red Hat, Inc.
 * Authors:
 *	Izik Eidus
 *	Andrea Arcangeli
 *	Chris Wright
 *	Hugh Dickins
 *
 * This work is licensed under the terms of the GNU GPL, version 2.
 */

#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>
#include <linux/rwsem.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/spinlock.h>
#include <linux/jhash.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/memcontrol.h>
#include <linux/rbtree.h>
#include <linux/memory.h>
#include <linux/mmu_notifier.h>
#include <linux/swap.h>
#include <linux/ksm.h>
#include <linux/hash.h>
#include <linux/freezer.h>
#include <linux/oom.h>

#include <asm/tlbflush.h>
#include "internal.h"

/*
 * A few notes about the KSM scanning process,
 * to make it easier to understand the data structures below:
 *
 * In order to reduce excessive scanning, KSM sorts the memory pages by their
 * contents into a data structure that holds pointers to the pages' locations.
 *
 * Since the contents of the pages may change at any moment, KSM cannot just
 * insert the pages into a normal sorted tree and expect it to find anything.
 * Therefore KSM uses two data structures - the stable and the unstable tree.
 *
 * The stable tree holds pointers to all the merged pages (ksm pages), sorted
 * by their contents. Because each such page is write-protected, searching on
 * this tree is fully assured to be working (except when pages are unmapped),
 * and therefore this tree is called the stable tree.
 *
 * In addition to the stable tree, KSM uses a second data structure called the
 * unstable tree: this tree holds pointers to pages which have been found to
 * be "unchanged for a period of time". The unstable tree sorts these pages
 * by their contents, but since they are not write-protected, KSM cannot rely
 * upon the unstable tree to work correctly - the unstable tree is liable to
 * be corrupted as its contents are modified, and so it is called unstable.
 *
 * KSM solves this problem by several techniques:
 *
 * 1) The unstable tree is flushed every time KSM completes scanning all
 *    memory areas, and then the tree is rebuilt again from the beginning.
 * 2) KSM will only insert into the unstable tree, pages whose hash value
 *    has not changed since the previous scan of all memory areas.
 * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
 *    colors of the nodes and not on their contents, assuring that even when
 *    the tree gets "corrupted" it won't get out of balance, so scanning time
 *    remains the same (also, searching and inserting nodes in an rbtree uses
 *    the same algorithm, so we have no overhead when we flush and rebuild).
 * 4) KSM never flushes the stable tree, which means that even if it were to
 *    take 10 attempts to find a page in the unstable tree, once it is found,
 *    it is secured in the stable tree. (When we scan a new page, we first
 *    compare it against the stable tree, and then against the unstable tree.)
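 *
 * As a rough sketch (not literal code - the helpers named here are the
 * functions defined later in this file), the per-page decision flow that
 * the notes above describe is:
 *
 *	kpage = stable_tree_search(page);
 *	if (kpage)
 *		merge page into kpage and link its rmap_item there;
 *	else if (calc_checksum(page) has not changed since the last scan)
 *		look the page up in the unstable tree, inserting it if
 *		nothing identical is found; if an identical page is found,
 *		merge the two and promote the result into the stable tree
 *		(or break COW again if that promotion fails);
 *	else
 *		just remember the new checksum and move on;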
82 */ 83 84 /** 85 * struct mm_slot - ksm information per mm that is being scanned 86 * @link: link to the mm_slots hash list 87 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head 88 * @rmap_list: head for this mm_slot's singly-linked list of rmap_items 89 * @mm: the mm that this information is valid for 90 */ 91 struct mm_slot { 92 struct hlist_node link; 93 struct list_head mm_list; 94 struct rmap_item *rmap_list; 95 struct mm_struct *mm; 96 }; 97 98 /** 99 * struct ksm_scan - cursor for scanning 100 * @mm_slot: the current mm_slot we are scanning 101 * @address: the next address inside that to be scanned 102 * @rmap_list: link to the next rmap to be scanned in the rmap_list 103 * @seqnr: count of completed full scans (needed when removing unstable node) 104 * 105 * There is only the one ksm_scan instance of this cursor structure. 106 */ 107 struct ksm_scan { 108 struct mm_slot *mm_slot; 109 unsigned long address; 110 struct rmap_item **rmap_list; 111 unsigned long seqnr; 112 }; 113 114 /** 115 * struct stable_node - node of the stable rbtree 116 * @node: rb node of this ksm page in the stable tree 117 * @hlist: hlist head of rmap_items using this ksm page 118 * @kpfn: page frame number of this ksm page 119 */ 120 struct stable_node { 121 struct rb_node node; 122 struct hlist_head hlist; 123 unsigned long kpfn; 124 }; 125 126 /** 127 * struct rmap_item - reverse mapping item for virtual addresses 128 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list 129 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree 130 * @mm: the memory structure this rmap_item is pointing into 131 * @address: the virtual address this rmap_item tracks (+ flags in low bits) 132 * @oldchecksum: previous checksum of the page at that virtual address 133 * @node: rb node of this rmap_item in the unstable tree 134 * @head: pointer to stable_node heading this list in the stable tree 135 * @hlist: link into hlist of rmap_items hanging off that stable_node 136 */ 137 struct rmap_item { 138 struct rmap_item *rmap_list; 139 struct anon_vma *anon_vma; /* when stable */ 140 struct mm_struct *mm; 141 unsigned long address; /* + low bits used for flags below */ 142 unsigned int oldchecksum; /* when unstable */ 143 union { 144 struct rb_node node; /* when node of unstable tree */ 145 struct { /* when listed from stable tree */ 146 struct stable_node *head; 147 struct hlist_node hlist; 148 }; 149 }; 150 }; 151 152 #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ 153 #define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */ 154 #define STABLE_FLAG 0x200 /* is listed from the stable tree */ 155 156 /* The stable and unstable tree heads */ 157 static struct rb_root root_stable_tree = RB_ROOT; 158 static struct rb_root root_unstable_tree = RB_ROOT; 159 160 #define MM_SLOTS_HASH_SHIFT 10 161 #define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT) 162 static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS]; 163 164 static struct mm_slot ksm_mm_head = { 165 .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), 166 }; 167 static struct ksm_scan ksm_scan = { 168 .mm_slot = &ksm_mm_head, 169 }; 170 171 static struct kmem_cache *rmap_item_cache; 172 static struct kmem_cache *stable_node_cache; 173 static struct kmem_cache *mm_slot_cache; 174 175 /* The number of nodes in the stable tree */ 176 static unsigned long ksm_pages_shared; 177 178 /* The number of page slots additionally sharing those nodes */ 179 static unsigned long ksm_pages_sharing; 180 181 /* The number of nodes 
in the unstable tree */ 182 static unsigned long ksm_pages_unshared; 183 184 /* The number of rmap_items in use: to calculate pages_volatile */ 185 static unsigned long ksm_rmap_items; 186 187 /* Number of pages ksmd should scan in one batch */ 188 static unsigned int ksm_thread_pages_to_scan = 100; 189 190 /* Milliseconds ksmd should sleep between batches */ 191 static unsigned int ksm_thread_sleep_millisecs = 20; 192 193 #define KSM_RUN_STOP 0 194 #define KSM_RUN_MERGE 1 195 #define KSM_RUN_UNMERGE 2 196 static unsigned int ksm_run = KSM_RUN_STOP; 197 198 static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); 199 static DEFINE_MUTEX(ksm_thread_mutex); 200 static DEFINE_SPINLOCK(ksm_mmlist_lock); 201 202 #define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\ 203 sizeof(struct __struct), __alignof__(struct __struct),\ 204 (__flags), NULL) 205 206 static int __init ksm_slab_init(void) 207 { 208 rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0); 209 if (!rmap_item_cache) 210 goto out; 211 212 stable_node_cache = KSM_KMEM_CACHE(stable_node, 0); 213 if (!stable_node_cache) 214 goto out_free1; 215 216 mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0); 217 if (!mm_slot_cache) 218 goto out_free2; 219 220 return 0; 221 222 out_free2: 223 kmem_cache_destroy(stable_node_cache); 224 out_free1: 225 kmem_cache_destroy(rmap_item_cache); 226 out: 227 return -ENOMEM; 228 } 229 230 static void __init ksm_slab_free(void) 231 { 232 kmem_cache_destroy(mm_slot_cache); 233 kmem_cache_destroy(stable_node_cache); 234 kmem_cache_destroy(rmap_item_cache); 235 mm_slot_cache = NULL; 236 } 237 238 static inline struct rmap_item *alloc_rmap_item(void) 239 { 240 struct rmap_item *rmap_item; 241 242 rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL); 243 if (rmap_item) 244 ksm_rmap_items++; 245 return rmap_item; 246 } 247 248 static inline void free_rmap_item(struct rmap_item *rmap_item) 249 { 250 ksm_rmap_items--; 251 rmap_item->mm = NULL; /* debug safety */ 252 kmem_cache_free(rmap_item_cache, rmap_item); 253 } 254 255 static inline struct stable_node *alloc_stable_node(void) 256 { 257 return kmem_cache_alloc(stable_node_cache, GFP_KERNEL); 258 } 259 260 static inline void free_stable_node(struct stable_node *stable_node) 261 { 262 kmem_cache_free(stable_node_cache, stable_node); 263 } 264 265 static inline struct mm_slot *alloc_mm_slot(void) 266 { 267 if (!mm_slot_cache) /* initialization failed */ 268 return NULL; 269 return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); 270 } 271 272 static inline void free_mm_slot(struct mm_slot *mm_slot) 273 { 274 kmem_cache_free(mm_slot_cache, mm_slot); 275 } 276 277 static struct mm_slot *get_mm_slot(struct mm_struct *mm) 278 { 279 struct mm_slot *mm_slot; 280 struct hlist_head *bucket; 281 struct hlist_node *node; 282 283 bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)]; 284 hlist_for_each_entry(mm_slot, node, bucket, link) { 285 if (mm == mm_slot->mm) 286 return mm_slot; 287 } 288 return NULL; 289 } 290 291 static void insert_to_mm_slots_hash(struct mm_struct *mm, 292 struct mm_slot *mm_slot) 293 { 294 struct hlist_head *bucket; 295 296 bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)]; 297 mm_slot->mm = mm; 298 hlist_add_head(&mm_slot->link, bucket); 299 } 300 301 static inline int in_stable_tree(struct rmap_item *rmap_item) 302 { 303 return rmap_item->address & STABLE_FLAG; 304 } 305 306 /* 307 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's 308 * page tables after it has passed through ksm_exit() - which, if 
necessary, 309 * takes mmap_sem briefly to serialize against them. ksm_exit() does not set 310 * a special flag: they can just back out as soon as mm_users goes to zero. 311 * ksm_test_exit() is used throughout to make this test for exit: in some 312 * places for correctness, in some places just to avoid unnecessary work. 313 */ 314 static inline bool ksm_test_exit(struct mm_struct *mm) 315 { 316 return atomic_read(&mm->mm_users) == 0; 317 } 318 319 /* 320 * We use break_ksm to break COW on a ksm page: it's a stripped down 321 * 322 * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1) 323 * put_page(page); 324 * 325 * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, 326 * in case the application has unmapped and remapped mm,addr meanwhile. 327 * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP 328 * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. 329 */ 330 static int break_ksm(struct vm_area_struct *vma, unsigned long addr) 331 { 332 struct page *page; 333 int ret = 0; 334 335 do { 336 cond_resched(); 337 page = follow_page(vma, addr, FOLL_GET); 338 if (IS_ERR_OR_NULL(page)) 339 break; 340 if (PageKsm(page)) 341 ret = handle_mm_fault(vma->vm_mm, vma, addr, 342 FAULT_FLAG_WRITE); 343 else 344 ret = VM_FAULT_WRITE; 345 put_page(page); 346 } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM))); 347 /* 348 * We must loop because handle_mm_fault() may back out if there's 349 * any difficulty e.g. if pte accessed bit gets updated concurrently. 350 * 351 * VM_FAULT_WRITE is what we have been hoping for: it indicates that 352 * COW has been broken, even if the vma does not permit VM_WRITE; 353 * but note that a concurrent fault might break PageKsm for us. 354 * 355 * VM_FAULT_SIGBUS could occur if we race with truncation of the 356 * backing file, which also invalidates anonymous pages: that's 357 * okay, that truncation will have unmapped the PageKsm for us. 358 * 359 * VM_FAULT_OOM: at the time of writing (late July 2009), setting 360 * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the 361 * current task has TIF_MEMDIE set, and will be OOM killed on return 362 * to user; and ksmd, having no mm, would never be chosen for that. 363 * 364 * But if the mm is in a limited mem_cgroup, then the fault may fail 365 * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and 366 * even ksmd can fail in this way - though it's usually breaking ksm 367 * just to undo a merge it made a moment before, so unlikely to oom. 368 * 369 * That's a pity: we might therefore have more kernel pages allocated 370 * than we're counting as nodes in the stable tree; but ksm_do_scan 371 * will retry to break_cow on each pass, so should recover the page 372 * in due course. The important thing is to not let VM_MERGEABLE 373 * be cleared while any such pages might remain in the area. 374 */ 375 return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; 376 } 377 378 static void break_cow(struct rmap_item *rmap_item) 379 { 380 struct mm_struct *mm = rmap_item->mm; 381 unsigned long addr = rmap_item->address; 382 struct vm_area_struct *vma; 383 384 /* 385 * It is not an accident that whenever we want to break COW 386 * to undo, we also need to drop a reference to the anon_vma. 
	 */
	put_anon_vma(rmap_item->anon_vma);

	down_read(&mm->mmap_sem);
	if (ksm_test_exit(mm))
		goto out;
	vma = find_vma(mm, addr);
	if (!vma || vma->vm_start > addr)
		goto out;
	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
		goto out;
	break_ksm(vma, addr);
out:
	up_read(&mm->mmap_sem);
}

static struct page *page_trans_compound_anon(struct page *page)
{
	if (PageTransCompound(page)) {
		struct page *head = compound_trans_head(page);
		/*
		 * head may actually be split and freed from under
		 * us but it's ok here.
		 */
		if (PageAnon(head))
			return head;
	}
	return NULL;
}

static struct page *get_mergeable_page(struct rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	unsigned long addr = rmap_item->address;
	struct vm_area_struct *vma;
	struct page *page;

	down_read(&mm->mmap_sem);
	if (ksm_test_exit(mm))
		goto out;
	vma = find_vma(mm, addr);
	if (!vma || vma->vm_start > addr)
		goto out;
	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
		goto out;

	page = follow_page(vma, addr, FOLL_GET);
	if (IS_ERR_OR_NULL(page))
		goto out;
	if (PageAnon(page) || page_trans_compound_anon(page)) {
		flush_anon_page(vma, page, addr);
		flush_dcache_page(page);
	} else {
		put_page(page);
out:		page = NULL;
	}
	up_read(&mm->mmap_sem);
	return page;
}

static void remove_node_from_stable_tree(struct stable_node *stable_node)
{
	struct rmap_item *rmap_item;
	struct hlist_node *hlist;

	hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
		if (rmap_item->hlist.next)
			ksm_pages_sharing--;
		else
			ksm_pages_shared--;
		put_anon_vma(rmap_item->anon_vma);
		rmap_item->address &= PAGE_MASK;
		cond_resched();
	}

	rb_erase(&stable_node->node, &root_stable_tree);
	free_stable_node(stable_node);
}

/*
 * get_ksm_page: checks if the page indicated by the stable node
 * is still its ksm page, despite having held no reference to it.
 * If so, we can trust the content of the page, and it returns the
 * page itself; but if the page has now been zapped, remove the
 * stale node from the stable tree and return NULL.
 *
 * You would expect the stable_node to hold a reference to the ksm page.
 * But if it increments the page's count, swapping out has to wait for
 * ksmd to come around again before it can free the page, which may take
 * seconds or even minutes: much too unresponsive. So instead we use a
 * "keyhole reference": access to the ksm page from the stable node peeps
 * out through its keyhole to see if that page still holds the right key,
 * pointing back to this stable node. This relies on freeing a PageAnon
 * page to reset its page->mapping to NULL, and relies on no other use of
 * a page to put something that might look like our key in page->mapping.
 *
 * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
 * but this is different - made simpler by ksm_thread_mutex being held, but
 * interesting for assuming that no other use of the struct page could ever
 * put our expected_mapping into page->mapping (or a field of the union which
 * coincides with page->mapping). The RCU calls are not for KSM at all, but
 * to keep the page_count protocol described with page_cache_get_speculative.
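 *
 * Condensed to a sketch, the keyhole check performed below is:
 *
 *	expected_mapping = (void *)stable_node +
 *				(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
 *	if (page->mapping != expected_mapping)		it went stale
 *	if (!get_page_unless_zero(page))		it went stale
 *	if (page->mapping != expected_mapping)		stale: drop the ref
 *	otherwise					the ksm page is still ours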
489 * 490 * Note: it is possible that get_ksm_page() will return NULL one moment, 491 * then page the next, if the page is in between page_freeze_refs() and 492 * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page 493 * is on its way to being freed; but it is an anomaly to bear in mind. 494 */ 495 static struct page *get_ksm_page(struct stable_node *stable_node) 496 { 497 struct page *page; 498 void *expected_mapping; 499 500 page = pfn_to_page(stable_node->kpfn); 501 expected_mapping = (void *)stable_node + 502 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); 503 rcu_read_lock(); 504 if (page->mapping != expected_mapping) 505 goto stale; 506 if (!get_page_unless_zero(page)) 507 goto stale; 508 if (page->mapping != expected_mapping) { 509 put_page(page); 510 goto stale; 511 } 512 rcu_read_unlock(); 513 return page; 514 stale: 515 rcu_read_unlock(); 516 remove_node_from_stable_tree(stable_node); 517 return NULL; 518 } 519 520 /* 521 * Removing rmap_item from stable or unstable tree. 522 * This function will clean the information from the stable/unstable tree. 523 */ 524 static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) 525 { 526 if (rmap_item->address & STABLE_FLAG) { 527 struct stable_node *stable_node; 528 struct page *page; 529 530 stable_node = rmap_item->head; 531 page = get_ksm_page(stable_node); 532 if (!page) 533 goto out; 534 535 lock_page(page); 536 hlist_del(&rmap_item->hlist); 537 unlock_page(page); 538 put_page(page); 539 540 if (stable_node->hlist.first) 541 ksm_pages_sharing--; 542 else 543 ksm_pages_shared--; 544 545 put_anon_vma(rmap_item->anon_vma); 546 rmap_item->address &= PAGE_MASK; 547 548 } else if (rmap_item->address & UNSTABLE_FLAG) { 549 unsigned char age; 550 /* 551 * Usually ksmd can and must skip the rb_erase, because 552 * root_unstable_tree was already reset to RB_ROOT. 553 * But be careful when an mm is exiting: do the rb_erase 554 * if this rmap_item was inserted by this scan, rather 555 * than left over from before. 556 */ 557 age = (unsigned char)(ksm_scan.seqnr - rmap_item->address); 558 BUG_ON(age > 1); 559 if (!age) 560 rb_erase(&rmap_item->node, &root_unstable_tree); 561 562 ksm_pages_unshared--; 563 rmap_item->address &= PAGE_MASK; 564 } 565 out: 566 cond_resched(); /* we're called from many long loops */ 567 } 568 569 static void remove_trailing_rmap_items(struct mm_slot *mm_slot, 570 struct rmap_item **rmap_list) 571 { 572 while (*rmap_list) { 573 struct rmap_item *rmap_item = *rmap_list; 574 *rmap_list = rmap_item->rmap_list; 575 remove_rmap_item_from_tree(rmap_item); 576 free_rmap_item(rmap_item); 577 } 578 } 579 580 /* 581 * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather 582 * than check every pte of a given vma, the locking doesn't quite work for 583 * that - an rmap_item is assigned to the stable tree after inserting ksm 584 * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing 585 * rmap_items from parent to child at fork time (so as not to waste time 586 * if exit comes before the next scan reaches it). 587 * 588 * Similarly, although we'd like to remove rmap_items (so updating counts 589 * and freeing memory) when unmerging an area, it's easier to leave that 590 * to the next pass of ksmd - consider, for example, how ksmd might be 591 * in cmp_and_merge_page on one of the rmap_items we would be removing. 
 */
static int unmerge_ksm_pages(struct vm_area_struct *vma,
			     unsigned long start, unsigned long end)
{
	unsigned long addr;
	int err = 0;

	for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
		if (ksm_test_exit(vma->vm_mm))
			break;
		if (signal_pending(current))
			err = -ERESTARTSYS;
		else
			err = break_ksm(vma, addr);
	}
	return err;
}

#ifdef CONFIG_SYSFS
/*
 * Only called through the sysfs control interface:
 */
static int unmerge_and_remove_all_rmap_items(void)
{
	struct mm_slot *mm_slot;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	int err = 0;

	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
						struct mm_slot, mm_list);
	spin_unlock(&ksm_mmlist_lock);

	for (mm_slot = ksm_scan.mm_slot;
			mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
		mm = mm_slot->mm;
		down_read(&mm->mmap_sem);
		for (vma = mm->mmap; vma; vma = vma->vm_next) {
			if (ksm_test_exit(mm))
				break;
			if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
				continue;
			err = unmerge_ksm_pages(vma,
						vma->vm_start, vma->vm_end);
			if (err)
				goto error;
		}

		remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);

		spin_lock(&ksm_mmlist_lock);
		ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
						struct mm_slot, mm_list);
		if (ksm_test_exit(mm)) {
			hlist_del(&mm_slot->link);
			list_del(&mm_slot->mm_list);
			spin_unlock(&ksm_mmlist_lock);

			free_mm_slot(mm_slot);
			clear_bit(MMF_VM_MERGEABLE, &mm->flags);
			up_read(&mm->mmap_sem);
			mmdrop(mm);
		} else {
			spin_unlock(&ksm_mmlist_lock);
			up_read(&mm->mmap_sem);
		}
	}

	ksm_scan.seqnr = 0;
	return 0;

error:
	up_read(&mm->mmap_sem);
	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = &ksm_mm_head;
	spin_unlock(&ksm_mmlist_lock);
	return err;
}
#endif /* CONFIG_SYSFS */

static u32 calc_checksum(struct page *page)
{
	u32 checksum;
	void *addr = kmap_atomic(page, KM_USER0);
	checksum = jhash2(addr, PAGE_SIZE / 4, 17);
	kunmap_atomic(addr, KM_USER0);
	return checksum;
}

static int memcmp_pages(struct page *page1, struct page *page2)
{
	char *addr1, *addr2;
	int ret;

	addr1 = kmap_atomic(page1, KM_USER0);
	addr2 = kmap_atomic(page2, KM_USER1);
	ret = memcmp(addr1, addr2, PAGE_SIZE);
	kunmap_atomic(addr2, KM_USER1);
	kunmap_atomic(addr1, KM_USER0);
	return ret;
}

static inline int pages_identical(struct page *page1, struct page *page2)
{
	return !memcmp_pages(page1, page2);
}

static int write_protect_page(struct vm_area_struct *vma, struct page *page,
			      pte_t *orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long addr;
	pte_t *ptep;
	spinlock_t *ptl;
	int swapped;
	int err = -EFAULT;

	addr = page_address_in_vma(page, vma);
	if (addr == -EFAULT)
		goto out;

	BUG_ON(PageTransCompound(page));
	ptep = page_check_address(page, mm, addr, &ptl, 0);
	if (!ptep)
		goto out;

	if (pte_write(*ptep) || pte_dirty(*ptep)) {
		pte_t entry;

		swapped = PageSwapCache(page);
		flush_cache_page(vma, addr, page_to_pfn(page));
		/*
		 * Ok this is tricky: when get_user_pages_fast() runs, it
		 * doesn't take any lock, so the check we are about to make,
		 * comparing the page count against the map count, is racy:
		 * O_DIRECT can start right after the check.
		 * So we clear the pte and flush the TLB before the check;
		 * this assures us that no O_DIRECT can start after the
		 * check or in the middle of the check.
		 */
		entry = ptep_clear_flush(vma, addr, ptep);
		/*
		 * Check that no O_DIRECT or similar I/O is in progress on the
		 * page
		 */
		if (page_mapcount(page) + 1 + swapped != page_count(page)) {
			set_pte_at(mm, addr, ptep, entry);
			goto out_unlock;
		}
		if (pte_dirty(entry))
			set_page_dirty(page);
		entry = pte_mkclean(pte_wrprotect(entry));
		set_pte_at_notify(mm, addr, ptep, entry);
	}
	*orig_pte = *ptep;
	err = 0;

out_unlock:
	pte_unmap_unlock(ptep, ptl);
out:
	return err;
}

/**
 * replace_page - replace page in vma by new ksm page
 * @vma:      vma that holds the pte pointing to page
 * @page:     the page we are replacing by kpage
 * @kpage:    the ksm page we replace page by
 * @orig_pte: the original value of the pte
 *
 * Returns 0 on success, -EFAULT on failure.
 */
static int replace_page(struct vm_area_struct *vma, struct page *page,
			struct page *kpage, pte_t orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep;
	spinlock_t *ptl;
	unsigned long addr;
	int err = -EFAULT;

	addr = page_address_in_vma(page, vma);
	if (addr == -EFAULT)
		goto out;

	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		goto out;

	pud = pud_offset(pgd, addr);
	if (!pud_present(*pud))
		goto out;

	pmd = pmd_offset(pud, addr);
	BUG_ON(pmd_trans_huge(*pmd));
	if (!pmd_present(*pmd))
		goto out;

	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!pte_same(*ptep, orig_pte)) {
		pte_unmap_unlock(ptep, ptl);
		goto out;
	}

	get_page(kpage);
	page_add_anon_rmap(kpage, vma, addr);

	flush_cache_page(vma, addr, pte_pfn(*ptep));
	ptep_clear_flush(vma, addr, ptep);
	set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));

	page_remove_rmap(page);
	if (!page_mapped(page))
		try_to_free_swap(page);
	put_page(page);

	pte_unmap_unlock(ptep, ptl);
	err = 0;
out:
	return err;
}

static int page_trans_compound_anon_split(struct page *page)
{
	int ret = 0;
	struct page *transhuge_head = page_trans_compound_anon(page);
	if (transhuge_head) {
		/* Get the reference on the head to split it. */
		if (get_page_unless_zero(transhuge_head)) {
			/*
			 * Recheck we got the reference while the head
			 * was still anonymous.
			 */
			if (PageAnon(transhuge_head))
				ret = split_huge_page(transhuge_head);
			else
				/*
				 * Retry later if split_huge_page ran
				 * from under us.
				 */
				ret = 1;
			put_page(transhuge_head);
		} else
			/* Retry later if split_huge_page ran from under us. */
			ret = 1;
	}
	return ret;
}

/*
 * try_to_merge_one_page - take two pages and merge them into one
 * @vma: the vma that holds the pte pointing to page
 * @page: the PageAnon page that we want to replace with kpage
 * @kpage: the PageKsm page that we want to map instead of page,
 *         or NULL the first time when we want to use page as kpage.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
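 *
 * In outline (a sketch of the steps performed below, under the page lock):
 *
 *	trylock page			(a stable PageSwapCache is needed)
 *	write_protect_page(vma, page)	(clean + wrprotect the pte; give up
 *					 if the page count says I/O is live)
 *	if (!kpage)			mark page itself as a ksm page
 *	else if (pages_identical(page, kpage))
 *		replace_page(vma, page, kpage, orig_pte)
 *	unlock page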
853 */ 854 static int try_to_merge_one_page(struct vm_area_struct *vma, 855 struct page *page, struct page *kpage) 856 { 857 pte_t orig_pte = __pte(0); 858 int err = -EFAULT; 859 860 if (page == kpage) /* ksm page forked */ 861 return 0; 862 863 if (!(vma->vm_flags & VM_MERGEABLE)) 864 goto out; 865 if (PageTransCompound(page) && page_trans_compound_anon_split(page)) 866 goto out; 867 BUG_ON(PageTransCompound(page)); 868 if (!PageAnon(page)) 869 goto out; 870 871 /* 872 * We need the page lock to read a stable PageSwapCache in 873 * write_protect_page(). We use trylock_page() instead of 874 * lock_page() because we don't want to wait here - we 875 * prefer to continue scanning and merging different pages, 876 * then come back to this page when it is unlocked. 877 */ 878 if (!trylock_page(page)) 879 goto out; 880 /* 881 * If this anonymous page is mapped only here, its pte may need 882 * to be write-protected. If it's mapped elsewhere, all of its 883 * ptes are necessarily already write-protected. But in either 884 * case, we need to lock and check page_count is not raised. 885 */ 886 if (write_protect_page(vma, page, &orig_pte) == 0) { 887 if (!kpage) { 888 /* 889 * While we hold page lock, upgrade page from 890 * PageAnon+anon_vma to PageKsm+NULL stable_node: 891 * stable_tree_insert() will update stable_node. 892 */ 893 set_page_stable_node(page, NULL); 894 mark_page_accessed(page); 895 err = 0; 896 } else if (pages_identical(page, kpage)) 897 err = replace_page(vma, page, kpage, orig_pte); 898 } 899 900 if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { 901 munlock_vma_page(page); 902 if (!PageMlocked(kpage)) { 903 unlock_page(page); 904 lock_page(kpage); 905 mlock_vma_page(kpage); 906 page = kpage; /* for final unlock */ 907 } 908 } 909 910 unlock_page(page); 911 out: 912 return err; 913 } 914 915 /* 916 * try_to_merge_with_ksm_page - like try_to_merge_two_pages, 917 * but no new kernel page is allocated: kpage must already be a ksm page. 918 * 919 * This function returns 0 if the pages were merged, -EFAULT otherwise. 920 */ 921 static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, 922 struct page *page, struct page *kpage) 923 { 924 struct mm_struct *mm = rmap_item->mm; 925 struct vm_area_struct *vma; 926 int err = -EFAULT; 927 928 down_read(&mm->mmap_sem); 929 if (ksm_test_exit(mm)) 930 goto out; 931 vma = find_vma(mm, rmap_item->address); 932 if (!vma || vma->vm_start > rmap_item->address) 933 goto out; 934 935 err = try_to_merge_one_page(vma, page, kpage); 936 if (err) 937 goto out; 938 939 /* Must get reference to anon_vma while still holding mmap_sem */ 940 rmap_item->anon_vma = vma->anon_vma; 941 get_anon_vma(vma->anon_vma); 942 out: 943 up_read(&mm->mmap_sem); 944 return err; 945 } 946 947 /* 948 * try_to_merge_two_pages - take two identical pages and prepare them 949 * to be merged into one page. 950 * 951 * This function returns the kpage if we successfully merged two identical 952 * pages into one ksm page, NULL otherwise. 953 * 954 * Note that this function upgrades page to ksm page: if one of the pages 955 * is already a ksm page, try_to_merge_with_ksm_page should be used. 
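 *
 * In outline (sketching the function below):
 *
 *	try_to_merge_with_ksm_page(rmap_item, page, NULL)
 *		- upgrades the scanned page in place to a ksm page
 *	try_to_merge_with_ksm_page(tree_rmap_item, tree_page, page)
 *		- maps the unstable-tree page onto that ksm page
 *	if the second step fails, break_cow(rmap_item) undoes the first,
 *	since a ksm page with a single pte pointing to it is of no use.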
956 */ 957 static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, 958 struct page *page, 959 struct rmap_item *tree_rmap_item, 960 struct page *tree_page) 961 { 962 int err; 963 964 err = try_to_merge_with_ksm_page(rmap_item, page, NULL); 965 if (!err) { 966 err = try_to_merge_with_ksm_page(tree_rmap_item, 967 tree_page, page); 968 /* 969 * If that fails, we have a ksm page with only one pte 970 * pointing to it: so break it. 971 */ 972 if (err) 973 break_cow(rmap_item); 974 } 975 return err ? NULL : page; 976 } 977 978 /* 979 * stable_tree_search - search for page inside the stable tree 980 * 981 * This function checks if there is a page inside the stable tree 982 * with identical content to the page that we are scanning right now. 983 * 984 * This function returns the stable tree node of identical content if found, 985 * NULL otherwise. 986 */ 987 static struct page *stable_tree_search(struct page *page) 988 { 989 struct rb_node *node = root_stable_tree.rb_node; 990 struct stable_node *stable_node; 991 992 stable_node = page_stable_node(page); 993 if (stable_node) { /* ksm page forked */ 994 get_page(page); 995 return page; 996 } 997 998 while (node) { 999 struct page *tree_page; 1000 int ret; 1001 1002 cond_resched(); 1003 stable_node = rb_entry(node, struct stable_node, node); 1004 tree_page = get_ksm_page(stable_node); 1005 if (!tree_page) 1006 return NULL; 1007 1008 ret = memcmp_pages(page, tree_page); 1009 1010 if (ret < 0) { 1011 put_page(tree_page); 1012 node = node->rb_left; 1013 } else if (ret > 0) { 1014 put_page(tree_page); 1015 node = node->rb_right; 1016 } else 1017 return tree_page; 1018 } 1019 1020 return NULL; 1021 } 1022 1023 /* 1024 * stable_tree_insert - insert rmap_item pointing to new ksm page 1025 * into the stable tree. 1026 * 1027 * This function returns the stable tree node just allocated on success, 1028 * NULL otherwise. 1029 */ 1030 static struct stable_node *stable_tree_insert(struct page *kpage) 1031 { 1032 struct rb_node **new = &root_stable_tree.rb_node; 1033 struct rb_node *parent = NULL; 1034 struct stable_node *stable_node; 1035 1036 while (*new) { 1037 struct page *tree_page; 1038 int ret; 1039 1040 cond_resched(); 1041 stable_node = rb_entry(*new, struct stable_node, node); 1042 tree_page = get_ksm_page(stable_node); 1043 if (!tree_page) 1044 return NULL; 1045 1046 ret = memcmp_pages(kpage, tree_page); 1047 put_page(tree_page); 1048 1049 parent = *new; 1050 if (ret < 0) 1051 new = &parent->rb_left; 1052 else if (ret > 0) 1053 new = &parent->rb_right; 1054 else { 1055 /* 1056 * It is not a bug that stable_tree_search() didn't 1057 * find this node: because at that time our page was 1058 * not yet write-protected, so may have changed since. 1059 */ 1060 return NULL; 1061 } 1062 } 1063 1064 stable_node = alloc_stable_node(); 1065 if (!stable_node) 1066 return NULL; 1067 1068 rb_link_node(&stable_node->node, parent, new); 1069 rb_insert_color(&stable_node->node, &root_stable_tree); 1070 1071 INIT_HLIST_HEAD(&stable_node->hlist); 1072 1073 stable_node->kpfn = page_to_pfn(kpage); 1074 set_page_stable_node(kpage, stable_node); 1075 1076 return stable_node; 1077 } 1078 1079 /* 1080 * unstable_tree_search_insert - search for identical page, 1081 * else insert rmap_item into the unstable tree. 1082 * 1083 * This function searches for a page in the unstable tree identical to the 1084 * page currently being scanned; and if no identical page is found in the 1085 * tree, we insert rmap_item as a new object into the unstable tree. 
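 *
 * The walk is the usual rbtree search-or-insert pattern, roughly (a sketch,
 * not the exact code below):
 *
 *	new = &root_unstable_tree.rb_node;
 *	while (*new) {
 *		parent = *new;
 *		ret = memcmp_pages(page, tree_page);
 *		if (ret == 0)
 *			return the matching tree_rmap_item;
 *		new = ret < 0 ? &parent->rb_left : &parent->rb_right;
 *	}
 *	rb_link_node(&rmap_item->node, parent, new);
 *	rb_insert_color(&rmap_item->node, &root_unstable_tree);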
1086 * 1087 * This function returns pointer to rmap_item found to be identical 1088 * to the currently scanned page, NULL otherwise. 1089 * 1090 * This function does both searching and inserting, because they share 1091 * the same walking algorithm in an rbtree. 1092 */ 1093 static 1094 struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, 1095 struct page *page, 1096 struct page **tree_pagep) 1097 1098 { 1099 struct rb_node **new = &root_unstable_tree.rb_node; 1100 struct rb_node *parent = NULL; 1101 1102 while (*new) { 1103 struct rmap_item *tree_rmap_item; 1104 struct page *tree_page; 1105 int ret; 1106 1107 cond_resched(); 1108 tree_rmap_item = rb_entry(*new, struct rmap_item, node); 1109 tree_page = get_mergeable_page(tree_rmap_item); 1110 if (IS_ERR_OR_NULL(tree_page)) 1111 return NULL; 1112 1113 /* 1114 * Don't substitute a ksm page for a forked page. 1115 */ 1116 if (page == tree_page) { 1117 put_page(tree_page); 1118 return NULL; 1119 } 1120 1121 ret = memcmp_pages(page, tree_page); 1122 1123 parent = *new; 1124 if (ret < 0) { 1125 put_page(tree_page); 1126 new = &parent->rb_left; 1127 } else if (ret > 0) { 1128 put_page(tree_page); 1129 new = &parent->rb_right; 1130 } else { 1131 *tree_pagep = tree_page; 1132 return tree_rmap_item; 1133 } 1134 } 1135 1136 rmap_item->address |= UNSTABLE_FLAG; 1137 rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); 1138 rb_link_node(&rmap_item->node, parent, new); 1139 rb_insert_color(&rmap_item->node, &root_unstable_tree); 1140 1141 ksm_pages_unshared++; 1142 return NULL; 1143 } 1144 1145 /* 1146 * stable_tree_append - add another rmap_item to the linked list of 1147 * rmap_items hanging off a given node of the stable tree, all sharing 1148 * the same ksm page. 1149 */ 1150 static void stable_tree_append(struct rmap_item *rmap_item, 1151 struct stable_node *stable_node) 1152 { 1153 rmap_item->head = stable_node; 1154 rmap_item->address |= STABLE_FLAG; 1155 hlist_add_head(&rmap_item->hlist, &stable_node->hlist); 1156 1157 if (rmap_item->hlist.next) 1158 ksm_pages_sharing++; 1159 else 1160 ksm_pages_shared++; 1161 } 1162 1163 /* 1164 * cmp_and_merge_page - first see if page can be merged into the stable tree; 1165 * if not, compare checksum to previous and if it's the same, see if page can 1166 * be inserted into the unstable tree, or merged with a page already there and 1167 * both transferred to the stable tree. 1168 * 1169 * @page: the page that we are searching identical page to. 1170 * @rmap_item: the reverse mapping into the virtual address of this page 1171 */ 1172 static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) 1173 { 1174 struct rmap_item *tree_rmap_item; 1175 struct page *tree_page = NULL; 1176 struct stable_node *stable_node; 1177 struct page *kpage; 1178 unsigned int checksum; 1179 int err; 1180 1181 remove_rmap_item_from_tree(rmap_item); 1182 1183 /* We first start with searching the page inside the stable tree */ 1184 kpage = stable_tree_search(page); 1185 if (kpage) { 1186 err = try_to_merge_with_ksm_page(rmap_item, page, kpage); 1187 if (!err) { 1188 /* 1189 * The page was successfully merged: 1190 * add its rmap_item to the stable tree. 
1191 */ 1192 lock_page(kpage); 1193 stable_tree_append(rmap_item, page_stable_node(kpage)); 1194 unlock_page(kpage); 1195 } 1196 put_page(kpage); 1197 return; 1198 } 1199 1200 /* 1201 * If the hash value of the page has changed from the last time 1202 * we calculated it, this page is changing frequently: therefore we 1203 * don't want to insert it in the unstable tree, and we don't want 1204 * to waste our time searching for something identical to it there. 1205 */ 1206 checksum = calc_checksum(page); 1207 if (rmap_item->oldchecksum != checksum) { 1208 rmap_item->oldchecksum = checksum; 1209 return; 1210 } 1211 1212 tree_rmap_item = 1213 unstable_tree_search_insert(rmap_item, page, &tree_page); 1214 if (tree_rmap_item) { 1215 kpage = try_to_merge_two_pages(rmap_item, page, 1216 tree_rmap_item, tree_page); 1217 put_page(tree_page); 1218 /* 1219 * As soon as we merge this page, we want to remove the 1220 * rmap_item of the page we have merged with from the unstable 1221 * tree, and insert it instead as new node in the stable tree. 1222 */ 1223 if (kpage) { 1224 remove_rmap_item_from_tree(tree_rmap_item); 1225 1226 lock_page(kpage); 1227 stable_node = stable_tree_insert(kpage); 1228 if (stable_node) { 1229 stable_tree_append(tree_rmap_item, stable_node); 1230 stable_tree_append(rmap_item, stable_node); 1231 } 1232 unlock_page(kpage); 1233 1234 /* 1235 * If we fail to insert the page into the stable tree, 1236 * we will have 2 virtual addresses that are pointing 1237 * to a ksm page left outside the stable tree, 1238 * in which case we need to break_cow on both. 1239 */ 1240 if (!stable_node) { 1241 break_cow(tree_rmap_item); 1242 break_cow(rmap_item); 1243 } 1244 } 1245 } 1246 } 1247 1248 static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, 1249 struct rmap_item **rmap_list, 1250 unsigned long addr) 1251 { 1252 struct rmap_item *rmap_item; 1253 1254 while (*rmap_list) { 1255 rmap_item = *rmap_list; 1256 if ((rmap_item->address & PAGE_MASK) == addr) 1257 return rmap_item; 1258 if (rmap_item->address > addr) 1259 break; 1260 *rmap_list = rmap_item->rmap_list; 1261 remove_rmap_item_from_tree(rmap_item); 1262 free_rmap_item(rmap_item); 1263 } 1264 1265 rmap_item = alloc_rmap_item(); 1266 if (rmap_item) { 1267 /* It has already been zeroed */ 1268 rmap_item->mm = mm_slot->mm; 1269 rmap_item->address = addr; 1270 rmap_item->rmap_list = *rmap_list; 1271 *rmap_list = rmap_item; 1272 } 1273 return rmap_item; 1274 } 1275 1276 static struct rmap_item *scan_get_next_rmap_item(struct page **page) 1277 { 1278 struct mm_struct *mm; 1279 struct mm_slot *slot; 1280 struct vm_area_struct *vma; 1281 struct rmap_item *rmap_item; 1282 1283 if (list_empty(&ksm_mm_head.mm_list)) 1284 return NULL; 1285 1286 slot = ksm_scan.mm_slot; 1287 if (slot == &ksm_mm_head) { 1288 /* 1289 * A number of pages can hang around indefinitely on per-cpu 1290 * pagevecs, raised page count preventing write_protect_page 1291 * from merging them. Though it doesn't really matter much, 1292 * it is puzzling to see some stuck in pages_volatile until 1293 * other activity jostles them out, and they also prevented 1294 * LTP's KSM test from succeeding deterministically; so drain 1295 * them here (here rather than on entry to ksm_do_scan(), 1296 * so we don't IPI too often when pages_to_scan is set low). 
1297 */ 1298 lru_add_drain_all(); 1299 1300 root_unstable_tree = RB_ROOT; 1301 1302 spin_lock(&ksm_mmlist_lock); 1303 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); 1304 ksm_scan.mm_slot = slot; 1305 spin_unlock(&ksm_mmlist_lock); 1306 /* 1307 * Although we tested list_empty() above, a racing __ksm_exit 1308 * of the last mm on the list may have removed it since then. 1309 */ 1310 if (slot == &ksm_mm_head) 1311 return NULL; 1312 next_mm: 1313 ksm_scan.address = 0; 1314 ksm_scan.rmap_list = &slot->rmap_list; 1315 } 1316 1317 mm = slot->mm; 1318 down_read(&mm->mmap_sem); 1319 if (ksm_test_exit(mm)) 1320 vma = NULL; 1321 else 1322 vma = find_vma(mm, ksm_scan.address); 1323 1324 for (; vma; vma = vma->vm_next) { 1325 if (!(vma->vm_flags & VM_MERGEABLE)) 1326 continue; 1327 if (ksm_scan.address < vma->vm_start) 1328 ksm_scan.address = vma->vm_start; 1329 if (!vma->anon_vma) 1330 ksm_scan.address = vma->vm_end; 1331 1332 while (ksm_scan.address < vma->vm_end) { 1333 if (ksm_test_exit(mm)) 1334 break; 1335 *page = follow_page(vma, ksm_scan.address, FOLL_GET); 1336 if (IS_ERR_OR_NULL(*page)) { 1337 ksm_scan.address += PAGE_SIZE; 1338 cond_resched(); 1339 continue; 1340 } 1341 if (PageAnon(*page) || 1342 page_trans_compound_anon(*page)) { 1343 flush_anon_page(vma, *page, ksm_scan.address); 1344 flush_dcache_page(*page); 1345 rmap_item = get_next_rmap_item(slot, 1346 ksm_scan.rmap_list, ksm_scan.address); 1347 if (rmap_item) { 1348 ksm_scan.rmap_list = 1349 &rmap_item->rmap_list; 1350 ksm_scan.address += PAGE_SIZE; 1351 } else 1352 put_page(*page); 1353 up_read(&mm->mmap_sem); 1354 return rmap_item; 1355 } 1356 put_page(*page); 1357 ksm_scan.address += PAGE_SIZE; 1358 cond_resched(); 1359 } 1360 } 1361 1362 if (ksm_test_exit(mm)) { 1363 ksm_scan.address = 0; 1364 ksm_scan.rmap_list = &slot->rmap_list; 1365 } 1366 /* 1367 * Nuke all the rmap_items that are above this current rmap: 1368 * because there were no VM_MERGEABLE vmas with such addresses. 1369 */ 1370 remove_trailing_rmap_items(slot, ksm_scan.rmap_list); 1371 1372 spin_lock(&ksm_mmlist_lock); 1373 ksm_scan.mm_slot = list_entry(slot->mm_list.next, 1374 struct mm_slot, mm_list); 1375 if (ksm_scan.address == 0) { 1376 /* 1377 * We've completed a full scan of all vmas, holding mmap_sem 1378 * throughout, and found no VM_MERGEABLE: so do the same as 1379 * __ksm_exit does to remove this mm from all our lists now. 1380 * This applies either when cleaning up after __ksm_exit 1381 * (but beware: we can reach here even before __ksm_exit), 1382 * or when all VM_MERGEABLE areas have been unmapped (and 1383 * mmap_sem then protects against race with MADV_MERGEABLE). 1384 */ 1385 hlist_del(&slot->link); 1386 list_del(&slot->mm_list); 1387 spin_unlock(&ksm_mmlist_lock); 1388 1389 free_mm_slot(slot); 1390 clear_bit(MMF_VM_MERGEABLE, &mm->flags); 1391 up_read(&mm->mmap_sem); 1392 mmdrop(mm); 1393 } else { 1394 spin_unlock(&ksm_mmlist_lock); 1395 up_read(&mm->mmap_sem); 1396 } 1397 1398 /* Repeat until we've completed scanning the whole list */ 1399 slot = ksm_scan.mm_slot; 1400 if (slot != &ksm_mm_head) 1401 goto next_mm; 1402 1403 ksm_scan.seqnr++; 1404 return NULL; 1405 } 1406 1407 /** 1408 * ksm_do_scan - the ksm scanner main worker function. 1409 * @scan_npages - number of pages we want to scan before we return. 
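 *
 * Together with ksm_thread_sleep_millisecs this bounds ksmd's scan rate:
 * with the defaults above (pages_to_scan 100, sleep 20ms) that is at most
 * 100 * 1000/20 = 5000 pages examined per second, i.e. roughly 20MB/s of
 * memory with 4KB pages.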
1410 */ 1411 static void ksm_do_scan(unsigned int scan_npages) 1412 { 1413 struct rmap_item *rmap_item; 1414 struct page *uninitialized_var(page); 1415 1416 while (scan_npages-- && likely(!freezing(current))) { 1417 cond_resched(); 1418 rmap_item = scan_get_next_rmap_item(&page); 1419 if (!rmap_item) 1420 return; 1421 if (!PageKsm(page) || !in_stable_tree(rmap_item)) 1422 cmp_and_merge_page(page, rmap_item); 1423 put_page(page); 1424 } 1425 } 1426 1427 static int ksmd_should_run(void) 1428 { 1429 return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list); 1430 } 1431 1432 static int ksm_scan_thread(void *nothing) 1433 { 1434 set_freezable(); 1435 set_user_nice(current, 5); 1436 1437 while (!kthread_should_stop()) { 1438 mutex_lock(&ksm_thread_mutex); 1439 if (ksmd_should_run()) 1440 ksm_do_scan(ksm_thread_pages_to_scan); 1441 mutex_unlock(&ksm_thread_mutex); 1442 1443 try_to_freeze(); 1444 1445 if (ksmd_should_run()) { 1446 schedule_timeout_interruptible( 1447 msecs_to_jiffies(ksm_thread_sleep_millisecs)); 1448 } else { 1449 wait_event_freezable(ksm_thread_wait, 1450 ksmd_should_run() || kthread_should_stop()); 1451 } 1452 } 1453 return 0; 1454 } 1455 1456 int ksm_madvise(struct vm_area_struct *vma, unsigned long start, 1457 unsigned long end, int advice, unsigned long *vm_flags) 1458 { 1459 struct mm_struct *mm = vma->vm_mm; 1460 int err; 1461 1462 switch (advice) { 1463 case MADV_MERGEABLE: 1464 /* 1465 * Be somewhat over-protective for now! 1466 */ 1467 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | 1468 VM_PFNMAP | VM_IO | VM_DONTEXPAND | 1469 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | 1470 VM_NONLINEAR | VM_MIXEDMAP | VM_SAO)) 1471 return 0; /* just ignore the advice */ 1472 1473 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { 1474 err = __ksm_enter(mm); 1475 if (err) 1476 return err; 1477 } 1478 1479 *vm_flags |= VM_MERGEABLE; 1480 break; 1481 1482 case MADV_UNMERGEABLE: 1483 if (!(*vm_flags & VM_MERGEABLE)) 1484 return 0; /* just ignore the advice */ 1485 1486 if (vma->anon_vma) { 1487 err = unmerge_ksm_pages(vma, start, end); 1488 if (err) 1489 return err; 1490 } 1491 1492 *vm_flags &= ~VM_MERGEABLE; 1493 break; 1494 } 1495 1496 return 0; 1497 } 1498 1499 int __ksm_enter(struct mm_struct *mm) 1500 { 1501 struct mm_slot *mm_slot; 1502 int needs_wakeup; 1503 1504 mm_slot = alloc_mm_slot(); 1505 if (!mm_slot) 1506 return -ENOMEM; 1507 1508 /* Check ksm_run too? Would need tighter locking */ 1509 needs_wakeup = list_empty(&ksm_mm_head.mm_list); 1510 1511 spin_lock(&ksm_mmlist_lock); 1512 insert_to_mm_slots_hash(mm, mm_slot); 1513 /* 1514 * Insert just behind the scanning cursor, to let the area settle 1515 * down a little; when fork is followed by immediate exec, we don't 1516 * want ksmd to waste time setting up and tearing down an rmap_list. 1517 */ 1518 list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); 1519 spin_unlock(&ksm_mmlist_lock); 1520 1521 set_bit(MMF_VM_MERGEABLE, &mm->flags); 1522 atomic_inc(&mm->mm_count); 1523 1524 if (needs_wakeup) 1525 wake_up_interruptible(&ksm_thread_wait); 1526 1527 return 0; 1528 } 1529 1530 void __ksm_exit(struct mm_struct *mm) 1531 { 1532 struct mm_slot *mm_slot; 1533 int easy_to_free = 0; 1534 1535 /* 1536 * This process is exiting: if it's straightforward (as is the 1537 * case when ksmd was never running), free mm_slot immediately. 
1538 * But if it's at the cursor or has rmap_items linked to it, use 1539 * mmap_sem to synchronize with any break_cows before pagetables 1540 * are freed, and leave the mm_slot on the list for ksmd to free. 1541 * Beware: ksm may already have noticed it exiting and freed the slot. 1542 */ 1543 1544 spin_lock(&ksm_mmlist_lock); 1545 mm_slot = get_mm_slot(mm); 1546 if (mm_slot && ksm_scan.mm_slot != mm_slot) { 1547 if (!mm_slot->rmap_list) { 1548 hlist_del(&mm_slot->link); 1549 list_del(&mm_slot->mm_list); 1550 easy_to_free = 1; 1551 } else { 1552 list_move(&mm_slot->mm_list, 1553 &ksm_scan.mm_slot->mm_list); 1554 } 1555 } 1556 spin_unlock(&ksm_mmlist_lock); 1557 1558 if (easy_to_free) { 1559 free_mm_slot(mm_slot); 1560 clear_bit(MMF_VM_MERGEABLE, &mm->flags); 1561 mmdrop(mm); 1562 } else if (mm_slot) { 1563 down_write(&mm->mmap_sem); 1564 up_write(&mm->mmap_sem); 1565 } 1566 } 1567 1568 struct page *ksm_does_need_to_copy(struct page *page, 1569 struct vm_area_struct *vma, unsigned long address) 1570 { 1571 struct page *new_page; 1572 1573 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 1574 if (new_page) { 1575 /* 1576 * The memcg-specific accounting when moving 1577 * pages around the LRU lists relies on the 1578 * page's owner (memcg) to be valid. Usually, 1579 * pages are assigned to a new owner before 1580 * being put on the LRU list, but since this 1581 * is not the case here, the stale owner from 1582 * a previous allocation cycle must be reset. 1583 */ 1584 mem_cgroup_reset_owner(new_page); 1585 copy_user_highpage(new_page, page, address, vma); 1586 1587 SetPageDirty(new_page); 1588 __SetPageUptodate(new_page); 1589 SetPageSwapBacked(new_page); 1590 __set_page_locked(new_page); 1591 1592 if (page_evictable(new_page, vma)) 1593 lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); 1594 else 1595 add_page_to_unevictable_list(new_page); 1596 } 1597 1598 return new_page; 1599 } 1600 1601 int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, 1602 unsigned long *vm_flags) 1603 { 1604 struct stable_node *stable_node; 1605 struct rmap_item *rmap_item; 1606 struct hlist_node *hlist; 1607 unsigned int mapcount = page_mapcount(page); 1608 int referenced = 0; 1609 int search_new_forks = 0; 1610 1611 VM_BUG_ON(!PageKsm(page)); 1612 VM_BUG_ON(!PageLocked(page)); 1613 1614 stable_node = page_stable_node(page); 1615 if (!stable_node) 1616 return 0; 1617 again: 1618 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { 1619 struct anon_vma *anon_vma = rmap_item->anon_vma; 1620 struct anon_vma_chain *vmac; 1621 struct vm_area_struct *vma; 1622 1623 anon_vma_lock(anon_vma); 1624 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { 1625 vma = vmac->vma; 1626 if (rmap_item->address < vma->vm_start || 1627 rmap_item->address >= vma->vm_end) 1628 continue; 1629 /* 1630 * Initially we examine only the vma which covers this 1631 * rmap_item; but later, if there is still work to do, 1632 * we examine covering vmas in other mms: in case they 1633 * were forked from the original since ksmd passed. 
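 *
 * Concretely, the test below makes two passes over the hlist:
 *	pass 0 (search_new_forks == 0): only the vma whose vm_mm matches
 *		the rmap_item's own mm is tried;
 *	pass 1 (search_new_forks == 1): only the other mms hanging off the
 *		same anon_vma - forked children - are tried.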
1634 */ 1635 if ((rmap_item->mm == vma->vm_mm) == search_new_forks) 1636 continue; 1637 1638 if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) 1639 continue; 1640 1641 referenced += page_referenced_one(page, vma, 1642 rmap_item->address, &mapcount, vm_flags); 1643 if (!search_new_forks || !mapcount) 1644 break; 1645 } 1646 anon_vma_unlock(anon_vma); 1647 if (!mapcount) 1648 goto out; 1649 } 1650 if (!search_new_forks++) 1651 goto again; 1652 out: 1653 return referenced; 1654 } 1655 1656 int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) 1657 { 1658 struct stable_node *stable_node; 1659 struct hlist_node *hlist; 1660 struct rmap_item *rmap_item; 1661 int ret = SWAP_AGAIN; 1662 int search_new_forks = 0; 1663 1664 VM_BUG_ON(!PageKsm(page)); 1665 VM_BUG_ON(!PageLocked(page)); 1666 1667 stable_node = page_stable_node(page); 1668 if (!stable_node) 1669 return SWAP_FAIL; 1670 again: 1671 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { 1672 struct anon_vma *anon_vma = rmap_item->anon_vma; 1673 struct anon_vma_chain *vmac; 1674 struct vm_area_struct *vma; 1675 1676 anon_vma_lock(anon_vma); 1677 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { 1678 vma = vmac->vma; 1679 if (rmap_item->address < vma->vm_start || 1680 rmap_item->address >= vma->vm_end) 1681 continue; 1682 /* 1683 * Initially we examine only the vma which covers this 1684 * rmap_item; but later, if there is still work to do, 1685 * we examine covering vmas in other mms: in case they 1686 * were forked from the original since ksmd passed. 1687 */ 1688 if ((rmap_item->mm == vma->vm_mm) == search_new_forks) 1689 continue; 1690 1691 ret = try_to_unmap_one(page, vma, 1692 rmap_item->address, flags); 1693 if (ret != SWAP_AGAIN || !page_mapped(page)) { 1694 anon_vma_unlock(anon_vma); 1695 goto out; 1696 } 1697 } 1698 anon_vma_unlock(anon_vma); 1699 } 1700 if (!search_new_forks++) 1701 goto again; 1702 out: 1703 return ret; 1704 } 1705 1706 #ifdef CONFIG_MIGRATION 1707 int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, 1708 struct vm_area_struct *, unsigned long, void *), void *arg) 1709 { 1710 struct stable_node *stable_node; 1711 struct hlist_node *hlist; 1712 struct rmap_item *rmap_item; 1713 int ret = SWAP_AGAIN; 1714 int search_new_forks = 0; 1715 1716 VM_BUG_ON(!PageKsm(page)); 1717 VM_BUG_ON(!PageLocked(page)); 1718 1719 stable_node = page_stable_node(page); 1720 if (!stable_node) 1721 return ret; 1722 again: 1723 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { 1724 struct anon_vma *anon_vma = rmap_item->anon_vma; 1725 struct anon_vma_chain *vmac; 1726 struct vm_area_struct *vma; 1727 1728 anon_vma_lock(anon_vma); 1729 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { 1730 vma = vmac->vma; 1731 if (rmap_item->address < vma->vm_start || 1732 rmap_item->address >= vma->vm_end) 1733 continue; 1734 /* 1735 * Initially we examine only the vma which covers this 1736 * rmap_item; but later, if there is still work to do, 1737 * we examine covering vmas in other mms: in case they 1738 * were forked from the original since ksmd passed. 
1739 */ 1740 if ((rmap_item->mm == vma->vm_mm) == search_new_forks) 1741 continue; 1742 1743 ret = rmap_one(page, vma, rmap_item->address, arg); 1744 if (ret != SWAP_AGAIN) { 1745 anon_vma_unlock(anon_vma); 1746 goto out; 1747 } 1748 } 1749 anon_vma_unlock(anon_vma); 1750 } 1751 if (!search_new_forks++) 1752 goto again; 1753 out: 1754 return ret; 1755 } 1756 1757 void ksm_migrate_page(struct page *newpage, struct page *oldpage) 1758 { 1759 struct stable_node *stable_node; 1760 1761 VM_BUG_ON(!PageLocked(oldpage)); 1762 VM_BUG_ON(!PageLocked(newpage)); 1763 VM_BUG_ON(newpage->mapping != oldpage->mapping); 1764 1765 stable_node = page_stable_node(newpage); 1766 if (stable_node) { 1767 VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); 1768 stable_node->kpfn = page_to_pfn(newpage); 1769 } 1770 } 1771 #endif /* CONFIG_MIGRATION */ 1772 1773 #ifdef CONFIG_MEMORY_HOTREMOVE 1774 static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn, 1775 unsigned long end_pfn) 1776 { 1777 struct rb_node *node; 1778 1779 for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) { 1780 struct stable_node *stable_node; 1781 1782 stable_node = rb_entry(node, struct stable_node, node); 1783 if (stable_node->kpfn >= start_pfn && 1784 stable_node->kpfn < end_pfn) 1785 return stable_node; 1786 } 1787 return NULL; 1788 } 1789 1790 static int ksm_memory_callback(struct notifier_block *self, 1791 unsigned long action, void *arg) 1792 { 1793 struct memory_notify *mn = arg; 1794 struct stable_node *stable_node; 1795 1796 switch (action) { 1797 case MEM_GOING_OFFLINE: 1798 /* 1799 * Keep it very simple for now: just lock out ksmd and 1800 * MADV_UNMERGEABLE while any memory is going offline. 1801 * mutex_lock_nested() is necessary because lockdep was alarmed 1802 * that here we take ksm_thread_mutex inside notifier chain 1803 * mutex, and later take notifier chain mutex inside 1804 * ksm_thread_mutex to unlock it. But that's safe because both 1805 * are inside mem_hotplug_mutex. 1806 */ 1807 mutex_lock_nested(&ksm_thread_mutex, SINGLE_DEPTH_NESTING); 1808 break; 1809 1810 case MEM_OFFLINE: 1811 /* 1812 * Most of the work is done by page migration; but there might 1813 * be a few stable_nodes left over, still pointing to struct 1814 * pages which have been offlined: prune those from the tree. 1815 */ 1816 while ((stable_node = ksm_check_stable_tree(mn->start_pfn, 1817 mn->start_pfn + mn->nr_pages)) != NULL) 1818 remove_node_from_stable_tree(stable_node); 1819 /* fallthrough */ 1820 1821 case MEM_CANCEL_OFFLINE: 1822 mutex_unlock(&ksm_thread_mutex); 1823 break; 1824 } 1825 return NOTIFY_OK; 1826 } 1827 #endif /* CONFIG_MEMORY_HOTREMOVE */ 1828 1829 #ifdef CONFIG_SYSFS 1830 /* 1831 * This all compiles without CONFIG_SYSFS, but is a waste of space. 
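 *
 * The attributes defined below appear under /sys/kernel/mm/ksm/ :
 *
 *	sleep_millisecs, pages_to_scan, run	- tunables read by ksmd
 *	pages_shared	- ksm pages in use (stable tree nodes)
 *	pages_sharing	- how many more sites are sharing them
 *	pages_unshared	- unique pages checked into the unstable tree
 *	pages_volatile	- remaining rmap_items: pages changing too fast
 *	full_scans	- how many times all mergeable areas have been scanned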
1832 */ 1833 1834 #define KSM_ATTR_RO(_name) \ 1835 static struct kobj_attribute _name##_attr = __ATTR_RO(_name) 1836 #define KSM_ATTR(_name) \ 1837 static struct kobj_attribute _name##_attr = \ 1838 __ATTR(_name, 0644, _name##_show, _name##_store) 1839 1840 static ssize_t sleep_millisecs_show(struct kobject *kobj, 1841 struct kobj_attribute *attr, char *buf) 1842 { 1843 return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs); 1844 } 1845 1846 static ssize_t sleep_millisecs_store(struct kobject *kobj, 1847 struct kobj_attribute *attr, 1848 const char *buf, size_t count) 1849 { 1850 unsigned long msecs; 1851 int err; 1852 1853 err = strict_strtoul(buf, 10, &msecs); 1854 if (err || msecs > UINT_MAX) 1855 return -EINVAL; 1856 1857 ksm_thread_sleep_millisecs = msecs; 1858 1859 return count; 1860 } 1861 KSM_ATTR(sleep_millisecs); 1862 1863 static ssize_t pages_to_scan_show(struct kobject *kobj, 1864 struct kobj_attribute *attr, char *buf) 1865 { 1866 return sprintf(buf, "%u\n", ksm_thread_pages_to_scan); 1867 } 1868 1869 static ssize_t pages_to_scan_store(struct kobject *kobj, 1870 struct kobj_attribute *attr, 1871 const char *buf, size_t count) 1872 { 1873 int err; 1874 unsigned long nr_pages; 1875 1876 err = strict_strtoul(buf, 10, &nr_pages); 1877 if (err || nr_pages > UINT_MAX) 1878 return -EINVAL; 1879 1880 ksm_thread_pages_to_scan = nr_pages; 1881 1882 return count; 1883 } 1884 KSM_ATTR(pages_to_scan); 1885 1886 static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, 1887 char *buf) 1888 { 1889 return sprintf(buf, "%u\n", ksm_run); 1890 } 1891 1892 static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, 1893 const char *buf, size_t count) 1894 { 1895 int err; 1896 unsigned long flags; 1897 1898 err = strict_strtoul(buf, 10, &flags); 1899 if (err || flags > UINT_MAX) 1900 return -EINVAL; 1901 if (flags > KSM_RUN_UNMERGE) 1902 return -EINVAL; 1903 1904 /* 1905 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. 1906 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, 1907 * breaking COW to free the pages_shared (but leaves mm_slots 1908 * on the list for when ksmd may be set running again). 
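 *
 * For example (an illustrative userspace sketch, not part of this file),
 * an application opts its memory in with madvise() and an administrator
 * then starts ksmd through sysfs:
 *
 *	madvise(addr, length, MADV_MERGEABLE);
 *
 *	fd = open("/sys/kernel/mm/ksm/run", O_WRONLY);
 *	write(fd, "1", 1);	(the same effect as: echo 1 > .../run)
 *
 * Writing "0" stops the scanner but leaves merged pages shared; writing
 * "2" stops it and unmerges everything, as described above.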
1909 */ 1910 1911 mutex_lock(&ksm_thread_mutex); 1912 if (ksm_run != flags) { 1913 ksm_run = flags; 1914 if (flags & KSM_RUN_UNMERGE) { 1915 int oom_score_adj; 1916 1917 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); 1918 err = unmerge_and_remove_all_rmap_items(); 1919 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, 1920 oom_score_adj); 1921 if (err) { 1922 ksm_run = KSM_RUN_STOP; 1923 count = err; 1924 } 1925 } 1926 } 1927 mutex_unlock(&ksm_thread_mutex); 1928 1929 if (flags & KSM_RUN_MERGE) 1930 wake_up_interruptible(&ksm_thread_wait); 1931 1932 return count; 1933 } 1934 KSM_ATTR(run); 1935 1936 static ssize_t pages_shared_show(struct kobject *kobj, 1937 struct kobj_attribute *attr, char *buf) 1938 { 1939 return sprintf(buf, "%lu\n", ksm_pages_shared); 1940 } 1941 KSM_ATTR_RO(pages_shared); 1942 1943 static ssize_t pages_sharing_show(struct kobject *kobj, 1944 struct kobj_attribute *attr, char *buf) 1945 { 1946 return sprintf(buf, "%lu\n", ksm_pages_sharing); 1947 } 1948 KSM_ATTR_RO(pages_sharing); 1949 1950 static ssize_t pages_unshared_show(struct kobject *kobj, 1951 struct kobj_attribute *attr, char *buf) 1952 { 1953 return sprintf(buf, "%lu\n", ksm_pages_unshared); 1954 } 1955 KSM_ATTR_RO(pages_unshared); 1956 1957 static ssize_t pages_volatile_show(struct kobject *kobj, 1958 struct kobj_attribute *attr, char *buf) 1959 { 1960 long ksm_pages_volatile; 1961 1962 ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared 1963 - ksm_pages_sharing - ksm_pages_unshared; 1964 /* 1965 * It was not worth any locking to calculate that statistic, 1966 * but it might therefore sometimes be negative: conceal that. 1967 */ 1968 if (ksm_pages_volatile < 0) 1969 ksm_pages_volatile = 0; 1970 return sprintf(buf, "%ld\n", ksm_pages_volatile); 1971 } 1972 KSM_ATTR_RO(pages_volatile); 1973 1974 static ssize_t full_scans_show(struct kobject *kobj, 1975 struct kobj_attribute *attr, char *buf) 1976 { 1977 return sprintf(buf, "%lu\n", ksm_scan.seqnr); 1978 } 1979 KSM_ATTR_RO(full_scans); 1980 1981 static struct attribute *ksm_attrs[] = { 1982 &sleep_millisecs_attr.attr, 1983 &pages_to_scan_attr.attr, 1984 &run_attr.attr, 1985 &pages_shared_attr.attr, 1986 &pages_sharing_attr.attr, 1987 &pages_unshared_attr.attr, 1988 &pages_volatile_attr.attr, 1989 &full_scans_attr.attr, 1990 NULL, 1991 }; 1992 1993 static struct attribute_group ksm_attr_group = { 1994 .attrs = ksm_attrs, 1995 .name = "ksm", 1996 }; 1997 #endif /* CONFIG_SYSFS */ 1998 1999 static int __init ksm_init(void) 2000 { 2001 struct task_struct *ksm_thread; 2002 int err; 2003 2004 err = ksm_slab_init(); 2005 if (err) 2006 goto out; 2007 2008 ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); 2009 if (IS_ERR(ksm_thread)) { 2010 printk(KERN_ERR "ksm: creating kthread failed\n"); 2011 err = PTR_ERR(ksm_thread); 2012 goto out_free; 2013 } 2014 2015 #ifdef CONFIG_SYSFS 2016 err = sysfs_create_group(mm_kobj, &ksm_attr_group); 2017 if (err) { 2018 printk(KERN_ERR "ksm: register sysfs failed\n"); 2019 kthread_stop(ksm_thread); 2020 goto out_free; 2021 } 2022 #else 2023 ksm_run = KSM_RUN_MERGE; /* no way for user to start it */ 2024 2025 #endif /* CONFIG_SYSFS */ 2026 2027 #ifdef CONFIG_MEMORY_HOTREMOVE 2028 /* 2029 * Choose a high priority since the callback takes ksm_thread_mutex: 2030 * later callbacks could only be taking locks which nest within that. 
2031 */ 2032 hotplug_memory_notifier(ksm_memory_callback, 100); 2033 #endif 2034 return 0; 2035 2036 out_free: 2037 ksm_slab_free(); 2038 out: 2039 return err; 2040 } 2041 module_init(ksm_init) 2042