// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * zswap.c - zswap driver file
 *
 * zswap is a cache that takes pages that are in the process
 * of being swapped out and attempts to compress and store them in a
 * RAM-based memory pool. This can result in a significant I/O reduction on
 * the swap device and, in the case where decompressing from RAM is faster
 * than reading from the swap device, can also improve workload performance.
 *
 * Copyright (C) 2012 Seth Jennings <sjenning@linux.vnet.ibm.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/rbtree.h>
#include <linux/swap.h>
#include <linux/crypto.h>
#include <linux/scatterlist.h>
#include <linux/mempolicy.h>
#include <linux/mempool.h>
#include <linux/zpool.h>
#include <crypto/acompress.h>
#include <linux/zswap.h>
#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/swapops.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/workqueue.h>
#include <linux/list_lru.h>

#include "swap.h"
#include "internal.h"

/*********************************
* statistics
**********************************/
/* Total bytes used by the compressed storage */
u64 zswap_pool_total_size;
/* The number of compressed pages currently stored in zswap */
atomic_t zswap_stored_pages = ATOMIC_INIT(0);
/* The number of same-value filled pages currently stored in zswap */
static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0);

/*
 * The statistics below are not protected from concurrent access for
 * performance reasons so they may not be 100% accurate. However,
 * they do provide useful information on roughly how many times a
 * certain event is occurring.
 */

/* Pool limit was hit (see zswap_max_pool_percent) */
static u64 zswap_pool_limit_hit;
/* Pages written back when pool limit was reached */
static u64 zswap_written_back_pages;
/* Store failed due to a reclaim failure after pool limit was reached */
static u64 zswap_reject_reclaim_fail;
/* Store failed due to compression algorithm failure */
static u64 zswap_reject_compress_fail;
/* Compressed page was too big for the allocator to (optimally) store */
static u64 zswap_reject_compress_poor;
/* Store failed because underlying allocator could not get memory */
static u64 zswap_reject_alloc_fail;
/* Store failed because the entry metadata could not be allocated (rare) */
static u64 zswap_reject_kmemcache_fail;
/* Duplicate store was encountered (rare) */
static u64 zswap_duplicate_entry;

/* Shrinker work queue */
static struct workqueue_struct *shrink_wq;
/* Pool limit was hit, we need to calm down */
static bool zswap_pool_reached_full;

/*********************************
* tunables
**********************************/

#define ZSWAP_PARAM_UNSET ""

static int zswap_setup(void);

/* Enable/disable zswap */
static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON);
static int zswap_enabled_param_set(const char *,
				   const struct kernel_param *);
static const struct kernel_param_ops zswap_enabled_param_ops = {
	.set = zswap_enabled_param_set,
	.get = param_get_bool,
};
module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644);

/* Crypto compressor to use */
static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
static int zswap_compressor_param_set(const char *,
				      const struct kernel_param *);
static const struct kernel_param_ops zswap_compressor_param_ops = {
	.set = zswap_compressor_param_set,
	.get = param_get_charp,
	.free = param_free_charp,
};
module_param_cb(compressor, &zswap_compressor_param_ops,
		&zswap_compressor, 0644);

/* Compressed storage zpool to use */
static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
static int zswap_zpool_param_set(const char *, const struct kernel_param *);
static const struct kernel_param_ops zswap_zpool_param_ops = {
	.set = zswap_zpool_param_set,
	.get = param_get_charp,
	.free = param_free_charp,
};
module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);

/* The maximum percentage of memory that the compressed pool can occupy */
static unsigned int zswap_max_pool_percent = 20;
module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);

/* The threshold for accepting new pages after the max_pool_percent was hit */
static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */
module_param_named(accept_threshold_percent, zswap_accept_thr_percent,
		   uint, 0644);

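/*
 * Usage note (illustrative, not from the original source): because the
 * knobs above are registered with module_param_cb()/module_param_named(),
 * they show up under /sys/module/zswap/parameters/ and can also be set on
 * the kernel command line for the built-in case, e.g.:
 *
 *	echo 30 > /sys/module/zswap/parameters/max_pool_percent
 *	zswap.enabled=1 zswap.compressor=zstd	(boot parameters)
 */
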
/*
 * Enable/disable handling same-value filled pages (enabled by default).
 * If disabled every page is considered non-same-value filled.
 */
static bool zswap_same_filled_pages_enabled = true;
module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
		   bool, 0644);

/* Enable/disable handling non-same-value filled pages (enabled by default) */
static bool zswap_non_same_filled_pages_enabled = true;
module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled,
		   bool, 0644);

static bool zswap_exclusive_loads_enabled = IS_ENABLED(
		CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON);
module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644);

/* Number of zpools in zswap_pool (empirically determined for scalability) */
#define ZSWAP_NR_ZPOOLS 32

/* Enable/disable memory pressure-based shrinker. */
static bool zswap_shrinker_enabled = IS_ENABLED(
		CONFIG_ZSWAP_SHRINKER_DEFAULT_ON);
module_param_named(shrinker_enabled, zswap_shrinker_enabled, bool, 0644);

bool is_zswap_enabled(void)
{
	return zswap_enabled;
}

/*********************************
* data structures
**********************************/

struct crypto_acomp_ctx {
	struct crypto_acomp *acomp;
	struct acomp_req *req;
	struct crypto_wait wait;
	u8 *buffer;
	struct mutex mutex;
};

/*
 * The lock ordering is zswap_tree.lock -> zswap_pool.lru_lock.
 * The only case where lru_lock is not acquired while holding tree.lock is
 * when a zswap_entry is taken off the lru for writeback, in that case it
 * needs to be verified that it's still valid in the tree.
 */
struct zswap_pool {
	struct zpool *zpools[ZSWAP_NR_ZPOOLS];
	struct crypto_acomp_ctx __percpu *acomp_ctx;
	struct kref kref;
	struct list_head list;
	struct work_struct release_work;
	struct work_struct shrink_work;
	struct hlist_node node;
	char tfm_name[CRYPTO_MAX_ALG_NAME];
	struct list_lru list_lru;
	struct mem_cgroup *next_shrink;
	struct shrinker *shrinker;
	atomic_t nr_stored;
};

/*
 * struct zswap_entry
 *
 * This structure contains the metadata for tracking a single compressed
 * page within zswap.
 *
 * rbnode - links the entry into red-black tree for the appropriate swap type
 * swpentry - associated swap entry, the offset indexes into the red-black tree
 * refcount - the number of outstanding references to the entry. This is needed
 *            to protect against premature freeing of the entry by concurrent
 *            calls to load, invalidate, and writeback. The lock for the
 *            zswap_tree structure that contains the entry must be held while
 *            changing the refcount. Since the lock must be held, there is
 *            no reason to also make refcount atomic.
 * length - the length in bytes of the compressed page data. Needed during
 *          decompression. For a same value filled page length is 0, and both
 *          pool and lru are invalid and must be ignored.
 * pool - the zswap_pool the entry's data is in
 * handle - zpool allocation handle that stores the compressed page data
 * value - value of the same-value filled pages which have same content
 * objcg - the obj_cgroup that the compressed memory is charged to
 * lru - handle to the pool's lru used to evict pages.
 */
struct zswap_entry {
	struct rb_node rbnode;
	swp_entry_t swpentry;
	int refcount;
	unsigned int length;
	struct zswap_pool *pool;
	union {
		unsigned long handle;
		unsigned long value;
	};
	struct obj_cgroup *objcg;
	struct list_head lru;
};
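
/*
 * Illustrative note (not from the original source): a page filled with a
 * single repeating machine word, e.g. 4096 bytes of zeroes, is stored with
 * length == 0 and the pattern kept directly in 'value'; no zpool allocation
 * is made and 'handle' is unused. A normally compressed page instead
 * records its compressed size in 'length' and the zpool allocation in
 * 'handle'.
 */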

/*
 * The tree lock in the zswap_tree struct protects a few things:
 * - the rbtree
 * - the refcount field of each entry in the tree
 */
struct zswap_tree {
	struct rb_root rbroot;
	spinlock_t lock;
};

static struct zswap_tree *zswap_trees[MAX_SWAPFILES];

/* RCU-protected iteration */
static LIST_HEAD(zswap_pools);
/* protects zswap_pools list modification */
static DEFINE_SPINLOCK(zswap_pools_lock);
/* pool counter to provide unique names to zpool */
static atomic_t zswap_pools_count = ATOMIC_INIT(0);

enum zswap_init_type {
	ZSWAP_UNINIT,
	ZSWAP_INIT_SUCCEED,
	ZSWAP_INIT_FAILED
};

static enum zswap_init_type zswap_init_state;

/* used to ensure the integrity of initialization */
static DEFINE_MUTEX(zswap_init_lock);

/* init completed, but couldn't create the initial pool */
static bool zswap_has_pool;

/*********************************
* helpers and fwd declarations
**********************************/

#define zswap_pool_debug(msg, p)				\
	pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,		\
		 zpool_get_type((p)->zpools[0]))

static int zswap_writeback_entry(struct zswap_entry *entry,
				 struct zswap_tree *tree);
static int zswap_pool_get(struct zswap_pool *pool);
static void zswap_pool_put(struct zswap_pool *pool);

static bool zswap_is_full(void)
{
	return totalram_pages() * zswap_max_pool_percent / 100 <
			DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
}

static bool zswap_can_accept(void)
{
	return totalram_pages() * zswap_accept_thr_percent / 100 *
				zswap_max_pool_percent / 100 >
			DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
}
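
/*
 * Worked example (illustrative, not from the original source): with 4 GiB
 * of RAM (totalram_pages() == 1048576 with 4 KiB pages) and the default
 * max_pool_percent = 20, zswap_is_full() reports full once the compressed
 * pool exceeds 209715 pages (~0.8 GiB). With accept_thr_percent = 90,
 * zswap_can_accept() only starts admitting new pages again once the pool
 * has shrunk below 90% of that limit, i.e. ~188743 pages (~0.72 GiB),
 * which gives the pool limit some hysteresis.
 */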

static u64 get_zswap_pool_size(struct zswap_pool *pool)
{
	u64 pool_size = 0;
	int i;

	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
		pool_size += zpool_get_total_size(pool->zpools[i]);

	return pool_size;
}

static void zswap_update_total_size(void)
{
	struct zswap_pool *pool;
	u64 total = 0;

	rcu_read_lock();

	list_for_each_entry_rcu(pool, &zswap_pools, list)
		total += get_zswap_pool_size(pool);

	rcu_read_unlock();

	zswap_pool_total_size = total;
}

/* should be called under RCU */
#ifdef CONFIG_MEMCG
static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry)
{
	return entry->objcg ? obj_cgroup_memcg(entry->objcg) : NULL;
}
#else
static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry)
{
	return NULL;
}
#endif

static inline int entry_to_nid(struct zswap_entry *entry)
{
	return page_to_nid(virt_to_page(entry));
}

void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg)
{
	struct zswap_pool *pool;

	/* lock out zswap pools list modification */
	spin_lock(&zswap_pools_lock);
	list_for_each_entry(pool, &zswap_pools, list) {
		if (pool->next_shrink == memcg)
			pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
	}
	spin_unlock(&zswap_pools_lock);
}

/*********************************
* zswap entry functions
**********************************/
static struct kmem_cache *zswap_entry_cache;

static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid)
{
	struct zswap_entry *entry;
	entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid);
	if (!entry)
		return NULL;
	entry->refcount = 1;
	RB_CLEAR_NODE(&entry->rbnode);
	return entry;
}

static void zswap_entry_cache_free(struct zswap_entry *entry)
{
	kmem_cache_free(zswap_entry_cache, entry);
}

/*********************************
* zswap lruvec functions
**********************************/
void zswap_lruvec_state_init(struct lruvec *lruvec)
{
	atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0);
}

void zswap_folio_swapin(struct folio *folio)
{
	struct lruvec *lruvec;

	VM_WARN_ON_ONCE(!folio_test_locked(folio));
	lruvec = folio_lruvec(folio);
	atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected);
}

/*********************************
* lru functions
**********************************/
static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
{
	atomic_long_t *nr_zswap_protected;
	unsigned long lru_size, old, new;
	int nid = entry_to_nid(entry);
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	/*
	 * Note that it is safe to use rcu_read_lock() here, even in the face of
	 * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection
	 * used in list_lru lookup, only two scenarios are possible:
	 *
	 * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The
	 *    new entry will be reparented to memcg's parent's list_lru.
	 * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The
	 *    new entry will be added directly to memcg's parent's list_lru.
	 *
	 * Similar reasoning holds for list_lru_del() and list_lru_putback().
	 */
	rcu_read_lock();
	memcg = mem_cgroup_from_entry(entry);
	/* will always succeed */
	list_lru_add(list_lru, &entry->lru, nid, memcg);

	/* Update the protection area */
	lru_size = list_lru_count_one(list_lru, nid, memcg);
	lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
	nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected;
	old = atomic_long_inc_return(nr_zswap_protected);
	/*
	 * Decay to avoid overflow and adapt to changing workloads.
	 * This is based on LRU reclaim cost decaying heuristics.
	 */
	do {
		new = old > lru_size / 4 ? old / 2 : old;
	} while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new));
	rcu_read_unlock();
}
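
/*
 * Worked example of the decay above (illustrative, not from the original
 * source): if the per-memcg LRU holds lru_size == 1000 entries,
 * nr_zswap_protected grows by one per added entry until it exceeds
 * lru_size / 4 == 250, at which point the next update halves it. The
 * protected count therefore tracks recent activity without ever running
 * away from the actual LRU size.
 */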

static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry)
{
	int nid = entry_to_nid(entry);
	struct mem_cgroup *memcg;

	rcu_read_lock();
	memcg = mem_cgroup_from_entry(entry);
	/* will always succeed */
	list_lru_del(list_lru, &entry->lru, nid, memcg);
	rcu_read_unlock();
}

static void zswap_lru_putback(struct list_lru *list_lru,
		struct zswap_entry *entry)
{
	int nid = entry_to_nid(entry);
	spinlock_t *lock = &list_lru->node[nid].lock;
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	rcu_read_lock();
	memcg = mem_cgroup_from_entry(entry);
	spin_lock(lock);
	/* we cannot use list_lru_add here, because it increments node's lru count */
	list_lru_putback(list_lru, &entry->lru, nid, memcg);
	spin_unlock(lock);

	lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(entry_to_nid(entry)));
	/* increment the protection area to account for the LRU rotation. */
	atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected);
	rcu_read_unlock();
}

/*********************************
* rbtree functions
**********************************/
static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
{
	struct rb_node *node = root->rb_node;
	struct zswap_entry *entry;
	pgoff_t entry_offset;

	while (node) {
		entry = rb_entry(node, struct zswap_entry, rbnode);
		entry_offset = swp_offset(entry->swpentry);
		if (entry_offset > offset)
			node = node->rb_left;
		else if (entry_offset < offset)
			node = node->rb_right;
		else
			return entry;
	}
	return NULL;
}

/*
 * In the case that an entry with the same offset is found, a pointer to
 * the existing entry is stored in dupentry and the function returns -EEXIST
 */
static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
			struct zswap_entry **dupentry)
{
	struct rb_node **link = &root->rb_node, *parent = NULL;
	struct zswap_entry *myentry;
	pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);

	while (*link) {
		parent = *link;
		myentry = rb_entry(parent, struct zswap_entry, rbnode);
		myentry_offset = swp_offset(myentry->swpentry);
		if (myentry_offset > entry_offset)
			link = &(*link)->rb_left;
		else if (myentry_offset < entry_offset)
			link = &(*link)->rb_right;
		else {
			*dupentry = myentry;
			return -EEXIST;
		}
	}
	rb_link_node(&entry->rbnode, parent, link);
	rb_insert_color(&entry->rbnode, root);
	return 0;
}

static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
{
	if (!RB_EMPTY_NODE(&entry->rbnode)) {
		rb_erase(&entry->rbnode, root);
		RB_CLEAR_NODE(&entry->rbnode);
		return true;
	}
	return false;
}

static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
{
	int i = 0;

	if (ZSWAP_NR_ZPOOLS > 1)
		i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS));

	return entry->pool->zpools[i];
}
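
/*
 * Illustrative note (not from the original source): hash_ptr() maps the
 * entry's address onto one of the ZSWAP_NR_ZPOOLS (32) zpools, so
 * concurrent stores and frees are spread across 32 independent allocators
 * instead of serializing on a single zpool. The choice is stable for the
 * lifetime of the entry, so the same zpool is used for the later load and
 * free.
 */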

/*
 * Carries out the common pattern of freeing an entry's zpool allocation,
 * freeing the entry itself, and decrementing the number of stored pages.
 */
static void zswap_free_entry(struct zswap_entry *entry)
{
	if (!entry->length)
		atomic_dec(&zswap_same_filled_pages);
	else {
		zswap_lru_del(&entry->pool->list_lru, entry);
		zpool_free(zswap_find_zpool(entry), entry->handle);
		atomic_dec(&entry->pool->nr_stored);
		zswap_pool_put(entry->pool);
	}
	if (entry->objcg) {
		obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
		obj_cgroup_put(entry->objcg);
	}
	zswap_entry_cache_free(entry);
	atomic_dec(&zswap_stored_pages);
	zswap_update_total_size();
}

/* caller must hold the tree lock */
static void zswap_entry_get(struct zswap_entry *entry)
{
	entry->refcount++;
}

/* caller must hold the tree lock
* remove from the tree and free it, if nobody references the entry
*/
static void zswap_entry_put(struct zswap_tree *tree,
			struct zswap_entry *entry)
{
	int refcount = --entry->refcount;

	WARN_ON_ONCE(refcount < 0);
	if (refcount == 0) {
		WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode));
		zswap_free_entry(entry);
	}
}

/* caller must hold the tree lock */
static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
				pgoff_t offset)
{
	struct zswap_entry *entry;

	entry = zswap_rb_search(root, offset);
	if (entry)
		zswap_entry_get(entry);

	return entry;
}

/*********************************
* shrinker functions
**********************************/
static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
				       spinlock_t *lock, void *arg);

static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
		struct shrink_control *sc)
{
	struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
	unsigned long shrink_ret, nr_protected, lru_size;
	struct zswap_pool *pool = shrinker->private_data;
	bool encountered_page_in_swapcache = false;

	if (!zswap_shrinker_enabled ||
			!mem_cgroup_zswap_writeback_enabled(sc->memcg)) {
		sc->nr_scanned = 0;
		return SHRINK_STOP;
	}

	nr_protected =
		atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
	lru_size = list_lru_shrink_count(&pool->list_lru, sc);

	/*
	 * Abort if we are shrinking into the protected region.
	 *
	 * This short-circuiting is necessary because if we have too many
	 * concurrent reclaimers getting the freeable zswap object counts at the
	 * same time (before any of them made reasonable progress), the total
	 * number of reclaimed objects might be more than the number of unprotected
	 * objects (i.e. the reclaimers will reclaim into the protected area of the
	 * zswap LRU).
	 */
	if (nr_protected >= lru_size - sc->nr_to_scan) {
		sc->nr_scanned = 0;
		return SHRINK_STOP;
	}

	shrink_ret = list_lru_shrink_walk(&pool->list_lru, sc, &shrink_memcg_cb,
		&encountered_page_in_swapcache);

	if (encountered_page_in_swapcache)
		return SHRINK_STOP;

	return shrink_ret ? shrink_ret : SHRINK_STOP;
}
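
/*
 * Illustrative example for the count callback below (not from the original
 * source): if a memcg has nr_freeable == 1000 unprotected entries, backed
 * by nr_backing == 250 pages of compressed data for nr_stored == 1000
 * stored pages (a 4:1 compression ratio), the shrinker reports
 * mult_frac(1000, 250, 1000) == 250 objects. The better the compression
 * ratio, the smaller the reported count, and the less eagerly the shrinker
 * writes pages back to swap.
 */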

static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
		struct shrink_control *sc)
{
	struct zswap_pool *pool = shrinker->private_data;
	struct mem_cgroup *memcg = sc->memcg;
	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid));
	unsigned long nr_backing, nr_stored, nr_freeable, nr_protected;

	if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg))
		return 0;

#ifdef CONFIG_MEMCG_KMEM
	mem_cgroup_flush_stats(memcg);
	nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT;
	nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
#else
	/* use pool stats instead of memcg stats */
	nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT;
	nr_stored = atomic_read(&pool->nr_stored);
#endif

	if (!nr_stored)
		return 0;

	nr_protected =
		atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
	nr_freeable = list_lru_shrink_count(&pool->list_lru, sc);
	/*
	 * Subtract the lru size by an estimate of the number of pages
	 * that should be protected.
	 */
	nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0;

	/*
	 * Scale the number of freeable pages by the memory saving factor.
	 * This ensures that the better zswap compresses memory, the fewer
	 * pages we will evict to swap (as it will otherwise incur IO for
	 * relatively small memory saving).
	 */
	return mult_frac(nr_freeable, nr_backing, nr_stored);
}

static void zswap_alloc_shrinker(struct zswap_pool *pool)
{
	pool->shrinker =
		shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap");
	if (!pool->shrinker)
		return;

	pool->shrinker->private_data = pool;
	pool->shrinker->scan_objects = zswap_shrinker_scan;
	pool->shrinker->count_objects = zswap_shrinker_count;
	pool->shrinker->batch = 0;
	pool->shrinker->seeks = DEFAULT_SEEKS;
}

/*********************************
* per-cpu code
**********************************/
static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
{
	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
	struct crypto_acomp *acomp;
	struct acomp_req *req;
	int ret;

	mutex_init(&acomp_ctx->mutex);

	acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
	if (!acomp_ctx->buffer)
		return -ENOMEM;

	acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
	if (IS_ERR(acomp)) {
		pr_err("could not alloc crypto acomp %s : %ld\n",
				pool->tfm_name, PTR_ERR(acomp));
		ret = PTR_ERR(acomp);
		goto acomp_fail;
	}
	acomp_ctx->acomp = acomp;

	req = acomp_request_alloc(acomp_ctx->acomp);
	if (!req) {
		pr_err("could not alloc crypto acomp_request %s\n",
		       pool->tfm_name);
		ret = -ENOMEM;
		goto req_fail;
	}
	acomp_ctx->req = req;

	crypto_init_wait(&acomp_ctx->wait);
	/*
	 * if the backend of acomp is async zip, crypto_req_done() will wakeup
	 * crypto_wait_req(); if the backend of acomp is scomp, the callback
	 * won't be called, crypto_wait_req() will return without blocking.
	 */
	acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
				   crypto_req_done, &acomp_ctx->wait);

	return 0;

req_fail:
	crypto_free_acomp(acomp_ctx->acomp);
acomp_fail:
	kfree(acomp_ctx->buffer);
	return ret;
}

static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
{
	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);

	if (!IS_ERR_OR_NULL(acomp_ctx)) {
		if (!IS_ERR_OR_NULL(acomp_ctx->req))
			acomp_request_free(acomp_ctx->req);
		if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
			crypto_free_acomp(acomp_ctx->acomp);
		kfree(acomp_ctx->buffer);
	}

	return 0;
}

/*********************************
* pool functions
**********************************/

static struct zswap_pool *__zswap_pool_current(void)
{
	struct zswap_pool *pool;

	pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
	WARN_ONCE(!pool && zswap_has_pool,
		  "%s: no page storage pool!\n", __func__);

	return pool;
}

static struct zswap_pool *zswap_pool_current(void)
{
	assert_spin_locked(&zswap_pools_lock);

	return __zswap_pool_current();
}

static struct zswap_pool *zswap_pool_current_get(void)
{
	struct zswap_pool *pool;

	rcu_read_lock();

	pool = __zswap_pool_current();
	if (!zswap_pool_get(pool))
		pool = NULL;

	rcu_read_unlock();

	return pool;
}

static struct zswap_pool *zswap_pool_last_get(void)
{
	struct zswap_pool *pool, *last = NULL;

	rcu_read_lock();

	list_for_each_entry_rcu(pool, &zswap_pools, list)
		last = pool;
	WARN_ONCE(!last && zswap_has_pool,
		  "%s: no page storage pool!\n", __func__);
	if (!zswap_pool_get(last))
		last = NULL;

	rcu_read_unlock();

	return last;
}

/* type and compressor must be null-terminated */
static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
{
	struct zswap_pool *pool;

	assert_spin_locked(&zswap_pools_lock);

	list_for_each_entry_rcu(pool, &zswap_pools, list) {
		if (strcmp(pool->tfm_name, compressor))
			continue;
		/* all zpools share the same type */
		if (strcmp(zpool_get_type(pool->zpools[0]), type))
			continue;
		/* if we can't get it, it's about to be destroyed */
		if (!zswap_pool_get(pool))
			continue;
		return pool;
	}

	return NULL;
}

/*
 * If the entry is still valid in the tree, drop the initial ref and remove it
 * from the tree. This function must be called with an additional ref held,
 * otherwise it may race with another invalidation freeing the entry.
 */
static void zswap_invalidate_entry(struct zswap_tree *tree,
				   struct zswap_entry *entry)
{
	if (zswap_rb_erase(&tree->rbroot, entry))
		zswap_entry_put(tree, entry);
}
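
/*
 * Reference-count walkthrough (illustrative, not from the original source):
 * a stored entry starts with refcount == 1, owned by the rbtree.
 * zswap_load() and writeback take a temporary extra reference via
 * zswap_entry_get() before dropping the tree lock, and drop it with
 * zswap_entry_put() when done. zswap_invalidate_entry() above erases the
 * entry from the tree and drops the tree's initial reference, so the entry
 * is actually freed only once the last temporary reference is put.
 */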

static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
				       spinlock_t *lock, void *arg)
{
	struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
	bool *encountered_page_in_swapcache = (bool *)arg;
	struct zswap_tree *tree;
	pgoff_t swpoffset;
	enum lru_status ret = LRU_REMOVED_RETRY;
	int writeback_result;

	/*
	 * Once the lru lock is dropped, the entry might get freed. The
	 * swpoffset is copied to the stack, and entry isn't deref'd again
	 * until the entry is verified to still be alive in the tree.
	 */
	swpoffset = swp_offset(entry->swpentry);
	tree = zswap_trees[swp_type(entry->swpentry)];
	list_lru_isolate(l, item);
	/*
	 * It's safe to drop the lock here because we return either
	 * LRU_REMOVED_RETRY or LRU_RETRY.
	 */
	spin_unlock(lock);

	/* Check for invalidate() race */
	spin_lock(&tree->lock);
	if (entry != zswap_rb_search(&tree->rbroot, swpoffset))
		goto unlock;

	/* Hold a reference to prevent a free during writeback */
	zswap_entry_get(entry);
	spin_unlock(&tree->lock);

	writeback_result = zswap_writeback_entry(entry, tree);

	spin_lock(&tree->lock);
	if (writeback_result) {
		zswap_reject_reclaim_fail++;
		zswap_lru_putback(&entry->pool->list_lru, entry);
		ret = LRU_RETRY;

		/*
		 * Encountering a page already in swap cache is a sign that we are shrinking
		 * into the warmer region. We should terminate shrinking (if we're in the dynamic
		 * shrinker context).
		 */
		if (writeback_result == -EEXIST && encountered_page_in_swapcache)
			*encountered_page_in_swapcache = true;

		goto put_unlock;
	}
	zswap_written_back_pages++;

	if (entry->objcg)
		count_objcg_event(entry->objcg, ZSWPWB);

	count_vm_event(ZSWPWB);
	/*
	 * Writeback started successfully, the page now belongs to the
	 * swapcache. Drop the entry from zswap - unless invalidate already
	 * took it out while we had the tree->lock released for IO.
	 */
	zswap_invalidate_entry(tree, entry);

put_unlock:
	/* Drop local reference */
	zswap_entry_put(tree, entry);
unlock:
	spin_unlock(&tree->lock);
	spin_lock(lock);
	return ret;
}

static int shrink_memcg(struct mem_cgroup *memcg)
{
	struct zswap_pool *pool;
	int nid, shrunk = 0;

	if (!mem_cgroup_zswap_writeback_enabled(memcg))
		return -EINVAL;

	/*
	 * Skip zombies because their LRUs are reparented and we would be
	 * reclaiming from the parent instead of the dead memcg.
	 */
	if (memcg && !mem_cgroup_online(memcg))
		return -ENOENT;

	pool = zswap_pool_current_get();
	if (!pool)
		return -EINVAL;

	for_each_node_state(nid, N_NORMAL_MEMORY) {
		unsigned long nr_to_walk = 1;

		shrunk += list_lru_walk_one(&pool->list_lru, nid, memcg,
					    &shrink_memcg_cb, NULL, &nr_to_walk);
	}
	zswap_pool_put(pool);
	return shrunk ? 0 : -EAGAIN;
}
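
/*
 * Illustrative note (not from the original source): nr_to_walk == 1 means
 * each call writes back at most one entry per NUMA node from this memcg's
 * LRU. On a two-node machine a single shrink_memcg() call can therefore
 * free at most two compressed pages; the worker below keeps iterating
 * cgroups until zswap_can_accept() is true.
 */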

static void shrink_worker(struct work_struct *w)
{
	struct zswap_pool *pool = container_of(w, typeof(*pool),
						shrink_work);
	struct mem_cgroup *memcg;
	int ret, failures = 0;

	/* global reclaim will select cgroup in a round-robin fashion. */
	do {
		spin_lock(&zswap_pools_lock);
		pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
		memcg = pool->next_shrink;

		/*
		 * We need to retry if we have gone through a full round trip, or if we
		 * got an offline memcg (or else we risk undoing the effect of the
		 * zswap memcg offlining cleanup callback). This is not catastrophic
		 * per se, but it will keep the now offlined memcg hostage for a while.
		 *
		 * Note that if we got an online memcg, we will keep the extra
		 * reference in case the original reference obtained by mem_cgroup_iter
		 * is dropped by the zswap memcg offlining callback, ensuring that the
		 * memcg is not killed when we are reclaiming.
		 */
		if (!memcg) {
			spin_unlock(&zswap_pools_lock);
			if (++failures == MAX_RECLAIM_RETRIES)
				break;

			goto resched;
		}

		if (!mem_cgroup_tryget_online(memcg)) {
			/* drop the reference from mem_cgroup_iter() */
			mem_cgroup_iter_break(NULL, memcg);
			pool->next_shrink = NULL;
			spin_unlock(&zswap_pools_lock);

			if (++failures == MAX_RECLAIM_RETRIES)
				break;

			goto resched;
		}
		spin_unlock(&zswap_pools_lock);

		ret = shrink_memcg(memcg);
		/* drop the extra reference */
		mem_cgroup_put(memcg);

		if (ret == -EINVAL)
			break;
		if (ret && ++failures == MAX_RECLAIM_RETRIES)
			break;

resched:
		cond_resched();
	} while (!zswap_can_accept());
	zswap_pool_put(pool);
}

static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
{
	int i;
	struct zswap_pool *pool;
	char name[38]; /* 'zswap' + 32 char (max) num + \0 */
	gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
	int ret;

	if (!zswap_has_pool) {
		/* if either are unset, pool initialization failed, and we
		 * need both params to be set correctly before trying to
		 * create a pool.
		 */
		if (!strcmp(type, ZSWAP_PARAM_UNSET))
			return NULL;
		if (!strcmp(compressor, ZSWAP_PARAM_UNSET))
			return NULL;
	}

	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
	if (!pool)
		return NULL;

	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) {
		/* unique name for each pool specifically required by zsmalloc */
		snprintf(name, 38, "zswap%x",
			 atomic_inc_return(&zswap_pools_count));

		pool->zpools[i] = zpool_create_pool(type, name, gfp);
		if (!pool->zpools[i]) {
			pr_err("%s zpool not available\n", type);
			goto error;
		}
	}
	pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0]));

	strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));

	pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx);
	if (!pool->acomp_ctx) {
		pr_err("percpu alloc failed\n");
		goto error;
	}

	ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
				       &pool->node);
	if (ret)
		goto error;

	zswap_alloc_shrinker(pool);
	if (!pool->shrinker)
		goto error;

	pr_debug("using %s compressor\n", pool->tfm_name);

	/* being the current pool takes 1 ref; this func expects the
	 * caller to always add the new pool as the current pool
	 */
	kref_init(&pool->kref);
	INIT_LIST_HEAD(&pool->list);
	if (list_lru_init_memcg(&pool->list_lru, pool->shrinker))
		goto lru_fail;
	shrinker_register(pool->shrinker);
	INIT_WORK(&pool->shrink_work, shrink_worker);
	atomic_set(&pool->nr_stored, 0);

	zswap_pool_debug("created", pool);

	return pool;

lru_fail:
	list_lru_destroy(&pool->list_lru);
	shrinker_free(pool->shrinker);
error:
	if (pool->acomp_ctx)
		free_percpu(pool->acomp_ctx);
	while (i--)
		zpool_destroy_pool(pool->zpools[i]);
	kfree(pool);
	return NULL;
}

static struct zswap_pool *__zswap_pool_create_fallback(void)
{
	bool has_comp, has_zpool;

	has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
	if (!has_comp && strcmp(zswap_compressor,
				CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) {
		pr_err("compressor %s not available, using default %s\n",
		       zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT);
		param_free_charp(&zswap_compressor);
		zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
		has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
	}
	if (!has_comp) {
		pr_err("default compressor %s not available\n",
		       zswap_compressor);
		param_free_charp(&zswap_compressor);
		zswap_compressor = ZSWAP_PARAM_UNSET;
	}

	has_zpool = zpool_has_pool(zswap_zpool_type);
	if (!has_zpool && strcmp(zswap_zpool_type,
				 CONFIG_ZSWAP_ZPOOL_DEFAULT)) {
		pr_err("zpool %s not available, using default %s\n",
		       zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT);
		param_free_charp(&zswap_zpool_type);
		zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
		has_zpool = zpool_has_pool(zswap_zpool_type);
	}
	if (!has_zpool) {
		pr_err("default zpool %s not available\n",
		       zswap_zpool_type);
		param_free_charp(&zswap_zpool_type);
		zswap_zpool_type = ZSWAP_PARAM_UNSET;
	}

	if (!has_comp || !has_zpool)
		return NULL;

	return zswap_pool_create(zswap_zpool_type, zswap_compressor);
}

static void zswap_pool_destroy(struct zswap_pool *pool)
{
	int i;

	zswap_pool_debug("destroying", pool);

	shrinker_free(pool->shrinker);
	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
	free_percpu(pool->acomp_ctx);
	list_lru_destroy(&pool->list_lru);

	spin_lock(&zswap_pools_lock);
	mem_cgroup_iter_break(NULL, pool->next_shrink);
	pool->next_shrink = NULL;
	spin_unlock(&zswap_pools_lock);

	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
		zpool_destroy_pool(pool->zpools[i]);
	kfree(pool);
}

static int __must_check zswap_pool_get(struct zswap_pool *pool)
{
	if (!pool)
		return 0;

	return kref_get_unless_zero(&pool->kref);
}
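
/*
 * Pool lifetime walkthrough (illustrative, not from the original source):
 * the current pool is created with one reference from kref_init() in
 * zswap_pool_create(). Every compressed (non same-filled) entry holds an
 * additional reference for as long as its data lives in the pool. When a
 * parameter change replaces the current pool, the old pool is only
 * destroyed after its last entry is freed or written back: the final
 * zswap_pool_put() runs __zswap_pool_empty(), which unlinks the pool from
 * zswap_pools and defers the actual teardown to __zswap_pool_release() so
 * RCU readers can finish first.
 */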

static void __zswap_pool_release(struct work_struct *work)
{
	struct zswap_pool *pool = container_of(work, typeof(*pool),
						release_work);

	synchronize_rcu();

	/* nobody should have been able to get a kref... */
	WARN_ON(kref_get_unless_zero(&pool->kref));

	/* pool is now off zswap_pools list and has no references. */
	zswap_pool_destroy(pool);
}

static void __zswap_pool_empty(struct kref *kref)
{
	struct zswap_pool *pool;

	pool = container_of(kref, typeof(*pool), kref);

	spin_lock(&zswap_pools_lock);

	WARN_ON(pool == zswap_pool_current());

	list_del_rcu(&pool->list);

	INIT_WORK(&pool->release_work, __zswap_pool_release);
	schedule_work(&pool->release_work);

	spin_unlock(&zswap_pools_lock);
}

static void zswap_pool_put(struct zswap_pool *pool)
{
	kref_put(&pool->kref, __zswap_pool_empty);
}

/*********************************
* param callbacks
**********************************/

static bool zswap_pool_changed(const char *s, const struct kernel_param *kp)
{
	/* no change required */
	if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool)
		return false;
	return true;
}

/* val must be a null-terminated string */
static int __zswap_param_set(const char *val, const struct kernel_param *kp,
			     char *type, char *compressor)
{
	struct zswap_pool *pool, *put_pool = NULL;
	char *s = strstrip((char *)val);
	int ret = 0;
	bool new_pool = false;

	mutex_lock(&zswap_init_lock);
	switch (zswap_init_state) {
	case ZSWAP_UNINIT:
		/* if this is load-time (pre-init) param setting,
		 * don't create a pool; that's done during init.
		 */
		ret = param_set_charp(s, kp);
		break;
	case ZSWAP_INIT_SUCCEED:
		new_pool = zswap_pool_changed(s, kp);
		break;
	case ZSWAP_INIT_FAILED:
		pr_err("can't set param, initialization failed\n");
		ret = -ENODEV;
	}
	mutex_unlock(&zswap_init_lock);

	/* no need to create a new pool, return directly */
	if (!new_pool)
		return ret;

	if (!type) {
		if (!zpool_has_pool(s)) {
			pr_err("zpool %s not available\n", s);
			return -ENOENT;
		}
		type = s;
	} else if (!compressor) {
		if (!crypto_has_acomp(s, 0, 0)) {
			pr_err("compressor %s not available\n", s);
			return -ENOENT;
		}
		compressor = s;
	} else {
		WARN_ON(1);
		return -EINVAL;
	}

	spin_lock(&zswap_pools_lock);

	pool = zswap_pool_find_get(type, compressor);
	if (pool) {
		zswap_pool_debug("using existing", pool);
		WARN_ON(pool == zswap_pool_current());
		list_del_rcu(&pool->list);
	}

	spin_unlock(&zswap_pools_lock);

	if (!pool)
		pool = zswap_pool_create(type, compressor);

	if (pool)
		ret = param_set_charp(s, kp);
	else
		ret = -EINVAL;

	spin_lock(&zswap_pools_lock);

	if (!ret) {
		put_pool = zswap_pool_current();
		list_add_rcu(&pool->list, &zswap_pools);
		zswap_has_pool = true;
	} else if (pool) {
		/* add the possibly pre-existing pool to the end of the pools
		 * list; if it's new (and empty) then it'll be removed and
		 * destroyed by the put after we drop the lock
		 */
		list_add_tail_rcu(&pool->list, &zswap_pools);
		put_pool = pool;
	}

	spin_unlock(&zswap_pools_lock);

	if (!zswap_has_pool && !pool) {
		/* if initial pool creation failed, and this pool creation also
		 * failed, maybe both compressor and zpool params were bad.
		 * Allow changing this param, so pool creation will succeed
		 * when the other param is changed. We already verified this
		 * param is ok in the zpool_has_pool() or crypto_has_acomp()
		 * checks above.
		 */
		ret = param_set_charp(s, kp);
	}

	/* drop the ref from either the old current pool,
	 * or the new pool we failed to add
	 */
	if (put_pool)
		zswap_pool_put(put_pool);

	return ret;
}

static int zswap_compressor_param_set(const char *val,
				      const struct kernel_param *kp)
{
	return __zswap_param_set(val, kp, zswap_zpool_type, NULL);
}

static int zswap_zpool_param_set(const char *val,
				 const struct kernel_param *kp)
{
	return __zswap_param_set(val, kp, NULL, zswap_compressor);
}

static int zswap_enabled_param_set(const char *val,
				   const struct kernel_param *kp)
{
	int ret = -ENODEV;

	/* if this is load-time (pre-init) param setting, only set param. */
	if (system_state != SYSTEM_RUNNING)
		return param_set_bool(val, kp);

	mutex_lock(&zswap_init_lock);
	switch (zswap_init_state) {
	case ZSWAP_UNINIT:
		if (zswap_setup())
			break;
		fallthrough;
	case ZSWAP_INIT_SUCCEED:
		if (!zswap_has_pool)
			pr_err("can't enable, no pool configured\n");
		else
			ret = param_set_bool(val, kp);
		break;
	case ZSWAP_INIT_FAILED:
		pr_err("can't enable, initialization failed\n");
	}
	mutex_unlock(&zswap_init_lock);

	return ret;
}

static void __zswap_load(struct zswap_entry *entry, struct page *page)
{
	struct zpool *zpool = zswap_find_zpool(entry);
	struct scatterlist input, output;
	struct crypto_acomp_ctx *acomp_ctx;
	u8 *src;

	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
	mutex_lock(&acomp_ctx->mutex);

	src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
	if (!zpool_can_sleep_mapped(zpool)) {
		memcpy(acomp_ctx->buffer, src, entry->length);
		src = acomp_ctx->buffer;
		zpool_unmap_handle(zpool, entry->handle);
	}

	sg_init_one(&input, src, entry->length);
	sg_init_table(&output, 1);
	sg_set_page(&output, page, PAGE_SIZE, 0);
	acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE);
	BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait));
	BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE);
	mutex_unlock(&acomp_ctx->mutex);

	if (zpool_can_sleep_mapped(zpool))
		zpool_unmap_handle(zpool, entry->handle);
}
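
/*
 * Illustrative note (not from the original source): if the zpool backend
 * cannot keep a handle mapped across the (possibly sleeping)
 * decompression, i.e. zpool_can_sleep_mapped() is false, the compressed
 * data is first copied into the per-CPU acomp_ctx->buffer allocated in
 * zswap_cpu_comp_prepare() and the handle is unmapped immediately;
 * otherwise the handle stays mapped until the decompression has finished.
 */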

/*********************************
* writeback code
**********************************/
/*
 * Attempts to free an entry by adding a folio to the swap cache,
 * decompressing the entry data into the folio, and issuing a
 * bio write to write the folio back to the swap device.
 *
 * This can be thought of as a "resumed writeback" of the folio
 * to the swap device. We are basically resuming the same swap
 * writeback path that was intercepted with the zswap_store()
 * in the first place. After the folio has been decompressed into
 * the swap cache, the compressed version stored by zswap can be
 * freed.
 */
static int zswap_writeback_entry(struct zswap_entry *entry,
				 struct zswap_tree *tree)
{
	swp_entry_t swpentry = entry->swpentry;
	struct folio *folio;
	struct mempolicy *mpol;
	bool folio_was_allocated;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,
	};

	/* try to allocate swap cache folio */
	mpol = get_task_policy(current);
	folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
				NO_INTERLEAVE_INDEX, &folio_was_allocated, true);
	if (!folio)
		return -ENOMEM;

	/*
	 * Found an existing folio, we raced with load/swapin. We generally
	 * writeback cold folios from zswap, and swapin means the folio just
	 * became hot. Skip this folio and let the caller find another one.
	 */
	if (!folio_was_allocated) {
		folio_put(folio);
		return -EEXIST;
	}

	/*
	 * folio is locked, and the swapcache is now secured against
	 * concurrent swapping to and from the slot. Verify that the
	 * swap entry hasn't been invalidated and recycled behind our
	 * backs (our zswap_entry reference doesn't prevent that), to
	 * avoid overwriting a new swap folio with old compressed data.
	 */
	spin_lock(&tree->lock);
	if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) {
		spin_unlock(&tree->lock);
		delete_from_swap_cache(folio);
		folio_unlock(folio);
		folio_put(folio);
		return -ENOMEM;
	}
	spin_unlock(&tree->lock);

	__zswap_load(entry, &folio->page);

	/* folio is up to date */
	folio_mark_uptodate(folio);

	/* move it to the tail of the inactive list after end_writeback */
	folio_set_reclaim(folio);

	/* start writeback */
	__swap_writepage(folio, &wbc);
	folio_put(folio);

	return 0;
}

static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
{
	unsigned long *page;
	unsigned long val;
	unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1;

	page = (unsigned long *)ptr;
	val = page[0];

	if (val != page[last_pos])
		return 0;

	for (pos = 1; pos < last_pos; pos++) {
		if (val != page[pos])
			return 0;
	}

	*value = val;

	return 1;
}

static void zswap_fill_page(void *ptr, unsigned long value)
{
	unsigned long *page;

	page = (unsigned long *)ptr;
	memset_l(page, value, PAGE_SIZE / sizeof(unsigned long));
}
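
/*
 * Worked example (illustrative, not from the original source, assuming a
 * 64-bit machine with 4 KiB pages): a page consisting of 512 copies of the
 * 8-byte word 0x00000000deadbeef passes zswap_is_page_same_filled() -
 * page[0] matches the last and every intermediate word - so only that
 * single word is stored in entry->value. On load, zswap_fill_page()
 * reconstructs the page with memset_l() instead of running the
 * decompressor.
 */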

bool zswap_store(struct folio *folio)
{
	swp_entry_t swp = folio->swap;
	int type = swp_type(swp);
	pgoff_t offset = swp_offset(swp);
	struct page *page = &folio->page;
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry, *dupentry;
	struct scatterlist input, output;
	struct crypto_acomp_ctx *acomp_ctx;
	struct obj_cgroup *objcg = NULL;
	struct mem_cgroup *memcg = NULL;
	struct zswap_pool *pool;
	struct zpool *zpool;
	unsigned int dlen = PAGE_SIZE;
	unsigned long handle, value;
	char *buf;
	u8 *src, *dst;
	gfp_t gfp;
	int ret;

	VM_WARN_ON_ONCE(!folio_test_locked(folio));
	VM_WARN_ON_ONCE(!folio_test_swapcache(folio));

	/* Large folios aren't supported */
	if (folio_test_large(folio))
		return false;

	if (!tree)
		return false;

	/*
	 * If this is a duplicate, it must be removed before attempting to store
	 * it, otherwise, if the store fails the old page won't be removed from
	 * the tree, and it might be written back overwriting the new data.
	 */
	spin_lock(&tree->lock);
	dupentry = zswap_rb_search(&tree->rbroot, offset);
	if (dupentry) {
		zswap_duplicate_entry++;
		zswap_invalidate_entry(tree, dupentry);
	}
	spin_unlock(&tree->lock);

	if (!zswap_enabled)
		return false;

	objcg = get_obj_cgroup_from_folio(folio);
	if (objcg && !obj_cgroup_may_zswap(objcg)) {
		memcg = get_mem_cgroup_from_objcg(objcg);
		if (shrink_memcg(memcg)) {
			mem_cgroup_put(memcg);
			goto reject;
		}
		mem_cgroup_put(memcg);
	}

	/* reclaim space if needed */
	if (zswap_is_full()) {
		zswap_pool_limit_hit++;
		zswap_pool_reached_full = true;
		goto shrink;
	}

	if (zswap_pool_reached_full) {
		if (!zswap_can_accept())
			goto shrink;
		else
			zswap_pool_reached_full = false;
	}

	/* allocate entry */
	entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
	if (!entry) {
		zswap_reject_kmemcache_fail++;
		goto reject;
	}

	if (zswap_same_filled_pages_enabled) {
		src = kmap_local_page(page);
		if (zswap_is_page_same_filled(src, &value)) {
			kunmap_local(src);
			entry->swpentry = swp_entry(type, offset);
			entry->length = 0;
			entry->value = value;
			atomic_inc(&zswap_same_filled_pages);
			goto insert_entry;
		}
		kunmap_local(src);
	}

	if (!zswap_non_same_filled_pages_enabled)
		goto freepage;

	/* if entry is successfully added, it keeps the reference */
	entry->pool = zswap_pool_current_get();
	if (!entry->pool)
		goto freepage;

	if (objcg) {
		memcg = get_mem_cgroup_from_objcg(objcg);
		if (memcg_list_lru_alloc(memcg, &entry->pool->list_lru, GFP_KERNEL)) {
			mem_cgroup_put(memcg);
			goto put_pool;
		}
		mem_cgroup_put(memcg);
	}

	/* compress */
	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);

	mutex_lock(&acomp_ctx->mutex);

	dst = acomp_ctx->buffer;
	sg_init_table(&input, 1);
	sg_set_page(&input, &folio->page, PAGE_SIZE, 0);

	/*
	 * We need PAGE_SIZE * 2 here since there may be an over-compression
	 * case, and hardware accelerators may not check the dst buffer size,
	 * so give the dst buffer enough length to avoid a buffer overflow.
	 */
	sg_init_one(&output, dst, PAGE_SIZE * 2);
	acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
	/*
	 * It may look a little silly that we send an asynchronous request and
	 * then wait for its completion synchronously; this makes the process
	 * synchronous in fact.
	 * Theoretically, acomp supports users sending multiple acomp requests
	 * in one acomp instance and then getting those requests done
	 * simultaneously. But in this case zswap actually stores and loads
	 * page by page; there is no existing method to send the second page
	 * before the first page is done in one thread doing zswap.
	 * In different threads running on different CPUs, however, we have
	 * different acomp instances, so multiple threads can do
	 * (de)compression in parallel.
	 */
	ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
	dlen = acomp_ctx->req->dlen;

	if (ret) {
		zswap_reject_compress_fail++;
		goto put_dstmem;
	}

	/* store */
	zpool = zswap_find_zpool(entry);
	gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
	if (zpool_malloc_support_movable(zpool))
		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
	ret = zpool_malloc(zpool, dlen, gfp, &handle);
	if (ret == -ENOSPC) {
		zswap_reject_compress_poor++;
		goto put_dstmem;
	}
	if (ret) {
		zswap_reject_alloc_fail++;
		goto put_dstmem;
	}
	buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
	memcpy(buf, dst, dlen);
	zpool_unmap_handle(zpool, handle);
	mutex_unlock(&acomp_ctx->mutex);

	/* populate entry */
	entry->swpentry = swp_entry(type, offset);
	entry->handle = handle;
	entry->length = dlen;

insert_entry:
	entry->objcg = objcg;
	if (objcg) {
		obj_cgroup_charge_zswap(objcg, entry->length);
		/* Account before objcg ref is moved to tree */
		count_objcg_event(objcg, ZSWPOUT);
	}

	/* map */
	spin_lock(&tree->lock);
	/*
	 * A duplicate entry should have been removed at the beginning of this
	 * function. Since the swap entry should be pinned, if a duplicate is
	 * found again here it means that something went wrong in the swap
	 * cache.
	 */
	while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) {
		WARN_ON(1);
		zswap_duplicate_entry++;
		zswap_invalidate_entry(tree, dupentry);
	}
	if (entry->length) {
		INIT_LIST_HEAD(&entry->lru);
		zswap_lru_add(&entry->pool->list_lru, entry);
		atomic_inc(&entry->pool->nr_stored);
	}
	spin_unlock(&tree->lock);

	/* update stats */
	atomic_inc(&zswap_stored_pages);
	zswap_update_total_size();
	count_vm_event(ZSWPOUT);

	return true;

put_dstmem:
	mutex_unlock(&acomp_ctx->mutex);
put_pool:
	zswap_pool_put(entry->pool);
freepage:
	zswap_entry_cache_free(entry);
reject:
	if (objcg)
		obj_cgroup_put(objcg);
	return false;

shrink:
	pool = zswap_pool_last_get();
	if (pool && !queue_work(shrink_wq, &pool->shrink_work))
		zswap_pool_put(pool);
	goto reject;
}

bool zswap_load(struct folio *folio)
{
	swp_entry_t swp = folio->swap;
	int type = swp_type(swp);
	pgoff_t offset = swp_offset(swp);
	struct page *page = &folio->page;
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry;
	u8 *dst;

	VM_WARN_ON_ONCE(!folio_test_locked(folio));

	/* find */
	spin_lock(&tree->lock);
	entry = zswap_entry_find_get(&tree->rbroot, offset);
	if (!entry) {
		spin_unlock(&tree->lock);
		return false;
	}
	spin_unlock(&tree->lock);

	if (entry->length)
		__zswap_load(entry, page);
	else {
		dst = kmap_local_page(page);
		zswap_fill_page(dst, entry->value);
		kunmap_local(dst);
	}

	count_vm_event(ZSWPIN);
	if (entry->objcg)
		count_objcg_event(entry->objcg, ZSWPIN);

	spin_lock(&tree->lock);
	if (zswap_exclusive_loads_enabled) {
		zswap_invalidate_entry(tree, entry);
		folio_mark_dirty(folio);
	} else if (entry->length) {
		zswap_lru_del(&entry->pool->list_lru, entry);
		zswap_lru_add(&entry->pool->list_lru, entry);
	}
	zswap_entry_put(tree, entry);
	spin_unlock(&tree->lock);

	return true;
}

void zswap_invalidate(int type, pgoff_t offset)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry;

	/* find */
	spin_lock(&tree->lock);
	entry = zswap_rb_search(&tree->rbroot, offset);
	if (!entry) {
		/* entry was written back */
		spin_unlock(&tree->lock);
		return;
	}
	zswap_invalidate_entry(tree, entry);
	spin_unlock(&tree->lock);
}

void zswap_swapon(int type)
{
	struct zswap_tree *tree;

	tree = kzalloc(sizeof(*tree), GFP_KERNEL);
	if (!tree) {
		pr_err("alloc failed, zswap disabled for swap type %d\n", type);
		return;
	}

	tree->rbroot = RB_ROOT;
	spin_lock_init(&tree->lock);
	zswap_trees[type] = tree;
}

void zswap_swapoff(int type)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry, *n;

	if (!tree)
		return;

	/* walk the tree and free everything */
	spin_lock(&tree->lock);
	rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
		zswap_free_entry(entry);
	tree->rbroot = RB_ROOT;
	spin_unlock(&tree->lock);
	kfree(tree);
	zswap_trees[type] = NULL;
}

/*********************************
* debugfs functions
**********************************/
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>

static struct dentry *zswap_debugfs_root;

static int zswap_debugfs_init(void)
{
	if (!debugfs_initialized())
		return -ENODEV;

	zswap_debugfs_root = debugfs_create_dir("zswap", NULL);

	debugfs_create_u64("pool_limit_hit", 0444,
			   zswap_debugfs_root, &zswap_pool_limit_hit);
	debugfs_create_u64("reject_reclaim_fail", 0444,
			   zswap_debugfs_root, &zswap_reject_reclaim_fail);
	debugfs_create_u64("reject_alloc_fail", 0444,
			   zswap_debugfs_root, &zswap_reject_alloc_fail);
	debugfs_create_u64("reject_kmemcache_fail", 0444,
			   zswap_debugfs_root, &zswap_reject_kmemcache_fail);
	debugfs_create_u64("reject_compress_fail", 0444,
			   zswap_debugfs_root, &zswap_reject_compress_fail);
	debugfs_create_u64("reject_compress_poor", 0444,
			   zswap_debugfs_root, &zswap_reject_compress_poor);
	debugfs_create_u64("written_back_pages", 0444,
			   zswap_debugfs_root, &zswap_written_back_pages);
	debugfs_create_u64("duplicate_entry", 0444,
			   zswap_debugfs_root, &zswap_duplicate_entry);
	debugfs_create_u64("pool_total_size", 0444,
			   zswap_debugfs_root, &zswap_pool_total_size);
	debugfs_create_atomic_t("stored_pages", 0444,
				zswap_debugfs_root, &zswap_stored_pages);
	debugfs_create_atomic_t("same_filled_pages", 0444,
				zswap_debugfs_root, &zswap_same_filled_pages);

	return 0;
}
#else
static int zswap_debugfs_init(void)
{
	return 0;
}
#endif

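/*
 * Usage note (illustrative, not from the original source): with
 * CONFIG_DEBUG_FS enabled and debugfs mounted at its usual location, the
 * counters registered above appear under /sys/kernel/debug/zswap/, e.g.:
 *
 *	cat /sys/kernel/debug/zswap/pool_total_size
 *	cat /sys/kernel/debug/zswap/written_back_pages
 */
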
/*********************************
* module init and exit
**********************************/
static int zswap_setup(void)
{
	struct zswap_pool *pool;
	int ret;

	zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
	if (!zswap_entry_cache) {
		pr_err("entry cache creation failed\n");
		goto cache_fail;
	}

	ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE,
				      "mm/zswap_pool:prepare",
				      zswap_cpu_comp_prepare,
				      zswap_cpu_comp_dead);
	if (ret)
		goto hp_fail;

	pool = __zswap_pool_create_fallback();
	if (pool) {
		pr_info("loaded using pool %s/%s\n", pool->tfm_name,
			zpool_get_type(pool->zpools[0]));
		list_add(&pool->list, &zswap_pools);
		zswap_has_pool = true;
	} else {
		pr_err("pool creation failed\n");
		zswap_enabled = false;
	}

	shrink_wq = create_workqueue("zswap-shrink");
	if (!shrink_wq)
		goto fallback_fail;

	if (zswap_debugfs_init())
		pr_warn("debugfs initialization failed\n");
	zswap_init_state = ZSWAP_INIT_SUCCEED;
	return 0;

fallback_fail:
	if (pool)
		zswap_pool_destroy(pool);
hp_fail:
	kmem_cache_destroy(zswap_entry_cache);
cache_fail:
	/* if built-in, we aren't unloaded on failure; don't allow use */
	zswap_init_state = ZSWAP_INIT_FAILED;
	zswap_enabled = false;
	return -ENOMEM;
}

static int __init zswap_init(void)
{
	if (!zswap_enabled)
		return 0;
	return zswap_setup();
}
/* must be late so crypto has time to come up */
late_initcall(zswap_init);

MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>");
MODULE_DESCRIPTION("Compressed cache for swap pages");