1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * zswap.c - zswap driver file 4 * 5 * zswap is a cache that takes pages that are in the process 6 * of being swapped out and attempts to compress and store them in a 7 * RAM-based memory pool. This can result in a significant I/O reduction on 8 * the swap device and, in the case where decompressing from RAM is faster 9 * than reading from the swap device, can also improve workload performance. 10 * 11 * Copyright (C) 2012 Seth Jennings <sjenning@linux.vnet.ibm.com> 12 */ 13 14 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 15 16 #include <linux/module.h> 17 #include <linux/cpu.h> 18 #include <linux/highmem.h> 19 #include <linux/slab.h> 20 #include <linux/spinlock.h> 21 #include <linux/types.h> 22 #include <linux/atomic.h> 23 #include <linux/rbtree.h> 24 #include <linux/swap.h> 25 #include <linux/crypto.h> 26 #include <linux/scatterlist.h> 27 #include <linux/mempolicy.h> 28 #include <linux/mempool.h> 29 #include <linux/zpool.h> 30 #include <crypto/acompress.h> 31 #include <linux/zswap.h> 32 #include <linux/mm_types.h> 33 #include <linux/page-flags.h> 34 #include <linux/swapops.h> 35 #include <linux/writeback.h> 36 #include <linux/pagemap.h> 37 #include <linux/workqueue.h> 38 #include <linux/list_lru.h> 39 40 #include "swap.h" 41 #include "internal.h" 42 43 /********************************* 44 * statistics 45 **********************************/ 46 /* Total bytes used by the compressed storage */ 47 u64 zswap_pool_total_size; 48 /* The number of compressed pages currently stored in zswap */ 49 atomic_t zswap_stored_pages = ATOMIC_INIT(0); 50 /* The number of same-value filled pages currently stored in zswap */ 51 static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0); 52 53 /* 54 * The statistics below are not protected from concurrent access for 55 * performance reasons so they may not be a 100% accurate. However, 56 * they do provide useful information on roughly how many times a 57 * certain event is occurring. 58 */ 59 60 /* Pool limit was hit (see zswap_max_pool_percent) */ 61 static u64 zswap_pool_limit_hit; 62 /* Pages written back when pool limit was reached */ 63 static u64 zswap_written_back_pages; 64 /* Store failed due to a reclaim failure after pool limit was reached */ 65 static u64 zswap_reject_reclaim_fail; 66 /* Store failed due to compression algorithm failure */ 67 static u64 zswap_reject_compress_fail; 68 /* Compressed page was too big for the allocator to (optimally) store */ 69 static u64 zswap_reject_compress_poor; 70 /* Store failed because underlying allocator could not get memory */ 71 static u64 zswap_reject_alloc_fail; 72 /* Store failed because the entry metadata could not be allocated (rare) */ 73 static u64 zswap_reject_kmemcache_fail; 74 /* Duplicate store was encountered (rare) */ 75 static u64 zswap_duplicate_entry; 76 77 /* Shrinker work queue */ 78 static struct workqueue_struct *shrink_wq; 79 /* Pool limit was hit, we need to calm down */ 80 static bool zswap_pool_reached_full; 81 82 /********************************* 83 * tunables 84 **********************************/ 85 86 #define ZSWAP_PARAM_UNSET "" 87 88 static int zswap_setup(void); 89 90 /* Enable/disable zswap */ 91 static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON); 92 static int zswap_enabled_param_set(const char *, 93 const struct kernel_param *); 94 static const struct kernel_param_ops zswap_enabled_param_ops = { 95 .set = zswap_enabled_param_set, 96 .get = param_get_bool, 97 }; 98 module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644); 99 100 /* Crypto compressor to use */ 101 static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT; 102 static int zswap_compressor_param_set(const char *, 103 const struct kernel_param *); 104 static const struct kernel_param_ops zswap_compressor_param_ops = { 105 .set = zswap_compressor_param_set, 106 .get = param_get_charp, 107 .free = param_free_charp, 108 }; 109 module_param_cb(compressor, &zswap_compressor_param_ops, 110 &zswap_compressor, 0644); 111 112 /* Compressed storage zpool to use */ 113 static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT; 114 static int zswap_zpool_param_set(const char *, const struct kernel_param *); 115 static const struct kernel_param_ops zswap_zpool_param_ops = { 116 .set = zswap_zpool_param_set, 117 .get = param_get_charp, 118 .free = param_free_charp, 119 }; 120 module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644); 121 122 /* The maximum percentage of memory that the compressed pool can occupy */ 123 static unsigned int zswap_max_pool_percent = 20; 124 module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644); 125 126 /* The threshold for accepting new pages after the max_pool_percent was hit */ 127 static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */ 128 module_param_named(accept_threshold_percent, zswap_accept_thr_percent, 129 uint, 0644); 130 131 /* 132 * Enable/disable handling same-value filled pages (enabled by default). 133 * If disabled every page is considered non-same-value filled. 134 */ 135 static bool zswap_same_filled_pages_enabled = true; 136 module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled, 137 bool, 0644); 138 139 /* Enable/disable handling non-same-value filled pages (enabled by default) */ 140 static bool zswap_non_same_filled_pages_enabled = true; 141 module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled, 142 bool, 0644); 143 144 static bool zswap_exclusive_loads_enabled = IS_ENABLED( 145 CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON); 146 module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644); 147 148 /* Number of zpools in zswap_pool (empirically determined for scalability) */ 149 #define ZSWAP_NR_ZPOOLS 32 150 151 /* Enable/disable memory pressure-based shrinker. */ 152 static bool zswap_shrinker_enabled = IS_ENABLED( 153 CONFIG_ZSWAP_SHRINKER_DEFAULT_ON); 154 module_param_named(shrinker_enabled, zswap_shrinker_enabled, bool, 0644); 155 156 bool is_zswap_enabled(void) 157 { 158 return zswap_enabled; 159 } 160 161 /********************************* 162 * data structures 163 **********************************/ 164 165 struct crypto_acomp_ctx { 166 struct crypto_acomp *acomp; 167 struct acomp_req *req; 168 struct crypto_wait wait; 169 u8 *buffer; 170 struct mutex mutex; 171 }; 172 173 /* 174 * The lock ordering is zswap_tree.lock -> zswap_pool.lru_lock. 175 * The only case where lru_lock is not acquired while holding tree.lock is 176 * when a zswap_entry is taken off the lru for writeback, in that case it 177 * needs to be verified that it's still valid in the tree. 178 */ 179 struct zswap_pool { 180 struct zpool *zpools[ZSWAP_NR_ZPOOLS]; 181 struct crypto_acomp_ctx __percpu *acomp_ctx; 182 struct kref kref; 183 struct list_head list; 184 struct work_struct release_work; 185 struct work_struct shrink_work; 186 struct hlist_node node; 187 char tfm_name[CRYPTO_MAX_ALG_NAME]; 188 struct list_lru list_lru; 189 struct mem_cgroup *next_shrink; 190 struct shrinker *shrinker; 191 atomic_t nr_stored; 192 }; 193 194 /* 195 * struct zswap_entry 196 * 197 * This structure contains the metadata for tracking a single compressed 198 * page within zswap. 199 * 200 * rbnode - links the entry into red-black tree for the appropriate swap type 201 * swpentry - associated swap entry, the offset indexes into the red-black tree 202 * refcount - the number of outstanding reference to the entry. This is needed 203 * to protect against premature freeing of the entry by code 204 * concurrent calls to load, invalidate, and writeback. The lock 205 * for the zswap_tree structure that contains the entry must 206 * be held while changing the refcount. Since the lock must 207 * be held, there is no reason to also make refcount atomic. 208 * length - the length in bytes of the compressed page data. Needed during 209 * decompression. For a same value filled page length is 0, and both 210 * pool and lru are invalid and must be ignored. 211 * pool - the zswap_pool the entry's data is in 212 * handle - zpool allocation handle that stores the compressed page data 213 * value - value of the same-value filled pages which have same content 214 * objcg - the obj_cgroup that the compressed memory is charged to 215 * lru - handle to the pool's lru used to evict pages. 216 */ 217 struct zswap_entry { 218 struct rb_node rbnode; 219 swp_entry_t swpentry; 220 int refcount; 221 unsigned int length; 222 struct zswap_pool *pool; 223 union { 224 unsigned long handle; 225 unsigned long value; 226 }; 227 struct obj_cgroup *objcg; 228 struct list_head lru; 229 }; 230 231 /* 232 * The tree lock in the zswap_tree struct protects a few things: 233 * - the rbtree 234 * - the refcount field of each entry in the tree 235 */ 236 struct zswap_tree { 237 struct rb_root rbroot; 238 spinlock_t lock; 239 }; 240 241 static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; 242 243 /* RCU-protected iteration */ 244 static LIST_HEAD(zswap_pools); 245 /* protects zswap_pools list modification */ 246 static DEFINE_SPINLOCK(zswap_pools_lock); 247 /* pool counter to provide unique names to zpool */ 248 static atomic_t zswap_pools_count = ATOMIC_INIT(0); 249 250 enum zswap_init_type { 251 ZSWAP_UNINIT, 252 ZSWAP_INIT_SUCCEED, 253 ZSWAP_INIT_FAILED 254 }; 255 256 static enum zswap_init_type zswap_init_state; 257 258 /* used to ensure the integrity of initialization */ 259 static DEFINE_MUTEX(zswap_init_lock); 260 261 /* init completed, but couldn't create the initial pool */ 262 static bool zswap_has_pool; 263 264 /********************************* 265 * helpers and fwd declarations 266 **********************************/ 267 268 #define zswap_pool_debug(msg, p) \ 269 pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ 270 zpool_get_type((p)->zpools[0])) 271 272 static int zswap_writeback_entry(struct zswap_entry *entry, 273 struct zswap_tree *tree); 274 static int zswap_pool_get(struct zswap_pool *pool); 275 static void zswap_pool_put(struct zswap_pool *pool); 276 277 static bool zswap_is_full(void) 278 { 279 return totalram_pages() * zswap_max_pool_percent / 100 < 280 DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); 281 } 282 283 static bool zswap_can_accept(void) 284 { 285 return totalram_pages() * zswap_accept_thr_percent / 100 * 286 zswap_max_pool_percent / 100 > 287 DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); 288 } 289 290 static u64 get_zswap_pool_size(struct zswap_pool *pool) 291 { 292 u64 pool_size = 0; 293 int i; 294 295 for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) 296 pool_size += zpool_get_total_size(pool->zpools[i]); 297 298 return pool_size; 299 } 300 301 static void zswap_update_total_size(void) 302 { 303 struct zswap_pool *pool; 304 u64 total = 0; 305 306 rcu_read_lock(); 307 308 list_for_each_entry_rcu(pool, &zswap_pools, list) 309 total += get_zswap_pool_size(pool); 310 311 rcu_read_unlock(); 312 313 zswap_pool_total_size = total; 314 } 315 316 /* should be called under RCU */ 317 #ifdef CONFIG_MEMCG 318 static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) 319 { 320 return entry->objcg ? obj_cgroup_memcg(entry->objcg) : NULL; 321 } 322 #else 323 static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) 324 { 325 return NULL; 326 } 327 #endif 328 329 static inline int entry_to_nid(struct zswap_entry *entry) 330 { 331 return page_to_nid(virt_to_page(entry)); 332 } 333 334 void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) 335 { 336 struct zswap_pool *pool; 337 338 /* lock out zswap pools list modification */ 339 spin_lock(&zswap_pools_lock); 340 list_for_each_entry(pool, &zswap_pools, list) { 341 if (pool->next_shrink == memcg) 342 pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); 343 } 344 spin_unlock(&zswap_pools_lock); 345 } 346 347 /********************************* 348 * zswap entry functions 349 **********************************/ 350 static struct kmem_cache *zswap_entry_cache; 351 352 static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) 353 { 354 struct zswap_entry *entry; 355 entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); 356 if (!entry) 357 return NULL; 358 entry->refcount = 1; 359 RB_CLEAR_NODE(&entry->rbnode); 360 return entry; 361 } 362 363 static void zswap_entry_cache_free(struct zswap_entry *entry) 364 { 365 kmem_cache_free(zswap_entry_cache, entry); 366 } 367 368 /********************************* 369 * zswap lruvec functions 370 **********************************/ 371 void zswap_lruvec_state_init(struct lruvec *lruvec) 372 { 373 atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); 374 } 375 376 void zswap_folio_swapin(struct folio *folio) 377 { 378 struct lruvec *lruvec; 379 380 if (folio) { 381 lruvec = folio_lruvec(folio); 382 atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); 383 } 384 } 385 386 /********************************* 387 * lru functions 388 **********************************/ 389 static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) 390 { 391 atomic_long_t *nr_zswap_protected; 392 unsigned long lru_size, old, new; 393 int nid = entry_to_nid(entry); 394 struct mem_cgroup *memcg; 395 struct lruvec *lruvec; 396 397 /* 398 * Note that it is safe to use rcu_read_lock() here, even in the face of 399 * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection 400 * used in list_lru lookup, only two scenarios are possible: 401 * 402 * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The 403 * new entry will be reparented to memcg's parent's list_lru. 404 * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The 405 * new entry will be added directly to memcg's parent's list_lru. 406 * 407 * Similar reasoning holds for list_lru_del() and list_lru_putback(). 408 */ 409 rcu_read_lock(); 410 memcg = mem_cgroup_from_entry(entry); 411 /* will always succeed */ 412 list_lru_add(list_lru, &entry->lru, nid, memcg); 413 414 /* Update the protection area */ 415 lru_size = list_lru_count_one(list_lru, nid, memcg); 416 lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); 417 nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected; 418 old = atomic_long_inc_return(nr_zswap_protected); 419 /* 420 * Decay to avoid overflow and adapt to changing workloads. 421 * This is based on LRU reclaim cost decaying heuristics. 422 */ 423 do { 424 new = old > lru_size / 4 ? old / 2 : old; 425 } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new)); 426 rcu_read_unlock(); 427 } 428 429 static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry) 430 { 431 int nid = entry_to_nid(entry); 432 struct mem_cgroup *memcg; 433 434 rcu_read_lock(); 435 memcg = mem_cgroup_from_entry(entry); 436 /* will always succeed */ 437 list_lru_del(list_lru, &entry->lru, nid, memcg); 438 rcu_read_unlock(); 439 } 440 441 static void zswap_lru_putback(struct list_lru *list_lru, 442 struct zswap_entry *entry) 443 { 444 int nid = entry_to_nid(entry); 445 spinlock_t *lock = &list_lru->node[nid].lock; 446 struct mem_cgroup *memcg; 447 struct lruvec *lruvec; 448 449 rcu_read_lock(); 450 memcg = mem_cgroup_from_entry(entry); 451 spin_lock(lock); 452 /* we cannot use list_lru_add here, because it increments node's lru count */ 453 list_lru_putback(list_lru, &entry->lru, nid, memcg); 454 spin_unlock(lock); 455 456 lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(entry_to_nid(entry))); 457 /* increment the protection area to account for the LRU rotation. */ 458 atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); 459 rcu_read_unlock(); 460 } 461 462 /********************************* 463 * rbtree functions 464 **********************************/ 465 static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset) 466 { 467 struct rb_node *node = root->rb_node; 468 struct zswap_entry *entry; 469 pgoff_t entry_offset; 470 471 while (node) { 472 entry = rb_entry(node, struct zswap_entry, rbnode); 473 entry_offset = swp_offset(entry->swpentry); 474 if (entry_offset > offset) 475 node = node->rb_left; 476 else if (entry_offset < offset) 477 node = node->rb_right; 478 else 479 return entry; 480 } 481 return NULL; 482 } 483 484 /* 485 * In the case that a entry with the same offset is found, a pointer to 486 * the existing entry is stored in dupentry and the function returns -EEXIST 487 */ 488 static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, 489 struct zswap_entry **dupentry) 490 { 491 struct rb_node **link = &root->rb_node, *parent = NULL; 492 struct zswap_entry *myentry; 493 pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry); 494 495 while (*link) { 496 parent = *link; 497 myentry = rb_entry(parent, struct zswap_entry, rbnode); 498 myentry_offset = swp_offset(myentry->swpentry); 499 if (myentry_offset > entry_offset) 500 link = &(*link)->rb_left; 501 else if (myentry_offset < entry_offset) 502 link = &(*link)->rb_right; 503 else { 504 *dupentry = myentry; 505 return -EEXIST; 506 } 507 } 508 rb_link_node(&entry->rbnode, parent, link); 509 rb_insert_color(&entry->rbnode, root); 510 return 0; 511 } 512 513 static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) 514 { 515 if (!RB_EMPTY_NODE(&entry->rbnode)) { 516 rb_erase(&entry->rbnode, root); 517 RB_CLEAR_NODE(&entry->rbnode); 518 return true; 519 } 520 return false; 521 } 522 523 static struct zpool *zswap_find_zpool(struct zswap_entry *entry) 524 { 525 int i = 0; 526 527 if (ZSWAP_NR_ZPOOLS > 1) 528 i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS)); 529 530 return entry->pool->zpools[i]; 531 } 532 533 /* 534 * Carries out the common pattern of freeing and entry's zpool allocation, 535 * freeing the entry itself, and decrementing the number of stored pages. 536 */ 537 static void zswap_free_entry(struct zswap_entry *entry) 538 { 539 if (!entry->length) 540 atomic_dec(&zswap_same_filled_pages); 541 else { 542 zswap_lru_del(&entry->pool->list_lru, entry); 543 zpool_free(zswap_find_zpool(entry), entry->handle); 544 atomic_dec(&entry->pool->nr_stored); 545 zswap_pool_put(entry->pool); 546 } 547 if (entry->objcg) { 548 obj_cgroup_uncharge_zswap(entry->objcg, entry->length); 549 obj_cgroup_put(entry->objcg); 550 } 551 zswap_entry_cache_free(entry); 552 atomic_dec(&zswap_stored_pages); 553 zswap_update_total_size(); 554 } 555 556 /* caller must hold the tree lock */ 557 static void zswap_entry_get(struct zswap_entry *entry) 558 { 559 entry->refcount++; 560 } 561 562 /* caller must hold the tree lock 563 * remove from the tree and free it, if nobody reference the entry 564 */ 565 static void zswap_entry_put(struct zswap_tree *tree, 566 struct zswap_entry *entry) 567 { 568 int refcount = --entry->refcount; 569 570 WARN_ON_ONCE(refcount < 0); 571 if (refcount == 0) { 572 WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode)); 573 zswap_free_entry(entry); 574 } 575 } 576 577 /* caller must hold the tree lock */ 578 static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, 579 pgoff_t offset) 580 { 581 struct zswap_entry *entry; 582 583 entry = zswap_rb_search(root, offset); 584 if (entry) 585 zswap_entry_get(entry); 586 587 return entry; 588 } 589 590 /********************************* 591 * shrinker functions 592 **********************************/ 593 static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, 594 spinlock_t *lock, void *arg); 595 596 static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, 597 struct shrink_control *sc) 598 { 599 struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); 600 unsigned long shrink_ret, nr_protected, lru_size; 601 struct zswap_pool *pool = shrinker->private_data; 602 bool encountered_page_in_swapcache = false; 603 604 if (!zswap_shrinker_enabled || 605 !mem_cgroup_zswap_writeback_enabled(sc->memcg)) { 606 sc->nr_scanned = 0; 607 return SHRINK_STOP; 608 } 609 610 nr_protected = 611 atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); 612 lru_size = list_lru_shrink_count(&pool->list_lru, sc); 613 614 /* 615 * Abort if we are shrinking into the protected region. 616 * 617 * This short-circuiting is necessary because if we have too many multiple 618 * concurrent reclaimers getting the freeable zswap object counts at the 619 * same time (before any of them made reasonable progress), the total 620 * number of reclaimed objects might be more than the number of unprotected 621 * objects (i.e the reclaimers will reclaim into the protected area of the 622 * zswap LRU). 623 */ 624 if (nr_protected >= lru_size - sc->nr_to_scan) { 625 sc->nr_scanned = 0; 626 return SHRINK_STOP; 627 } 628 629 shrink_ret = list_lru_shrink_walk(&pool->list_lru, sc, &shrink_memcg_cb, 630 &encountered_page_in_swapcache); 631 632 if (encountered_page_in_swapcache) 633 return SHRINK_STOP; 634 635 return shrink_ret ? shrink_ret : SHRINK_STOP; 636 } 637 638 static unsigned long zswap_shrinker_count(struct shrinker *shrinker, 639 struct shrink_control *sc) 640 { 641 struct zswap_pool *pool = shrinker->private_data; 642 struct mem_cgroup *memcg = sc->memcg; 643 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid)); 644 unsigned long nr_backing, nr_stored, nr_freeable, nr_protected; 645 646 if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg)) 647 return 0; 648 649 #ifdef CONFIG_MEMCG_KMEM 650 mem_cgroup_flush_stats(memcg); 651 nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT; 652 nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); 653 #else 654 /* use pool stats instead of memcg stats */ 655 nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT; 656 nr_stored = atomic_read(&pool->nr_stored); 657 #endif 658 659 if (!nr_stored) 660 return 0; 661 662 nr_protected = 663 atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); 664 nr_freeable = list_lru_shrink_count(&pool->list_lru, sc); 665 /* 666 * Subtract the lru size by an estimate of the number of pages 667 * that should be protected. 668 */ 669 nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0; 670 671 /* 672 * Scale the number of freeable pages by the memory saving factor. 673 * This ensures that the better zswap compresses memory, the fewer 674 * pages we will evict to swap (as it will otherwise incur IO for 675 * relatively small memory saving). 676 */ 677 return mult_frac(nr_freeable, nr_backing, nr_stored); 678 } 679 680 static void zswap_alloc_shrinker(struct zswap_pool *pool) 681 { 682 pool->shrinker = 683 shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap"); 684 if (!pool->shrinker) 685 return; 686 687 pool->shrinker->private_data = pool; 688 pool->shrinker->scan_objects = zswap_shrinker_scan; 689 pool->shrinker->count_objects = zswap_shrinker_count; 690 pool->shrinker->batch = 0; 691 pool->shrinker->seeks = DEFAULT_SEEKS; 692 } 693 694 /********************************* 695 * per-cpu code 696 **********************************/ 697 static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) 698 { 699 struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); 700 struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); 701 struct crypto_acomp *acomp; 702 struct acomp_req *req; 703 int ret; 704 705 mutex_init(&acomp_ctx->mutex); 706 707 acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); 708 if (!acomp_ctx->buffer) 709 return -ENOMEM; 710 711 acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); 712 if (IS_ERR(acomp)) { 713 pr_err("could not alloc crypto acomp %s : %ld\n", 714 pool->tfm_name, PTR_ERR(acomp)); 715 ret = PTR_ERR(acomp); 716 goto acomp_fail; 717 } 718 acomp_ctx->acomp = acomp; 719 720 req = acomp_request_alloc(acomp_ctx->acomp); 721 if (!req) { 722 pr_err("could not alloc crypto acomp_request %s\n", 723 pool->tfm_name); 724 ret = -ENOMEM; 725 goto req_fail; 726 } 727 acomp_ctx->req = req; 728 729 crypto_init_wait(&acomp_ctx->wait); 730 /* 731 * if the backend of acomp is async zip, crypto_req_done() will wakeup 732 * crypto_wait_req(); if the backend of acomp is scomp, the callback 733 * won't be called, crypto_wait_req() will return without blocking. 734 */ 735 acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, 736 crypto_req_done, &acomp_ctx->wait); 737 738 return 0; 739 740 req_fail: 741 crypto_free_acomp(acomp_ctx->acomp); 742 acomp_fail: 743 kfree(acomp_ctx->buffer); 744 return ret; 745 } 746 747 static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) 748 { 749 struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); 750 struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); 751 752 if (!IS_ERR_OR_NULL(acomp_ctx)) { 753 if (!IS_ERR_OR_NULL(acomp_ctx->req)) 754 acomp_request_free(acomp_ctx->req); 755 if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) 756 crypto_free_acomp(acomp_ctx->acomp); 757 kfree(acomp_ctx->buffer); 758 } 759 760 return 0; 761 } 762 763 /********************************* 764 * pool functions 765 **********************************/ 766 767 static struct zswap_pool *__zswap_pool_current(void) 768 { 769 struct zswap_pool *pool; 770 771 pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list); 772 WARN_ONCE(!pool && zswap_has_pool, 773 "%s: no page storage pool!\n", __func__); 774 775 return pool; 776 } 777 778 static struct zswap_pool *zswap_pool_current(void) 779 { 780 assert_spin_locked(&zswap_pools_lock); 781 782 return __zswap_pool_current(); 783 } 784 785 static struct zswap_pool *zswap_pool_current_get(void) 786 { 787 struct zswap_pool *pool; 788 789 rcu_read_lock(); 790 791 pool = __zswap_pool_current(); 792 if (!zswap_pool_get(pool)) 793 pool = NULL; 794 795 rcu_read_unlock(); 796 797 return pool; 798 } 799 800 static struct zswap_pool *zswap_pool_last_get(void) 801 { 802 struct zswap_pool *pool, *last = NULL; 803 804 rcu_read_lock(); 805 806 list_for_each_entry_rcu(pool, &zswap_pools, list) 807 last = pool; 808 WARN_ONCE(!last && zswap_has_pool, 809 "%s: no page storage pool!\n", __func__); 810 if (!zswap_pool_get(last)) 811 last = NULL; 812 813 rcu_read_unlock(); 814 815 return last; 816 } 817 818 /* type and compressor must be null-terminated */ 819 static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) 820 { 821 struct zswap_pool *pool; 822 823 assert_spin_locked(&zswap_pools_lock); 824 825 list_for_each_entry_rcu(pool, &zswap_pools, list) { 826 if (strcmp(pool->tfm_name, compressor)) 827 continue; 828 /* all zpools share the same type */ 829 if (strcmp(zpool_get_type(pool->zpools[0]), type)) 830 continue; 831 /* if we can't get it, it's about to be destroyed */ 832 if (!zswap_pool_get(pool)) 833 continue; 834 return pool; 835 } 836 837 return NULL; 838 } 839 840 /* 841 * If the entry is still valid in the tree, drop the initial ref and remove it 842 * from the tree. This function must be called with an additional ref held, 843 * otherwise it may race with another invalidation freeing the entry. 844 */ 845 static void zswap_invalidate_entry(struct zswap_tree *tree, 846 struct zswap_entry *entry) 847 { 848 if (zswap_rb_erase(&tree->rbroot, entry)) 849 zswap_entry_put(tree, entry); 850 } 851 852 static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, 853 spinlock_t *lock, void *arg) 854 { 855 struct zswap_entry *entry = container_of(item, struct zswap_entry, lru); 856 bool *encountered_page_in_swapcache = (bool *)arg; 857 struct zswap_tree *tree; 858 pgoff_t swpoffset; 859 enum lru_status ret = LRU_REMOVED_RETRY; 860 int writeback_result; 861 862 /* 863 * Once the lru lock is dropped, the entry might get freed. The 864 * swpoffset is copied to the stack, and entry isn't deref'd again 865 * until the entry is verified to still be alive in the tree. 866 */ 867 swpoffset = swp_offset(entry->swpentry); 868 tree = zswap_trees[swp_type(entry->swpentry)]; 869 list_lru_isolate(l, item); 870 /* 871 * It's safe to drop the lock here because we return either 872 * LRU_REMOVED_RETRY or LRU_RETRY. 873 */ 874 spin_unlock(lock); 875 876 /* Check for invalidate() race */ 877 spin_lock(&tree->lock); 878 if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) 879 goto unlock; 880 881 /* Hold a reference to prevent a free during writeback */ 882 zswap_entry_get(entry); 883 spin_unlock(&tree->lock); 884 885 writeback_result = zswap_writeback_entry(entry, tree); 886 887 spin_lock(&tree->lock); 888 if (writeback_result) { 889 zswap_reject_reclaim_fail++; 890 zswap_lru_putback(&entry->pool->list_lru, entry); 891 ret = LRU_RETRY; 892 893 /* 894 * Encountering a page already in swap cache is a sign that we are shrinking 895 * into the warmer region. We should terminate shrinking (if we're in the dynamic 896 * shrinker context). 897 */ 898 if (writeback_result == -EEXIST && encountered_page_in_swapcache) 899 *encountered_page_in_swapcache = true; 900 901 goto put_unlock; 902 } 903 zswap_written_back_pages++; 904 905 if (entry->objcg) 906 count_objcg_event(entry->objcg, ZSWPWB); 907 908 count_vm_event(ZSWPWB); 909 /* 910 * Writeback started successfully, the page now belongs to the 911 * swapcache. Drop the entry from zswap - unless invalidate already 912 * took it out while we had the tree->lock released for IO. 913 */ 914 zswap_invalidate_entry(tree, entry); 915 916 put_unlock: 917 /* Drop local reference */ 918 zswap_entry_put(tree, entry); 919 unlock: 920 spin_unlock(&tree->lock); 921 spin_lock(lock); 922 return ret; 923 } 924 925 static int shrink_memcg(struct mem_cgroup *memcg) 926 { 927 struct zswap_pool *pool; 928 int nid, shrunk = 0; 929 930 if (!mem_cgroup_zswap_writeback_enabled(memcg)) 931 return -EINVAL; 932 933 /* 934 * Skip zombies because their LRUs are reparented and we would be 935 * reclaiming from the parent instead of the dead memcg. 936 */ 937 if (memcg && !mem_cgroup_online(memcg)) 938 return -ENOENT; 939 940 pool = zswap_pool_current_get(); 941 if (!pool) 942 return -EINVAL; 943 944 for_each_node_state(nid, N_NORMAL_MEMORY) { 945 unsigned long nr_to_walk = 1; 946 947 shrunk += list_lru_walk_one(&pool->list_lru, nid, memcg, 948 &shrink_memcg_cb, NULL, &nr_to_walk); 949 } 950 zswap_pool_put(pool); 951 return shrunk ? 0 : -EAGAIN; 952 } 953 954 static void shrink_worker(struct work_struct *w) 955 { 956 struct zswap_pool *pool = container_of(w, typeof(*pool), 957 shrink_work); 958 struct mem_cgroup *memcg; 959 int ret, failures = 0; 960 961 /* global reclaim will select cgroup in a round-robin fashion. */ 962 do { 963 spin_lock(&zswap_pools_lock); 964 pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); 965 memcg = pool->next_shrink; 966 967 /* 968 * We need to retry if we have gone through a full round trip, or if we 969 * got an offline memcg (or else we risk undoing the effect of the 970 * zswap memcg offlining cleanup callback). This is not catastrophic 971 * per se, but it will keep the now offlined memcg hostage for a while. 972 * 973 * Note that if we got an online memcg, we will keep the extra 974 * reference in case the original reference obtained by mem_cgroup_iter 975 * is dropped by the zswap memcg offlining callback, ensuring that the 976 * memcg is not killed when we are reclaiming. 977 */ 978 if (!memcg) { 979 spin_unlock(&zswap_pools_lock); 980 if (++failures == MAX_RECLAIM_RETRIES) 981 break; 982 983 goto resched; 984 } 985 986 if (!mem_cgroup_tryget_online(memcg)) { 987 /* drop the reference from mem_cgroup_iter() */ 988 mem_cgroup_iter_break(NULL, memcg); 989 pool->next_shrink = NULL; 990 spin_unlock(&zswap_pools_lock); 991 992 if (++failures == MAX_RECLAIM_RETRIES) 993 break; 994 995 goto resched; 996 } 997 spin_unlock(&zswap_pools_lock); 998 999 ret = shrink_memcg(memcg); 1000 /* drop the extra reference */ 1001 mem_cgroup_put(memcg); 1002 1003 if (ret == -EINVAL) 1004 break; 1005 if (ret && ++failures == MAX_RECLAIM_RETRIES) 1006 break; 1007 1008 resched: 1009 cond_resched(); 1010 } while (!zswap_can_accept()); 1011 zswap_pool_put(pool); 1012 } 1013 1014 static struct zswap_pool *zswap_pool_create(char *type, char *compressor) 1015 { 1016 int i; 1017 struct zswap_pool *pool; 1018 char name[38]; /* 'zswap' + 32 char (max) num + \0 */ 1019 gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; 1020 int ret; 1021 1022 if (!zswap_has_pool) { 1023 /* if either are unset, pool initialization failed, and we 1024 * need both params to be set correctly before trying to 1025 * create a pool. 1026 */ 1027 if (!strcmp(type, ZSWAP_PARAM_UNSET)) 1028 return NULL; 1029 if (!strcmp(compressor, ZSWAP_PARAM_UNSET)) 1030 return NULL; 1031 } 1032 1033 pool = kzalloc(sizeof(*pool), GFP_KERNEL); 1034 if (!pool) 1035 return NULL; 1036 1037 for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) { 1038 /* unique name for each pool specifically required by zsmalloc */ 1039 snprintf(name, 38, "zswap%x", 1040 atomic_inc_return(&zswap_pools_count)); 1041 1042 pool->zpools[i] = zpool_create_pool(type, name, gfp); 1043 if (!pool->zpools[i]) { 1044 pr_err("%s zpool not available\n", type); 1045 goto error; 1046 } 1047 } 1048 pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0])); 1049 1050 strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); 1051 1052 pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx); 1053 if (!pool->acomp_ctx) { 1054 pr_err("percpu alloc failed\n"); 1055 goto error; 1056 } 1057 1058 ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE, 1059 &pool->node); 1060 if (ret) 1061 goto error; 1062 1063 zswap_alloc_shrinker(pool); 1064 if (!pool->shrinker) 1065 goto error; 1066 1067 pr_debug("using %s compressor\n", pool->tfm_name); 1068 1069 /* being the current pool takes 1 ref; this func expects the 1070 * caller to always add the new pool as the current pool 1071 */ 1072 kref_init(&pool->kref); 1073 INIT_LIST_HEAD(&pool->list); 1074 if (list_lru_init_memcg(&pool->list_lru, pool->shrinker)) 1075 goto lru_fail; 1076 shrinker_register(pool->shrinker); 1077 INIT_WORK(&pool->shrink_work, shrink_worker); 1078 atomic_set(&pool->nr_stored, 0); 1079 1080 zswap_pool_debug("created", pool); 1081 1082 return pool; 1083 1084 lru_fail: 1085 list_lru_destroy(&pool->list_lru); 1086 shrinker_free(pool->shrinker); 1087 error: 1088 if (pool->acomp_ctx) 1089 free_percpu(pool->acomp_ctx); 1090 while (i--) 1091 zpool_destroy_pool(pool->zpools[i]); 1092 kfree(pool); 1093 return NULL; 1094 } 1095 1096 static struct zswap_pool *__zswap_pool_create_fallback(void) 1097 { 1098 bool has_comp, has_zpool; 1099 1100 has_comp = crypto_has_acomp(zswap_compressor, 0, 0); 1101 if (!has_comp && strcmp(zswap_compressor, 1102 CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) { 1103 pr_err("compressor %s not available, using default %s\n", 1104 zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT); 1105 param_free_charp(&zswap_compressor); 1106 zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT; 1107 has_comp = crypto_has_acomp(zswap_compressor, 0, 0); 1108 } 1109 if (!has_comp) { 1110 pr_err("default compressor %s not available\n", 1111 zswap_compressor); 1112 param_free_charp(&zswap_compressor); 1113 zswap_compressor = ZSWAP_PARAM_UNSET; 1114 } 1115 1116 has_zpool = zpool_has_pool(zswap_zpool_type); 1117 if (!has_zpool && strcmp(zswap_zpool_type, 1118 CONFIG_ZSWAP_ZPOOL_DEFAULT)) { 1119 pr_err("zpool %s not available, using default %s\n", 1120 zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT); 1121 param_free_charp(&zswap_zpool_type); 1122 zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT; 1123 has_zpool = zpool_has_pool(zswap_zpool_type); 1124 } 1125 if (!has_zpool) { 1126 pr_err("default zpool %s not available\n", 1127 zswap_zpool_type); 1128 param_free_charp(&zswap_zpool_type); 1129 zswap_zpool_type = ZSWAP_PARAM_UNSET; 1130 } 1131 1132 if (!has_comp || !has_zpool) 1133 return NULL; 1134 1135 return zswap_pool_create(zswap_zpool_type, zswap_compressor); 1136 } 1137 1138 static void zswap_pool_destroy(struct zswap_pool *pool) 1139 { 1140 int i; 1141 1142 zswap_pool_debug("destroying", pool); 1143 1144 shrinker_free(pool->shrinker); 1145 cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); 1146 free_percpu(pool->acomp_ctx); 1147 list_lru_destroy(&pool->list_lru); 1148 1149 spin_lock(&zswap_pools_lock); 1150 mem_cgroup_iter_break(NULL, pool->next_shrink); 1151 pool->next_shrink = NULL; 1152 spin_unlock(&zswap_pools_lock); 1153 1154 for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) 1155 zpool_destroy_pool(pool->zpools[i]); 1156 kfree(pool); 1157 } 1158 1159 static int __must_check zswap_pool_get(struct zswap_pool *pool) 1160 { 1161 if (!pool) 1162 return 0; 1163 1164 return kref_get_unless_zero(&pool->kref); 1165 } 1166 1167 static void __zswap_pool_release(struct work_struct *work) 1168 { 1169 struct zswap_pool *pool = container_of(work, typeof(*pool), 1170 release_work); 1171 1172 synchronize_rcu(); 1173 1174 /* nobody should have been able to get a kref... */ 1175 WARN_ON(kref_get_unless_zero(&pool->kref)); 1176 1177 /* pool is now off zswap_pools list and has no references. */ 1178 zswap_pool_destroy(pool); 1179 } 1180 1181 static void __zswap_pool_empty(struct kref *kref) 1182 { 1183 struct zswap_pool *pool; 1184 1185 pool = container_of(kref, typeof(*pool), kref); 1186 1187 spin_lock(&zswap_pools_lock); 1188 1189 WARN_ON(pool == zswap_pool_current()); 1190 1191 list_del_rcu(&pool->list); 1192 1193 INIT_WORK(&pool->release_work, __zswap_pool_release); 1194 schedule_work(&pool->release_work); 1195 1196 spin_unlock(&zswap_pools_lock); 1197 } 1198 1199 static void zswap_pool_put(struct zswap_pool *pool) 1200 { 1201 kref_put(&pool->kref, __zswap_pool_empty); 1202 } 1203 1204 /********************************* 1205 * param callbacks 1206 **********************************/ 1207 1208 static bool zswap_pool_changed(const char *s, const struct kernel_param *kp) 1209 { 1210 /* no change required */ 1211 if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool) 1212 return false; 1213 return true; 1214 } 1215 1216 /* val must be a null-terminated string */ 1217 static int __zswap_param_set(const char *val, const struct kernel_param *kp, 1218 char *type, char *compressor) 1219 { 1220 struct zswap_pool *pool, *put_pool = NULL; 1221 char *s = strstrip((char *)val); 1222 int ret = 0; 1223 bool new_pool = false; 1224 1225 mutex_lock(&zswap_init_lock); 1226 switch (zswap_init_state) { 1227 case ZSWAP_UNINIT: 1228 /* if this is load-time (pre-init) param setting, 1229 * don't create a pool; that's done during init. 1230 */ 1231 ret = param_set_charp(s, kp); 1232 break; 1233 case ZSWAP_INIT_SUCCEED: 1234 new_pool = zswap_pool_changed(s, kp); 1235 break; 1236 case ZSWAP_INIT_FAILED: 1237 pr_err("can't set param, initialization failed\n"); 1238 ret = -ENODEV; 1239 } 1240 mutex_unlock(&zswap_init_lock); 1241 1242 /* no need to create a new pool, return directly */ 1243 if (!new_pool) 1244 return ret; 1245 1246 if (!type) { 1247 if (!zpool_has_pool(s)) { 1248 pr_err("zpool %s not available\n", s); 1249 return -ENOENT; 1250 } 1251 type = s; 1252 } else if (!compressor) { 1253 if (!crypto_has_acomp(s, 0, 0)) { 1254 pr_err("compressor %s not available\n", s); 1255 return -ENOENT; 1256 } 1257 compressor = s; 1258 } else { 1259 WARN_ON(1); 1260 return -EINVAL; 1261 } 1262 1263 spin_lock(&zswap_pools_lock); 1264 1265 pool = zswap_pool_find_get(type, compressor); 1266 if (pool) { 1267 zswap_pool_debug("using existing", pool); 1268 WARN_ON(pool == zswap_pool_current()); 1269 list_del_rcu(&pool->list); 1270 } 1271 1272 spin_unlock(&zswap_pools_lock); 1273 1274 if (!pool) 1275 pool = zswap_pool_create(type, compressor); 1276 1277 if (pool) 1278 ret = param_set_charp(s, kp); 1279 else 1280 ret = -EINVAL; 1281 1282 spin_lock(&zswap_pools_lock); 1283 1284 if (!ret) { 1285 put_pool = zswap_pool_current(); 1286 list_add_rcu(&pool->list, &zswap_pools); 1287 zswap_has_pool = true; 1288 } else if (pool) { 1289 /* add the possibly pre-existing pool to the end of the pools 1290 * list; if it's new (and empty) then it'll be removed and 1291 * destroyed by the put after we drop the lock 1292 */ 1293 list_add_tail_rcu(&pool->list, &zswap_pools); 1294 put_pool = pool; 1295 } 1296 1297 spin_unlock(&zswap_pools_lock); 1298 1299 if (!zswap_has_pool && !pool) { 1300 /* if initial pool creation failed, and this pool creation also 1301 * failed, maybe both compressor and zpool params were bad. 1302 * Allow changing this param, so pool creation will succeed 1303 * when the other param is changed. We already verified this 1304 * param is ok in the zpool_has_pool() or crypto_has_acomp() 1305 * checks above. 1306 */ 1307 ret = param_set_charp(s, kp); 1308 } 1309 1310 /* drop the ref from either the old current pool, 1311 * or the new pool we failed to add 1312 */ 1313 if (put_pool) 1314 zswap_pool_put(put_pool); 1315 1316 return ret; 1317 } 1318 1319 static int zswap_compressor_param_set(const char *val, 1320 const struct kernel_param *kp) 1321 { 1322 return __zswap_param_set(val, kp, zswap_zpool_type, NULL); 1323 } 1324 1325 static int zswap_zpool_param_set(const char *val, 1326 const struct kernel_param *kp) 1327 { 1328 return __zswap_param_set(val, kp, NULL, zswap_compressor); 1329 } 1330 1331 static int zswap_enabled_param_set(const char *val, 1332 const struct kernel_param *kp) 1333 { 1334 int ret = -ENODEV; 1335 1336 /* if this is load-time (pre-init) param setting, only set param. */ 1337 if (system_state != SYSTEM_RUNNING) 1338 return param_set_bool(val, kp); 1339 1340 mutex_lock(&zswap_init_lock); 1341 switch (zswap_init_state) { 1342 case ZSWAP_UNINIT: 1343 if (zswap_setup()) 1344 break; 1345 fallthrough; 1346 case ZSWAP_INIT_SUCCEED: 1347 if (!zswap_has_pool) 1348 pr_err("can't enable, no pool configured\n"); 1349 else 1350 ret = param_set_bool(val, kp); 1351 break; 1352 case ZSWAP_INIT_FAILED: 1353 pr_err("can't enable, initialization failed\n"); 1354 } 1355 mutex_unlock(&zswap_init_lock); 1356 1357 return ret; 1358 } 1359 1360 static void __zswap_load(struct zswap_entry *entry, struct page *page) 1361 { 1362 struct zpool *zpool = zswap_find_zpool(entry); 1363 struct scatterlist input, output; 1364 struct crypto_acomp_ctx *acomp_ctx; 1365 u8 *src; 1366 1367 acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); 1368 mutex_lock(&acomp_ctx->mutex); 1369 1370 src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); 1371 if (!zpool_can_sleep_mapped(zpool)) { 1372 memcpy(acomp_ctx->buffer, src, entry->length); 1373 src = acomp_ctx->buffer; 1374 zpool_unmap_handle(zpool, entry->handle); 1375 } 1376 1377 sg_init_one(&input, src, entry->length); 1378 sg_init_table(&output, 1); 1379 sg_set_page(&output, page, PAGE_SIZE, 0); 1380 acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE); 1381 BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait)); 1382 BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE); 1383 mutex_unlock(&acomp_ctx->mutex); 1384 1385 if (zpool_can_sleep_mapped(zpool)) 1386 zpool_unmap_handle(zpool, entry->handle); 1387 } 1388 1389 /********************************* 1390 * writeback code 1391 **********************************/ 1392 /* 1393 * Attempts to free an entry by adding a folio to the swap cache, 1394 * decompressing the entry data into the folio, and issuing a 1395 * bio write to write the folio back to the swap device. 1396 * 1397 * This can be thought of as a "resumed writeback" of the folio 1398 * to the swap device. We are basically resuming the same swap 1399 * writeback path that was intercepted with the zswap_store() 1400 * in the first place. After the folio has been decompressed into 1401 * the swap cache, the compressed version stored by zswap can be 1402 * freed. 1403 */ 1404 static int zswap_writeback_entry(struct zswap_entry *entry, 1405 struct zswap_tree *tree) 1406 { 1407 swp_entry_t swpentry = entry->swpentry; 1408 struct folio *folio; 1409 struct mempolicy *mpol; 1410 bool folio_was_allocated; 1411 struct writeback_control wbc = { 1412 .sync_mode = WB_SYNC_NONE, 1413 }; 1414 1415 /* try to allocate swap cache folio */ 1416 mpol = get_task_policy(current); 1417 folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, 1418 NO_INTERLEAVE_INDEX, &folio_was_allocated, true); 1419 if (!folio) 1420 return -ENOMEM; 1421 1422 /* 1423 * Found an existing folio, we raced with load/swapin. We generally 1424 * writeback cold folios from zswap, and swapin means the folio just 1425 * became hot. Skip this folio and let the caller find another one. 1426 */ 1427 if (!folio_was_allocated) { 1428 folio_put(folio); 1429 return -EEXIST; 1430 } 1431 1432 /* 1433 * folio is locked, and the swapcache is now secured against 1434 * concurrent swapping to and from the slot. Verify that the 1435 * swap entry hasn't been invalidated and recycled behind our 1436 * backs (our zswap_entry reference doesn't prevent that), to 1437 * avoid overwriting a new swap folio with old compressed data. 1438 */ 1439 spin_lock(&tree->lock); 1440 if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) { 1441 spin_unlock(&tree->lock); 1442 delete_from_swap_cache(folio); 1443 return -ENOMEM; 1444 } 1445 spin_unlock(&tree->lock); 1446 1447 __zswap_load(entry, &folio->page); 1448 1449 /* folio is up to date */ 1450 folio_mark_uptodate(folio); 1451 1452 /* move it to the tail of the inactive list after end_writeback */ 1453 folio_set_reclaim(folio); 1454 1455 /* start writeback */ 1456 __swap_writepage(folio, &wbc); 1457 folio_put(folio); 1458 1459 return 0; 1460 } 1461 1462 static int zswap_is_page_same_filled(void *ptr, unsigned long *value) 1463 { 1464 unsigned long *page; 1465 unsigned long val; 1466 unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1; 1467 1468 page = (unsigned long *)ptr; 1469 val = page[0]; 1470 1471 if (val != page[last_pos]) 1472 return 0; 1473 1474 for (pos = 1; pos < last_pos; pos++) { 1475 if (val != page[pos]) 1476 return 0; 1477 } 1478 1479 *value = val; 1480 1481 return 1; 1482 } 1483 1484 static void zswap_fill_page(void *ptr, unsigned long value) 1485 { 1486 unsigned long *page; 1487 1488 page = (unsigned long *)ptr; 1489 memset_l(page, value, PAGE_SIZE / sizeof(unsigned long)); 1490 } 1491 1492 bool zswap_store(struct folio *folio) 1493 { 1494 swp_entry_t swp = folio->swap; 1495 int type = swp_type(swp); 1496 pgoff_t offset = swp_offset(swp); 1497 struct page *page = &folio->page; 1498 struct zswap_tree *tree = zswap_trees[type]; 1499 struct zswap_entry *entry, *dupentry; 1500 struct scatterlist input, output; 1501 struct crypto_acomp_ctx *acomp_ctx; 1502 struct obj_cgroup *objcg = NULL; 1503 struct mem_cgroup *memcg = NULL; 1504 struct zswap_pool *pool; 1505 struct zpool *zpool; 1506 unsigned int dlen = PAGE_SIZE; 1507 unsigned long handle, value; 1508 char *buf; 1509 u8 *src, *dst; 1510 gfp_t gfp; 1511 int ret; 1512 1513 VM_WARN_ON_ONCE(!folio_test_locked(folio)); 1514 VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); 1515 1516 /* Large folios aren't supported */ 1517 if (folio_test_large(folio)) 1518 return false; 1519 1520 if (!zswap_enabled || !tree) 1521 return false; 1522 1523 /* 1524 * If this is a duplicate, it must be removed before attempting to store 1525 * it, otherwise, if the store fails the old page won't be removed from 1526 * the tree, and it might be written back overriding the new data. 1527 */ 1528 spin_lock(&tree->lock); 1529 dupentry = zswap_rb_search(&tree->rbroot, offset); 1530 if (dupentry) { 1531 zswap_duplicate_entry++; 1532 zswap_invalidate_entry(tree, dupentry); 1533 } 1534 spin_unlock(&tree->lock); 1535 objcg = get_obj_cgroup_from_folio(folio); 1536 if (objcg && !obj_cgroup_may_zswap(objcg)) { 1537 memcg = get_mem_cgroup_from_objcg(objcg); 1538 if (shrink_memcg(memcg)) { 1539 mem_cgroup_put(memcg); 1540 goto reject; 1541 } 1542 mem_cgroup_put(memcg); 1543 } 1544 1545 /* reclaim space if needed */ 1546 if (zswap_is_full()) { 1547 zswap_pool_limit_hit++; 1548 zswap_pool_reached_full = true; 1549 goto shrink; 1550 } 1551 1552 if (zswap_pool_reached_full) { 1553 if (!zswap_can_accept()) 1554 goto shrink; 1555 else 1556 zswap_pool_reached_full = false; 1557 } 1558 1559 /* allocate entry */ 1560 entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page)); 1561 if (!entry) { 1562 zswap_reject_kmemcache_fail++; 1563 goto reject; 1564 } 1565 1566 if (zswap_same_filled_pages_enabled) { 1567 src = kmap_local_page(page); 1568 if (zswap_is_page_same_filled(src, &value)) { 1569 kunmap_local(src); 1570 entry->swpentry = swp_entry(type, offset); 1571 entry->length = 0; 1572 entry->value = value; 1573 atomic_inc(&zswap_same_filled_pages); 1574 goto insert_entry; 1575 } 1576 kunmap_local(src); 1577 } 1578 1579 if (!zswap_non_same_filled_pages_enabled) 1580 goto freepage; 1581 1582 /* if entry is successfully added, it keeps the reference */ 1583 entry->pool = zswap_pool_current_get(); 1584 if (!entry->pool) 1585 goto freepage; 1586 1587 if (objcg) { 1588 memcg = get_mem_cgroup_from_objcg(objcg); 1589 if (memcg_list_lru_alloc(memcg, &entry->pool->list_lru, GFP_KERNEL)) { 1590 mem_cgroup_put(memcg); 1591 goto put_pool; 1592 } 1593 mem_cgroup_put(memcg); 1594 } 1595 1596 /* compress */ 1597 acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); 1598 1599 mutex_lock(&acomp_ctx->mutex); 1600 1601 dst = acomp_ctx->buffer; 1602 sg_init_table(&input, 1); 1603 sg_set_page(&input, &folio->page, PAGE_SIZE, 0); 1604 1605 /* 1606 * We need PAGE_SIZE * 2 here since there maybe over-compression case, 1607 * and hardware-accelerators may won't check the dst buffer size, so 1608 * giving the dst buffer with enough length to avoid buffer overflow. 1609 */ 1610 sg_init_one(&output, dst, PAGE_SIZE * 2); 1611 acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); 1612 /* 1613 * it maybe looks a little bit silly that we send an asynchronous request, 1614 * then wait for its completion synchronously. This makes the process look 1615 * synchronous in fact. 1616 * Theoretically, acomp supports users send multiple acomp requests in one 1617 * acomp instance, then get those requests done simultaneously. but in this 1618 * case, zswap actually does store and load page by page, there is no 1619 * existing method to send the second page before the first page is done 1620 * in one thread doing zwap. 1621 * but in different threads running on different cpu, we have different 1622 * acomp instance, so multiple threads can do (de)compression in parallel. 1623 */ 1624 ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); 1625 dlen = acomp_ctx->req->dlen; 1626 1627 if (ret) { 1628 zswap_reject_compress_fail++; 1629 goto put_dstmem; 1630 } 1631 1632 /* store */ 1633 zpool = zswap_find_zpool(entry); 1634 gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; 1635 if (zpool_malloc_support_movable(zpool)) 1636 gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; 1637 ret = zpool_malloc(zpool, dlen, gfp, &handle); 1638 if (ret == -ENOSPC) { 1639 zswap_reject_compress_poor++; 1640 goto put_dstmem; 1641 } 1642 if (ret) { 1643 zswap_reject_alloc_fail++; 1644 goto put_dstmem; 1645 } 1646 buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); 1647 memcpy(buf, dst, dlen); 1648 zpool_unmap_handle(zpool, handle); 1649 mutex_unlock(&acomp_ctx->mutex); 1650 1651 /* populate entry */ 1652 entry->swpentry = swp_entry(type, offset); 1653 entry->handle = handle; 1654 entry->length = dlen; 1655 1656 insert_entry: 1657 entry->objcg = objcg; 1658 if (objcg) { 1659 obj_cgroup_charge_zswap(objcg, entry->length); 1660 /* Account before objcg ref is moved to tree */ 1661 count_objcg_event(objcg, ZSWPOUT); 1662 } 1663 1664 /* map */ 1665 spin_lock(&tree->lock); 1666 /* 1667 * A duplicate entry should have been removed at the beginning of this 1668 * function. Since the swap entry should be pinned, if a duplicate is 1669 * found again here it means that something went wrong in the swap 1670 * cache. 1671 */ 1672 while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) { 1673 WARN_ON(1); 1674 zswap_duplicate_entry++; 1675 zswap_invalidate_entry(tree, dupentry); 1676 } 1677 if (entry->length) { 1678 INIT_LIST_HEAD(&entry->lru); 1679 zswap_lru_add(&entry->pool->list_lru, entry); 1680 atomic_inc(&entry->pool->nr_stored); 1681 } 1682 spin_unlock(&tree->lock); 1683 1684 /* update stats */ 1685 atomic_inc(&zswap_stored_pages); 1686 zswap_update_total_size(); 1687 count_vm_event(ZSWPOUT); 1688 1689 return true; 1690 1691 put_dstmem: 1692 mutex_unlock(&acomp_ctx->mutex); 1693 put_pool: 1694 zswap_pool_put(entry->pool); 1695 freepage: 1696 zswap_entry_cache_free(entry); 1697 reject: 1698 if (objcg) 1699 obj_cgroup_put(objcg); 1700 return false; 1701 1702 shrink: 1703 pool = zswap_pool_last_get(); 1704 if (pool && !queue_work(shrink_wq, &pool->shrink_work)) 1705 zswap_pool_put(pool); 1706 goto reject; 1707 } 1708 1709 bool zswap_load(struct folio *folio) 1710 { 1711 swp_entry_t swp = folio->swap; 1712 int type = swp_type(swp); 1713 pgoff_t offset = swp_offset(swp); 1714 struct page *page = &folio->page; 1715 struct zswap_tree *tree = zswap_trees[type]; 1716 struct zswap_entry *entry; 1717 u8 *dst; 1718 1719 VM_WARN_ON_ONCE(!folio_test_locked(folio)); 1720 1721 /* find */ 1722 spin_lock(&tree->lock); 1723 entry = zswap_entry_find_get(&tree->rbroot, offset); 1724 if (!entry) { 1725 spin_unlock(&tree->lock); 1726 return false; 1727 } 1728 spin_unlock(&tree->lock); 1729 1730 if (entry->length) 1731 __zswap_load(entry, page); 1732 else { 1733 dst = kmap_local_page(page); 1734 zswap_fill_page(dst, entry->value); 1735 kunmap_local(dst); 1736 } 1737 1738 count_vm_event(ZSWPIN); 1739 if (entry->objcg) 1740 count_objcg_event(entry->objcg, ZSWPIN); 1741 1742 spin_lock(&tree->lock); 1743 if (zswap_exclusive_loads_enabled) { 1744 zswap_invalidate_entry(tree, entry); 1745 folio_mark_dirty(folio); 1746 } else if (entry->length) { 1747 zswap_lru_del(&entry->pool->list_lru, entry); 1748 zswap_lru_add(&entry->pool->list_lru, entry); 1749 } 1750 zswap_entry_put(tree, entry); 1751 spin_unlock(&tree->lock); 1752 1753 return true; 1754 } 1755 1756 void zswap_invalidate(int type, pgoff_t offset) 1757 { 1758 struct zswap_tree *tree = zswap_trees[type]; 1759 struct zswap_entry *entry; 1760 1761 /* find */ 1762 spin_lock(&tree->lock); 1763 entry = zswap_rb_search(&tree->rbroot, offset); 1764 if (!entry) { 1765 /* entry was written back */ 1766 spin_unlock(&tree->lock); 1767 return; 1768 } 1769 zswap_invalidate_entry(tree, entry); 1770 spin_unlock(&tree->lock); 1771 } 1772 1773 void zswap_swapon(int type) 1774 { 1775 struct zswap_tree *tree; 1776 1777 tree = kzalloc(sizeof(*tree), GFP_KERNEL); 1778 if (!tree) { 1779 pr_err("alloc failed, zswap disabled for swap type %d\n", type); 1780 return; 1781 } 1782 1783 tree->rbroot = RB_ROOT; 1784 spin_lock_init(&tree->lock); 1785 zswap_trees[type] = tree; 1786 } 1787 1788 void zswap_swapoff(int type) 1789 { 1790 struct zswap_tree *tree = zswap_trees[type]; 1791 struct zswap_entry *entry, *n; 1792 1793 if (!tree) 1794 return; 1795 1796 /* walk the tree and free everything */ 1797 spin_lock(&tree->lock); 1798 rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) 1799 zswap_free_entry(entry); 1800 tree->rbroot = RB_ROOT; 1801 spin_unlock(&tree->lock); 1802 kfree(tree); 1803 zswap_trees[type] = NULL; 1804 } 1805 1806 /********************************* 1807 * debugfs functions 1808 **********************************/ 1809 #ifdef CONFIG_DEBUG_FS 1810 #include <linux/debugfs.h> 1811 1812 static struct dentry *zswap_debugfs_root; 1813 1814 static int zswap_debugfs_init(void) 1815 { 1816 if (!debugfs_initialized()) 1817 return -ENODEV; 1818 1819 zswap_debugfs_root = debugfs_create_dir("zswap", NULL); 1820 1821 debugfs_create_u64("pool_limit_hit", 0444, 1822 zswap_debugfs_root, &zswap_pool_limit_hit); 1823 debugfs_create_u64("reject_reclaim_fail", 0444, 1824 zswap_debugfs_root, &zswap_reject_reclaim_fail); 1825 debugfs_create_u64("reject_alloc_fail", 0444, 1826 zswap_debugfs_root, &zswap_reject_alloc_fail); 1827 debugfs_create_u64("reject_kmemcache_fail", 0444, 1828 zswap_debugfs_root, &zswap_reject_kmemcache_fail); 1829 debugfs_create_u64("reject_compress_fail", 0444, 1830 zswap_debugfs_root, &zswap_reject_compress_fail); 1831 debugfs_create_u64("reject_compress_poor", 0444, 1832 zswap_debugfs_root, &zswap_reject_compress_poor); 1833 debugfs_create_u64("written_back_pages", 0444, 1834 zswap_debugfs_root, &zswap_written_back_pages); 1835 debugfs_create_u64("duplicate_entry", 0444, 1836 zswap_debugfs_root, &zswap_duplicate_entry); 1837 debugfs_create_u64("pool_total_size", 0444, 1838 zswap_debugfs_root, &zswap_pool_total_size); 1839 debugfs_create_atomic_t("stored_pages", 0444, 1840 zswap_debugfs_root, &zswap_stored_pages); 1841 debugfs_create_atomic_t("same_filled_pages", 0444, 1842 zswap_debugfs_root, &zswap_same_filled_pages); 1843 1844 return 0; 1845 } 1846 #else 1847 static int zswap_debugfs_init(void) 1848 { 1849 return 0; 1850 } 1851 #endif 1852 1853 /********************************* 1854 * module init and exit 1855 **********************************/ 1856 static int zswap_setup(void) 1857 { 1858 struct zswap_pool *pool; 1859 int ret; 1860 1861 zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); 1862 if (!zswap_entry_cache) { 1863 pr_err("entry cache creation failed\n"); 1864 goto cache_fail; 1865 } 1866 1867 ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE, 1868 "mm/zswap_pool:prepare", 1869 zswap_cpu_comp_prepare, 1870 zswap_cpu_comp_dead); 1871 if (ret) 1872 goto hp_fail; 1873 1874 pool = __zswap_pool_create_fallback(); 1875 if (pool) { 1876 pr_info("loaded using pool %s/%s\n", pool->tfm_name, 1877 zpool_get_type(pool->zpools[0])); 1878 list_add(&pool->list, &zswap_pools); 1879 zswap_has_pool = true; 1880 } else { 1881 pr_err("pool creation failed\n"); 1882 zswap_enabled = false; 1883 } 1884 1885 shrink_wq = create_workqueue("zswap-shrink"); 1886 if (!shrink_wq) 1887 goto fallback_fail; 1888 1889 if (zswap_debugfs_init()) 1890 pr_warn("debugfs initialization failed\n"); 1891 zswap_init_state = ZSWAP_INIT_SUCCEED; 1892 return 0; 1893 1894 fallback_fail: 1895 if (pool) 1896 zswap_pool_destroy(pool); 1897 hp_fail: 1898 kmem_cache_destroy(zswap_entry_cache); 1899 cache_fail: 1900 /* if built-in, we aren't unloaded on failure; don't allow use */ 1901 zswap_init_state = ZSWAP_INIT_FAILED; 1902 zswap_enabled = false; 1903 return -ENOMEM; 1904 } 1905 1906 static int __init zswap_init(void) 1907 { 1908 if (!zswap_enabled) 1909 return 0; 1910 return zswap_setup(); 1911 } 1912 /* must be late so crypto has time to come up */ 1913 late_initcall(zswap_init); 1914 1915 MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>"); 1916 MODULE_DESCRIPTION("Compressed cache for swap pages"); 1917