// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * zswap.c - zswap driver file
 *
 * zswap is a cache that takes pages that are in the process
 * of being swapped out and attempts to compress and store them in a
 * RAM-based memory pool.  This can result in a significant I/O reduction on
 * the swap device and, in the case where decompressing from RAM is faster
 * than reading from the swap device, can also improve workload performance.
 *
 * Copyright (C) 2012  Seth Jennings <sjenning@linux.vnet.ibm.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/rbtree.h>
#include <linux/swap.h>
#include <linux/crypto.h>
#include <linux/scatterlist.h>
#include <linux/mempolicy.h>
#include <linux/mempool.h>
#include <linux/zpool.h>
#include <crypto/acompress.h>
#include <linux/zswap.h>
#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/swapops.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/workqueue.h>
#include <linux/list_lru.h>

#include "swap.h"
#include "internal.h"

/*********************************
* statistics
**********************************/
/* Total bytes used by the compressed storage */
u64 zswap_pool_total_size;
/* The number of compressed pages currently stored in zswap */
atomic_t zswap_stored_pages = ATOMIC_INIT(0);
/* The number of same-value filled pages currently stored in zswap */
static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0);

/*
 * The statistics below are not protected from concurrent access for
 * performance reasons so they may not be 100% accurate.  However,
 * they do provide useful information on roughly how many times a
 * certain event is occurring.
 */

/* Pool limit was hit (see zswap_max_pool_percent) */
static u64 zswap_pool_limit_hit;
/* Pages written back when pool limit was reached */
static u64 zswap_written_back_pages;
/* Store failed due to a reclaim failure after pool limit was reached */
static u64 zswap_reject_reclaim_fail;
/* Store failed due to compression algorithm failure */
static u64 zswap_reject_compress_fail;
/* Compressed page was too big for the allocator to (optimally) store */
static u64 zswap_reject_compress_poor;
/* Store failed because underlying allocator could not get memory */
static u64 zswap_reject_alloc_fail;
/* Store failed because the entry metadata could not be allocated (rare) */
static u64 zswap_reject_kmemcache_fail;
/* Duplicate store was encountered (rare) */
static u64 zswap_duplicate_entry;

/* Shrinker work queue */
static struct workqueue_struct *shrink_wq;
/* Pool limit was hit, we need to calm down */
static bool zswap_pool_reached_full;

/*********************************
* tunables
**********************************/

#define ZSWAP_PARAM_UNSET ""

static int zswap_setup(void);

/* Enable/disable zswap */
static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON);
static int zswap_enabled_param_set(const char *,
				   const struct kernel_param *);
static const struct kernel_param_ops zswap_enabled_param_ops = {
	.set = zswap_enabled_param_set,
	.get = param_get_bool,
};
module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644);

/* Crypto compressor to use */
static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
static int zswap_compressor_param_set(const char *,
				      const struct kernel_param *);
static const struct kernel_param_ops zswap_compressor_param_ops = {
	.set = zswap_compressor_param_set,
	.get = param_get_charp,
	.free = param_free_charp,
};
module_param_cb(compressor, &zswap_compressor_param_ops,
		&zswap_compressor, 0644);

/* Compressed storage zpool to use */
static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
static int zswap_zpool_param_set(const char *, const struct kernel_param *);
static const struct kernel_param_ops zswap_zpool_param_ops = {
	.set = zswap_zpool_param_set,
	.get = param_get_charp,
	.free = param_free_charp,
};
module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);

/* The maximum percentage of memory that the compressed pool can occupy */
static unsigned int zswap_max_pool_percent = 20;
module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);

/* The threshold for accepting new pages after the max_pool_percent was hit */
static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */
module_param_named(accept_threshold_percent, zswap_accept_thr_percent,
		   uint, 0644);

/*
 * Enable/disable handling same-value filled pages (enabled by default).
 * If disabled, every page is considered non-same-value filled.
 */
static bool zswap_same_filled_pages_enabled = true;
module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
		   bool, 0644);

/* Enable/disable handling non-same-value filled pages (enabled by default) */
static bool zswap_non_same_filled_pages_enabled = true;
module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled,
		   bool, 0644);

static bool zswap_exclusive_loads_enabled = IS_ENABLED(
		CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON);
module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644);

/* Number of zpools in zswap_pool (empirically determined for scalability) */
#define ZSWAP_NR_ZPOOLS 32

/* Enable/disable memory pressure-based shrinker. */
static bool zswap_shrinker_enabled = IS_ENABLED(
		CONFIG_ZSWAP_SHRINKER_DEFAULT_ON);
module_param_named(shrinker_enabled, zswap_shrinker_enabled, bool, 0644);

bool is_zswap_enabled(void)
{
	return zswap_enabled;
}

/*********************************
* data structures
**********************************/

struct crypto_acomp_ctx {
	struct crypto_acomp *acomp;
	struct acomp_req *req;
	struct crypto_wait wait;
	u8 *buffer;
	struct mutex mutex;
};

/*
 * The lock ordering is zswap_tree.lock -> zswap_pool.lru_lock.
 * The only case where lru_lock is not acquired while holding tree.lock is
 * when a zswap_entry is taken off the lru for writeback, in that case it
 * needs to be verified that it's still valid in the tree.
 */
struct zswap_pool {
	struct zpool *zpools[ZSWAP_NR_ZPOOLS];
	struct crypto_acomp_ctx __percpu *acomp_ctx;
	struct kref kref;
	struct list_head list;
	struct work_struct release_work;
	struct work_struct shrink_work;
	struct hlist_node node;
	char tfm_name[CRYPTO_MAX_ALG_NAME];
	struct list_lru list_lru;
	struct mem_cgroup *next_shrink;
	struct shrinker *shrinker;
	atomic_t nr_stored;
};

/*
 * struct zswap_entry
 *
 * This structure contains the metadata for tracking a single compressed
 * page within zswap.
 *
 * rbnode - links the entry into red-black tree for the appropriate swap type
 * swpentry - associated swap entry, the offset indexes into the red-black tree
 * refcount - the number of outstanding references to the entry. This is needed
 *            to protect against premature freeing of the entry by concurrent
 *            calls to load, invalidate, and writeback.  The lock
 *            for the zswap_tree structure that contains the entry must
 *            be held while changing the refcount.  Since the lock must
 *            be held, there is no reason to also make refcount atomic.
 * length - the length in bytes of the compressed page data.  Needed during
 *          decompression. For a same-value filled page, length is 0, and both
 *          pool and lru are invalid and must be ignored.
 * pool - the zswap_pool the entry's data is in
 * handle - zpool allocation handle that stores the compressed page data
 * value - value of the same-value filled pages which have same content
 * objcg - the obj_cgroup that the compressed memory is charged to
 * lru - handle to the pool's lru used to evict pages.
 */
struct zswap_entry {
	struct rb_node rbnode;
	swp_entry_t swpentry;
	int refcount;
	unsigned int length;
	struct zswap_pool *pool;
	union {
		unsigned long handle;
		unsigned long value;
	};
	struct obj_cgroup *objcg;
	struct list_head lru;
};

/*
 * The tree lock in the zswap_tree struct protects a few things:
 * - the rbtree
 * - the refcount field of each entry in the tree
 */
struct zswap_tree {
	struct rb_root rbroot;
	spinlock_t lock;
};

static struct zswap_tree *zswap_trees[MAX_SWAPFILES];

/* RCU-protected iteration */
static LIST_HEAD(zswap_pools);
/* protects zswap_pools list modification */
static DEFINE_SPINLOCK(zswap_pools_lock);
/* pool counter to provide unique names to zpool */
static atomic_t zswap_pools_count = ATOMIC_INIT(0);

enum zswap_init_type {
	ZSWAP_UNINIT,
	ZSWAP_INIT_SUCCEED,
	ZSWAP_INIT_FAILED
};

static enum zswap_init_type zswap_init_state;

/* used to ensure the integrity of initialization */
static DEFINE_MUTEX(zswap_init_lock);

/* init completed, but couldn't create the initial pool */
static bool zswap_has_pool;

/*********************************
* helpers and fwd declarations
**********************************/

#define zswap_pool_debug(msg, p)				\
	pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,		\
		 zpool_get_type((p)->zpools[0]))

static int zswap_writeback_entry(struct zswap_entry *entry,
				 struct zswap_tree *tree);
static int zswap_pool_get(struct zswap_pool *pool);
static void zswap_pool_put(struct zswap_pool *pool);

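/*
 * Pool sizing helpers.  zswap_is_full() reports whether the compressed pool
 * has grown past zswap_max_pool_percent of total RAM; zswap_can_accept()
 * reports whether it has shrunk back below zswap_accept_thr_percent of that
 * limit, so stores may resume after the limit was hit.
 *
 * Illustrative numbers only: with 8 GiB of RAM, max_pool_percent = 20 and
 * accept_threshold_percent = 90, the pool is considered full at roughly
 * 1.6 GiB and new stores are accepted again once it drops below roughly
 * 1.44 GiB.
 */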
static bool zswap_is_full(void)
{
	return totalram_pages() * zswap_max_pool_percent / 100 <
			DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
}

static bool zswap_can_accept(void)
{
	return totalram_pages() * zswap_accept_thr_percent / 100 *
				zswap_max_pool_percent / 100 >
			DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
}

static u64 get_zswap_pool_size(struct zswap_pool *pool)
{
	u64 pool_size = 0;
	int i;

	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
		pool_size += zpool_get_total_size(pool->zpools[i]);

	return pool_size;
}

static void zswap_update_total_size(void)
{
	struct zswap_pool *pool;
	u64 total = 0;

	rcu_read_lock();

	list_for_each_entry_rcu(pool, &zswap_pools, list)
		total += get_zswap_pool_size(pool);

	rcu_read_unlock();

	zswap_pool_total_size = total;
}

/* should be called under RCU */
#ifdef CONFIG_MEMCG
static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry)
{
	return entry->objcg ? obj_cgroup_memcg(entry->objcg) : NULL;
}
#else
static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry)
{
	return NULL;
}
#endif

static inline int entry_to_nid(struct zswap_entry *entry)
{
	return page_to_nid(virt_to_page(entry));
}

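/*
 * Called when a memcg goes offline.  If the memcg is the saved round-robin
 * cursor of any pool's shrink worker (pool->next_shrink), advance the cursor
 * so the worker does not keep pointing at, and thus pinning, the dead memcg.
 */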
void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg)
{
	struct zswap_pool *pool;

	/* lock out zswap pools list modification */
	spin_lock(&zswap_pools_lock);
	list_for_each_entry(pool, &zswap_pools, list) {
		if (pool->next_shrink == memcg)
			pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
	}
	spin_unlock(&zswap_pools_lock);
}

/*********************************
* zswap entry functions
**********************************/
static struct kmem_cache *zswap_entry_cache;

static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid)
{
	struct zswap_entry *entry;
	entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid);
	if (!entry)
		return NULL;
	entry->refcount = 1;
	RB_CLEAR_NODE(&entry->rbnode);
	return entry;
}

static void zswap_entry_cache_free(struct zswap_entry *entry)
{
	kmem_cache_free(zswap_entry_cache, entry);
}

/*********************************
* zswap lruvec functions
**********************************/
void zswap_lruvec_state_init(struct lruvec *lruvec)
{
	atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0);
}

void zswap_folio_swapin(struct folio *folio)
{
	struct lruvec *lruvec;

	if (folio) {
		lruvec = folio_lruvec(folio);
		atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected);
	}
}

/*********************************
* lru functions
**********************************/
static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
{
	atomic_long_t *nr_zswap_protected;
	unsigned long lru_size, old, new;
	int nid = entry_to_nid(entry);
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	/*
	 * Note that it is safe to use rcu_read_lock() here, even in the face of
	 * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection
	 * used in list_lru lookup, only two scenarios are possible:
	 *
	 * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The
	 *    new entry will be reparented to memcg's parent's list_lru.
	 * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The
	 *    new entry will be added directly to memcg's parent's list_lru.
	 *
	 * Similar reasoning holds for list_lru_del() and list_lru_putback().
	 */
	rcu_read_lock();
	memcg = mem_cgroup_from_entry(entry);
	/* will always succeed */
	list_lru_add(list_lru, &entry->lru, nid, memcg);

	/* Update the protection area */
	lru_size = list_lru_count_one(list_lru, nid, memcg);
	lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
	nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected;
	old = atomic_long_inc_return(nr_zswap_protected);
	/*
	 * Decay to avoid overflow and adapt to changing workloads.
	 * This is based on LRU reclaim cost decaying heuristics.
	 */
	do {
		new = old > lru_size / 4 ? old / 2 : old;
	} while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new));
	rcu_read_unlock();
}

static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry)
{
	int nid = entry_to_nid(entry);
	struct mem_cgroup *memcg;

	rcu_read_lock();
	memcg = mem_cgroup_from_entry(entry);
	/* will always succeed */
	list_lru_del(list_lru, &entry->lru, nid, memcg);
	rcu_read_unlock();
}

static void zswap_lru_putback(struct list_lru *list_lru,
		struct zswap_entry *entry)
{
	int nid = entry_to_nid(entry);
	spinlock_t *lock = &list_lru->node[nid].lock;
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	rcu_read_lock();
	memcg = mem_cgroup_from_entry(entry);
	spin_lock(lock);
	/* we cannot use list_lru_add here, because it increments the node's lru count */
	list_lru_putback(list_lru, &entry->lru, nid, memcg);
	spin_unlock(lock);

	lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(entry_to_nid(entry)));
	/* increment the protection area to account for the LRU rotation. */
	atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected);
	rcu_read_unlock();
}

/*********************************
* rbtree functions
**********************************/
static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
{
	struct rb_node *node = root->rb_node;
	struct zswap_entry *entry;
	pgoff_t entry_offset;

	while (node) {
		entry = rb_entry(node, struct zswap_entry, rbnode);
		entry_offset = swp_offset(entry->swpentry);
		if (entry_offset > offset)
			node = node->rb_left;
		else if (entry_offset < offset)
			node = node->rb_right;
		else
			return entry;
	}
	return NULL;
}

/*
 * In the case that an entry with the same offset is found, a pointer to
 * the existing entry is stored in dupentry and the function returns -EEXIST
 */
static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
			struct zswap_entry **dupentry)
{
	struct rb_node **link = &root->rb_node, *parent = NULL;
	struct zswap_entry *myentry;
	pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);

	while (*link) {
		parent = *link;
		myentry = rb_entry(parent, struct zswap_entry, rbnode);
		myentry_offset = swp_offset(myentry->swpentry);
		if (myentry_offset > entry_offset)
			link = &(*link)->rb_left;
		else if (myentry_offset < entry_offset)
			link = &(*link)->rb_right;
		else {
			*dupentry = myentry;
			return -EEXIST;
		}
	}
	rb_link_node(&entry->rbnode, parent, link);
	rb_insert_color(&entry->rbnode, root);
	return 0;
}

static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
{
	if (!RB_EMPTY_NODE(&entry->rbnode)) {
		rb_erase(&entry->rbnode, root);
		RB_CLEAR_NODE(&entry->rbnode);
		return true;
	}
	return false;
}

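/*
 * Each pool is backed by ZSWAP_NR_ZPOOLS zpools (a count chosen empirically
 * for scalability).  The entry's address is hashed to pick which of the
 * pool's zpools holds (or will hold) its compressed data, so a given entry
 * always maps to the same zpool.
 */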
static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
{
	int i = 0;

	if (ZSWAP_NR_ZPOOLS > 1)
		i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS));

	return entry->pool->zpools[i];
}

/*
 * Carries out the common pattern of freeing an entry's zpool allocation,
 * freeing the entry itself, and decrementing the number of stored pages.
 */
static void zswap_free_entry(struct zswap_entry *entry)
{
	if (entry->objcg) {
		obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
		obj_cgroup_put(entry->objcg);
	}
	if (!entry->length)
		atomic_dec(&zswap_same_filled_pages);
	else {
		zswap_lru_del(&entry->pool->list_lru, entry);
		zpool_free(zswap_find_zpool(entry), entry->handle);
		atomic_dec(&entry->pool->nr_stored);
		zswap_pool_put(entry->pool);
	}
	zswap_entry_cache_free(entry);
	atomic_dec(&zswap_stored_pages);
	zswap_update_total_size();
}

/* caller must hold the tree lock */
static void zswap_entry_get(struct zswap_entry *entry)
{
	entry->refcount++;
}

/* caller must hold the tree lock
 * remove from the tree and free it, if no one references the entry
 */
static void zswap_entry_put(struct zswap_tree *tree,
			struct zswap_entry *entry)
{
	int refcount = --entry->refcount;

	WARN_ON_ONCE(refcount < 0);
	if (refcount == 0) {
		WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode));
		zswap_free_entry(entry);
	}
}

/* caller must hold the tree lock */
static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
				pgoff_t offset)
{
	struct zswap_entry *entry;

	entry = zswap_rb_search(root, offset);
	if (entry)
		zswap_entry_get(entry);

	return entry;
}

/*********************************
* shrinker functions
**********************************/
static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
				       spinlock_t *lock, void *arg);

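/*
 * Scan callback for the memory pressure-based shrinker: walks this memcg's
 * zswap LRU and writes entries back to the swap device via shrink_memcg_cb().
 * It bails out early if the scan would dip into the protected region of the
 * LRU (an estimate of recently used entries) or if an entry's page is found
 * already in the swap cache, which suggests we are reaching warmer data.
 */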
static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
		struct shrink_control *sc)
{
	struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
	unsigned long shrink_ret, nr_protected, lru_size;
	struct zswap_pool *pool = shrinker->private_data;
	bool encountered_page_in_swapcache = false;

	if (!zswap_shrinker_enabled ||
			!mem_cgroup_zswap_writeback_enabled(sc->memcg)) {
		sc->nr_scanned = 0;
		return SHRINK_STOP;
	}

	nr_protected =
		atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
	lru_size = list_lru_shrink_count(&pool->list_lru, sc);

	/*
	 * Abort if we are shrinking into the protected region.
	 *
	 * This short-circuiting is necessary because if we have too many
	 * concurrent reclaimers getting the freeable zswap object counts at
	 * the same time (before any of them made reasonable progress), the
	 * total number of reclaimed objects might be more than the number of
	 * unprotected objects (i.e. the reclaimers will reclaim into the
	 * protected area of the zswap LRU).
	 */
	if (nr_protected >= lru_size - sc->nr_to_scan) {
		sc->nr_scanned = 0;
		return SHRINK_STOP;
	}

	shrink_ret = list_lru_shrink_walk(&pool->list_lru, sc, &shrink_memcg_cb,
		&encountered_page_in_swapcache);

	if (encountered_page_in_swapcache)
		return SHRINK_STOP;

	return shrink_ret ? shrink_ret : SHRINK_STOP;
}

static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
		struct shrink_control *sc)
{
	struct zswap_pool *pool = shrinker->private_data;
	struct mem_cgroup *memcg = sc->memcg;
	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid));
	unsigned long nr_backing, nr_stored, nr_freeable, nr_protected;

	if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg))
		return 0;

#ifdef CONFIG_MEMCG_KMEM
	mem_cgroup_flush_stats(memcg);
	nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT;
	nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
#else
	/* use pool stats instead of memcg stats */
	nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT;
	nr_stored = atomic_read(&pool->nr_stored);
#endif

	if (!nr_stored)
		return 0;

	nr_protected =
		atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
	nr_freeable = list_lru_shrink_count(&pool->list_lru, sc);
	/*
	 * Subtract from the lru size an estimate of the number of pages
	 * that should be protected.
	 */
	nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0;

	/*
	 * Scale the number of freeable pages by the memory saving factor.
	 * This ensures that the better zswap compresses memory, the fewer
	 * pages we will evict to swap (as it will otherwise incur IO for
	 * relatively small memory saving).
	 */
	return mult_frac(nr_freeable, nr_backing, nr_stored);
}

static void zswap_alloc_shrinker(struct zswap_pool *pool)
{
	pool->shrinker =
		shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap");
	if (!pool->shrinker)
		return;

	pool->shrinker->private_data = pool;
	pool->shrinker->scan_objects = zswap_shrinker_scan;
	pool->shrinker->count_objects = zswap_shrinker_count;
	pool->shrinker->batch = 0;
	pool->shrinker->seeks = DEFAULT_SEEKS;
}

/*********************************
* per-cpu code
**********************************/
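/*
 * CPU hotplug "prepare" callback: set up this CPU's compression context for
 * the pool: a 2 * PAGE_SIZE scratch buffer (compressed output may exceed
 * PAGE_SIZE for incompressible data, and the buffer doubles as a bounce
 * buffer when a zpool mapping cannot be held across a sleep), an acomp
 * transform, a request, and a wait object for asynchronous completion.
 */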
static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
{
	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
	struct crypto_acomp *acomp;
	struct acomp_req *req;
	int ret;

	mutex_init(&acomp_ctx->mutex);

	acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
	if (!acomp_ctx->buffer)
		return -ENOMEM;

	acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
	if (IS_ERR(acomp)) {
		pr_err("could not alloc crypto acomp %s : %ld\n",
				pool->tfm_name, PTR_ERR(acomp));
		ret = PTR_ERR(acomp);
		goto acomp_fail;
	}
	acomp_ctx->acomp = acomp;

	req = acomp_request_alloc(acomp_ctx->acomp);
	if (!req) {
		pr_err("could not alloc crypto acomp_request %s\n",
		       pool->tfm_name);
		ret = -ENOMEM;
		goto req_fail;
	}
	acomp_ctx->req = req;

	crypto_init_wait(&acomp_ctx->wait);
	/*
	 * If the acomp backend is async, crypto_req_done() will wake up
	 * crypto_wait_req(); if the backend is scomp, the callback won't be
	 * called and crypto_wait_req() will return without blocking.
	 */
	acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
				   crypto_req_done, &acomp_ctx->wait);

	return 0;

req_fail:
	crypto_free_acomp(acomp_ctx->acomp);
acomp_fail:
	kfree(acomp_ctx->buffer);
	return ret;
}

static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
{
	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);

	if (!IS_ERR_OR_NULL(acomp_ctx)) {
		if (!IS_ERR_OR_NULL(acomp_ctx->req))
			acomp_request_free(acomp_ctx->req);
		if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
			crypto_free_acomp(acomp_ctx->acomp);
		kfree(acomp_ctx->buffer);
	}

	return 0;
}

/*********************************
* pool functions
**********************************/

static struct zswap_pool *__zswap_pool_current(void)
{
	struct zswap_pool *pool;

	pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
	WARN_ONCE(!pool && zswap_has_pool,
		  "%s: no page storage pool!\n", __func__);

	return pool;
}

static struct zswap_pool *zswap_pool_current(void)
{
	assert_spin_locked(&zswap_pools_lock);

	return __zswap_pool_current();
}

static struct zswap_pool *zswap_pool_current_get(void)
{
	struct zswap_pool *pool;

	rcu_read_lock();

	pool = __zswap_pool_current();
	if (!zswap_pool_get(pool))
		pool = NULL;

	rcu_read_unlock();

	return pool;
}

static struct zswap_pool *zswap_pool_last_get(void)
{
	struct zswap_pool *pool, *last = NULL;

	rcu_read_lock();

	list_for_each_entry_rcu(pool, &zswap_pools, list)
		last = pool;
	WARN_ONCE(!last && zswap_has_pool,
		  "%s: no page storage pool!\n", __func__);
	if (!zswap_pool_get(last))
		last = NULL;

	rcu_read_unlock();

	return last;
}

/* type and compressor must be null-terminated */
static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
{
	struct zswap_pool *pool;

	assert_spin_locked(&zswap_pools_lock);

	list_for_each_entry_rcu(pool, &zswap_pools, list) {
		if (strcmp(pool->tfm_name, compressor))
			continue;
		/* all zpools share the same type */
		if (strcmp(zpool_get_type(pool->zpools[0]), type))
			continue;
		/* if we can't get it, it's about to be destroyed */
		if (!zswap_pool_get(pool))
			continue;
		return pool;
	}

	return NULL;
}

/*
 * If the entry is still valid in the tree, drop the initial ref and remove it
 * from the tree. This function must be called with an additional ref held,
 * otherwise it may race with another invalidation freeing the entry.
 */
static void zswap_invalidate_entry(struct zswap_tree *tree,
				   struct zswap_entry *entry)
{
	if (zswap_rb_erase(&tree->rbroot, entry))
		zswap_entry_put(tree, entry);
}

static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
				       spinlock_t *lock, void *arg)
{
	struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
	bool *encountered_page_in_swapcache = (bool *)arg;
	struct zswap_tree *tree;
	pgoff_t swpoffset;
	enum lru_status ret = LRU_REMOVED_RETRY;
	int writeback_result;

	/*
	 * Once the lru lock is dropped, the entry might get freed.  The
	 * swpoffset is copied to the stack, and the entry isn't dereferenced
	 * again until it is verified to still be alive in the tree.
	 */
	swpoffset = swp_offset(entry->swpentry);
	tree = zswap_trees[swp_type(entry->swpentry)];
	list_lru_isolate(l, item);
	/*
	 * It's safe to drop the lock here because we return either
	 * LRU_REMOVED_RETRY or LRU_RETRY.
	 */
	spin_unlock(lock);

	/* Check for invalidate() race */
	spin_lock(&tree->lock);
	if (entry != zswap_rb_search(&tree->rbroot, swpoffset))
		goto unlock;

	/* Hold a reference to prevent a free during writeback */
	zswap_entry_get(entry);
	spin_unlock(&tree->lock);

	writeback_result = zswap_writeback_entry(entry, tree);

	spin_lock(&tree->lock);
	if (writeback_result) {
		zswap_reject_reclaim_fail++;
		zswap_lru_putback(&entry->pool->list_lru, entry);
		ret = LRU_RETRY;

		/*
		 * Encountering a page already in swap cache is a sign that we
		 * are shrinking into the warmer region. We should terminate
		 * shrinking (if we're in the dynamic shrinker context).
		 */
		if (writeback_result == -EEXIST && encountered_page_in_swapcache) {
			ret = LRU_SKIP;
			*encountered_page_in_swapcache = true;
		}

		goto put_unlock;
	}
	zswap_written_back_pages++;

	if (entry->objcg)
		count_objcg_event(entry->objcg, ZSWPWB);

	count_vm_event(ZSWPWB);
	/*
	 * Writeback started successfully, the page now belongs to the
	 * swapcache. Drop the entry from zswap - unless invalidate already
	 * took it out while we had the tree->lock released for IO.
	 */
	zswap_invalidate_entry(tree, entry);

put_unlock:
	/* Drop local reference */
	zswap_entry_put(tree, entry);
unlock:
	spin_unlock(&tree->lock);
	spin_lock(lock);
	return ret;
}

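/*
 * Reclaim from a single memcg: walk its zswap LRU on each node with normal
 * memory and try to write back one entry per node.  Returns 0 if anything
 * was written back, -EAGAIN if nothing was, and an error if writeback is
 * disabled for this memcg, the memcg is an offline zombie, or there is no
 * current pool.
 */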
static int shrink_memcg(struct mem_cgroup *memcg)
{
	struct zswap_pool *pool;
	int nid, shrunk = 0;

	if (!mem_cgroup_zswap_writeback_enabled(memcg))
		return -EINVAL;

	/*
	 * Skip zombies because their LRUs are reparented and we would be
	 * reclaiming from the parent instead of the dead memcg.
	 */
	if (memcg && !mem_cgroup_online(memcg))
		return -ENOENT;

	pool = zswap_pool_current_get();
	if (!pool)
		return -EINVAL;

	for_each_node_state(nid, N_NORMAL_MEMORY) {
		unsigned long nr_to_walk = 1;

		shrunk += list_lru_walk_one(&pool->list_lru, nid, memcg,
					    &shrink_memcg_cb, NULL, &nr_to_walk);
	}
	zswap_pool_put(pool);
	return shrunk ? 0 : -EAGAIN;
}

static void shrink_worker(struct work_struct *w)
{
	struct zswap_pool *pool = container_of(w, typeof(*pool),
						shrink_work);
	struct mem_cgroup *memcg;
	int ret, failures = 0;

	/* global reclaim will select cgroup in a round-robin fashion. */
	do {
		spin_lock(&zswap_pools_lock);
		pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
		memcg = pool->next_shrink;

		/*
		 * We need to retry if we have gone through a full round trip, or if we
		 * got an offline memcg (or else we risk undoing the effect of the
		 * zswap memcg offlining cleanup callback). This is not catastrophic
		 * per se, but it will keep the now offlined memcg hostage for a while.
		 *
		 * Note that if we got an online memcg, we will keep the extra
		 * reference in case the original reference obtained by mem_cgroup_iter
		 * is dropped by the zswap memcg offlining callback, ensuring that the
		 * memcg is not killed when we are reclaiming.
		 */
		if (!memcg) {
			spin_unlock(&zswap_pools_lock);
			if (++failures == MAX_RECLAIM_RETRIES)
				break;

			goto resched;
		}

		if (!mem_cgroup_tryget_online(memcg)) {
			/* drop the reference from mem_cgroup_iter() */
			mem_cgroup_iter_break(NULL, memcg);
			pool->next_shrink = NULL;
			spin_unlock(&zswap_pools_lock);

			if (++failures == MAX_RECLAIM_RETRIES)
				break;

			goto resched;
		}
		spin_unlock(&zswap_pools_lock);

		ret = shrink_memcg(memcg);
		/* drop the extra reference */
		mem_cgroup_put(memcg);

		if (ret == -EINVAL)
			break;
		if (ret && ++failures == MAX_RECLAIM_RETRIES)
			break;

resched:
		cond_resched();
	} while (!zswap_can_accept());
	zswap_pool_put(pool);
}

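/*
 * Create a pool for the given zpool type and compressor name: allocate the
 * ZSWAP_NR_ZPOOLS backing zpools, the per-CPU compression contexts (via the
 * CPU hotplug callback), the shrinker and the memcg-aware list_lru.  Returns
 * NULL on failure.  The initial kref is accounted to the caller making this
 * pool the current pool.
 */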
static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
{
	int i;
	struct zswap_pool *pool;
	char name[38]; /* 'zswap' + 32 char (max) num + \0 */
	gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
	int ret;

	if (!zswap_has_pool) {
		/* if either are unset, pool initialization failed, and we
		 * need both params to be set correctly before trying to
		 * create a pool.
		 */
		if (!strcmp(type, ZSWAP_PARAM_UNSET))
			return NULL;
		if (!strcmp(compressor, ZSWAP_PARAM_UNSET))
			return NULL;
	}

	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
	if (!pool)
		return NULL;

	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) {
		/* unique name for each pool specifically required by zsmalloc */
		snprintf(name, 38, "zswap%x",
			 atomic_inc_return(&zswap_pools_count));

		pool->zpools[i] = zpool_create_pool(type, name, gfp);
		if (!pool->zpools[i]) {
			pr_err("%s zpool not available\n", type);
			goto error;
		}
	}
	pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0]));

	strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));

	pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx);
	if (!pool->acomp_ctx) {
		pr_err("percpu alloc failed\n");
		goto error;
	}

	ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
				       &pool->node);
	if (ret)
		goto error;

	zswap_alloc_shrinker(pool);
	if (!pool->shrinker)
		goto error;

	pr_debug("using %s compressor\n", pool->tfm_name);

	/* being the current pool takes 1 ref; this func expects the
	 * caller to always add the new pool as the current pool
	 */
	kref_init(&pool->kref);
	INIT_LIST_HEAD(&pool->list);
	if (list_lru_init_memcg(&pool->list_lru, pool->shrinker))
		goto lru_fail;
	shrinker_register(pool->shrinker);
	INIT_WORK(&pool->shrink_work, shrink_worker);
	atomic_set(&pool->nr_stored, 0);

	zswap_pool_debug("created", pool);

	return pool;

lru_fail:
	list_lru_destroy(&pool->list_lru);
	shrinker_free(pool->shrinker);
error:
	if (pool->acomp_ctx)
		free_percpu(pool->acomp_ctx);
	while (i--)
		zpool_destroy_pool(pool->zpools[i]);
	kfree(pool);
	return NULL;
}

static struct zswap_pool *__zswap_pool_create_fallback(void)
{
	bool has_comp, has_zpool;

	has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
	if (!has_comp && strcmp(zswap_compressor,
				CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) {
		pr_err("compressor %s not available, using default %s\n",
		       zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT);
		param_free_charp(&zswap_compressor);
		zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
		has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
	}
	if (!has_comp) {
		pr_err("default compressor %s not available\n",
		       zswap_compressor);
		param_free_charp(&zswap_compressor);
		zswap_compressor = ZSWAP_PARAM_UNSET;
	}

	has_zpool = zpool_has_pool(zswap_zpool_type);
	if (!has_zpool && strcmp(zswap_zpool_type,
				 CONFIG_ZSWAP_ZPOOL_DEFAULT)) {
		pr_err("zpool %s not available, using default %s\n",
		       zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT);
		param_free_charp(&zswap_zpool_type);
		zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
		has_zpool = zpool_has_pool(zswap_zpool_type);
	}
	if (!has_zpool) {
		pr_err("default zpool %s not available\n",
		       zswap_zpool_type);
		param_free_charp(&zswap_zpool_type);
		zswap_zpool_type = ZSWAP_PARAM_UNSET;
	}

	if (!has_comp || !has_zpool)
		return NULL;

	return zswap_pool_create(zswap_zpool_type, zswap_compressor);
}

static void zswap_pool_destroy(struct zswap_pool *pool)
{
	int i;

	zswap_pool_debug("destroying", pool);

	shrinker_free(pool->shrinker);
	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
	free_percpu(pool->acomp_ctx);
	list_lru_destroy(&pool->list_lru);

	spin_lock(&zswap_pools_lock);
	mem_cgroup_iter_break(NULL, pool->next_shrink);
	pool->next_shrink = NULL;
	spin_unlock(&zswap_pools_lock);

	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
		zpool_destroy_pool(pool->zpools[i]);
	kfree(pool);
}

static int __must_check zswap_pool_get(struct zswap_pool *pool)
{
	if (!pool)
		return 0;

	return kref_get_unless_zero(&pool->kref);
}

static void __zswap_pool_release(struct work_struct *work)
{
	struct zswap_pool *pool = container_of(work, typeof(*pool),
						release_work);

	synchronize_rcu();

	/* nobody should have been able to get a kref... */
	WARN_ON(kref_get_unless_zero(&pool->kref));

	/* pool is now off zswap_pools list and has no references. */
	zswap_pool_destroy(pool);
}

static void __zswap_pool_empty(struct kref *kref)
{
	struct zswap_pool *pool;

	pool = container_of(kref, typeof(*pool), kref);

	spin_lock(&zswap_pools_lock);

	WARN_ON(pool == zswap_pool_current());

	list_del_rcu(&pool->list);

	INIT_WORK(&pool->release_work, __zswap_pool_release);
	schedule_work(&pool->release_work);

	spin_unlock(&zswap_pools_lock);
}

static void zswap_pool_put(struct zswap_pool *pool)
{
	kref_put(&pool->kref, __zswap_pool_empty);
}

/*********************************
* param callbacks
**********************************/

static bool zswap_pool_changed(const char *s, const struct kernel_param *kp)
{
	/* no change required */
	if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool)
		return false;
	return true;
}

/* val must be a null-terminated string */
static int __zswap_param_set(const char *val, const struct kernel_param *kp,
			     char *type, char *compressor)
{
	struct zswap_pool *pool, *put_pool = NULL;
	char *s = strstrip((char *)val);
	int ret = 0;
	bool new_pool = false;

	mutex_lock(&zswap_init_lock);
	switch (zswap_init_state) {
	case ZSWAP_UNINIT:
		/* if this is load-time (pre-init) param setting,
		 * don't create a pool; that's done during init.
		 */
		ret = param_set_charp(s, kp);
		break;
	case ZSWAP_INIT_SUCCEED:
		new_pool = zswap_pool_changed(s, kp);
		break;
	case ZSWAP_INIT_FAILED:
		pr_err("can't set param, initialization failed\n");
		ret = -ENODEV;
	}
	mutex_unlock(&zswap_init_lock);

	/* no need to create a new pool, return directly */
	if (!new_pool)
		return ret;

	if (!type) {
		if (!zpool_has_pool(s)) {
			pr_err("zpool %s not available\n", s);
			return -ENOENT;
		}
		type = s;
	} else if (!compressor) {
		if (!crypto_has_acomp(s, 0, 0)) {
			pr_err("compressor %s not available\n", s);
			return -ENOENT;
		}
		compressor = s;
	} else {
		WARN_ON(1);
		return -EINVAL;
	}

	spin_lock(&zswap_pools_lock);

	pool = zswap_pool_find_get(type, compressor);
	if (pool) {
		zswap_pool_debug("using existing", pool);
		WARN_ON(pool == zswap_pool_current());
		list_del_rcu(&pool->list);
	}

	spin_unlock(&zswap_pools_lock);

	if (!pool)
		pool = zswap_pool_create(type, compressor);

	if (pool)
		ret = param_set_charp(s, kp);
	else
		ret = -EINVAL;

	spin_lock(&zswap_pools_lock);

	if (!ret) {
		put_pool = zswap_pool_current();
		list_add_rcu(&pool->list, &zswap_pools);
		zswap_has_pool = true;
	} else if (pool) {
		/* add the possibly pre-existing pool to the end of the pools
		 * list; if it's new (and empty) then it'll be removed and
		 * destroyed by the put after we drop the lock
		 */
		list_add_tail_rcu(&pool->list, &zswap_pools);
		put_pool = pool;
	}

	spin_unlock(&zswap_pools_lock);

	if (!zswap_has_pool && !pool) {
		/* if initial pool creation failed, and this pool creation also
		 * failed, maybe both compressor and zpool params were bad.
		 * Allow changing this param, so pool creation will succeed
		 * when the other param is changed. We already verified this
		 * param is ok in the zpool_has_pool() or crypto_has_acomp()
		 * checks above.
		 */
		ret = param_set_charp(s, kp);
	}

	/* drop the ref from either the old current pool,
	 * or the new pool we failed to add
	 */
	if (put_pool)
		zswap_pool_put(put_pool);

	return ret;
}

static int zswap_compressor_param_set(const char *val,
				      const struct kernel_param *kp)
{
	return __zswap_param_set(val, kp, zswap_zpool_type, NULL);
}

static int zswap_zpool_param_set(const char *val,
				 const struct kernel_param *kp)
{
	return __zswap_param_set(val, kp, NULL, zswap_compressor);
}

static int zswap_enabled_param_set(const char *val,
				   const struct kernel_param *kp)
{
	int ret = -ENODEV;

	/* if this is load-time (pre-init) param setting, only set param. */
	if (system_state != SYSTEM_RUNNING)
		return param_set_bool(val, kp);

	mutex_lock(&zswap_init_lock);
	switch (zswap_init_state) {
	case ZSWAP_UNINIT:
		if (zswap_setup())
			break;
		fallthrough;
	case ZSWAP_INIT_SUCCEED:
		if (!zswap_has_pool)
			pr_err("can't enable, no pool configured\n");
		else
			ret = param_set_bool(val, kp);
		break;
	case ZSWAP_INIT_FAILED:
		pr_err("can't enable, initialization failed\n");
	}
	mutex_unlock(&zswap_init_lock);

	return ret;
}

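/*
 * Decompress the entry's data into the given page.  If the zpool mapping
 * cannot be held across a sleep (zpool_can_sleep_mapped() is false), the
 * compressed data is first copied into this CPU's acomp scratch buffer and
 * the handle is unmapped before the (possibly sleeping) decompression runs.
 */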
static void __zswap_load(struct zswap_entry *entry, struct page *page)
{
	struct zpool *zpool = zswap_find_zpool(entry);
	struct scatterlist input, output;
	struct crypto_acomp_ctx *acomp_ctx;
	u8 *src;

	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
	mutex_lock(&acomp_ctx->mutex);

	src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
	if (!zpool_can_sleep_mapped(zpool)) {
		memcpy(acomp_ctx->buffer, src, entry->length);
		src = acomp_ctx->buffer;
		zpool_unmap_handle(zpool, entry->handle);
	}

	sg_init_one(&input, src, entry->length);
	sg_init_table(&output, 1);
	sg_set_page(&output, page, PAGE_SIZE, 0);
	acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE);
	BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait));
	BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE);
	mutex_unlock(&acomp_ctx->mutex);

	if (zpool_can_sleep_mapped(zpool))
		zpool_unmap_handle(zpool, entry->handle);
}

/*********************************
* writeback code
**********************************/
/*
 * Attempts to free an entry by adding a folio to the swap cache,
 * decompressing the entry data into the folio, and issuing a
 * bio write to write the folio back to the swap device.
 *
 * This can be thought of as a "resumed writeback" of the folio
 * to the swap device.  We are basically resuming the same swap
 * writeback path that was intercepted with the zswap_store()
 * in the first place.  After the folio has been decompressed into
 * the swap cache, the compressed version stored by zswap can be
 * freed.
 */
static int zswap_writeback_entry(struct zswap_entry *entry,
				 struct zswap_tree *tree)
{
	swp_entry_t swpentry = entry->swpentry;
	struct folio *folio;
	struct mempolicy *mpol;
	bool folio_was_allocated;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,
	};

	/* try to allocate swap cache folio */
	mpol = get_task_policy(current);
	folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
				NO_INTERLEAVE_INDEX, &folio_was_allocated, true);
	if (!folio)
		return -ENOMEM;

	/*
	 * Found an existing folio, we raced with load/swapin. We generally
	 * writeback cold folios from zswap, and swapin means the folio just
	 * became hot. Skip this folio and let the caller find another one.
	 */
	if (!folio_was_allocated) {
		folio_put(folio);
		return -EEXIST;
	}

	/*
	 * folio is locked, and the swapcache is now secured against
	 * concurrent swapping to and from the slot. Verify that the
	 * swap entry hasn't been invalidated and recycled behind our
	 * backs (our zswap_entry reference doesn't prevent that), to
	 * avoid overwriting a new swap folio with old compressed data.
	 */
	spin_lock(&tree->lock);
	if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) {
		spin_unlock(&tree->lock);
		delete_from_swap_cache(folio);
		return -ENOMEM;
	}
	spin_unlock(&tree->lock);

	__zswap_load(entry, &folio->page);

	/* folio is up to date */
	folio_mark_uptodate(folio);

	/* move it to the tail of the inactive list after end_writeback */
	folio_set_reclaim(folio);

	/* start writeback */
	__swap_writepage(folio, &wbc);
	folio_put(folio);

	return 0;
}

static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
{
	unsigned long *page;
	unsigned long val;
	unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1;

	page = (unsigned long *)ptr;
	val = page[0];

	if (val != page[last_pos])
		return 0;

	for (pos = 1; pos < last_pos; pos++) {
		if (val != page[pos])
			return 0;
	}

	*value = val;

	return 1;
}

static void zswap_fill_page(void *ptr, unsigned long value)
{
	unsigned long *page;

	page = (unsigned long *)ptr;
	memset_l(page, value, PAGE_SIZE / sizeof(unsigned long));
}

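/*
 * Store a folio in zswap on its way out to swap.  The path is roughly:
 * drop any stale duplicate for this swap offset, enforce the cgroup and
 * global pool limits (kicking the shrink worker if needed), detect
 * same-value filled pages (stored as just the value, with length 0),
 * otherwise compress the page into a zpool allocation, then insert the
 * entry into the per-swap-type rbtree and the pool's LRU.  Returns false
 * if the folio was not stored, so it continues down the normal swap path.
 */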
bool zswap_store(struct folio *folio)
{
	swp_entry_t swp = folio->swap;
	int type = swp_type(swp);
	pgoff_t offset = swp_offset(swp);
	struct page *page = &folio->page;
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry, *dupentry;
	struct scatterlist input, output;
	struct crypto_acomp_ctx *acomp_ctx;
	struct obj_cgroup *objcg = NULL;
	struct mem_cgroup *memcg = NULL;
	struct zswap_pool *pool;
	struct zpool *zpool;
	unsigned int dlen = PAGE_SIZE;
	unsigned long handle, value;
	char *buf;
	u8 *src, *dst;
	gfp_t gfp;
	int ret;

	VM_WARN_ON_ONCE(!folio_test_locked(folio));
	VM_WARN_ON_ONCE(!folio_test_swapcache(folio));

	/* Large folios aren't supported */
	if (folio_test_large(folio))
		return false;

	if (!zswap_enabled || !tree)
		return false;

	/*
	 * If this is a duplicate, it must be removed before attempting to store
	 * it; otherwise, if the store fails, the old page won't be removed from
	 * the tree, and it might be written back, overwriting the new data.
	 */
	spin_lock(&tree->lock);
	dupentry = zswap_rb_search(&tree->rbroot, offset);
	if (dupentry) {
		zswap_duplicate_entry++;
		zswap_invalidate_entry(tree, dupentry);
	}
	spin_unlock(&tree->lock);
	objcg = get_obj_cgroup_from_folio(folio);
	if (objcg && !obj_cgroup_may_zswap(objcg)) {
		memcg = get_mem_cgroup_from_objcg(objcg);
		if (shrink_memcg(memcg)) {
			mem_cgroup_put(memcg);
			goto reject;
		}
		mem_cgroup_put(memcg);
	}

	/* reclaim space if needed */
	if (zswap_is_full()) {
		zswap_pool_limit_hit++;
		zswap_pool_reached_full = true;
		goto shrink;
	}

	if (zswap_pool_reached_full) {
		if (!zswap_can_accept())
			goto shrink;
		else
			zswap_pool_reached_full = false;
	}

	/* allocate entry */
	entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
	if (!entry) {
		zswap_reject_kmemcache_fail++;
		goto reject;
	}

	if (zswap_same_filled_pages_enabled) {
		src = kmap_local_page(page);
		if (zswap_is_page_same_filled(src, &value)) {
			kunmap_local(src);
			entry->swpentry = swp_entry(type, offset);
			entry->length = 0;
			entry->value = value;
			atomic_inc(&zswap_same_filled_pages);
			goto insert_entry;
		}
		kunmap_local(src);
	}

	if (!zswap_non_same_filled_pages_enabled)
		goto freepage;

	/* if entry is successfully added, it keeps the reference */
	entry->pool = zswap_pool_current_get();
	if (!entry->pool)
		goto freepage;

	if (objcg) {
		memcg = get_mem_cgroup_from_objcg(objcg);
		if (memcg_list_lru_alloc(memcg, &entry->pool->list_lru, GFP_KERNEL)) {
			mem_cgroup_put(memcg);
			goto put_pool;
		}
		mem_cgroup_put(memcg);
	}

	/* compress */
	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);

	mutex_lock(&acomp_ctx->mutex);

	dst = acomp_ctx->buffer;
	sg_init_table(&input, 1);
	sg_set_page(&input, &folio->page, PAGE_SIZE, 0);

	/*
	 * We need PAGE_SIZE * 2 here since the compressed output may grow
	 * beyond the input, and hardware accelerators may not check the dst
	 * buffer size; give the dst buffer enough length to avoid a buffer
	 * overflow.
	 */
	sg_init_one(&output, dst, PAGE_SIZE * 2);
	acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
	/*
	 * It may look a little silly that we send an asynchronous request and
	 * then wait for its completion synchronously; in effect the operation
	 * is synchronous.  In theory, acomp lets users submit multiple requests
	 * on one acomp instance and have them completed concurrently, but zswap
	 * stores and loads page by page, so a single thread doing zswap has no
	 * way to submit a second page before the first one is done.  Different
	 * threads running on different CPUs use different acomp instances,
	 * however, so multiple threads can still (de)compress in parallel.
	 */
	ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
	dlen = acomp_ctx->req->dlen;

	if (ret) {
		zswap_reject_compress_fail++;
		goto put_dstmem;
	}

	/* store */
	zpool = zswap_find_zpool(entry);
	gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
	if (zpool_malloc_support_movable(zpool))
		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
	ret = zpool_malloc(zpool, dlen, gfp, &handle);
	if (ret == -ENOSPC) {
		zswap_reject_compress_poor++;
		goto put_dstmem;
	}
	if (ret) {
		zswap_reject_alloc_fail++;
		goto put_dstmem;
	}
	buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
	memcpy(buf, dst, dlen);
	zpool_unmap_handle(zpool, handle);
	mutex_unlock(&acomp_ctx->mutex);

	/* populate entry */
	entry->swpentry = swp_entry(type, offset);
	entry->handle = handle;
	entry->length = dlen;

insert_entry:
	entry->objcg = objcg;
	if (objcg) {
		obj_cgroup_charge_zswap(objcg, entry->length);
		/* Account before objcg ref is moved to tree */
		count_objcg_event(objcg, ZSWPOUT);
	}

	/* map */
	spin_lock(&tree->lock);
	/*
	 * A duplicate entry should have been removed at the beginning of this
	 * function. Since the swap entry should be pinned, if a duplicate is
	 * found again here it means that something went wrong in the swap
	 * cache.
	 */
	while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) {
		WARN_ON(1);
		zswap_duplicate_entry++;
		zswap_invalidate_entry(tree, dupentry);
	}
	if (entry->length) {
		INIT_LIST_HEAD(&entry->lru);
		zswap_lru_add(&entry->pool->list_lru, entry);
		atomic_inc(&entry->pool->nr_stored);
	}
	spin_unlock(&tree->lock);

	/* update stats */
	atomic_inc(&zswap_stored_pages);
	zswap_update_total_size();
	count_vm_event(ZSWPOUT);

	return true;

put_dstmem:
	mutex_unlock(&acomp_ctx->mutex);
put_pool:
	zswap_pool_put(entry->pool);
freepage:
	zswap_entry_cache_free(entry);
reject:
	if (objcg)
		obj_cgroup_put(objcg);
	return false;

shrink:
	pool = zswap_pool_last_get();
	if (pool && !queue_work(shrink_wq, &pool->shrink_work))
		zswap_pool_put(pool);
	goto reject;
}

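/*
 * Swap-in path: look up the folio's swap offset in zswap and, on a hit,
 * decompress (or refill, for same-value filled entries) the data into the
 * folio.  With exclusive loads enabled the entry is then invalidated and the
 * folio marked dirty so it can be written back again later; otherwise the
 * entry is re-added to the LRU so it counts as recently used.  Returns false
 * on a miss.
 */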
bool zswap_load(struct folio *folio)
{
	swp_entry_t swp = folio->swap;
	int type = swp_type(swp);
	pgoff_t offset = swp_offset(swp);
	struct page *page = &folio->page;
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry;
	u8 *dst;

	VM_WARN_ON_ONCE(!folio_test_locked(folio));

	/* find */
	spin_lock(&tree->lock);
	entry = zswap_entry_find_get(&tree->rbroot, offset);
	if (!entry) {
		spin_unlock(&tree->lock);
		return false;
	}
	spin_unlock(&tree->lock);

	if (entry->length)
		__zswap_load(entry, page);
	else {
		dst = kmap_local_page(page);
		zswap_fill_page(dst, entry->value);
		kunmap_local(dst);
	}

	count_vm_event(ZSWPIN);
	if (entry->objcg)
		count_objcg_event(entry->objcg, ZSWPIN);

	spin_lock(&tree->lock);
	if (zswap_exclusive_loads_enabled) {
		zswap_invalidate_entry(tree, entry);
		folio_mark_dirty(folio);
	} else if (entry->length) {
		zswap_lru_del(&entry->pool->list_lru, entry);
		zswap_lru_add(&entry->pool->list_lru, entry);
	}
	zswap_entry_put(tree, entry);
	spin_unlock(&tree->lock);

	return true;
}

void zswap_invalidate(int type, pgoff_t offset)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry;

	/* find */
	spin_lock(&tree->lock);
	entry = zswap_rb_search(&tree->rbroot, offset);
	if (!entry) {
		/* entry was written back */
		spin_unlock(&tree->lock);
		return;
	}
	zswap_invalidate_entry(tree, entry);
	spin_unlock(&tree->lock);
}

void zswap_swapon(int type)
{
	struct zswap_tree *tree;

	tree = kzalloc(sizeof(*tree), GFP_KERNEL);
	if (!tree) {
		pr_err("alloc failed, zswap disabled for swap type %d\n", type);
		return;
	}

	tree->rbroot = RB_ROOT;
	spin_lock_init(&tree->lock);
	zswap_trees[type] = tree;
}

void zswap_swapoff(int type)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry, *n;

	if (!tree)
		return;

	/* walk the tree and free everything */
	spin_lock(&tree->lock);
	rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
		zswap_free_entry(entry);
	tree->rbroot = RB_ROOT;
	spin_unlock(&tree->lock);
	kfree(tree);
	zswap_trees[type] = NULL;
}

/*********************************
* debugfs functions
**********************************/
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>

static struct dentry *zswap_debugfs_root;

static int zswap_debugfs_init(void)
{
	if (!debugfs_initialized())
		return -ENODEV;

	zswap_debugfs_root = debugfs_create_dir("zswap", NULL);

	debugfs_create_u64("pool_limit_hit", 0444,
			   zswap_debugfs_root, &zswap_pool_limit_hit);
	debugfs_create_u64("reject_reclaim_fail", 0444,
			   zswap_debugfs_root, &zswap_reject_reclaim_fail);
	debugfs_create_u64("reject_alloc_fail", 0444,
			   zswap_debugfs_root, &zswap_reject_alloc_fail);
	debugfs_create_u64("reject_kmemcache_fail", 0444,
			   zswap_debugfs_root, &zswap_reject_kmemcache_fail);
	debugfs_create_u64("reject_compress_fail", 0444,
			   zswap_debugfs_root, &zswap_reject_compress_fail);
	debugfs_create_u64("reject_compress_poor", 0444,
			   zswap_debugfs_root, &zswap_reject_compress_poor);
	debugfs_create_u64("written_back_pages", 0444,
			   zswap_debugfs_root, &zswap_written_back_pages);
	debugfs_create_u64("duplicate_entry", 0444,
			   zswap_debugfs_root, &zswap_duplicate_entry);
	debugfs_create_u64("pool_total_size", 0444,
			   zswap_debugfs_root, &zswap_pool_total_size);
	debugfs_create_atomic_t("stored_pages", 0444,
				zswap_debugfs_root, &zswap_stored_pages);
	debugfs_create_atomic_t("same_filled_pages", 0444,
				zswap_debugfs_root, &zswap_same_filled_pages);

	return 0;
}
#else
static int zswap_debugfs_init(void)
{
	return 0;
}
#endif

/*********************************
* module init and exit
**********************************/
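/*
 * One-time initialization: create the entry kmem cache, register the CPU
 * hotplug callbacks for the per-CPU compression contexts, create the initial
 * pool from the configured zpool/compressor (falling back to the build-time
 * defaults), and set up the shrink workqueue and debugfs entries.  Called
 * either from the late initcall or lazily when zswap is first enabled.
 */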
static int zswap_setup(void)
{
	struct zswap_pool *pool;
	int ret;

	zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
	if (!zswap_entry_cache) {
		pr_err("entry cache creation failed\n");
		goto cache_fail;
	}

	ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE,
				      "mm/zswap_pool:prepare",
				      zswap_cpu_comp_prepare,
				      zswap_cpu_comp_dead);
	if (ret)
		goto hp_fail;

	pool = __zswap_pool_create_fallback();
	if (pool) {
		pr_info("loaded using pool %s/%s\n", pool->tfm_name,
			zpool_get_type(pool->zpools[0]));
		list_add(&pool->list, &zswap_pools);
		zswap_has_pool = true;
	} else {
		pr_err("pool creation failed\n");
		zswap_enabled = false;
	}

	shrink_wq = create_workqueue("zswap-shrink");
	if (!shrink_wq)
		goto fallback_fail;

	if (zswap_debugfs_init())
		pr_warn("debugfs initialization failed\n");
	zswap_init_state = ZSWAP_INIT_SUCCEED;
	return 0;

fallback_fail:
	if (pool)
		zswap_pool_destroy(pool);
hp_fail:
	kmem_cache_destroy(zswap_entry_cache);
cache_fail:
	/* if built-in, we aren't unloaded on failure; don't allow use */
	zswap_init_state = ZSWAP_INIT_FAILED;
	zswap_enabled = false;
	return -ENOMEM;
}

static int __init zswap_init(void)
{
	if (!zswap_enabled)
		return 0;
	return zswap_setup();
}
/* must be late so crypto has time to come up */
late_initcall(zswap_init);

MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>");
MODULE_DESCRIPTION("Compressed cache for swap pages");