// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * zswap.c - zswap driver file
 *
 * zswap is a cache that takes pages that are in the process
 * of being swapped out and attempts to compress and store them in a
 * RAM-based memory pool.  This can result in a significant I/O reduction on
 * the swap device and, in the case where decompressing from RAM is faster
 * than reading from the swap device, can also improve workload performance.
 *
 * Copyright (C) 2012  Seth Jennings <sjenning@linux.vnet.ibm.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/rbtree.h>
#include <linux/swap.h>
#include <linux/crypto.h>
#include <linux/scatterlist.h>
#include <linux/mempolicy.h>
#include <linux/mempool.h>
#include <linux/zpool.h>
#include <crypto/acompress.h>
#include <linux/zswap.h>
#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/swapops.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/workqueue.h>
#include <linux/list_lru.h>

#include "swap.h"
#include "internal.h"

/*********************************
* statistics
**********************************/
/* Total bytes used by the compressed storage */
u64 zswap_pool_total_size;
/* The number of compressed pages currently stored in zswap */
atomic_t zswap_stored_pages = ATOMIC_INIT(0);
/* The number of same-value filled pages currently stored in zswap */
static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0);

/*
 * The statistics below are not protected from concurrent access for
 * performance reasons, so they may not be 100% accurate.  However,
 * they do provide useful information on roughly how many times a
 * certain event is occurring.
 */

/* Pool limit was hit (see zswap_max_pool_percent) */
static u64 zswap_pool_limit_hit;
/* Pages written back when pool limit was reached */
static u64 zswap_written_back_pages;
/* Store failed due to a reclaim failure after pool limit was reached */
static u64 zswap_reject_reclaim_fail;
/* Store failed due to compression algorithm failure */
static u64 zswap_reject_compress_fail;
/* Compressed page was too big for the allocator to (optimally) store */
static u64 zswap_reject_compress_poor;
/* Store failed because underlying allocator could not get memory */
static u64 zswap_reject_alloc_fail;
/* Store failed because the entry metadata could not be allocated (rare) */
static u64 zswap_reject_kmemcache_fail;

/* Shrinker work queue */
static struct workqueue_struct *shrink_wq;
/* Pool limit was hit, we need to calm down */
static bool zswap_pool_reached_full;

/*********************************
* tunables
**********************************/

/*
 * Placeholder stored in the compressor/zpool string params when no usable
 * value is available.  While no pool exists yet, zswap_pool_create() refuses
 * to build one as long as either param still holds this value.
 */
#define ZSWAP_PARAM_UNSET ""

static int zswap_setup(void);

/* Enable/disable zswap */
static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON);
static int zswap_enabled_param_set(const char *,
				   const struct kernel_param *);
static const struct kernel_param_ops zswap_enabled_param_ops = {
	.set =		zswap_enabled_param_set,
	.get =		param_get_bool,
};
module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644);

/* Crypto compressor to use */
static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
static int zswap_compressor_param_set(const char *,
				      const struct kernel_param *);
static const struct kernel_param_ops zswap_compressor_param_ops = {
	.set =		zswap_compressor_param_set,
	.get =		param_get_charp,
	.free =		param_free_charp,
};
module_param_cb(compressor, &zswap_compressor_param_ops,
		&zswap_compressor, 0644);

/* Compressed storage zpool to use */
static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
static int zswap_zpool_param_set(const char *, const struct kernel_param *);
static const struct kernel_param_ops zswap_zpool_param_ops = {
	.set =		zswap_zpool_param_set,
	.get =		param_get_charp,
	.free =		param_free_charp,
};
module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);

/* The maximum percentage of memory that the compressed pool can occupy */
static unsigned int zswap_max_pool_percent = 20;
module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);

/* The threshold for accepting new pages after the max_pool_percent was hit */
static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */
module_param_named(accept_threshold_percent, zswap_accept_thr_percent,
		   uint, 0644);

/*
 * Enable/disable handling same-value filled pages (enabled by default).
 * If disabled every page is considered non-same-value filled.
 */
static bool zswap_same_filled_pages_enabled = true;
module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
		   bool, 0644);

/* Enable/disable handling non-same-value filled pages (enabled by default) */
static bool zswap_non_same_filled_pages_enabled = true;
module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled,
		   bool, 0644);

/* Number of zpools in zswap_pool (empirically determined for scalability) */
#define ZSWAP_NR_ZPOOLS 32

/* Enable/disable memory pressure-based shrinker.
*/
static bool zswap_shrinker_enabled = IS_ENABLED(
		CONFIG_ZSWAP_SHRINKER_DEFAULT_ON);
module_param_named(shrinker_enabled, zswap_shrinker_enabled, bool, 0644);

/* Report the current value of the "enabled" module parameter. */
bool is_zswap_enabled(void)
{
	return zswap_enabled;
}

/*********************************
* data structures
**********************************/

/*
 * Per-CPU (de)compression context of a zswap_pool.
 *
 * acomp        - compression transform allocated for the pool's tfm_name
 * req          - request object allocated from @acomp
 * wait         - completion used with crypto_wait_req() on @req
 * buffer       - scratch buffer of PAGE_SIZE * 2 bytes
 * mutex        - held by zswap_compress()/zswap_decompress() while they use
 *                @req and @buffer
 * is_sleepable - result of acomp_is_async() for @acomp
 */
struct crypto_acomp_ctx {
	struct crypto_acomp *acomp;
	struct acomp_req *req;
	struct crypto_wait wait;
	u8 *buffer;
	struct mutex mutex;
	bool is_sleepable;
};

/*
 * The lock ordering is zswap_tree.lock -> zswap_pool.lru_lock.
 * The only case where lru_lock is not acquired while holding tree.lock is
 * when a zswap_entry is taken off the lru for writeback, in that case it
 * needs to be verified that it's still valid in the tree.
 *
 * NOTE(review): struct zswap_pool below has no lru_lock member; the LRU
 * declared in this file is the global zswap_list_lru.  The paragraph above
 * may be stale - confirm before relying on it.
 *
 * zpools       - the ZSWAP_NR_ZPOOLS backing allocators, all of the same type
 * acomp_ctx    - per-CPU compression contexts
 * ref          - reference count; __zswap_pool_empty() is its release callback
 * list         - link in the zswap_pools list
 * release_work - deferred work that runs __zswap_pool_release()
 * node         - instance node for the CPUHP_MM_ZSWP_POOL_PREPARE state
 * tfm_name     - name of the compressor used by this pool
 */
struct zswap_pool {
	struct zpool *zpools[ZSWAP_NR_ZPOOLS];
	struct crypto_acomp_ctx __percpu *acomp_ctx;
	struct percpu_ref ref;
	struct list_head list;
	struct work_struct release_work;
	struct hlist_node node;
	char tfm_name[CRYPTO_MAX_ALG_NAME];
};

/* Global LRU lists shared by all zswap pools. */
static struct list_lru zswap_list_lru;
/* counter of pages stored in all zswap pools. */
static atomic_t zswap_nr_stored = ATOMIC_INIT(0);

/* The lock protects zswap_next_shrink updates. */
static DEFINE_SPINLOCK(zswap_shrink_lock);
static struct mem_cgroup *zswap_next_shrink;
static struct work_struct zswap_shrink_work;
static struct shrinker *zswap_shrinker;

/*
 * struct zswap_entry
 *
 * This structure contains the metadata for tracking a single compressed
 * page within zswap.
 *
 * rbnode - links the entry into red-black tree for the appropriate swap type
 * swpentry - associated swap entry, the offset indexes into the red-black tree
 * length - the length in bytes of the compressed page data.  Needed during
 *          decompression.
For a same value filled page length is 0, and both 205 * pool and lru are invalid and must be ignored. 206 * pool - the zswap_pool the entry's data is in 207 * handle - zpool allocation handle that stores the compressed page data 208 * value - value of the same-value filled pages which have same content 209 * objcg - the obj_cgroup that the compressed memory is charged to 210 * lru - handle to the pool's lru used to evict pages. 211 */ 212 struct zswap_entry { 213 struct rb_node rbnode; 214 swp_entry_t swpentry; 215 unsigned int length; 216 struct zswap_pool *pool; 217 union { 218 unsigned long handle; 219 unsigned long value; 220 }; 221 struct obj_cgroup *objcg; 222 struct list_head lru; 223 }; 224 225 struct zswap_tree { 226 struct rb_root rbroot; 227 spinlock_t lock; 228 }; 229 230 static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; 231 static unsigned int nr_zswap_trees[MAX_SWAPFILES]; 232 233 /* RCU-protected iteration */ 234 static LIST_HEAD(zswap_pools); 235 /* protects zswap_pools list modification */ 236 static DEFINE_SPINLOCK(zswap_pools_lock); 237 /* pool counter to provide unique names to zpool */ 238 static atomic_t zswap_pools_count = ATOMIC_INIT(0); 239 240 enum zswap_init_type { 241 ZSWAP_UNINIT, 242 ZSWAP_INIT_SUCCEED, 243 ZSWAP_INIT_FAILED 244 }; 245 246 static enum zswap_init_type zswap_init_state; 247 248 /* used to ensure the integrity of initialization */ 249 static DEFINE_MUTEX(zswap_init_lock); 250 251 /* init completed, but couldn't create the initial pool */ 252 static bool zswap_has_pool; 253 254 /********************************* 255 * helpers and fwd declarations 256 **********************************/ 257 258 static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp) 259 { 260 return &zswap_trees[swp_type(swp)][swp_offset(swp) 261 >> SWAP_ADDRESS_SPACE_SHIFT]; 262 } 263 264 #define zswap_pool_debug(msg, p) \ 265 pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ 266 zpool_get_type((p)->zpools[0])) 267 268 static bool 
zswap_is_full(void) 269 { 270 return totalram_pages() * zswap_max_pool_percent / 100 < 271 DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); 272 } 273 274 static bool zswap_can_accept(void) 275 { 276 return totalram_pages() * zswap_accept_thr_percent / 100 * 277 zswap_max_pool_percent / 100 > 278 DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); 279 } 280 281 static u64 get_zswap_pool_size(struct zswap_pool *pool) 282 { 283 u64 pool_size = 0; 284 int i; 285 286 for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) 287 pool_size += zpool_get_total_size(pool->zpools[i]); 288 289 return pool_size; 290 } 291 292 static void zswap_update_total_size(void) 293 { 294 struct zswap_pool *pool; 295 u64 total = 0; 296 297 rcu_read_lock(); 298 299 list_for_each_entry_rcu(pool, &zswap_pools, list) 300 total += get_zswap_pool_size(pool); 301 302 rcu_read_unlock(); 303 304 zswap_pool_total_size = total; 305 } 306 307 /********************************* 308 * pool functions 309 **********************************/ 310 static void __zswap_pool_empty(struct percpu_ref *ref); 311 312 static struct zswap_pool *zswap_pool_create(char *type, char *compressor) 313 { 314 int i; 315 struct zswap_pool *pool; 316 char name[38]; /* 'zswap' + 32 char (max) num + \0 */ 317 gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; 318 int ret; 319 320 if (!zswap_has_pool) { 321 /* if either are unset, pool initialization failed, and we 322 * need both params to be set correctly before trying to 323 * create a pool. 
324 */ 325 if (!strcmp(type, ZSWAP_PARAM_UNSET)) 326 return NULL; 327 if (!strcmp(compressor, ZSWAP_PARAM_UNSET)) 328 return NULL; 329 } 330 331 pool = kzalloc(sizeof(*pool), GFP_KERNEL); 332 if (!pool) 333 return NULL; 334 335 for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) { 336 /* unique name for each pool specifically required by zsmalloc */ 337 snprintf(name, 38, "zswap%x", 338 atomic_inc_return(&zswap_pools_count)); 339 340 pool->zpools[i] = zpool_create_pool(type, name, gfp); 341 if (!pool->zpools[i]) { 342 pr_err("%s zpool not available\n", type); 343 goto error; 344 } 345 } 346 pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0])); 347 348 strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); 349 350 pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx); 351 if (!pool->acomp_ctx) { 352 pr_err("percpu alloc failed\n"); 353 goto error; 354 } 355 356 ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE, 357 &pool->node); 358 if (ret) 359 goto error; 360 361 /* being the current pool takes 1 ref; this func expects the 362 * caller to always add the new pool as the current pool 363 */ 364 ret = percpu_ref_init(&pool->ref, __zswap_pool_empty, 365 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL); 366 if (ret) 367 goto ref_fail; 368 INIT_LIST_HEAD(&pool->list); 369 370 zswap_pool_debug("created", pool); 371 372 return pool; 373 374 ref_fail: 375 cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); 376 error: 377 if (pool->acomp_ctx) 378 free_percpu(pool->acomp_ctx); 379 while (i--) 380 zpool_destroy_pool(pool->zpools[i]); 381 kfree(pool); 382 return NULL; 383 } 384 385 static struct zswap_pool *__zswap_pool_create_fallback(void) 386 { 387 bool has_comp, has_zpool; 388 389 has_comp = crypto_has_acomp(zswap_compressor, 0, 0); 390 if (!has_comp && strcmp(zswap_compressor, 391 CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) { 392 pr_err("compressor %s not available, using default %s\n", 393 zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT); 394 
param_free_charp(&zswap_compressor); 395 zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT; 396 has_comp = crypto_has_acomp(zswap_compressor, 0, 0); 397 } 398 if (!has_comp) { 399 pr_err("default compressor %s not available\n", 400 zswap_compressor); 401 param_free_charp(&zswap_compressor); 402 zswap_compressor = ZSWAP_PARAM_UNSET; 403 } 404 405 has_zpool = zpool_has_pool(zswap_zpool_type); 406 if (!has_zpool && strcmp(zswap_zpool_type, 407 CONFIG_ZSWAP_ZPOOL_DEFAULT)) { 408 pr_err("zpool %s not available, using default %s\n", 409 zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT); 410 param_free_charp(&zswap_zpool_type); 411 zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT; 412 has_zpool = zpool_has_pool(zswap_zpool_type); 413 } 414 if (!has_zpool) { 415 pr_err("default zpool %s not available\n", 416 zswap_zpool_type); 417 param_free_charp(&zswap_zpool_type); 418 zswap_zpool_type = ZSWAP_PARAM_UNSET; 419 } 420 421 if (!has_comp || !has_zpool) 422 return NULL; 423 424 return zswap_pool_create(zswap_zpool_type, zswap_compressor); 425 } 426 427 static void zswap_pool_destroy(struct zswap_pool *pool) 428 { 429 int i; 430 431 zswap_pool_debug("destroying", pool); 432 433 cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); 434 free_percpu(pool->acomp_ctx); 435 436 for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) 437 zpool_destroy_pool(pool->zpools[i]); 438 kfree(pool); 439 } 440 441 static void __zswap_pool_release(struct work_struct *work) 442 { 443 struct zswap_pool *pool = container_of(work, typeof(*pool), 444 release_work); 445 446 synchronize_rcu(); 447 448 /* nobody should have been able to get a ref... */ 449 WARN_ON(!percpu_ref_is_zero(&pool->ref)); 450 percpu_ref_exit(&pool->ref); 451 452 /* pool is now off zswap_pools list and has no references. 
*/ 453 zswap_pool_destroy(pool); 454 } 455 456 static struct zswap_pool *zswap_pool_current(void); 457 458 static void __zswap_pool_empty(struct percpu_ref *ref) 459 { 460 struct zswap_pool *pool; 461 462 pool = container_of(ref, typeof(*pool), ref); 463 464 spin_lock_bh(&zswap_pools_lock); 465 466 WARN_ON(pool == zswap_pool_current()); 467 468 list_del_rcu(&pool->list); 469 470 INIT_WORK(&pool->release_work, __zswap_pool_release); 471 schedule_work(&pool->release_work); 472 473 spin_unlock_bh(&zswap_pools_lock); 474 } 475 476 static int __must_check zswap_pool_get(struct zswap_pool *pool) 477 { 478 if (!pool) 479 return 0; 480 481 return percpu_ref_tryget(&pool->ref); 482 } 483 484 static void zswap_pool_put(struct zswap_pool *pool) 485 { 486 percpu_ref_put(&pool->ref); 487 } 488 489 static struct zswap_pool *__zswap_pool_current(void) 490 { 491 struct zswap_pool *pool; 492 493 pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list); 494 WARN_ONCE(!pool && zswap_has_pool, 495 "%s: no page storage pool!\n", __func__); 496 497 return pool; 498 } 499 500 static struct zswap_pool *zswap_pool_current(void) 501 { 502 assert_spin_locked(&zswap_pools_lock); 503 504 return __zswap_pool_current(); 505 } 506 507 static struct zswap_pool *zswap_pool_current_get(void) 508 { 509 struct zswap_pool *pool; 510 511 rcu_read_lock(); 512 513 pool = __zswap_pool_current(); 514 if (!zswap_pool_get(pool)) 515 pool = NULL; 516 517 rcu_read_unlock(); 518 519 return pool; 520 } 521 522 /* type and compressor must be null-terminated */ 523 static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) 524 { 525 struct zswap_pool *pool; 526 527 assert_spin_locked(&zswap_pools_lock); 528 529 list_for_each_entry_rcu(pool, &zswap_pools, list) { 530 if (strcmp(pool->tfm_name, compressor)) 531 continue; 532 /* all zpools share the same type */ 533 if (strcmp(zpool_get_type(pool->zpools[0]), type)) 534 continue; 535 /* if we can't get it, it's about to be destroyed */ 536 if 
(!zswap_pool_get(pool)) 537 continue; 538 return pool; 539 } 540 541 return NULL; 542 } 543 544 /********************************* 545 * param callbacks 546 **********************************/ 547 548 static bool zswap_pool_changed(const char *s, const struct kernel_param *kp) 549 { 550 /* no change required */ 551 if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool) 552 return false; 553 return true; 554 } 555 556 /* val must be a null-terminated string */ 557 static int __zswap_param_set(const char *val, const struct kernel_param *kp, 558 char *type, char *compressor) 559 { 560 struct zswap_pool *pool, *put_pool = NULL; 561 char *s = strstrip((char *)val); 562 int ret = 0; 563 bool new_pool = false; 564 565 mutex_lock(&zswap_init_lock); 566 switch (zswap_init_state) { 567 case ZSWAP_UNINIT: 568 /* if this is load-time (pre-init) param setting, 569 * don't create a pool; that's done during init. 570 */ 571 ret = param_set_charp(s, kp); 572 break; 573 case ZSWAP_INIT_SUCCEED: 574 new_pool = zswap_pool_changed(s, kp); 575 break; 576 case ZSWAP_INIT_FAILED: 577 pr_err("can't set param, initialization failed\n"); 578 ret = -ENODEV; 579 } 580 mutex_unlock(&zswap_init_lock); 581 582 /* no need to create a new pool, return directly */ 583 if (!new_pool) 584 return ret; 585 586 if (!type) { 587 if (!zpool_has_pool(s)) { 588 pr_err("zpool %s not available\n", s); 589 return -ENOENT; 590 } 591 type = s; 592 } else if (!compressor) { 593 if (!crypto_has_acomp(s, 0, 0)) { 594 pr_err("compressor %s not available\n", s); 595 return -ENOENT; 596 } 597 compressor = s; 598 } else { 599 WARN_ON(1); 600 return -EINVAL; 601 } 602 603 spin_lock_bh(&zswap_pools_lock); 604 605 pool = zswap_pool_find_get(type, compressor); 606 if (pool) { 607 zswap_pool_debug("using existing", pool); 608 WARN_ON(pool == zswap_pool_current()); 609 list_del_rcu(&pool->list); 610 } 611 612 spin_unlock_bh(&zswap_pools_lock); 613 614 if (!pool) 615 pool = zswap_pool_create(type, compressor); 616 else { 617 /* 
* Restore the initial ref dropped by percpu_ref_kill()
		 * when the pool was decommissioned and switch it again
		 * to percpu mode.
		 */
		percpu_ref_resurrect(&pool->ref);

		/* Drop the ref from zswap_pool_find_get(). */
		zswap_pool_put(pool);
	}

	/* only commit the new string to the param once a pool backs it */
	if (pool)
		ret = param_set_charp(s, kp);
	else
		ret = -EINVAL;

	spin_lock_bh(&zswap_pools_lock);

	if (!ret) {
		/*
		 * Success: remember the previous head of the list so its ref
		 * can be killed below, then put the new pool at the head,
		 * which is where __zswap_pool_current() looks.
		 */
		put_pool = zswap_pool_current();
		list_add_rcu(&pool->list, &zswap_pools);
		zswap_has_pool = true;
	} else if (pool) {
		/* add the possibly pre-existing pool to the end of the pools
		 * list; if it's new (and empty) then it'll be removed and
		 * destroyed by the put after we drop the lock
		 */
		list_add_tail_rcu(&pool->list, &zswap_pools);
		put_pool = pool;
	}

	spin_unlock_bh(&zswap_pools_lock);

	if (!zswap_has_pool && !pool) {
		/* if initial pool creation failed, and this pool creation also
		 * failed, maybe both compressor and zpool params were bad.
		 * Allow changing this param, so pool creation will succeed
		 * when the other param is changed. We already verified this
		 * param is ok in the zpool_has_pool() or crypto_has_acomp()
		 * checks above.
657 */ 658 ret = param_set_charp(s, kp); 659 } 660 661 /* drop the ref from either the old current pool, 662 * or the new pool we failed to add 663 */ 664 if (put_pool) 665 percpu_ref_kill(&put_pool->ref); 666 667 return ret; 668 } 669 670 static int zswap_compressor_param_set(const char *val, 671 const struct kernel_param *kp) 672 { 673 return __zswap_param_set(val, kp, zswap_zpool_type, NULL); 674 } 675 676 static int zswap_zpool_param_set(const char *val, 677 const struct kernel_param *kp) 678 { 679 return __zswap_param_set(val, kp, NULL, zswap_compressor); 680 } 681 682 static int zswap_enabled_param_set(const char *val, 683 const struct kernel_param *kp) 684 { 685 int ret = -ENODEV; 686 687 /* if this is load-time (pre-init) param setting, only set param. */ 688 if (system_state != SYSTEM_RUNNING) 689 return param_set_bool(val, kp); 690 691 mutex_lock(&zswap_init_lock); 692 switch (zswap_init_state) { 693 case ZSWAP_UNINIT: 694 if (zswap_setup()) 695 break; 696 fallthrough; 697 case ZSWAP_INIT_SUCCEED: 698 if (!zswap_has_pool) 699 pr_err("can't enable, no pool configured\n"); 700 else 701 ret = param_set_bool(val, kp); 702 break; 703 case ZSWAP_INIT_FAILED: 704 pr_err("can't enable, initialization failed\n"); 705 } 706 mutex_unlock(&zswap_init_lock); 707 708 return ret; 709 } 710 711 /********************************* 712 * lru functions 713 **********************************/ 714 715 /* should be called under RCU */ 716 #ifdef CONFIG_MEMCG 717 static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) 718 { 719 return entry->objcg ? 
obj_cgroup_memcg(entry->objcg) : NULL;
}
#else
/* Without CONFIG_MEMCG there is no memcg to report. */
static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry)
{
	return NULL;
}
#endif

/* NUMA node of the page that holds the zswap_entry structure itself. */
static inline int entry_to_nid(struct zswap_entry *entry)
{
	return page_to_nid(virt_to_page(entry));
}

/*
 * Put @entry on @list_lru, on the list selected by the entry's node and
 * memcg, then bump that lruvec's nr_zswap_protected counter and decay it.
 */
static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
{
	atomic_long_t *nr_zswap_protected;
	unsigned long lru_size, old, new;
	int nid = entry_to_nid(entry);
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	/*
	 * Note that it is safe to use rcu_read_lock() here, even in the face of
	 * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection
	 * used in list_lru lookup, only two scenarios are possible:
	 *
	 * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The
	 *    new entry will be reparented to memcg's parent's list_lru.
	 * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The
	 *    new entry will be added directly to memcg's parent's list_lru.
	 *
	 * Similar reasoning holds for list_lru_del().
	 */
	rcu_read_lock();
	memcg = mem_cgroup_from_entry(entry);
	/* will always succeed */
	list_lru_add(list_lru, &entry->lru, nid, memcg);

	/* Update the protection area */
	lru_size = list_lru_count_one(list_lru, nid, memcg);
	lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
	nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected;
	old = atomic_long_inc_return(nr_zswap_protected);
	/*
	 * Decay to avoid overflow and adapt to changing workloads.
	 * This is based on LRU reclaim cost decaying heuristics.
	 *
	 * The counter is halved once it exceeds a quarter of the LRU size;
	 * the cmpxchg loop retries when the counter changed in the meantime.
	 */
	do {
		new = old > lru_size / 4 ?
old / 2 : old; 769 } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new)); 770 rcu_read_unlock(); 771 } 772 773 static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry) 774 { 775 int nid = entry_to_nid(entry); 776 struct mem_cgroup *memcg; 777 778 rcu_read_lock(); 779 memcg = mem_cgroup_from_entry(entry); 780 /* will always succeed */ 781 list_lru_del(list_lru, &entry->lru, nid, memcg); 782 rcu_read_unlock(); 783 } 784 785 void zswap_lruvec_state_init(struct lruvec *lruvec) 786 { 787 atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); 788 } 789 790 void zswap_folio_swapin(struct folio *folio) 791 { 792 struct lruvec *lruvec; 793 794 if (folio) { 795 lruvec = folio_lruvec(folio); 796 atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); 797 } 798 } 799 800 void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) 801 { 802 /* lock out zswap shrinker walking memcg tree */ 803 spin_lock(&zswap_shrink_lock); 804 if (zswap_next_shrink == memcg) 805 zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); 806 spin_unlock(&zswap_shrink_lock); 807 } 808 809 /********************************* 810 * rbtree functions 811 **********************************/ 812 static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset) 813 { 814 struct rb_node *node = root->rb_node; 815 struct zswap_entry *entry; 816 pgoff_t entry_offset; 817 818 while (node) { 819 entry = rb_entry(node, struct zswap_entry, rbnode); 820 entry_offset = swp_offset(entry->swpentry); 821 if (entry_offset > offset) 822 node = node->rb_left; 823 else if (entry_offset < offset) 824 node = node->rb_right; 825 else 826 return entry; 827 } 828 return NULL; 829 } 830 831 /* 832 * In the case that a entry with the same offset is found, a pointer to 833 * the existing entry is stored in dupentry and the function returns -EEXIST 834 */ 835 static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, 836 struct 
zswap_entry **dupentry) 837 { 838 struct rb_node **link = &root->rb_node, *parent = NULL; 839 struct zswap_entry *myentry; 840 pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry); 841 842 while (*link) { 843 parent = *link; 844 myentry = rb_entry(parent, struct zswap_entry, rbnode); 845 myentry_offset = swp_offset(myentry->swpentry); 846 if (myentry_offset > entry_offset) 847 link = &(*link)->rb_left; 848 else if (myentry_offset < entry_offset) 849 link = &(*link)->rb_right; 850 else { 851 *dupentry = myentry; 852 return -EEXIST; 853 } 854 } 855 rb_link_node(&entry->rbnode, parent, link); 856 rb_insert_color(&entry->rbnode, root); 857 return 0; 858 } 859 860 static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) 861 { 862 rb_erase(&entry->rbnode, root); 863 RB_CLEAR_NODE(&entry->rbnode); 864 } 865 866 /********************************* 867 * zswap entry functions 868 **********************************/ 869 static struct kmem_cache *zswap_entry_cache; 870 871 static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) 872 { 873 struct zswap_entry *entry; 874 entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); 875 if (!entry) 876 return NULL; 877 RB_CLEAR_NODE(&entry->rbnode); 878 return entry; 879 } 880 881 static void zswap_entry_cache_free(struct zswap_entry *entry) 882 { 883 kmem_cache_free(zswap_entry_cache, entry); 884 } 885 886 static struct zpool *zswap_find_zpool(struct zswap_entry *entry) 887 { 888 int i = 0; 889 890 if (ZSWAP_NR_ZPOOLS > 1) 891 i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS)); 892 893 return entry->pool->zpools[i]; 894 } 895 896 /* 897 * Carries out the common pattern of freeing and entry's zpool allocation, 898 * freeing the entry itself, and decrementing the number of stored pages. 
899 */ 900 static void zswap_entry_free(struct zswap_entry *entry) 901 { 902 if (!entry->length) 903 atomic_dec(&zswap_same_filled_pages); 904 else { 905 zswap_lru_del(&zswap_list_lru, entry); 906 zpool_free(zswap_find_zpool(entry), entry->handle); 907 atomic_dec(&zswap_nr_stored); 908 zswap_pool_put(entry->pool); 909 } 910 if (entry->objcg) { 911 obj_cgroup_uncharge_zswap(entry->objcg, entry->length); 912 obj_cgroup_put(entry->objcg); 913 } 914 zswap_entry_cache_free(entry); 915 atomic_dec(&zswap_stored_pages); 916 zswap_update_total_size(); 917 } 918 919 /* 920 * The caller hold the tree lock and search the entry from the tree, 921 * so it must be on the tree, remove it from the tree and free it. 922 */ 923 static void zswap_invalidate_entry(struct zswap_tree *tree, 924 struct zswap_entry *entry) 925 { 926 zswap_rb_erase(&tree->rbroot, entry); 927 zswap_entry_free(entry); 928 } 929 930 /********************************* 931 * compressed storage functions 932 **********************************/ 933 static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) 934 { 935 struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); 936 struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); 937 struct crypto_acomp *acomp; 938 struct acomp_req *req; 939 int ret; 940 941 mutex_init(&acomp_ctx->mutex); 942 943 acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); 944 if (!acomp_ctx->buffer) 945 return -ENOMEM; 946 947 acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); 948 if (IS_ERR(acomp)) { 949 pr_err("could not alloc crypto acomp %s : %ld\n", 950 pool->tfm_name, PTR_ERR(acomp)); 951 ret = PTR_ERR(acomp); 952 goto acomp_fail; 953 } 954 acomp_ctx->acomp = acomp; 955 acomp_ctx->is_sleepable = acomp_is_async(acomp); 956 957 req = acomp_request_alloc(acomp_ctx->acomp); 958 if (!req) { 959 pr_err("could not alloc crypto acomp_request %s\n", 960 pool->tfm_name); 961 ret = -ENOMEM; 962 
goto req_fail; 963 } 964 acomp_ctx->req = req; 965 966 crypto_init_wait(&acomp_ctx->wait); 967 /* 968 * if the backend of acomp is async zip, crypto_req_done() will wakeup 969 * crypto_wait_req(); if the backend of acomp is scomp, the callback 970 * won't be called, crypto_wait_req() will return without blocking. 971 */ 972 acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, 973 crypto_req_done, &acomp_ctx->wait); 974 975 return 0; 976 977 req_fail: 978 crypto_free_acomp(acomp_ctx->acomp); 979 acomp_fail: 980 kfree(acomp_ctx->buffer); 981 return ret; 982 } 983 984 static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) 985 { 986 struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); 987 struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); 988 989 if (!IS_ERR_OR_NULL(acomp_ctx)) { 990 if (!IS_ERR_OR_NULL(acomp_ctx->req)) 991 acomp_request_free(acomp_ctx->req); 992 if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) 993 crypto_free_acomp(acomp_ctx->acomp); 994 kfree(acomp_ctx->buffer); 995 } 996 997 return 0; 998 } 999 1000 static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) 1001 { 1002 struct crypto_acomp_ctx *acomp_ctx; 1003 struct scatterlist input, output; 1004 int comp_ret = 0, alloc_ret = 0; 1005 unsigned int dlen = PAGE_SIZE; 1006 unsigned long handle; 1007 struct zpool *zpool; 1008 char *buf; 1009 gfp_t gfp; 1010 u8 *dst; 1011 1012 acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); 1013 1014 mutex_lock(&acomp_ctx->mutex); 1015 1016 dst = acomp_ctx->buffer; 1017 sg_init_table(&input, 1); 1018 sg_set_page(&input, &folio->page, PAGE_SIZE, 0); 1019 1020 /* 1021 * We need PAGE_SIZE * 2 here since there maybe over-compression case, 1022 * and hardware-accelerators may won't check the dst buffer size, so 1023 * giving the dst buffer with enough length to avoid buffer overflow. 
*/
	sg_init_one(&output, dst, PAGE_SIZE * 2);
	acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);

	/*
	 * It may look a little silly that we send an asynchronous request and
	 * then wait for its completion synchronously; in effect this makes
	 * the operation synchronous.
	 * Theoretically, acomp lets users submit multiple requests to one
	 * acomp instance and have them complete concurrently.  But zswap
	 * stores and loads page by page, so within one thread doing zswap
	 * there is no existing way to submit the second page before the
	 * first one is done.
	 * Threads running on different CPUs use different acomp instances,
	 * so multiple threads can still (de)compress in parallel.
	 */
	comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
	/* the request reports the number of bytes actually produced */
	dlen = acomp_ctx->req->dlen;
	if (comp_ret)
		goto unlock;

	zpool = zswap_find_zpool(entry);
	gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
	if (zpool_malloc_support_movable(zpool))
		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
	alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle);
	if (alloc_ret)
		goto unlock;

	/* copy the compressed bytes from the scratch buffer into the zpool */
	buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
	memcpy(buf, dst, dlen);
	zpool_unmap_handle(zpool, handle);

	entry->handle = handle;
	entry->length = dlen;

unlock:
	/* classify the failure, if any, for the statistics */
	if (comp_ret == -ENOSPC || alloc_ret == -ENOSPC)
		zswap_reject_compress_poor++;
	else if (comp_ret)
		zswap_reject_compress_fail++;
	else if (alloc_ret)
		zswap_reject_alloc_fail++;

	mutex_unlock(&acomp_ctx->mutex);
	return comp_ret == 0 && alloc_ret == 0;
}

/*
 * Decompress the data stored for @entry into @page.  A decompression
 * error, or an output that is not exactly PAGE_SIZE bytes, triggers
 * BUG_ON().
 */
static void zswap_decompress(struct zswap_entry *entry, struct page *page)
{
	struct zpool *zpool = zswap_find_zpool(entry);
	struct scatterlist input, output;
	struct
crypto_acomp_ctx *acomp_ctx; 1077 u8 *src; 1078 1079 acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); 1080 mutex_lock(&acomp_ctx->mutex); 1081 1082 src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); 1083 if (acomp_ctx->is_sleepable && !zpool_can_sleep_mapped(zpool)) { 1084 memcpy(acomp_ctx->buffer, src, entry->length); 1085 src = acomp_ctx->buffer; 1086 zpool_unmap_handle(zpool, entry->handle); 1087 } 1088 1089 sg_init_one(&input, src, entry->length); 1090 sg_init_table(&output, 1); 1091 sg_set_page(&output, page, PAGE_SIZE, 0); 1092 acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE); 1093 BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait)); 1094 BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE); 1095 mutex_unlock(&acomp_ctx->mutex); 1096 1097 if (!acomp_ctx->is_sleepable || zpool_can_sleep_mapped(zpool)) 1098 zpool_unmap_handle(zpool, entry->handle); 1099 } 1100 1101 /********************************* 1102 * writeback code 1103 **********************************/ 1104 /* 1105 * Attempts to free an entry by adding a folio to the swap cache, 1106 * decompressing the entry data into the folio, and issuing a 1107 * bio write to write the folio back to the swap device. 1108 * 1109 * This can be thought of as a "resumed writeback" of the folio 1110 * to the swap device. We are basically resuming the same swap 1111 * writeback path that was intercepted with the zswap_store() 1112 * in the first place. After the folio has been decompressed into 1113 * the swap cache, the compressed version stored by zswap can be 1114 * freed. 
1115 */ 1116 static int zswap_writeback_entry(struct zswap_entry *entry, 1117 swp_entry_t swpentry) 1118 { 1119 struct zswap_tree *tree; 1120 struct folio *folio; 1121 struct mempolicy *mpol; 1122 bool folio_was_allocated; 1123 struct writeback_control wbc = { 1124 .sync_mode = WB_SYNC_NONE, 1125 }; 1126 1127 /* try to allocate swap cache folio */ 1128 mpol = get_task_policy(current); 1129 folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, 1130 NO_INTERLEAVE_INDEX, &folio_was_allocated, true); 1131 if (!folio) 1132 return -ENOMEM; 1133 1134 /* 1135 * Found an existing folio, we raced with swapin or concurrent 1136 * shrinker. We generally writeback cold folios from zswap, and 1137 * swapin means the folio just became hot, so skip this folio. 1138 * For unlikely concurrent shrinker case, it will be unlinked 1139 * and freed when invalidated by the concurrent shrinker anyway. 1140 */ 1141 if (!folio_was_allocated) { 1142 folio_put(folio); 1143 return -EEXIST; 1144 } 1145 1146 /* 1147 * folio is locked, and the swapcache is now secured against 1148 * concurrent swapping to and from the slot, and concurrent 1149 * swapoff so we can safely dereference the zswap tree here. 1150 * Verify that the swap entry hasn't been invalidated and recycled 1151 * behind our backs, to avoid overwriting a new swap folio with 1152 * old compressed data. Only when this is successful can the entry 1153 * be dereferenced. 1154 */ 1155 tree = swap_zswap_tree(swpentry); 1156 spin_lock(&tree->lock); 1157 if (zswap_rb_search(&tree->rbroot, swp_offset(swpentry)) != entry) { 1158 spin_unlock(&tree->lock); 1159 delete_from_swap_cache(folio); 1160 folio_unlock(folio); 1161 folio_put(folio); 1162 return -ENOMEM; 1163 } 1164 1165 /* Safe to deref entry after the entry is verified above. 
*/ 1166 zswap_rb_erase(&tree->rbroot, entry); 1167 spin_unlock(&tree->lock); 1168 1169 zswap_decompress(entry, &folio->page); 1170 1171 count_vm_event(ZSWPWB); 1172 if (entry->objcg) 1173 count_objcg_event(entry->objcg, ZSWPWB); 1174 1175 zswap_entry_free(entry); 1176 1177 /* folio is up to date */ 1178 folio_mark_uptodate(folio); 1179 1180 /* move it to the tail of the inactive list after end_writeback */ 1181 folio_set_reclaim(folio); 1182 1183 /* start writeback */ 1184 __swap_writepage(folio, &wbc); 1185 folio_put(folio); 1186 1187 return 0; 1188 } 1189 1190 /********************************* 1191 * shrinker functions 1192 **********************************/ 1193 static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, 1194 spinlock_t *lock, void *arg) 1195 { 1196 struct zswap_entry *entry = container_of(item, struct zswap_entry, lru); 1197 bool *encountered_page_in_swapcache = (bool *)arg; 1198 swp_entry_t swpentry; 1199 enum lru_status ret = LRU_REMOVED_RETRY; 1200 int writeback_result; 1201 1202 /* 1203 * As soon as we drop the LRU lock, the entry can be freed by 1204 * a concurrent invalidation. This means the following: 1205 * 1206 * 1. We extract the swp_entry_t to the stack, allowing 1207 * zswap_writeback_entry() to pin the swap entry and 1208 * then validate the zwap entry against that swap entry's 1209 * tree using pointer value comparison. Only when that 1210 * is successful can the entry be dereferenced. 1211 * 1212 * 2. Usually, objects are taken off the LRU for reclaim. In 1213 * this case this isn't possible, because if reclaim fails 1214 * for whatever reason, we have no means of knowing if the 1215 * entry is alive to put it back on the LRU. 1216 * 1217 * So rotate it before dropping the lock. If the entry is 1218 * written back or invalidated, the free path will unlink 1219 * it. For failures, rotation is the right thing as well. 
1220 * 1221 * Temporary failures, where the same entry should be tried 1222 * again immediately, almost never happen for this shrinker. 1223 * We don't do any trylocking; -ENOMEM comes closest, 1224 * but that's extremely rare and doesn't happen spuriously 1225 * either. Don't bother distinguishing this case. 1226 */ 1227 list_move_tail(item, &l->list); 1228 1229 /* 1230 * Once the lru lock is dropped, the entry might get freed. The 1231 * swpentry is copied to the stack, and entry isn't deref'd again 1232 * until the entry is verified to still be alive in the tree. 1233 */ 1234 swpentry = entry->swpentry; 1235 1236 /* 1237 * It's safe to drop the lock here because we return either 1238 * LRU_REMOVED_RETRY or LRU_RETRY. 1239 */ 1240 spin_unlock(lock); 1241 1242 writeback_result = zswap_writeback_entry(entry, swpentry); 1243 1244 if (writeback_result) { 1245 zswap_reject_reclaim_fail++; 1246 ret = LRU_RETRY; 1247 1248 /* 1249 * Encountering a page already in swap cache is a sign that we are shrinking 1250 * into the warmer region. We should terminate shrinking (if we're in the dynamic 1251 * shrinker context). 
1252 */ 1253 if (writeback_result == -EEXIST && encountered_page_in_swapcache) { 1254 ret = LRU_STOP; 1255 *encountered_page_in_swapcache = true; 1256 } 1257 } else { 1258 zswap_written_back_pages++; 1259 } 1260 1261 spin_lock(lock); 1262 return ret; 1263 } 1264 1265 static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, 1266 struct shrink_control *sc) 1267 { 1268 struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); 1269 unsigned long shrink_ret, nr_protected, lru_size; 1270 bool encountered_page_in_swapcache = false; 1271 1272 if (!zswap_shrinker_enabled || 1273 !mem_cgroup_zswap_writeback_enabled(sc->memcg)) { 1274 sc->nr_scanned = 0; 1275 return SHRINK_STOP; 1276 } 1277 1278 nr_protected = 1279 atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); 1280 lru_size = list_lru_shrink_count(&zswap_list_lru, sc); 1281 1282 /* 1283 * Abort if we are shrinking into the protected region. 1284 * 1285 * This short-circuiting is necessary because if we have too many multiple 1286 * concurrent reclaimers getting the freeable zswap object counts at the 1287 * same time (before any of them made reasonable progress), the total 1288 * number of reclaimed objects might be more than the number of unprotected 1289 * objects (i.e the reclaimers will reclaim into the protected area of the 1290 * zswap LRU). 1291 */ 1292 if (nr_protected >= lru_size - sc->nr_to_scan) { 1293 sc->nr_scanned = 0; 1294 return SHRINK_STOP; 1295 } 1296 1297 shrink_ret = list_lru_shrink_walk(&zswap_list_lru, sc, &shrink_memcg_cb, 1298 &encountered_page_in_swapcache); 1299 1300 if (encountered_page_in_swapcache) 1301 return SHRINK_STOP; 1302 1303 return shrink_ret ? 
shrink_ret : SHRINK_STOP; 1304 } 1305 1306 static unsigned long zswap_shrinker_count(struct shrinker *shrinker, 1307 struct shrink_control *sc) 1308 { 1309 struct mem_cgroup *memcg = sc->memcg; 1310 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid)); 1311 unsigned long nr_backing, nr_stored, nr_freeable, nr_protected; 1312 1313 if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg)) 1314 return 0; 1315 1316 #ifdef CONFIG_MEMCG_KMEM 1317 mem_cgroup_flush_stats(memcg); 1318 nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT; 1319 nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); 1320 #else 1321 /* use pool stats instead of memcg stats */ 1322 nr_backing = zswap_pool_total_size >> PAGE_SHIFT; 1323 nr_stored = atomic_read(&zswap_nr_stored); 1324 #endif 1325 1326 if (!nr_stored) 1327 return 0; 1328 1329 nr_protected = 1330 atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); 1331 nr_freeable = list_lru_shrink_count(&zswap_list_lru, sc); 1332 /* 1333 * Subtract the lru size by an estimate of the number of pages 1334 * that should be protected. 1335 */ 1336 nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0; 1337 1338 /* 1339 * Scale the number of freeable pages by the memory saving factor. 1340 * This ensures that the better zswap compresses memory, the fewer 1341 * pages we will evict to swap (as it will otherwise incur IO for 1342 * relatively small memory saving). 
1343 */ 1344 return mult_frac(nr_freeable, nr_backing, nr_stored); 1345 } 1346 1347 static struct shrinker *zswap_alloc_shrinker(void) 1348 { 1349 struct shrinker *shrinker; 1350 1351 shrinker = 1352 shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap"); 1353 if (!shrinker) 1354 return NULL; 1355 1356 shrinker->scan_objects = zswap_shrinker_scan; 1357 shrinker->count_objects = zswap_shrinker_count; 1358 shrinker->batch = 0; 1359 shrinker->seeks = DEFAULT_SEEKS; 1360 return shrinker; 1361 } 1362 1363 static int shrink_memcg(struct mem_cgroup *memcg) 1364 { 1365 int nid, shrunk = 0; 1366 1367 if (!mem_cgroup_zswap_writeback_enabled(memcg)) 1368 return -EINVAL; 1369 1370 /* 1371 * Skip zombies because their LRUs are reparented and we would be 1372 * reclaiming from the parent instead of the dead memcg. 1373 */ 1374 if (memcg && !mem_cgroup_online(memcg)) 1375 return -ENOENT; 1376 1377 for_each_node_state(nid, N_NORMAL_MEMORY) { 1378 unsigned long nr_to_walk = 1; 1379 1380 shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg, 1381 &shrink_memcg_cb, NULL, &nr_to_walk); 1382 } 1383 return shrunk ? 0 : -EAGAIN; 1384 } 1385 1386 static void shrink_worker(struct work_struct *w) 1387 { 1388 struct mem_cgroup *memcg; 1389 int ret, failures = 0; 1390 1391 /* global reclaim will select cgroup in a round-robin fashion. */ 1392 do { 1393 spin_lock(&zswap_shrink_lock); 1394 zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); 1395 memcg = zswap_next_shrink; 1396 1397 /* 1398 * We need to retry if we have gone through a full round trip, or if we 1399 * got an offline memcg (or else we risk undoing the effect of the 1400 * zswap memcg offlining cleanup callback). This is not catastrophic 1401 * per se, but it will keep the now offlined memcg hostage for a while. 
1402 * 1403 * Note that if we got an online memcg, we will keep the extra 1404 * reference in case the original reference obtained by mem_cgroup_iter 1405 * is dropped by the zswap memcg offlining callback, ensuring that the 1406 * memcg is not killed when we are reclaiming. 1407 */ 1408 if (!memcg) { 1409 spin_unlock(&zswap_shrink_lock); 1410 if (++failures == MAX_RECLAIM_RETRIES) 1411 break; 1412 1413 goto resched; 1414 } 1415 1416 if (!mem_cgroup_tryget_online(memcg)) { 1417 /* drop the reference from mem_cgroup_iter() */ 1418 mem_cgroup_iter_break(NULL, memcg); 1419 zswap_next_shrink = NULL; 1420 spin_unlock(&zswap_shrink_lock); 1421 1422 if (++failures == MAX_RECLAIM_RETRIES) 1423 break; 1424 1425 goto resched; 1426 } 1427 spin_unlock(&zswap_shrink_lock); 1428 1429 ret = shrink_memcg(memcg); 1430 /* drop the extra reference */ 1431 mem_cgroup_put(memcg); 1432 1433 if (ret == -EINVAL) 1434 break; 1435 if (ret && ++failures == MAX_RECLAIM_RETRIES) 1436 break; 1437 1438 resched: 1439 cond_resched(); 1440 } while (!zswap_can_accept()); 1441 } 1442 1443 static int zswap_is_page_same_filled(void *ptr, unsigned long *value) 1444 { 1445 unsigned long *page; 1446 unsigned long val; 1447 unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1; 1448 1449 page = (unsigned long *)ptr; 1450 val = page[0]; 1451 1452 if (val != page[last_pos]) 1453 return 0; 1454 1455 for (pos = 1; pos < last_pos; pos++) { 1456 if (val != page[pos]) 1457 return 0; 1458 } 1459 1460 *value = val; 1461 1462 return 1; 1463 } 1464 1465 static void zswap_fill_page(void *ptr, unsigned long value) 1466 { 1467 unsigned long *page; 1468 1469 page = (unsigned long *)ptr; 1470 memset_l(page, value, PAGE_SIZE / sizeof(unsigned long)); 1471 } 1472 1473 bool zswap_store(struct folio *folio) 1474 { 1475 swp_entry_t swp = folio->swap; 1476 pgoff_t offset = swp_offset(swp); 1477 struct zswap_tree *tree = swap_zswap_tree(swp); 1478 struct zswap_entry *entry, *dupentry; 1479 struct obj_cgroup *objcg = NULL; 
1480 struct mem_cgroup *memcg = NULL; 1481 1482 VM_WARN_ON_ONCE(!folio_test_locked(folio)); 1483 VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); 1484 1485 /* Large folios aren't supported */ 1486 if (folio_test_large(folio)) 1487 return false; 1488 1489 if (!zswap_enabled) 1490 goto check_old; 1491 1492 objcg = get_obj_cgroup_from_folio(folio); 1493 if (objcg && !obj_cgroup_may_zswap(objcg)) { 1494 memcg = get_mem_cgroup_from_objcg(objcg); 1495 if (shrink_memcg(memcg)) { 1496 mem_cgroup_put(memcg); 1497 goto reject; 1498 } 1499 mem_cgroup_put(memcg); 1500 } 1501 1502 /* reclaim space if needed */ 1503 if (zswap_is_full()) { 1504 zswap_pool_limit_hit++; 1505 zswap_pool_reached_full = true; 1506 goto shrink; 1507 } 1508 1509 if (zswap_pool_reached_full) { 1510 if (!zswap_can_accept()) 1511 goto shrink; 1512 else 1513 zswap_pool_reached_full = false; 1514 } 1515 1516 /* allocate entry */ 1517 entry = zswap_entry_cache_alloc(GFP_KERNEL, folio_nid(folio)); 1518 if (!entry) { 1519 zswap_reject_kmemcache_fail++; 1520 goto reject; 1521 } 1522 1523 if (zswap_same_filled_pages_enabled) { 1524 unsigned long value; 1525 u8 *src; 1526 1527 src = kmap_local_folio(folio, 0); 1528 if (zswap_is_page_same_filled(src, &value)) { 1529 kunmap_local(src); 1530 entry->length = 0; 1531 entry->value = value; 1532 atomic_inc(&zswap_same_filled_pages); 1533 goto insert_entry; 1534 } 1535 kunmap_local(src); 1536 } 1537 1538 if (!zswap_non_same_filled_pages_enabled) 1539 goto freepage; 1540 1541 /* if entry is successfully added, it keeps the reference */ 1542 entry->pool = zswap_pool_current_get(); 1543 if (!entry->pool) 1544 goto freepage; 1545 1546 if (objcg) { 1547 memcg = get_mem_cgroup_from_objcg(objcg); 1548 if (memcg_list_lru_alloc(memcg, &zswap_list_lru, GFP_KERNEL)) { 1549 mem_cgroup_put(memcg); 1550 goto put_pool; 1551 } 1552 mem_cgroup_put(memcg); 1553 } 1554 1555 if (!zswap_compress(folio, entry)) 1556 goto put_pool; 1557 1558 insert_entry: 1559 entry->swpentry = swp; 1560 
entry->objcg = objcg; 1561 if (objcg) { 1562 obj_cgroup_charge_zswap(objcg, entry->length); 1563 /* Account before objcg ref is moved to tree */ 1564 count_objcg_event(objcg, ZSWPOUT); 1565 } 1566 1567 /* map */ 1568 spin_lock(&tree->lock); 1569 /* 1570 * The folio may have been dirtied again, invalidate the 1571 * possibly stale entry before inserting the new entry. 1572 */ 1573 if (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) { 1574 zswap_invalidate_entry(tree, dupentry); 1575 WARN_ON(zswap_rb_insert(&tree->rbroot, entry, &dupentry)); 1576 } 1577 if (entry->length) { 1578 INIT_LIST_HEAD(&entry->lru); 1579 zswap_lru_add(&zswap_list_lru, entry); 1580 atomic_inc(&zswap_nr_stored); 1581 } 1582 spin_unlock(&tree->lock); 1583 1584 /* update stats */ 1585 atomic_inc(&zswap_stored_pages); 1586 zswap_update_total_size(); 1587 count_vm_event(ZSWPOUT); 1588 1589 return true; 1590 1591 put_pool: 1592 zswap_pool_put(entry->pool); 1593 freepage: 1594 zswap_entry_cache_free(entry); 1595 reject: 1596 if (objcg) 1597 obj_cgroup_put(objcg); 1598 check_old: 1599 /* 1600 * If the zswap store fails or zswap is disabled, we must invalidate the 1601 * possibly stale entry which was previously stored at this offset. 1602 * Otherwise, writeback could overwrite the new data in the swapfile. 
1603 */ 1604 spin_lock(&tree->lock); 1605 entry = zswap_rb_search(&tree->rbroot, offset); 1606 if (entry) 1607 zswap_invalidate_entry(tree, entry); 1608 spin_unlock(&tree->lock); 1609 return false; 1610 1611 shrink: 1612 queue_work(shrink_wq, &zswap_shrink_work); 1613 goto reject; 1614 } 1615 1616 bool zswap_load(struct folio *folio) 1617 { 1618 swp_entry_t swp = folio->swap; 1619 pgoff_t offset = swp_offset(swp); 1620 struct page *page = &folio->page; 1621 struct zswap_tree *tree = swap_zswap_tree(swp); 1622 struct zswap_entry *entry; 1623 u8 *dst; 1624 1625 VM_WARN_ON_ONCE(!folio_test_locked(folio)); 1626 1627 spin_lock(&tree->lock); 1628 entry = zswap_rb_search(&tree->rbroot, offset); 1629 if (!entry) { 1630 spin_unlock(&tree->lock); 1631 return false; 1632 } 1633 zswap_rb_erase(&tree->rbroot, entry); 1634 spin_unlock(&tree->lock); 1635 1636 if (entry->length) 1637 zswap_decompress(entry, page); 1638 else { 1639 dst = kmap_local_page(page); 1640 zswap_fill_page(dst, entry->value); 1641 kunmap_local(dst); 1642 } 1643 1644 count_vm_event(ZSWPIN); 1645 if (entry->objcg) 1646 count_objcg_event(entry->objcg, ZSWPIN); 1647 1648 zswap_entry_free(entry); 1649 1650 folio_mark_dirty(folio); 1651 1652 return true; 1653 } 1654 1655 void zswap_invalidate(swp_entry_t swp) 1656 { 1657 pgoff_t offset = swp_offset(swp); 1658 struct zswap_tree *tree = swap_zswap_tree(swp); 1659 struct zswap_entry *entry; 1660 1661 spin_lock(&tree->lock); 1662 entry = zswap_rb_search(&tree->rbroot, offset); 1663 if (entry) 1664 zswap_invalidate_entry(tree, entry); 1665 spin_unlock(&tree->lock); 1666 } 1667 1668 int zswap_swapon(int type, unsigned long nr_pages) 1669 { 1670 struct zswap_tree *trees, *tree; 1671 unsigned int nr, i; 1672 1673 nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); 1674 trees = kvcalloc(nr, sizeof(*tree), GFP_KERNEL); 1675 if (!trees) { 1676 pr_err("alloc failed, zswap disabled for swap type %d\n", type); 1677 return -ENOMEM; 1678 } 1679 1680 for (i = 0; i < nr; i++) { 
1681 tree = trees + i; 1682 tree->rbroot = RB_ROOT; 1683 spin_lock_init(&tree->lock); 1684 } 1685 1686 nr_zswap_trees[type] = nr; 1687 zswap_trees[type] = trees; 1688 return 0; 1689 } 1690 1691 void zswap_swapoff(int type) 1692 { 1693 struct zswap_tree *trees = zswap_trees[type]; 1694 unsigned int i; 1695 1696 if (!trees) 1697 return; 1698 1699 /* try_to_unuse() invalidated all the entries already */ 1700 for (i = 0; i < nr_zswap_trees[type]; i++) 1701 WARN_ON_ONCE(!RB_EMPTY_ROOT(&trees[i].rbroot)); 1702 1703 kvfree(trees); 1704 nr_zswap_trees[type] = 0; 1705 zswap_trees[type] = NULL; 1706 } 1707 1708 /********************************* 1709 * debugfs functions 1710 **********************************/ 1711 #ifdef CONFIG_DEBUG_FS 1712 #include <linux/debugfs.h> 1713 1714 static struct dentry *zswap_debugfs_root; 1715 1716 static int zswap_debugfs_init(void) 1717 { 1718 if (!debugfs_initialized()) 1719 return -ENODEV; 1720 1721 zswap_debugfs_root = debugfs_create_dir("zswap", NULL); 1722 1723 debugfs_create_u64("pool_limit_hit", 0444, 1724 zswap_debugfs_root, &zswap_pool_limit_hit); 1725 debugfs_create_u64("reject_reclaim_fail", 0444, 1726 zswap_debugfs_root, &zswap_reject_reclaim_fail); 1727 debugfs_create_u64("reject_alloc_fail", 0444, 1728 zswap_debugfs_root, &zswap_reject_alloc_fail); 1729 debugfs_create_u64("reject_kmemcache_fail", 0444, 1730 zswap_debugfs_root, &zswap_reject_kmemcache_fail); 1731 debugfs_create_u64("reject_compress_fail", 0444, 1732 zswap_debugfs_root, &zswap_reject_compress_fail); 1733 debugfs_create_u64("reject_compress_poor", 0444, 1734 zswap_debugfs_root, &zswap_reject_compress_poor); 1735 debugfs_create_u64("written_back_pages", 0444, 1736 zswap_debugfs_root, &zswap_written_back_pages); 1737 debugfs_create_u64("pool_total_size", 0444, 1738 zswap_debugfs_root, &zswap_pool_total_size); 1739 debugfs_create_atomic_t("stored_pages", 0444, 1740 zswap_debugfs_root, &zswap_stored_pages); 1741 debugfs_create_atomic_t("same_filled_pages", 0444, 1742 
zswap_debugfs_root, &zswap_same_filled_pages); 1743 1744 return 0; 1745 } 1746 #else 1747 static int zswap_debugfs_init(void) 1748 { 1749 return 0; 1750 } 1751 #endif 1752 1753 /********************************* 1754 * module init and exit 1755 **********************************/ 1756 static int zswap_setup(void) 1757 { 1758 struct zswap_pool *pool; 1759 int ret; 1760 1761 zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); 1762 if (!zswap_entry_cache) { 1763 pr_err("entry cache creation failed\n"); 1764 goto cache_fail; 1765 } 1766 1767 ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE, 1768 "mm/zswap_pool:prepare", 1769 zswap_cpu_comp_prepare, 1770 zswap_cpu_comp_dead); 1771 if (ret) 1772 goto hp_fail; 1773 1774 shrink_wq = alloc_workqueue("zswap-shrink", 1775 WQ_UNBOUND|WQ_MEM_RECLAIM, 1); 1776 if (!shrink_wq) 1777 goto shrink_wq_fail; 1778 1779 zswap_shrinker = zswap_alloc_shrinker(); 1780 if (!zswap_shrinker) 1781 goto shrinker_fail; 1782 if (list_lru_init_memcg(&zswap_list_lru, zswap_shrinker)) 1783 goto lru_fail; 1784 shrinker_register(zswap_shrinker); 1785 1786 INIT_WORK(&zswap_shrink_work, shrink_worker); 1787 1788 pool = __zswap_pool_create_fallback(); 1789 if (pool) { 1790 pr_info("loaded using pool %s/%s\n", pool->tfm_name, 1791 zpool_get_type(pool->zpools[0])); 1792 list_add(&pool->list, &zswap_pools); 1793 zswap_has_pool = true; 1794 } else { 1795 pr_err("pool creation failed\n"); 1796 zswap_enabled = false; 1797 } 1798 1799 if (zswap_debugfs_init()) 1800 pr_warn("debugfs initialization failed\n"); 1801 zswap_init_state = ZSWAP_INIT_SUCCEED; 1802 return 0; 1803 1804 lru_fail: 1805 shrinker_free(zswap_shrinker); 1806 shrinker_fail: 1807 destroy_workqueue(shrink_wq); 1808 shrink_wq_fail: 1809 cpuhp_remove_multi_state(CPUHP_MM_ZSWP_POOL_PREPARE); 1810 hp_fail: 1811 kmem_cache_destroy(zswap_entry_cache); 1812 cache_fail: 1813 /* if built-in, we aren't unloaded on failure; don't allow use */ 1814 zswap_init_state = ZSWAP_INIT_FAILED; 1815 zswap_enabled 
= false; 1816 return -ENOMEM; 1817 } 1818 1819 static int __init zswap_init(void) 1820 { 1821 if (!zswap_enabled) 1822 return 0; 1823 return zswap_setup(); 1824 } 1825 /* must be late so crypto has time to come up */ 1826 late_initcall(zswap_init); 1827 1828 MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>"); 1829 MODULE_DESCRIPTION("Compressed cache for swap pages"); 1830