// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * zswap.c - zswap driver file
 *
 * zswap is a cache that takes pages that are in the process
 * of being swapped out and attempts to compress and store them in a
 * RAM-based memory pool.  This can result in a significant I/O reduction on
 * the swap device and, in the case where decompressing from RAM is faster
 * than reading from the swap device, can also improve workload performance.
 *
 * Copyright (C) 2012 Seth Jennings <sjenning@linux.vnet.ibm.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/rbtree.h>
#include <linux/swap.h>
#include <linux/crypto.h>
#include <linux/scatterlist.h>
#include <linux/mempolicy.h>
#include <linux/mempool.h>
#include <linux/zpool.h>
#include <crypto/acompress.h>
#include <linux/zswap.h>
#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/swapops.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/workqueue.h>
#include <linux/list_lru.h>

#include "swap.h"
#include "internal.h"

/*********************************
* statistics
**********************************/
/* Total bytes used by the compressed storage */
u64 zswap_pool_total_size;
/* The number of compressed pages currently stored in zswap */
atomic_t zswap_stored_pages = ATOMIC_INIT(0);
/* The number of same-value filled pages currently stored in zswap */
static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0);

/*
 * The statistics below are not protected from concurrent access for
 * performance reasons so they may not be 100% accurate.  However,
 * they do provide useful information on roughly how many times a
 * certain event is occurring.
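 * They can be inspected through debugfs (typically mounted at
 * /sys/kernel/debug, under the "zswap" directory) when CONFIG_DEBUG_FS is
 * enabled; see zswap_debugfs_init() below.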
 */

/* Pool limit was hit (see zswap_max_pool_percent) */
static u64 zswap_pool_limit_hit;
/* Pages written back when pool limit was reached */
static u64 zswap_written_back_pages;
/* Store failed due to a reclaim failure after pool limit was reached */
static u64 zswap_reject_reclaim_fail;
/* Store failed due to compression algorithm failure */
static u64 zswap_reject_compress_fail;
/* Compressed page was too big for the allocator to (optimally) store */
static u64 zswap_reject_compress_poor;
/* Store failed because underlying allocator could not get memory */
static u64 zswap_reject_alloc_fail;
/* Store failed because the entry metadata could not be allocated (rare) */
static u64 zswap_reject_kmemcache_fail;

/* Shrinker work queue */
static struct workqueue_struct *shrink_wq;
/* Pool limit was hit, we need to calm down */
static bool zswap_pool_reached_full;

/*********************************
* tunables
**********************************/

#define ZSWAP_PARAM_UNSET ""

static int zswap_setup(void);

/* Enable/disable zswap */
static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON);
static int zswap_enabled_param_set(const char *,
				   const struct kernel_param *);
static const struct kernel_param_ops zswap_enabled_param_ops = {
	.set = zswap_enabled_param_set,
	.get = param_get_bool,
};
module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644);

/* Crypto compressor to use */
static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
static int zswap_compressor_param_set(const char *,
				      const struct kernel_param *);
static const struct kernel_param_ops zswap_compressor_param_ops = {
	.set = zswap_compressor_param_set,
	.get = param_get_charp,
	.free = param_free_charp,
};
module_param_cb(compressor, &zswap_compressor_param_ops,
		&zswap_compressor, 0644);

/* Compressed storage zpool to use */
static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
static int zswap_zpool_param_set(const char *, const struct kernel_param *);
static const struct kernel_param_ops zswap_zpool_param_ops = {
	.set = zswap_zpool_param_set,
	.get = param_get_charp,
	.free = param_free_charp,
};
module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);

/* The maximum percentage of memory that the compressed pool can occupy */
static unsigned int zswap_max_pool_percent = 20;
module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);

/* The threshold for accepting new pages after the max_pool_percent was hit */
static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */
module_param_named(accept_threshold_percent, zswap_accept_thr_percent,
		   uint, 0644);

/*
 * Enable/disable handling same-value filled pages (enabled by default).
 * If disabled every page is considered non-same-value filled.
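 * Same-value filled pages are not compressed at all: only the repeated
 * word value is recorded in the entry (entry->value, with length == 0)
 * and the page is reconstructed with zswap_fill_page() on load.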
 */
static bool zswap_same_filled_pages_enabled = true;
module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
		   bool, 0644);

/* Enable/disable handling non-same-value filled pages (enabled by default) */
static bool zswap_non_same_filled_pages_enabled = true;
module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled,
		   bool, 0644);

/* Number of zpools in zswap_pool (empirically determined for scalability) */
#define ZSWAP_NR_ZPOOLS 32

/* Enable/disable memory pressure-based shrinker. */
static bool zswap_shrinker_enabled = IS_ENABLED(
		CONFIG_ZSWAP_SHRINKER_DEFAULT_ON);
module_param_named(shrinker_enabled, zswap_shrinker_enabled, bool, 0644);

bool is_zswap_enabled(void)
{
	return zswap_enabled;
}

/*********************************
* data structures
**********************************/

struct crypto_acomp_ctx {
	struct crypto_acomp *acomp;
	struct acomp_req *req;
	struct crypto_wait wait;
	u8 *buffer;
	struct mutex mutex;
	bool is_sleepable;
};

/*
 * The lock ordering is zswap_tree.lock -> zswap_pool.lru_lock.
 * The only case where lru_lock is not acquired while holding tree.lock is
 * when a zswap_entry is taken off the lru for writeback, in that case it
 * needs to be verified that it's still valid in the tree.
 */
struct zswap_pool {
	struct zpool *zpools[ZSWAP_NR_ZPOOLS];
	struct crypto_acomp_ctx __percpu *acomp_ctx;
	struct percpu_ref ref;
	struct list_head list;
	struct work_struct release_work;
	struct hlist_node node;
	char tfm_name[CRYPTO_MAX_ALG_NAME];
};

/* Global LRU lists shared by all zswap pools. */
static struct list_lru zswap_list_lru;
/* counter of pages stored in all zswap pools. */
static atomic_t zswap_nr_stored = ATOMIC_INIT(0);

/* The lock protects zswap_next_shrink updates. */
static DEFINE_SPINLOCK(zswap_shrink_lock);
static struct mem_cgroup *zswap_next_shrink;
static struct work_struct zswap_shrink_work;
static struct shrinker *zswap_shrinker;

/*
 * struct zswap_entry
 *
 * This structure contains the metadata for tracking a single compressed
 * page within zswap.
 *
 * rbnode - links the entry into the red-black tree for the appropriate swap type
 * swpentry - associated swap entry, the offset indexes into the red-black tree
 * length - the length in bytes of the compressed page data.  Needed during
 *          decompression.  For a same-value filled page length is 0, and both
 *          pool and lru are invalid and must be ignored.
 * pool - the zswap_pool the entry's data is in
 * handle - zpool allocation handle that stores the compressed page data
 * value - value of the same-value filled pages which have the same content
 * objcg - the obj_cgroup that the compressed memory is charged to
 * lru - handle to the pool's lru used to evict pages.
 */
struct zswap_entry {
	struct rb_node rbnode;
	swp_entry_t swpentry;
	unsigned int length;
	struct zswap_pool *pool;
	union {
		unsigned long handle;
		unsigned long value;
	};
	struct obj_cgroup *objcg;
	struct list_head lru;
};

struct zswap_tree {
	struct rb_root rbroot;
	spinlock_t lock;
};

static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
static unsigned int nr_zswap_trees[MAX_SWAPFILES];

/* RCU-protected iteration */
static LIST_HEAD(zswap_pools);
/* protects zswap_pools list modification */
static DEFINE_SPINLOCK(zswap_pools_lock);
/* pool counter to provide unique names to zpool */
static atomic_t zswap_pools_count = ATOMIC_INIT(0);

enum zswap_init_type {
	ZSWAP_UNINIT,
	ZSWAP_INIT_SUCCEED,
	ZSWAP_INIT_FAILED
};

static enum zswap_init_type zswap_init_state;

/* used to ensure the integrity of initialization */
static DEFINE_MUTEX(zswap_init_lock);

/* init completed, but couldn't create the initial pool */
static bool zswap_has_pool;

/*********************************
* helpers and fwd declarations
**********************************/

static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp)
{
	return &zswap_trees[swp_type(swp)][swp_offset(swp)
		>> SWAP_ADDRESS_SPACE_SHIFT];
}

#define zswap_pool_debug(msg, p)				\
	pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,		\
		 zpool_get_type((p)->zpools[0]))

static bool zswap_is_full(void)
{
	return totalram_pages() * zswap_max_pool_percent / 100 <
			DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
}

static bool zswap_can_accept(void)
{
	return totalram_pages() * zswap_accept_thr_percent / 100 *
				zswap_max_pool_percent / 100 >
			DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
}

static u64 get_zswap_pool_size(struct zswap_pool *pool)
{
	u64 pool_size = 0;
	int i;

	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
		pool_size += zpool_get_total_size(pool->zpools[i]);

	return pool_size;
}

static void zswap_update_total_size(void)
{
	struct zswap_pool *pool;
	u64 total = 0;

	rcu_read_lock();

	list_for_each_entry_rcu(pool, &zswap_pools, list)
		total += get_zswap_pool_size(pool);

	rcu_read_unlock();

	zswap_pool_total_size = total;
}

/*********************************
* pool functions
**********************************/
static void __zswap_pool_empty(struct percpu_ref *ref);

static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
{
	int i;
	struct zswap_pool *pool;
	char name[38]; /* 'zswap' + 32 char (max) num + \0 */
	gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
	int ret;

	if (!zswap_has_pool) {
		/* if either is unset, pool initialization failed, and we
		 * need both params to be set correctly before trying to
		 * create a pool.
		 */
		if (!strcmp(type, ZSWAP_PARAM_UNSET))
			return NULL;
		if (!strcmp(compressor, ZSWAP_PARAM_UNSET))
			return NULL;
	}

	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
	if (!pool)
		return NULL;

	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) {
		/* unique name for each pool specifically required by zsmalloc */
		snprintf(name, 38, "zswap%x",
			 atomic_inc_return(&zswap_pools_count));

		pool->zpools[i] = zpool_create_pool(type, name, gfp);
		if (!pool->zpools[i]) {
			pr_err("%s zpool not available\n", type);
			goto error;
		}
	}
	pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0]));

	strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));

	pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx);
	if (!pool->acomp_ctx) {
		pr_err("percpu alloc failed\n");
		goto error;
	}

	ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
				       &pool->node);
	if (ret)
		goto error;

	/* being the current pool takes 1 ref; this func expects the
	 * caller to always add the new pool as the current pool
	 */
	ret = percpu_ref_init(&pool->ref, __zswap_pool_empty,
			      PERCPU_REF_ALLOW_REINIT, GFP_KERNEL);
	if (ret)
		goto ref_fail;
	INIT_LIST_HEAD(&pool->list);

	zswap_pool_debug("created", pool);

	return pool;

ref_fail:
	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
error:
	if (pool->acomp_ctx)
		free_percpu(pool->acomp_ctx);
	while (i--)
		zpool_destroy_pool(pool->zpools[i]);
	kfree(pool);
	return NULL;
}

static struct zswap_pool *__zswap_pool_create_fallback(void)
{
	bool has_comp, has_zpool;

	has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
	if (!has_comp && strcmp(zswap_compressor,
				CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) {
		pr_err("compressor %s not available, using default %s\n",
		       zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT);
		param_free_charp(&zswap_compressor);
		zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
		has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
	}
	if (!has_comp) {
		pr_err("default compressor %s not available\n",
		       zswap_compressor);
		param_free_charp(&zswap_compressor);
		zswap_compressor = ZSWAP_PARAM_UNSET;
	}

	has_zpool = zpool_has_pool(zswap_zpool_type);
	if (!has_zpool && strcmp(zswap_zpool_type,
				 CONFIG_ZSWAP_ZPOOL_DEFAULT)) {
		pr_err("zpool %s not available, using default %s\n",
		       zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT);
		param_free_charp(&zswap_zpool_type);
		zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
		has_zpool = zpool_has_pool(zswap_zpool_type);
	}
	if (!has_zpool) {
		pr_err("default zpool %s not available\n",
		       zswap_zpool_type);
		param_free_charp(&zswap_zpool_type);
		zswap_zpool_type = ZSWAP_PARAM_UNSET;
	}

	if (!has_comp || !has_zpool)
		return NULL;

	return zswap_pool_create(zswap_zpool_type, zswap_compressor);
}

static void zswap_pool_destroy(struct zswap_pool *pool)
{
	int i;

	zswap_pool_debug("destroying", pool);

	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
	free_percpu(pool->acomp_ctx);

	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
		zpool_destroy_pool(pool->zpools[i]);
	kfree(pool);
}

static void __zswap_pool_release(struct work_struct *work)
{
	struct zswap_pool *pool = container_of(work, typeof(*pool),
						release_work);

	synchronize_rcu();

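	/*
	 * The pool was removed from zswap_pools with list_del_rcu() in
	 * __zswap_pool_empty(); after the grace period above, no RCU
	 * reader can still be walking the list and dereferencing it.
	 */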
	/* nobody should have been able to get a ref... */
	WARN_ON(!percpu_ref_is_zero(&pool->ref));
	percpu_ref_exit(&pool->ref);

	/* pool is now off zswap_pools list and has no references. */
	zswap_pool_destroy(pool);
}

static struct zswap_pool *zswap_pool_current(void);

static void __zswap_pool_empty(struct percpu_ref *ref)
{
	struct zswap_pool *pool;

	pool = container_of(ref, typeof(*pool), ref);

	spin_lock_bh(&zswap_pools_lock);

	WARN_ON(pool == zswap_pool_current());

	list_del_rcu(&pool->list);

	INIT_WORK(&pool->release_work, __zswap_pool_release);
	schedule_work(&pool->release_work);

	spin_unlock_bh(&zswap_pools_lock);
}

static int __must_check zswap_pool_get(struct zswap_pool *pool)
{
	if (!pool)
		return 0;

	return percpu_ref_tryget(&pool->ref);
}

static void zswap_pool_put(struct zswap_pool *pool)
{
	percpu_ref_put(&pool->ref);
}

static struct zswap_pool *__zswap_pool_current(void)
{
	struct zswap_pool *pool;

	pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
	WARN_ONCE(!pool && zswap_has_pool,
		  "%s: no page storage pool!\n", __func__);

	return pool;
}

static struct zswap_pool *zswap_pool_current(void)
{
	assert_spin_locked(&zswap_pools_lock);

	return __zswap_pool_current();
}

static struct zswap_pool *zswap_pool_current_get(void)
{
	struct zswap_pool *pool;

	rcu_read_lock();

	pool = __zswap_pool_current();
	if (!zswap_pool_get(pool))
		pool = NULL;

	rcu_read_unlock();

	return pool;
}

/* type and compressor must be null-terminated */
static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
{
	struct zswap_pool *pool;

	assert_spin_locked(&zswap_pools_lock);

	list_for_each_entry_rcu(pool, &zswap_pools, list) {
		if (strcmp(pool->tfm_name, compressor))
			continue;
		/* all zpools share the same type */
		if (strcmp(zpool_get_type(pool->zpools[0]), type))
			continue;
		/* if we can't get it, it's about to be destroyed */
		if (!zswap_pool_get(pool))
			continue;
		return pool;
	}

	return NULL;
}

/*********************************
* param callbacks
**********************************/

static bool zswap_pool_changed(const char *s, const struct kernel_param *kp)
{
	/* no change required */
	if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool)
		return false;
	return true;
}

/* val must be a null-terminated string */
static int __zswap_param_set(const char *val, const struct kernel_param *kp,
			     char *type, char *compressor)
{
	struct zswap_pool *pool, *put_pool = NULL;
	char *s = strstrip((char *)val);
	int ret = 0;
	bool new_pool = false;

	mutex_lock(&zswap_init_lock);
	switch (zswap_init_state) {
	case ZSWAP_UNINIT:
		/* if this is load-time (pre-init) param setting,
		 * don't create a pool; that's done during init.
		 */
		ret = param_set_charp(s, kp);
		break;
	case ZSWAP_INIT_SUCCEED:
		new_pool = zswap_pool_changed(s, kp);
		break;
	case ZSWAP_INIT_FAILED:
		pr_err("can't set param, initialization failed\n");
		ret = -ENODEV;
	}
	mutex_unlock(&zswap_init_lock);

	/* no need to create a new pool, return directly */
	if (!new_pool)
		return ret;

	if (!type) {
		if (!zpool_has_pool(s)) {
			pr_err("zpool %s not available\n", s);
			return -ENOENT;
		}
		type = s;
	} else if (!compressor) {
		if (!crypto_has_acomp(s, 0, 0)) {
			pr_err("compressor %s not available\n", s);
			return -ENOENT;
		}
		compressor = s;
	} else {
		WARN_ON(1);
		return -EINVAL;
	}

	spin_lock_bh(&zswap_pools_lock);

	pool = zswap_pool_find_get(type, compressor);
	if (pool) {
		zswap_pool_debug("using existing", pool);
		WARN_ON(pool == zswap_pool_current());
		list_del_rcu(&pool->list);
	}

	spin_unlock_bh(&zswap_pools_lock);

	if (!pool)
		pool = zswap_pool_create(type, compressor);
	else {
		/*
		 * Restore the initial ref dropped by percpu_ref_kill()
		 * when the pool was decommissioned and switch it again
		 * to percpu mode.
		 */
		percpu_ref_resurrect(&pool->ref);

		/* Drop the ref from zswap_pool_find_get(). */
		zswap_pool_put(pool);
	}

	if (pool)
		ret = param_set_charp(s, kp);
	else
		ret = -EINVAL;

	spin_lock_bh(&zswap_pools_lock);

	if (!ret) {
		put_pool = zswap_pool_current();
		list_add_rcu(&pool->list, &zswap_pools);
		zswap_has_pool = true;
	} else if (pool) {
		/* add the possibly pre-existing pool to the end of the pools
		 * list; if it's new (and empty) then it'll be removed and
		 * destroyed by the put after we drop the lock
		 */
		list_add_tail_rcu(&pool->list, &zswap_pools);
		put_pool = pool;
	}

	spin_unlock_bh(&zswap_pools_lock);

	if (!zswap_has_pool && !pool) {
		/* if initial pool creation failed, and this pool creation also
		 * failed, maybe both compressor and zpool params were bad.
		 * Allow changing this param, so pool creation will succeed
		 * when the other param is changed. We already verified this
		 * param is ok in the zpool_has_pool() or crypto_has_acomp()
		 * checks above.
		 */
		ret = param_set_charp(s, kp);
	}

	/* drop the ref from either the old current pool,
	 * or the new pool we failed to add
	 */
	if (put_pool)
		percpu_ref_kill(&put_pool->ref);

	return ret;
}

static int zswap_compressor_param_set(const char *val,
				      const struct kernel_param *kp)
{
	return __zswap_param_set(val, kp, zswap_zpool_type, NULL);
}

static int zswap_zpool_param_set(const char *val,
				 const struct kernel_param *kp)
{
	return __zswap_param_set(val, kp, NULL, zswap_compressor);
}

static int zswap_enabled_param_set(const char *val,
				   const struct kernel_param *kp)
{
	int ret = -ENODEV;

	/* if this is load-time (pre-init) param setting, only set param.
	 */
	if (system_state != SYSTEM_RUNNING)
		return param_set_bool(val, kp);

	mutex_lock(&zswap_init_lock);
	switch (zswap_init_state) {
	case ZSWAP_UNINIT:
		if (zswap_setup())
			break;
		fallthrough;
	case ZSWAP_INIT_SUCCEED:
		if (!zswap_has_pool)
			pr_err("can't enable, no pool configured\n");
		else
			ret = param_set_bool(val, kp);
		break;
	case ZSWAP_INIT_FAILED:
		pr_err("can't enable, initialization failed\n");
	}
	mutex_unlock(&zswap_init_lock);

	return ret;
}

/*********************************
* lru functions
**********************************/

/* should be called under RCU */
#ifdef CONFIG_MEMCG
static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry)
{
	return entry->objcg ? obj_cgroup_memcg(entry->objcg) : NULL;
}
#else
static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry)
{
	return NULL;
}
#endif

static inline int entry_to_nid(struct zswap_entry *entry)
{
	return page_to_nid(virt_to_page(entry));
}

static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
{
	atomic_long_t *nr_zswap_protected;
	unsigned long lru_size, old, new;
	int nid = entry_to_nid(entry);
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	/*
	 * Note that it is safe to use rcu_read_lock() here, even in the face of
	 * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection
	 * used in list_lru lookup, only two scenarios are possible:
	 *
	 * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The
	 *    new entry will be reparented to memcg's parent's list_lru.
	 * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The
	 *    new entry will be added directly to memcg's parent's list_lru.
	 *
	 * Similar reasoning holds for list_lru_del().
	 */
	rcu_read_lock();
	memcg = mem_cgroup_from_entry(entry);
	/* will always succeed */
	list_lru_add(list_lru, &entry->lru, nid, memcg);

	/* Update the protection area */
	lru_size = list_lru_count_one(list_lru, nid, memcg);
	lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
	nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected;
	old = atomic_long_inc_return(nr_zswap_protected);
	/*
	 * Decay to avoid overflow and adapt to changing workloads.
	 * This is based on LRU reclaim cost decaying heuristics.
	 */
	do {
		new = old > lru_size / 4 ? old / 2 : old;
	} while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new));
	rcu_read_unlock();
}

static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry)
{
	int nid = entry_to_nid(entry);
	struct mem_cgroup *memcg;

	rcu_read_lock();
	memcg = mem_cgroup_from_entry(entry);
	/* will always succeed */
	list_lru_del(list_lru, &entry->lru, nid, memcg);
	rcu_read_unlock();
}

void zswap_lruvec_state_init(struct lruvec *lruvec)
{
	atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0);
}

void zswap_folio_swapin(struct folio *folio)
{
	struct lruvec *lruvec;

	if (folio) {
		lruvec = folio_lruvec(folio);
		atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected);
	}
}

void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg)
{
	/* lock out zswap shrinker walking memcg tree */
	spin_lock(&zswap_shrink_lock);
	if (zswap_next_shrink == memcg)
		zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL);
	spin_unlock(&zswap_shrink_lock);
}

/*********************************
* rbtree functions
**********************************/
static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
{
	struct rb_node *node = root->rb_node;
	struct zswap_entry *entry;
	pgoff_t entry_offset;

	while (node) {
		entry = rb_entry(node, struct zswap_entry, rbnode);
		entry_offset = swp_offset(entry->swpentry);
		if (entry_offset > offset)
			node = node->rb_left;
		else if (entry_offset < offset)
			node = node->rb_right;
		else
			return entry;
	}
	return NULL;
}

/*
 * In the case that an entry with the same offset is found, a pointer to
 * the existing entry is stored in dupentry and the function returns -EEXIST.
 */
static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
			   struct zswap_entry **dupentry)
{
	struct rb_node **link = &root->rb_node, *parent = NULL;
	struct zswap_entry *myentry;
	pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);

	while (*link) {
		parent = *link;
		myentry = rb_entry(parent, struct zswap_entry, rbnode);
		myentry_offset = swp_offset(myentry->swpentry);
		if (myentry_offset > entry_offset)
			link = &(*link)->rb_left;
		else if (myentry_offset < entry_offset)
			link = &(*link)->rb_right;
		else {
			*dupentry = myentry;
			return -EEXIST;
		}
	}
	rb_link_node(&entry->rbnode, parent, link);
	rb_insert_color(&entry->rbnode, root);
	return 0;
}

static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
{
	rb_erase(&entry->rbnode, root);
	RB_CLEAR_NODE(&entry->rbnode);
}

/*********************************
* zswap entry functions
**********************************/
static struct kmem_cache *zswap_entry_cache;

static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid)
{
	struct zswap_entry *entry;
	entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid);
	if (!entry)
		return NULL;
	RB_CLEAR_NODE(&entry->rbnode);
	return entry;
}

static void zswap_entry_cache_free(struct zswap_entry *entry)
{
	kmem_cache_free(zswap_entry_cache, entry);
}

static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
{
	int i = 0;

	if (ZSWAP_NR_ZPOOLS > 1)
		i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS));

	return entry->pool->zpools[i];
}

/*
 * Carries out the common pattern of freeing an entry's zpool allocation,
 * freeing the entry itself, and decrementing the number of stored pages.
 */
static void zswap_entry_free(struct zswap_entry *entry)
{
	if (!entry->length)
		atomic_dec(&zswap_same_filled_pages);
	else {
		zswap_lru_del(&zswap_list_lru, entry);
		zpool_free(zswap_find_zpool(entry), entry->handle);
		atomic_dec(&zswap_nr_stored);
		zswap_pool_put(entry->pool);
	}
	if (entry->objcg) {
		obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
		obj_cgroup_put(entry->objcg);
	}
	zswap_entry_cache_free(entry);
	atomic_dec(&zswap_stored_pages);
	zswap_update_total_size();
}

/*
 * The caller holds the tree lock and has looked up the entry in the tree,
 * so it must be on the tree; remove it from the tree and free it.
 */
static void zswap_invalidate_entry(struct zswap_tree *tree,
				   struct zswap_entry *entry)
{
	zswap_rb_erase(&tree->rbroot, entry);
	zswap_entry_free(entry);
}

/*********************************
* compressed storage functions
**********************************/
static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
{
	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
	struct crypto_acomp *acomp;
	struct acomp_req *req;
	int ret;

	mutex_init(&acomp_ctx->mutex);

	acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
	if (!acomp_ctx->buffer)
		return -ENOMEM;

	acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
	if (IS_ERR(acomp)) {
		pr_err("could not alloc crypto acomp %s : %ld\n",
		       pool->tfm_name, PTR_ERR(acomp));
		ret = PTR_ERR(acomp);
		goto acomp_fail;
	}
	acomp_ctx->acomp = acomp;
	acomp_ctx->is_sleepable = acomp_is_async(acomp);

	req = acomp_request_alloc(acomp_ctx->acomp);
	if (!req) {
		pr_err("could not alloc crypto acomp_request %s\n",
		       pool->tfm_name);
		ret = -ENOMEM;
		goto req_fail;
	}
	acomp_ctx->req = req;

	crypto_init_wait(&acomp_ctx->wait);
	/*
	 * if the backend of acomp is async zip, crypto_req_done() will wakeup
	 * crypto_wait_req(); if the backend of acomp is scomp, the callback
	 * won't be called and crypto_wait_req() will return without blocking.
	 */
	acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
				   crypto_req_done, &acomp_ctx->wait);

	return 0;

req_fail:
	crypto_free_acomp(acomp_ctx->acomp);
acomp_fail:
	kfree(acomp_ctx->buffer);
	return ret;
}

static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
{
	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);

	if (!IS_ERR_OR_NULL(acomp_ctx)) {
		if (!IS_ERR_OR_NULL(acomp_ctx->req))
			acomp_request_free(acomp_ctx->req);
		if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
			crypto_free_acomp(acomp_ctx->acomp);
		kfree(acomp_ctx->buffer);
	}

	return 0;
}

static bool zswap_compress(struct folio *folio, struct zswap_entry *entry)
{
	struct crypto_acomp_ctx *acomp_ctx;
	struct scatterlist input, output;
	int comp_ret = 0, alloc_ret = 0;
	unsigned int dlen = PAGE_SIZE;
	unsigned long handle;
	struct zpool *zpool;
	char *buf;
	gfp_t gfp;
	u8 *dst;

	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);

	mutex_lock(&acomp_ctx->mutex);

	dst = acomp_ctx->buffer;
	sg_init_table(&input, 1);
	sg_set_page(&input, &folio->page, PAGE_SIZE, 0);

	/*
	 * We need PAGE_SIZE * 2 here since there may be cases of
	 * over-compression, and hardware accelerators may not check the dst
	 * buffer size, so give the dst buffer enough length to avoid a
	 * buffer overflow.
	 */
	sg_init_one(&output, dst, PAGE_SIZE * 2);
	acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);

	/*
	 * It may look a little silly that we send an asynchronous request and
	 * then wait for its completion synchronously; this makes the process
	 * synchronous in fact.
	 * Theoretically, acomp supports users sending multiple acomp requests
	 * on one acomp instance and getting those requests completed
	 * simultaneously. But in this case zswap actually stores and loads
	 * page by page; there is no existing method to send the second page
	 * before the first page is done in one thread doing zswap.
	 * In different threads running on different CPUs, however, we have
	 * different acomp instances, so multiple threads can do
	 * (de)compression in parallel.
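	 *
	 * The per-CPU acomp_ctx->mutex taken above also serializes use of
	 * the shared acomp_ctx->req and acomp_ctx->buffer on each CPU.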
	 */
	comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
	dlen = acomp_ctx->req->dlen;
	if (comp_ret)
		goto unlock;

	zpool = zswap_find_zpool(entry);
	gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
	if (zpool_malloc_support_movable(zpool))
		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
	alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle);
	if (alloc_ret)
		goto unlock;

	buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
	memcpy(buf, dst, dlen);
	zpool_unmap_handle(zpool, handle);

	entry->handle = handle;
	entry->length = dlen;

unlock:
	if (comp_ret == -ENOSPC || alloc_ret == -ENOSPC)
		zswap_reject_compress_poor++;
	else if (comp_ret)
		zswap_reject_compress_fail++;
	else if (alloc_ret)
		zswap_reject_alloc_fail++;

	mutex_unlock(&acomp_ctx->mutex);
	return comp_ret == 0 && alloc_ret == 0;
}

static void zswap_decompress(struct zswap_entry *entry, struct page *page)
{
	struct zpool *zpool = zswap_find_zpool(entry);
	struct scatterlist input, output;
	struct crypto_acomp_ctx *acomp_ctx;
	u8 *src;

	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
	mutex_lock(&acomp_ctx->mutex);

	src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
	/*
	 * If zpool_map_handle is atomic, we cannot reliably utilize its mapped buffer
	 * to do crypto_acomp_decompress() which might sleep. In such cases, we must
	 * resort to copying the buffer to a temporary one.
	 * Meanwhile, zpool_map_handle() might return a non-linearly mapped buffer,
	 * such as a kmap address of high memory or even a vmap address.
	 * However, sg_init_one is only equipped to handle linearly mapped low memory.
	 * In such cases, we also must copy the buffer to a temporary and lowmem one.
	 */
	if ((acomp_ctx->is_sleepable && !zpool_can_sleep_mapped(zpool)) ||
	    !virt_addr_valid(src)) {
		memcpy(acomp_ctx->buffer, src, entry->length);
		src = acomp_ctx->buffer;
		zpool_unmap_handle(zpool, entry->handle);
	}

	sg_init_one(&input, src, entry->length);
	sg_init_table(&output, 1);
	sg_set_page(&output, page, PAGE_SIZE, 0);
	acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE);
	BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait));
	BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE);
	mutex_unlock(&acomp_ctx->mutex);

	if (src != acomp_ctx->buffer)
		zpool_unmap_handle(zpool, entry->handle);
}

/*********************************
* writeback code
**********************************/
/*
 * Attempts to free an entry by adding a folio to the swap cache,
 * decompressing the entry data into the folio, and issuing a
 * bio write to write the folio back to the swap device.
 *
 * This can be thought of as a "resumed writeback" of the folio
 * to the swap device. We are basically resuming the same swap
 * writeback path that was intercepted with the zswap_store()
 * in the first place. After the folio has been decompressed into
 * the swap cache, the compressed version stored by zswap can be
 * freed.
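 *
 * Returns 0 on success, -ENOMEM if the swap cache folio could not be
 * allocated or if the entry was invalidated and recycled behind our backs,
 * and -EEXIST if the folio was already in the swap cache (we raced with
 * swapin or a concurrent shrinker).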
 */
static int zswap_writeback_entry(struct zswap_entry *entry,
				 swp_entry_t swpentry)
{
	struct zswap_tree *tree;
	struct folio *folio;
	struct mempolicy *mpol;
	bool folio_was_allocated;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,
	};

	/* try to allocate swap cache folio */
	mpol = get_task_policy(current);
	folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
				NO_INTERLEAVE_INDEX, &folio_was_allocated, true);
	if (!folio)
		return -ENOMEM;

	/*
	 * Found an existing folio, we raced with swapin or concurrent
	 * shrinker. We generally writeback cold folios from zswap, and
	 * swapin means the folio just became hot, so skip this folio.
	 * For unlikely concurrent shrinker case, it will be unlinked
	 * and freed when invalidated by the concurrent shrinker anyway.
	 */
	if (!folio_was_allocated) {
		folio_put(folio);
		return -EEXIST;
	}

	/*
	 * folio is locked, and the swapcache is now secured against
	 * concurrent swapping to and from the slot, and concurrent
	 * swapoff so we can safely dereference the zswap tree here.
	 * Verify that the swap entry hasn't been invalidated and recycled
	 * behind our backs, to avoid overwriting a new swap folio with
	 * old compressed data. Only when this is successful can the entry
	 * be dereferenced.
	 */
	tree = swap_zswap_tree(swpentry);
	spin_lock(&tree->lock);
	if (zswap_rb_search(&tree->rbroot, swp_offset(swpentry)) != entry) {
		spin_unlock(&tree->lock);
		delete_from_swap_cache(folio);
		folio_unlock(folio);
		folio_put(folio);
		return -ENOMEM;
	}

	/* Safe to deref entry after the entry is verified above. */
	zswap_rb_erase(&tree->rbroot, entry);
	spin_unlock(&tree->lock);

	zswap_decompress(entry, &folio->page);

	count_vm_event(ZSWPWB);
	if (entry->objcg)
		count_objcg_event(entry->objcg, ZSWPWB);

	zswap_entry_free(entry);

	/* folio is up to date */
	folio_mark_uptodate(folio);

	/* move it to the tail of the inactive list after end_writeback */
	folio_set_reclaim(folio);

	/* start writeback */
	__swap_writepage(folio, &wbc);
	folio_put(folio);

	return 0;
}

/*********************************
* shrinker functions
**********************************/
static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
				       spinlock_t *lock, void *arg)
{
	struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
	bool *encountered_page_in_swapcache = (bool *)arg;
	swp_entry_t swpentry;
	enum lru_status ret = LRU_REMOVED_RETRY;
	int writeback_result;

	/*
	 * As soon as we drop the LRU lock, the entry can be freed by
	 * a concurrent invalidation. This means the following:
	 *
	 * 1. We extract the swp_entry_t to the stack, allowing
	 *    zswap_writeback_entry() to pin the swap entry and
	 *    then validate the zswap entry against that swap entry's
	 *    tree using pointer value comparison. Only when that
	 *    is successful can the entry be dereferenced.
	 *
	 * 2. Usually, objects are taken off the LRU for reclaim. In
	 *    this case this isn't possible, because if reclaim fails
	 *    for whatever reason, we have no means of knowing if the
	 *    entry is alive to put it back on the LRU.
	 *
	 *    So rotate it before dropping the lock. If the entry is
	 *    written back or invalidated, the free path will unlink
	 *    it. For failures, rotation is the right thing as well.
	 *
	 *    Temporary failures, where the same entry should be tried
	 *    again immediately, almost never happen for this shrinker.
	 *    We don't do any trylocking; -ENOMEM comes closest,
	 *    but that's extremely rare and doesn't happen spuriously
	 *    either. Don't bother distinguishing this case.
	 */
	list_move_tail(item, &l->list);

	/*
	 * Once the lru lock is dropped, the entry might get freed. The
	 * swpentry is copied to the stack, and entry isn't deref'd again
	 * until the entry is verified to still be alive in the tree.
	 */
	swpentry = entry->swpentry;

	/*
	 * It's safe to drop the lock here because we return either
	 * LRU_REMOVED_RETRY or LRU_RETRY.
	 */
	spin_unlock(lock);

	writeback_result = zswap_writeback_entry(entry, swpentry);

	if (writeback_result) {
		zswap_reject_reclaim_fail++;
		ret = LRU_RETRY;

		/*
		 * Encountering a page already in swap cache is a sign that we are shrinking
		 * into the warmer region. We should terminate shrinking (if we're in the dynamic
		 * shrinker context).
		 */
		if (writeback_result == -EEXIST && encountered_page_in_swapcache) {
			ret = LRU_STOP;
			*encountered_page_in_swapcache = true;
		}
	} else {
		zswap_written_back_pages++;
	}

	spin_lock(lock);
	return ret;
}

static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
		struct shrink_control *sc)
{
	struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
	unsigned long shrink_ret, nr_protected, lru_size;
	bool encountered_page_in_swapcache = false;

	if (!zswap_shrinker_enabled ||
	    !mem_cgroup_zswap_writeback_enabled(sc->memcg)) {
		sc->nr_scanned = 0;
		return SHRINK_STOP;
	}

	nr_protected =
		atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
	lru_size = list_lru_shrink_count(&zswap_list_lru, sc);

	/*
	 * Abort if we are shrinking into the protected region.
	 *
	 * This short-circuiting is necessary because if we have too many
	 * concurrent reclaimers getting the freeable zswap object counts at the
	 * same time (before any of them made reasonable progress), the total
	 * number of reclaimed objects might be more than the number of unprotected
	 * objects (i.e. the reclaimers will reclaim into the protected area of the
	 * zswap LRU).
	 */
	if (nr_protected >= lru_size - sc->nr_to_scan) {
		sc->nr_scanned = 0;
		return SHRINK_STOP;
	}

	shrink_ret = list_lru_shrink_walk(&zswap_list_lru, sc, &shrink_memcg_cb,
		&encountered_page_in_swapcache);

	if (encountered_page_in_swapcache)
		return SHRINK_STOP;

	return shrink_ret ? shrink_ret : SHRINK_STOP;
}

static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
		struct shrink_control *sc)
{
	struct mem_cgroup *memcg = sc->memcg;
	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid));
	unsigned long nr_backing, nr_stored, nr_freeable, nr_protected;

	if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg))
		return 0;

	/*
	 * The shrinker resumes swap writeback, which will enter block
	 * and may enter fs. XXX: Harmonize with vmscan.c __GFP_FS
	 * rules (may_enter_fs()), which apply on a per-folio basis.
	 */
	if (!gfp_has_io_fs(sc->gfp_mask))
		return 0;

#ifdef CONFIG_MEMCG_KMEM
	mem_cgroup_flush_stats(memcg);
	nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT;
	nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
#else
	/* use pool stats instead of memcg stats */
	nr_backing = zswap_pool_total_size >> PAGE_SHIFT;
	nr_stored = atomic_read(&zswap_nr_stored);
#endif

	if (!nr_stored)
		return 0;

	nr_protected =
		atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
	nr_freeable = list_lru_shrink_count(&zswap_list_lru, sc);
	/*
	 * Subtract from the lru size an estimate of the number of pages
	 * that should be protected.
	 */
	nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0;

	/*
	 * Scale the number of freeable pages by the memory saving factor.
	 * This ensures that the better zswap compresses memory, the fewer
	 * pages we will evict to swap (as it will otherwise incur IO for
	 * relatively small memory saving).
	 */
	return mult_frac(nr_freeable, nr_backing, nr_stored);
}

static struct shrinker *zswap_alloc_shrinker(void)
{
	struct shrinker *shrinker;

	shrinker =
		shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap");
	if (!shrinker)
		return NULL;

	shrinker->scan_objects = zswap_shrinker_scan;
	shrinker->count_objects = zswap_shrinker_count;
	shrinker->batch = 0;
	shrinker->seeks = DEFAULT_SEEKS;
	return shrinker;
}

static int shrink_memcg(struct mem_cgroup *memcg)
{
	int nid, shrunk = 0;

	if (!mem_cgroup_zswap_writeback_enabled(memcg))
		return -EINVAL;

	/*
	 * Skip zombies because their LRUs are reparented and we would be
	 * reclaiming from the parent instead of the dead memcg.
	 */
	if (memcg && !mem_cgroup_online(memcg))
		return -ENOENT;

	for_each_node_state(nid, N_NORMAL_MEMORY) {
		unsigned long nr_to_walk = 1;

		shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg,
					    &shrink_memcg_cb, NULL, &nr_to_walk);
	}
	return shrunk ? 0 : -EAGAIN;
}

static void shrink_worker(struct work_struct *w)
{
	struct mem_cgroup *memcg;
	int ret, failures = 0;

	/* global reclaim will select cgroups in a round-robin fashion. */
	do {
		spin_lock(&zswap_shrink_lock);
		zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL);
		memcg = zswap_next_shrink;

		/*
		 * We need to retry if we have gone through a full round trip, or if we
		 * got an offline memcg (or else we risk undoing the effect of the
		 * zswap memcg offlining cleanup callback). This is not catastrophic
		 * per se, but it will keep the now offlined memcg hostage for a while.
		 *
		 * Note that if we got an online memcg, we will keep the extra
		 * reference in case the original reference obtained by mem_cgroup_iter
		 * is dropped by the zswap memcg offlining callback, ensuring that the
		 * memcg is not killed when we are reclaiming.
		 */
		if (!memcg) {
			spin_unlock(&zswap_shrink_lock);
			if (++failures == MAX_RECLAIM_RETRIES)
				break;

			goto resched;
		}

		if (!mem_cgroup_tryget_online(memcg)) {
			/* drop the reference from mem_cgroup_iter() */
			mem_cgroup_iter_break(NULL, memcg);
			zswap_next_shrink = NULL;
			spin_unlock(&zswap_shrink_lock);

			if (++failures == MAX_RECLAIM_RETRIES)
				break;

			goto resched;
		}
		spin_unlock(&zswap_shrink_lock);

		ret = shrink_memcg(memcg);
		/* drop the extra reference */
		mem_cgroup_put(memcg);

		if (ret == -EINVAL)
			break;
		if (ret && ++failures == MAX_RECLAIM_RETRIES)
			break;

resched:
		cond_resched();
	} while (!zswap_can_accept());
}

static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
{
	unsigned long *page;
	unsigned long val;
	unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1;

	page = (unsigned long *)ptr;
	val = page[0];

	if (val != page[last_pos])
		return 0;

	for (pos = 1; pos < last_pos; pos++) {
		if (val != page[pos])
			return 0;
	}

	*value = val;

	return 1;
}

static void zswap_fill_page(void *ptr, unsigned long value)
{
	unsigned long *page;

	page = (unsigned long *)ptr;
	memset_l(page, value, PAGE_SIZE / sizeof(unsigned long));
}

bool zswap_store(struct folio *folio)
{
	swp_entry_t swp = folio->swap;
	pgoff_t offset = swp_offset(swp);
	struct zswap_tree *tree = swap_zswap_tree(swp);
	struct zswap_entry *entry, *dupentry;
	struct obj_cgroup *objcg = NULL;
	struct mem_cgroup *memcg = NULL;

	VM_WARN_ON_ONCE(!folio_test_locked(folio));
	VM_WARN_ON_ONCE(!folio_test_swapcache(folio));

	/* Large folios aren't supported */
	if (folio_test_large(folio))
		return false;

	if (!zswap_enabled)
		goto check_old;

	objcg = get_obj_cgroup_from_folio(folio);
	if (objcg && !obj_cgroup_may_zswap(objcg)) {
		memcg = get_mem_cgroup_from_objcg(objcg);
		if (shrink_memcg(memcg)) {
			mem_cgroup_put(memcg);
			goto reject;
		}
		mem_cgroup_put(memcg);
	}

	/* reclaim space if needed */
	if (zswap_is_full()) {
		zswap_pool_limit_hit++;
		zswap_pool_reached_full = true;
		goto shrink;
	}

	if (zswap_pool_reached_full) {
		if (!zswap_can_accept())
			goto shrink;
		else
			zswap_pool_reached_full = false;
	}

	/* allocate entry */
	entry = zswap_entry_cache_alloc(GFP_KERNEL, folio_nid(folio));
	if (!entry) {
		zswap_reject_kmemcache_fail++;
		goto reject;
	}

	if (zswap_same_filled_pages_enabled) {
		unsigned long value;
		u8 *src;

		src = kmap_local_folio(folio, 0);
		if (zswap_is_page_same_filled(src, &value)) {
			kunmap_local(src);
			entry->length = 0;
			entry->value = value;
			atomic_inc(&zswap_same_filled_pages);
			goto insert_entry;
		}
		kunmap_local(src);
	}

	if (!zswap_non_same_filled_pages_enabled)
		goto freepage;

	/* if entry is successfully added, it keeps the reference */
	entry->pool = zswap_pool_current_get();
	if (!entry->pool)
		goto freepage;

	if (objcg) {
		memcg = get_mem_cgroup_from_objcg(objcg);
		if (memcg_list_lru_alloc(memcg, &zswap_list_lru, GFP_KERNEL)) {
			mem_cgroup_put(memcg);
			goto put_pool;
		}
		mem_cgroup_put(memcg);
	}

	if (!zswap_compress(folio, entry))
		goto put_pool;

insert_entry:
	entry->swpentry = swp;
	entry->objcg = objcg;
	if (objcg) {
		obj_cgroup_charge_zswap(objcg, entry->length);
		/* Account before objcg ref is moved to tree */
		count_objcg_event(objcg, ZSWPOUT);
	}

	/* map */
	spin_lock(&tree->lock);
	/*
	 * The folio may have been dirtied again, invalidate the
	 * possibly stale entry before inserting the new entry.
	 */
	if (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) {
		zswap_invalidate_entry(tree, dupentry);
		WARN_ON(zswap_rb_insert(&tree->rbroot, entry, &dupentry));
	}
	if (entry->length) {
		INIT_LIST_HEAD(&entry->lru);
		zswap_lru_add(&zswap_list_lru, entry);
		atomic_inc(&zswap_nr_stored);
	}
	spin_unlock(&tree->lock);

	/* update stats */
	atomic_inc(&zswap_stored_pages);
	zswap_update_total_size();
	count_vm_event(ZSWPOUT);

	return true;

put_pool:
	zswap_pool_put(entry->pool);
freepage:
	zswap_entry_cache_free(entry);
reject:
	if (objcg)
		obj_cgroup_put(objcg);
check_old:
	/*
	 * If the zswap store fails or zswap is disabled, we must invalidate the
	 * possibly stale entry which was previously stored at this offset.
	 * Otherwise, writeback could overwrite the new data in the swapfile.
	 */
	spin_lock(&tree->lock);
	entry = zswap_rb_search(&tree->rbroot, offset);
	if (entry)
		zswap_invalidate_entry(tree, entry);
	spin_unlock(&tree->lock);
	return false;

shrink:
	queue_work(shrink_wq, &zswap_shrink_work);
	goto reject;
}

bool zswap_load(struct folio *folio)
{
	swp_entry_t swp = folio->swap;
	pgoff_t offset = swp_offset(swp);
	struct page *page = &folio->page;
	bool swapcache = folio_test_swapcache(folio);
	struct zswap_tree *tree = swap_zswap_tree(swp);
	struct zswap_entry *entry;
	u8 *dst;

	VM_WARN_ON_ONCE(!folio_test_locked(folio));

	spin_lock(&tree->lock);
	entry = zswap_rb_search(&tree->rbroot, offset);
	if (!entry) {
		spin_unlock(&tree->lock);
		return false;
	}
	/*
	 * When reading into the swapcache, invalidate our entry. The
	 * swapcache can be the authoritative owner of the page and
	 * its mappings, and the pressure that results from having two
	 * in-memory copies outweighs any benefits of caching the
	 * compression work.
	 *
	 * (Most swapins go through the swapcache. The notable
	 * exception is the singleton fault on SWP_SYNCHRONOUS_IO
	 * files, which reads into a private page and may free it if
	 * the fault fails. We remain the primary owner of the entry.)
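	 *
	 * In that non-swapcache case the entry is left in the tree, so the
	 * compressed copy remains available for a later load or writeback
	 * of the same slot.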
	 */
	if (swapcache)
		zswap_rb_erase(&tree->rbroot, entry);
	spin_unlock(&tree->lock);

	if (entry->length)
		zswap_decompress(entry, page);
	else {
		dst = kmap_local_page(page);
		zswap_fill_page(dst, entry->value);
		kunmap_local(dst);
	}

	count_vm_event(ZSWPIN);
	if (entry->objcg)
		count_objcg_event(entry->objcg, ZSWPIN);

	if (swapcache) {
		zswap_entry_free(entry);
		folio_mark_dirty(folio);
	}

	return true;
}

void zswap_invalidate(swp_entry_t swp)
{
	pgoff_t offset = swp_offset(swp);
	struct zswap_tree *tree = swap_zswap_tree(swp);
	struct zswap_entry *entry;

	spin_lock(&tree->lock);
	entry = zswap_rb_search(&tree->rbroot, offset);
	if (entry)
		zswap_invalidate_entry(tree, entry);
	spin_unlock(&tree->lock);
}

int zswap_swapon(int type, unsigned long nr_pages)
{
	struct zswap_tree *trees, *tree;
	unsigned int nr, i;

	nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
	trees = kvcalloc(nr, sizeof(*tree), GFP_KERNEL);
	if (!trees) {
		pr_err("alloc failed, zswap disabled for swap type %d\n", type);
		return -ENOMEM;
	}

	for (i = 0; i < nr; i++) {
		tree = trees + i;
		tree->rbroot = RB_ROOT;
		spin_lock_init(&tree->lock);
	}

	nr_zswap_trees[type] = nr;
	zswap_trees[type] = trees;
	return 0;
}

void zswap_swapoff(int type)
{
	struct zswap_tree *trees = zswap_trees[type];
	unsigned int i;

	if (!trees)
		return;

	/* try_to_unuse() invalidated all the entries already */
	for (i = 0; i < nr_zswap_trees[type]; i++)
		WARN_ON_ONCE(!RB_EMPTY_ROOT(&trees[i].rbroot));

	kvfree(trees);
	nr_zswap_trees[type] = 0;
	zswap_trees[type] = NULL;
}

/*********************************
* debugfs functions
**********************************/
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>

static struct dentry *zswap_debugfs_root;

static int zswap_debugfs_init(void)
{
	if (!debugfs_initialized())
		return -ENODEV;

	zswap_debugfs_root = debugfs_create_dir("zswap", NULL);

	debugfs_create_u64("pool_limit_hit", 0444,
			   zswap_debugfs_root, &zswap_pool_limit_hit);
	debugfs_create_u64("reject_reclaim_fail", 0444,
			   zswap_debugfs_root, &zswap_reject_reclaim_fail);
	debugfs_create_u64("reject_alloc_fail", 0444,
			   zswap_debugfs_root, &zswap_reject_alloc_fail);
	debugfs_create_u64("reject_kmemcache_fail", 0444,
			   zswap_debugfs_root, &zswap_reject_kmemcache_fail);
	debugfs_create_u64("reject_compress_fail", 0444,
			   zswap_debugfs_root, &zswap_reject_compress_fail);
	debugfs_create_u64("reject_compress_poor", 0444,
			   zswap_debugfs_root, &zswap_reject_compress_poor);
	debugfs_create_u64("written_back_pages", 0444,
			   zswap_debugfs_root, &zswap_written_back_pages);
	debugfs_create_u64("pool_total_size", 0444,
			   zswap_debugfs_root, &zswap_pool_total_size);
	debugfs_create_atomic_t("stored_pages", 0444,
				zswap_debugfs_root, &zswap_stored_pages);
	debugfs_create_atomic_t("same_filled_pages", 0444,
				zswap_debugfs_root, &zswap_same_filled_pages);

	return 0;
}
#else
static int zswap_debugfs_init(void)
{
	return 0;
}
#endif

/*********************************
* module init and exit
**********************************/
static int zswap_setup(void)
{
	struct zswap_pool *pool;
	int ret;

	zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
	if (!zswap_entry_cache) {
		pr_err("entry cache creation failed\n");
		goto cache_fail;
	}

	ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE,
				      "mm/zswap_pool:prepare",
				      zswap_cpu_comp_prepare,
				      zswap_cpu_comp_dead);
	if (ret)
		goto hp_fail;

	shrink_wq = alloc_workqueue("zswap-shrink",
			WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
	if (!shrink_wq)
		goto shrink_wq_fail;

	zswap_shrinker = zswap_alloc_shrinker();
	if (!zswap_shrinker)
		goto shrinker_fail;
	if (list_lru_init_memcg(&zswap_list_lru, zswap_shrinker))
		goto lru_fail;
	shrinker_register(zswap_shrinker);

	INIT_WORK(&zswap_shrink_work, shrink_worker);

	pool = __zswap_pool_create_fallback();
	if (pool) {
		pr_info("loaded using pool %s/%s\n", pool->tfm_name,
			zpool_get_type(pool->zpools[0]));
		list_add(&pool->list, &zswap_pools);
		zswap_has_pool = true;
	} else {
		pr_err("pool creation failed\n");
		zswap_enabled = false;
	}

	if (zswap_debugfs_init())
		pr_warn("debugfs initialization failed\n");
	zswap_init_state = ZSWAP_INIT_SUCCEED;
	return 0;

lru_fail:
	shrinker_free(zswap_shrinker);
shrinker_fail:
	destroy_workqueue(shrink_wq);
shrink_wq_fail:
	cpuhp_remove_multi_state(CPUHP_MM_ZSWP_POOL_PREPARE);
hp_fail:
	kmem_cache_destroy(zswap_entry_cache);
cache_fail:
	/* if built-in, we aren't unloaded on failure; don't allow use */
	zswap_init_state = ZSWAP_INIT_FAILED;
	zswap_enabled = false;
	return -ENOMEM;
}

static int __init zswap_init(void)
{
	if (!zswap_enabled)
		return 0;
	return zswap_setup();
}
/* must be late so crypto has time to come up */
late_initcall(zswap_init);

MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>");
MODULE_DESCRIPTION("Compressed cache for swap pages");