1 /* 2 * zswap.c - zswap driver file 3 * 4 * zswap is a backend for frontswap that takes pages that are in the process 5 * of being swapped out and attempts to compress and store them in a 6 * RAM-based memory pool. This can result in a significant I/O reduction on 7 * the swap device and, in the case where decompressing from RAM is faster 8 * than reading from the swap device, can also improve workload performance. 9 * 10 * Copyright (C) 2012 Seth Jennings <sjenning@linux.vnet.ibm.com> 11 * 12 * This program is free software; you can redistribute it and/or 13 * modify it under the terms of the GNU General Public License 14 * as published by the Free Software Foundation; either version 2 15 * of the License, or (at your option) any later version. 16 * 17 * This program is distributed in the hope that it will be useful, 18 * but WITHOUT ANY WARRANTY; without even the implied warranty of 19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 * GNU General Public License for more details. 
*/

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/frontswap.h>
#include <linux/rbtree.h>
#include <linux/swap.h>
#include <linux/crypto.h>
#include <linux/mempool.h>
#include <linux/zpool.h>

#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/swapops.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>

/*********************************
* statistics
**********************************/
/*
 * Total bytes used by the compressed storage; recomputed from every pool
 * by zswap_update_total_size().
 */
static u64 zswap_pool_total_size;
/* The number of compressed pages currently stored in zswap */
static atomic_t zswap_stored_pages = ATOMIC_INIT(0);

/*
 * The statistics below are not protected from concurrent access for
 * performance reasons so they may not be 100% accurate. However,
 * they do provide useful information on roughly how many times a
 * certain event is occurring.
58 */ 59 60 /* Pool limit was hit (see zswap_max_pool_percent) */ 61 static u64 zswap_pool_limit_hit; 62 /* Pages written back when pool limit was reached */ 63 static u64 zswap_written_back_pages; 64 /* Store failed due to a reclaim failure after pool limit was reached */ 65 static u64 zswap_reject_reclaim_fail; 66 /* Compressed page was too big for the allocator to (optimally) store */ 67 static u64 zswap_reject_compress_poor; 68 /* Store failed because underlying allocator could not get memory */ 69 static u64 zswap_reject_alloc_fail; 70 /* Store failed because the entry metadata could not be allocated (rare) */ 71 static u64 zswap_reject_kmemcache_fail; 72 /* Duplicate store was encountered (rare) */ 73 static u64 zswap_duplicate_entry; 74 75 /********************************* 76 * tunables 77 **********************************/ 78 79 #define ZSWAP_PARAM_UNSET "" 80 81 /* Enable/disable zswap (disabled by default) */ 82 static bool zswap_enabled; 83 static int zswap_enabled_param_set(const char *, 84 const struct kernel_param *); 85 static struct kernel_param_ops zswap_enabled_param_ops = { 86 .set = zswap_enabled_param_set, 87 .get = param_get_bool, 88 }; 89 module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644); 90 91 /* Crypto compressor to use */ 92 #define ZSWAP_COMPRESSOR_DEFAULT "lzo" 93 static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; 94 static int zswap_compressor_param_set(const char *, 95 const struct kernel_param *); 96 static struct kernel_param_ops zswap_compressor_param_ops = { 97 .set = zswap_compressor_param_set, 98 .get = param_get_charp, 99 .free = param_free_charp, 100 }; 101 module_param_cb(compressor, &zswap_compressor_param_ops, 102 &zswap_compressor, 0644); 103 104 /* Compressed storage zpool to use */ 105 #define ZSWAP_ZPOOL_DEFAULT "zbud" 106 static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; 107 static int zswap_zpool_param_set(const char *, const struct kernel_param *); 108 static struct kernel_param_ops 
zswap_zpool_param_ops = { 109 .set = zswap_zpool_param_set, 110 .get = param_get_charp, 111 .free = param_free_charp, 112 }; 113 module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644); 114 115 /* The maximum percentage of memory that the compressed pool can occupy */ 116 static unsigned int zswap_max_pool_percent = 20; 117 module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644); 118 119 /********************************* 120 * data structures 121 **********************************/ 122 123 struct zswap_pool { 124 struct zpool *zpool; 125 struct crypto_comp * __percpu *tfm; 126 struct kref kref; 127 struct list_head list; 128 struct work_struct work; 129 struct hlist_node node; 130 char tfm_name[CRYPTO_MAX_ALG_NAME]; 131 }; 132 133 /* 134 * struct zswap_entry 135 * 136 * This structure contains the metadata for tracking a single compressed 137 * page within zswap. 138 * 139 * rbnode - links the entry into red-black tree for the appropriate swap type 140 * offset - the swap offset for the entry. Index into the red-black tree. 141 * refcount - the number of outstanding reference to the entry. This is needed 142 * to protect against premature freeing of the entry by code 143 * concurrent calls to load, invalidate, and writeback. The lock 144 * for the zswap_tree structure that contains the entry must 145 * be held while changing the refcount. Since the lock must 146 * be held, there is no reason to also make refcount atomic. 147 * length - the length in bytes of the compressed page data. 
Needed during 148 * decompression 149 * pool - the zswap_pool the entry's data is in 150 * handle - zpool allocation handle that stores the compressed page data 151 */ 152 struct zswap_entry { 153 struct rb_node rbnode; 154 pgoff_t offset; 155 int refcount; 156 unsigned int length; 157 struct zswap_pool *pool; 158 unsigned long handle; 159 }; 160 161 struct zswap_header { 162 swp_entry_t swpentry; 163 }; 164 165 /* 166 * The tree lock in the zswap_tree struct protects a few things: 167 * - the rbtree 168 * - the refcount field of each entry in the tree 169 */ 170 struct zswap_tree { 171 struct rb_root rbroot; 172 spinlock_t lock; 173 }; 174 175 static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; 176 177 /* RCU-protected iteration */ 178 static LIST_HEAD(zswap_pools); 179 /* protects zswap_pools list modification */ 180 static DEFINE_SPINLOCK(zswap_pools_lock); 181 /* pool counter to provide unique names to zpool */ 182 static atomic_t zswap_pools_count = ATOMIC_INIT(0); 183 184 /* used by param callback function */ 185 static bool zswap_init_started; 186 187 /* fatal error during init */ 188 static bool zswap_init_failed; 189 190 /* init completed, but couldn't create the initial pool */ 191 static bool zswap_has_pool; 192 193 /********************************* 194 * helpers and fwd declarations 195 **********************************/ 196 197 #define zswap_pool_debug(msg, p) \ 198 pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ 199 zpool_get_type((p)->zpool)) 200 201 static int zswap_writeback_entry(struct zpool *pool, unsigned long handle); 202 static int zswap_pool_get(struct zswap_pool *pool); 203 static void zswap_pool_put(struct zswap_pool *pool); 204 205 static const struct zpool_ops zswap_zpool_ops = { 206 .evict = zswap_writeback_entry 207 }; 208 209 static bool zswap_is_full(void) 210 { 211 return totalram_pages * zswap_max_pool_percent / 100 < 212 DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); 213 } 214 215 static void zswap_update_total_size(void) 
216 { 217 struct zswap_pool *pool; 218 u64 total = 0; 219 220 rcu_read_lock(); 221 222 list_for_each_entry_rcu(pool, &zswap_pools, list) 223 total += zpool_get_total_size(pool->zpool); 224 225 rcu_read_unlock(); 226 227 zswap_pool_total_size = total; 228 } 229 230 /********************************* 231 * zswap entry functions 232 **********************************/ 233 static struct kmem_cache *zswap_entry_cache; 234 235 static int __init zswap_entry_cache_create(void) 236 { 237 zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); 238 return zswap_entry_cache == NULL; 239 } 240 241 static void __init zswap_entry_cache_destroy(void) 242 { 243 kmem_cache_destroy(zswap_entry_cache); 244 } 245 246 static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp) 247 { 248 struct zswap_entry *entry; 249 entry = kmem_cache_alloc(zswap_entry_cache, gfp); 250 if (!entry) 251 return NULL; 252 entry->refcount = 1; 253 RB_CLEAR_NODE(&entry->rbnode); 254 return entry; 255 } 256 257 static void zswap_entry_cache_free(struct zswap_entry *entry) 258 { 259 kmem_cache_free(zswap_entry_cache, entry); 260 } 261 262 /********************************* 263 * rbtree functions 264 **********************************/ 265 static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset) 266 { 267 struct rb_node *node = root->rb_node; 268 struct zswap_entry *entry; 269 270 while (node) { 271 entry = rb_entry(node, struct zswap_entry, rbnode); 272 if (entry->offset > offset) 273 node = node->rb_left; 274 else if (entry->offset < offset) 275 node = node->rb_right; 276 else 277 return entry; 278 } 279 return NULL; 280 } 281 282 /* 283 * In the case that a entry with the same offset is found, a pointer to 284 * the existing entry is stored in dupentry and the function returns -EEXIST 285 */ 286 static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, 287 struct zswap_entry **dupentry) 288 { 289 struct rb_node **link = &root->rb_node, *parent = NULL; 290 struct 
zswap_entry *myentry; 291 292 while (*link) { 293 parent = *link; 294 myentry = rb_entry(parent, struct zswap_entry, rbnode); 295 if (myentry->offset > entry->offset) 296 link = &(*link)->rb_left; 297 else if (myentry->offset < entry->offset) 298 link = &(*link)->rb_right; 299 else { 300 *dupentry = myentry; 301 return -EEXIST; 302 } 303 } 304 rb_link_node(&entry->rbnode, parent, link); 305 rb_insert_color(&entry->rbnode, root); 306 return 0; 307 } 308 309 static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) 310 { 311 if (!RB_EMPTY_NODE(&entry->rbnode)) { 312 rb_erase(&entry->rbnode, root); 313 RB_CLEAR_NODE(&entry->rbnode); 314 } 315 } 316 317 /* 318 * Carries out the common pattern of freeing and entry's zpool allocation, 319 * freeing the entry itself, and decrementing the number of stored pages. 320 */ 321 static void zswap_free_entry(struct zswap_entry *entry) 322 { 323 zpool_free(entry->pool->zpool, entry->handle); 324 zswap_pool_put(entry->pool); 325 zswap_entry_cache_free(entry); 326 atomic_dec(&zswap_stored_pages); 327 zswap_update_total_size(); 328 } 329 330 /* caller must hold the tree lock */ 331 static void zswap_entry_get(struct zswap_entry *entry) 332 { 333 entry->refcount++; 334 } 335 336 /* caller must hold the tree lock 337 * remove from the tree and free it, if nobody reference the entry 338 */ 339 static void zswap_entry_put(struct zswap_tree *tree, 340 struct zswap_entry *entry) 341 { 342 int refcount = --entry->refcount; 343 344 BUG_ON(refcount < 0); 345 if (refcount == 0) { 346 zswap_rb_erase(&tree->rbroot, entry); 347 zswap_free_entry(entry); 348 } 349 } 350 351 /* caller must hold the tree lock */ 352 static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, 353 pgoff_t offset) 354 { 355 struct zswap_entry *entry; 356 357 entry = zswap_rb_search(root, offset); 358 if (entry) 359 zswap_entry_get(entry); 360 361 return entry; 362 } 363 364 /********************************* 365 * per-cpu code 366 
**********************************/ 367 static DEFINE_PER_CPU(u8 *, zswap_dstmem); 368 369 static int zswap_dstmem_prepare(unsigned int cpu) 370 { 371 u8 *dst; 372 373 dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); 374 if (!dst) { 375 pr_err("can't allocate compressor buffer\n"); 376 return -ENOMEM; 377 } 378 per_cpu(zswap_dstmem, cpu) = dst; 379 return 0; 380 } 381 382 static int zswap_dstmem_dead(unsigned int cpu) 383 { 384 u8 *dst; 385 386 dst = per_cpu(zswap_dstmem, cpu); 387 kfree(dst); 388 per_cpu(zswap_dstmem, cpu) = NULL; 389 390 return 0; 391 } 392 393 static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) 394 { 395 struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); 396 struct crypto_comp *tfm; 397 398 if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu))) 399 return 0; 400 401 tfm = crypto_alloc_comp(pool->tfm_name, 0, 0); 402 if (IS_ERR_OR_NULL(tfm)) { 403 pr_err("could not alloc crypto comp %s : %ld\n", 404 pool->tfm_name, PTR_ERR(tfm)); 405 return -ENOMEM; 406 } 407 *per_cpu_ptr(pool->tfm, cpu) = tfm; 408 return 0; 409 } 410 411 static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) 412 { 413 struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); 414 struct crypto_comp *tfm; 415 416 tfm = *per_cpu_ptr(pool->tfm, cpu); 417 if (!IS_ERR_OR_NULL(tfm)) 418 crypto_free_comp(tfm); 419 *per_cpu_ptr(pool->tfm, cpu) = NULL; 420 return 0; 421 } 422 423 /********************************* 424 * pool functions 425 **********************************/ 426 427 static struct zswap_pool *__zswap_pool_current(void) 428 { 429 struct zswap_pool *pool; 430 431 pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list); 432 WARN_ONCE(!pool && zswap_has_pool, 433 "%s: no page storage pool!\n", __func__); 434 435 return pool; 436 } 437 438 static struct zswap_pool *zswap_pool_current(void) 439 { 440 assert_spin_locked(&zswap_pools_lock); 441 442 return __zswap_pool_current(); 443 } 444 445 
static struct zswap_pool *zswap_pool_current_get(void) 446 { 447 struct zswap_pool *pool; 448 449 rcu_read_lock(); 450 451 pool = __zswap_pool_current(); 452 if (!zswap_pool_get(pool)) 453 pool = NULL; 454 455 rcu_read_unlock(); 456 457 return pool; 458 } 459 460 static struct zswap_pool *zswap_pool_last_get(void) 461 { 462 struct zswap_pool *pool, *last = NULL; 463 464 rcu_read_lock(); 465 466 list_for_each_entry_rcu(pool, &zswap_pools, list) 467 last = pool; 468 WARN_ONCE(!last && zswap_has_pool, 469 "%s: no page storage pool!\n", __func__); 470 if (!zswap_pool_get(last)) 471 last = NULL; 472 473 rcu_read_unlock(); 474 475 return last; 476 } 477 478 /* type and compressor must be null-terminated */ 479 static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) 480 { 481 struct zswap_pool *pool; 482 483 assert_spin_locked(&zswap_pools_lock); 484 485 list_for_each_entry_rcu(pool, &zswap_pools, list) { 486 if (strcmp(pool->tfm_name, compressor)) 487 continue; 488 if (strcmp(zpool_get_type(pool->zpool), type)) 489 continue; 490 /* if we can't get it, it's about to be destroyed */ 491 if (!zswap_pool_get(pool)) 492 continue; 493 return pool; 494 } 495 496 return NULL; 497 } 498 499 static struct zswap_pool *zswap_pool_create(char *type, char *compressor) 500 { 501 struct zswap_pool *pool; 502 char name[38]; /* 'zswap' + 32 char (max) num + \0 */ 503 gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; 504 int ret; 505 506 if (!zswap_has_pool) { 507 /* if either are unset, pool initialization failed, and we 508 * need both params to be set correctly before trying to 509 * create a pool. 
510 */ 511 if (!strcmp(type, ZSWAP_PARAM_UNSET)) 512 return NULL; 513 if (!strcmp(compressor, ZSWAP_PARAM_UNSET)) 514 return NULL; 515 } 516 517 pool = kzalloc(sizeof(*pool), GFP_KERNEL); 518 if (!pool) { 519 pr_err("pool alloc failed\n"); 520 return NULL; 521 } 522 523 /* unique name for each pool specifically required by zsmalloc */ 524 snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count)); 525 526 pool->zpool = zpool_create_pool(type, name, gfp, &zswap_zpool_ops); 527 if (!pool->zpool) { 528 pr_err("%s zpool not available\n", type); 529 goto error; 530 } 531 pr_debug("using %s zpool\n", zpool_get_type(pool->zpool)); 532 533 strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); 534 pool->tfm = alloc_percpu(struct crypto_comp *); 535 if (!pool->tfm) { 536 pr_err("percpu alloc failed\n"); 537 goto error; 538 } 539 540 ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE, 541 &pool->node); 542 if (ret) 543 goto error; 544 pr_debug("using %s compressor\n", pool->tfm_name); 545 546 /* being the current pool takes 1 ref; this func expects the 547 * caller to always add the new pool as the current pool 548 */ 549 kref_init(&pool->kref); 550 INIT_LIST_HEAD(&pool->list); 551 552 zswap_pool_debug("created", pool); 553 554 return pool; 555 556 error: 557 free_percpu(pool->tfm); 558 if (pool->zpool) 559 zpool_destroy_pool(pool->zpool); 560 kfree(pool); 561 return NULL; 562 } 563 564 static __init struct zswap_pool *__zswap_pool_create_fallback(void) 565 { 566 bool has_comp, has_zpool; 567 568 has_comp = crypto_has_comp(zswap_compressor, 0, 0); 569 if (!has_comp && strcmp(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT)) { 570 pr_err("compressor %s not available, using default %s\n", 571 zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT); 572 param_free_charp(&zswap_compressor); 573 zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; 574 has_comp = crypto_has_comp(zswap_compressor, 0, 0); 575 } 576 if (!has_comp) { 577 pr_err("default compressor %s not available\n", 
578 zswap_compressor); 579 param_free_charp(&zswap_compressor); 580 zswap_compressor = ZSWAP_PARAM_UNSET; 581 } 582 583 has_zpool = zpool_has_pool(zswap_zpool_type); 584 if (!has_zpool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { 585 pr_err("zpool %s not available, using default %s\n", 586 zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT); 587 param_free_charp(&zswap_zpool_type); 588 zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; 589 has_zpool = zpool_has_pool(zswap_zpool_type); 590 } 591 if (!has_zpool) { 592 pr_err("default zpool %s not available\n", 593 zswap_zpool_type); 594 param_free_charp(&zswap_zpool_type); 595 zswap_zpool_type = ZSWAP_PARAM_UNSET; 596 } 597 598 if (!has_comp || !has_zpool) 599 return NULL; 600 601 return zswap_pool_create(zswap_zpool_type, zswap_compressor); 602 } 603 604 static void zswap_pool_destroy(struct zswap_pool *pool) 605 { 606 zswap_pool_debug("destroying", pool); 607 608 cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); 609 free_percpu(pool->tfm); 610 zpool_destroy_pool(pool->zpool); 611 kfree(pool); 612 } 613 614 static int __must_check zswap_pool_get(struct zswap_pool *pool) 615 { 616 if (!pool) 617 return 0; 618 619 return kref_get_unless_zero(&pool->kref); 620 } 621 622 static void __zswap_pool_release(struct work_struct *work) 623 { 624 struct zswap_pool *pool = container_of(work, typeof(*pool), work); 625 626 synchronize_rcu(); 627 628 /* nobody should have been able to get a kref... */ 629 WARN_ON(kref_get_unless_zero(&pool->kref)); 630 631 /* pool is now off zswap_pools list and has no references. 
*/ 632 zswap_pool_destroy(pool); 633 } 634 635 static void __zswap_pool_empty(struct kref *kref) 636 { 637 struct zswap_pool *pool; 638 639 pool = container_of(kref, typeof(*pool), kref); 640 641 spin_lock(&zswap_pools_lock); 642 643 WARN_ON(pool == zswap_pool_current()); 644 645 list_del_rcu(&pool->list); 646 647 INIT_WORK(&pool->work, __zswap_pool_release); 648 schedule_work(&pool->work); 649 650 spin_unlock(&zswap_pools_lock); 651 } 652 653 static void zswap_pool_put(struct zswap_pool *pool) 654 { 655 kref_put(&pool->kref, __zswap_pool_empty); 656 } 657 658 /********************************* 659 * param callbacks 660 **********************************/ 661 662 /* val must be a null-terminated string */ 663 static int __zswap_param_set(const char *val, const struct kernel_param *kp, 664 char *type, char *compressor) 665 { 666 struct zswap_pool *pool, *put_pool = NULL; 667 char *s = strstrip((char *)val); 668 int ret; 669 670 if (zswap_init_failed) { 671 pr_err("can't set param, initialization failed\n"); 672 return -ENODEV; 673 } 674 675 /* no change required */ 676 if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool) 677 return 0; 678 679 /* if this is load-time (pre-init) param setting, 680 * don't create a pool; that's done during init. 
681 */ 682 if (!zswap_init_started) 683 return param_set_charp(s, kp); 684 685 if (!type) { 686 if (!zpool_has_pool(s)) { 687 pr_err("zpool %s not available\n", s); 688 return -ENOENT; 689 } 690 type = s; 691 } else if (!compressor) { 692 if (!crypto_has_comp(s, 0, 0)) { 693 pr_err("compressor %s not available\n", s); 694 return -ENOENT; 695 } 696 compressor = s; 697 } else { 698 WARN_ON(1); 699 return -EINVAL; 700 } 701 702 spin_lock(&zswap_pools_lock); 703 704 pool = zswap_pool_find_get(type, compressor); 705 if (pool) { 706 zswap_pool_debug("using existing", pool); 707 WARN_ON(pool == zswap_pool_current()); 708 list_del_rcu(&pool->list); 709 } 710 711 spin_unlock(&zswap_pools_lock); 712 713 if (!pool) 714 pool = zswap_pool_create(type, compressor); 715 716 if (pool) 717 ret = param_set_charp(s, kp); 718 else 719 ret = -EINVAL; 720 721 spin_lock(&zswap_pools_lock); 722 723 if (!ret) { 724 put_pool = zswap_pool_current(); 725 list_add_rcu(&pool->list, &zswap_pools); 726 zswap_has_pool = true; 727 } else if (pool) { 728 /* add the possibly pre-existing pool to the end of the pools 729 * list; if it's new (and empty) then it'll be removed and 730 * destroyed by the put after we drop the lock 731 */ 732 list_add_tail_rcu(&pool->list, &zswap_pools); 733 put_pool = pool; 734 } 735 736 spin_unlock(&zswap_pools_lock); 737 738 if (!zswap_has_pool && !pool) { 739 /* if initial pool creation failed, and this pool creation also 740 * failed, maybe both compressor and zpool params were bad. 741 * Allow changing this param, so pool creation will succeed 742 * when the other param is changed. We already verified this 743 * param is ok in the zpool_has_pool() or crypto_has_comp() 744 * checks above. 
745 */ 746 ret = param_set_charp(s, kp); 747 } 748 749 /* drop the ref from either the old current pool, 750 * or the new pool we failed to add 751 */ 752 if (put_pool) 753 zswap_pool_put(put_pool); 754 755 return ret; 756 } 757 758 static int zswap_compressor_param_set(const char *val, 759 const struct kernel_param *kp) 760 { 761 return __zswap_param_set(val, kp, zswap_zpool_type, NULL); 762 } 763 764 static int zswap_zpool_param_set(const char *val, 765 const struct kernel_param *kp) 766 { 767 return __zswap_param_set(val, kp, NULL, zswap_compressor); 768 } 769 770 static int zswap_enabled_param_set(const char *val, 771 const struct kernel_param *kp) 772 { 773 if (zswap_init_failed) { 774 pr_err("can't enable, initialization failed\n"); 775 return -ENODEV; 776 } 777 if (!zswap_has_pool && zswap_init_started) { 778 pr_err("can't enable, no pool configured\n"); 779 return -ENODEV; 780 } 781 782 return param_set_bool(val, kp); 783 } 784 785 /********************************* 786 * writeback code 787 **********************************/ 788 /* return enum for zswap_get_swap_cache_page */ 789 enum zswap_get_swap_ret { 790 ZSWAP_SWAPCACHE_NEW, 791 ZSWAP_SWAPCACHE_EXIST, 792 ZSWAP_SWAPCACHE_FAIL, 793 }; 794 795 /* 796 * zswap_get_swap_cache_page 797 * 798 * This is an adaption of read_swap_cache_async() 799 * 800 * This function tries to find a page with the given swap entry 801 * in the swapper_space address space (the swap cache). If the page 802 * is found, it is returned in retpage. Otherwise, a page is allocated, 803 * added to the swap cache, and returned in retpage. 
804 * 805 * If success, the swap cache page is returned in retpage 806 * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache 807 * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated, 808 * the new page is added to swapcache and locked 809 * Returns ZSWAP_SWAPCACHE_FAIL on error 810 */ 811 static int zswap_get_swap_cache_page(swp_entry_t entry, 812 struct page **retpage) 813 { 814 bool page_was_allocated; 815 816 *retpage = __read_swap_cache_async(entry, GFP_KERNEL, 817 NULL, 0, &page_was_allocated); 818 if (page_was_allocated) 819 return ZSWAP_SWAPCACHE_NEW; 820 if (!*retpage) 821 return ZSWAP_SWAPCACHE_FAIL; 822 return ZSWAP_SWAPCACHE_EXIST; 823 } 824 825 /* 826 * Attempts to free an entry by adding a page to the swap cache, 827 * decompressing the entry data into the page, and issuing a 828 * bio write to write the page back to the swap device. 829 * 830 * This can be thought of as a "resumed writeback" of the page 831 * to the swap device. We are basically resuming the same swap 832 * writeback path that was intercepted with the frontswap_store() 833 * in the first place. After the page has been decompressed into 834 * the swap cache, the compressed version stored by zswap can be 835 * freed. 
836 */ 837 static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) 838 { 839 struct zswap_header *zhdr; 840 swp_entry_t swpentry; 841 struct zswap_tree *tree; 842 pgoff_t offset; 843 struct zswap_entry *entry; 844 struct page *page; 845 struct crypto_comp *tfm; 846 u8 *src, *dst; 847 unsigned int dlen; 848 int ret; 849 struct writeback_control wbc = { 850 .sync_mode = WB_SYNC_NONE, 851 }; 852 853 /* extract swpentry from data */ 854 zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO); 855 swpentry = zhdr->swpentry; /* here */ 856 zpool_unmap_handle(pool, handle); 857 tree = zswap_trees[swp_type(swpentry)]; 858 offset = swp_offset(swpentry); 859 860 /* find and ref zswap entry */ 861 spin_lock(&tree->lock); 862 entry = zswap_entry_find_get(&tree->rbroot, offset); 863 if (!entry) { 864 /* entry was invalidated */ 865 spin_unlock(&tree->lock); 866 return 0; 867 } 868 spin_unlock(&tree->lock); 869 BUG_ON(offset != entry->offset); 870 871 /* try to allocate swap cache page */ 872 switch (zswap_get_swap_cache_page(swpentry, &page)) { 873 case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */ 874 ret = -ENOMEM; 875 goto fail; 876 877 case ZSWAP_SWAPCACHE_EXIST: 878 /* page is already in the swap cache, ignore for now */ 879 put_page(page); 880 ret = -EEXIST; 881 goto fail; 882 883 case ZSWAP_SWAPCACHE_NEW: /* page is locked */ 884 /* decompress */ 885 dlen = PAGE_SIZE; 886 src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle, 887 ZPOOL_MM_RO) + sizeof(struct zswap_header); 888 dst = kmap_atomic(page); 889 tfm = *get_cpu_ptr(entry->pool->tfm); 890 ret = crypto_comp_decompress(tfm, src, entry->length, 891 dst, &dlen); 892 put_cpu_ptr(entry->pool->tfm); 893 kunmap_atomic(dst); 894 zpool_unmap_handle(entry->pool->zpool, entry->handle); 895 BUG_ON(ret); 896 BUG_ON(dlen != PAGE_SIZE); 897 898 /* page is up to date */ 899 SetPageUptodate(page); 900 } 901 902 /* move it to the tail of the inactive list after end_writeback */ 903 
SetPageReclaim(page); 904 905 /* start writeback */ 906 __swap_writepage(page, &wbc, end_swap_bio_write); 907 put_page(page); 908 zswap_written_back_pages++; 909 910 spin_lock(&tree->lock); 911 /* drop local reference */ 912 zswap_entry_put(tree, entry); 913 914 /* 915 * There are two possible situations for entry here: 916 * (1) refcount is 1(normal case), entry is valid and on the tree 917 * (2) refcount is 0, entry is freed and not on the tree 918 * because invalidate happened during writeback 919 * search the tree and free the entry if find entry 920 */ 921 if (entry == zswap_rb_search(&tree->rbroot, offset)) 922 zswap_entry_put(tree, entry); 923 spin_unlock(&tree->lock); 924 925 goto end; 926 927 /* 928 * if we get here due to ZSWAP_SWAPCACHE_EXIST 929 * a load may happening concurrently 930 * it is safe and okay to not free the entry 931 * if we free the entry in the following put 932 * it it either okay to return !0 933 */ 934 fail: 935 spin_lock(&tree->lock); 936 zswap_entry_put(tree, entry); 937 spin_unlock(&tree->lock); 938 939 end: 940 return ret; 941 } 942 943 static int zswap_shrink(void) 944 { 945 struct zswap_pool *pool; 946 int ret; 947 948 pool = zswap_pool_last_get(); 949 if (!pool) 950 return -ENOENT; 951 952 ret = zpool_shrink(pool->zpool, 1, NULL); 953 954 zswap_pool_put(pool); 955 956 return ret; 957 } 958 959 /********************************* 960 * frontswap hooks 961 **********************************/ 962 /* attempts to compress and store an single page */ 963 static int zswap_frontswap_store(unsigned type, pgoff_t offset, 964 struct page *page) 965 { 966 struct zswap_tree *tree = zswap_trees[type]; 967 struct zswap_entry *entry, *dupentry; 968 struct crypto_comp *tfm; 969 int ret; 970 unsigned int dlen = PAGE_SIZE, len; 971 unsigned long handle; 972 char *buf; 973 u8 *src, *dst; 974 struct zswap_header *zhdr; 975 976 if (!zswap_enabled || !tree) { 977 ret = -ENODEV; 978 goto reject; 979 } 980 981 /* reclaim space if needed */ 982 if 
(zswap_is_full()) { 983 zswap_pool_limit_hit++; 984 if (zswap_shrink()) { 985 zswap_reject_reclaim_fail++; 986 ret = -ENOMEM; 987 goto reject; 988 } 989 } 990 991 /* allocate entry */ 992 entry = zswap_entry_cache_alloc(GFP_KERNEL); 993 if (!entry) { 994 zswap_reject_kmemcache_fail++; 995 ret = -ENOMEM; 996 goto reject; 997 } 998 999 /* if entry is successfully added, it keeps the reference */ 1000 entry->pool = zswap_pool_current_get(); 1001 if (!entry->pool) { 1002 ret = -EINVAL; 1003 goto freepage; 1004 } 1005 1006 /* compress */ 1007 dst = get_cpu_var(zswap_dstmem); 1008 tfm = *get_cpu_ptr(entry->pool->tfm); 1009 src = kmap_atomic(page); 1010 ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen); 1011 kunmap_atomic(src); 1012 put_cpu_ptr(entry->pool->tfm); 1013 if (ret) { 1014 ret = -EINVAL; 1015 goto put_dstmem; 1016 } 1017 1018 /* store */ 1019 len = dlen + sizeof(struct zswap_header); 1020 ret = zpool_malloc(entry->pool->zpool, len, 1021 __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM, 1022 &handle); 1023 if (ret == -ENOSPC) { 1024 zswap_reject_compress_poor++; 1025 goto put_dstmem; 1026 } 1027 if (ret) { 1028 zswap_reject_alloc_fail++; 1029 goto put_dstmem; 1030 } 1031 zhdr = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW); 1032 zhdr->swpentry = swp_entry(type, offset); 1033 buf = (u8 *)(zhdr + 1); 1034 memcpy(buf, dst, dlen); 1035 zpool_unmap_handle(entry->pool->zpool, handle); 1036 put_cpu_var(zswap_dstmem); 1037 1038 /* populate entry */ 1039 entry->offset = offset; 1040 entry->handle = handle; 1041 entry->length = dlen; 1042 1043 /* map */ 1044 spin_lock(&tree->lock); 1045 do { 1046 ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry); 1047 if (ret == -EEXIST) { 1048 zswap_duplicate_entry++; 1049 /* remove from rbtree */ 1050 zswap_rb_erase(&tree->rbroot, dupentry); 1051 zswap_entry_put(tree, dupentry); 1052 } 1053 } while (ret == -EEXIST); 1054 spin_unlock(&tree->lock); 1055 1056 /* update stats */ 1057 
atomic_inc(&zswap_stored_pages); 1058 zswap_update_total_size(); 1059 1060 return 0; 1061 1062 put_dstmem: 1063 put_cpu_var(zswap_dstmem); 1064 zswap_pool_put(entry->pool); 1065 freepage: 1066 zswap_entry_cache_free(entry); 1067 reject: 1068 return ret; 1069 } 1070 1071 /* 1072 * returns 0 if the page was successfully decompressed 1073 * return -1 on entry not found or error 1074 */ 1075 static int zswap_frontswap_load(unsigned type, pgoff_t offset, 1076 struct page *page) 1077 { 1078 struct zswap_tree *tree = zswap_trees[type]; 1079 struct zswap_entry *entry; 1080 struct crypto_comp *tfm; 1081 u8 *src, *dst; 1082 unsigned int dlen; 1083 int ret; 1084 1085 /* find */ 1086 spin_lock(&tree->lock); 1087 entry = zswap_entry_find_get(&tree->rbroot, offset); 1088 if (!entry) { 1089 /* entry was written back */ 1090 spin_unlock(&tree->lock); 1091 return -1; 1092 } 1093 spin_unlock(&tree->lock); 1094 1095 /* decompress */ 1096 dlen = PAGE_SIZE; 1097 src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle, 1098 ZPOOL_MM_RO) + sizeof(struct zswap_header); 1099 dst = kmap_atomic(page); 1100 tfm = *get_cpu_ptr(entry->pool->tfm); 1101 ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen); 1102 put_cpu_ptr(entry->pool->tfm); 1103 kunmap_atomic(dst); 1104 zpool_unmap_handle(entry->pool->zpool, entry->handle); 1105 BUG_ON(ret); 1106 1107 spin_lock(&tree->lock); 1108 zswap_entry_put(tree, entry); 1109 spin_unlock(&tree->lock); 1110 1111 return 0; 1112 } 1113 1114 /* frees an entry in zswap */ 1115 static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset) 1116 { 1117 struct zswap_tree *tree = zswap_trees[type]; 1118 struct zswap_entry *entry; 1119 1120 /* find */ 1121 spin_lock(&tree->lock); 1122 entry = zswap_rb_search(&tree->rbroot, offset); 1123 if (!entry) { 1124 /* entry was written back */ 1125 spin_unlock(&tree->lock); 1126 return; 1127 } 1128 1129 /* remove from rbtree */ 1130 zswap_rb_erase(&tree->rbroot, entry); 1131 1132 /* drop the 
initial reference from entry creation */ 1133 zswap_entry_put(tree, entry); 1134 1135 spin_unlock(&tree->lock); 1136 } 1137 1138 /* frees all zswap entries for the given swap type */ 1139 static void zswap_frontswap_invalidate_area(unsigned type) 1140 { 1141 struct zswap_tree *tree = zswap_trees[type]; 1142 struct zswap_entry *entry, *n; 1143 1144 if (!tree) 1145 return; 1146 1147 /* walk the tree and free everything */ 1148 spin_lock(&tree->lock); 1149 rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) 1150 zswap_free_entry(entry); 1151 tree->rbroot = RB_ROOT; 1152 spin_unlock(&tree->lock); 1153 kfree(tree); 1154 zswap_trees[type] = NULL; 1155 } 1156 1157 static void zswap_frontswap_init(unsigned type) 1158 { 1159 struct zswap_tree *tree; 1160 1161 tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL); 1162 if (!tree) { 1163 pr_err("alloc failed, zswap disabled for swap type %d\n", type); 1164 return; 1165 } 1166 1167 tree->rbroot = RB_ROOT; 1168 spin_lock_init(&tree->lock); 1169 zswap_trees[type] = tree; 1170 } 1171 1172 static struct frontswap_ops zswap_frontswap_ops = { 1173 .store = zswap_frontswap_store, 1174 .load = zswap_frontswap_load, 1175 .invalidate_page = zswap_frontswap_invalidate_page, 1176 .invalidate_area = zswap_frontswap_invalidate_area, 1177 .init = zswap_frontswap_init 1178 }; 1179 1180 /********************************* 1181 * debugfs functions 1182 **********************************/ 1183 #ifdef CONFIG_DEBUG_FS 1184 #include <linux/debugfs.h> 1185 1186 static struct dentry *zswap_debugfs_root; 1187 1188 static int __init zswap_debugfs_init(void) 1189 { 1190 if (!debugfs_initialized()) 1191 return -ENODEV; 1192 1193 zswap_debugfs_root = debugfs_create_dir("zswap", NULL); 1194 if (!zswap_debugfs_root) 1195 return -ENOMEM; 1196 1197 debugfs_create_u64("pool_limit_hit", S_IRUGO, 1198 zswap_debugfs_root, &zswap_pool_limit_hit); 1199 debugfs_create_u64("reject_reclaim_fail", S_IRUGO, 1200 zswap_debugfs_root, 
&zswap_reject_reclaim_fail); 1201 debugfs_create_u64("reject_alloc_fail", S_IRUGO, 1202 zswap_debugfs_root, &zswap_reject_alloc_fail); 1203 debugfs_create_u64("reject_kmemcache_fail", S_IRUGO, 1204 zswap_debugfs_root, &zswap_reject_kmemcache_fail); 1205 debugfs_create_u64("reject_compress_poor", S_IRUGO, 1206 zswap_debugfs_root, &zswap_reject_compress_poor); 1207 debugfs_create_u64("written_back_pages", S_IRUGO, 1208 zswap_debugfs_root, &zswap_written_back_pages); 1209 debugfs_create_u64("duplicate_entry", S_IRUGO, 1210 zswap_debugfs_root, &zswap_duplicate_entry); 1211 debugfs_create_u64("pool_total_size", S_IRUGO, 1212 zswap_debugfs_root, &zswap_pool_total_size); 1213 debugfs_create_atomic_t("stored_pages", S_IRUGO, 1214 zswap_debugfs_root, &zswap_stored_pages); 1215 1216 return 0; 1217 } 1218 1219 static void __exit zswap_debugfs_exit(void) 1220 { 1221 debugfs_remove_recursive(zswap_debugfs_root); 1222 } 1223 #else 1224 static int __init zswap_debugfs_init(void) 1225 { 1226 return 0; 1227 } 1228 1229 static void __exit zswap_debugfs_exit(void) { } 1230 #endif 1231 1232 /********************************* 1233 * module init and exit 1234 **********************************/ 1235 static int __init init_zswap(void) 1236 { 1237 struct zswap_pool *pool; 1238 int ret; 1239 1240 zswap_init_started = true; 1241 1242 if (zswap_entry_cache_create()) { 1243 pr_err("entry cache creation failed\n"); 1244 goto cache_fail; 1245 } 1246 1247 ret = cpuhp_setup_state(CPUHP_MM_ZSWP_MEM_PREPARE, "mm/zswap:prepare", 1248 zswap_dstmem_prepare, zswap_dstmem_dead); 1249 if (ret) { 1250 pr_err("dstmem alloc failed\n"); 1251 goto dstmem_fail; 1252 } 1253 1254 ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE, 1255 "mm/zswap_pool:prepare", 1256 zswap_cpu_comp_prepare, 1257 zswap_cpu_comp_dead); 1258 if (ret) 1259 goto hp_fail; 1260 1261 pool = __zswap_pool_create_fallback(); 1262 if (pool) { 1263 pr_info("loaded using pool %s/%s\n", pool->tfm_name, 1264 zpool_get_type(pool->zpool)); 
1265 list_add(&pool->list, &zswap_pools); 1266 zswap_has_pool = true; 1267 } else { 1268 pr_err("pool creation failed\n"); 1269 zswap_enabled = false; 1270 } 1271 1272 frontswap_register_ops(&zswap_frontswap_ops); 1273 if (zswap_debugfs_init()) 1274 pr_warn("debugfs initialization failed\n"); 1275 return 0; 1276 1277 hp_fail: 1278 cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE); 1279 dstmem_fail: 1280 zswap_entry_cache_destroy(); 1281 cache_fail: 1282 /* if built-in, we aren't unloaded on failure; don't allow use */ 1283 zswap_init_failed = true; 1284 zswap_enabled = false; 1285 return -ENOMEM; 1286 } 1287 /* must be late so crypto has time to come up */ 1288 late_initcall(init_zswap); 1289 1290 MODULE_LICENSE("GPL"); 1291 MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>"); 1292 MODULE_DESCRIPTION("Compressed cache for swap pages"); 1293