xref: /linux/mm/zswap.c (revision 2b3460cbf454c6b03d7429e9ffc4fe09322eb1a9)
1  // SPDX-License-Identifier: GPL-2.0-or-later
2  /*
3   * zswap.c - zswap driver file
4   *
5   * zswap is a cache that takes pages that are in the process
6   * of being swapped out and attempts to compress and store them in a
7   * RAM-based memory pool.  This can result in a significant I/O reduction on
8   * the swap device and, in the case where decompressing from RAM is faster
9   * than reading from the swap device, can also improve workload performance.
10   *
11   * Copyright (C) 2012  Seth Jennings <sjenning@linux.vnet.ibm.com>
12  */
13  
14  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
15  
16  #include <linux/module.h>
17  #include <linux/cpu.h>
18  #include <linux/highmem.h>
19  #include <linux/slab.h>
20  #include <linux/spinlock.h>
21  #include <linux/types.h>
22  #include <linux/atomic.h>
23  #include <linux/rbtree.h>
24  #include <linux/swap.h>
25  #include <linux/crypto.h>
26  #include <linux/scatterlist.h>
27  #include <linux/mempolicy.h>
28  #include <linux/mempool.h>
29  #include <linux/zpool.h>
30  #include <crypto/acompress.h>
31  #include <linux/zswap.h>
32  #include <linux/mm_types.h>
33  #include <linux/page-flags.h>
34  #include <linux/swapops.h>
35  #include <linux/writeback.h>
36  #include <linux/pagemap.h>
37  #include <linux/workqueue.h>
38  #include <linux/list_lru.h>
39  
40  #include "swap.h"
41  #include "internal.h"
42  
43  /*********************************
44  * statistics
45  **********************************/
46  /* Total bytes used by the compressed storage */
47  u64 zswap_pool_total_size;
48  /* The number of compressed pages currently stored in zswap */
49  atomic_t zswap_stored_pages = ATOMIC_INIT(0);
50  /* The number of same-value filled pages currently stored in zswap */
51  static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0);
52  
53  /*
54   * The statistics below are not protected from concurrent access for
55   * performance reasons so they may not be 100% accurate.  However,
56   * they do provide useful information on roughly how many times a
57   * certain event is occurring.
58  */
59  
60  /* Pool limit was hit (see zswap_max_pool_percent) */
61  static u64 zswap_pool_limit_hit;
62  /* Pages written back when pool limit was reached */
63  static u64 zswap_written_back_pages;
64  /* Store failed due to a reclaim failure after pool limit was reached */
65  static u64 zswap_reject_reclaim_fail;
66  /* Store failed due to compression algorithm failure */
67  static u64 zswap_reject_compress_fail;
68  /* Compressed page was too big for the allocator to (optimally) store */
69  static u64 zswap_reject_compress_poor;
70  /* Store failed because underlying allocator could not get memory */
71  static u64 zswap_reject_alloc_fail;
72  /* Store failed because the entry metadata could not be allocated (rare) */
73  static u64 zswap_reject_kmemcache_fail;
74  
75  /* Shrinker work queue */
76  static struct workqueue_struct *shrink_wq;
77  /* Pool limit was hit, we need to calm down */
78  static bool zswap_pool_reached_full;
79  
80  /*********************************
81  * tunables
82  **********************************/
83  
84  #define ZSWAP_PARAM_UNSET ""
85  
86  static int zswap_setup(void);
87  
88  /* Enable/disable zswap */
89  static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON);
90  static int zswap_enabled_param_set(const char *,
91  				   const struct kernel_param *);
92  static const struct kernel_param_ops zswap_enabled_param_ops = {
93  	.set =		zswap_enabled_param_set,
94  	.get =		param_get_bool,
95  };
96  module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644);
97  
98  /* Crypto compressor to use */
99  static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
100  static int zswap_compressor_param_set(const char *,
101  				      const struct kernel_param *);
102  static const struct kernel_param_ops zswap_compressor_param_ops = {
103  	.set =		zswap_compressor_param_set,
104  	.get =		param_get_charp,
105  	.free =		param_free_charp,
106  };
107  module_param_cb(compressor, &zswap_compressor_param_ops,
108  		&zswap_compressor, 0644);
109  
110  /* Compressed storage zpool to use */
111  static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
112  static int zswap_zpool_param_set(const char *, const struct kernel_param *);
113  static const struct kernel_param_ops zswap_zpool_param_ops = {
114  	.set =		zswap_zpool_param_set,
115  	.get =		param_get_charp,
116  	.free =		param_free_charp,
117  };
118  module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);
119  
120  /* The maximum percentage of memory that the compressed pool can occupy */
121  static unsigned int zswap_max_pool_percent = 20;
122  module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);
123  
124  /* The threshold for accepting new pages after the max_pool_percent was hit */
125  static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */
126  module_param_named(accept_threshold_percent, zswap_accept_thr_percent,
127  		   uint, 0644);
128  
129  /*
130   * Enable/disable handling same-value filled pages (enabled by default).
131   * If disabled, every page is considered non-same-value filled.
132   */
133  static bool zswap_same_filled_pages_enabled = true;
134  module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
135  		   bool, 0644);
136  
137  /* Enable/disable handling non-same-value filled pages (enabled by default) */
138  static bool zswap_non_same_filled_pages_enabled = true;
139  module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled,
140  		   bool, 0644);
141  
142  /* Number of zpools in zswap_pool (empirically determined for scalability) */
143  #define ZSWAP_NR_ZPOOLS 32
144  
145  /* Enable/disable memory pressure-based shrinker. */
146  static bool zswap_shrinker_enabled = IS_ENABLED(
147  		CONFIG_ZSWAP_SHRINKER_DEFAULT_ON);
148  module_param_named(shrinker_enabled, zswap_shrinker_enabled, bool, 0644);
149  
150  bool is_zswap_enabled(void)
151  {
152  	return zswap_enabled;
153  }
154  
155  /*********************************
156  * data structures
157  **********************************/
158  
159  struct crypto_acomp_ctx {
160  	struct crypto_acomp *acomp;
161  	struct acomp_req *req;
162  	struct crypto_wait wait;
163  	u8 *buffer;
164  	struct mutex mutex;
165  	bool is_sleepable;
166  };
167  
168  /*
169   * The lock ordering is zswap_tree.lock -> zswap_pool.lru_lock.
170   * The only case where lru_lock is not acquired while holding tree.lock is
171   * when a zswap_entry is taken off the lru for writeback, in that case it
172   * needs to be verified that it's still valid in the tree.
173   */
174  struct zswap_pool {
175  	struct zpool *zpools[ZSWAP_NR_ZPOOLS];
176  	struct crypto_acomp_ctx __percpu *acomp_ctx;
177  	struct percpu_ref ref;
178  	struct list_head list;
179  	struct work_struct release_work;
180  	struct hlist_node node;
181  	char tfm_name[CRYPTO_MAX_ALG_NAME];
182  };
183  
184  /* Global LRU lists shared by all zswap pools. */
185  static struct list_lru zswap_list_lru;
186  /* counter of pages stored in all zswap pools. */
187  static atomic_t zswap_nr_stored = ATOMIC_INIT(0);
188  
189  /* The lock protects zswap_next_shrink updates. */
190  static DEFINE_SPINLOCK(zswap_shrink_lock);
191  static struct mem_cgroup *zswap_next_shrink;
192  static struct work_struct zswap_shrink_work;
193  static struct shrinker *zswap_shrinker;
194  
195  /*
196   * struct zswap_entry
197   *
198   * This structure contains the metadata for tracking a single compressed
199   * page within zswap.
200   *
201   * rbnode - links the entry into red-black tree for the appropriate swap type
202   * swpentry - associated swap entry, the offset indexes into the red-black tree
203   * length - the length in bytes of the compressed page data.  Needed during
204   *          decompression. For a same-value filled page, length is 0, and both
205   *          pool and lru are invalid and must be ignored.
206   * pool - the zswap_pool the entry's data is in
207   * handle - zpool allocation handle that stores the compressed page data
208   * value - the word value that fills a same-value filled page
209   * objcg - the obj_cgroup that the compressed memory is charged to
210   * lru - handle to the pool's lru used to evict pages.
211   */
212  struct zswap_entry {
213  	struct rb_node rbnode;
214  	swp_entry_t swpentry;
215  	unsigned int length;
216  	struct zswap_pool *pool;
217  	union {
218  		unsigned long handle;
219  		unsigned long value;
220  	};
221  	struct obj_cgroup *objcg;
222  	struct list_head lru;
223  };
224  
225  struct zswap_tree {
226  	struct rb_root rbroot;
227  	spinlock_t lock;
228  };
229  
230  static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
231  static unsigned int nr_zswap_trees[MAX_SWAPFILES];
232  
233  /* RCU-protected iteration */
234  static LIST_HEAD(zswap_pools);
235  /* protects zswap_pools list modification */
236  static DEFINE_SPINLOCK(zswap_pools_lock);
237  /* pool counter to provide unique names to zpool */
238  static atomic_t zswap_pools_count = ATOMIC_INIT(0);
239  
240  enum zswap_init_type {
241  	ZSWAP_UNINIT,
242  	ZSWAP_INIT_SUCCEED,
243  	ZSWAP_INIT_FAILED
244  };
245  
246  static enum zswap_init_type zswap_init_state;
247  
248  /* used to ensure the integrity of initialization */
249  static DEFINE_MUTEX(zswap_init_lock);
250  
251  /* init completed, but couldn't create the initial pool */
252  static bool zswap_has_pool;
253  
254  /*********************************
255  * helpers and fwd declarations
256  **********************************/
257  
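     /*
      * A swapfile is covered by multiple zswap trees, one per
      * SWAP_ADDRESS_SPACE_PAGES swap slots (see zswap_swapon()), so that
      * tree lock contention is spread across several locks. Map a swap
      * entry to the tree covering its offset.
      */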
258  static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp)
259  {
260  	return &zswap_trees[swp_type(swp)][swp_offset(swp)
261  		>> SWAP_ADDRESS_SPACE_SHIFT];
262  }
263  
264  #define zswap_pool_debug(msg, p)				\
265  	pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,		\
266  		 zpool_get_type((p)->zpools[0]))
267  
268  static bool zswap_is_full(void)
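     /* True when the compressed pool exceeds max_pool_percent of total RAM. */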
269  {
270  	return totalram_pages() * zswap_max_pool_percent / 100 <
271  			DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
272  }
273  
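     /*
      * Hysteresis after the pool limit was hit: only accept new pages again
      * once usage has dropped below accept_thr_percent of the maximum pool
      * size.
      */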
274  static bool zswap_can_accept(void)
275  {
276  	return totalram_pages() * zswap_accept_thr_percent / 100 *
277  				zswap_max_pool_percent / 100 >
278  			DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
279  }
280  
281  static u64 get_zswap_pool_size(struct zswap_pool *pool)
282  {
283  	u64 pool_size = 0;
284  	int i;
285  
286  	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
287  		pool_size += zpool_get_total_size(pool->zpools[i]);
288  
289  	return pool_size;
290  }
291  
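     /* Recompute zswap_pool_total_size as the sum of all pools' zpool sizes. */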
292  static void zswap_update_total_size(void)
293  {
294  	struct zswap_pool *pool;
295  	u64 total = 0;
296  
297  	rcu_read_lock();
298  
299  	list_for_each_entry_rcu(pool, &zswap_pools, list)
300  		total += get_zswap_pool_size(pool);
301  
302  	rcu_read_unlock();
303  
304  	zswap_pool_total_size = total;
305  }
306  
307  /*********************************
308  * pool functions
309  **********************************/
310  static void __zswap_pool_empty(struct percpu_ref *ref);
311  
312  static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
313  {
314  	int i;
315  	struct zswap_pool *pool;
316  	char name[38]; /* 'zswap' + 32 char (max) num + \0 */
317  	gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
318  	int ret;
319  
320  	if (!zswap_has_pool) {
321  		/* if either is unset, pool initialization failed, and we
322  		 * need both params to be set correctly before trying to
323  		 * create a pool.
324  		 */
325  		if (!strcmp(type, ZSWAP_PARAM_UNSET))
326  			return NULL;
327  		if (!strcmp(compressor, ZSWAP_PARAM_UNSET))
328  			return NULL;
329  	}
330  
331  	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
332  	if (!pool)
333  		return NULL;
334  
335  	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) {
336  		/* unique name for each pool specifically required by zsmalloc */
337  		snprintf(name, 38, "zswap%x",
338  			 atomic_inc_return(&zswap_pools_count));
339  
340  		pool->zpools[i] = zpool_create_pool(type, name, gfp);
341  		if (!pool->zpools[i]) {
342  			pr_err("%s zpool not available\n", type);
343  			goto error;
344  		}
345  	}
346  	pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0]));
347  
348  	strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
349  
350  	pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx);
351  	if (!pool->acomp_ctx) {
352  		pr_err("percpu alloc failed\n");
353  		goto error;
354  	}
355  
356  	ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
357  				       &pool->node);
358  	if (ret)
359  		goto error;
360  
361  	/* being the current pool takes 1 ref; this func expects the
362  	 * caller to always add the new pool as the current pool
363  	 */
364  	ret = percpu_ref_init(&pool->ref, __zswap_pool_empty,
365  			      PERCPU_REF_ALLOW_REINIT, GFP_KERNEL);
366  	if (ret)
367  		goto ref_fail;
368  	INIT_LIST_HEAD(&pool->list);
369  
370  	zswap_pool_debug("created", pool);
371  
372  	return pool;
373  
374  ref_fail:
375  	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
376  error:
377  	if (pool->acomp_ctx)
378  		free_percpu(pool->acomp_ctx);
379  	while (i--)
380  		zpool_destroy_pool(pool->zpools[i]);
381  	kfree(pool);
382  	return NULL;
383  }
384  
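     /*
      * Create the initial pool, falling back to the compile-time default
      * compressor and/or zpool if the configured ones are unavailable. If a
      * fallback is unavailable too, mark that parameter unset and fail.
      */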
385  static struct zswap_pool *__zswap_pool_create_fallback(void)
386  {
387  	bool has_comp, has_zpool;
388  
389  	has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
390  	if (!has_comp && strcmp(zswap_compressor,
391  				CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) {
392  		pr_err("compressor %s not available, using default %s\n",
393  		       zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT);
394  		param_free_charp(&zswap_compressor);
395  		zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
396  		has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
397  	}
398  	if (!has_comp) {
399  		pr_err("default compressor %s not available\n",
400  		       zswap_compressor);
401  		param_free_charp(&zswap_compressor);
402  		zswap_compressor = ZSWAP_PARAM_UNSET;
403  	}
404  
405  	has_zpool = zpool_has_pool(zswap_zpool_type);
406  	if (!has_zpool && strcmp(zswap_zpool_type,
407  				 CONFIG_ZSWAP_ZPOOL_DEFAULT)) {
408  		pr_err("zpool %s not available, using default %s\n",
409  		       zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT);
410  		param_free_charp(&zswap_zpool_type);
411  		zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
412  		has_zpool = zpool_has_pool(zswap_zpool_type);
413  	}
414  	if (!has_zpool) {
415  		pr_err("default zpool %s not available\n",
416  		       zswap_zpool_type);
417  		param_free_charp(&zswap_zpool_type);
418  		zswap_zpool_type = ZSWAP_PARAM_UNSET;
419  	}
420  
421  	if (!has_comp || !has_zpool)
422  		return NULL;
423  
424  	return zswap_pool_create(zswap_zpool_type, zswap_compressor);
425  }
426  
427  static void zswap_pool_destroy(struct zswap_pool *pool)
428  {
429  	int i;
430  
431  	zswap_pool_debug("destroying", pool);
432  
433  	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
434  	free_percpu(pool->acomp_ctx);
435  
436  	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
437  		zpool_destroy_pool(pool->zpools[i]);
438  	kfree(pool);
439  }
440  
441  static void __zswap_pool_release(struct work_struct *work)
442  {
443  	struct zswap_pool *pool = container_of(work, typeof(*pool),
444  						release_work);
445  
446  	synchronize_rcu();
447  
448  	/* nobody should have been able to get a ref... */
449  	WARN_ON(!percpu_ref_is_zero(&pool->ref));
450  	percpu_ref_exit(&pool->ref);
451  
452  	/* pool is now off zswap_pools list and has no references. */
453  	zswap_pool_destroy(pool);
454  }
455  
456  static struct zswap_pool *zswap_pool_current(void);
457  
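     /*
      * percpu_ref release callback, invoked once the pool's last reference is
      * dropped: unlink the pool from zswap_pools and schedule its destruction.
      */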
458  static void __zswap_pool_empty(struct percpu_ref *ref)
459  {
460  	struct zswap_pool *pool;
461  
462  	pool = container_of(ref, typeof(*pool), ref);
463  
464  	spin_lock_bh(&zswap_pools_lock);
465  
466  	WARN_ON(pool == zswap_pool_current());
467  
468  	list_del_rcu(&pool->list);
469  
470  	INIT_WORK(&pool->release_work, __zswap_pool_release);
471  	schedule_work(&pool->release_work);
472  
473  	spin_unlock_bh(&zswap_pools_lock);
474  }
475  
476  static int __must_check zswap_pool_get(struct zswap_pool *pool)
477  {
478  	if (!pool)
479  		return 0;
480  
481  	return percpu_ref_tryget(&pool->ref);
482  }
483  
484  static void zswap_pool_put(struct zswap_pool *pool)
485  {
486  	percpu_ref_put(&pool->ref);
487  }
488  
489  static struct zswap_pool *__zswap_pool_current(void)
490  {
491  	struct zswap_pool *pool;
492  
493  	pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
494  	WARN_ONCE(!pool && zswap_has_pool,
495  		  "%s: no page storage pool!\n", __func__);
496  
497  	return pool;
498  }
499  
500  static struct zswap_pool *zswap_pool_current(void)
501  {
502  	assert_spin_locked(&zswap_pools_lock);
503  
504  	return __zswap_pool_current();
505  }
506  
507  static struct zswap_pool *zswap_pool_current_get(void)
508  {
509  	struct zswap_pool *pool;
510  
511  	rcu_read_lock();
512  
513  	pool = __zswap_pool_current();
514  	if (!zswap_pool_get(pool))
515  		pool = NULL;
516  
517  	rcu_read_unlock();
518  
519  	return pool;
520  }
521  
522  /* type and compressor must be null-terminated */
523  static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
524  {
525  	struct zswap_pool *pool;
526  
527  	assert_spin_locked(&zswap_pools_lock);
528  
529  	list_for_each_entry_rcu(pool, &zswap_pools, list) {
530  		if (strcmp(pool->tfm_name, compressor))
531  			continue;
532  		/* all zpools share the same type */
533  		if (strcmp(zpool_get_type(pool->zpools[0]), type))
534  			continue;
535  		/* if we can't get it, it's about to be destroyed */
536  		if (!zswap_pool_get(pool))
537  			continue;
538  		return pool;
539  	}
540  
541  	return NULL;
542  }
543  
544  /*********************************
545  * param callbacks
546  **********************************/
547  
548  static bool zswap_pool_changed(const char *s, const struct kernel_param *kp)
549  {
550  	/* no change required */
551  	if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool)
552  		return false;
553  	return true;
554  }
555  
556  /* val must be a null-terminated string */
557  static int __zswap_param_set(const char *val, const struct kernel_param *kp,
558  			     char *type, char *compressor)
559  {
560  	struct zswap_pool *pool, *put_pool = NULL;
561  	char *s = strstrip((char *)val);
562  	int ret = 0;
563  	bool new_pool = false;
564  
565  	mutex_lock(&zswap_init_lock);
566  	switch (zswap_init_state) {
567  	case ZSWAP_UNINIT:
568  		/* if this is load-time (pre-init) param setting,
569  		 * don't create a pool; that's done during init.
570  		 */
571  		ret = param_set_charp(s, kp);
572  		break;
573  	case ZSWAP_INIT_SUCCEED:
574  		new_pool = zswap_pool_changed(s, kp);
575  		break;
576  	case ZSWAP_INIT_FAILED:
577  		pr_err("can't set param, initialization failed\n");
578  		ret = -ENODEV;
579  	}
580  	mutex_unlock(&zswap_init_lock);
581  
582  	/* no need to create a new pool, return directly */
583  	if (!new_pool)
584  		return ret;
585  
586  	if (!type) {
587  		if (!zpool_has_pool(s)) {
588  			pr_err("zpool %s not available\n", s);
589  			return -ENOENT;
590  		}
591  		type = s;
592  	} else if (!compressor) {
593  		if (!crypto_has_acomp(s, 0, 0)) {
594  			pr_err("compressor %s not available\n", s);
595  			return -ENOENT;
596  		}
597  		compressor = s;
598  	} else {
599  		WARN_ON(1);
600  		return -EINVAL;
601  	}
602  
603  	spin_lock_bh(&zswap_pools_lock);
604  
605  	pool = zswap_pool_find_get(type, compressor);
606  	if (pool) {
607  		zswap_pool_debug("using existing", pool);
608  		WARN_ON(pool == zswap_pool_current());
609  		list_del_rcu(&pool->list);
610  	}
611  
612  	spin_unlock_bh(&zswap_pools_lock);
613  
614  	if (!pool)
615  		pool = zswap_pool_create(type, compressor);
616  	else {
617  		/*
618  		 * Restore the initial ref dropped by percpu_ref_kill()
619  		 * when the pool was decommissioned and switch it again
620  		 * to percpu mode.
621  		 */
622  		percpu_ref_resurrect(&pool->ref);
623  
624  		/* Drop the ref from zswap_pool_find_get(). */
625  		zswap_pool_put(pool);
626  	}
627  
628  	if (pool)
629  		ret = param_set_charp(s, kp);
630  	else
631  		ret = -EINVAL;
632  
633  	spin_lock_bh(&zswap_pools_lock);
634  
635  	if (!ret) {
636  		put_pool = zswap_pool_current();
637  		list_add_rcu(&pool->list, &zswap_pools);
638  		zswap_has_pool = true;
639  	} else if (pool) {
640  		/* add the possibly pre-existing pool to the end of the pools
641  		 * list; if it's new (and empty) then it'll be removed and
642  		 * destroyed by the put after we drop the lock
643  		 */
644  		list_add_tail_rcu(&pool->list, &zswap_pools);
645  		put_pool = pool;
646  	}
647  
648  	spin_unlock_bh(&zswap_pools_lock);
649  
650  	if (!zswap_has_pool && !pool) {
651  		/* if initial pool creation failed, and this pool creation also
652  		 * failed, maybe both compressor and zpool params were bad.
653  		 * Allow changing this param, so pool creation will succeed
654  		 * when the other param is changed. We already verified this
655  		 * param is ok in the zpool_has_pool() or crypto_has_acomp()
656  		 * checks above.
657  		 */
658  		ret = param_set_charp(s, kp);
659  	}
660  
661  	/* drop the ref from either the old current pool,
662  	 * or the new pool we failed to add
663  	 */
664  	if (put_pool)
665  		percpu_ref_kill(&put_pool->ref);
666  
667  	return ret;
668  }
669  
670  static int zswap_compressor_param_set(const char *val,
671  				      const struct kernel_param *kp)
672  {
673  	return __zswap_param_set(val, kp, zswap_zpool_type, NULL);
674  }
675  
676  static int zswap_zpool_param_set(const char *val,
677  				 const struct kernel_param *kp)
678  {
679  	return __zswap_param_set(val, kp, NULL, zswap_compressor);
680  }
681  
682  static int zswap_enabled_param_set(const char *val,
683  				   const struct kernel_param *kp)
684  {
685  	int ret = -ENODEV;
686  
687  	/* if this is load-time (pre-init) param setting, only set param. */
688  	if (system_state != SYSTEM_RUNNING)
689  		return param_set_bool(val, kp);
690  
691  	mutex_lock(&zswap_init_lock);
692  	switch (zswap_init_state) {
693  	case ZSWAP_UNINIT:
694  		if (zswap_setup())
695  			break;
696  		fallthrough;
697  	case ZSWAP_INIT_SUCCEED:
698  		if (!zswap_has_pool)
699  			pr_err("can't enable, no pool configured\n");
700  		else
701  			ret = param_set_bool(val, kp);
702  		break;
703  	case ZSWAP_INIT_FAILED:
704  		pr_err("can't enable, initialization failed\n");
705  	}
706  	mutex_unlock(&zswap_init_lock);
707  
708  	return ret;
709  }
710  
711  /*********************************
712  * lru functions
713  **********************************/
714  
715  /* should be called under RCU */
716  #ifdef CONFIG_MEMCG
717  static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry)
718  {
719  	return entry->objcg ? obj_cgroup_memcg(entry->objcg) : NULL;
720  }
721  #else
722  static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry)
723  {
724  	return NULL;
725  }
726  #endif
727  
728  static inline int entry_to_nid(struct zswap_entry *entry)
729  {
730  	return page_to_nid(virt_to_page(entry));
731  }
732  
733  static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
734  {
735  	atomic_long_t *nr_zswap_protected;
736  	unsigned long lru_size, old, new;
737  	int nid = entry_to_nid(entry);
738  	struct mem_cgroup *memcg;
739  	struct lruvec *lruvec;
740  
741  	/*
742  	 * Note that it is safe to use rcu_read_lock() here, even in the face of
743  	 * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection
744  	 * used in list_lru lookup, only two scenarios are possible:
745  	 *
746  	 * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The
747  	 *    new entry will be reparented to memcg's parent's list_lru.
748  	 * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The
749  	 *    new entry will be added directly to memcg's parent's list_lru.
750  	 *
751  	 * Similar reasoning holds for list_lru_del().
752  	 */
753  	rcu_read_lock();
754  	memcg = mem_cgroup_from_entry(entry);
755  	/* will always succeed */
756  	list_lru_add(list_lru, &entry->lru, nid, memcg);
757  
758  	/* Update the protection area */
759  	lru_size = list_lru_count_one(list_lru, nid, memcg);
760  	lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
761  	nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected;
762  	old = atomic_long_inc_return(nr_zswap_protected);
763  	/*
764  	 * Decay to avoid overflow and adapt to changing workloads.
765  	 * This is based on LRU reclaim cost decaying heuristics.
766  	 */
767  	do {
768  		new = old > lru_size / 4 ? old / 2 : old;
769  	} while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new));
770  	rcu_read_unlock();
771  }
772  
773  static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry)
774  {
775  	int nid = entry_to_nid(entry);
776  	struct mem_cgroup *memcg;
777  
778  	rcu_read_lock();
779  	memcg = mem_cgroup_from_entry(entry);
780  	/* will always succeed */
781  	list_lru_del(list_lru, &entry->lru, nid, memcg);
782  	rcu_read_unlock();
783  }
784  
785  void zswap_lruvec_state_init(struct lruvec *lruvec)
786  {
787  	atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0);
788  }
789  
790  void zswap_folio_swapin(struct folio *folio)
791  {
792  	struct lruvec *lruvec;
793  
794  	if (folio) {
795  		lruvec = folio_lruvec(folio);
796  		atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected);
797  	}
798  }
799  
800  void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg)
801  {
802  	/* lock out zswap shrinker walking memcg tree */
803  	spin_lock(&zswap_shrink_lock);
804  	if (zswap_next_shrink == memcg)
805  		zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL);
806  	spin_unlock(&zswap_shrink_lock);
807  }
808  
809  /*********************************
810  * rbtree functions
811  **********************************/
812  static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
813  {
814  	struct rb_node *node = root->rb_node;
815  	struct zswap_entry *entry;
816  	pgoff_t entry_offset;
817  
818  	while (node) {
819  		entry = rb_entry(node, struct zswap_entry, rbnode);
820  		entry_offset = swp_offset(entry->swpentry);
821  		if (entry_offset > offset)
822  			node = node->rb_left;
823  		else if (entry_offset < offset)
824  			node = node->rb_right;
825  		else
826  			return entry;
827  	}
828  	return NULL;
829  }
830  
831  /*
832   * In the case that an entry with the same offset is found, a pointer to
833   * the existing entry is stored in dupentry and the function returns -EEXIST
834   */
835  static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
836  			struct zswap_entry **dupentry)
837  {
838  	struct rb_node **link = &root->rb_node, *parent = NULL;
839  	struct zswap_entry *myentry;
840  	pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);
841  
842  	while (*link) {
843  		parent = *link;
844  		myentry = rb_entry(parent, struct zswap_entry, rbnode);
845  		myentry_offset = swp_offset(myentry->swpentry);
846  		if (myentry_offset > entry_offset)
847  			link = &(*link)->rb_left;
848  		else if (myentry_offset < entry_offset)
849  			link = &(*link)->rb_right;
850  		else {
851  			*dupentry = myentry;
852  			return -EEXIST;
853  		}
854  	}
855  	rb_link_node(&entry->rbnode, parent, link);
856  	rb_insert_color(&entry->rbnode, root);
857  	return 0;
858  }
859  
860  static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
861  {
862  	rb_erase(&entry->rbnode, root);
863  	RB_CLEAR_NODE(&entry->rbnode);
864  }
865  
866  /*********************************
867  * zswap entry functions
868  **********************************/
869  static struct kmem_cache *zswap_entry_cache;
870  
871  static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid)
872  {
873  	struct zswap_entry *entry;
874  	entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid);
875  	if (!entry)
876  		return NULL;
877  	RB_CLEAR_NODE(&entry->rbnode);
878  	return entry;
879  }
880  
881  static void zswap_entry_cache_free(struct zswap_entry *entry)
882  {
883  	kmem_cache_free(zswap_entry_cache, entry);
884  }
885  
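     /*
      * Pick one of the pool's zpools by hashing the entry pointer, spreading
      * entries across ZSWAP_NR_ZPOOLS to reduce zpool lock contention.
      */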
886  static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
887  {
888  	int i = 0;
889  
890  	if (ZSWAP_NR_ZPOOLS > 1)
891  		i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS));
892  
893  	return entry->pool->zpools[i];
894  }
895  
896  /*
897   * Carries out the common pattern of freeing an entry's zpool allocation,
898   * freeing the entry itself, and decrementing the number of stored pages.
899   */
900  static void zswap_entry_free(struct zswap_entry *entry)
901  {
902  	if (!entry->length)
903  		atomic_dec(&zswap_same_filled_pages);
904  	else {
905  		zswap_lru_del(&zswap_list_lru, entry);
906  		zpool_free(zswap_find_zpool(entry), entry->handle);
907  		atomic_dec(&zswap_nr_stored);
908  		zswap_pool_put(entry->pool);
909  	}
910  	if (entry->objcg) {
911  		obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
912  		obj_cgroup_put(entry->objcg);
913  	}
914  	zswap_entry_cache_free(entry);
915  	atomic_dec(&zswap_stored_pages);
916  	zswap_update_total_size();
917  }
918  
919  /*
920   * The caller holds the tree lock and has looked up the entry in the tree,
921   * so it must be in the tree; remove it from the tree and free it.
922   */
923  static void zswap_invalidate_entry(struct zswap_tree *tree,
924  				   struct zswap_entry *entry)
925  {
926  	zswap_rb_erase(&tree->rbroot, entry);
927  	zswap_entry_free(entry);
928  }
929  
930  /*********************************
931  * compressed storage functions
932  **********************************/
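     /*
      * CPU hotplug callback: allocate this CPU's compression context for the
      * pool - an acomp transform, a request, and a two-page scratch buffer.
      * The buffer absorbs possible over-compression and is also used when a
      * zpool mapping cannot be held across a sleeping decompression.
      */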
933  static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
934  {
935  	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
936  	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
937  	struct crypto_acomp *acomp;
938  	struct acomp_req *req;
939  	int ret;
940  
941  	mutex_init(&acomp_ctx->mutex);
942  
943  	acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
944  	if (!acomp_ctx->buffer)
945  		return -ENOMEM;
946  
947  	acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
948  	if (IS_ERR(acomp)) {
949  		pr_err("could not alloc crypto acomp %s : %ld\n",
950  				pool->tfm_name, PTR_ERR(acomp));
951  		ret = PTR_ERR(acomp);
952  		goto acomp_fail;
953  	}
954  	acomp_ctx->acomp = acomp;
955  	acomp_ctx->is_sleepable = acomp_is_async(acomp);
956  
957  	req = acomp_request_alloc(acomp_ctx->acomp);
958  	if (!req) {
959  		pr_err("could not alloc crypto acomp_request %s\n",
960  		       pool->tfm_name);
961  		ret = -ENOMEM;
962  		goto req_fail;
963  	}
964  	acomp_ctx->req = req;
965  
966  	crypto_init_wait(&acomp_ctx->wait);
967  	/*
968  	 * If the acomp backend is an async driver, crypto_req_done() will wake up
969  	 * crypto_wait_req(); if the backend is a synchronous scomp, the callback
970  	 * is never called and crypto_wait_req() returns without blocking.
971  	 */
972  	acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
973  				   crypto_req_done, &acomp_ctx->wait);
974  
975  	return 0;
976  
977  req_fail:
978  	crypto_free_acomp(acomp_ctx->acomp);
979  acomp_fail:
980  	kfree(acomp_ctx->buffer);
981  	return ret;
982  }
983  
984  static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
985  {
986  	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
987  	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
988  
989  	if (!IS_ERR_OR_NULL(acomp_ctx)) {
990  		if (!IS_ERR_OR_NULL(acomp_ctx->req))
991  			acomp_request_free(acomp_ctx->req);
992  		if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
993  			crypto_free_acomp(acomp_ctx->acomp);
994  		kfree(acomp_ctx->buffer);
995  	}
996  
997  	return 0;
998  }
999  
1000  static bool zswap_compress(struct folio *folio, struct zswap_entry *entry)
1001  {
1002  	struct crypto_acomp_ctx *acomp_ctx;
1003  	struct scatterlist input, output;
1004  	int comp_ret = 0, alloc_ret = 0;
1005  	unsigned int dlen = PAGE_SIZE;
1006  	unsigned long handle;
1007  	struct zpool *zpool;
1008  	char *buf;
1009  	gfp_t gfp;
1010  	u8 *dst;
1011  
1012  	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
1013  
1014  	mutex_lock(&acomp_ctx->mutex);
1015  
1016  	dst = acomp_ctx->buffer;
1017  	sg_init_table(&input, 1);
1018  	sg_set_page(&input, &folio->page, PAGE_SIZE, 0);
1019  
1020  	/*
1021  	 * We need PAGE_SIZE * 2 here because the output can be larger than the
1022  	 * input (over-compression), and hardware accelerators may not check the
1023  	 * dst buffer size, so give the dst buffer enough room to avoid overflow.
1024  	 */
1025  	sg_init_one(&output, dst, PAGE_SIZE * 2);
1026  	acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
1027  
1028  	/*
1029  	 * It may look a little odd that we send an asynchronous request and then
1030  	 * wait for its completion synchronously; in effect the whole operation
1031  	 * is synchronous.
1032  	 * In theory, acomp allows users to queue multiple requests on a single
1033  	 * acomp instance and have them completed concurrently, but zswap stores
1034  	 * and loads page by page, so within one thread there is no second page
1035  	 * to submit before the first one is done.
1036  	 * Different threads running on different CPUs use different acomp
1037  	 * instances, however, so multiple threads can still do (de)compression
1038  	 * in parallel.
1039  	 */
1040  	comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
1041  	dlen = acomp_ctx->req->dlen;
1042  	if (comp_ret)
1043  		goto unlock;
1044  
1045  	zpool = zswap_find_zpool(entry);
1046  	gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
1047  	if (zpool_malloc_support_movable(zpool))
1048  		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
1049  	alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle);
1050  	if (alloc_ret)
1051  		goto unlock;
1052  
1053  	buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
1054  	memcpy(buf, dst, dlen);
1055  	zpool_unmap_handle(zpool, handle);
1056  
1057  	entry->handle = handle;
1058  	entry->length = dlen;
1059  
1060  unlock:
1061  	if (comp_ret == -ENOSPC || alloc_ret == -ENOSPC)
1062  		zswap_reject_compress_poor++;
1063  	else if (comp_ret)
1064  		zswap_reject_compress_fail++;
1065  	else if (alloc_ret)
1066  		zswap_reject_alloc_fail++;
1067  
1068  	mutex_unlock(&acomp_ctx->mutex);
1069  	return comp_ret == 0 && alloc_ret == 0;
1070  }
1071  
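      /*
       * Decompress the entry's data into @page. If the zpool mapping cannot be
       * held across a sleeping decompression, the data is first copied into
       * the per-CPU scratch buffer.
       */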
1072  static void zswap_decompress(struct zswap_entry *entry, struct page *page)
1073  {
1074  	struct zpool *zpool = zswap_find_zpool(entry);
1075  	struct scatterlist input, output;
1076  	struct crypto_acomp_ctx *acomp_ctx;
1077  	u8 *src;
1078  
1079  	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
1080  	mutex_lock(&acomp_ctx->mutex);
1081  
1082  	src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
1083  	if (acomp_ctx->is_sleepable && !zpool_can_sleep_mapped(zpool)) {
1084  		memcpy(acomp_ctx->buffer, src, entry->length);
1085  		src = acomp_ctx->buffer;
1086  		zpool_unmap_handle(zpool, entry->handle);
1087  	}
1088  
1089  	sg_init_one(&input, src, entry->length);
1090  	sg_init_table(&output, 1);
1091  	sg_set_page(&output, page, PAGE_SIZE, 0);
1092  	acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE);
1093  	BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait));
1094  	BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE);
1095  	mutex_unlock(&acomp_ctx->mutex);
1096  
1097  	if (!acomp_ctx->is_sleepable || zpool_can_sleep_mapped(zpool))
1098  		zpool_unmap_handle(zpool, entry->handle);
1099  }
1100  
1101  /*********************************
1102  * writeback code
1103  **********************************/
1104  /*
1105   * Attempts to free an entry by adding a folio to the swap cache,
1106   * decompressing the entry data into the folio, and issuing a
1107   * bio write to write the folio back to the swap device.
1108   *
1109   * This can be thought of as a "resumed writeback" of the folio
1110   * to the swap device.  We are basically resuming the same swap
1111   * writeback path that was intercepted with the zswap_store()
1112   * in the first place.  After the folio has been decompressed into
1113   * the swap cache, the compressed version stored by zswap can be
1114   * freed.
1115   */
1116  static int zswap_writeback_entry(struct zswap_entry *entry,
1117  				 swp_entry_t swpentry)
1118  {
1119  	struct zswap_tree *tree;
1120  	struct folio *folio;
1121  	struct mempolicy *mpol;
1122  	bool folio_was_allocated;
1123  	struct writeback_control wbc = {
1124  		.sync_mode = WB_SYNC_NONE,
1125  	};
1126  
1127  	/* try to allocate swap cache folio */
1128  	mpol = get_task_policy(current);
1129  	folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
1130  				NO_INTERLEAVE_INDEX, &folio_was_allocated, true);
1131  	if (!folio)
1132  		return -ENOMEM;
1133  
1134  	/*
1135  	 * Found an existing folio, we raced with swapin or concurrent
1136  	 * shrinker. We generally write back cold folios from zswap, and
1137  	 * swapin means the folio just became hot, so skip this folio.
1138  	 * For unlikely concurrent shrinker case, it will be unlinked
1139  	 * and freed when invalidated by the concurrent shrinker anyway.
1140  	 */
1141  	if (!folio_was_allocated) {
1142  		folio_put(folio);
1143  		return -EEXIST;
1144  	}
1145  
1146  	/*
1147  	 * folio is locked, and the swapcache is now secured against
1148  	 * concurrent swapping to and from the slot, and concurrent
1149  	 * swapoff, so we can safely dereference the zswap tree here.
1150  	 * Verify that the swap entry hasn't been invalidated and recycled
1151  	 * behind our backs, to avoid overwriting a new swap folio with
1152  	 * old compressed data. Only when this is successful can the entry
1153  	 * be dereferenced.
1154  	 */
1155  	tree = swap_zswap_tree(swpentry);
1156  	spin_lock(&tree->lock);
1157  	if (zswap_rb_search(&tree->rbroot, swp_offset(swpentry)) != entry) {
1158  		spin_unlock(&tree->lock);
1159  		delete_from_swap_cache(folio);
1160  		folio_unlock(folio);
1161  		folio_put(folio);
1162  		return -ENOMEM;
1163  	}
1164  
1165  	/* Safe to deref entry after the entry is verified above. */
1166  	zswap_rb_erase(&tree->rbroot, entry);
1167  	spin_unlock(&tree->lock);
1168  
1169  	zswap_decompress(entry, &folio->page);
1170  
1171  	count_vm_event(ZSWPWB);
1172  	if (entry->objcg)
1173  		count_objcg_event(entry->objcg, ZSWPWB);
1174  
1175  	zswap_entry_free(entry);
1176  
1177  	/* folio is up to date */
1178  	folio_mark_uptodate(folio);
1179  
1180  	/* move it to the tail of the inactive list after end_writeback */
1181  	folio_set_reclaim(folio);
1182  
1183  	/* start writeback */
1184  	__swap_writepage(folio, &wbc);
1185  	folio_put(folio);
1186  
1187  	return 0;
1188  }
1189  
1190  /*********************************
1191  * shrinker functions
1192  **********************************/
1193  static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
1194  				       spinlock_t *lock, void *arg)
1195  {
1196  	struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
1197  	bool *encountered_page_in_swapcache = (bool *)arg;
1198  	swp_entry_t swpentry;
1199  	enum lru_status ret = LRU_REMOVED_RETRY;
1200  	int writeback_result;
1201  
1202  	/*
1203  	 * As soon as we drop the LRU lock, the entry can be freed by
1204  	 * a concurrent invalidation. This means the following:
1205  	 *
1206  	 * 1. We extract the swp_entry_t to the stack, allowing
1207  	 *    zswap_writeback_entry() to pin the swap entry and
1208  	 *    then validate the zswap entry against that swap entry's
1209  	 *    tree using pointer value comparison. Only when that
1210  	 *    is successful can the entry be dereferenced.
1211  	 *
1212  	 * 2. Usually, objects are taken off the LRU for reclaim. In
1213  	 *    this case this isn't possible, because if reclaim fails
1214  	 *    for whatever reason, we have no means of knowing if the
1215  	 *    entry is alive to put it back on the LRU.
1216  	 *
1217  	 *    So rotate it before dropping the lock. If the entry is
1218  	 *    written back or invalidated, the free path will unlink
1219  	 *    it. For failures, rotation is the right thing as well.
1220  	 *
1221  	 *    Temporary failures, where the same entry should be tried
1222  	 *    again immediately, almost never happen for this shrinker.
1223  	 *    We don't do any trylocking; -ENOMEM comes closest,
1224  	 *    but that's extremely rare and doesn't happen spuriously
1225  	 *    either. Don't bother distinguishing this case.
1226  	 */
1227  	list_move_tail(item, &l->list);
1228  
1229  	/*
1230  	 * Once the lru lock is dropped, the entry might get freed. The
1231  	 * swpentry is copied to the stack, and entry isn't deref'd again
1232  	 * until the entry is verified to still be alive in the tree.
1233  	 */
1234  	swpentry = entry->swpentry;
1235  
1236  	/*
1237  	 * It's safe to drop the lock here because we return either
1238  	 * LRU_REMOVED_RETRY or LRU_RETRY.
1239  	 */
1240  	spin_unlock(lock);
1241  
1242  	writeback_result = zswap_writeback_entry(entry, swpentry);
1243  
1244  	if (writeback_result) {
1245  		zswap_reject_reclaim_fail++;
1246  		ret = LRU_RETRY;
1247  
1248  		/*
1249  		 * Encountering a page already in swap cache is a sign that we are shrinking
1250  		 * into the warmer region. We should terminate shrinking (if we're in the dynamic
1251  		 * shrinker context).
1252  		 */
1253  		if (writeback_result == -EEXIST && encountered_page_in_swapcache) {
1254  			ret = LRU_STOP;
1255  			*encountered_page_in_swapcache = true;
1256  		}
1257  	} else {
1258  		zswap_written_back_pages++;
1259  	}
1260  
1261  	spin_lock(lock);
1262  	return ret;
1263  }
1264  
1265  static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
1266  		struct shrink_control *sc)
1267  {
1268  	struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
1269  	unsigned long shrink_ret, nr_protected, lru_size;
1270  	bool encountered_page_in_swapcache = false;
1271  
1272  	if (!zswap_shrinker_enabled ||
1273  			!mem_cgroup_zswap_writeback_enabled(sc->memcg)) {
1274  		sc->nr_scanned = 0;
1275  		return SHRINK_STOP;
1276  	}
1277  
1278  	nr_protected =
1279  		atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
1280  	lru_size = list_lru_shrink_count(&zswap_list_lru, sc);
1281  
1282  	/*
1283  	 * Abort if we are shrinking into the protected region.
1284  	 *
1285  	 * This short-circuiting is necessary because if too many concurrent
1286  	 * reclaimers read the freeable zswap object counts at the same time
1287  	 * (before any of them has made reasonable progress), the total
1288  	 * number of reclaimed objects might be more than the number of unprotected
1289  	 * objects (i.e. the reclaimers will reclaim into the protected area of the
1290  	 * zswap LRU).
1291  	 */
1292  	if (nr_protected >= lru_size - sc->nr_to_scan) {
1293  		sc->nr_scanned = 0;
1294  		return SHRINK_STOP;
1295  	}
1296  
1297  	shrink_ret = list_lru_shrink_walk(&zswap_list_lru, sc, &shrink_memcg_cb,
1298  		&encountered_page_in_swapcache);
1299  
1300  	if (encountered_page_in_swapcache)
1301  		return SHRINK_STOP;
1302  
1303  	return shrink_ret ? shrink_ret : SHRINK_STOP;
1304  }
1305  
1306  static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
1307  		struct shrink_control *sc)
1308  {
1309  	struct mem_cgroup *memcg = sc->memcg;
1310  	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid));
1311  	unsigned long nr_backing, nr_stored, nr_freeable, nr_protected;
1312  
1313  	if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg))
1314  		return 0;
1315  
1316  #ifdef CONFIG_MEMCG_KMEM
1317  	mem_cgroup_flush_stats(memcg);
1318  	nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT;
1319  	nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
1320  #else
1321  	/* use pool stats instead of memcg stats */
1322  	nr_backing = zswap_pool_total_size >> PAGE_SHIFT;
1323  	nr_stored = atomic_read(&zswap_nr_stored);
1324  #endif
1325  
1326  	if (!nr_stored)
1327  		return 0;
1328  
1329  	nr_protected =
1330  		atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
1331  	nr_freeable = list_lru_shrink_count(&zswap_list_lru, sc);
1332  	/*
1333  	 * Subtract from the lru size an estimate of the number of pages
1334  	 * that should be protected.
1335  	 */
1336  	nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0;
1337  
1338  	/*
1339  	 * Scale the number of freeable pages by the memory saving factor.
1340  	 * This ensures that the better zswap compresses memory, the fewer
1341  	 * pages we will evict to swap (as it will otherwise incur IO for
1342  	 * relatively small memory saving).
1343  	 */
1344  	return mult_frac(nr_freeable, nr_backing, nr_stored);
1345  }
1346  
1347  static struct shrinker *zswap_alloc_shrinker(void)
1348  {
1349  	struct shrinker *shrinker;
1350  
1351  	shrinker =
1352  		shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap");
1353  	if (!shrinker)
1354  		return NULL;
1355  
1356  	shrinker->scan_objects = zswap_shrinker_scan;
1357  	shrinker->count_objects = zswap_shrinker_count;
1358  	shrinker->batch = 0;
1359  	shrinker->seeks = DEFAULT_SEEKS;
1360  	return shrinker;
1361  }
1362  
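      /* Try to write back one LRU entry per node on behalf of @memcg. */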
1363  static int shrink_memcg(struct mem_cgroup *memcg)
1364  {
1365  	int nid, shrunk = 0;
1366  
1367  	if (!mem_cgroup_zswap_writeback_enabled(memcg))
1368  		return -EINVAL;
1369  
1370  	/*
1371  	 * Skip zombies because their LRUs are reparented and we would be
1372  	 * reclaiming from the parent instead of the dead memcg.
1373  	 */
1374  	if (memcg && !mem_cgroup_online(memcg))
1375  		return -ENOENT;
1376  
1377  	for_each_node_state(nid, N_NORMAL_MEMORY) {
1378  		unsigned long nr_to_walk = 1;
1379  
1380  		shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg,
1381  					    &shrink_memcg_cb, NULL, &nr_to_walk);
1382  	}
1383  	return shrunk ? 0 : -EAGAIN;
1384  }
1385  
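      /*
       * Background reclaim, queued when the pool limit is hit: walk memcgs in
       * round-robin fashion and write back entries until zswap_can_accept()
       * or too many failures accumulate.
       */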
1386  static void shrink_worker(struct work_struct *w)
1387  {
1388  	struct mem_cgroup *memcg;
1389  	int ret, failures = 0;
1390  
1391  	/* global reclaim will select cgroup in a round-robin fashion. */
1392  	do {
1393  		spin_lock(&zswap_shrink_lock);
1394  		zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL);
1395  		memcg = zswap_next_shrink;
1396  
1397  		/*
1398  		 * We need to retry if we have gone through a full round trip, or if we
1399  		 * got an offline memcg (or else we risk undoing the effect of the
1400  		 * zswap memcg offlining cleanup callback). This is not catastrophic
1401  		 * per se, but it will keep the now offlined memcg hostage for a while.
1402  		 *
1403  		 * Note that if we got an online memcg, we will keep the extra
1404  		 * reference in case the original reference obtained by mem_cgroup_iter
1405  		 * is dropped by the zswap memcg offlining callback, ensuring that the
1406  		 * memcg is not killed when we are reclaiming.
1407  		 */
1408  		if (!memcg) {
1409  			spin_unlock(&zswap_shrink_lock);
1410  			if (++failures == MAX_RECLAIM_RETRIES)
1411  				break;
1412  
1413  			goto resched;
1414  		}
1415  
1416  		if (!mem_cgroup_tryget_online(memcg)) {
1417  			/* drop the reference from mem_cgroup_iter() */
1418  			mem_cgroup_iter_break(NULL, memcg);
1419  			zswap_next_shrink = NULL;
1420  			spin_unlock(&zswap_shrink_lock);
1421  
1422  			if (++failures == MAX_RECLAIM_RETRIES)
1423  				break;
1424  
1425  			goto resched;
1426  		}
1427  		spin_unlock(&zswap_shrink_lock);
1428  
1429  		ret = shrink_memcg(memcg);
1430  		/* drop the extra reference */
1431  		mem_cgroup_put(memcg);
1432  
1433  		if (ret == -EINVAL)
1434  			break;
1435  		if (ret && ++failures == MAX_RECLAIM_RETRIES)
1436  			break;
1437  
1438  resched:
1439  		cond_resched();
1440  	} while (!zswap_can_accept());
1441  }
1442  
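      /*
       * Return 1 if the page is filled with a single repeated word and store
       * that word in *value. The last word is checked first as a cheap way to
       * reject most non-uniform pages before scanning the whole page.
       */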
1443  static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
1444  {
1445  	unsigned long *page;
1446  	unsigned long val;
1447  	unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1;
1448  
1449  	page = (unsigned long *)ptr;
1450  	val = page[0];
1451  
1452  	if (val != page[last_pos])
1453  		return 0;
1454  
1455  	for (pos = 1; pos < last_pos; pos++) {
1456  		if (val != page[pos])
1457  			return 0;
1458  	}
1459  
1460  	*value = val;
1461  
1462  	return 1;
1463  }
1464  
1465  static void zswap_fill_page(void *ptr, unsigned long value)
1466  {
1467  	unsigned long *page;
1468  
1469  	page = (unsigned long *)ptr;
1470  	memset_l(page, value, PAGE_SIZE / sizeof(unsigned long));
1471  }
1472  
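      /*
       * Store a folio being swapped out. Returns true if a compressed (or
       * same-value) copy was stored; on failure, or when zswap is disabled,
       * any stale entry previously stored at this offset is invalidated and
       * false is returned so the caller writes the folio to the swap device.
       */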
1473  bool zswap_store(struct folio *folio)
1474  {
1475  	swp_entry_t swp = folio->swap;
1476  	pgoff_t offset = swp_offset(swp);
1477  	struct zswap_tree *tree = swap_zswap_tree(swp);
1478  	struct zswap_entry *entry, *dupentry;
1479  	struct obj_cgroup *objcg = NULL;
1480  	struct mem_cgroup *memcg = NULL;
1481  
1482  	VM_WARN_ON_ONCE(!folio_test_locked(folio));
1483  	VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
1484  
1485  	/* Large folios aren't supported */
1486  	if (folio_test_large(folio))
1487  		return false;
1488  
1489  	if (!zswap_enabled)
1490  		goto check_old;
1491  
1492  	objcg = get_obj_cgroup_from_folio(folio);
1493  	if (objcg && !obj_cgroup_may_zswap(objcg)) {
1494  		memcg = get_mem_cgroup_from_objcg(objcg);
1495  		if (shrink_memcg(memcg)) {
1496  			mem_cgroup_put(memcg);
1497  			goto reject;
1498  		}
1499  		mem_cgroup_put(memcg);
1500  	}
1501  
1502  	/* reclaim space if needed */
1503  	if (zswap_is_full()) {
1504  		zswap_pool_limit_hit++;
1505  		zswap_pool_reached_full = true;
1506  		goto shrink;
1507  	}
1508  
1509  	if (zswap_pool_reached_full) {
1510  	       if (!zswap_can_accept())
1511  			goto shrink;
1512  		else
1513  			zswap_pool_reached_full = false;
1514  	}
1515  
1516  	/* allocate entry */
1517  	entry = zswap_entry_cache_alloc(GFP_KERNEL, folio_nid(folio));
1518  	if (!entry) {
1519  		zswap_reject_kmemcache_fail++;
1520  		goto reject;
1521  	}
1522  
1523  	if (zswap_same_filled_pages_enabled) {
1524  		unsigned long value;
1525  		u8 *src;
1526  
1527  		src = kmap_local_folio(folio, 0);
1528  		if (zswap_is_page_same_filled(src, &value)) {
1529  			kunmap_local(src);
1530  			entry->length = 0;
1531  			entry->value = value;
1532  			atomic_inc(&zswap_same_filled_pages);
1533  			goto insert_entry;
1534  		}
1535  		kunmap_local(src);
1536  	}
1537  
1538  	if (!zswap_non_same_filled_pages_enabled)
1539  		goto freepage;
1540  
1541  	/* if entry is successfully added, it keeps the reference */
1542  	entry->pool = zswap_pool_current_get();
1543  	if (!entry->pool)
1544  		goto freepage;
1545  
1546  	if (objcg) {
1547  		memcg = get_mem_cgroup_from_objcg(objcg);
1548  		if (memcg_list_lru_alloc(memcg, &zswap_list_lru, GFP_KERNEL)) {
1549  			mem_cgroup_put(memcg);
1550  			goto put_pool;
1551  		}
1552  		mem_cgroup_put(memcg);
1553  	}
1554  
1555  	if (!zswap_compress(folio, entry))
1556  		goto put_pool;
1557  
1558  insert_entry:
1559  	entry->swpentry = swp;
1560  	entry->objcg = objcg;
1561  	if (objcg) {
1562  		obj_cgroup_charge_zswap(objcg, entry->length);
1563  		/* Account before objcg ref is moved to tree */
1564  		count_objcg_event(objcg, ZSWPOUT);
1565  	}
1566  
1567  	/* map */
1568  	spin_lock(&tree->lock);
1569  	/*
1570  	 * The folio may have been dirtied again, invalidate the
1571  	 * possibly stale entry before inserting the new entry.
1572  	 */
1573  	if (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) {
1574  		zswap_invalidate_entry(tree, dupentry);
1575  		WARN_ON(zswap_rb_insert(&tree->rbroot, entry, &dupentry));
1576  	}
1577  	if (entry->length) {
1578  		INIT_LIST_HEAD(&entry->lru);
1579  		zswap_lru_add(&zswap_list_lru, entry);
1580  		atomic_inc(&zswap_nr_stored);
1581  	}
1582  	spin_unlock(&tree->lock);
1583  
1584  	/* update stats */
1585  	atomic_inc(&zswap_stored_pages);
1586  	zswap_update_total_size();
1587  	count_vm_event(ZSWPOUT);
1588  
1589  	return true;
1590  
1591  put_pool:
1592  	zswap_pool_put(entry->pool);
1593  freepage:
1594  	zswap_entry_cache_free(entry);
1595  reject:
1596  	if (objcg)
1597  		obj_cgroup_put(objcg);
1598  check_old:
1599  	/*
1600  	 * If the zswap store fails or zswap is disabled, we must invalidate the
1601  	 * possibly stale entry which was previously stored at this offset.
1602  	 * Otherwise, writeback could overwrite the new data in the swapfile.
1603  	 */
1604  	spin_lock(&tree->lock);
1605  	entry = zswap_rb_search(&tree->rbroot, offset);
1606  	if (entry)
1607  		zswap_invalidate_entry(tree, entry);
1608  	spin_unlock(&tree->lock);
1609  	return false;
1610  
1611  shrink:
1612  	queue_work(shrink_wq, &zswap_shrink_work);
1613  	goto reject;
1614  }
1615  
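      /*
       * Loads are exclusive: the entry is removed from the tree and freed
       * after its data is decompressed (or refilled) into the folio, and the
       * folio is marked dirty so the data is written back to swap if it is
       * evicted again.
       */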
1616  bool zswap_load(struct folio *folio)
1617  {
1618  	swp_entry_t swp = folio->swap;
1619  	pgoff_t offset = swp_offset(swp);
1620  	struct page *page = &folio->page;
1621  	struct zswap_tree *tree = swap_zswap_tree(swp);
1622  	struct zswap_entry *entry;
1623  	u8 *dst;
1624  
1625  	VM_WARN_ON_ONCE(!folio_test_locked(folio));
1626  
1627  	spin_lock(&tree->lock);
1628  	entry = zswap_rb_search(&tree->rbroot, offset);
1629  	if (!entry) {
1630  		spin_unlock(&tree->lock);
1631  		return false;
1632  	}
1633  	zswap_rb_erase(&tree->rbroot, entry);
1634  	spin_unlock(&tree->lock);
1635  
1636  	if (entry->length)
1637  		zswap_decompress(entry, page);
1638  	else {
1639  		dst = kmap_local_page(page);
1640  		zswap_fill_page(dst, entry->value);
1641  		kunmap_local(dst);
1642  	}
1643  
1644  	count_vm_event(ZSWPIN);
1645  	if (entry->objcg)
1646  		count_objcg_event(entry->objcg, ZSWPIN);
1647  
1648  	zswap_entry_free(entry);
1649  
1650  	folio_mark_dirty(folio);
1651  
1652  	return true;
1653  }
1654  
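      /* Drop the zswap entry, if any, associated with this swap slot. */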
1655  void zswap_invalidate(swp_entry_t swp)
1656  {
1657  	pgoff_t offset = swp_offset(swp);
1658  	struct zswap_tree *tree = swap_zswap_tree(swp);
1659  	struct zswap_entry *entry;
1660  
1661  	spin_lock(&tree->lock);
1662  	entry = zswap_rb_search(&tree->rbroot, offset);
1663  	if (entry)
1664  		zswap_invalidate_entry(tree, entry);
1665  	spin_unlock(&tree->lock);
1666  }
1667  
1668  int zswap_swapon(int type, unsigned long nr_pages)
1669  {
1670  	struct zswap_tree *trees, *tree;
1671  	unsigned int nr, i;
1672  
1673  	nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
1674  	trees = kvcalloc(nr, sizeof(*tree), GFP_KERNEL);
1675  	if (!trees) {
1676  		pr_err("alloc failed, zswap disabled for swap type %d\n", type);
1677  		return -ENOMEM;
1678  	}
1679  
1680  	for (i = 0; i < nr; i++) {
1681  		tree = trees + i;
1682  		tree->rbroot = RB_ROOT;
1683  		spin_lock_init(&tree->lock);
1684  	}
1685  
1686  	nr_zswap_trees[type] = nr;
1687  	zswap_trees[type] = trees;
1688  	return 0;
1689  }
1690  
1691  void zswap_swapoff(int type)
1692  {
1693  	struct zswap_tree *trees = zswap_trees[type];
1694  	unsigned int i;
1695  
1696  	if (!trees)
1697  		return;
1698  
1699  	/* try_to_unuse() invalidated all the entries already */
1700  	for (i = 0; i < nr_zswap_trees[type]; i++)
1701  		WARN_ON_ONCE(!RB_EMPTY_ROOT(&trees[i].rbroot));
1702  
1703  	kvfree(trees);
1704  	nr_zswap_trees[type] = 0;
1705  	zswap_trees[type] = NULL;
1706  }
1707  
1708  /*********************************
1709  * debugfs functions
1710  **********************************/
1711  #ifdef CONFIG_DEBUG_FS
1712  #include <linux/debugfs.h>
1713  
1714  static struct dentry *zswap_debugfs_root;
1715  
1716  static int zswap_debugfs_init(void)
1717  {
1718  	if (!debugfs_initialized())
1719  		return -ENODEV;
1720  
1721  	zswap_debugfs_root = debugfs_create_dir("zswap", NULL);
1722  
1723  	debugfs_create_u64("pool_limit_hit", 0444,
1724  			   zswap_debugfs_root, &zswap_pool_limit_hit);
1725  	debugfs_create_u64("reject_reclaim_fail", 0444,
1726  			   zswap_debugfs_root, &zswap_reject_reclaim_fail);
1727  	debugfs_create_u64("reject_alloc_fail", 0444,
1728  			   zswap_debugfs_root, &zswap_reject_alloc_fail);
1729  	debugfs_create_u64("reject_kmemcache_fail", 0444,
1730  			   zswap_debugfs_root, &zswap_reject_kmemcache_fail);
1731  	debugfs_create_u64("reject_compress_fail", 0444,
1732  			   zswap_debugfs_root, &zswap_reject_compress_fail);
1733  	debugfs_create_u64("reject_compress_poor", 0444,
1734  			   zswap_debugfs_root, &zswap_reject_compress_poor);
1735  	debugfs_create_u64("written_back_pages", 0444,
1736  			   zswap_debugfs_root, &zswap_written_back_pages);
1737  	debugfs_create_u64("pool_total_size", 0444,
1738  			   zswap_debugfs_root, &zswap_pool_total_size);
1739  	debugfs_create_atomic_t("stored_pages", 0444,
1740  				zswap_debugfs_root, &zswap_stored_pages);
1741  	debugfs_create_atomic_t("same_filled_pages", 0444,
1742  				zswap_debugfs_root, &zswap_same_filled_pages);
1743  
1744  	return 0;
1745  }
1746  #else
1747  static int zswap_debugfs_init(void)
1748  {
1749  	return 0;
1750  }
1751  #endif
1752  
1753  /*********************************
1754  * module init and exit
1755  **********************************/
1756  static int zswap_setup(void)
1757  {
1758  	struct zswap_pool *pool;
1759  	int ret;
1760  
1761  	zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
1762  	if (!zswap_entry_cache) {
1763  		pr_err("entry cache creation failed\n");
1764  		goto cache_fail;
1765  	}
1766  
1767  	ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE,
1768  				      "mm/zswap_pool:prepare",
1769  				      zswap_cpu_comp_prepare,
1770  				      zswap_cpu_comp_dead);
1771  	if (ret)
1772  		goto hp_fail;
1773  
1774  	shrink_wq = alloc_workqueue("zswap-shrink",
1775  			WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
1776  	if (!shrink_wq)
1777  		goto shrink_wq_fail;
1778  
1779  	zswap_shrinker = zswap_alloc_shrinker();
1780  	if (!zswap_shrinker)
1781  		goto shrinker_fail;
1782  	if (list_lru_init_memcg(&zswap_list_lru, zswap_shrinker))
1783  		goto lru_fail;
1784  	shrinker_register(zswap_shrinker);
1785  
1786  	INIT_WORK(&zswap_shrink_work, shrink_worker);
1787  
1788  	pool = __zswap_pool_create_fallback();
1789  	if (pool) {
1790  		pr_info("loaded using pool %s/%s\n", pool->tfm_name,
1791  			zpool_get_type(pool->zpools[0]));
1792  		list_add(&pool->list, &zswap_pools);
1793  		zswap_has_pool = true;
1794  	} else {
1795  		pr_err("pool creation failed\n");
1796  		zswap_enabled = false;
1797  	}
1798  
1799  	if (zswap_debugfs_init())
1800  		pr_warn("debugfs initialization failed\n");
1801  	zswap_init_state = ZSWAP_INIT_SUCCEED;
1802  	return 0;
1803  
1804  lru_fail:
1805  	shrinker_free(zswap_shrinker);
1806  shrinker_fail:
1807  	destroy_workqueue(shrink_wq);
1808  shrink_wq_fail:
1809  	cpuhp_remove_multi_state(CPUHP_MM_ZSWP_POOL_PREPARE);
1810  hp_fail:
1811  	kmem_cache_destroy(zswap_entry_cache);
1812  cache_fail:
1813  	/* if built-in, we aren't unloaded on failure; don't allow use */
1814  	zswap_init_state = ZSWAP_INIT_FAILED;
1815  	zswap_enabled = false;
1816  	return -ENOMEM;
1817  }
1818  
1819  static int __init zswap_init(void)
1820  {
1821  	if (!zswap_enabled)
1822  		return 0;
1823  	return zswap_setup();
1824  }
1825  /* must be late so crypto has time to come up */
1826  late_initcall(zswap_init);
1827  
1828  MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>");
1829  MODULE_DESCRIPTION("Compressed cache for swap pages");
1830