xref: /linux/drivers/block/zram/zram_drv.c (revision a552c81ff4a16738ca5a44a177d552eb38d552ce)
1 /*
2  * Compressed RAM block device
3  *
4  * Copyright (C) 2008, 2009, 2010  Nitin Gupta
5  *               2012, 2013 Minchan Kim
6  *
7  * This code is released using a dual license strategy: BSD/GPL
8  * You can choose the licence that better fits your requirements.
9  *
10  * Released under the terms of 3-clause BSD License
11  * Released under the terms of GNU General Public License Version 2.0
12  *
13  */
14 
15 #define pr_fmt(fmt) "zram: " fmt
16 
17 #include <linux/module.h>
18 #include <linux/kernel.h>
19 #include <linux/bio.h>
20 #include <linux/bitops.h>
21 #include <linux/blkdev.h>
22 #include <linux/buffer_head.h>
23 #include <linux/device.h>
24 #include <linux/highmem.h>
25 #include <linux/slab.h>
26 #include <linux/backing-dev.h>
27 #include <linux/string.h>
28 #include <linux/vmalloc.h>
29 #include <linux/err.h>
30 #include <linux/idr.h>
31 #include <linux/sysfs.h>
32 #include <linux/debugfs.h>
33 #include <linux/cpuhotplug.h>
34 #include <linux/part_stat.h>
35 #include <linux/kernel_read_file.h>
36 #include <linux/rcupdate.h>
37 
38 #include "zram_drv.h"
39 
40 static DEFINE_IDR(zram_index_idr);
41 /* idr index must be protected */
42 static DEFINE_MUTEX(zram_index_mutex);
43 
44 static int zram_major;
45 static const char *default_compressor = CONFIG_ZRAM_DEF_COMP;
46 
47 #define ZRAM_MAX_ALGO_NAME_SZ	128
48 
49 /* Module params (documentation at end) */
50 static unsigned int num_devices = 1;
/*
 * Pages that compress to a size equal to or greater than this are stored
 * uncompressed in memory.
 */
55 static size_t huge_class_size;
56 
57 static const struct block_device_operations zram_devops;
58 
59 static void slot_free(struct zram *zram, u32 index);
60 #define slot_dep_map(zram, index) (&(zram)->table[(index)].dep_map)
61 
62 static void slot_lock_init(struct zram *zram, u32 index)
63 {
64 	static struct lock_class_key __key;
65 
66 	lockdep_init_map(slot_dep_map(zram, index), "zram->table[index].lock",
67 			 &__key, 0);
68 }
69 
70 /*
71  * entry locking rules:
72  *
73  * 1) Lock is exclusive
74  *
75  * 2) lock() function can sleep waiting for the lock
76  *
77  * 3) Lock owner can sleep
78  *
79  * 4) Use TRY lock variant when in atomic context
 *    - must check return value and handle locking failures
81  */
82 static __must_check bool slot_trylock(struct zram *zram, u32 index)
83 {
84 	unsigned long *lock = &zram->table[index].__lock;
85 
86 	if (!test_and_set_bit_lock(ZRAM_ENTRY_LOCK, lock)) {
87 		mutex_acquire(slot_dep_map(zram, index), 0, 1, _RET_IP_);
88 		lock_acquired(slot_dep_map(zram, index), _RET_IP_);
89 		return true;
90 	}
91 
92 	return false;
93 }
94 
95 static void slot_lock(struct zram *zram, u32 index)
96 {
97 	unsigned long *lock = &zram->table[index].__lock;
98 
99 	mutex_acquire(slot_dep_map(zram, index), 0, 0, _RET_IP_);
100 	wait_on_bit_lock(lock, ZRAM_ENTRY_LOCK, TASK_UNINTERRUPTIBLE);
101 	lock_acquired(slot_dep_map(zram, index), _RET_IP_);
102 }
103 
104 static void slot_unlock(struct zram *zram, u32 index)
105 {
106 	unsigned long *lock = &zram->table[index].__lock;
107 
108 	mutex_release(slot_dep_map(zram, index), _RET_IP_);
109 	clear_and_wake_up_bit(ZRAM_ENTRY_LOCK, lock);
110 }
111 
112 static inline bool init_done(struct zram *zram)
113 {
114 	return zram->disksize;
115 }
116 
117 static inline struct zram *dev_to_zram(struct device *dev)
118 {
119 	return (struct zram *)dev_to_disk(dev)->private_data;
120 }
121 
122 static unsigned long get_slot_handle(struct zram *zram, u32 index)
123 {
124 	return zram->table[index].handle;
125 }
126 
127 static void set_slot_handle(struct zram *zram, u32 index, unsigned long handle)
128 {
129 	zram->table[index].handle = handle;
130 }
131 
132 static bool test_slot_flag(struct zram *zram, u32 index,
133 			   enum zram_pageflags flag)
134 {
135 	return zram->table[index].attr.flags & BIT(flag);
136 }
137 
138 static void set_slot_flag(struct zram *zram, u32 index,
139 			  enum zram_pageflags flag)
140 {
141 	zram->table[index].attr.flags |= BIT(flag);
142 }
143 
144 static void clear_slot_flag(struct zram *zram, u32 index,
145 			    enum zram_pageflags flag)
146 {
147 	zram->table[index].attr.flags &= ~BIT(flag);
148 }
149 
150 static size_t get_slot_size(struct zram *zram, u32 index)
151 {
152 	return zram->table[index].attr.flags & (BIT(ZRAM_FLAG_SHIFT) - 1);
153 }
154 
155 static void set_slot_size(struct zram *zram, u32 index, size_t size)
156 {
157 	unsigned long flags = zram->table[index].attr.flags >> ZRAM_FLAG_SHIFT;
158 
159 	zram->table[index].attr.flags = (flags << ZRAM_FLAG_SHIFT) | size;
160 }
161 
162 static inline bool slot_allocated(struct zram *zram, u32 index)
163 {
164 	return get_slot_size(zram, index) ||
165 		test_slot_flag(zram, index, ZRAM_SAME) ||
166 		test_slot_flag(zram, index, ZRAM_WB);
167 }
168 
169 static inline void set_slot_comp_priority(struct zram *zram, u32 index,
170 					  u32 prio)
171 {
172 	prio &= ZRAM_COMP_PRIORITY_MASK;
173 	/*
174 	 * Clear previous priority value first, in case if we recompress
175 	 * further an already recompressed page
176 	 */
177 	zram->table[index].attr.flags &= ~(ZRAM_COMP_PRIORITY_MASK <<
178 					   ZRAM_COMP_PRIORITY_BIT1);
179 	zram->table[index].attr.flags |= (prio << ZRAM_COMP_PRIORITY_BIT1);
180 }
181 
182 static inline u32 get_slot_comp_priority(struct zram *zram, u32 index)
183 {
184 	u32 prio = zram->table[index].attr.flags >> ZRAM_COMP_PRIORITY_BIT1;
185 
186 	return prio & ZRAM_COMP_PRIORITY_MASK;
187 }
188 
189 static void mark_slot_accessed(struct zram *zram, u32 index)
190 {
191 	clear_slot_flag(zram, index, ZRAM_IDLE);
192 	clear_slot_flag(zram, index, ZRAM_PP_SLOT);
193 #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME
194 	zram->table[index].attr.ac_time = (u32)ktime_get_boottime_seconds();
195 #endif
196 }
197 
198 static inline void update_used_max(struct zram *zram, const unsigned long pages)
199 {
200 	unsigned long cur_max = atomic_long_read(&zram->stats.max_used_pages);
201 
202 	do {
203 		if (cur_max >= pages)
204 			return;
205 	} while (!atomic_long_try_cmpxchg(&zram->stats.max_used_pages,
206 					  &cur_max, pages));
207 }
208 
209 static bool zram_can_store_page(struct zram *zram)
210 {
211 	unsigned long alloced_pages;
212 
213 	alloced_pages = zs_get_total_pages(zram->mem_pool);
214 	update_used_max(zram, alloced_pages);
215 
216 	return !zram->limit_pages || alloced_pages <= zram->limit_pages;
217 }
218 
219 #if PAGE_SIZE != 4096
220 static inline bool is_partial_io(struct bio_vec *bvec)
221 {
222 	return bvec->bv_len != PAGE_SIZE;
223 }
224 #define ZRAM_PARTIAL_IO		1
225 #else
226 static inline bool is_partial_io(struct bio_vec *bvec)
227 {
228 	return false;
229 }
230 #endif
231 
232 #if defined CONFIG_ZRAM_WRITEBACK || defined CONFIG_ZRAM_MULTI_COMP
/* One post-processing candidate: a table slot queued in a pp bucket. */
struct zram_pp_slot {
	/* index of the slot in zram->table */
	unsigned long		index;
	/* link in one of zram_pp_ctl::pp_buckets */
	struct list_head	entry;
};
237 
/*
 * A post-processing bucket is essentially a size class.
 * PP_BUCKET_SIZE_RANGE is the range (in bytes) of pp-slot sizes that land
 * in the same bucket.
 */
#define PP_BUCKET_SIZE_RANGE	64
/*
 * place_pp_slot() picks the bucket as size / PP_BUCKET_SIZE_RANGE; the
 * extra bucket presumably accommodates slots whose size is exactly
 * PAGE_SIZE (NOTE(review): confirm).
 */
#define NUM_PP_BUCKETS		((PAGE_SIZE / PP_BUCKET_SIZE_RANGE) + 1)

/* Post-processing control: pp-slots grouped into buckets by slot size. */
struct zram_pp_ctl {
	struct list_head	pp_buckets[NUM_PP_BUCKETS];
};
248 
249 static struct zram_pp_ctl *init_pp_ctl(void)
250 {
251 	struct zram_pp_ctl *ctl;
252 	u32 idx;
253 
254 	ctl = kmalloc_obj(*ctl);
255 	if (!ctl)
256 		return NULL;
257 
258 	for (idx = 0; idx < NUM_PP_BUCKETS; idx++)
259 		INIT_LIST_HEAD(&ctl->pp_buckets[idx]);
260 	return ctl;
261 }
262 
263 static void release_pp_slot(struct zram *zram, struct zram_pp_slot *pps)
264 {
265 	list_del_init(&pps->entry);
266 
267 	slot_lock(zram, pps->index);
268 	clear_slot_flag(zram, pps->index, ZRAM_PP_SLOT);
269 	slot_unlock(zram, pps->index);
270 
271 	kfree(pps);
272 }
273 
274 static void release_pp_ctl(struct zram *zram, struct zram_pp_ctl *ctl)
275 {
276 	u32 idx;
277 
278 	if (!ctl)
279 		return;
280 
281 	for (idx = 0; idx < NUM_PP_BUCKETS; idx++) {
282 		while (!list_empty(&ctl->pp_buckets[idx])) {
283 			struct zram_pp_slot *pps;
284 
285 			pps = list_first_entry(&ctl->pp_buckets[idx],
286 					       struct zram_pp_slot,
287 					       entry);
288 			release_pp_slot(zram, pps);
289 		}
290 	}
291 
292 	kfree(ctl);
293 }
294 
295 static bool place_pp_slot(struct zram *zram, struct zram_pp_ctl *ctl,
296 			  u32 index)
297 {
298 	struct zram_pp_slot *pps;
299 	u32 bid;
300 
301 	pps = kmalloc_obj(*pps, GFP_NOIO | __GFP_NOWARN);
302 	if (!pps)
303 		return false;
304 
305 	INIT_LIST_HEAD(&pps->entry);
306 	pps->index = index;
307 
308 	bid = get_slot_size(zram, pps->index) / PP_BUCKET_SIZE_RANGE;
309 	list_add(&pps->entry, &ctl->pp_buckets[bid]);
310 
311 	set_slot_flag(zram, pps->index, ZRAM_PP_SLOT);
312 	return true;
313 }
314 
315 static struct zram_pp_slot *select_pp_slot(struct zram_pp_ctl *ctl)
316 {
317 	struct zram_pp_slot *pps = NULL;
318 	s32 idx = NUM_PP_BUCKETS - 1;
319 
320 	/* The higher the bucket id the more optimal slot post-processing is */
321 	while (idx >= 0) {
322 		pps = list_first_entry_or_null(&ctl->pp_buckets[idx],
323 					       struct zram_pp_slot,
324 					       entry);
325 		if (pps)
326 			break;
327 
328 		idx--;
329 	}
330 	return pps;
331 }
332 #endif
333 
/*
 * Fill @len bytes at @ptr with the unsigned long pattern @value. Warns
 * (once) if @len is not a multiple of sizeof(unsigned long).
 */
static inline void zram_fill_page(void *ptr, unsigned long len,
					unsigned long value)
{
	const unsigned long nr_words = len / sizeof(unsigned long);

	WARN_ON_ONCE(!IS_ALIGNED(len, sizeof(unsigned long)));
	memset_l(ptr, value, nr_words);
}
340 
341 static bool page_same_filled(void *ptr, unsigned long *element)
342 {
343 	unsigned long *page;
344 	unsigned long val;
345 	unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1;
346 
347 	page = (unsigned long *)ptr;
348 	val = page[0];
349 
350 	if (val != page[last_pos])
351 		return false;
352 
353 	for (pos = 1; pos < last_pos; pos++) {
354 		if (val != page[pos])
355 			return false;
356 	}
357 
358 	*element = val;
359 
360 	return true;
361 }
362 
363 static ssize_t initstate_show(struct device *dev, struct device_attribute *attr,
364 			      char *buf)
365 {
366 	u32 val;
367 	struct zram *zram = dev_to_zram(dev);
368 
369 	guard(rwsem_read)(&zram->dev_lock);
370 	val = init_done(zram);
371 
372 	return sysfs_emit(buf, "%u\n", val);
373 }
374 
375 static ssize_t disksize_show(struct device *dev,
376 		struct device_attribute *attr, char *buf)
377 {
378 	struct zram *zram = dev_to_zram(dev);
379 
380 	return sysfs_emit(buf, "%llu\n", zram->disksize);
381 }
382 
383 static ssize_t mem_limit_store(struct device *dev,
384 			       struct device_attribute *attr, const char *buf,
385 			       size_t len)
386 {
387 	u64 limit;
388 	char *tmp;
389 	struct zram *zram = dev_to_zram(dev);
390 
391 	limit = memparse(buf, &tmp);
392 	if (buf == tmp) /* no chars parsed, invalid input */
393 		return -EINVAL;
394 
395 	guard(rwsem_write)(&zram->dev_lock);
396 	zram->limit_pages = PAGE_ALIGN(limit) >> PAGE_SHIFT;
397 
398 	return len;
399 }
400 
401 static ssize_t mem_used_max_store(struct device *dev,
402 				  struct device_attribute *attr,
403 				  const char *buf, size_t len)
404 {
405 	int err;
406 	unsigned long val;
407 	struct zram *zram = dev_to_zram(dev);
408 
409 	err = kstrtoul(buf, 10, &val);
410 	if (err || val != 0)
411 		return -EINVAL;
412 
413 	guard(rwsem_read)(&zram->dev_lock);
414 	if (init_done(zram)) {
415 		atomic_long_set(&zram->stats.max_used_pages,
416 				zs_get_total_pages(zram->mem_pool));
417 	}
418 
419 	return len;
420 }
421 
422 /*
423  * Mark all pages which are older than or equal to cutoff as IDLE.
424  * Callers should hold the zram init lock in read mode
425  */
426 static void mark_idle(struct zram *zram, ktime_t cutoff)
427 {
428 	int is_idle = 1;
429 	unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
430 	int index;
431 
432 	for (index = 0; index < nr_pages; index++) {
433 		/*
434 		 * Do not mark ZRAM_SAME slots as ZRAM_IDLE, because no
435 		 * post-processing (recompress, writeback) happens to the
436 		 * ZRAM_SAME slot.
437 		 *
438 		 * And ZRAM_WB slots simply cannot be ZRAM_IDLE.
439 		 */
440 		slot_lock(zram, index);
441 		if (!slot_allocated(zram, index) ||
442 		    test_slot_flag(zram, index, ZRAM_WB) ||
443 		    test_slot_flag(zram, index, ZRAM_SAME)) {
444 			slot_unlock(zram, index);
445 			continue;
446 		}
447 
448 #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME
449 		is_idle = !cutoff ||
450 			ktime_after(cutoff, zram->table[index].attr.ac_time);
451 #endif
452 		if (is_idle)
453 			set_slot_flag(zram, index, ZRAM_IDLE);
454 		else
455 			clear_slot_flag(zram, index, ZRAM_IDLE);
456 		slot_unlock(zram, index);
457 	}
458 }
459 
460 static ssize_t idle_store(struct device *dev, struct device_attribute *attr,
461 			  const char *buf, size_t len)
462 {
463 	struct zram *zram = dev_to_zram(dev);
464 	ktime_t cutoff = 0;
465 
466 	if (!sysfs_streq(buf, "all")) {
467 		/*
468 		 * If it did not parse as 'all' try to treat it as an integer
469 		 * when we have memory tracking enabled.
470 		 */
471 		u32 age_sec;
472 
473 		if (IS_ENABLED(CONFIG_ZRAM_TRACK_ENTRY_ACTIME) &&
474 		    !kstrtouint(buf, 0, &age_sec))
475 			cutoff = ktime_sub((u32)ktime_get_boottime_seconds(),
476 					   age_sec);
477 		else
478 			return -EINVAL;
479 	}
480 
481 	guard(rwsem_read)(&zram->dev_lock);
482 	if (!init_done(zram))
483 		return -EINVAL;
484 
485 	/*
486 	 * A cutoff of 0 marks everything as idle, this is the
487 	 * "all" behavior.
488 	 */
489 	mark_idle(zram, cutoff);
490 	return len;
491 }
492 
493 #ifdef CONFIG_ZRAM_WRITEBACK
494 #define INVALID_BDEV_BLOCK		(~0UL)
495 
496 static int read_from_zspool_raw(struct zram *zram, struct page *page,
497 				u32 index);
498 static int read_from_zspool(struct zram *zram, struct page *page, u32 index);
499 
/* Writeback control: request pools and completion signalling. */
struct zram_wb_ctl {
	/* idle list is accessed only by the writeback task, no concurrency */
	struct list_head idle_reqs;
	/* done list is accessed concurrently, protected by done_lock */
	struct list_head done_reqs;
	/* woken by the bio end_io handler when a request lands on done_reqs */
	wait_queue_head_t done_wait;
	spinlock_t done_lock;
	/* number of submitted requests that have not been completed yet */
	atomic_t num_inflight;
	/* used by kfree_rcu() when the control structure is released */
	struct rcu_head rcu;
};

/* A single writeback request with its embedded bio and data page. */
struct zram_wb_req {
	/* backing device block the page is written to */
	unsigned long blk_idx;
	/* page holding the data being written back */
	struct page *page;
	/* pp-slot owned by this request while it is in flight */
	struct zram_pp_slot *pps;
	struct bio_vec bio_vec;
	struct bio bio;

	/* link in zram_wb_ctl::idle_reqs or ::done_reqs */
	struct list_head entry;
};

/* A read-back request against the backing device. */
struct zram_rb_req {
	struct work_struct work;
	struct zram *zram;
	struct page *page;
	/* The read bio for backing device */
	struct bio *bio;
	unsigned long blk_idx;
	union {
		/* The original bio to complete (async read) */
		struct bio *parent;
		/* error status (sync read) */
		int error;
	};
	u32 index;
};
536 
537 #define FOUR_K(x) ((x) * (1 << (PAGE_SHIFT - 12)))
538 static ssize_t bd_stat_show(struct device *dev, struct device_attribute *attr,
539 			    char *buf)
540 {
541 	struct zram *zram = dev_to_zram(dev);
542 	ssize_t ret;
543 
544 	guard(rwsem_read)(&zram->dev_lock);
545 	ret = sysfs_emit(buf,
546 			 "%8llu %8llu %8llu\n",
547 			 FOUR_K((u64)atomic64_read(&zram->stats.bd_count)),
548 			 FOUR_K((u64)atomic64_read(&zram->stats.bd_reads)),
549 			 FOUR_K((u64)atomic64_read(&zram->stats.bd_writes)));
550 
551 	return ret;
552 }
553 
554 static ssize_t compressed_writeback_store(struct device *dev,
555 					  struct device_attribute *attr,
556 					  const char *buf, size_t len)
557 {
558 	struct zram *zram = dev_to_zram(dev);
559 	bool val;
560 
561 	if (kstrtobool(buf, &val))
562 		return -EINVAL;
563 
564 	guard(rwsem_write)(&zram->dev_lock);
565 	if (init_done(zram)) {
566 		return -EBUSY;
567 	}
568 
569 	zram->compressed_wb = val;
570 
571 	return len;
572 }
573 
574 static ssize_t compressed_writeback_show(struct device *dev,
575 					 struct device_attribute *attr,
576 					 char *buf)
577 {
578 	bool val;
579 	struct zram *zram = dev_to_zram(dev);
580 
581 	guard(rwsem_read)(&zram->dev_lock);
582 	val = zram->compressed_wb;
583 
584 	return sysfs_emit(buf, "%d\n", val);
585 }
586 
587 static ssize_t writeback_limit_enable_store(struct device *dev,
588 					    struct device_attribute *attr,
589 					    const char *buf, size_t len)
590 {
591 	struct zram *zram = dev_to_zram(dev);
592 	u64 val;
593 
594 	if (kstrtoull(buf, 10, &val))
595 		return -EINVAL;
596 
597 	guard(rwsem_write)(&zram->dev_lock);
598 	zram->wb_limit_enable = val;
599 
600 	return len;
601 }
602 
603 static ssize_t writeback_limit_enable_show(struct device *dev,
604 					   struct device_attribute *attr,
605 					   char *buf)
606 {
607 	bool val;
608 	struct zram *zram = dev_to_zram(dev);
609 
610 	guard(rwsem_read)(&zram->dev_lock);
611 	val = zram->wb_limit_enable;
612 
613 	return sysfs_emit(buf, "%d\n", val);
614 }
615 
616 static ssize_t writeback_limit_store(struct device *dev,
617 				     struct device_attribute *attr,
618 				     const char *buf, size_t len)
619 {
620 	struct zram *zram = dev_to_zram(dev);
621 	u64 val;
622 
623 	if (kstrtoull(buf, 10, &val))
624 		return -EINVAL;
625 
626 	/*
627 	 * When the page size is greater than 4KB, if bd_wb_limit is set to
628 	 * a value that is not page - size aligned, it will cause value
629 	 * wrapping. For example, when the page size is set to 16KB and
630 	 * bd_wb_limit is set to 3, a single write - back operation will
631 	 * cause bd_wb_limit to become -1. Even more terrifying is that
632 	 * bd_wb_limit is an unsigned number.
633 	 */
634 	val = rounddown(val, PAGE_SIZE / 4096);
635 
636 	guard(rwsem_write)(&zram->dev_lock);
637 	zram->bd_wb_limit = val;
638 
639 	return len;
640 }
641 
642 static ssize_t writeback_limit_show(struct device *dev,
643 				    struct device_attribute *attr, char *buf)
644 {
645 	u64 val;
646 	struct zram *zram = dev_to_zram(dev);
647 
648 	guard(rwsem_read)(&zram->dev_lock);
649 	val = zram->bd_wb_limit;
650 
651 	return sysfs_emit(buf, "%llu\n", val);
652 }
653 
654 static ssize_t writeback_batch_size_store(struct device *dev,
655 					  struct device_attribute *attr,
656 					  const char *buf, size_t len)
657 {
658 	struct zram *zram = dev_to_zram(dev);
659 	u32 val;
660 
661 	if (kstrtouint(buf, 10, &val))
662 		return -EINVAL;
663 
664 	if (!val)
665 		return -EINVAL;
666 
667 	guard(rwsem_write)(&zram->dev_lock);
668 	zram->wb_batch_size = val;
669 
670 	return len;
671 }
672 
673 static ssize_t writeback_batch_size_show(struct device *dev,
674 					 struct device_attribute *attr,
675 					 char *buf)
676 {
677 	u32 val;
678 	struct zram *zram = dev_to_zram(dev);
679 
680 	guard(rwsem_read)(&zram->dev_lock);
681 	val = zram->wb_batch_size;
682 
683 	return sysfs_emit(buf, "%u\n", val);
684 }
685 
686 static void reset_bdev(struct zram *zram)
687 {
688 	if (!zram->backing_dev)
689 		return;
690 
691 	/* hope filp_close flush all of IO */
692 	filp_close(zram->backing_dev, NULL);
693 	zram->backing_dev = NULL;
694 	zram->bdev = NULL;
695 	zram->disk->fops = &zram_devops;
696 	kvfree(zram->bitmap);
697 	zram->bitmap = NULL;
698 }
699 
700 static ssize_t backing_dev_show(struct device *dev,
701 				struct device_attribute *attr, char *buf)
702 {
703 	struct file *file;
704 	struct zram *zram = dev_to_zram(dev);
705 	char *p;
706 	ssize_t ret;
707 
708 	guard(rwsem_read)(&zram->dev_lock);
709 	file = zram->backing_dev;
710 	if (!file) {
711 		memcpy(buf, "none\n", 5);
712 		return 5;
713 	}
714 
715 	p = file_path(file, buf, PAGE_SIZE - 1);
716 	if (IS_ERR(p))
717 		return PTR_ERR(p);
718 
719 	ret = strlen(p);
720 	memmove(buf, p, ret);
721 	buf[ret++] = '\n';
722 	return ret;
723 }
724 
725 static ssize_t backing_dev_store(struct device *dev,
726 				 struct device_attribute *attr, const char *buf,
727 				 size_t len)
728 {
729 	char *file_name;
730 	size_t sz;
731 	struct file *backing_dev = NULL;
732 	struct inode *inode;
733 	unsigned int bitmap_sz;
734 	unsigned long nr_pages, *bitmap = NULL;
735 	int err;
736 	struct zram *zram = dev_to_zram(dev);
737 
738 	file_name = kmalloc(PATH_MAX, GFP_KERNEL);
739 	if (!file_name)
740 		return -ENOMEM;
741 
742 	guard(rwsem_write)(&zram->dev_lock);
743 	if (init_done(zram)) {
744 		pr_info("Can't setup backing device for initialized device\n");
745 		err = -EBUSY;
746 		goto out;
747 	}
748 
749 	strscpy(file_name, buf, PATH_MAX);
750 	/* ignore trailing newline */
751 	sz = strlen(file_name);
752 	if (sz > 0 && file_name[sz - 1] == '\n')
753 		file_name[sz - 1] = 0x00;
754 
755 	backing_dev = filp_open(file_name, O_RDWR | O_LARGEFILE | O_EXCL, 0);
756 	if (IS_ERR(backing_dev)) {
757 		err = PTR_ERR(backing_dev);
758 		backing_dev = NULL;
759 		goto out;
760 	}
761 
762 	inode = backing_dev->f_mapping->host;
763 
764 	/* Support only block device in this moment */
765 	if (!S_ISBLK(inode->i_mode)) {
766 		err = -ENOTBLK;
767 		goto out;
768 	}
769 
770 	nr_pages = i_size_read(inode) >> PAGE_SHIFT;
771 	/* Refuse to use zero sized device (also prevents self reference) */
772 	if (!nr_pages) {
773 		err = -EINVAL;
774 		goto out;
775 	}
776 
777 	bitmap_sz = BITS_TO_LONGS(nr_pages) * sizeof(long);
778 	bitmap = kvzalloc(bitmap_sz, GFP_KERNEL);
779 	if (!bitmap) {
780 		err = -ENOMEM;
781 		goto out;
782 	}
783 
784 	reset_bdev(zram);
785 
786 	zram->bdev = I_BDEV(inode);
787 	zram->backing_dev = backing_dev;
788 	zram->bitmap = bitmap;
789 	zram->nr_pages = nr_pages;
790 
791 	pr_info("setup backing device %s\n", file_name);
792 	kfree(file_name);
793 
794 	return len;
795 out:
796 	kvfree(bitmap);
797 
798 	if (backing_dev)
799 		filp_close(backing_dev, NULL);
800 
801 	kfree(file_name);
802 
803 	return err;
804 }
805 
806 static unsigned long zram_reserve_bdev_block(struct zram *zram)
807 {
808 	unsigned long blk_idx;
809 
810 	blk_idx = find_next_zero_bit(zram->bitmap, zram->nr_pages, 0);
811 	if (blk_idx == zram->nr_pages)
812 		return INVALID_BDEV_BLOCK;
813 
814 	set_bit(blk_idx, zram->bitmap);
815 	atomic64_inc(&zram->stats.bd_count);
816 	return blk_idx;
817 }
818 
819 static void zram_release_bdev_block(struct zram *zram, unsigned long blk_idx)
820 {
821 	int was_set;
822 
823 	was_set = test_and_clear_bit(blk_idx, zram->bitmap);
824 	WARN_ON_ONCE(!was_set);
825 	atomic64_dec(&zram->stats.bd_count);
826 }
827 
828 static void release_wb_req(struct zram_wb_req *req)
829 {
830 	__free_page(req->page);
831 	kfree(req);
832 }
833 
834 static void release_wb_ctl(struct zram_wb_ctl *wb_ctl)
835 {
836 	if (!wb_ctl)
837 		return;
838 
839 	/* We should never have inflight requests at this point */
840 	WARN_ON(atomic_read(&wb_ctl->num_inflight));
841 	WARN_ON(!list_empty(&wb_ctl->done_reqs));
842 
843 	while (!list_empty(&wb_ctl->idle_reqs)) {
844 		struct zram_wb_req *req;
845 
846 		req = list_first_entry(&wb_ctl->idle_reqs,
847 				       struct zram_wb_req, entry);
848 		list_del(&req->entry);
849 		release_wb_req(req);
850 	}
851 
852 	kfree_rcu(wb_ctl, rcu);
853 }
854 
855 static struct zram_wb_ctl *init_wb_ctl(struct zram *zram)
856 {
857 	struct zram_wb_ctl *wb_ctl;
858 	int i;
859 
860 	wb_ctl = kmalloc_obj(*wb_ctl);
861 	if (!wb_ctl)
862 		return NULL;
863 
864 	INIT_LIST_HEAD(&wb_ctl->idle_reqs);
865 	INIT_LIST_HEAD(&wb_ctl->done_reqs);
866 	atomic_set(&wb_ctl->num_inflight, 0);
867 	init_waitqueue_head(&wb_ctl->done_wait);
868 	spin_lock_init(&wb_ctl->done_lock);
869 
870 	for (i = 0; i < zram->wb_batch_size; i++) {
871 		struct zram_wb_req *req;
872 
873 		/*
874 		 * This is fatal condition only if we couldn't allocate
875 		 * any requests at all.  Otherwise we just work with the
876 		 * requests that we have successfully allocated, so that
877 		 * writeback can still proceed, even if there is only one
878 		 * request on the idle list.
879 		 */
880 		req = kzalloc_obj(*req, GFP_KERNEL | __GFP_NOWARN);
881 		if (!req)
882 			break;
883 
884 		req->page = alloc_page(GFP_KERNEL | __GFP_NOWARN);
885 		if (!req->page) {
886 			kfree(req);
887 			break;
888 		}
889 
890 		list_add(&req->entry, &wb_ctl->idle_reqs);
891 	}
892 
893 	/* We couldn't allocate any requests, so writeabck is not possible */
894 	if (list_empty(&wb_ctl->idle_reqs))
895 		goto release_wb_ctl;
896 
897 	return wb_ctl;
898 
899 release_wb_ctl:
900 	release_wb_ctl(wb_ctl);
901 	return NULL;
902 }
903 
904 static void zram_account_writeback_rollback(struct zram *zram)
905 {
906 	lockdep_assert_held_write(&zram->dev_lock);
907 
908 	if (zram->wb_limit_enable)
909 		zram->bd_wb_limit +=  1UL << (PAGE_SHIFT - 12);
910 }
911 
912 static void zram_account_writeback_submit(struct zram *zram)
913 {
914 	lockdep_assert_held_write(&zram->dev_lock);
915 
916 	if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
917 		zram->bd_wb_limit -=  1UL << (PAGE_SHIFT - 12);
918 }
919 
920 static int zram_writeback_complete(struct zram *zram, struct zram_wb_req *req)
921 {
922 	u32 index = req->pps->index;
923 	int err;
924 
925 	err = blk_status_to_errno(req->bio.bi_status);
926 	if (err) {
927 		/*
928 		 * Failed wb requests should not be accounted in wb_limit
929 		 * (if enabled).
930 		 */
931 		zram_account_writeback_rollback(zram);
932 		zram_release_bdev_block(zram, req->blk_idx);
933 		return err;
934 	}
935 
936 	atomic64_inc(&zram->stats.bd_writes);
937 	slot_lock(zram, index);
938 	/*
939 	 * We release slot lock during writeback so slot can change under us:
940 	 * slot_free() or slot_free() and zram_write_page(). In both cases
941 	 * slot loses ZRAM_PP_SLOT flag. No concurrent post-processing can
942 	 * set ZRAM_PP_SLOT on such slots until current post-processing
943 	 * finishes.
944 	 */
945 	if (!test_slot_flag(zram, index, ZRAM_PP_SLOT)) {
946 		zram_release_bdev_block(zram, req->blk_idx);
947 		goto out;
948 	}
949 
950 	clear_slot_flag(zram, index, ZRAM_IDLE);
951 	if (test_slot_flag(zram, index, ZRAM_HUGE))
952 		atomic64_dec(&zram->stats.huge_pages);
953 	atomic64_sub(get_slot_size(zram, index), &zram->stats.compr_data_size);
954 	zs_free(zram->mem_pool, get_slot_handle(zram, index));
955 	set_slot_handle(zram, index, req->blk_idx);
956 	set_slot_flag(zram, index, ZRAM_WB);
957 
958 out:
959 	slot_unlock(zram, index);
960 	return 0;
961 }
962 
963 static void zram_writeback_endio(struct bio *bio)
964 {
965 	struct zram_wb_req *req = container_of(bio, struct zram_wb_req, bio);
966 	struct zram_wb_ctl *wb_ctl = bio->bi_private;
967 	unsigned long flags;
968 
969 	rcu_read_lock();
970 	spin_lock_irqsave(&wb_ctl->done_lock, flags);
971 	list_add(&req->entry, &wb_ctl->done_reqs);
972 	spin_unlock_irqrestore(&wb_ctl->done_lock, flags);
973 
974 	wake_up(&wb_ctl->done_wait);
975 	rcu_read_unlock();
976 }
977 
978 static void zram_submit_wb_request(struct zram *zram,
979 				   struct zram_wb_ctl *wb_ctl,
980 				   struct zram_wb_req *req)
981 {
982 	/*
983 	 * wb_limit (if enabled) should be adjusted before submission,
984 	 * so that we don't over-submit.
985 	 */
986 	zram_account_writeback_submit(zram);
987 	atomic_inc(&wb_ctl->num_inflight);
988 	req->bio.bi_private = wb_ctl;
989 	submit_bio(&req->bio);
990 }
991 
992 static int zram_complete_done_reqs(struct zram *zram,
993 				   struct zram_wb_ctl *wb_ctl)
994 {
995 	struct zram_wb_req *req;
996 	unsigned long flags;
997 	int ret = 0, err;
998 
999 	while (atomic_read(&wb_ctl->num_inflight) > 0) {
1000 		spin_lock_irqsave(&wb_ctl->done_lock, flags);
1001 		req = list_first_entry_or_null(&wb_ctl->done_reqs,
1002 					       struct zram_wb_req, entry);
1003 		if (req)
1004 			list_del(&req->entry);
1005 		spin_unlock_irqrestore(&wb_ctl->done_lock, flags);
1006 
1007 		/* ->num_inflight > 0 doesn't mean we have done requests */
1008 		if (!req)
1009 			break;
1010 
1011 		err = zram_writeback_complete(zram, req);
1012 		if (err)
1013 			ret = err;
1014 
1015 		atomic_dec(&wb_ctl->num_inflight);
1016 		release_pp_slot(zram, req->pps);
1017 		req->pps = NULL;
1018 
1019 		list_add(&req->entry, &wb_ctl->idle_reqs);
1020 	}
1021 
1022 	return ret;
1023 }
1024 
1025 static struct zram_wb_req *zram_select_idle_req(struct zram_wb_ctl *wb_ctl)
1026 {
1027 	struct zram_wb_req *req;
1028 
1029 	req = list_first_entry_or_null(&wb_ctl->idle_reqs,
1030 				       struct zram_wb_req, entry);
1031 	if (req)
1032 		list_del(&req->entry);
1033 	return req;
1034 }
1035 
/*
 * Write back every pp-slot queued in @ctl, highest bucket first.
 *
 * For each candidate an idle request and a backing device block are
 * obtained, the slot's data is read into the request's page and the request
 * is submitted asynchronously. Before returning, the function waits for all
 * in-flight requests to complete.
 *
 * Returns 0 on success; -EIO when the writeback limit is exhausted,
 * -ENOSPC when the backing device has no free block, or the most recent
 * bio error.
 */
static int zram_writeback_slots(struct zram *zram,
				struct zram_pp_ctl *ctl,
				struct zram_wb_ctl *wb_ctl)
{
	unsigned long blk_idx = INVALID_BDEV_BLOCK;
	struct zram_wb_req *req = NULL;
	struct zram_pp_slot *pps;
	int ret = 0, err = 0;
	u32 index = 0;

	while ((pps = select_pp_slot(ctl))) {
		/* Limiting is on and the budget is used up: stop */
		if (zram->wb_limit_enable && !zram->bd_wb_limit) {
			ret = -EIO;
			break;
		}

		/*
		 * A request carried over from a skipped slot is reused;
		 * otherwise take an idle one, waiting for completions when
		 * all requests are in flight.
		 */
		while (!req) {
			req = zram_select_idle_req(wb_ctl);
			if (req)
				break;

			wait_event(wb_ctl->done_wait,
				   !list_empty(&wb_ctl->done_reqs));

			err = zram_complete_done_reqs(zram, wb_ctl);
			/*
			 * BIO errors are not fatal, we continue and simply
			 * attempt to writeback the remaining objects (pages).
			 * At the same time we need to signal user-space that
			 * some writes (at least one, but also could be all of
			 * them) were not successful and we do so by returning
			 * the most recent BIO error.
			 */
			if (err)
				ret = err;
		}

		/* A block reserved for a skipped slot is reused as well */
		if (blk_idx == INVALID_BDEV_BLOCK) {
			blk_idx = zram_reserve_bdev_block(zram);
			if (blk_idx == INVALID_BDEV_BLOCK) {
				ret = -ENOSPC;
				break;
			}
		}

		index = pps->index;
		slot_lock(zram, index);
		/*
		 * scan_slots() sets ZRAM_PP_SLOT and releases slot lock, so
		 * slots can change in the meantime. If slots are accessed or
		 * freed they lose ZRAM_PP_SLOT flag and hence we don't
		 * post-process them.
		 */
		if (!test_slot_flag(zram, index, ZRAM_PP_SLOT))
			goto next;
		if (zram->compressed_wb)
			err = read_from_zspool_raw(zram, req->page, index);
		else
			err = read_from_zspool(zram, req->page, index);
		if (err)
			goto next;
		slot_unlock(zram, index);

		/*
		 * From now on pp-slot is owned by the req, remove it from
		 * its pp bucket.
		 */
		list_del_init(&pps->entry);

		req->blk_idx = blk_idx;
		req->pps = pps;
		bio_init(&req->bio, zram->bdev, &req->bio_vec, 1, REQ_OP_WRITE);
		/* Block index is in pages, bi_sector in 512-byte sectors */
		req->bio.bi_iter.bi_sector = req->blk_idx * (PAGE_SIZE >> 9);
		req->bio.bi_end_io = zram_writeback_endio;
		__bio_add_page(&req->bio, req->page, PAGE_SIZE, 0);

		zram_submit_wb_request(zram, wb_ctl, req);
		/* Both the block and the request now belong to the bio */
		blk_idx = INVALID_BDEV_BLOCK;
		req = NULL;
		cond_resched();
		continue;

next:
		/* Slot skipped: @req and @blk_idx stay for the next slot */
		slot_unlock(zram, index);
		release_pp_slot(zram, pps);
	}

	/*
	 * Selected idle req, but never submitted it due to some error or
	 * wb limit.
	 */
	if (req)
		release_wb_req(req);

	/* Likewise give back a block that was reserved but never used */
	if (blk_idx != INVALID_BDEV_BLOCK)
		zram_release_bdev_block(zram, blk_idx);

	/* Drain: wait for and complete everything still in flight */
	while (atomic_read(&wb_ctl->num_inflight) > 0) {
		wait_event(wb_ctl->done_wait, !list_empty(&wb_ctl->done_reqs));
		err = zram_complete_done_reqs(zram, wb_ctl);
		if (err)
			ret = err;
	}

	return ret;
}
1142 
/*
 * Writeback modes. The non-zero values are bit flags; parse_mode() maps
 * "huge_idle" to IDLE_WRITEBACK | HUGE_WRITEBACK.
 */
#define PAGE_WRITEBACK			0
#define HUGE_WRITEBACK			(1 << 0)
#define IDLE_WRITEBACK			(1 << 1)
#define INCOMPRESSIBLE_WRITEBACK	(1 << 2)
1147 
1148 static int parse_page_index(char *val, unsigned long nr_pages,
1149 			    unsigned long *lo, unsigned long *hi)
1150 {
1151 	int ret;
1152 
1153 	ret = kstrtoul(val, 10, lo);
1154 	if (ret)
1155 		return ret;
1156 	if (*lo >= nr_pages)
1157 		return -ERANGE;
1158 	*hi = *lo + 1;
1159 	return 0;
1160 }
1161 
1162 static int parse_page_indexes(char *val, unsigned long nr_pages,
1163 			      unsigned long *lo, unsigned long *hi)
1164 {
1165 	char *delim;
1166 	int ret;
1167 
1168 	delim = strchr(val, '-');
1169 	if (!delim)
1170 		return -EINVAL;
1171 
1172 	*delim = 0x00;
1173 	ret = kstrtoul(val, 10, lo);
1174 	if (ret)
1175 		return ret;
1176 	if (*lo >= nr_pages)
1177 		return -ERANGE;
1178 
1179 	ret = kstrtoul(delim + 1, 10, hi);
1180 	if (ret)
1181 		return ret;
1182 	if (*hi >= nr_pages || *lo > *hi)
1183 		return -ERANGE;
1184 	*hi += 1;
1185 	return 0;
1186 }
1187 
1188 static int parse_mode(char *val, u32 *mode)
1189 {
1190 	*mode = 0;
1191 
1192 	if (!strcmp(val, "idle"))
1193 		*mode = IDLE_WRITEBACK;
1194 	if (!strcmp(val, "huge"))
1195 		*mode = HUGE_WRITEBACK;
1196 	if (!strcmp(val, "huge_idle"))
1197 		*mode = IDLE_WRITEBACK | HUGE_WRITEBACK;
1198 	if (!strcmp(val, "incompressible"))
1199 		*mode = INCOMPRESSIBLE_WRITEBACK;
1200 
1201 	if (*mode == 0)
1202 		return -EINVAL;
1203 	return 0;
1204 }
1205 
1206 static void scan_slots_for_writeback(struct zram *zram, u32 mode,
1207 				     unsigned long lo, unsigned long hi,
1208 				     struct zram_pp_ctl *ctl)
1209 {
1210 	u32 index = lo;
1211 
1212 	while (index < hi) {
1213 		bool ok = true;
1214 
1215 		slot_lock(zram, index);
1216 		if (!slot_allocated(zram, index))
1217 			goto next;
1218 
1219 		if (test_slot_flag(zram, index, ZRAM_WB) ||
1220 		    test_slot_flag(zram, index, ZRAM_SAME))
1221 			goto next;
1222 
1223 		if (mode & IDLE_WRITEBACK &&
1224 		    !test_slot_flag(zram, index, ZRAM_IDLE))
1225 			goto next;
1226 		if (mode & HUGE_WRITEBACK &&
1227 		    !test_slot_flag(zram, index, ZRAM_HUGE))
1228 			goto next;
1229 		if (mode & INCOMPRESSIBLE_WRITEBACK &&
1230 		    !test_slot_flag(zram, index, ZRAM_INCOMPRESSIBLE))
1231 			goto next;
1232 
1233 		ok = place_pp_slot(zram, ctl, index);
1234 next:
1235 		slot_unlock(zram, index);
1236 		if (!ok)
1237 			break;
1238 		index++;
1239 	}
1240 }
1241 
1242 static ssize_t writeback_store(struct device *dev,
1243 			       struct device_attribute *attr,
1244 			       const char *buf, size_t len)
1245 {
1246 	struct zram *zram = dev_to_zram(dev);
1247 	u64 nr_pages = zram->disksize >> PAGE_SHIFT;
1248 	unsigned long lo = 0, hi = nr_pages;
1249 	struct zram_pp_ctl *pp_ctl = NULL;
1250 	struct zram_wb_ctl *wb_ctl = NULL;
1251 	char *args, *param, *val;
1252 	ssize_t ret = len;
1253 	int err, mode = 0;
1254 
1255 	guard(rwsem_write)(&zram->dev_lock);
1256 	if (!init_done(zram))
1257 		return -EINVAL;
1258 
1259 	if (!zram->backing_dev)
1260 		return -ENODEV;
1261 
1262 	pp_ctl = init_pp_ctl();
1263 	if (!pp_ctl)
1264 		return -ENOMEM;
1265 
1266 	wb_ctl = init_wb_ctl(zram);
1267 	if (!wb_ctl) {
1268 		ret = -ENOMEM;
1269 		goto out;
1270 	}
1271 
1272 	args = skip_spaces(buf);
1273 	while (*args) {
1274 		args = next_arg(args, &param, &val);
1275 
1276 		/*
1277 		 * Workaround to support the old writeback interface.
1278 		 *
1279 		 * The old writeback interface has a minor inconsistency and
1280 		 * requires key=value only for page_index parameter, while the
1281 		 * writeback mode is a valueless parameter.
1282 		 *
1283 		 * This is not the case anymore and now all parameters are
1284 		 * required to have values, however, we need to support the
1285 		 * legacy writeback interface format so we check if we can
1286 		 * recognize a valueless parameter as the (legacy) writeback
1287 		 * mode.
1288 		 */
1289 		if (!val || !*val) {
1290 			err = parse_mode(param, &mode);
1291 			if (err) {
1292 				ret = err;
1293 				goto out;
1294 			}
1295 
1296 			scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl);
1297 			break;
1298 		}
1299 
1300 		if (!strcmp(param, "type")) {
1301 			err = parse_mode(val, &mode);
1302 			if (err) {
1303 				ret = err;
1304 				goto out;
1305 			}
1306 
1307 			scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl);
1308 			break;
1309 		}
1310 
1311 		if (!strcmp(param, "page_index")) {
1312 			err = parse_page_index(val, nr_pages, &lo, &hi);
1313 			if (err) {
1314 				ret = err;
1315 				goto out;
1316 			}
1317 
1318 			scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl);
1319 			continue;
1320 		}
1321 
1322 		if (!strcmp(param, "page_indexes")) {
1323 			err = parse_page_indexes(val, nr_pages, &lo, &hi);
1324 			if (err) {
1325 				ret = err;
1326 				goto out;
1327 			}
1328 
1329 			scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl);
1330 			continue;
1331 		}
1332 	}
1333 
1334 	err = zram_writeback_slots(zram, pp_ctl, wb_ctl);
1335 	if (err)
1336 		ret = err;
1337 
1338 out:
1339 	release_pp_ctl(zram, pp_ctl);
1340 	release_wb_ctl(wb_ctl);
1341 
1342 	return ret;
1343 }
1344 
1345 static int decompress_bdev_page(struct zram *zram, struct page *page, u32 index)
1346 {
1347 	struct zcomp_strm *zstrm;
1348 	unsigned int size;
1349 	int ret, prio;
1350 	void *src;
1351 
1352 	slot_lock(zram, index);
1353 	/* Since slot was unlocked we need to make sure it's still ZRAM_WB */
1354 	if (!test_slot_flag(zram, index, ZRAM_WB)) {
1355 		slot_unlock(zram, index);
1356 		/* We read some stale data, zero it out */
1357 		memset_page(page, 0, 0, PAGE_SIZE);
1358 		return -EIO;
1359 	}
1360 
1361 	if (test_slot_flag(zram, index, ZRAM_HUGE)) {
1362 		slot_unlock(zram, index);
1363 		return 0;
1364 	}
1365 
1366 	size = get_slot_size(zram, index);
1367 	prio = get_slot_comp_priority(zram, index);
1368 
1369 	zstrm = zcomp_stream_get(zram->comps[prio]);
1370 	src = kmap_local_page(page);
1371 	ret = zcomp_decompress(zram->comps[prio], zstrm, src, size,
1372 			       zstrm->local_copy);
1373 	if (!ret)
1374 		copy_page(src, zstrm->local_copy);
1375 	kunmap_local(src);
1376 	zcomp_stream_put(zstrm);
1377 	slot_unlock(zram, index);
1378 
1379 	return ret;
1380 }
1381 
1382 static void zram_deferred_decompress(struct work_struct *w)
1383 {
1384 	struct zram_rb_req *req = container_of(w, struct zram_rb_req, work);
1385 	struct page *page = bio_first_page_all(req->bio);
1386 	struct zram *zram = req->zram;
1387 	u32 index = req->index;
1388 	int ret;
1389 
1390 	ret = decompress_bdev_page(zram, page, index);
1391 	if (ret)
1392 		req->parent->bi_status = BLK_STS_IOERR;
1393 
1394 	/* Decrement parent's ->remaining */
1395 	bio_endio(req->parent);
1396 	bio_put(req->bio);
1397 	kfree(req);
1398 }
1399 
1400 static void zram_async_read_endio(struct bio *bio)
1401 {
1402 	struct zram_rb_req *req = bio->bi_private;
1403 	struct zram *zram = req->zram;
1404 
1405 	if (bio->bi_status) {
1406 		req->parent->bi_status = bio->bi_status;
1407 		bio_endio(req->parent);
1408 		bio_put(bio);
1409 		kfree(req);
1410 		return;
1411 	}
1412 
1413 	/*
1414 	 * NOTE: zram_async_read_endio() is not exactly right place for this.
1415 	 * Ideally, we need to do it after ZRAM_WB check, but this requires
1416 	 * us to use wq path even on systems that don't enable compressed
1417 	 * writeback, because we cannot take slot-lock in the current context.
1418 	 *
1419 	 * Keep the existing behavior for now.
1420 	 */
1421 	if (zram->compressed_wb == false) {
1422 		/* No decompression needed, complete the parent IO */
1423 		bio_endio(req->parent);
1424 		bio_put(bio);
1425 		kfree(req);
1426 		return;
1427 	}
1428 
1429 	/*
1430 	 * zram decompression is sleepable, so we need to deffer it to
1431 	 * a preemptible context.
1432 	 */
1433 	INIT_WORK(&req->work, zram_deferred_decompress);
1434 	queue_work(system_highpri_wq, &req->work);
1435 }
1436 
1437 static int read_from_bdev_async(struct zram *zram, struct page *page,
1438 				u32 index, unsigned long blk_idx,
1439 				struct bio *parent)
1440 {
1441 	struct zram_rb_req *req;
1442 	struct bio *bio;
1443 
1444 	req = kmalloc_obj(*req, GFP_NOIO);
1445 	if (!req)
1446 		return -ENOMEM;
1447 
1448 	bio = bio_alloc(zram->bdev, 1, parent->bi_opf, GFP_NOIO);
1449 	if (!bio) {
1450 		kfree(req);
1451 		return -ENOMEM;
1452 	}
1453 
1454 	req->zram = zram;
1455 	req->index = index;
1456 	req->blk_idx = blk_idx;
1457 	req->bio = bio;
1458 	req->parent = parent;
1459 
1460 	bio->bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9);
1461 	bio->bi_private = req;
1462 	bio->bi_end_io = zram_async_read_endio;
1463 
1464 	__bio_add_page(bio, page, PAGE_SIZE, 0);
1465 	bio_inc_remaining(parent);
1466 	submit_bio(bio);
1467 
1468 	return 0;
1469 }
1470 
1471 static void zram_sync_read(struct work_struct *w)
1472 {
1473 	struct zram_rb_req *req = container_of(w, struct zram_rb_req, work);
1474 	struct bio_vec bv;
1475 	struct bio bio;
1476 
1477 	bio_init(&bio, req->zram->bdev, &bv, 1, REQ_OP_READ);
1478 	bio.bi_iter.bi_sector = req->blk_idx * (PAGE_SIZE >> 9);
1479 	__bio_add_page(&bio, req->page, PAGE_SIZE, 0);
1480 	req->error = submit_bio_wait(&bio);
1481 }
1482 
1483 /*
1484  * Block layer want one ->submit_bio to be active at a time, so if we use
1485  * chained IO with parent IO in same context, it's a deadlock. To avoid that,
1486  * use a worker thread context.
1487  */
1488 static int read_from_bdev_sync(struct zram *zram, struct page *page, u32 index,
1489 			       unsigned long blk_idx)
1490 {
1491 	struct zram_rb_req req;
1492 
1493 	req.page = page;
1494 	req.zram = zram;
1495 	req.blk_idx = blk_idx;
1496 
1497 	INIT_WORK_ONSTACK(&req.work, zram_sync_read);
1498 	queue_work(system_dfl_wq, &req.work);
1499 	flush_work(&req.work);
1500 	destroy_work_on_stack(&req.work);
1501 
1502 	if (req.error || zram->compressed_wb == false)
1503 		return req.error;
1504 
1505 	return decompress_bdev_page(zram, page, index);
1506 }
1507 
1508 static int read_from_bdev(struct zram *zram, struct page *page, u32 index,
1509 			  unsigned long blk_idx, struct bio *parent)
1510 {
1511 	atomic64_inc(&zram->stats.bd_reads);
1512 	if (!parent) {
1513 		if (WARN_ON_ONCE(!IS_ENABLED(ZRAM_PARTIAL_IO)))
1514 			return -EIO;
1515 		return read_from_bdev_sync(zram, page, index, blk_idx);
1516 	}
1517 	return read_from_bdev_async(zram, page, index, blk_idx, parent);
1518 }
1519 #else
/* No-op stub for the #else branch of the preceding conditional block. */
static inline void reset_bdev(struct zram *zram)
{
}
/*
 * Stub for the #else branch of the preceding conditional block: there is
 * nothing to read from, so every call fails with -EIO.
 */
static int read_from_bdev(struct zram *zram, struct page *page, u32 index,
			  unsigned long blk_idx, struct bio *parent)
{
	return -EIO;
}
1526 
/* No-op stub for the #else branch of the preceding conditional block. */
static void zram_release_bdev_block(struct zram *zram, unsigned long blk_idx)
{
}
1530 #endif
1531 
1532 #ifdef CONFIG_ZRAM_MEMORY_TRACKING
1533 
1534 static struct dentry *zram_debugfs_root;
1535 
/* Create the top-level "zram" debugfs directory shared by all devices. */
static void zram_debugfs_create(void)
{
	zram_debugfs_root = debugfs_create_dir("zram", NULL);
}
1540 
/* Remove the top-level "zram" debugfs directory and everything below it. */
static void zram_debugfs_destroy(void)
{
	debugfs_remove_recursive(zram_debugfs_root);
}
1545 
1546 static ssize_t read_block_state(struct file *file, char __user *buf,
1547 				size_t count, loff_t *ppos)
1548 {
1549 	char *kbuf;
1550 	ssize_t index, written = 0;
1551 	struct zram *zram = file->private_data;
1552 	unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
1553 
1554 	kbuf = kvmalloc(count, GFP_KERNEL);
1555 	if (!kbuf)
1556 		return -ENOMEM;
1557 
1558 	guard(rwsem_read)(&zram->dev_lock);
1559 	if (!init_done(zram)) {
1560 		kvfree(kbuf);
1561 		return -EINVAL;
1562 	}
1563 
1564 	for (index = *ppos; index < nr_pages; index++) {
1565 		int copied;
1566 
1567 		slot_lock(zram, index);
1568 		if (!slot_allocated(zram, index))
1569 			goto next;
1570 
1571 		copied = snprintf(kbuf + written, count,
1572 			"%12zd %12u.%06d %c%c%c%c%c%c\n",
1573 			index, zram->table[index].attr.ac_time, 0,
1574 			test_slot_flag(zram, index, ZRAM_SAME) ? 's' : '.',
1575 			test_slot_flag(zram, index, ZRAM_WB) ? 'w' : '.',
1576 			test_slot_flag(zram, index, ZRAM_HUGE) ? 'h' : '.',
1577 			test_slot_flag(zram, index, ZRAM_IDLE) ? 'i' : '.',
1578 			get_slot_comp_priority(zram, index) ? 'r' : '.',
1579 			test_slot_flag(zram, index,
1580 				       ZRAM_INCOMPRESSIBLE) ? 'n' : '.');
1581 
1582 		if (count <= copied) {
1583 			slot_unlock(zram, index);
1584 			break;
1585 		}
1586 		written += copied;
1587 		count -= copied;
1588 next:
1589 		slot_unlock(zram, index);
1590 		*ppos += 1;
1591 	}
1592 
1593 	if (copy_to_user(buf, kbuf, written))
1594 		written = -EFAULT;
1595 	kvfree(kbuf);
1596 
1597 	return written;
1598 }
1599 
/*
 * File operations for the per-device "block_state" debugfs file created
 * by zram_debugfs_register().
 */
static const struct file_operations proc_zram_block_state_op = {
	.open = simple_open,
	.read = read_block_state,
	.llseek = default_llseek,
};
1605 
1606 static void zram_debugfs_register(struct zram *zram)
1607 {
1608 	if (!zram_debugfs_root)
1609 		return;
1610 
1611 	zram->debugfs_dir = debugfs_create_dir(zram->disk->disk_name,
1612 						zram_debugfs_root);
1613 	debugfs_create_file("block_state", 0400, zram->debugfs_dir,
1614 				zram, &proc_zram_block_state_op);
1615 }
1616 
/* Remove the per-device debugfs directory and the files inside it. */
static void zram_debugfs_unregister(struct zram *zram)
{
	debugfs_remove_recursive(zram->debugfs_dir);
}
1621 #else
/* debugfs support is compiled out: all hooks are no-ops. */
static void zram_debugfs_create(void)
{
}

static void zram_debugfs_destroy(void)
{
}

static void zram_debugfs_register(struct zram *zram)
{
}

static void zram_debugfs_unregister(struct zram *zram)
{
}
1626 #endif
1627 
1628 /* Only algo parameter given, lookup by algo name */
1629 static int lookup_algo_priority(struct zram *zram, const char *algo,
1630 				u32 min_prio)
1631 {
1632 	s32 prio;
1633 
1634 	for (prio = min_prio; prio < ZRAM_MAX_COMPS; prio++) {
1635 		if (!zram->comp_algs[prio])
1636 			continue;
1637 
1638 		if (!strcmp(zram->comp_algs[prio], algo))
1639 			return prio;
1640 	}
1641 
1642 	return -EINVAL;
1643 }
1644 
1645 /* Both algo and priority parameters given, validate them */
1646 static int validate_algo_priority(struct zram *zram, const char *algo, u32 prio)
1647 {
1648 	if (prio >= ZRAM_MAX_COMPS)
1649 		return -EINVAL;
1650 	/* No algo at given priority */
1651 	if (!zram->comp_algs[prio])
1652 		return -EINVAL;
1653 	/* A different algo at given priority */
1654 	if (strcmp(zram->comp_algs[prio], algo))
1655 		return -EINVAL;
1656 	return 0;
1657 }
1658 
/*
 * Record @alg as the algorithm name for priority @prio. Only the pointer
 * is stored; the string is not copied.
 */
static void comp_algorithm_set(struct zram *zram, u32 prio, const char *alg)
{
	zram->comp_algs[prio] = alg;
}
1663 
1664 static int __comp_algorithm_store(struct zram *zram, u32 prio, const char *buf)
1665 {
1666 	const char *alg;
1667 	size_t sz;
1668 
1669 	sz = strlen(buf);
1670 	if (sz >= ZRAM_MAX_ALGO_NAME_SZ)
1671 		return -E2BIG;
1672 
1673 	alg = zcomp_lookup_backend_name(buf);
1674 	if (!alg)
1675 		return -EINVAL;
1676 
1677 	guard(rwsem_write)(&zram->dev_lock);
1678 	if (init_done(zram)) {
1679 		pr_info("Can't change algorithm for initialized device\n");
1680 		return -EBUSY;
1681 	}
1682 
1683 	comp_algorithm_set(zram, prio, alg);
1684 	return 0;
1685 }
1686 
1687 static void comp_params_reset(struct zram *zram, u32 prio)
1688 {
1689 	struct zcomp_params *params = &zram->params[prio];
1690 
1691 	vfree(params->dict);
1692 	params->level = ZCOMP_PARAM_NOT_SET;
1693 	params->deflate.winbits = ZCOMP_PARAM_NOT_SET;
1694 	params->dict_sz = 0;
1695 	params->dict = NULL;
1696 }
1697 
1698 static int comp_params_store(struct zram *zram, u32 prio, s32 level,
1699 			     const char *dict_path,
1700 			     struct deflate_params *deflate_params)
1701 {
1702 	ssize_t sz = 0;
1703 
1704 	comp_params_reset(zram, prio);
1705 
1706 	if (dict_path) {
1707 		sz = kernel_read_file_from_path(dict_path, 0,
1708 						&zram->params[prio].dict,
1709 						INT_MAX,
1710 						NULL,
1711 						READING_POLICY);
1712 		if (sz < 0)
1713 			return -EINVAL;
1714 	}
1715 
1716 	zram->params[prio].dict_sz = sz;
1717 	zram->params[prio].level = level;
1718 	zram->params[prio].deflate.winbits = deflate_params->winbits;
1719 	return 0;
1720 }
1721 
1722 static ssize_t algorithm_params_store(struct device *dev,
1723 				      struct device_attribute *attr,
1724 				      const char *buf,
1725 				      size_t len)
1726 {
1727 	s32 prio = ZRAM_PRIMARY_COMP, level = ZCOMP_PARAM_NOT_SET;
1728 	char *args, *param, *val, *algo = NULL, *dict_path = NULL;
1729 	struct deflate_params deflate_params;
1730 	struct zram *zram = dev_to_zram(dev);
1731 	bool prio_param = false;
1732 	int ret;
1733 
1734 	deflate_params.winbits = ZCOMP_PARAM_NOT_SET;
1735 
1736 	args = skip_spaces(buf);
1737 	while (*args) {
1738 		args = next_arg(args, &param, &val);
1739 
1740 		if (!val || !*val)
1741 			return -EINVAL;
1742 
1743 		if (!strcmp(param, "priority")) {
1744 			prio_param = true;
1745 			ret = kstrtoint(val, 10, &prio);
1746 			if (ret)
1747 				return ret;
1748 			continue;
1749 		}
1750 
1751 		if (!strcmp(param, "level")) {
1752 			ret = kstrtoint(val, 10, &level);
1753 			if (ret)
1754 				return ret;
1755 			continue;
1756 		}
1757 
1758 		if (!strcmp(param, "algo")) {
1759 			algo = val;
1760 			continue;
1761 		}
1762 
1763 		if (!strcmp(param, "dict")) {
1764 			dict_path = val;
1765 			continue;
1766 		}
1767 
1768 		if (!strcmp(param, "deflate.winbits")) {
1769 			ret = kstrtoint(val, 10, &deflate_params.winbits);
1770 			if (ret)
1771 				return ret;
1772 			continue;
1773 		}
1774 	}
1775 
1776 	guard(rwsem_write)(&zram->dev_lock);
1777 	if (init_done(zram))
1778 		return -EBUSY;
1779 
1780 	if (prio_param) {
1781 		if (prio < ZRAM_PRIMARY_COMP || prio >= ZRAM_MAX_COMPS)
1782 			return -EINVAL;
1783 	}
1784 
1785 	if (algo && prio_param) {
1786 		ret = validate_algo_priority(zram, algo, prio);
1787 		if (ret)
1788 			return ret;
1789 	}
1790 
1791 	if (algo && !prio_param) {
1792 		prio = lookup_algo_priority(zram, algo, ZRAM_PRIMARY_COMP);
1793 		if (prio < 0)
1794 			return -EINVAL;
1795 	}
1796 
1797 	ret = comp_params_store(zram, prio, level, dict_path, &deflate_params);
1798 	return ret ? ret : len;
1799 }
1800 
1801 static ssize_t comp_algorithm_show(struct device *dev,
1802 				   struct device_attribute *attr,
1803 				   char *buf)
1804 {
1805 	struct zram *zram = dev_to_zram(dev);
1806 	ssize_t sz;
1807 
1808 	guard(rwsem_read)(&zram->dev_lock);
1809 	sz = zcomp_available_show(zram->comp_algs[ZRAM_PRIMARY_COMP], buf, 0);
1810 	return sz;
1811 }
1812 
1813 static ssize_t comp_algorithm_store(struct device *dev,
1814 				    struct device_attribute *attr,
1815 				    const char *buf,
1816 				    size_t len)
1817 {
1818 	struct zram *zram = dev_to_zram(dev);
1819 	int ret;
1820 
1821 	ret = __comp_algorithm_store(zram, ZRAM_PRIMARY_COMP, buf);
1822 	return ret ? ret : len;
1823 }
1824 
1825 #ifdef CONFIG_ZRAM_MULTI_COMP
1826 static ssize_t recomp_algorithm_show(struct device *dev,
1827 				     struct device_attribute *attr,
1828 				     char *buf)
1829 {
1830 	struct zram *zram = dev_to_zram(dev);
1831 	ssize_t sz = 0;
1832 	u32 prio;
1833 
1834 	guard(rwsem_read)(&zram->dev_lock);
1835 	for (prio = ZRAM_SECONDARY_COMP; prio < ZRAM_MAX_COMPS; prio++) {
1836 		if (!zram->comp_algs[prio])
1837 			continue;
1838 
1839 		sz += sysfs_emit_at(buf, sz, "#%d: ", prio);
1840 		sz += zcomp_available_show(zram->comp_algs[prio], buf, sz);
1841 	}
1842 	return sz;
1843 }
1844 
1845 static ssize_t recomp_algorithm_store(struct device *dev,
1846 				      struct device_attribute *attr,
1847 				      const char *buf,
1848 				      size_t len)
1849 {
1850 	struct zram *zram = dev_to_zram(dev);
1851 	int prio = ZRAM_SECONDARY_COMP;
1852 	char *args, *param, *val;
1853 	char *alg = NULL;
1854 	int ret;
1855 
1856 	args = skip_spaces(buf);
1857 	while (*args) {
1858 		args = next_arg(args, &param, &val);
1859 
1860 		if (!val || !*val)
1861 			return -EINVAL;
1862 
1863 		if (!strcmp(param, "algo")) {
1864 			alg = val;
1865 			continue;
1866 		}
1867 
1868 		if (!strcmp(param, "priority")) {
1869 			ret = kstrtoint(val, 10, &prio);
1870 			if (ret)
1871 				return ret;
1872 			continue;
1873 		}
1874 	}
1875 
1876 	if (!alg)
1877 		return -EINVAL;
1878 
1879 	if (prio < ZRAM_SECONDARY_COMP || prio >= ZRAM_MAX_COMPS)
1880 		return -EINVAL;
1881 
1882 	ret = __comp_algorithm_store(zram, prio, alg);
1883 	return ret ? ret : len;
1884 }
1885 #endif
1886 
1887 static ssize_t compact_store(struct device *dev, struct device_attribute *attr,
1888 			     const char *buf, size_t len)
1889 {
1890 	struct zram *zram = dev_to_zram(dev);
1891 
1892 	guard(rwsem_read)(&zram->dev_lock);
1893 	if (!init_done(zram))
1894 		return -EINVAL;
1895 
1896 	zs_compact(zram->mem_pool);
1897 
1898 	return len;
1899 }
1900 
1901 static ssize_t io_stat_show(struct device *dev, struct device_attribute *attr,
1902 			    char *buf)
1903 {
1904 	struct zram *zram = dev_to_zram(dev);
1905 	ssize_t ret;
1906 
1907 	guard(rwsem_read)(&zram->dev_lock);
1908 	ret = sysfs_emit(buf,
1909 			"%8llu %8llu 0 %8llu\n",
1910 			(u64)atomic64_read(&zram->stats.failed_reads),
1911 			(u64)atomic64_read(&zram->stats.failed_writes),
1912 			(u64)atomic64_read(&zram->stats.notify_free));
1913 
1914 	return ret;
1915 }
1916 
1917 static ssize_t mm_stat_show(struct device *dev, struct device_attribute *attr,
1918 			    char *buf)
1919 {
1920 	struct zram *zram = dev_to_zram(dev);
1921 	struct zs_pool_stats pool_stats;
1922 	u64 orig_size, mem_used = 0;
1923 	long max_used;
1924 	ssize_t ret;
1925 
1926 	memset(&pool_stats, 0x00, sizeof(struct zs_pool_stats));
1927 
1928 	guard(rwsem_read)(&zram->dev_lock);
1929 	if (init_done(zram)) {
1930 		mem_used = zs_get_total_pages(zram->mem_pool);
1931 		zs_pool_stats(zram->mem_pool, &pool_stats);
1932 	}
1933 
1934 	orig_size = atomic64_read(&zram->stats.pages_stored);
1935 	max_used = atomic_long_read(&zram->stats.max_used_pages);
1936 
1937 	ret = sysfs_emit(buf,
1938 			"%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n",
1939 			orig_size << PAGE_SHIFT,
1940 			(u64)atomic64_read(&zram->stats.compr_data_size),
1941 			mem_used << PAGE_SHIFT,
1942 			zram->limit_pages << PAGE_SHIFT,
1943 			max_used << PAGE_SHIFT,
1944 			(u64)atomic64_read(&zram->stats.same_pages),
1945 			atomic_long_read(&pool_stats.pages_compacted),
1946 			(u64)atomic64_read(&zram->stats.huge_pages),
1947 			(u64)atomic64_read(&zram->stats.huge_pages_since));
1948 
1949 	return ret;
1950 }
1951 
1952 static ssize_t debug_stat_show(struct device *dev,
1953 			       struct device_attribute *attr, char *buf)
1954 {
1955 	int version = 1;
1956 	struct zram *zram = dev_to_zram(dev);
1957 	ssize_t ret;
1958 
1959 	guard(rwsem_read)(&zram->dev_lock);
1960 	ret = sysfs_emit(buf,
1961 			"version: %d\n0 %8llu\n",
1962 			version,
1963 			(u64)atomic64_read(&zram->stats.miss_free));
1964 
1965 	return ret;
1966 }
1967 
1968 static void zram_meta_free(struct zram *zram, u64 disksize)
1969 {
1970 	size_t num_pages = disksize >> PAGE_SHIFT;
1971 	size_t index;
1972 
1973 	if (!zram->table)
1974 		return;
1975 
1976 	/* Free all pages that are still in this zram device */
1977 	for (index = 0; index < num_pages; index++)
1978 		slot_free(zram, index);
1979 
1980 	zs_destroy_pool(zram->mem_pool);
1981 	vfree(zram->table);
1982 	zram->table = NULL;
1983 }
1984 
1985 static bool zram_meta_alloc(struct zram *zram, u64 disksize)
1986 {
1987 	size_t num_pages, index;
1988 
1989 	num_pages = disksize >> PAGE_SHIFT;
1990 	zram->table = vzalloc(array_size(num_pages, sizeof(*zram->table)));
1991 	if (!zram->table)
1992 		return false;
1993 
1994 	zram->mem_pool = zs_create_pool(zram->disk->disk_name);
1995 	if (!zram->mem_pool) {
1996 		vfree(zram->table);
1997 		zram->table = NULL;
1998 		return false;
1999 	}
2000 
2001 	if (!huge_class_size)
2002 		huge_class_size = zs_huge_class_size(zram->mem_pool);
2003 
2004 	for (index = 0; index < num_pages; index++)
2005 		slot_lock_init(zram, index);
2006 
2007 	return true;
2008 }
2009 
/*
 * Release whatever slot @index currently holds and reset its metadata.
 *
 * Clears the per-slot flags and compression priority, then releases the
 * backing storage depending on the slot kind: a backing device block for
 * ZRAM_WB slots, nothing for ZRAM_SAME slots, or the memory pool object
 * otherwise. Statistics are adjusted to match. A slot with no handle and
 * neither ZRAM_WB nor ZRAM_SAME set is treated as empty: the function
 * returns without touching pages_stored.
 *
 * NOTE(review): callers in this file take the slot lock around this call,
 * except zram_meta_free() - presumably safe at teardown; confirm.
 */
static void slot_free(struct zram *zram, u32 index)
{
	unsigned long handle;

#ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME
	zram->table[index].attr.ac_time = 0;
#endif

	clear_slot_flag(zram, index, ZRAM_IDLE);
	clear_slot_flag(zram, index, ZRAM_INCOMPRESSIBLE);
	clear_slot_flag(zram, index, ZRAM_PP_SLOT);
	set_slot_comp_priority(zram, index, 0);

	if (test_slot_flag(zram, index, ZRAM_HUGE)) {
		/*
		 * Writeback completion decrements ->huge_pages but keeps
		 * ZRAM_HUGE flag for deferred decompression path.
		 */
		if (!test_slot_flag(zram, index, ZRAM_WB))
			atomic64_dec(&zram->stats.huge_pages);
		clear_slot_flag(zram, index, ZRAM_HUGE);
	}

	/* For a written-back slot the handle is a backing device block index */
	if (test_slot_flag(zram, index, ZRAM_WB)) {
		clear_slot_flag(zram, index, ZRAM_WB);
		zram_release_bdev_block(zram, get_slot_handle(zram, index));
		goto out;
	}

	/*
	 * No memory is allocated for same element filled pages.
	 * Simply clear same page flag.
	 */
	if (test_slot_flag(zram, index, ZRAM_SAME)) {
		clear_slot_flag(zram, index, ZRAM_SAME);
		atomic64_dec(&zram->stats.same_pages);
		goto out;
	}

	/* No handle: the slot holds nothing, so there is nothing to account */
	handle = get_slot_handle(zram, index);
	if (!handle)
		return;

	zs_free(zram->mem_pool, handle);

	atomic64_sub(get_slot_size(zram, index),
		     &zram->stats.compr_data_size);
out:
	/* Common tail for every slot that actually stored a page */
	atomic64_dec(&zram->stats.pages_stored);
	set_slot_handle(zram, index, 0);
	set_slot_size(zram, index, 0);
}
2062 
2063 static int read_same_filled_page(struct zram *zram, struct page *page,
2064 				 u32 index)
2065 {
2066 	void *mem;
2067 
2068 	mem = kmap_local_page(page);
2069 	zram_fill_page(mem, PAGE_SIZE, get_slot_handle(zram, index));
2070 	kunmap_local(mem);
2071 	return 0;
2072 }
2073 
2074 static int read_incompressible_page(struct zram *zram, struct page *page,
2075 				    u32 index)
2076 {
2077 	unsigned long handle;
2078 	void *src, *dst;
2079 
2080 	handle = get_slot_handle(zram, index);
2081 	src = zs_obj_read_begin(zram->mem_pool, handle, PAGE_SIZE, NULL);
2082 	dst = kmap_local_page(page);
2083 	copy_page(dst, src);
2084 	kunmap_local(dst);
2085 	zs_obj_read_end(zram->mem_pool, handle, PAGE_SIZE, src);
2086 
2087 	return 0;
2088 }
2089 
2090 static int read_compressed_page(struct zram *zram, struct page *page, u32 index)
2091 {
2092 	struct zcomp_strm *zstrm;
2093 	unsigned long handle;
2094 	unsigned int size;
2095 	void *src, *dst;
2096 	int ret, prio;
2097 
2098 	handle = get_slot_handle(zram, index);
2099 	size = get_slot_size(zram, index);
2100 	prio = get_slot_comp_priority(zram, index);
2101 
2102 	zstrm = zcomp_stream_get(zram->comps[prio]);
2103 	src = zs_obj_read_begin(zram->mem_pool, handle, size,
2104 				zstrm->local_copy);
2105 	dst = kmap_local_page(page);
2106 	ret = zcomp_decompress(zram->comps[prio], zstrm, src, size, dst);
2107 	kunmap_local(dst);
2108 	zs_obj_read_end(zram->mem_pool, handle, size, src);
2109 	zcomp_stream_put(zstrm);
2110 
2111 	return ret;
2112 }
2113 
2114 #if defined CONFIG_ZRAM_WRITEBACK
2115 static int read_from_zspool_raw(struct zram *zram, struct page *page, u32 index)
2116 {
2117 	struct zcomp_strm *zstrm;
2118 	unsigned long handle;
2119 	unsigned int size;
2120 	void *src;
2121 
2122 	handle = get_slot_handle(zram, index);
2123 	size = get_slot_size(zram, index);
2124 
2125 	/*
2126 	 * We need to get stream just for ->local_copy buffer, in
2127 	 * case if object spans two physical pages. No decompression
2128 	 * takes place here, as we read raw compressed data.
2129 	 */
2130 	zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_COMP]);
2131 	src = zs_obj_read_begin(zram->mem_pool, handle, size,
2132 				zstrm->local_copy);
2133 	memcpy_to_page(page, 0, src, size);
2134 	zs_obj_read_end(zram->mem_pool, handle, size, src);
2135 	zcomp_stream_put(zstrm);
2136 
2137 	memzero_page(page, size, PAGE_SIZE - size);
2138 
2139 	return 0;
2140 }
2141 #endif
2142 
2143 /*
2144  * Reads (decompresses if needed) a page from zspool (zsmalloc).
2145  * Corresponding ZRAM slot should be locked.
2146  */
2147 static int read_from_zspool(struct zram *zram, struct page *page, u32 index)
2148 {
2149 	if (test_slot_flag(zram, index, ZRAM_SAME) ||
2150 	    !get_slot_handle(zram, index))
2151 		return read_same_filled_page(zram, page, index);
2152 
2153 	if (!test_slot_flag(zram, index, ZRAM_HUGE))
2154 		return read_compressed_page(zram, page, index);
2155 	else
2156 		return read_incompressible_page(zram, page, index);
2157 }
2158 
2159 static int zram_read_page(struct zram *zram, struct page *page, u32 index,
2160 			  struct bio *parent)
2161 {
2162 	int ret;
2163 
2164 	slot_lock(zram, index);
2165 	if (!test_slot_flag(zram, index, ZRAM_WB)) {
2166 		/* Slot should be locked through out the function call */
2167 		ret = read_from_zspool(zram, page, index);
2168 		slot_unlock(zram, index);
2169 	} else {
2170 		unsigned long blk_idx = get_slot_handle(zram, index);
2171 
2172 		/*
2173 		 * The slot should be unlocked before reading from the backing
2174 		 * device.
2175 		 */
2176 		slot_unlock(zram, index);
2177 		ret = read_from_bdev(zram, page, index, blk_idx, parent);
2178 	}
2179 
2180 	/* Should NEVER happen. Return bio error if it does. */
2181 	if (WARN_ON(ret < 0))
2182 		pr_err("Decompression failed! err=%d, page=%u\n", ret, index);
2183 
2184 	return ret;
2185 }
2186 
2187 /*
2188  * Use a temporary buffer to decompress the page, as the decompressor
2189  * always expects a full page for the output.
2190  */
2191 static int zram_bvec_read_partial(struct zram *zram, struct bio_vec *bvec,
2192 				  u32 index, int offset)
2193 {
2194 	struct page *page = alloc_page(GFP_NOIO);
2195 	int ret;
2196 
2197 	if (!page)
2198 		return -ENOMEM;
2199 	ret = zram_read_page(zram, page, index, NULL);
2200 	if (likely(!ret))
2201 		memcpy_to_bvec(bvec, page_address(page) + offset);
2202 	__free_page(page);
2203 	return ret;
2204 }
2205 
2206 static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
2207 			  u32 index, int offset, struct bio *bio)
2208 {
2209 	if (is_partial_io(bvec))
2210 		return zram_bvec_read_partial(zram, bvec, index, offset);
2211 	return zram_read_page(zram, bvec->bv_page, index, bio);
2212 }
2213 
/*
 * Store a same-filled page in slot @index.
 *
 * No pool memory is used: the previous slot content is freed, the slot is
 * flagged ZRAM_SAME and the fill value @fill is kept in the handle field.
 * Always returns 0.
 */
static int write_same_filled_page(struct zram *zram, unsigned long fill,
				  u32 index)
{
	slot_lock(zram, index);
	slot_free(zram, index);
	set_slot_flag(zram, index, ZRAM_SAME);
	set_slot_handle(zram, index, fill);
	slot_unlock(zram, index);

	/* Stats are updated after the slot lock is dropped */
	atomic64_inc(&zram->stats.same_pages);
	atomic64_inc(&zram->stats.pages_stored);

	return 0;
}
2228 
2229 static int write_incompressible_page(struct zram *zram, struct page *page,
2230 				     u32 index)
2231 {
2232 	unsigned long handle;
2233 	void *src;
2234 
2235 	/*
2236 	 * This function is called from preemptible context so we don't need
2237 	 * to do optimistic and fallback to pessimistic handle allocation,
2238 	 * like we do for compressible pages.
2239 	 */
2240 	handle = zs_malloc(zram->mem_pool, PAGE_SIZE,
2241 			   GFP_NOIO | __GFP_NOWARN |
2242 			   __GFP_HIGHMEM | __GFP_MOVABLE, page_to_nid(page));
2243 	if (IS_ERR_VALUE(handle))
2244 		return PTR_ERR((void *)handle);
2245 
2246 	if (!zram_can_store_page(zram)) {
2247 		zs_free(zram->mem_pool, handle);
2248 		return -ENOMEM;
2249 	}
2250 
2251 	src = kmap_local_page(page);
2252 	zs_obj_write(zram->mem_pool, handle, src, PAGE_SIZE);
2253 	kunmap_local(src);
2254 
2255 	slot_lock(zram, index);
2256 	slot_free(zram, index);
2257 	set_slot_flag(zram, index, ZRAM_HUGE);
2258 	set_slot_handle(zram, index, handle);
2259 	set_slot_size(zram, index, PAGE_SIZE);
2260 	slot_unlock(zram, index);
2261 
2262 	atomic64_add(PAGE_SIZE, &zram->stats.compr_data_size);
2263 	atomic64_inc(&zram->stats.huge_pages);
2264 	atomic64_inc(&zram->stats.huge_pages_since);
2265 	atomic64_inc(&zram->stats.pages_stored);
2266 
2267 	return 0;
2268 }
2269 
/*
 * Store the full page @page in slot @index.
 *
 * Same-filled pages are stored without pool memory. Other pages are
 * compressed with the primary compressor; if the result is at least
 * huge_class_size bytes the page is stored uncompressed instead,
 * otherwise the compressed data is written to a new pool object that
 * replaces the slot's previous content.
 *
 * Returns 0 on success, the compression or zs_malloc() error, or -ENOMEM
 * when zram_can_store_page() refuses the page.
 */
static int zram_write_page(struct zram *zram, struct page *page, u32 index)
{
	int ret = 0;
	unsigned long handle;
	unsigned int comp_len;
	void *mem;
	struct zcomp_strm *zstrm;
	unsigned long element;
	bool same_filled;

	/* Same-filled pages need neither compression nor a pool object */
	mem = kmap_local_page(page);
	same_filled = page_same_filled(mem, &element);
	kunmap_local(mem);
	if (same_filled)
		return write_same_filled_page(zram, element, index);

	zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_COMP]);
	mem = kmap_local_page(page);
	ret = zcomp_compress(zram->comps[ZRAM_PRIMARY_COMP], zstrm,
			     mem, &comp_len);
	kunmap_local(mem);

	if (unlikely(ret)) {
		zcomp_stream_put(zstrm);
		pr_err("Compression failed! err=%d\n", ret);
		return ret;
	}

	/* Too large to be worth storing compressed: store the page as-is */
	if (comp_len >= huge_class_size) {
		zcomp_stream_put(zstrm);
		return write_incompressible_page(zram, page, index);
	}

	handle = zs_malloc(zram->mem_pool, comp_len,
			   GFP_NOIO | __GFP_NOWARN |
			   __GFP_HIGHMEM | __GFP_MOVABLE, page_to_nid(page));
	if (IS_ERR_VALUE(handle)) {
		zcomp_stream_put(zstrm);
		return PTR_ERR((void *)handle);
	}

	if (!zram_can_store_page(zram)) {
		zcomp_stream_put(zstrm);
		zs_free(zram->mem_pool, handle);
		return -ENOMEM;
	}

	/* The stream is held until its buffer has been copied into the pool */
	zs_obj_write(zram->mem_pool, handle, zstrm->buffer, comp_len);
	zcomp_stream_put(zstrm);

	/* Swap the new object into the slot */
	slot_lock(zram, index);
	slot_free(zram, index);
	set_slot_handle(zram, index, handle);
	set_slot_size(zram, index, comp_len);
	slot_unlock(zram, index);

	/* Update stats */
	atomic64_inc(&zram->stats.pages_stored);
	atomic64_add(comp_len, &zram->stats.compr_data_size);

	return ret;
}
2332 
2333 /*
2334  * This is a partial IO. Read the full page before writing the changes.
2335  */
2336 static int zram_bvec_write_partial(struct zram *zram, struct bio_vec *bvec,
2337 				   u32 index, int offset)
2338 {
2339 	struct page *page = alloc_page(GFP_NOIO);
2340 	int ret;
2341 
2342 	if (!page)
2343 		return -ENOMEM;
2344 
2345 	ret = zram_read_page(zram, page, index, NULL);
2346 	if (!ret) {
2347 		memcpy_from_bvec(page_address(page) + offset, bvec);
2348 		ret = zram_write_page(zram, page, index);
2349 	}
2350 	__free_page(page);
2351 	return ret;
2352 }
2353 
2354 static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
2355 			   u32 index, int offset)
2356 {
2357 	if (is_partial_io(bvec))
2358 		return zram_bvec_write_partial(zram, bvec, index, offset);
2359 	return zram_write_page(zram, bvec->bv_page, index);
2360 }
2361 
2362 #ifdef CONFIG_ZRAM_MULTI_COMP
/* Recompression mode bits; they may be OR-ed together ("huge_idle"). */
#define RECOMPRESS_IDLE		(1 << 0)	/* only slots flagged ZRAM_IDLE */
#define RECOMPRESS_HUGE		(1 << 1)	/* only slots flagged ZRAM_HUGE */
2365 
2366 static bool highest_priority_algorithm(struct zram *zram, u32 prio)
2367 {
2368 	u32 p;
2369 
2370 	for (p = prio + 1; p < ZRAM_MAX_COMPS; p++) {
2371 		if (zram->comp_algs[p])
2372 			return false;
2373 	}
2374 
2375 	return true;
2376 }
2377 
2378 static void scan_slots_for_recompress(struct zram *zram, u32 mode, u32 prio,
2379 				      struct zram_pp_ctl *ctl)
2380 {
2381 	unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
2382 	unsigned long index;
2383 
2384 	for (index = 0; index < nr_pages; index++) {
2385 		bool ok = true;
2386 
2387 		slot_lock(zram, index);
2388 		if (!slot_allocated(zram, index))
2389 			goto next;
2390 
2391 		if (mode & RECOMPRESS_IDLE &&
2392 		    !test_slot_flag(zram, index, ZRAM_IDLE))
2393 			goto next;
2394 
2395 		if (mode & RECOMPRESS_HUGE &&
2396 		    !test_slot_flag(zram, index, ZRAM_HUGE))
2397 			goto next;
2398 
2399 		if (test_slot_flag(zram, index, ZRAM_WB) ||
2400 		    test_slot_flag(zram, index, ZRAM_SAME) ||
2401 		    test_slot_flag(zram, index, ZRAM_INCOMPRESSIBLE))
2402 			goto next;
2403 
2404 		/* Already compressed with same or higher priority */
2405 		if (get_slot_comp_priority(zram, index) >= prio)
2406 			goto next;
2407 
2408 		ok = place_pp_slot(zram, ctl, index);
2409 next:
2410 		slot_unlock(zram, index);
2411 		if (!ok)
2412 			break;
2413 	}
2414 }
2415 
2416 /*
2417  * This function will decompress (unless it's ZRAM_HUGE) the page and then
2418  * attempt to compress it using provided compression algorithm priority
2419  * (which is potentially more effective).
2420  *
2421  * Corresponding ZRAM slot should be locked.
2422  */
2423 static int recompress_slot(struct zram *zram, u32 index, struct page *page,
2424 			   u64 *num_recomp_pages, u32 threshold, u32 prio)
2425 {
2426 	struct zcomp_strm *zstrm = NULL;
2427 	unsigned long handle_old;
2428 	unsigned long handle_new;
2429 	unsigned int comp_len_old;
2430 	unsigned int comp_len_new;
2431 	unsigned int class_index_old;
2432 	unsigned int class_index_new;
2433 	void *src;
2434 	int ret = 0;
2435 
2436 	handle_old = get_slot_handle(zram, index);
2437 	if (!handle_old)
2438 		return -EINVAL;
2439 
2440 	comp_len_old = get_slot_size(zram, index);
2441 	/*
2442 	 * Do not recompress objects that are already "small enough".
2443 	 */
2444 	if (comp_len_old < threshold)
2445 		return 0;
2446 
2447 	ret = read_from_zspool(zram, page, index);
2448 	if (ret)
2449 		return ret;
2450 
2451 	/*
2452 	 * We touched this entry so mark it as non-IDLE. This makes sure that
2453 	 * we don't preserve IDLE flag and don't incorrectly pick this entry
2454 	 * for different post-processing type (e.g. writeback).
2455 	 */
2456 	clear_slot_flag(zram, index, ZRAM_IDLE);
2457 
2458 	zstrm = zcomp_stream_get(zram->comps[prio]);
2459 	src = kmap_local_page(page);
2460 	ret = zcomp_compress(zram->comps[prio], zstrm, src, &comp_len_new);
2461 	kunmap_local(src);
2462 
2463 	/*
2464 	 * Decrement the limit (if set) on pages we can recompress, even
2465 	 * when current recompression was unsuccessful or did not compress
2466 	 * the page below the threshold, because we still spent resources
2467 	 * on it.
2468 	 */
2469 	if (*num_recomp_pages)
2470 		*num_recomp_pages -= 1;
2471 
2472 	if (ret) {
2473 		zcomp_stream_put(zstrm);
2474 		return ret;
2475 	}
2476 
2477 	class_index_old = zs_lookup_class_index(zram->mem_pool, comp_len_old);
2478 	class_index_new = zs_lookup_class_index(zram->mem_pool, comp_len_new);
2479 
2480 	if (class_index_new >= class_index_old ||
2481 	    (threshold && comp_len_new >= threshold)) {
2482 		zcomp_stream_put(zstrm);
2483 
2484 		/*
2485 		 * Secondary algorithms failed to re-compress the page
2486 		 * in a way that would save memory.
2487 		 *
2488 		 * Mark the object incompressible if the max-priority (the
2489 		 * last configured one) algorithm couldn't re-compress it.
2490 		 */
2491 		if (highest_priority_algorithm(zram, prio))
2492 			set_slot_flag(zram, index, ZRAM_INCOMPRESSIBLE);
2493 		return 0;
2494 	}
2495 
2496 	/*
2497 	 * We are holding per-CPU stream mutex and entry lock so better
2498 	 * avoid direct reclaim.  Allocation error is not fatal since
2499 	 * we still have the old object in the mem_pool.
2500 	 *
2501 	 * XXX: technically, the node we really want here is the node that
2502 	 * holds the original compressed data. But that would require us to
2503 	 * modify zsmalloc API to return this information. For now, we will
2504 	 * make do with the node of the page allocated for recompression.
2505 	 */
2506 	handle_new = zs_malloc(zram->mem_pool, comp_len_new,
2507 			       GFP_NOIO | __GFP_NOWARN |
2508 			       __GFP_HIGHMEM | __GFP_MOVABLE,
2509 			       page_to_nid(page));
2510 	if (IS_ERR_VALUE(handle_new)) {
2511 		zcomp_stream_put(zstrm);
2512 		return PTR_ERR((void *)handle_new);
2513 	}
2514 
2515 	zs_obj_write(zram->mem_pool, handle_new, zstrm->buffer, comp_len_new);
2516 	zcomp_stream_put(zstrm);
2517 
2518 	slot_free(zram, index);
2519 	set_slot_handle(zram, index, handle_new);
2520 	set_slot_size(zram, index, comp_len_new);
2521 	set_slot_comp_priority(zram, index, prio);
2522 
2523 	atomic64_add(comp_len_new, &zram->stats.compr_data_size);
2524 	atomic64_inc(&zram->stats.pages_stored);
2525 
2526 	return 0;
2527 }
2528 
2529 static ssize_t recompress_store(struct device *dev,
2530 				struct device_attribute *attr,
2531 				const char *buf, size_t len)
2532 {
2533 	struct zram *zram = dev_to_zram(dev);
2534 	char *args, *param, *val, *algo = NULL;
2535 	u64 num_recomp_pages = ULLONG_MAX;
2536 	struct zram_pp_ctl *ctl = NULL;
2537 	s32 prio = ZRAM_SECONDARY_COMP;
2538 	u32 mode = 0, threshold = 0;
2539 	struct zram_pp_slot *pps;
2540 	struct page *page = NULL;
2541 	bool prio_param = false;
2542 	ssize_t ret;
2543 
2544 	args = skip_spaces(buf);
2545 	while (*args) {
2546 		args = next_arg(args, &param, &val);
2547 
2548 		if (!val || !*val)
2549 			return -EINVAL;
2550 
2551 		if (!strcmp(param, "type")) {
2552 			if (!strcmp(val, "idle"))
2553 				mode = RECOMPRESS_IDLE;
2554 			if (!strcmp(val, "huge"))
2555 				mode = RECOMPRESS_HUGE;
2556 			if (!strcmp(val, "huge_idle"))
2557 				mode = RECOMPRESS_IDLE | RECOMPRESS_HUGE;
2558 			if (!mode)
2559 				return -EINVAL;
2560 			continue;
2561 		}
2562 
2563 		if (!strcmp(param, "max_pages")) {
2564 			/*
2565 			 * Limit the number of entries (pages) we attempt to
2566 			 * recompress.
2567 			 */
2568 			ret = kstrtoull(val, 10, &num_recomp_pages);
2569 			if (ret)
2570 				return ret;
2571 			continue;
2572 		}
2573 
2574 		if (!strcmp(param, "threshold")) {
2575 			/*
2576 			 * We will re-compress only idle objects equal or
2577 			 * greater in size than watermark.
2578 			 */
2579 			ret = kstrtouint(val, 10, &threshold);
2580 			if (ret)
2581 				return ret;
2582 			continue;
2583 		}
2584 
2585 		if (!strcmp(param, "algo")) {
2586 			algo = val;
2587 			continue;
2588 		}
2589 
2590 		if (!strcmp(param, "priority")) {
2591 			prio_param = true;
2592 			ret = kstrtoint(val, 10, &prio);
2593 			if (ret)
2594 				return ret;
2595 			continue;
2596 		}
2597 	}
2598 
2599 	if (threshold >= huge_class_size)
2600 		return -EINVAL;
2601 
2602 	guard(rwsem_write)(&zram->dev_lock);
2603 	if (!init_done(zram))
2604 		return -EINVAL;
2605 
2606 	if (prio_param) {
2607 		if (prio < ZRAM_SECONDARY_COMP || prio >= ZRAM_MAX_COMPS)
2608 			return -EINVAL;
2609 	}
2610 
2611 	if (algo && prio_param) {
2612 		ret = validate_algo_priority(zram, algo, prio);
2613 		if (ret)
2614 			return ret;
2615 	}
2616 
2617 	if (algo && !prio_param) {
2618 		prio = lookup_algo_priority(zram, algo, ZRAM_SECONDARY_COMP);
2619 		if (prio < 0)
2620 			return -EINVAL;
2621 	}
2622 
2623 	if (!zram->comps[prio])
2624 		return -EINVAL;
2625 
2626 	page = alloc_page(GFP_KERNEL);
2627 	if (!page) {
2628 		ret = -ENOMEM;
2629 		goto out;
2630 	}
2631 
2632 	ctl = init_pp_ctl();
2633 	if (!ctl) {
2634 		ret = -ENOMEM;
2635 		goto out;
2636 	}
2637 
2638 	scan_slots_for_recompress(zram, mode, prio, ctl);
2639 
2640 	ret = len;
2641 	while ((pps = select_pp_slot(ctl))) {
2642 		int err = 0;
2643 
2644 		if (!num_recomp_pages)
2645 			break;
2646 
2647 		slot_lock(zram, pps->index);
2648 		if (!test_slot_flag(zram, pps->index, ZRAM_PP_SLOT))
2649 			goto next;
2650 
2651 		err = recompress_slot(zram, pps->index, page,
2652 				      &num_recomp_pages, threshold, prio);
2653 next:
2654 		slot_unlock(zram, pps->index);
2655 		release_pp_slot(zram, pps);
2656 
2657 		if (err) {
2658 			ret = err;
2659 			break;
2660 		}
2661 
2662 		cond_resched();
2663 	}
2664 
2665 out:
2666 	if (page)
2667 		__free_page(page);
2668 	release_pp_ctl(zram, ctl);
2669 	return ret;
2670 }
2671 #endif
2672 
2673 static void zram_bio_discard(struct zram *zram, struct bio *bio)
2674 {
2675 	size_t n = bio->bi_iter.bi_size;
2676 	u32 index = bio->bi_iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
2677 	u32 offset = (bio->bi_iter.bi_sector & (SECTORS_PER_PAGE - 1)) <<
2678 			SECTOR_SHIFT;
2679 
2680 	/*
2681 	 * zram manages data in physical block size units. Because logical block
2682 	 * size isn't identical with physical block size on some arch, we
2683 	 * could get a discard request pointing to a specific offset within a
2684 	 * certain physical block.  Although we can handle this request by
2685 	 * reading that physiclal block and decompressing and partially zeroing
2686 	 * and re-compressing and then re-storing it, this isn't reasonable
2687 	 * because our intent with a discard request is to save memory.  So
2688 	 * skipping this logical block is appropriate here.
2689 	 */
2690 	if (offset) {
2691 		if (n <= (PAGE_SIZE - offset))
2692 			goto end_bio;
2693 
2694 		n -= (PAGE_SIZE - offset);
2695 		index++;
2696 	}
2697 
2698 	while (n >= PAGE_SIZE) {
2699 		slot_lock(zram, index);
2700 		slot_free(zram, index);
2701 		slot_unlock(zram, index);
2702 		atomic64_inc(&zram->stats.notify_free);
2703 		index++;
2704 		n -= PAGE_SIZE;
2705 	}
2706 
2707 end_bio:
2708 	bio_endio(bio);
2709 }
2710 
2711 static void zram_bio_read(struct zram *zram, struct bio *bio)
2712 {
2713 	unsigned long start_time = bio_start_io_acct(bio);
2714 	struct bvec_iter iter = bio->bi_iter;
2715 
2716 	do {
2717 		u32 index = iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
2718 		u32 offset = (iter.bi_sector & (SECTORS_PER_PAGE - 1)) <<
2719 				SECTOR_SHIFT;
2720 		struct bio_vec bv = bio_iter_iovec(bio, iter);
2721 
2722 		bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset);
2723 
2724 		if (zram_bvec_read(zram, &bv, index, offset, bio) < 0) {
2725 			atomic64_inc(&zram->stats.failed_reads);
2726 			bio->bi_status = BLK_STS_IOERR;
2727 			break;
2728 		}
2729 		flush_dcache_page(bv.bv_page);
2730 
2731 		slot_lock(zram, index);
2732 		mark_slot_accessed(zram, index);
2733 		slot_unlock(zram, index);
2734 
2735 		bio_advance_iter_single(bio, &iter, bv.bv_len);
2736 	} while (iter.bi_size);
2737 
2738 	bio_end_io_acct(bio, start_time);
2739 	bio_endio(bio);
2740 }
2741 
2742 static void zram_bio_write(struct zram *zram, struct bio *bio)
2743 {
2744 	unsigned long start_time = bio_start_io_acct(bio);
2745 	struct bvec_iter iter = bio->bi_iter;
2746 
2747 	do {
2748 		u32 index = iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
2749 		u32 offset = (iter.bi_sector & (SECTORS_PER_PAGE - 1)) <<
2750 				SECTOR_SHIFT;
2751 		struct bio_vec bv = bio_iter_iovec(bio, iter);
2752 
2753 		bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset);
2754 
2755 		if (zram_bvec_write(zram, &bv, index, offset) < 0) {
2756 			atomic64_inc(&zram->stats.failed_writes);
2757 			bio->bi_status = BLK_STS_IOERR;
2758 			break;
2759 		}
2760 
2761 		slot_lock(zram, index);
2762 		mark_slot_accessed(zram, index);
2763 		slot_unlock(zram, index);
2764 
2765 		bio_advance_iter_single(bio, &iter, bv.bv_len);
2766 	} while (iter.bi_size);
2767 
2768 	bio_end_io_acct(bio, start_time);
2769 	bio_endio(bio);
2770 }
2771 
2772 /*
2773  * Handler function for all zram I/O requests.
2774  */
2775 static void zram_submit_bio(struct bio *bio)
2776 {
2777 	struct zram *zram = bio->bi_bdev->bd_disk->private_data;
2778 
2779 	switch (bio_op(bio)) {
2780 	case REQ_OP_READ:
2781 		zram_bio_read(zram, bio);
2782 		break;
2783 	case REQ_OP_WRITE:
2784 		zram_bio_write(zram, bio);
2785 		break;
2786 	case REQ_OP_DISCARD:
2787 	case REQ_OP_WRITE_ZEROES:
2788 		zram_bio_discard(zram, bio);
2789 		break;
2790 	default:
2791 		WARN_ON_ONCE(1);
2792 		bio_endio(bio);
2793 	}
2794 }
2795 
2796 static void zram_slot_free_notify(struct block_device *bdev,
2797 				unsigned long index)
2798 {
2799 	struct zram *zram;
2800 
2801 	zram = bdev->bd_disk->private_data;
2802 
2803 	atomic64_inc(&zram->stats.notify_free);
2804 	if (!slot_trylock(zram, index)) {
2805 		atomic64_inc(&zram->stats.miss_free);
2806 		return;
2807 	}
2808 
2809 	slot_free(zram, index);
2810 	slot_unlock(zram, index);
2811 }
2812 
2813 static void zram_comp_params_reset(struct zram *zram)
2814 {
2815 	u32 prio;
2816 
2817 	for (prio = ZRAM_PRIMARY_COMP; prio < ZRAM_MAX_COMPS; prio++) {
2818 		comp_params_reset(zram, prio);
2819 	}
2820 }
2821 
2822 static void zram_destroy_comps(struct zram *zram)
2823 {
2824 	u32 prio;
2825 
2826 	for (prio = ZRAM_PRIMARY_COMP; prio < ZRAM_MAX_COMPS; prio++) {
2827 		struct zcomp *comp = zram->comps[prio];
2828 
2829 		zram->comps[prio] = NULL;
2830 		if (!comp)
2831 			continue;
2832 		zcomp_destroy(comp);
2833 	}
2834 
2835 	for (prio = ZRAM_PRIMARY_COMP; prio < ZRAM_MAX_COMPS; prio++)
2836 		zram->comp_algs[prio] = NULL;
2837 
2838 	zram_comp_params_reset(zram);
2839 }
2840 
2841 static void zram_reset_device(struct zram *zram)
2842 {
2843 	guard(rwsem_write)(&zram->dev_lock);
2844 
2845 	zram->limit_pages = 0;
2846 
2847 	set_capacity_and_notify(zram->disk, 0);
2848 	part_stat_set_all(zram->disk->part0, 0);
2849 
2850 	/* I/O operation under all of CPU are done so let's free */
2851 	zram_meta_free(zram, zram->disksize);
2852 	zram->disksize = 0;
2853 	zram_destroy_comps(zram);
2854 	memset(&zram->stats, 0, sizeof(zram->stats));
2855 	reset_bdev(zram);
2856 
2857 	comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor);
2858 }
2859 
2860 static ssize_t disksize_store(struct device *dev, struct device_attribute *attr,
2861 			      const char *buf, size_t len)
2862 {
2863 	u64 disksize;
2864 	struct zcomp *comp;
2865 	struct zram *zram = dev_to_zram(dev);
2866 	int err;
2867 	u32 prio;
2868 
2869 	disksize = memparse(buf, NULL);
2870 	if (!disksize)
2871 		return -EINVAL;
2872 
2873 	guard(rwsem_write)(&zram->dev_lock);
2874 	if (init_done(zram)) {
2875 		pr_info("Cannot change disksize for initialized device\n");
2876 		return -EBUSY;
2877 	}
2878 
2879 	disksize = PAGE_ALIGN(disksize);
2880 	if (!zram_meta_alloc(zram, disksize))
2881 		return -ENOMEM;
2882 
2883 	for (prio = ZRAM_PRIMARY_COMP; prio < ZRAM_MAX_COMPS; prio++) {
2884 		if (!zram->comp_algs[prio])
2885 			continue;
2886 
2887 		comp = zcomp_create(zram->comp_algs[prio],
2888 				    &zram->params[prio]);
2889 		if (IS_ERR(comp)) {
2890 			pr_err("Cannot initialise %s compressing backend\n",
2891 			       zram->comp_algs[prio]);
2892 			err = PTR_ERR(comp);
2893 			goto out_free_comps;
2894 		}
2895 
2896 		zram->comps[prio] = comp;
2897 	}
2898 	zram->disksize = disksize;
2899 	set_capacity_and_notify(zram->disk, zram->disksize >> SECTOR_SHIFT);
2900 
2901 	return len;
2902 
2903 out_free_comps:
2904 	zram_destroy_comps(zram);
2905 	zram_meta_free(zram, disksize);
2906 	return err;
2907 }
2908 
2909 static ssize_t reset_store(struct device *dev,
2910 		struct device_attribute *attr, const char *buf, size_t len)
2911 {
2912 	int ret;
2913 	unsigned short do_reset;
2914 	struct zram *zram;
2915 	struct gendisk *disk;
2916 
2917 	ret = kstrtou16(buf, 10, &do_reset);
2918 	if (ret)
2919 		return ret;
2920 
2921 	if (!do_reset)
2922 		return -EINVAL;
2923 
2924 	zram = dev_to_zram(dev);
2925 	disk = zram->disk;
2926 
2927 	mutex_lock(&disk->open_mutex);
2928 	/* Do not reset an active device or claimed device */
2929 	if (disk_openers(disk) || zram->claim) {
2930 		mutex_unlock(&disk->open_mutex);
2931 		return -EBUSY;
2932 	}
2933 
2934 	/* From now on, anyone can't open /dev/zram[0-9] */
2935 	zram->claim = true;
2936 	mutex_unlock(&disk->open_mutex);
2937 
2938 	/* Make sure all the pending I/O are finished */
2939 	sync_blockdev(disk->part0);
2940 	zram_reset_device(zram);
2941 
2942 	mutex_lock(&disk->open_mutex);
2943 	zram->claim = false;
2944 	mutex_unlock(&disk->open_mutex);
2945 
2946 	return len;
2947 }
2948 
2949 static int zram_open(struct gendisk *disk, blk_mode_t mode)
2950 {
2951 	struct zram *zram = disk->private_data;
2952 
2953 	WARN_ON(!mutex_is_locked(&disk->open_mutex));
2954 
2955 	/* zram was claimed to reset so open request fails */
2956 	if (zram->claim)
2957 		return -EBUSY;
2958 	return 0;
2959 }
2960 
2961 static const struct block_device_operations zram_devops = {
2962 	.open = zram_open,
2963 	.submit_bio = zram_submit_bio,
2964 	.swap_slot_free_notify = zram_slot_free_notify,
2965 	.owner = THIS_MODULE
2966 };
2967 
2968 static DEVICE_ATTR_RO(io_stat);
2969 static DEVICE_ATTR_RO(mm_stat);
2970 static DEVICE_ATTR_RO(debug_stat);
2971 static DEVICE_ATTR_WO(compact);
2972 static DEVICE_ATTR_RW(disksize);
2973 static DEVICE_ATTR_RO(initstate);
2974 static DEVICE_ATTR_WO(reset);
2975 static DEVICE_ATTR_WO(mem_limit);
2976 static DEVICE_ATTR_WO(mem_used_max);
2977 static DEVICE_ATTR_WO(idle);
2978 static DEVICE_ATTR_RW(comp_algorithm);
2979 #ifdef CONFIG_ZRAM_WRITEBACK
2980 static DEVICE_ATTR_RO(bd_stat);
2981 static DEVICE_ATTR_RW(backing_dev);
2982 static DEVICE_ATTR_WO(writeback);
2983 static DEVICE_ATTR_RW(writeback_limit);
2984 static DEVICE_ATTR_RW(writeback_limit_enable);
2985 static DEVICE_ATTR_RW(writeback_batch_size);
2986 static DEVICE_ATTR_RW(compressed_writeback);
2987 #endif
2988 #ifdef CONFIG_ZRAM_MULTI_COMP
2989 static DEVICE_ATTR_RW(recomp_algorithm);
2990 static DEVICE_ATTR_WO(recompress);
2991 #endif
2992 static DEVICE_ATTR_WO(algorithm_params);
2993 
2994 static struct attribute *zram_disk_attrs[] = {
2995 	&dev_attr_disksize.attr,
2996 	&dev_attr_initstate.attr,
2997 	&dev_attr_reset.attr,
2998 	&dev_attr_compact.attr,
2999 	&dev_attr_mem_limit.attr,
3000 	&dev_attr_mem_used_max.attr,
3001 	&dev_attr_idle.attr,
3002 	&dev_attr_comp_algorithm.attr,
3003 #ifdef CONFIG_ZRAM_WRITEBACK
3004 	&dev_attr_bd_stat.attr,
3005 	&dev_attr_backing_dev.attr,
3006 	&dev_attr_writeback.attr,
3007 	&dev_attr_writeback_limit.attr,
3008 	&dev_attr_writeback_limit_enable.attr,
3009 	&dev_attr_writeback_batch_size.attr,
3010 	&dev_attr_compressed_writeback.attr,
3011 #endif
3012 	&dev_attr_io_stat.attr,
3013 	&dev_attr_mm_stat.attr,
3014 	&dev_attr_debug_stat.attr,
3015 #ifdef CONFIG_ZRAM_MULTI_COMP
3016 	&dev_attr_recomp_algorithm.attr,
3017 	&dev_attr_recompress.attr,
3018 #endif
3019 	&dev_attr_algorithm_params.attr,
3020 	NULL,
3021 };
3022 
3023 ATTRIBUTE_GROUPS(zram_disk);
3024 
3025 /*
3026  * Allocate and initialize new zram device. the function returns
3027  * '>= 0' device_id upon success, and negative value otherwise.
3028  */
3029 static int zram_add(void)
3030 {
3031 	struct queue_limits lim = {
3032 		.logical_block_size		= ZRAM_LOGICAL_BLOCK_SIZE,
3033 		/*
3034 		 * To ensure that we always get PAGE_SIZE aligned and
3035 		 * n*PAGE_SIZED sized I/O requests.
3036 		 */
3037 		.physical_block_size		= PAGE_SIZE,
3038 		.io_min				= PAGE_SIZE,
3039 		.io_opt				= PAGE_SIZE,
3040 		.max_hw_discard_sectors		= UINT_MAX,
3041 		/*
3042 		 * zram_bio_discard() will clear all logical blocks if logical
3043 		 * block size is identical with physical block size(PAGE_SIZE).
3044 		 * But if it is different, we will skip discarding some parts of
3045 		 * logical blocks in the part of the request range which isn't
3046 		 * aligned to physical block size.  So we can't ensure that all
3047 		 * discarded logical blocks are zeroed.
3048 		 */
3049 #if ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE
3050 		.max_write_zeroes_sectors	= UINT_MAX,
3051 #endif
3052 		.features			= BLK_FEAT_STABLE_WRITES |
3053 						  BLK_FEAT_SYNCHRONOUS,
3054 	};
3055 	struct zram *zram;
3056 	int ret, device_id;
3057 
3058 	zram = kzalloc_obj(struct zram);
3059 	if (!zram)
3060 		return -ENOMEM;
3061 
3062 	ret = idr_alloc(&zram_index_idr, zram, 0, 0, GFP_KERNEL);
3063 	if (ret < 0)
3064 		goto out_free_dev;
3065 	device_id = ret;
3066 
3067 	init_rwsem(&zram->dev_lock);
3068 #ifdef CONFIG_ZRAM_WRITEBACK
3069 	zram->wb_batch_size = 32;
3070 	zram->compressed_wb = false;
3071 #endif
3072 
3073 	/* gendisk structure */
3074 	zram->disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
3075 	if (IS_ERR(zram->disk)) {
3076 		pr_err("Error allocating disk structure for device %d\n",
3077 			device_id);
3078 		ret = PTR_ERR(zram->disk);
3079 		goto out_free_idr;
3080 	}
3081 
3082 	zram->disk->major = zram_major;
3083 	zram->disk->first_minor = device_id;
3084 	zram->disk->minors = 1;
3085 	zram->disk->flags |= GENHD_FL_NO_PART;
3086 	zram->disk->fops = &zram_devops;
3087 	zram->disk->private_data = zram;
3088 	snprintf(zram->disk->disk_name, 16, "zram%d", device_id);
3089 	zram_comp_params_reset(zram);
3090 	comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor);
3091 
3092 	/* Actual capacity set using sysfs (/sys/block/zram<id>/disksize */
3093 	set_capacity(zram->disk, 0);
3094 	ret = device_add_disk(NULL, zram->disk, zram_disk_groups);
3095 	if (ret)
3096 		goto out_cleanup_disk;
3097 
3098 	zram_debugfs_register(zram);
3099 	pr_info("Added device: %s\n", zram->disk->disk_name);
3100 	return device_id;
3101 
3102 out_cleanup_disk:
3103 	put_disk(zram->disk);
3104 out_free_idr:
3105 	idr_remove(&zram_index_idr, device_id);
3106 out_free_dev:
3107 	kfree(zram);
3108 	return ret;
3109 }
3110 
3111 static int zram_remove(struct zram *zram)
3112 {
3113 	bool claimed;
3114 
3115 	mutex_lock(&zram->disk->open_mutex);
3116 	if (disk_openers(zram->disk)) {
3117 		mutex_unlock(&zram->disk->open_mutex);
3118 		return -EBUSY;
3119 	}
3120 
3121 	claimed = zram->claim;
3122 	if (!claimed)
3123 		zram->claim = true;
3124 	mutex_unlock(&zram->disk->open_mutex);
3125 
3126 	zram_debugfs_unregister(zram);
3127 
3128 	if (claimed) {
3129 		/*
3130 		 * If we were claimed by reset_store(), del_gendisk() will
3131 		 * wait until reset_store() is done, so nothing need to do.
3132 		 */
3133 		;
3134 	} else {
3135 		/* Make sure all the pending I/O are finished */
3136 		sync_blockdev(zram->disk->part0);
3137 		zram_reset_device(zram);
3138 	}
3139 
3140 	pr_info("Removed device: %s\n", zram->disk->disk_name);
3141 
3142 	del_gendisk(zram->disk);
3143 
3144 	/* del_gendisk drains pending reset_store */
3145 	WARN_ON_ONCE(claimed && zram->claim);
3146 
3147 	/*
3148 	 * disksize_store() may be called in between zram_reset_device()
3149 	 * and del_gendisk(), so run the last reset to avoid leaking
3150 	 * anything allocated with disksize_store()
3151 	 */
3152 	zram_reset_device(zram);
3153 
3154 	put_disk(zram->disk);
3155 	kfree(zram);
3156 	return 0;
3157 }
3158 
3159 /* zram-control sysfs attributes */
3160 
3161 /*
3162  * NOTE: hot_add attribute is not the usual read-only sysfs attribute. In a
3163  * sense that reading from this file does alter the state of your system -- it
3164  * creates a new un-initialized zram device and returns back this device's
3165  * device_id (or an error code if it fails to create a new device).
3166  */
3167 static ssize_t hot_add_show(const struct class *class,
3168 			const struct class_attribute *attr,
3169 			char *buf)
3170 {
3171 	int ret;
3172 
3173 	mutex_lock(&zram_index_mutex);
3174 	ret = zram_add();
3175 	mutex_unlock(&zram_index_mutex);
3176 
3177 	if (ret < 0)
3178 		return ret;
3179 	return sysfs_emit(buf, "%d\n", ret);
3180 }
3181 /* This attribute must be set to 0400, so CLASS_ATTR_RO() can not be used */
3182 static struct class_attribute class_attr_hot_add =
3183 	__ATTR(hot_add, 0400, hot_add_show, NULL);
3184 
3185 static ssize_t hot_remove_store(const struct class *class,
3186 			const struct class_attribute *attr,
3187 			const char *buf,
3188 			size_t count)
3189 {
3190 	struct zram *zram;
3191 	int ret, dev_id;
3192 
3193 	/* dev_id is gendisk->first_minor, which is `int' */
3194 	ret = kstrtoint(buf, 10, &dev_id);
3195 	if (ret)
3196 		return ret;
3197 	if (dev_id < 0)
3198 		return -EINVAL;
3199 
3200 	mutex_lock(&zram_index_mutex);
3201 
3202 	zram = idr_find(&zram_index_idr, dev_id);
3203 	if (zram) {
3204 		ret = zram_remove(zram);
3205 		if (!ret)
3206 			idr_remove(&zram_index_idr, dev_id);
3207 	} else {
3208 		ret = -ENODEV;
3209 	}
3210 
3211 	mutex_unlock(&zram_index_mutex);
3212 	return ret ? ret : count;
3213 }
3214 static CLASS_ATTR_WO(hot_remove);
3215 
3216 static struct attribute *zram_control_class_attrs[] = {
3217 	&class_attr_hot_add.attr,
3218 	&class_attr_hot_remove.attr,
3219 	NULL,
3220 };
3221 ATTRIBUTE_GROUPS(zram_control_class);
3222 
3223 static struct class zram_control_class = {
3224 	.name		= "zram-control",
3225 	.class_groups	= zram_control_class_groups,
3226 };
3227 
/*
 * idr_for_each() callback: remove the device behind @ptr and warn once if
 * that fails. Always returns 0.
 */
static int zram_remove_cb(int id, void *ptr, void *data)
{
	struct zram *zram = ptr;

	WARN_ON_ONCE(zram_remove(zram));
	return 0;
}
3233 
3234 static void destroy_devices(void)
3235 {
3236 	class_unregister(&zram_control_class);
3237 	idr_for_each(&zram_index_idr, &zram_remove_cb, NULL);
3238 	zram_debugfs_destroy();
3239 	idr_destroy(&zram_index_idr);
3240 	unregister_blkdev(zram_major, "zram");
3241 	cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
3242 }
3243 
3244 static int __init zram_init(void)
3245 {
3246 	struct zram_table_entry zram_te;
3247 	int ret;
3248 
3249 	BUILD_BUG_ON(__NR_ZRAM_PAGEFLAGS > sizeof(zram_te.attr.flags) * 8);
3250 
3251 	ret = cpuhp_setup_state_multi(CPUHP_ZCOMP_PREPARE, "block/zram:prepare",
3252 				      zcomp_cpu_up_prepare, zcomp_cpu_dead);
3253 	if (ret < 0)
3254 		return ret;
3255 
3256 	ret = class_register(&zram_control_class);
3257 	if (ret) {
3258 		pr_err("Unable to register zram-control class\n");
3259 		cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
3260 		return ret;
3261 	}
3262 
3263 	zram_debugfs_create();
3264 	zram_major = register_blkdev(0, "zram");
3265 	if (zram_major <= 0) {
3266 		pr_err("Unable to get major number\n");
3267 		class_unregister(&zram_control_class);
3268 		cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
3269 		return -EBUSY;
3270 	}
3271 
3272 	while (num_devices != 0) {
3273 		mutex_lock(&zram_index_mutex);
3274 		ret = zram_add();
3275 		mutex_unlock(&zram_index_mutex);
3276 		if (ret < 0)
3277 			goto out_error;
3278 		num_devices--;
3279 	}
3280 
3281 	return 0;
3282 
3283 out_error:
3284 	destroy_devices();
3285 	return ret;
3286 }
3287 
3288 static void __exit zram_exit(void)
3289 {
3290 	destroy_devices();
3291 }
3292 
3293 module_init(zram_init);
3294 module_exit(zram_exit);
3295 
3296 module_param(num_devices, uint, 0);
3297 MODULE_PARM_DESC(num_devices, "Number of pre-created zram devices");
3298 
3299 MODULE_LICENSE("Dual BSD/GPL");
3300 MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
3301 MODULE_DESCRIPTION("Compressed RAM Block Device");
3302