xref: /linux/drivers/block/zram/zram_drv.c (revision 6efc548d8a08ae918020225e16d040ce3903bff7)
1 /*
2  * Compressed RAM block device
3  *
4  * Copyright (C) 2008, 2009, 2010  Nitin Gupta
5  *               2012, 2013 Minchan Kim
6  *
7  * This code is released using a dual license strategy: BSD/GPL
8  * You can choose the licence that better fits your requirements.
9  *
10  * Released under the terms of 3-clause BSD License
11  * Released under the terms of GNU General Public License Version 2.0
12  *
13  */
14 
15 #define pr_fmt(fmt) "zram: " fmt
16 
17 #include <linux/module.h>
18 #include <linux/kernel.h>
19 #include <linux/bio.h>
20 #include <linux/bitops.h>
21 #include <linux/blkdev.h>
22 #include <linux/buffer_head.h>
23 #include <linux/device.h>
24 #include <linux/highmem.h>
25 #include <linux/slab.h>
26 #include <linux/backing-dev.h>
27 #include <linux/string.h>
28 #include <linux/vmalloc.h>
29 #include <linux/err.h>
30 #include <linux/idr.h>
31 #include <linux/sysfs.h>
32 #include <linux/debugfs.h>
33 #include <linux/cpuhotplug.h>
34 #include <linux/part_stat.h>
35 #include <linux/kernel_read_file.h>
36 
37 #include "zram_drv.h"
38 
39 static DEFINE_IDR(zram_index_idr);
40 /* idr index must be protected */
41 static DEFINE_MUTEX(zram_index_mutex);
42 
43 static int zram_major;
44 static const char *default_compressor = CONFIG_ZRAM_DEF_COMP;
45 
46 #define ZRAM_MAX_ALGO_NAME_SZ	128
47 
48 /* Module params (documentation at end) */
49 static unsigned int num_devices = 1;
50 /*
51  * Pages that compress to sizes equal to or greater than this are stored
52  * uncompressed in memory.
53  */
54 static size_t huge_class_size;
55 
56 static const struct block_device_operations zram_devops;
57 
58 static void slot_free(struct zram *zram, u32 index);
59 #define slot_dep_map(zram, index) (&(zram)->table[(index)].dep_map)
60 
61 static void slot_lock_init(struct zram *zram, u32 index)
62 {
63 	static struct lock_class_key __key;
64 
65 	lockdep_init_map(slot_dep_map(zram, index), "zram->table[index].lock",
66 			 &__key, 0);
67 }
68 
69 /*
70  * entry locking rules:
71  *
72  * 1) Lock is exclusive
73  *
74  * 2) lock() function can sleep waiting for the lock
75  *
76  * 3) Lock owner can sleep
77  *
78  * 4) Use the TRY lock variant when in atomic context
79  *    - must check the return value and handle locking failures
80  */
81 static __must_check bool slot_trylock(struct zram *zram, u32 index)
82 {
83 	unsigned long *lock = &zram->table[index].__lock;
84 
85 	if (!test_and_set_bit_lock(ZRAM_ENTRY_LOCK, lock)) {
86 		mutex_acquire(slot_dep_map(zram, index), 0, 1, _RET_IP_);
87 		lock_acquired(slot_dep_map(zram, index), _RET_IP_);
88 		return true;
89 	}
90 
91 	return false;
92 }
93 
94 static void slot_lock(struct zram *zram, u32 index)
95 {
96 	unsigned long *lock = &zram->table[index].__lock;
97 
98 	mutex_acquire(slot_dep_map(zram, index), 0, 0, _RET_IP_);
99 	wait_on_bit_lock(lock, ZRAM_ENTRY_LOCK, TASK_UNINTERRUPTIBLE);
100 	lock_acquired(slot_dep_map(zram, index), _RET_IP_);
101 }
102 
103 static void slot_unlock(struct zram *zram, u32 index)
104 {
105 	unsigned long *lock = &zram->table[index].__lock;
106 
107 	mutex_release(slot_dep_map(zram, index), _RET_IP_);
108 	clear_and_wake_up_bit(ZRAM_ENTRY_LOCK, lock);
109 }
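
/*
 * Illustrative sketch, not part of the driver: the intended usage of the
 * slot locking API above. In process context the sleeping variant is used:
 *
 *	slot_lock(zram, index);
 *	... access zram->table[index] ...
 *	slot_unlock(zram, index);
 *
 * In atomic context only the TRY variant may be used, and its return value
 * must be checked, since the slot must not be touched on failure:
 *
 *	if (!slot_trylock(zram, index))
 *		return;		(or back off and retry later)
 *	... access zram->table[index] ...
 *	slot_unlock(zram, index);
 */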
110 
111 static inline bool init_done(struct zram *zram)
112 {
113 	return zram->disksize;
114 }
115 
116 static inline struct zram *dev_to_zram(struct device *dev)
117 {
118 	return (struct zram *)dev_to_disk(dev)->private_data;
119 }
120 
121 static unsigned long get_slot_handle(struct zram *zram, u32 index)
122 {
123 	return zram->table[index].handle;
124 }
125 
126 static void set_slot_handle(struct zram *zram, u32 index, unsigned long handle)
127 {
128 	zram->table[index].handle = handle;
129 }
130 
131 static bool test_slot_flag(struct zram *zram, u32 index,
132 			   enum zram_pageflags flag)
133 {
134 	return zram->table[index].attr.flags & BIT(flag);
135 }
136 
137 static void set_slot_flag(struct zram *zram, u32 index,
138 			  enum zram_pageflags flag)
139 {
140 	zram->table[index].attr.flags |= BIT(flag);
141 }
142 
143 static void clear_slot_flag(struct zram *zram, u32 index,
144 			    enum zram_pageflags flag)
145 {
146 	zram->table[index].attr.flags &= ~BIT(flag);
147 }
148 
149 static size_t get_slot_size(struct zram *zram, u32 index)
150 {
151 	return zram->table[index].attr.flags & (BIT(ZRAM_FLAG_SHIFT) - 1);
152 }
153 
154 static void set_slot_size(struct zram *zram, u32 index, size_t size)
155 {
156 	unsigned long flags = zram->table[index].attr.flags >> ZRAM_FLAG_SHIFT;
157 
158 	zram->table[index].attr.flags = (flags << ZRAM_FLAG_SHIFT) | size;
159 }
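
/*
 * Rough sketch of the attr.flags packing implied by the helpers above.
 * The exact bit positions (ZRAM_FLAG_SHIFT, ZRAM_COMP_PRIORITY_BIT1,
 * ZRAM_COMP_PRIORITY_MASK) come from zram_drv.h, which is not shown here,
 * so treat the layout below as an assumption:
 *
 *	 MSB                       ZRAM_FLAG_SHIFT               0
 *	+---------------------------+-----------------------------+
 *	| page flags, comp priority |   compressed object size    |
 *	+---------------------------+-----------------------------+
 *
 * get_slot_size() masks out everything above the size field, while
 * set_slot_size() preserves the upper flag bits and rewrites only the
 * low ZRAM_FLAG_SHIFT bits.
 */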
160 
161 static inline bool slot_allocated(struct zram *zram, u32 index)
162 {
163 	return get_slot_size(zram, index) ||
164 		test_slot_flag(zram, index, ZRAM_SAME) ||
165 		test_slot_flag(zram, index, ZRAM_WB);
166 }
167 
168 static inline void set_slot_comp_priority(struct zram *zram, u32 index,
169 					  u32 prio)
170 {
171 	prio &= ZRAM_COMP_PRIORITY_MASK;
172 	/*
173 	 * Clear the previous priority value first, in case we recompress
174 	 * an already recompressed page.
175 	 */
176 	zram->table[index].attr.flags &= ~(ZRAM_COMP_PRIORITY_MASK <<
177 					   ZRAM_COMP_PRIORITY_BIT1);
178 	zram->table[index].attr.flags |= (prio << ZRAM_COMP_PRIORITY_BIT1);
179 }
180 
181 static inline u32 get_slot_comp_priority(struct zram *zram, u32 index)
182 {
183 	u32 prio = zram->table[index].attr.flags >> ZRAM_COMP_PRIORITY_BIT1;
184 
185 	return prio & ZRAM_COMP_PRIORITY_MASK;
186 }
187 
188 static void mark_slot_accessed(struct zram *zram, u32 index)
189 {
190 	clear_slot_flag(zram, index, ZRAM_IDLE);
191 	clear_slot_flag(zram, index, ZRAM_PP_SLOT);
192 #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME
193 	zram->table[index].attr.ac_time = (u32)ktime_get_boottime_seconds();
194 #endif
195 }
196 
197 static inline void update_used_max(struct zram *zram, const unsigned long pages)
198 {
199 	unsigned long cur_max = atomic_long_read(&zram->stats.max_used_pages);
200 
201 	do {
202 		if (cur_max >= pages)
203 			return;
204 	} while (!atomic_long_try_cmpxchg(&zram->stats.max_used_pages,
205 					  &cur_max, pages));
206 }
207 
208 static bool zram_can_store_page(struct zram *zram)
209 {
210 	unsigned long alloced_pages;
211 
212 	alloced_pages = zs_get_total_pages(zram->mem_pool);
213 	update_used_max(zram, alloced_pages);
214 
215 	return !zram->limit_pages || alloced_pages <= zram->limit_pages;
216 }
217 
218 #if PAGE_SIZE != 4096
219 static inline bool is_partial_io(struct bio_vec *bvec)
220 {
221 	return bvec->bv_len != PAGE_SIZE;
222 }
223 #define ZRAM_PARTIAL_IO		1
224 #else
225 static inline bool is_partial_io(struct bio_vec *bvec)
226 {
227 	return false;
228 }
229 #endif
230 
231 #if defined CONFIG_ZRAM_WRITEBACK || defined CONFIG_ZRAM_MULTI_COMP
232 struct zram_pp_slot {
233 	unsigned long		index;
234 	struct list_head	entry;
235 };
236 
237 /*
238  * A post-processing bucket is, essentially, a size class; it defines
239  * the range (in bytes) of pp-slot sizes that fall into a particular bucket.
240  */
241 #define PP_BUCKET_SIZE_RANGE	64
242 #define NUM_PP_BUCKETS		((PAGE_SIZE / PP_BUCKET_SIZE_RANGE) + 1)
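
/*
 * Worked example (illustrative): with PP_BUCKET_SIZE_RANGE of 64 bytes,
 * a slot whose compressed size is 100 bytes goes into bucket 100 / 64 = 1,
 * while an uncompressed (PAGE_SIZE) slot goes into bucket PAGE_SIZE / 64,
 * the last one. The "+ 1" in NUM_PP_BUCKETS makes room for that last bucket.
 */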
243 
244 struct zram_pp_ctl {
245 	struct list_head	pp_buckets[NUM_PP_BUCKETS];
246 };
247 
248 static struct zram_pp_ctl *init_pp_ctl(void)
249 {
250 	struct zram_pp_ctl *ctl;
251 	u32 idx;
252 
253 	ctl = kmalloc(sizeof(*ctl), GFP_KERNEL);
254 	if (!ctl)
255 		return NULL;
256 
257 	for (idx = 0; idx < NUM_PP_BUCKETS; idx++)
258 		INIT_LIST_HEAD(&ctl->pp_buckets[idx]);
259 	return ctl;
260 }
261 
262 static void release_pp_slot(struct zram *zram, struct zram_pp_slot *pps)
263 {
264 	list_del_init(&pps->entry);
265 
266 	slot_lock(zram, pps->index);
267 	clear_slot_flag(zram, pps->index, ZRAM_PP_SLOT);
268 	slot_unlock(zram, pps->index);
269 
270 	kfree(pps);
271 }
272 
273 static void release_pp_ctl(struct zram *zram, struct zram_pp_ctl *ctl)
274 {
275 	u32 idx;
276 
277 	if (!ctl)
278 		return;
279 
280 	for (idx = 0; idx < NUM_PP_BUCKETS; idx++) {
281 		while (!list_empty(&ctl->pp_buckets[idx])) {
282 			struct zram_pp_slot *pps;
283 
284 			pps = list_first_entry(&ctl->pp_buckets[idx],
285 					       struct zram_pp_slot,
286 					       entry);
287 			release_pp_slot(zram, pps);
288 		}
289 	}
290 
291 	kfree(ctl);
292 }
293 
294 static bool place_pp_slot(struct zram *zram, struct zram_pp_ctl *ctl,
295 			  u32 index)
296 {
297 	struct zram_pp_slot *pps;
298 	u32 bid;
299 
300 	pps = kmalloc(sizeof(*pps), GFP_NOIO | __GFP_NOWARN);
301 	if (!pps)
302 		return false;
303 
304 	INIT_LIST_HEAD(&pps->entry);
305 	pps->index = index;
306 
307 	bid = get_slot_size(zram, pps->index) / PP_BUCKET_SIZE_RANGE;
308 	list_add(&pps->entry, &ctl->pp_buckets[bid]);
309 
310 	set_slot_flag(zram, pps->index, ZRAM_PP_SLOT);
311 	return true;
312 }
313 
314 static struct zram_pp_slot *select_pp_slot(struct zram_pp_ctl *ctl)
315 {
316 	struct zram_pp_slot *pps = NULL;
317 	s32 idx = NUM_PP_BUCKETS - 1;
318 
319 	/* The higher the bucket id, the more profitable the slot is to post-process */
320 	while (idx >= 0) {
321 		pps = list_first_entry_or_null(&ctl->pp_buckets[idx],
322 					       struct zram_pp_slot,
323 					       entry);
324 		if (pps)
325 			break;
326 
327 		idx--;
328 	}
329 	return pps;
330 }
331 #endif
332 
333 static inline void zram_fill_page(void *ptr, unsigned long len,
334 					unsigned long value)
335 {
336 	WARN_ON_ONCE(!IS_ALIGNED(len, sizeof(unsigned long)));
337 	memset_l(ptr, value, len / sizeof(unsigned long));
338 }
339 
340 static bool page_same_filled(void *ptr, unsigned long *element)
341 {
342 	unsigned long *page;
343 	unsigned long val;
344 	unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1;
345 
346 	page = (unsigned long *)ptr;
347 	val = page[0];
348 
349 	if (val != page[last_pos])
350 		return false;
351 
352 	for (pos = 1; pos < last_pos; pos++) {
353 		if (val != page[pos])
354 			return false;
355 	}
356 
357 	*element = val;
358 
359 	return true;
360 }
361 
362 static ssize_t initstate_show(struct device *dev, struct device_attribute *attr,
363 			      char *buf)
364 {
365 	u32 val;
366 	struct zram *zram = dev_to_zram(dev);
367 
368 	guard(rwsem_read)(&zram->dev_lock);
369 	val = init_done(zram);
370 
371 	return sysfs_emit(buf, "%u\n", val);
372 }
373 
374 static ssize_t disksize_show(struct device *dev,
375 		struct device_attribute *attr, char *buf)
376 {
377 	struct zram *zram = dev_to_zram(dev);
378 
379 	return sysfs_emit(buf, "%llu\n", zram->disksize);
380 }
381 
382 static ssize_t mem_limit_store(struct device *dev,
383 			       struct device_attribute *attr, const char *buf,
384 			       size_t len)
385 {
386 	u64 limit;
387 	char *tmp;
388 	struct zram *zram = dev_to_zram(dev);
389 
390 	limit = memparse(buf, &tmp);
391 	if (buf == tmp) /* no chars parsed, invalid input */
392 		return -EINVAL;
393 
394 	guard(rwsem_write)(&zram->dev_lock);
395 	zram->limit_pages = PAGE_ALIGN(limit) >> PAGE_SHIFT;
396 
397 	return len;
398 }
399 
400 static ssize_t mem_used_max_store(struct device *dev,
401 				  struct device_attribute *attr,
402 				  const char *buf, size_t len)
403 {
404 	int err;
405 	unsigned long val;
406 	struct zram *zram = dev_to_zram(dev);
407 
408 	err = kstrtoul(buf, 10, &val);
409 	if (err || val != 0)
410 		return -EINVAL;
411 
412 	guard(rwsem_read)(&zram->dev_lock);
413 	if (init_done(zram)) {
414 		atomic_long_set(&zram->stats.max_used_pages,
415 				zs_get_total_pages(zram->mem_pool));
416 	}
417 
418 	return len;
419 }
420 
421 /*
422  * Mark all pages that are older than or equal to cutoff as IDLE.
423  * Callers should hold zram->dev_lock in read mode.
424  */
425 static void mark_idle(struct zram *zram, ktime_t cutoff)
426 {
427 	int is_idle = 1;
428 	unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
429 	int index;
430 
431 	for (index = 0; index < nr_pages; index++) {
432 		/*
433 		 * Do not mark ZRAM_SAME slots as ZRAM_IDLE, because no
434 		 * post-processing (recompress, writeback) happens to the
435 		 * ZRAM_SAME slot.
436 		 *
437 		 * And ZRAM_WB slots simply cannot be ZRAM_IDLE.
438 		 */
439 		slot_lock(zram, index);
440 		if (!slot_allocated(zram, index) ||
441 		    test_slot_flag(zram, index, ZRAM_WB) ||
442 		    test_slot_flag(zram, index, ZRAM_SAME)) {
443 			slot_unlock(zram, index);
444 			continue;
445 		}
446 
447 #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME
448 		is_idle = !cutoff ||
449 			ktime_after(cutoff, zram->table[index].attr.ac_time);
450 #endif
451 		if (is_idle)
452 			set_slot_flag(zram, index, ZRAM_IDLE);
453 		else
454 			clear_slot_flag(zram, index, ZRAM_IDLE);
455 		slot_unlock(zram, index);
456 	}
457 }
458 
459 static ssize_t idle_store(struct device *dev, struct device_attribute *attr,
460 			  const char *buf, size_t len)
461 {
462 	struct zram *zram = dev_to_zram(dev);
463 	ktime_t cutoff = 0;
464 
465 	if (!sysfs_streq(buf, "all")) {
466 		/*
467 		 * If it did not parse as 'all', try to treat it as an integer
468 		 * when we have memory tracking enabled.
469 		 */
470 		u32 age_sec;
471 
472 		if (IS_ENABLED(CONFIG_ZRAM_TRACK_ENTRY_ACTIME) &&
473 		    !kstrtouint(buf, 0, &age_sec))
474 			cutoff = ktime_sub((u32)ktime_get_boottime_seconds(),
475 					   age_sec);
476 		else
477 			return -EINVAL;
478 	}
479 
480 	guard(rwsem_read)(&zram->dev_lock);
481 	if (!init_done(zram))
482 		return -EINVAL;
483 
484 	/*
485 	 * A cutoff of 0 marks everything as idle; this is the
486 	 * "all" behavior.
487 	 */
488 	mark_idle(zram, cutoff);
489 	return len;
490 }
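
/*
 * Illustrative usage of the idle knob above, assuming the usual
 * /sys/block/zramX sysfs layout (see Documentation/admin-guide/blockdev/
 * zram.rst):
 *
 *	echo all > /sys/block/zram0/idle
 *		mark every eligible slot as ZRAM_IDLE
 *
 *	echo 86400 > /sys/block/zram0/idle
 *		with CONFIG_ZRAM_TRACK_ENTRY_ACTIME, mark slots that have
 *		not been accessed for at least 86400 seconds
 */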
491 
492 #ifdef CONFIG_ZRAM_WRITEBACK
493 #define INVALID_BDEV_BLOCK		(~0UL)
494 
495 static int read_from_zspool_raw(struct zram *zram, struct page *page,
496 				u32 index);
497 static int read_from_zspool(struct zram *zram, struct page *page, u32 index);
498 
499 struct zram_wb_ctl {
500 	/* idle list is accessed only by the writeback task, no concurrency */
501 	struct list_head idle_reqs;
502 	/* done list is accessed concurrently, protected by done_lock */
503 	struct list_head done_reqs;
504 	wait_queue_head_t done_wait;
505 	spinlock_t done_lock;
506 	atomic_t num_inflight;
507 };
508 
509 struct zram_wb_req {
510 	unsigned long blk_idx;
511 	struct page *page;
512 	struct zram_pp_slot *pps;
513 	struct bio_vec bio_vec;
514 	struct bio bio;
515 
516 	struct list_head entry;
517 };
518 
519 struct zram_rb_req {
520 	struct work_struct work;
521 	struct zram *zram;
522 	struct page *page;
523 	/* The read bio for backing device */
524 	struct bio *bio;
525 	unsigned long blk_idx;
526 	union {
527 		/* The original bio to complete (async read) */
528 		struct bio *parent;
529 		/* error status (sync read) */
530 		int error;
531 	};
532 	u32 index;
533 };
534 
535 #define FOUR_K(x) ((x) * (1 << (PAGE_SHIFT - 12)))
536 static ssize_t bd_stat_show(struct device *dev, struct device_attribute *attr,
537 			    char *buf)
538 {
539 	struct zram *zram = dev_to_zram(dev);
540 	ssize_t ret;
541 
542 	guard(rwsem_read)(&zram->dev_lock);
543 	ret = sysfs_emit(buf,
544 			 "%8llu %8llu %8llu\n",
545 			 FOUR_K((u64)atomic64_read(&zram->stats.bd_count)),
546 			 FOUR_K((u64)atomic64_read(&zram->stats.bd_reads)),
547 			 FOUR_K((u64)atomic64_read(&zram->stats.bd_writes)));
548 
549 	return ret;
550 }
551 
552 static ssize_t writeback_compressed_store(struct device *dev,
553 					  struct device_attribute *attr,
554 					  const char *buf, size_t len)
555 {
556 	struct zram *zram = dev_to_zram(dev);
557 	bool val;
558 
559 	if (kstrtobool(buf, &val))
560 		return -EINVAL;
561 
562 	guard(rwsem_write)(&zram->dev_lock);
563 	if (init_done(zram)) {
564 		return -EBUSY;
565 	}
566 
567 	zram->wb_compressed = val;
568 
569 	return len;
570 }
571 
572 static ssize_t writeback_compressed_show(struct device *dev,
573 					 struct device_attribute *attr,
574 					 char *buf)
575 {
576 	bool val;
577 	struct zram *zram = dev_to_zram(dev);
578 
579 	guard(rwsem_read)(&zram->dev_lock);
580 	val = zram->wb_compressed;
581 
582 	return sysfs_emit(buf, "%d\n", val);
583 }
584 
585 static ssize_t writeback_limit_enable_store(struct device *dev,
586 					    struct device_attribute *attr,
587 					    const char *buf, size_t len)
588 {
589 	struct zram *zram = dev_to_zram(dev);
590 	u64 val;
591 
592 	if (kstrtoull(buf, 10, &val))
593 		return -EINVAL;
594 
595 	guard(rwsem_write)(&zram->dev_lock);
596 	zram->wb_limit_enable = val;
597 
598 	return len;
599 }
600 
601 static ssize_t writeback_limit_enable_show(struct device *dev,
602 					   struct device_attribute *attr,
603 					   char *buf)
604 {
605 	bool val;
606 	struct zram *zram = dev_to_zram(dev);
607 
608 	guard(rwsem_read)(&zram->dev_lock);
609 	val = zram->wb_limit_enable;
610 
611 	return sysfs_emit(buf, "%d\n", val);
612 }
613 
614 static ssize_t writeback_limit_store(struct device *dev,
615 				     struct device_attribute *attr,
616 				     const char *buf, size_t len)
617 {
618 	struct zram *zram = dev_to_zram(dev);
619 	u64 val;
620 
621 	if (kstrtoull(buf, 10, &val))
622 		return -EINVAL;
623 
624 	/*
625 	 * When the page size is greater than 4KB, setting bd_wb_limit to a
626 	 * value that is not page-size aligned causes the value to wrap. For
627 	 * example, with a 16KB page size and bd_wb_limit set to 3, a single
628 	 * writeback operation would take bd_wb_limit to -1, and since
629 	 * bd_wb_limit is unsigned it would wrap around to a huge value.
630 	 * Round the limit down to a multiple of PAGE_SIZE / 4096 to avoid that.
631 	 */
632 	val = rounddown(val, PAGE_SIZE / 4096);
633 
634 	guard(rwsem_write)(&zram->dev_lock);
635 	zram->bd_wb_limit = val;
636 
637 	return len;
638 }
639 
640 static ssize_t writeback_limit_show(struct device *dev,
641 				    struct device_attribute *attr, char *buf)
642 {
643 	u64 val;
644 	struct zram *zram = dev_to_zram(dev);
645 
646 	guard(rwsem_read)(&zram->dev_lock);
647 	val = zram->bd_wb_limit;
648 
649 	return sysfs_emit(buf, "%llu\n", val);
650 }
651 
652 static ssize_t writeback_batch_size_store(struct device *dev,
653 					  struct device_attribute *attr,
654 					  const char *buf, size_t len)
655 {
656 	struct zram *zram = dev_to_zram(dev);
657 	u32 val;
658 
659 	if (kstrtouint(buf, 10, &val))
660 		return -EINVAL;
661 
662 	if (!val)
663 		return -EINVAL;
664 
665 	guard(rwsem_write)(&zram->dev_lock);
666 	zram->wb_batch_size = val;
667 
668 	return len;
669 }
670 
671 static ssize_t writeback_batch_size_show(struct device *dev,
672 					 struct device_attribute *attr,
673 					 char *buf)
674 {
675 	u32 val;
676 	struct zram *zram = dev_to_zram(dev);
677 
678 	guard(rwsem_read)(&zram->dev_lock);
679 	val = zram->wb_batch_size;
680 
681 	return sysfs_emit(buf, "%u\n", val);
682 }
683 
684 static void reset_bdev(struct zram *zram)
685 {
686 	if (!zram->backing_dev)
687 		return;
688 
689 	/* hope that filp_close flushes all outstanding IO */
690 	filp_close(zram->backing_dev, NULL);
691 	zram->backing_dev = NULL;
692 	zram->bdev = NULL;
693 	zram->disk->fops = &zram_devops;
694 	kvfree(zram->bitmap);
695 	zram->bitmap = NULL;
696 }
697 
698 static ssize_t backing_dev_show(struct device *dev,
699 				struct device_attribute *attr, char *buf)
700 {
701 	struct file *file;
702 	struct zram *zram = dev_to_zram(dev);
703 	char *p;
704 	ssize_t ret;
705 
706 	guard(rwsem_read)(&zram->dev_lock);
707 	file = zram->backing_dev;
708 	if (!file) {
709 		memcpy(buf, "none\n", 5);
710 		return 5;
711 	}
712 
713 	p = file_path(file, buf, PAGE_SIZE - 1);
714 	if (IS_ERR(p))
715 		return PTR_ERR(p);
716 
717 	ret = strlen(p);
718 	memmove(buf, p, ret);
719 	buf[ret++] = '\n';
720 	return ret;
721 }
722 
723 static ssize_t backing_dev_store(struct device *dev,
724 				 struct device_attribute *attr, const char *buf,
725 				 size_t len)
726 {
727 	char *file_name;
728 	size_t sz;
729 	struct file *backing_dev = NULL;
730 	struct inode *inode;
731 	unsigned int bitmap_sz;
732 	unsigned long nr_pages, *bitmap = NULL;
733 	int err;
734 	struct zram *zram = dev_to_zram(dev);
735 
736 	file_name = kmalloc(PATH_MAX, GFP_KERNEL);
737 	if (!file_name)
738 		return -ENOMEM;
739 
740 	guard(rwsem_write)(&zram->dev_lock);
741 	if (init_done(zram)) {
742 		pr_info("Can't setup backing device for initialized device\n");
743 		err = -EBUSY;
744 		goto out;
745 	}
746 
747 	strscpy(file_name, buf, PATH_MAX);
748 	/* ignore trailing newline */
749 	sz = strlen(file_name);
750 	if (sz > 0 && file_name[sz - 1] == '\n')
751 		file_name[sz - 1] = 0x00;
752 
753 	backing_dev = filp_open(file_name, O_RDWR | O_LARGEFILE | O_EXCL, 0);
754 	if (IS_ERR(backing_dev)) {
755 		err = PTR_ERR(backing_dev);
756 		backing_dev = NULL;
757 		goto out;
758 	}
759 
760 	inode = backing_dev->f_mapping->host;
761 
762 	/* Only block devices are supported at the moment */
763 	if (!S_ISBLK(inode->i_mode)) {
764 		err = -ENOTBLK;
765 		goto out;
766 	}
767 
768 	nr_pages = i_size_read(inode) >> PAGE_SHIFT;
769 	/* Refuse to use a zero-sized device (also prevents self-reference) */
770 	if (!nr_pages) {
771 		err = -EINVAL;
772 		goto out;
773 	}
774 
775 	bitmap_sz = BITS_TO_LONGS(nr_pages) * sizeof(long);
776 	bitmap = kvzalloc(bitmap_sz, GFP_KERNEL);
777 	if (!bitmap) {
778 		err = -ENOMEM;
779 		goto out;
780 	}
781 
782 	reset_bdev(zram);
783 
784 	zram->bdev = I_BDEV(inode);
785 	zram->backing_dev = backing_dev;
786 	zram->bitmap = bitmap;
787 	zram->nr_pages = nr_pages;
788 
789 	pr_info("setup backing device %s\n", file_name);
790 	kfree(file_name);
791 
792 	return len;
793 out:
794 	kvfree(bitmap);
795 
796 	if (backing_dev)
797 		filp_close(backing_dev, NULL);
798 
799 	kfree(file_name);
800 
801 	return err;
802 }
803 
804 static unsigned long zram_reserve_bdev_block(struct zram *zram)
805 {
806 	unsigned long blk_idx;
807 
808 	blk_idx = find_next_zero_bit(zram->bitmap, zram->nr_pages, 0);
809 	if (blk_idx == zram->nr_pages)
810 		return INVALID_BDEV_BLOCK;
811 
812 	set_bit(blk_idx, zram->bitmap);
813 	atomic64_inc(&zram->stats.bd_count);
814 	return blk_idx;
815 }
816 
817 static void zram_release_bdev_block(struct zram *zram, unsigned long blk_idx)
818 {
819 	int was_set;
820 
821 	was_set = test_and_clear_bit(blk_idx, zram->bitmap);
822 	WARN_ON_ONCE(!was_set);
823 	atomic64_dec(&zram->stats.bd_count);
824 }
825 
826 static void release_wb_req(struct zram_wb_req *req)
827 {
828 	__free_page(req->page);
829 	kfree(req);
830 }
831 
832 static void release_wb_ctl(struct zram_wb_ctl *wb_ctl)
833 {
834 	if (!wb_ctl)
835 		return;
836 
837 	/* We should never have inflight requests at this point */
838 	WARN_ON(atomic_read(&wb_ctl->num_inflight));
839 	WARN_ON(!list_empty(&wb_ctl->done_reqs));
840 
841 	while (!list_empty(&wb_ctl->idle_reqs)) {
842 		struct zram_wb_req *req;
843 
844 		req = list_first_entry(&wb_ctl->idle_reqs,
845 				       struct zram_wb_req, entry);
846 		list_del(&req->entry);
847 		release_wb_req(req);
848 	}
849 
850 	kfree(wb_ctl);
851 }
852 
853 static struct zram_wb_ctl *init_wb_ctl(struct zram *zram)
854 {
855 	struct zram_wb_ctl *wb_ctl;
856 	int i;
857 
858 	wb_ctl = kmalloc(sizeof(*wb_ctl), GFP_KERNEL);
859 	if (!wb_ctl)
860 		return NULL;
861 
862 	INIT_LIST_HEAD(&wb_ctl->idle_reqs);
863 	INIT_LIST_HEAD(&wb_ctl->done_reqs);
864 	atomic_set(&wb_ctl->num_inflight, 0);
865 	init_waitqueue_head(&wb_ctl->done_wait);
866 	spin_lock_init(&wb_ctl->done_lock);
867 
868 	for (i = 0; i < zram->wb_batch_size; i++) {
869 		struct zram_wb_req *req;
870 
871 		/*
872 		 * This is a fatal condition only if we couldn't allocate
873 		 * any requests at all.  Otherwise we just work with the
874 		 * requests that we have successfully allocated, so that
875 		 * writeback can still proceed, even if there is only one
876 		 * request on the idle list.
877 		 */
878 		req = kzalloc(sizeof(*req), GFP_KERNEL | __GFP_NOWARN);
879 		if (!req)
880 			break;
881 
882 		req->page = alloc_page(GFP_KERNEL | __GFP_NOWARN);
883 		if (!req->page) {
884 			kfree(req);
885 			break;
886 		}
887 
888 		list_add(&req->entry, &wb_ctl->idle_reqs);
889 	}
890 
891 	/* We couldn't allocate any requests, so writeback is not possible */
892 	if (list_empty(&wb_ctl->idle_reqs))
893 		goto release_wb_ctl;
894 
895 	return wb_ctl;
896 
897 release_wb_ctl:
898 	release_wb_ctl(wb_ctl);
899 	return NULL;
900 }
901 
902 static void zram_account_writeback_rollback(struct zram *zram)
903 {
904 	lockdep_assert_held_write(&zram->dev_lock);
905 
906 	if (zram->wb_limit_enable)
907 		zram->bd_wb_limit +=  1UL << (PAGE_SHIFT - 12);
908 }
909 
910 static void zram_account_writeback_submit(struct zram *zram)
911 {
912 	lockdep_assert_held_write(&zram->dev_lock);
913 
914 	if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
915 		zram->bd_wb_limit -=  1UL << (PAGE_SHIFT - 12);
916 }
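
/*
 * A small worked example of the accounting above: bd_wb_limit is kept in
 * 4KB units, and each submitted writeback page costs 1UL << (PAGE_SHIFT - 12)
 * of them. With 4KB pages that is 1 unit per page; with 16KB pages it is 4
 * units per page, which is why writeback_limit_store() rounds the limit down
 * to a multiple of PAGE_SIZE / 4096.
 */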
917 
918 static int zram_writeback_complete(struct zram *zram, struct zram_wb_req *req)
919 {
920 	u32 size, index = req->pps->index;
921 	int err, prio;
922 	bool huge;
923 
924 	err = blk_status_to_errno(req->bio.bi_status);
925 	if (err) {
926 		/*
927 		 * Failed wb requests should not be accounted in wb_limit
928 		 * (if enabled).
929 		 */
930 		zram_account_writeback_rollback(zram);
931 		zram_release_bdev_block(zram, req->blk_idx);
932 		return err;
933 	}
934 
935 	atomic64_inc(&zram->stats.bd_writes);
936 	slot_lock(zram, index);
937 	/*
938 	 * We release the slot lock during writeback, so the slot can change
939 	 * under us: either slot_free() alone, or slot_free() followed by
940 	 * zram_write_page(). In both cases the slot loses the ZRAM_PP_SLOT
941 	 * flag. No concurrent post-processing can set ZRAM_PP_SLOT on such
942 	 * slots until the current post-processing finishes.
943 	 */
944 	if (!test_slot_flag(zram, index, ZRAM_PP_SLOT)) {
945 		zram_release_bdev_block(zram, req->blk_idx);
946 		goto out;
947 	}
948 
949 	if (zram->wb_compressed) {
950 		/*
951 		 * ZRAM_WB slots get freed, so we need to preserve the data
952 		 * required to decompress the page on read.
953 		 */
954 		size = get_slot_size(zram, index);
955 		prio = get_slot_comp_priority(zram, index);
956 		huge = test_slot_flag(zram, index, ZRAM_HUGE);
957 	}
958 
959 	slot_free(zram, index);
960 	set_slot_flag(zram, index, ZRAM_WB);
961 	set_slot_handle(zram, index, req->blk_idx);
962 
963 	if (zram->wb_compressed) {
964 		if (huge)
965 			set_slot_flag(zram, index, ZRAM_HUGE);
966 		set_slot_size(zram, index, size);
967 		set_slot_comp_priority(zram, index, prio);
968 	}
969 
970 	atomic64_inc(&zram->stats.pages_stored);
971 
972 out:
973 	slot_unlock(zram, index);
974 	return 0;
975 }
976 
977 static void zram_writeback_endio(struct bio *bio)
978 {
979 	struct zram_wb_req *req = container_of(bio, struct zram_wb_req, bio);
980 	struct zram_wb_ctl *wb_ctl = bio->bi_private;
981 	unsigned long flags;
982 
983 	spin_lock_irqsave(&wb_ctl->done_lock, flags);
984 	list_add(&req->entry, &wb_ctl->done_reqs);
985 	spin_unlock_irqrestore(&wb_ctl->done_lock, flags);
986 
987 	wake_up(&wb_ctl->done_wait);
988 }
989 
990 static void zram_submit_wb_request(struct zram *zram,
991 				   struct zram_wb_ctl *wb_ctl,
992 				   struct zram_wb_req *req)
993 {
994 	/*
995 	 * wb_limit (if enabled) should be adjusted before submission,
996 	 * so that we don't over-submit.
997 	 */
998 	zram_account_writeback_submit(zram);
999 	atomic_inc(&wb_ctl->num_inflight);
1000 	req->bio.bi_private = wb_ctl;
1001 	submit_bio(&req->bio);
1002 }
1003 
1004 static int zram_complete_done_reqs(struct zram *zram,
1005 				   struct zram_wb_ctl *wb_ctl)
1006 {
1007 	struct zram_wb_req *req;
1008 	unsigned long flags;
1009 	int ret = 0, err;
1010 
1011 	while (atomic_read(&wb_ctl->num_inflight) > 0) {
1012 		spin_lock_irqsave(&wb_ctl->done_lock, flags);
1013 		req = list_first_entry_or_null(&wb_ctl->done_reqs,
1014 					       struct zram_wb_req, entry);
1015 		if (req)
1016 			list_del(&req->entry);
1017 		spin_unlock_irqrestore(&wb_ctl->done_lock, flags);
1018 
1019 		/* ->num_inflight > 0 doesn't mean there are requests on the done list */
1020 		if (!req)
1021 			break;
1022 
1023 		err = zram_writeback_complete(zram, req);
1024 		if (err)
1025 			ret = err;
1026 
1027 		atomic_dec(&wb_ctl->num_inflight);
1028 		release_pp_slot(zram, req->pps);
1029 		req->pps = NULL;
1030 
1031 		list_add(&req->entry, &wb_ctl->idle_reqs);
1032 	}
1033 
1034 	return ret;
1035 }
1036 
1037 static struct zram_wb_req *zram_select_idle_req(struct zram_wb_ctl *wb_ctl)
1038 {
1039 	struct zram_wb_req *req;
1040 
1041 	req = list_first_entry_or_null(&wb_ctl->idle_reqs,
1042 				       struct zram_wb_req, entry);
1043 	if (req)
1044 		list_del(&req->entry);
1045 	return req;
1046 }
1047 
1048 static int zram_writeback_slots(struct zram *zram,
1049 				struct zram_pp_ctl *ctl,
1050 				struct zram_wb_ctl *wb_ctl)
1051 {
1052 	unsigned long blk_idx = INVALID_BDEV_BLOCK;
1053 	struct zram_wb_req *req = NULL;
1054 	struct zram_pp_slot *pps;
1055 	int ret = 0, err = 0;
1056 	u32 index = 0;
1057 
1058 	while ((pps = select_pp_slot(ctl))) {
1059 		if (zram->wb_limit_enable && !zram->bd_wb_limit) {
1060 			ret = -EIO;
1061 			break;
1062 		}
1063 
1064 		while (!req) {
1065 			req = zram_select_idle_req(wb_ctl);
1066 			if (req)
1067 				break;
1068 
1069 			wait_event(wb_ctl->done_wait,
1070 				   !list_empty(&wb_ctl->done_reqs));
1071 
1072 			err = zram_complete_done_reqs(zram, wb_ctl);
1073 			/*
1074 			 * BIO errors are not fatal; we continue and simply
1075 			 * attempt to write back the remaining objects (pages).
1076 			 * At the same time we need to signal user-space that
1077 			 * some writes (at least one, possibly all of them)
1078 			 * were not successful, which we do by returning the
1079 			 * most recent BIO error.
1080 			 */
1081 			if (err)
1082 				ret = err;
1083 		}
1084 
1085 		if (blk_idx == INVALID_BDEV_BLOCK) {
1086 			blk_idx = zram_reserve_bdev_block(zram);
1087 			if (blk_idx == INVALID_BDEV_BLOCK) {
1088 				ret = -ENOSPC;
1089 				break;
1090 			}
1091 		}
1092 
1093 		index = pps->index;
1094 		slot_lock(zram, index);
1095 		/*
1096 		 * scan_slots() sets ZRAM_PP_SLOT and releases the slot lock,
1097 		 * so slots can change in the meantime. If slots are accessed
1098 		 * or freed they lose the ZRAM_PP_SLOT flag and hence we don't
1099 		 * post-process them.
1100 		 */
1101 		if (!test_slot_flag(zram, index, ZRAM_PP_SLOT))
1102 			goto next;
1103 		if (zram->wb_compressed)
1104 			err = read_from_zspool_raw(zram, req->page, index);
1105 		else
1106 			err = read_from_zspool(zram, req->page, index);
1107 		if (err)
1108 			goto next;
1109 		slot_unlock(zram, index);
1110 
1111 		/*
1112 		 * From now on the pp-slot is owned by the req; remove it
1113 		 * from its pp bucket.
1114 		 */
1115 		list_del_init(&pps->entry);
1116 
1117 		req->blk_idx = blk_idx;
1118 		req->pps = pps;
1119 		bio_init(&req->bio, zram->bdev, &req->bio_vec, 1, REQ_OP_WRITE);
1120 		req->bio.bi_iter.bi_sector = req->blk_idx * (PAGE_SIZE >> 9);
1121 		req->bio.bi_end_io = zram_writeback_endio;
1122 		__bio_add_page(&req->bio, req->page, PAGE_SIZE, 0);
1123 
1124 		zram_submit_wb_request(zram, wb_ctl, req);
1125 		blk_idx = INVALID_BDEV_BLOCK;
1126 		req = NULL;
1127 		cond_resched();
1128 		continue;
1129 
1130 next:
1131 		slot_unlock(zram, index);
1132 		release_pp_slot(zram, pps);
1133 	}
1134 
1135 	/*
1136 	 * We selected an idle req but never submitted it, due to an error
1137 	 * or the wb limit.
1138 	 */
1139 	if (req)
1140 		release_wb_req(req);
1141 
1142 	while (atomic_read(&wb_ctl->num_inflight) > 0) {
1143 		wait_event(wb_ctl->done_wait, !list_empty(&wb_ctl->done_reqs));
1144 		err = zram_complete_done_reqs(zram, wb_ctl);
1145 		if (err)
1146 			ret = err;
1147 	}
1148 
1149 	return ret;
1150 }
1151 
1152 #define PAGE_WRITEBACK			0
1153 #define HUGE_WRITEBACK			(1 << 0)
1154 #define IDLE_WRITEBACK			(1 << 1)
1155 #define INCOMPRESSIBLE_WRITEBACK	(1 << 2)
1156 
1157 static int parse_page_index(char *val, unsigned long nr_pages,
1158 			    unsigned long *lo, unsigned long *hi)
1159 {
1160 	int ret;
1161 
1162 	ret = kstrtoul(val, 10, lo);
1163 	if (ret)
1164 		return ret;
1165 	if (*lo >= nr_pages)
1166 		return -ERANGE;
1167 	*hi = *lo + 1;
1168 	return 0;
1169 }
1170 
1171 static int parse_page_indexes(char *val, unsigned long nr_pages,
1172 			      unsigned long *lo, unsigned long *hi)
1173 {
1174 	char *delim;
1175 	int ret;
1176 
1177 	delim = strchr(val, '-');
1178 	if (!delim)
1179 		return -EINVAL;
1180 
1181 	*delim = 0x00;
1182 	ret = kstrtoul(val, 10, lo);
1183 	if (ret)
1184 		return ret;
1185 	if (*lo >= nr_pages)
1186 		return -ERANGE;
1187 
1188 	ret = kstrtoul(delim + 1, 10, hi);
1189 	if (ret)
1190 		return ret;
1191 	if (*hi >= nr_pages || *lo > *hi)
1192 		return -ERANGE;
1193 	*hi += 1;
1194 	return 0;
1195 }
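
/*
 * Illustrative parse results for the two helpers above (hi is exclusive):
 *
 *	parse_page_index("42", nr_pages, &lo, &hi)	-> lo = 42, hi = 43
 *	parse_page_indexes("10-20", nr_pages, &lo, &hi)	-> lo = 10, hi = 21
 */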
1196 
1197 static int parse_mode(char *val, u32 *mode)
1198 {
1199 	*mode = 0;
1200 
1201 	if (!strcmp(val, "idle"))
1202 		*mode = IDLE_WRITEBACK;
1203 	if (!strcmp(val, "huge"))
1204 		*mode = HUGE_WRITEBACK;
1205 	if (!strcmp(val, "huge_idle"))
1206 		*mode = IDLE_WRITEBACK | HUGE_WRITEBACK;
1207 	if (!strcmp(val, "incompressible"))
1208 		*mode = INCOMPRESSIBLE_WRITEBACK;
1209 
1210 	if (*mode == 0)
1211 		return -EINVAL;
1212 	return 0;
1213 }
1214 
1215 static int scan_slots_for_writeback(struct zram *zram, u32 mode,
1216 				    unsigned long lo, unsigned long hi,
1217 				    struct zram_pp_ctl *ctl)
1218 {
1219 	u32 index = lo;
1220 
1221 	while (index < hi) {
1222 		bool ok = true;
1223 
1224 		slot_lock(zram, index);
1225 		if (!slot_allocated(zram, index))
1226 			goto next;
1227 
1228 		if (test_slot_flag(zram, index, ZRAM_WB) ||
1229 		    test_slot_flag(zram, index, ZRAM_SAME))
1230 			goto next;
1231 
1232 		if (mode & IDLE_WRITEBACK &&
1233 		    !test_slot_flag(zram, index, ZRAM_IDLE))
1234 			goto next;
1235 		if (mode & HUGE_WRITEBACK &&
1236 		    !test_slot_flag(zram, index, ZRAM_HUGE))
1237 			goto next;
1238 		if (mode & INCOMPRESSIBLE_WRITEBACK &&
1239 		    !test_slot_flag(zram, index, ZRAM_INCOMPRESSIBLE))
1240 			goto next;
1241 
1242 		ok = place_pp_slot(zram, ctl, index);
1243 next:
1244 		slot_unlock(zram, index);
1245 		if (!ok)
1246 			break;
1247 		index++;
1248 	}
1249 
1250 	return 0;
1251 }
1252 
1253 static ssize_t writeback_store(struct device *dev,
1254 			       struct device_attribute *attr,
1255 			       const char *buf, size_t len)
1256 {
1257 	struct zram *zram = dev_to_zram(dev);
1258 	u64 nr_pages = zram->disksize >> PAGE_SHIFT;
1259 	unsigned long lo = 0, hi = nr_pages;
1260 	struct zram_pp_ctl *pp_ctl = NULL;
1261 	struct zram_wb_ctl *wb_ctl = NULL;
1262 	char *args, *param, *val;
1263 	ssize_t ret = len;
1264 	int err, mode = 0;
1265 
1266 	guard(rwsem_write)(&zram->dev_lock);
1267 	if (!init_done(zram))
1268 		return -EINVAL;
1269 
1270 	if (!zram->backing_dev)
1271 		return -ENODEV;
1272 
1273 	pp_ctl = init_pp_ctl();
1274 	if (!pp_ctl)
1275 		return -ENOMEM;
1276 
1277 	wb_ctl = init_wb_ctl(zram);
1278 	if (!wb_ctl) {
1279 		ret = -ENOMEM;
1280 		goto out;
1281 	}
1282 
1283 	args = skip_spaces(buf);
1284 	while (*args) {
1285 		args = next_arg(args, &param, &val);
1286 
1287 		/*
1288 		 * Workaround to support the old writeback interface.
1289 		 *
1290 		 * The old writeback interface has a minor inconsistency and
1291 		 * requires key=value only for the page_index parameter, while the
1292 		 * writeback mode is a valueless parameter.
1293 		 *
1294 		 * This is not the case anymore and now all parameters are
1295 		 * required to have values, however, we need to support the
1296 		 * legacy writeback interface format so we check if we can
1297 		 * recognize a valueless parameter as the (legacy) writeback
1298 		 * mode.
1299 		 */
1300 		if (!val || !*val) {
1301 			err = parse_mode(param, &mode);
1302 			if (err) {
1303 				ret = err;
1304 				goto out;
1305 			}
1306 
1307 			scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl);
1308 			break;
1309 		}
1310 
1311 		if (!strcmp(param, "type")) {
1312 			err = parse_mode(val, &mode);
1313 			if (err) {
1314 				ret = err;
1315 				goto out;
1316 			}
1317 
1318 			scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl);
1319 			break;
1320 		}
1321 
1322 		if (!strcmp(param, "page_index")) {
1323 			err = parse_page_index(val, nr_pages, &lo, &hi);
1324 			if (err) {
1325 				ret = err;
1326 				goto out;
1327 			}
1328 
1329 			scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl);
1330 			continue;
1331 		}
1332 
1333 		if (!strcmp(param, "page_indexes")) {
1334 			err = parse_page_indexes(val, nr_pages, &lo, &hi);
1335 			if (err) {
1336 				ret = err;
1337 				goto out;
1338 			}
1339 
1340 			scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl);
1341 			continue;
1342 		}
1343 	}
1344 
1345 	err = zram_writeback_slots(zram, pp_ctl, wb_ctl);
1346 	if (err)
1347 		ret = err;
1348 
1349 out:
1350 	release_pp_ctl(zram, pp_ctl);
1351 	release_wb_ctl(wb_ctl);
1352 
1353 	return ret;
1354 }
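
/*
 * Illustrative writeback triggers, assuming the usual /sys/block/zramX
 * sysfs layout (the parameter syntax follows the parsing loop above):
 *
 *	echo idle > /sys/block/zram0/writeback
 *		legacy valueless form: write back all idle slots
 *
 *	echo "type=huge_idle" > /sys/block/zram0/writeback
 *		write back slots that are both huge and idle
 *
 *	echo "page_indexes=10-20" > /sys/block/zram0/writeback
 *		write back eligible slots with indexes 10..20 (inclusive)
 */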
1355 
1356 static int decompress_bdev_page(struct zram *zram, struct page *page, u32 index)
1357 {
1358 	struct zcomp_strm *zstrm;
1359 	unsigned int size;
1360 	int ret, prio;
1361 	void *src;
1362 
1363 	slot_lock(zram, index);
1364 	/* Since the slot was unlocked, we need to make sure it's still ZRAM_WB */
1365 	if (!test_slot_flag(zram, index, ZRAM_WB)) {
1366 		slot_unlock(zram, index);
1367 		/* We read some stale data, zero it out */
1368 		memset_page(page, 0, 0, PAGE_SIZE);
1369 		return -EIO;
1370 	}
1371 
1372 	if (test_slot_flag(zram, index, ZRAM_HUGE)) {
1373 		slot_unlock(zram, index);
1374 		return 0;
1375 	}
1376 
1377 	size = get_slot_size(zram, index);
1378 	prio = get_slot_comp_priority(zram, index);
1379 
1380 	zstrm = zcomp_stream_get(zram->comps[prio]);
1381 	src = kmap_local_page(page);
1382 	ret = zcomp_decompress(zram->comps[prio], zstrm, src, size,
1383 			       zstrm->local_copy);
1384 	if (!ret)
1385 		copy_page(src, zstrm->local_copy);
1386 	kunmap_local(src);
1387 	zcomp_stream_put(zstrm);
1388 	slot_unlock(zram, index);
1389 
1390 	return ret;
1391 }
1392 
1393 static void zram_deferred_decompress(struct work_struct *w)
1394 {
1395 	struct zram_rb_req *req = container_of(w, struct zram_rb_req, work);
1396 	struct page *page = bio_first_page_all(req->bio);
1397 	struct zram *zram = req->zram;
1398 	u32 index = req->index;
1399 	int ret;
1400 
1401 	ret = decompress_bdev_page(zram, page, index);
1402 	if (ret)
1403 		req->parent->bi_status = BLK_STS_IOERR;
1404 
1405 	/* Decrement parent's ->remaining */
1406 	bio_endio(req->parent);
1407 	bio_put(req->bio);
1408 	kfree(req);
1409 }
1410 
1411 static void zram_async_read_endio(struct bio *bio)
1412 {
1413 	struct zram_rb_req *req = bio->bi_private;
1414 	struct zram *zram = req->zram;
1415 
1416 	if (bio->bi_status) {
1417 		req->parent->bi_status = bio->bi_status;
1418 		bio_endio(req->parent);
1419 		bio_put(bio);
1420 		kfree(req);
1421 		return;
1422 	}
1423 
1424 	/*
1425 	 * NOTE: zram_async_read_endio() is not exactly the right place for
1426 	 * this. Ideally, we would do it after the ZRAM_WB check, but that
1427 	 * requires using the wq path even on systems without compressed
1428 	 * writeback, because we cannot take the slot lock in this context.
1429 	 *
1430 	 * Keep the existing behavior for now.
1431 	 */
1432 	if (zram->wb_compressed == false) {
1433 		/* No decompression needed, complete the parent IO */
1434 		bio_endio(req->parent);
1435 		bio_put(bio);
1436 		kfree(req);
1437 		return;
1438 	}
1439 
1440 	/*
1441 	 * zram decompression may sleep, so we need to defer it to
1442 	 * a preemptible context.
1443 	 */
1444 	INIT_WORK(&req->work, zram_deferred_decompress);
1445 	queue_work(system_highpri_wq, &req->work);
1446 }
1447 
1448 static void read_from_bdev_async(struct zram *zram, struct page *page,
1449 				 u32 index, unsigned long blk_idx,
1450 				 struct bio *parent)
1451 {
1452 	struct zram_rb_req *req;
1453 	struct bio *bio;
1454 
1455 	req = kmalloc(sizeof(*req), GFP_NOIO);
1456 	if (!req)
1457 		return;
1458 
1459 	bio = bio_alloc(zram->bdev, 1, parent->bi_opf, GFP_NOIO);
1460 	if (!bio) {
1461 		kfree(req);
1462 		return;
1463 	}
1464 
1465 	req->zram = zram;
1466 	req->index = index;
1467 	req->blk_idx = blk_idx;
1468 	req->bio = bio;
1469 	req->parent = parent;
1470 
1471 	bio->bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9);
1472 	bio->bi_private = req;
1473 	bio->bi_end_io = zram_async_read_endio;
1474 
1475 	__bio_add_page(bio, page, PAGE_SIZE, 0);
1476 	bio_inc_remaining(parent);
1477 	submit_bio(bio);
1478 }
1479 
1480 static void zram_sync_read(struct work_struct *w)
1481 {
1482 	struct zram_rb_req *req = container_of(w, struct zram_rb_req, work);
1483 	struct bio_vec bv;
1484 	struct bio bio;
1485 
1486 	bio_init(&bio, req->zram->bdev, &bv, 1, REQ_OP_READ);
1487 	bio.bi_iter.bi_sector = req->blk_idx * (PAGE_SIZE >> 9);
1488 	__bio_add_page(&bio, req->page, PAGE_SIZE, 0);
1489 	req->error = submit_bio_wait(&bio);
1490 }
1491 
1492 /*
1493  * The block layer wants one ->submit_bio to be active at a time, so issuing
1494  * chained IO from the same context as the parent IO would deadlock. To
1495  * avoid that, use a worker thread context.
1496  */
1497 static int read_from_bdev_sync(struct zram *zram, struct page *page, u32 index,
1498 			       unsigned long blk_idx)
1499 {
1500 	struct zram_rb_req req;
1501 
1502 	req.page = page;
1503 	req.zram = zram;
1504 	req.blk_idx = blk_idx;
1505 
1506 	INIT_WORK_ONSTACK(&req.work, zram_sync_read);
1507 	queue_work(system_dfl_wq, &req.work);
1508 	flush_work(&req.work);
1509 	destroy_work_on_stack(&req.work);
1510 
1511 	if (req.error || zram->wb_compressed == false)
1512 		return req.error;
1513 
1514 	return decompress_bdev_page(zram, page, index);
1515 }
1516 
1517 static int read_from_bdev(struct zram *zram, struct page *page, u32 index,
1518 			  unsigned long blk_idx, struct bio *parent)
1519 {
1520 	atomic64_inc(&zram->stats.bd_reads);
1521 	if (!parent) {
1522 		if (WARN_ON_ONCE(!IS_ENABLED(ZRAM_PARTIAL_IO)))
1523 			return -EIO;
1524 		return read_from_bdev_sync(zram, page, index, blk_idx);
1525 	}
1526 	read_from_bdev_async(zram, page, index, blk_idx, parent);
1527 	return 0;
1528 }
1529 #else
1530 static inline void reset_bdev(struct zram *zram) {};
1531 static int read_from_bdev(struct zram *zram, struct page *page, u32 index,
1532 			  unsigned long blk_idx, struct bio *parent)
1533 {
1534 	return -EIO;
1535 }
1536 
1537 static void zram_release_bdev_block(struct zram *zram, unsigned long blk_idx)
1538 {
1539 }
1540 #endif
1541 
1542 #ifdef CONFIG_ZRAM_MEMORY_TRACKING
1543 
1544 static struct dentry *zram_debugfs_root;
1545 
1546 static void zram_debugfs_create(void)
1547 {
1548 	zram_debugfs_root = debugfs_create_dir("zram", NULL);
1549 }
1550 
1551 static void zram_debugfs_destroy(void)
1552 {
1553 	debugfs_remove_recursive(zram_debugfs_root);
1554 }
1555 
1556 static ssize_t read_block_state(struct file *file, char __user *buf,
1557 				size_t count, loff_t *ppos)
1558 {
1559 	char *kbuf;
1560 	ssize_t index, written = 0;
1561 	struct zram *zram = file->private_data;
1562 	unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
1563 
1564 	kbuf = kvmalloc(count, GFP_KERNEL);
1565 	if (!kbuf)
1566 		return -ENOMEM;
1567 
1568 	guard(rwsem_read)(&zram->dev_lock);
1569 	if (!init_done(zram)) {
1570 		kvfree(kbuf);
1571 		return -EINVAL;
1572 	}
1573 
1574 	for (index = *ppos; index < nr_pages; index++) {
1575 		int copied;
1576 
1577 		slot_lock(zram, index);
1578 		if (!slot_allocated(zram, index))
1579 			goto next;
1580 
1581 		copied = snprintf(kbuf + written, count,
1582 			"%12zd %12u.%06d %c%c%c%c%c%c\n",
1583 			index, zram->table[index].attr.ac_time, 0,
1584 			test_slot_flag(zram, index, ZRAM_SAME) ? 's' : '.',
1585 			test_slot_flag(zram, index, ZRAM_WB) ? 'w' : '.',
1586 			test_slot_flag(zram, index, ZRAM_HUGE) ? 'h' : '.',
1587 			test_slot_flag(zram, index, ZRAM_IDLE) ? 'i' : '.',
1588 			get_slot_comp_priority(zram, index) ? 'r' : '.',
1589 			test_slot_flag(zram, index,
1590 				       ZRAM_INCOMPRESSIBLE) ? 'n' : '.');
1591 
1592 		if (count <= copied) {
1593 			slot_unlock(zram, index);
1594 			break;
1595 		}
1596 		written += copied;
1597 		count -= copied;
1598 next:
1599 		slot_unlock(zram, index);
1600 		*ppos += 1;
1601 	}
1602 
1603 	if (copy_to_user(buf, kbuf, written))
1604 		written = -EFAULT;
1605 	kvfree(kbuf);
1606 
1607 	return written;
1608 }
1609 
1610 static const struct file_operations proc_zram_block_state_op = {
1611 	.open = simple_open,
1612 	.read = read_block_state,
1613 	.llseek = default_llseek,
1614 };
1615 
1616 static void zram_debugfs_register(struct zram *zram)
1617 {
1618 	if (!zram_debugfs_root)
1619 		return;
1620 
1621 	zram->debugfs_dir = debugfs_create_dir(zram->disk->disk_name,
1622 						zram_debugfs_root);
1623 	debugfs_create_file("block_state", 0400, zram->debugfs_dir,
1624 				zram, &proc_zram_block_state_op);
1625 }
1626 
1627 static void zram_debugfs_unregister(struct zram *zram)
1628 {
1629 	debugfs_remove_recursive(zram->debugfs_dir);
1630 }
1631 #else
1632 static void zram_debugfs_create(void) {};
1633 static void zram_debugfs_destroy(void) {};
1634 static void zram_debugfs_register(struct zram *zram) {};
1635 static void zram_debugfs_unregister(struct zram *zram) {};
1636 #endif
1637 
1638 static void comp_algorithm_set(struct zram *zram, u32 prio, const char *alg)
1639 {
1640 	/* Do not free statically defined compression algorithms */
1641 	if (zram->comp_algs[prio] != default_compressor)
1642 		kfree(zram->comp_algs[prio]);
1643 
1644 	zram->comp_algs[prio] = alg;
1645 }
1646 
1647 static int __comp_algorithm_store(struct zram *zram, u32 prio, const char *buf)
1648 {
1649 	char *compressor;
1650 	size_t sz;
1651 
1652 	sz = strlen(buf);
1653 	if (sz >= ZRAM_MAX_ALGO_NAME_SZ)
1654 		return -E2BIG;
1655 
1656 	compressor = kstrdup(buf, GFP_KERNEL);
1657 	if (!compressor)
1658 		return -ENOMEM;
1659 
1660 	/* ignore trailing newline */
1661 	if (sz > 0 && compressor[sz - 1] == '\n')
1662 		compressor[sz - 1] = 0x00;
1663 
1664 	if (!zcomp_available_algorithm(compressor)) {
1665 		kfree(compressor);
1666 		return -EINVAL;
1667 	}
1668 
1669 	guard(rwsem_write)(&zram->dev_lock);
1670 	if (init_done(zram)) {
1671 		kfree(compressor);
1672 		pr_info("Can't change algorithm for initialized device\n");
1673 		return -EBUSY;
1674 	}
1675 
1676 	comp_algorithm_set(zram, prio, compressor);
1677 	return 0;
1678 }
1679 
1680 static void comp_params_reset(struct zram *zram, u32 prio)
1681 {
1682 	struct zcomp_params *params = &zram->params[prio];
1683 
1684 	vfree(params->dict);
1685 	params->level = ZCOMP_PARAM_NOT_SET;
1686 	params->deflate.winbits = ZCOMP_PARAM_NOT_SET;
1687 	params->dict_sz = 0;
1688 	params->dict = NULL;
1689 }
1690 
1691 static int comp_params_store(struct zram *zram, u32 prio, s32 level,
1692 			     const char *dict_path,
1693 			     struct deflate_params *deflate_params)
1694 {
1695 	ssize_t sz = 0;
1696 
1697 	comp_params_reset(zram, prio);
1698 
1699 	if (dict_path) {
1700 		sz = kernel_read_file_from_path(dict_path, 0,
1701 						&zram->params[prio].dict,
1702 						INT_MAX,
1703 						NULL,
1704 						READING_POLICY);
1705 		if (sz < 0)
1706 			return -EINVAL;
1707 	}
1708 
1709 	zram->params[prio].dict_sz = sz;
1710 	zram->params[prio].level = level;
1711 	zram->params[prio].deflate.winbits = deflate_params->winbits;
1712 	return 0;
1713 }
1714 
1715 static ssize_t algorithm_params_store(struct device *dev,
1716 				      struct device_attribute *attr,
1717 				      const char *buf,
1718 				      size_t len)
1719 {
1720 	s32 prio = ZRAM_PRIMARY_COMP, level = ZCOMP_PARAM_NOT_SET;
1721 	char *args, *param, *val, *algo = NULL, *dict_path = NULL;
1722 	struct deflate_params deflate_params;
1723 	struct zram *zram = dev_to_zram(dev);
1724 	int ret;
1725 
1726 	deflate_params.winbits = ZCOMP_PARAM_NOT_SET;
1727 
1728 	args = skip_spaces(buf);
1729 	while (*args) {
1730 		args = next_arg(args, &param, &val);
1731 
1732 		if (!val || !*val)
1733 			return -EINVAL;
1734 
1735 		if (!strcmp(param, "priority")) {
1736 			ret = kstrtoint(val, 10, &prio);
1737 			if (ret)
1738 				return ret;
1739 			continue;
1740 		}
1741 
1742 		if (!strcmp(param, "level")) {
1743 			ret = kstrtoint(val, 10, &level);
1744 			if (ret)
1745 				return ret;
1746 			continue;
1747 		}
1748 
1749 		if (!strcmp(param, "algo")) {
1750 			algo = val;
1751 			continue;
1752 		}
1753 
1754 		if (!strcmp(param, "dict")) {
1755 			dict_path = val;
1756 			continue;
1757 		}
1758 
1759 		if (!strcmp(param, "deflate.winbits")) {
1760 			ret = kstrtoint(val, 10, &deflate_params.winbits);
1761 			if (ret)
1762 				return ret;
1763 			continue;
1764 		}
1765 	}
1766 
1767 	/* Lookup priority by algorithm name */
1768 	if (algo) {
1769 		s32 p;
1770 
1771 		prio = -EINVAL;
1772 		for (p = ZRAM_PRIMARY_COMP; p < ZRAM_MAX_COMPS; p++) {
1773 			if (!zram->comp_algs[p])
1774 				continue;
1775 
1776 			if (!strcmp(zram->comp_algs[p], algo)) {
1777 				prio = p;
1778 				break;
1779 			}
1780 		}
1781 	}
1782 
1783 	if (prio < ZRAM_PRIMARY_COMP || prio >= ZRAM_MAX_COMPS)
1784 		return -EINVAL;
1785 
1786 	ret = comp_params_store(zram, prio, level, dict_path, &deflate_params);
1787 	return ret ? ret : len;
1788 }
1789 
1790 static ssize_t comp_algorithm_show(struct device *dev,
1791 				   struct device_attribute *attr,
1792 				   char *buf)
1793 {
1794 	struct zram *zram = dev_to_zram(dev);
1795 	ssize_t sz;
1796 
1797 	guard(rwsem_read)(&zram->dev_lock);
1798 	sz = zcomp_available_show(zram->comp_algs[ZRAM_PRIMARY_COMP], buf, 0);
1799 	return sz;
1800 }
1801 
1802 static ssize_t comp_algorithm_store(struct device *dev,
1803 				    struct device_attribute *attr,
1804 				    const char *buf,
1805 				    size_t len)
1806 {
1807 	struct zram *zram = dev_to_zram(dev);
1808 	int ret;
1809 
1810 	ret = __comp_algorithm_store(zram, ZRAM_PRIMARY_COMP, buf);
1811 	return ret ? ret : len;
1812 }
1813 
1814 #ifdef CONFIG_ZRAM_MULTI_COMP
1815 static ssize_t recomp_algorithm_show(struct device *dev,
1816 				     struct device_attribute *attr,
1817 				     char *buf)
1818 {
1819 	struct zram *zram = dev_to_zram(dev);
1820 	ssize_t sz = 0;
1821 	u32 prio;
1822 
1823 	guard(rwsem_read)(&zram->dev_lock);
1824 	for (prio = ZRAM_SECONDARY_COMP; prio < ZRAM_MAX_COMPS; prio++) {
1825 		if (!zram->comp_algs[prio])
1826 			continue;
1827 
1828 		sz += sysfs_emit_at(buf, sz, "#%d: ", prio);
1829 		sz += zcomp_available_show(zram->comp_algs[prio], buf, sz);
1830 	}
1831 	return sz;
1832 }
1833 
1834 static ssize_t recomp_algorithm_store(struct device *dev,
1835 				      struct device_attribute *attr,
1836 				      const char *buf,
1837 				      size_t len)
1838 {
1839 	struct zram *zram = dev_to_zram(dev);
1840 	int prio = ZRAM_SECONDARY_COMP;
1841 	char *args, *param, *val;
1842 	char *alg = NULL;
1843 	int ret;
1844 
1845 	args = skip_spaces(buf);
1846 	while (*args) {
1847 		args = next_arg(args, &param, &val);
1848 
1849 		if (!val || !*val)
1850 			return -EINVAL;
1851 
1852 		if (!strcmp(param, "algo")) {
1853 			alg = val;
1854 			continue;
1855 		}
1856 
1857 		if (!strcmp(param, "priority")) {
1858 			ret = kstrtoint(val, 10, &prio);
1859 			if (ret)
1860 				return ret;
1861 			continue;
1862 		}
1863 	}
1864 
1865 	if (!alg)
1866 		return -EINVAL;
1867 
1868 	if (prio < ZRAM_SECONDARY_COMP || prio >= ZRAM_MAX_COMPS)
1869 		return -EINVAL;
1870 
1871 	ret = __comp_algorithm_store(zram, prio, alg);
1872 	return ret ? ret : len;
1873 }
1874 #endif
1875 
1876 static ssize_t compact_store(struct device *dev, struct device_attribute *attr,
1877 			     const char *buf, size_t len)
1878 {
1879 	struct zram *zram = dev_to_zram(dev);
1880 
1881 	guard(rwsem_read)(&zram->dev_lock);
1882 	if (!init_done(zram))
1883 		return -EINVAL;
1884 
1885 	zs_compact(zram->mem_pool);
1886 
1887 	return len;
1888 }
1889 
1890 static ssize_t io_stat_show(struct device *dev, struct device_attribute *attr,
1891 			    char *buf)
1892 {
1893 	struct zram *zram = dev_to_zram(dev);
1894 	ssize_t ret;
1895 
1896 	guard(rwsem_read)(&zram->dev_lock);
1897 	ret = sysfs_emit(buf,
1898 			"%8llu %8llu 0 %8llu\n",
1899 			(u64)atomic64_read(&zram->stats.failed_reads),
1900 			(u64)atomic64_read(&zram->stats.failed_writes),
1901 			(u64)atomic64_read(&zram->stats.notify_free));
1902 
1903 	return ret;
1904 }
1905 
1906 static ssize_t mm_stat_show(struct device *dev, struct device_attribute *attr,
1907 			    char *buf)
1908 {
1909 	struct zram *zram = dev_to_zram(dev);
1910 	struct zs_pool_stats pool_stats;
1911 	u64 orig_size, mem_used = 0;
1912 	long max_used;
1913 	ssize_t ret;
1914 
1915 	memset(&pool_stats, 0x00, sizeof(struct zs_pool_stats));
1916 
1917 	guard(rwsem_read)(&zram->dev_lock);
1918 	if (init_done(zram)) {
1919 		mem_used = zs_get_total_pages(zram->mem_pool);
1920 		zs_pool_stats(zram->mem_pool, &pool_stats);
1921 	}
1922 
1923 	orig_size = atomic64_read(&zram->stats.pages_stored);
1924 	max_used = atomic_long_read(&zram->stats.max_used_pages);
1925 
1926 	ret = sysfs_emit(buf,
1927 			"%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n",
1928 			orig_size << PAGE_SHIFT,
1929 			(u64)atomic64_read(&zram->stats.compr_data_size),
1930 			mem_used << PAGE_SHIFT,
1931 			zram->limit_pages << PAGE_SHIFT,
1932 			max_used << PAGE_SHIFT,
1933 			(u64)atomic64_read(&zram->stats.same_pages),
1934 			atomic_long_read(&pool_stats.pages_compacted),
1935 			(u64)atomic64_read(&zram->stats.huge_pages),
1936 			(u64)atomic64_read(&zram->stats.huge_pages_since));
1937 
1938 	return ret;
1939 }
1940 
1941 static ssize_t debug_stat_show(struct device *dev,
1942 			       struct device_attribute *attr, char *buf)
1943 {
1944 	int version = 1;
1945 	struct zram *zram = dev_to_zram(dev);
1946 	ssize_t ret;
1947 
1948 	guard(rwsem_read)(&zram->dev_lock);
1949 	ret = sysfs_emit(buf,
1950 			"version: %d\n0 %8llu\n",
1951 			version,
1952 			(u64)atomic64_read(&zram->stats.miss_free));
1953 
1954 	return ret;
1955 }
1956 
1957 static void zram_meta_free(struct zram *zram, u64 disksize)
1958 {
1959 	size_t num_pages = disksize >> PAGE_SHIFT;
1960 	size_t index;
1961 
1962 	if (!zram->table)
1963 		return;
1964 
1965 	/* Free all pages that are still in this zram device */
1966 	for (index = 0; index < num_pages; index++)
1967 		slot_free(zram, index);
1968 
1969 	zs_destroy_pool(zram->mem_pool);
1970 	vfree(zram->table);
1971 	zram->table = NULL;
1972 }
1973 
1974 static bool zram_meta_alloc(struct zram *zram, u64 disksize)
1975 {
1976 	size_t num_pages, index;
1977 
1978 	num_pages = disksize >> PAGE_SHIFT;
1979 	zram->table = vzalloc(array_size(num_pages, sizeof(*zram->table)));
1980 	if (!zram->table)
1981 		return false;
1982 
1983 	zram->mem_pool = zs_create_pool(zram->disk->disk_name);
1984 	if (!zram->mem_pool) {
1985 		vfree(zram->table);
1986 		zram->table = NULL;
1987 		return false;
1988 	}
1989 
1990 	if (!huge_class_size)
1991 		huge_class_size = zs_huge_class_size(zram->mem_pool);
1992 
1993 	for (index = 0; index < num_pages; index++)
1994 		slot_lock_init(zram, index);
1995 
1996 	return true;
1997 }
1998 
1999 static void slot_free(struct zram *zram, u32 index)
2000 {
2001 	unsigned long handle;
2002 
2003 #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME
2004 	zram->table[index].attr.ac_time = 0;
2005 #endif
2006 
2007 	clear_slot_flag(zram, index, ZRAM_IDLE);
2008 	clear_slot_flag(zram, index, ZRAM_INCOMPRESSIBLE);
2009 	clear_slot_flag(zram, index, ZRAM_PP_SLOT);
2010 	set_slot_comp_priority(zram, index, 0);
2011 
2012 	if (test_slot_flag(zram, index, ZRAM_HUGE)) {
2013 		clear_slot_flag(zram, index, ZRAM_HUGE);
2014 		atomic64_dec(&zram->stats.huge_pages);
2015 	}
2016 
2017 	if (test_slot_flag(zram, index, ZRAM_WB)) {
2018 		clear_slot_flag(zram, index, ZRAM_WB);
2019 		zram_release_bdev_block(zram, get_slot_handle(zram, index));
2020 		goto out;
2021 	}
2022 
2023 	/*
2024 	 * No memory is allocated for same-element-filled pages.
2025 	 * Simply clear the same-page flag.
2026 	 */
2027 	if (test_slot_flag(zram, index, ZRAM_SAME)) {
2028 		clear_slot_flag(zram, index, ZRAM_SAME);
2029 		atomic64_dec(&zram->stats.same_pages);
2030 		goto out;
2031 	}
2032 
2033 	handle = get_slot_handle(zram, index);
2034 	if (!handle)
2035 		return;
2036 
2037 	zs_free(zram->mem_pool, handle);
2038 
2039 	atomic64_sub(get_slot_size(zram, index),
2040 		     &zram->stats.compr_data_size);
2041 out:
2042 	atomic64_dec(&zram->stats.pages_stored);
2043 	set_slot_handle(zram, index, 0);
2044 	set_slot_size(zram, index, 0);
2045 }
2046 
2047 static int read_same_filled_page(struct zram *zram, struct page *page,
2048 				 u32 index)
2049 {
2050 	void *mem;
2051 
2052 	mem = kmap_local_page(page);
2053 	zram_fill_page(mem, PAGE_SIZE, get_slot_handle(zram, index));
2054 	kunmap_local(mem);
2055 	return 0;
2056 }
2057 
2058 static int read_incompressible_page(struct zram *zram, struct page *page,
2059 				    u32 index)
2060 {
2061 	unsigned long handle;
2062 	void *src, *dst;
2063 
2064 	handle = get_slot_handle(zram, index);
2065 	src = zs_obj_read_begin(zram->mem_pool, handle, PAGE_SIZE, NULL);
2066 	dst = kmap_local_page(page);
2067 	copy_page(dst, src);
2068 	kunmap_local(dst);
2069 	zs_obj_read_end(zram->mem_pool, handle, PAGE_SIZE, src);
2070 
2071 	return 0;
2072 }
2073 
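/* Decompress the object using the algorithm (priority) it was compressed with. */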
2074 static int read_compressed_page(struct zram *zram, struct page *page, u32 index)
2075 {
2076 	struct zcomp_strm *zstrm;
2077 	unsigned long handle;
2078 	unsigned int size;
2079 	void *src, *dst;
2080 	int ret, prio;
2081 
2082 	handle = get_slot_handle(zram, index);
2083 	size = get_slot_size(zram, index);
2084 	prio = get_slot_comp_priority(zram, index);
2085 
2086 	zstrm = zcomp_stream_get(zram->comps[prio]);
2087 	src = zs_obj_read_begin(zram->mem_pool, handle, size,
2088 				zstrm->local_copy);
2089 	dst = kmap_local_page(page);
2090 	ret = zcomp_decompress(zram->comps[prio], zstrm, src, size, dst);
2091 	kunmap_local(dst);
2092 	zs_obj_read_end(zram->mem_pool, handle, size, src);
2093 	zcomp_stream_put(zstrm);
2094 
2095 	return ret;
2096 }
2097 
2098 #ifdef CONFIG_ZRAM_WRITEBACK
2099 static int read_from_zspool_raw(struct zram *zram, struct page *page, u32 index)
2100 {
2101 	struct zcomp_strm *zstrm;
2102 	unsigned long handle;
2103 	unsigned int size;
2104 	void *src;
2105 
2106 	handle = get_slot_handle(zram, index);
2107 	size = get_slot_size(zram, index);
2108 
2109 	/*
2110 	 * We need to get the stream just for its ->local_copy buffer, in
2111 	 * case the object spans two physical pages. No decompression
2112 	 * takes place here, as we read raw compressed data.
2113 	 */
2114 	zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_COMP]);
2115 	src = zs_obj_read_begin(zram->mem_pool, handle, size,
2116 				zstrm->local_copy);
2117 	memcpy_to_page(page, 0, src, size);
2118 	zs_obj_read_end(zram->mem_pool, handle, size, src);
2119 	zcomp_stream_put(zstrm);
2120 
2121 	return 0;
2122 }
2123 #endif
2124 
2125 /*
2126  * Reads (decompresses if needed) a page from zspool (zsmalloc).
2127  * Corresponding ZRAM slot should be locked.
2128  */
2129 static int read_from_zspool(struct zram *zram, struct page *page, u32 index)
2130 {
2131 	if (test_slot_flag(zram, index, ZRAM_SAME) ||
2132 	    !get_slot_handle(zram, index))
2133 		return read_same_filled_page(zram, page, index);
2134 
2135 	if (!test_slot_flag(zram, index, ZRAM_HUGE))
2136 		return read_compressed_page(zram, page, index);
2137 	else
2138 		return read_incompressible_page(zram, page, index);
2139 }
2140 
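/*
 * Read a page either from the zspool or, if the slot was written back,
 * from the backing device.
 */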
2141 static int zram_read_page(struct zram *zram, struct page *page, u32 index,
2142 			  struct bio *parent)
2143 {
2144 	int ret;
2145 
2146 	slot_lock(zram, index);
2147 	if (!test_slot_flag(zram, index, ZRAM_WB)) {
2148 		/* Slot should be locked throughout the function call */
2149 		ret = read_from_zspool(zram, page, index);
2150 		slot_unlock(zram, index);
2151 	} else {
2152 		unsigned long blk_idx = get_slot_handle(zram, index);
2153 
2154 		/*
2155 		 * The slot should be unlocked before reading from the backing
2156 		 * device.
2157 		 */
2158 		slot_unlock(zram, index);
2159 		ret = read_from_bdev(zram, page, index, blk_idx, parent);
2160 	}
2161 
2162 	/* Should NEVER happen. Return bio error if it does. */
2163 	if (WARN_ON(ret < 0))
2164 		pr_err("Decompression failed! err=%d, page=%u\n", ret, index);
2165 
2166 	return ret;
2167 }
2168 
2169 /*
2170  * Use a temporary buffer to decompress the page, as the decompressor
2171  * always expects a full page for the output.
2172  */
2173 static int zram_bvec_read_partial(struct zram *zram, struct bio_vec *bvec,
2174 				  u32 index, int offset)
2175 {
2176 	struct page *page = alloc_page(GFP_NOIO);
2177 	int ret;
2178 
2179 	if (!page)
2180 		return -ENOMEM;
2181 	ret = zram_read_page(zram, page, index, NULL);
2182 	if (likely(!ret))
2183 		memcpy_to_bvec(bvec, page_address(page) + offset);
2184 	__free_page(page);
2185 	return ret;
2186 }
2187 
2188 static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
2189 			  u32 index, int offset, struct bio *bio)
2190 {
2191 	if (is_partial_io(bvec))
2192 		return zram_bvec_read_partial(zram, bvec, index, offset);
2193 	return zram_read_page(zram, bvec->bv_page, index, bio);
2194 }
2195 
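/*
 * Record a same-filled page by storing its fill pattern directly in the
 * slot handle; no zsmalloc allocation is needed.
 */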
2196 static int write_same_filled_page(struct zram *zram, unsigned long fill,
2197 				  u32 index)
2198 {
2199 	slot_lock(zram, index);
2200 	slot_free(zram, index);
2201 	set_slot_flag(zram, index, ZRAM_SAME);
2202 	set_slot_handle(zram, index, fill);
2203 	slot_unlock(zram, index);
2204 
2205 	atomic64_inc(&zram->stats.same_pages);
2206 	atomic64_inc(&zram->stats.pages_stored);
2207 
2208 	return 0;
2209 }
2210 
2211 static int write_incompressible_page(struct zram *zram, struct page *page,
2212 				     u32 index)
2213 {
2214 	unsigned long handle;
2215 	void *src;
2216 
2217 	/*
2218 	 * This function is called from a preemptible context, so we don't
2219 	 * need to do an optimistic handle allocation with a pessimistic
2220 	 * fallback, like we do for compressible pages.
2221 	 */
2222 	handle = zs_malloc(zram->mem_pool, PAGE_SIZE,
2223 			   GFP_NOIO | __GFP_NOWARN |
2224 			   __GFP_HIGHMEM | __GFP_MOVABLE, page_to_nid(page));
2225 	if (IS_ERR_VALUE(handle))
2226 		return PTR_ERR((void *)handle);
2227 
2228 	if (!zram_can_store_page(zram)) {
2229 		zs_free(zram->mem_pool, handle);
2230 		return -ENOMEM;
2231 	}
2232 
2233 	src = kmap_local_page(page);
2234 	zs_obj_write(zram->mem_pool, handle, src, PAGE_SIZE);
2235 	kunmap_local(src);
2236 
2237 	slot_lock(zram, index);
2238 	slot_free(zram, index);
2239 	set_slot_flag(zram, index, ZRAM_HUGE);
2240 	set_slot_handle(zram, index, handle);
2241 	set_slot_size(zram, index, PAGE_SIZE);
2242 	slot_unlock(zram, index);
2243 
2244 	atomic64_add(PAGE_SIZE, &zram->stats.compr_data_size);
2245 	atomic64_inc(&zram->stats.huge_pages);
2246 	atomic64_inc(&zram->stats.huge_pages_since);
2247 	atomic64_inc(&zram->stats.pages_stored);
2248 
2249 	return 0;
2250 }
2251 
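/*
 * Compress and store a page: same-filled pages are recorded by pattern,
 * pages that don't compress below huge_class_size are stored raw (huge),
 * and everything else is stored compressed with the primary algorithm.
 */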
2252 static int zram_write_page(struct zram *zram, struct page *page, u32 index)
2253 {
2254 	int ret = 0;
2255 	unsigned long handle;
2256 	unsigned int comp_len;
2257 	void *mem;
2258 	struct zcomp_strm *zstrm;
2259 	unsigned long element;
2260 	bool same_filled;
2261 
2262 	mem = kmap_local_page(page);
2263 	same_filled = page_same_filled(mem, &element);
2264 	kunmap_local(mem);
2265 	if (same_filled)
2266 		return write_same_filled_page(zram, element, index);
2267 
2268 	zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_COMP]);
2269 	mem = kmap_local_page(page);
2270 	ret = zcomp_compress(zram->comps[ZRAM_PRIMARY_COMP], zstrm,
2271 			     mem, &comp_len);
2272 	kunmap_local(mem);
2273 
2274 	if (unlikely(ret)) {
2275 		zcomp_stream_put(zstrm);
2276 		pr_err("Compression failed! err=%d\n", ret);
2277 		return ret;
2278 	}
2279 
2280 	if (comp_len >= huge_class_size) {
2281 		zcomp_stream_put(zstrm);
2282 		return write_incompressible_page(zram, page, index);
2283 	}
2284 
2285 	handle = zs_malloc(zram->mem_pool, comp_len,
2286 			   GFP_NOIO | __GFP_NOWARN |
2287 			   __GFP_HIGHMEM | __GFP_MOVABLE, page_to_nid(page));
2288 	if (IS_ERR_VALUE(handle)) {
2289 		zcomp_stream_put(zstrm);
2290 		return PTR_ERR((void *)handle);
2291 	}
2292 
2293 	if (!zram_can_store_page(zram)) {
2294 		zcomp_stream_put(zstrm);
2295 		zs_free(zram->mem_pool, handle);
2296 		return -ENOMEM;
2297 	}
2298 
2299 	zs_obj_write(zram->mem_pool, handle, zstrm->buffer, comp_len);
2300 	zcomp_stream_put(zstrm);
2301 
2302 	slot_lock(zram, index);
2303 	slot_free(zram, index);
2304 	set_slot_handle(zram, index, handle);
2305 	set_slot_size(zram, index, comp_len);
2306 	slot_unlock(zram, index);
2307 
2308 	/* Update stats */
2309 	atomic64_inc(&zram->stats.pages_stored);
2310 	atomic64_add(comp_len, &zram->stats.compr_data_size);
2311 
2312 	return ret;
2313 }
2314 
2315 /*
2316  * This is a partial IO. Read the full page before writing the changes.
2317  */
2318 static int zram_bvec_write_partial(struct zram *zram, struct bio_vec *bvec,
2319 				   u32 index, int offset, struct bio *bio)
2320 {
2321 	struct page *page = alloc_page(GFP_NOIO);
2322 	int ret;
2323 
2324 	if (!page)
2325 		return -ENOMEM;
2326 
2327 	ret = zram_read_page(zram, page, index, bio);
2328 	if (!ret) {
2329 		memcpy_from_bvec(page_address(page) + offset, bvec);
2330 		ret = zram_write_page(zram, page, index);
2331 	}
2332 	__free_page(page);
2333 	return ret;
2334 }
2335 
2336 static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
2337 			   u32 index, int offset, struct bio *bio)
2338 {
2339 	if (is_partial_io(bvec))
2340 		return zram_bvec_write_partial(zram, bvec, index, offset, bio);
2341 	return zram_write_page(zram, bvec->bv_page, index);
2342 }
2343 
2344 #ifdef CONFIG_ZRAM_MULTI_COMP
2345 #define RECOMPRESS_IDLE		(1 << 0)
2346 #define RECOMPRESS_HUGE		(1 << 1)
2347 
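/*
 * Walk all slots and queue for post-processing those that match the
 * requested recompression mode and are not written back, same-filled,
 * incompressible or already compressed with a high enough priority.
 */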
2348 static int scan_slots_for_recompress(struct zram *zram, u32 mode, u32 prio_max,
2349 				     struct zram_pp_ctl *ctl)
2350 {
2351 	unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
2352 	unsigned long index;
2353 
2354 	for (index = 0; index < nr_pages; index++) {
2355 		bool ok = true;
2356 
2357 		slot_lock(zram, index);
2358 		if (!slot_allocated(zram, index))
2359 			goto next;
2360 
2361 		if (mode & RECOMPRESS_IDLE &&
2362 		    !test_slot_flag(zram, index, ZRAM_IDLE))
2363 			goto next;
2364 
2365 		if (mode & RECOMPRESS_HUGE &&
2366 		    !test_slot_flag(zram, index, ZRAM_HUGE))
2367 			goto next;
2368 
2369 		if (test_slot_flag(zram, index, ZRAM_WB) ||
2370 		    test_slot_flag(zram, index, ZRAM_SAME) ||
2371 		    test_slot_flag(zram, index, ZRAM_INCOMPRESSIBLE))
2372 			goto next;
2373 
2374 		/* Already compressed with same or higher priority */
2375 		if (get_slot_comp_priority(zram, index) + 1 >= prio_max)
2376 			goto next;
2377 
2378 		ok = place_pp_slot(zram, ctl, index);
2379 next:
2380 		slot_unlock(zram, index);
2381 		if (!ok)
2382 			break;
2383 	}
2384 
2385 	return 0;
2386 }
2387 
2388 /*
2389  * This function will decompress (unless it's ZRAM_HUGE) the page and then
2390  * attempt to compress it using provided compression algorithm priority
2391  * (which is potentially more effective).
2392  *
2393  * Corresponding ZRAM slot should be locked.
2394  */
2395 static int recompress_slot(struct zram *zram, u32 index, struct page *page,
2396 			   u64 *num_recomp_pages, u32 threshold, u32 prio,
2397 			   u32 prio_max)
2398 {
2399 	struct zcomp_strm *zstrm = NULL;
2400 	unsigned long handle_old;
2401 	unsigned long handle_new;
2402 	unsigned int comp_len_old;
2403 	unsigned int comp_len_new;
2404 	unsigned int class_index_old;
2405 	unsigned int class_index_new;
2406 	void *src;
2407 	int ret = 0;
2408 
2409 	handle_old = get_slot_handle(zram, index);
2410 	if (!handle_old)
2411 		return -EINVAL;
2412 
2413 	comp_len_old = get_slot_size(zram, index);
2414 	/*
2415 	 * Do not recompress objects that are already "small enough".
2416 	 */
2417 	if (comp_len_old < threshold)
2418 		return 0;
2419 
2420 	ret = read_from_zspool(zram, page, index);
2421 	if (ret)
2422 		return ret;
2423 
2424 	/*
2425 	 * We touched this entry so mark it as non-IDLE. This makes sure that
2426 	 * we don't preserve the IDLE flag and don't incorrectly pick this entry
2427 	 * for a different post-processing type (e.g. writeback).
2428 	 */
2429 	clear_slot_flag(zram, index, ZRAM_IDLE);
2430 
2431 	class_index_old = zs_lookup_class_index(zram->mem_pool, comp_len_old);
2432 
2433 	prio = max(prio, get_slot_comp_priority(zram, index) + 1);
2434 	/*
2435 	 * The recompression slot scan should not select slots that are
2436 	 * already compressed with a higher priority algorithm, but
2437 	 * check again just in case.
2438 	 */
2439 	if (prio >= prio_max)
2440 		return 0;
2441 
2442 	/*
2443 	 * Iterate the secondary comp algorithms list (in order of priority)
2444 	 * and try to recompress the page.
2445 	 */
2446 	for (; prio < prio_max; prio++) {
2447 		if (!zram->comps[prio])
2448 			continue;
2449 
2450 		zstrm = zcomp_stream_get(zram->comps[prio]);
2451 		src = kmap_local_page(page);
2452 		ret = zcomp_compress(zram->comps[prio], zstrm,
2453 				     src, &comp_len_new);
2454 		kunmap_local(src);
2455 
2456 		if (ret) {
2457 			zcomp_stream_put(zstrm);
2458 			zstrm = NULL;
2459 			break;
2460 		}
2461 
2462 		class_index_new = zs_lookup_class_index(zram->mem_pool,
2463 							comp_len_new);
2464 
2465 		/* Continue until we make progress */
2466 		if (class_index_new >= class_index_old ||
2467 		    (threshold && comp_len_new >= threshold)) {
2468 			zcomp_stream_put(zstrm);
2469 			zstrm = NULL;
2470 			continue;
2471 		}
2472 
2473 		/* Recompression was successful so break out */
2474 		break;
2475 	}
2476 
2477 	/*
2478 	 * Decrement the limit (if set) on pages we can recompress, even
2479 	 * when the current recompression was unsuccessful or did not compress
2480 	 * the page below the threshold, because we still spent resources
2481 	 * on it.
2482 	 */
2483 	if (*num_recomp_pages)
2484 		*num_recomp_pages -= 1;
2485 
2486 	/* Compression error */
2487 	if (ret)
2488 		return ret;
2489 
2490 	if (!zstrm) {
2491 		/*
2492 		 * Secondary algorithms failed to re-compress the page
2493 		 * in a way that would save memory.
2494 		 *
2495 		 * Mark the object incompressible if the max-priority
2496 		 * algorithm couldn't re-compress it.
2497 		 */
2498 		if (prio < zram->num_active_comps)
2499 			return 0;
2500 		set_slot_flag(zram, index, ZRAM_INCOMPRESSIBLE);
2501 		return 0;
2502 	}
2503 
2504 	/*
2505 	 * We are holding the per-CPU stream mutex and the entry lock, so we'd
2506 	 * better avoid direct reclaim.  An allocation error is not fatal since
2507 	 * we still have the old object in the mem_pool.
2508 	 *
2509 	 * XXX: technically, the node we really want here is the node that
2510 	 * holds the original compressed data. But that would require us to
2511 	 * modify the zsmalloc API to return this information. For now, we will
2512 	 * make do with the node of the page allocated for recompression.
2513 	 */
2514 	handle_new = zs_malloc(zram->mem_pool, comp_len_new,
2515 			       GFP_NOIO | __GFP_NOWARN |
2516 			       __GFP_HIGHMEM | __GFP_MOVABLE,
2517 			       page_to_nid(page));
2518 	if (IS_ERR_VALUE(handle_new)) {
2519 		zcomp_stream_put(zstrm);
2520 		return PTR_ERR((void *)handle_new);
2521 	}
2522 
2523 	zs_obj_write(zram->mem_pool, handle_new, zstrm->buffer, comp_len_new);
2524 	zcomp_stream_put(zstrm);
2525 
2526 	slot_free(zram, index);
2527 	set_slot_handle(zram, index, handle_new);
2528 	set_slot_size(zram, index, comp_len_new);
2529 	set_slot_comp_priority(zram, index, prio);
2530 
2531 	atomic64_add(comp_len_new, &zram->stats.compr_data_size);
2532 	atomic64_inc(&zram->stats.pages_stored);
2533 
2534 	return 0;
2535 }
2536 
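/*
 * Parse the key=value arguments (type, max_pages, threshold, algo,
 * priority), scan the qualifying slots and recompress them with
 * higher-priority (secondary) algorithms.
 */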
2537 static ssize_t recompress_store(struct device *dev,
2538 				struct device_attribute *attr,
2539 				const char *buf, size_t len)
2540 {
2541 	struct zram *zram = dev_to_zram(dev);
2542 	char *args, *param, *val, *algo = NULL;
2543 	u64 num_recomp_pages = ULLONG_MAX;
2544 	struct zram_pp_ctl *ctl = NULL;
2545 	struct zram_pp_slot *pps;
2546 	u32 mode = 0, threshold = 0;
2547 	u32 prio, prio_max;
2548 	struct page *page = NULL;
2549 	ssize_t ret;
2550 
2551 	prio = ZRAM_SECONDARY_COMP;
2552 	prio_max = zram->num_active_comps;
2553 
2554 	args = skip_spaces(buf);
2555 	while (*args) {
2556 		args = next_arg(args, &param, &val);
2557 
2558 		if (!val || !*val)
2559 			return -EINVAL;
2560 
2561 		if (!strcmp(param, "type")) {
2562 			if (!strcmp(val, "idle"))
2563 				mode = RECOMPRESS_IDLE;
2564 			if (!strcmp(val, "huge"))
2565 				mode = RECOMPRESS_HUGE;
2566 			if (!strcmp(val, "huge_idle"))
2567 				mode = RECOMPRESS_IDLE | RECOMPRESS_HUGE;
2568 			continue;
2569 		}
2570 
2571 		if (!strcmp(param, "max_pages")) {
2572 			/*
2573 			 * Limit the number of entries (pages) we attempt to
2574 			 * recompress.
2575 			 */
2576 			ret = kstrtoull(val, 10, &num_recomp_pages);
2577 			if (ret)
2578 				return ret;
2579 			continue;
2580 		}
2581 
2582 		if (!strcmp(param, "threshold")) {
2583 			/*
2584 			 * We will re-compress only idle objects equal to or
2585 			 * greater in size than the watermark.
2586 			 */
2587 			ret = kstrtouint(val, 10, &threshold);
2588 			if (ret)
2589 				return ret;
2590 			continue;
2591 		}
2592 
2593 		if (!strcmp(param, "algo")) {
2594 			algo = val;
2595 			continue;
2596 		}
2597 
2598 		if (!strcmp(param, "priority")) {
2599 			ret = kstrtouint(val, 10, &prio);
2600 			if (ret)
2601 				return ret;
2602 
2603 			if (prio == ZRAM_PRIMARY_COMP)
2604 				prio = ZRAM_SECONDARY_COMP;
2605 
2606 			prio_max = prio + 1;
2607 			continue;
2608 		}
2609 	}
2610 
2611 	if (threshold >= huge_class_size)
2612 		return -EINVAL;
2613 
2614 	guard(rwsem_write)(&zram->dev_lock);
2615 	if (!init_done(zram))
2616 		return -EINVAL;
2617 
2618 	if (algo) {
2619 		bool found = false;
2620 
2621 		for (; prio < ZRAM_MAX_COMPS; prio++) {
2622 			if (!zram->comp_algs[prio])
2623 				continue;
2624 
2625 			if (!strcmp(zram->comp_algs[prio], algo)) {
2626 				prio_max = prio + 1;
2627 				found = true;
2628 				break;
2629 			}
2630 		}
2631 
2632 		if (!found) {
2633 			ret = -EINVAL;
2634 			goto out;
2635 		}
2636 	}
2637 
2638 	prio_max = min(prio_max, (u32)zram->num_active_comps);
2639 	if (prio >= prio_max) {
2640 		ret = -EINVAL;
2641 		goto out;
2642 	}
2643 
2644 	page = alloc_page(GFP_KERNEL);
2645 	if (!page) {
2646 		ret = -ENOMEM;
2647 		goto out;
2648 	}
2649 
2650 	ctl = init_pp_ctl();
2651 	if (!ctl) {
2652 		ret = -ENOMEM;
2653 		goto out;
2654 	}
2655 
2656 	scan_slots_for_recompress(zram, mode, prio_max, ctl);
2657 
2658 	ret = len;
2659 	while ((pps = select_pp_slot(ctl))) {
2660 		int err = 0;
2661 
2662 		if (!num_recomp_pages)
2663 			break;
2664 
2665 		slot_lock(zram, pps->index);
2666 		if (!test_slot_flag(zram, pps->index, ZRAM_PP_SLOT))
2667 			goto next;
2668 
2669 		err = recompress_slot(zram, pps->index, page,
2670 				      &num_recomp_pages, threshold,
2671 				      prio, prio_max);
2672 next:
2673 		slot_unlock(zram, pps->index);
2674 		release_pp_slot(zram, pps);
2675 
2676 		if (err) {
2677 			ret = err;
2678 			break;
2679 		}
2680 
2681 		cond_resched();
2682 	}
2683 
2684 out:
2685 	if (page)
2686 		__free_page(page);
2687 	release_pp_ctl(zram, ctl);
2688 	return ret;
2689 }
2690 #endif
2691 
2692 static void zram_bio_discard(struct zram *zram, struct bio *bio)
2693 {
2694 	size_t n = bio->bi_iter.bi_size;
2695 	u32 index = bio->bi_iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
2696 	u32 offset = (bio->bi_iter.bi_sector & (SECTORS_PER_PAGE - 1)) <<
2697 			SECTOR_SHIFT;
2698 
2699 	/*
2700 	 * zram manages data in physical block size units. Because the logical
2701 	 * block size isn't identical to the physical block size on some
2702 	 * architectures, we could get a discard request pointing to a specific
2703 	 * offset within a certain physical block.  Although we could handle
2704 	 * this request by reading that physical block, decompressing it,
2705 	 * partially zeroing it, re-compressing it and then re-storing it, this
2706 	 * isn't reasonable because our intent with a discard request is to
2707 	 * save memory.  So skipping this logical block is appropriate here.
2708 	 */
2709 	if (offset) {
2710 		if (n <= (PAGE_SIZE - offset))
2711 			return;
2712 
2713 		n -= (PAGE_SIZE - offset);
2714 		index++;
2715 	}
2716 
2717 	while (n >= PAGE_SIZE) {
2718 		slot_lock(zram, index);
2719 		slot_free(zram, index);
2720 		slot_unlock(zram, index);
2721 		atomic64_inc(&zram->stats.notify_free);
2722 		index++;
2723 		n -= PAGE_SIZE;
2724 	}
2725 
2726 	bio_endio(bio);
2727 }
2728 
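/* Serve a read bio page by page, marking each touched slot as accessed. */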
2729 static void zram_bio_read(struct zram *zram, struct bio *bio)
2730 {
2731 	unsigned long start_time = bio_start_io_acct(bio);
2732 	struct bvec_iter iter = bio->bi_iter;
2733 
2734 	do {
2735 		u32 index = iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
2736 		u32 offset = (iter.bi_sector & (SECTORS_PER_PAGE - 1)) <<
2737 				SECTOR_SHIFT;
2738 		struct bio_vec bv = bio_iter_iovec(bio, iter);
2739 
2740 		bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset);
2741 
2742 		if (zram_bvec_read(zram, &bv, index, offset, bio) < 0) {
2743 			atomic64_inc(&zram->stats.failed_reads);
2744 			bio->bi_status = BLK_STS_IOERR;
2745 			break;
2746 		}
2747 		flush_dcache_page(bv.bv_page);
2748 
2749 		slot_lock(zram, index);
2750 		mark_slot_accessed(zram, index);
2751 		slot_unlock(zram, index);
2752 
2753 		bio_advance_iter_single(bio, &iter, bv.bv_len);
2754 	} while (iter.bi_size);
2755 
2756 	bio_end_io_acct(bio, start_time);
2757 	bio_endio(bio);
2758 }
2759 
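/* Serve a write bio page by page, marking each touched slot as accessed. */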
2760 static void zram_bio_write(struct zram *zram, struct bio *bio)
2761 {
2762 	unsigned long start_time = bio_start_io_acct(bio);
2763 	struct bvec_iter iter = bio->bi_iter;
2764 
2765 	do {
2766 		u32 index = iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
2767 		u32 offset = (iter.bi_sector & (SECTORS_PER_PAGE - 1)) <<
2768 				SECTOR_SHIFT;
2769 		struct bio_vec bv = bio_iter_iovec(bio, iter);
2770 
2771 		bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset);
2772 
2773 		if (zram_bvec_write(zram, &bv, index, offset, bio) < 0) {
2774 			atomic64_inc(&zram->stats.failed_writes);
2775 			bio->bi_status = BLK_STS_IOERR;
2776 			break;
2777 		}
2778 
2779 		slot_lock(zram, index);
2780 		mark_slot_accessed(zram, index);
2781 		slot_unlock(zram, index);
2782 
2783 		bio_advance_iter_single(bio, &iter, bv.bv_len);
2784 	} while (iter.bi_size);
2785 
2786 	bio_end_io_acct(bio, start_time);
2787 	bio_endio(bio);
2788 }
2789 
2790 /*
2791  * Handler function for all zram I/O requests.
2792  */
2793 static void zram_submit_bio(struct bio *bio)
2794 {
2795 	struct zram *zram = bio->bi_bdev->bd_disk->private_data;
2796 
2797 	switch (bio_op(bio)) {
2798 	case REQ_OP_READ:
2799 		zram_bio_read(zram, bio);
2800 		break;
2801 	case REQ_OP_WRITE:
2802 		zram_bio_write(zram, bio);
2803 		break;
2804 	case REQ_OP_DISCARD:
2805 	case REQ_OP_WRITE_ZEROES:
2806 		zram_bio_discard(zram, bio);
2807 		break;
2808 	default:
2809 		WARN_ON_ONCE(1);
2810 		bio_endio(bio);
2811 	}
2812 }
2813 
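/*
 * Called when the swap layer frees a slot. Use the TRY lock variant
 * since we may be in atomic context; on contention just account a
 * missed free.
 */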
2814 static void zram_slot_free_notify(struct block_device *bdev,
2815 				unsigned long index)
2816 {
2817 	struct zram *zram;
2818 
2819 	zram = bdev->bd_disk->private_data;
2820 
2821 	atomic64_inc(&zram->stats.notify_free);
2822 	if (!slot_trylock(zram, index)) {
2823 		atomic64_inc(&zram->stats.miss_free);
2824 		return;
2825 	}
2826 
2827 	slot_free(zram, index);
2828 	slot_unlock(zram, index);
2829 }
2830 
2831 static void zram_comp_params_reset(struct zram *zram)
2832 {
2833 	u32 prio;
2834 
2835 	for (prio = ZRAM_PRIMARY_COMP; prio < ZRAM_MAX_COMPS; prio++)
2836 		comp_params_reset(zram, prio);
2838 }
2839 
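/*
 * Tear down all active compression backends and free dynamically
 * allocated algorithm names.
 */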
2840 static void zram_destroy_comps(struct zram *zram)
2841 {
2842 	u32 prio;
2843 
2844 	for (prio = ZRAM_PRIMARY_COMP; prio < ZRAM_MAX_COMPS; prio++) {
2845 		struct zcomp *comp = zram->comps[prio];
2846 
2847 		zram->comps[prio] = NULL;
2848 		if (!comp)
2849 			continue;
2850 		zcomp_destroy(comp);
2851 		zram->num_active_comps--;
2852 	}
2853 
2854 	for (prio = ZRAM_PRIMARY_COMP; prio < ZRAM_MAX_COMPS; prio++) {
2855 		/* Do not free statically defined compression algorithms */
2856 		if (zram->comp_algs[prio] != default_compressor)
2857 			kfree(zram->comp_algs[prio]);
2858 		zram->comp_algs[prio] = NULL;
2859 	}
2860 
2861 	zram_comp_params_reset(zram);
2862 }
2863 
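/*
 * Return the device to its uninitialized state: free all metadata,
 * destroy the compression backends, reset the stats and detach the
 * backing device.
 */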
2864 static void zram_reset_device(struct zram *zram)
2865 {
2866 	guard(rwsem_write)(&zram->dev_lock);
2867 
2868 	zram->limit_pages = 0;
2869 
2870 	set_capacity_and_notify(zram->disk, 0);
2871 	part_stat_set_all(zram->disk->part0, 0);
2872 
2873 	/* All in-flight I/O operations are done, so it's safe to free */
2874 	zram_meta_free(zram, zram->disksize);
2875 	zram->disksize = 0;
2876 	zram_destroy_comps(zram);
2877 	memset(&zram->stats, 0, sizeof(zram->stats));
2878 	reset_bdev(zram);
2879 
2880 	comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor);
2881 }
2882 
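/*
 * Setting the disksize initializes the device: allocate the metadata
 * table and create a backend for each configured compression algorithm.
 */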
2883 static ssize_t disksize_store(struct device *dev, struct device_attribute *attr,
2884 			      const char *buf, size_t len)
2885 {
2886 	u64 disksize;
2887 	struct zcomp *comp;
2888 	struct zram *zram = dev_to_zram(dev);
2889 	int err;
2890 	u32 prio;
2891 
2892 	disksize = memparse(buf, NULL);
2893 	if (!disksize)
2894 		return -EINVAL;
2895 
2896 	guard(rwsem_write)(&zram->dev_lock);
2897 	if (init_done(zram)) {
2898 		pr_info("Cannot change disksize for initialized device\n");
2899 		return -EBUSY;
2900 	}
2901 
2902 	disksize = PAGE_ALIGN(disksize);
2903 	if (!zram_meta_alloc(zram, disksize))
2904 		return -ENOMEM;
2905 
2906 	for (prio = ZRAM_PRIMARY_COMP; prio < ZRAM_MAX_COMPS; prio++) {
2907 		if (!zram->comp_algs[prio])
2908 			continue;
2909 
2910 		comp = zcomp_create(zram->comp_algs[prio],
2911 				    &zram->params[prio]);
2912 		if (IS_ERR(comp)) {
2913 			pr_err("Cannot initialise %s compressing backend\n",
2914 			       zram->comp_algs[prio]);
2915 			err = PTR_ERR(comp);
2916 			goto out_free_comps;
2917 		}
2918 
2919 		zram->comps[prio] = comp;
2920 		zram->num_active_comps++;
2921 	}
2922 	zram->disksize = disksize;
2923 	set_capacity_and_notify(zram->disk, zram->disksize >> SECTOR_SHIFT);
2924 
2925 	return len;
2926 
2927 out_free_comps:
2928 	zram_destroy_comps(zram);
2929 	zram_meta_free(zram, disksize);
2930 	return err;
2931 }
2932 
2933 static ssize_t reset_store(struct device *dev,
2934 		struct device_attribute *attr, const char *buf, size_t len)
2935 {
2936 	int ret;
2937 	unsigned short do_reset;
2938 	struct zram *zram;
2939 	struct gendisk *disk;
2940 
2941 	ret = kstrtou16(buf, 10, &do_reset);
2942 	if (ret)
2943 		return ret;
2944 
2945 	if (!do_reset)
2946 		return -EINVAL;
2947 
2948 	zram = dev_to_zram(dev);
2949 	disk = zram->disk;
2950 
2951 	mutex_lock(&disk->open_mutex);
2952 	/* Do not reset an active device or claimed device */
2953 	if (disk_openers(disk) || zram->claim) {
2954 		mutex_unlock(&disk->open_mutex);
2955 		return -EBUSY;
2956 	}
2957 
2958 	/* From now on, no one can open /dev/zram[0-9] */
2959 	zram->claim = true;
2960 	mutex_unlock(&disk->open_mutex);
2961 
2962 	/* Make sure all pending I/O is finished */
2963 	sync_blockdev(disk->part0);
2964 	zram_reset_device(zram);
2965 
2966 	mutex_lock(&disk->open_mutex);
2967 	zram->claim = false;
2968 	mutex_unlock(&disk->open_mutex);
2969 
2970 	return len;
2971 }
2972 
2973 static int zram_open(struct gendisk *disk, blk_mode_t mode)
2974 {
2975 	struct zram *zram = disk->private_data;
2976 
2977 	WARN_ON(!mutex_is_locked(&disk->open_mutex));
2978 
2979 	/* zram was claimed to reset so open request fails */
2980 	if (zram->claim)
2981 		return -EBUSY;
2982 	return 0;
2983 }
2984 
2985 static const struct block_device_operations zram_devops = {
2986 	.open = zram_open,
2987 	.submit_bio = zram_submit_bio,
2988 	.swap_slot_free_notify = zram_slot_free_notify,
2989 	.owner = THIS_MODULE
2990 };
2991 
2992 static DEVICE_ATTR_RO(io_stat);
2993 static DEVICE_ATTR_RO(mm_stat);
2994 static DEVICE_ATTR_RO(debug_stat);
2995 static DEVICE_ATTR_WO(compact);
2996 static DEVICE_ATTR_RW(disksize);
2997 static DEVICE_ATTR_RO(initstate);
2998 static DEVICE_ATTR_WO(reset);
2999 static DEVICE_ATTR_WO(mem_limit);
3000 static DEVICE_ATTR_WO(mem_used_max);
3001 static DEVICE_ATTR_WO(idle);
3002 static DEVICE_ATTR_RW(comp_algorithm);
3003 #ifdef CONFIG_ZRAM_WRITEBACK
3004 static DEVICE_ATTR_RO(bd_stat);
3005 static DEVICE_ATTR_RW(backing_dev);
3006 static DEVICE_ATTR_WO(writeback);
3007 static DEVICE_ATTR_RW(writeback_limit);
3008 static DEVICE_ATTR_RW(writeback_limit_enable);
3009 static DEVICE_ATTR_RW(writeback_batch_size);
3010 static DEVICE_ATTR_RW(writeback_compressed);
3011 #endif
3012 #ifdef CONFIG_ZRAM_MULTI_COMP
3013 static DEVICE_ATTR_RW(recomp_algorithm);
3014 static DEVICE_ATTR_WO(recompress);
3015 #endif
3016 static DEVICE_ATTR_WO(algorithm_params);
3017 
3018 static struct attribute *zram_disk_attrs[] = {
3019 	&dev_attr_disksize.attr,
3020 	&dev_attr_initstate.attr,
3021 	&dev_attr_reset.attr,
3022 	&dev_attr_compact.attr,
3023 	&dev_attr_mem_limit.attr,
3024 	&dev_attr_mem_used_max.attr,
3025 	&dev_attr_idle.attr,
3026 	&dev_attr_comp_algorithm.attr,
3027 #ifdef CONFIG_ZRAM_WRITEBACK
3028 	&dev_attr_bd_stat.attr,
3029 	&dev_attr_backing_dev.attr,
3030 	&dev_attr_writeback.attr,
3031 	&dev_attr_writeback_limit.attr,
3032 	&dev_attr_writeback_limit_enable.attr,
3033 	&dev_attr_writeback_batch_size.attr,
3034 	&dev_attr_writeback_compressed.attr,
3035 #endif
3036 	&dev_attr_io_stat.attr,
3037 	&dev_attr_mm_stat.attr,
3038 	&dev_attr_debug_stat.attr,
3039 #ifdef CONFIG_ZRAM_MULTI_COMP
3040 	&dev_attr_recomp_algorithm.attr,
3041 	&dev_attr_recompress.attr,
3042 #endif
3043 	&dev_attr_algorithm_params.attr,
3044 	NULL,
3045 };
3046 
3047 ATTRIBUTE_GROUPS(zram_disk);
3048 
3049 /*
3050  * Allocate and initialize a new zram device. The function returns
3051  * a '>= 0' device_id upon success, and a negative value otherwise.
3052  */
3053 static int zram_add(void)
3054 {
3055 	struct queue_limits lim = {
3056 		.logical_block_size		= ZRAM_LOGICAL_BLOCK_SIZE,
3057 		/*
3058 		 * To ensure that we always get PAGE_SIZE-aligned and
3059 		 * n*PAGE_SIZE-sized I/O requests.
3060 		 */
3061 		.physical_block_size		= PAGE_SIZE,
3062 		.io_min				= PAGE_SIZE,
3063 		.io_opt				= PAGE_SIZE,
3064 		.max_hw_discard_sectors		= UINT_MAX,
3065 		/*
3066 		 * zram_bio_discard() will clear all logical blocks if the logical
3067 		 * block size is identical to the physical block size (PAGE_SIZE).
3068 		 * But if it is different, we skip discarding the parts of logical
3069 		 * blocks within the request range that aren't aligned to the
3070 		 * physical block size.  So we can't ensure that all discarded
3071 		 * logical blocks are zeroed.
3072 		 */
3073 #if ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE
3074 		.max_write_zeroes_sectors	= UINT_MAX,
3075 #endif
3076 		.features			= BLK_FEAT_STABLE_WRITES |
3077 						  BLK_FEAT_SYNCHRONOUS,
3078 	};
3079 	struct zram *zram;
3080 	int ret, device_id;
3081 
3082 	zram = kzalloc(sizeof(struct zram), GFP_KERNEL);
3083 	if (!zram)
3084 		return -ENOMEM;
3085 
3086 	ret = idr_alloc(&zram_index_idr, zram, 0, 0, GFP_KERNEL);
3087 	if (ret < 0)
3088 		goto out_free_dev;
3089 	device_id = ret;
3090 
3091 	init_rwsem(&zram->dev_lock);
3092 #ifdef CONFIG_ZRAM_WRITEBACK
3093 	zram->wb_batch_size = 32;
3094 	zram->wb_compressed = false;
3095 #endif
3096 
3097 	/* gendisk structure */
3098 	zram->disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
3099 	if (IS_ERR(zram->disk)) {
3100 		pr_err("Error allocating disk structure for device %d\n",
3101 			device_id);
3102 		ret = PTR_ERR(zram->disk);
3103 		goto out_free_idr;
3104 	}
3105 
3106 	zram->disk->major = zram_major;
3107 	zram->disk->first_minor = device_id;
3108 	zram->disk->minors = 1;
3109 	zram->disk->flags |= GENHD_FL_NO_PART;
3110 	zram->disk->fops = &zram_devops;
3111 	zram->disk->private_data = zram;
3112 	snprintf(zram->disk->disk_name, 16, "zram%d", device_id);
3113 	zram_comp_params_reset(zram);
3114 	comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor);
3115 
3116 	/* Actual capacity set using sysfs (/sys/block/zram<id>/disksize) */
3117 	set_capacity(zram->disk, 0);
3118 	ret = device_add_disk(NULL, zram->disk, zram_disk_groups);
3119 	if (ret)
3120 		goto out_cleanup_disk;
3121 
3122 	zram_debugfs_register(zram);
3123 	pr_info("Added device: %s\n", zram->disk->disk_name);
3124 	return device_id;
3125 
3126 out_cleanup_disk:
3127 	put_disk(zram->disk);
3128 out_free_idr:
3129 	idr_remove(&zram_index_idr, device_id);
3130 out_free_dev:
3131 	kfree(zram);
3132 	return ret;
3133 }
3134 
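/*
 * Remove a device: fails with -EBUSY if someone still has it open,
 * otherwise claim it, reset it and delete the gendisk.
 */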
3135 static int zram_remove(struct zram *zram)
3136 {
3137 	bool claimed;
3138 
3139 	mutex_lock(&zram->disk->open_mutex);
3140 	if (disk_openers(zram->disk)) {
3141 		mutex_unlock(&zram->disk->open_mutex);
3142 		return -EBUSY;
3143 	}
3144 
3145 	claimed = zram->claim;
3146 	if (!claimed)
3147 		zram->claim = true;
3148 	mutex_unlock(&zram->disk->open_mutex);
3149 
3150 	zram_debugfs_unregister(zram);
3151 
3152 	if (claimed) {
3153 		/*
3154 		 * If we were claimed by reset_store(), del_gendisk() will
3155 		 * wait until reset_store() is done, so there is nothing we need to do.
3156 		 */
3157 		;
3158 	} else {
3159 		/* Make sure all pending I/O is finished */
3160 		sync_blockdev(zram->disk->part0);
3161 		zram_reset_device(zram);
3162 	}
3163 
3164 	pr_info("Removed device: %s\n", zram->disk->disk_name);
3165 
3166 	del_gendisk(zram->disk);
3167 
3168 	/* del_gendisk drains pending reset_store */
3169 	WARN_ON_ONCE(claimed && zram->claim);
3170 
3171 	/*
3172 	 * disksize_store() may be called in between zram_reset_device()
3173 	 * and del_gendisk(), so run the last reset to avoid leaking
3174 	 * anything allocated with disksize_store()
3175 	 */
3176 	zram_reset_device(zram);
3177 
3178 	put_disk(zram->disk);
3179 	kfree(zram);
3180 	return 0;
3181 }
3182 
3183 /* zram-control sysfs attributes */
3184 
3185 /*
3186  * NOTE: the hot_add attribute is not the usual read-only sysfs attribute, in
3187  * the sense that reading from this file does alter the state of your system --
3188  * it creates a new un-initialized zram device and returns this device's
3189  * device_id (or an error code if it fails to create a new device).
3190  */
3191 static ssize_t hot_add_show(const struct class *class,
3192 			const struct class_attribute *attr,
3193 			char *buf)
3194 {
3195 	int ret;
3196 
3197 	mutex_lock(&zram_index_mutex);
3198 	ret = zram_add();
3199 	mutex_unlock(&zram_index_mutex);
3200 
3201 	if (ret < 0)
3202 		return ret;
3203 	return sysfs_emit(buf, "%d\n", ret);
3204 }
3205 /* This attribute must be set to 0400, so CLASS_ATTR_RO() cannot be used */
3206 static struct class_attribute class_attr_hot_add =
3207 	__ATTR(hot_add, 0400, hot_add_show, NULL);
3208 
3209 static ssize_t hot_remove_store(const struct class *class,
3210 			const struct class_attribute *attr,
3211 			const char *buf,
3212 			size_t count)
3213 {
3214 	struct zram *zram;
3215 	int ret, dev_id;
3216 
3217 	/* dev_id is gendisk->first_minor, which is `int' */
3218 	ret = kstrtoint(buf, 10, &dev_id);
3219 	if (ret)
3220 		return ret;
3221 	if (dev_id < 0)
3222 		return -EINVAL;
3223 
3224 	mutex_lock(&zram_index_mutex);
3225 
3226 	zram = idr_find(&zram_index_idr, dev_id);
3227 	if (zram) {
3228 		ret = zram_remove(zram);
3229 		if (!ret)
3230 			idr_remove(&zram_index_idr, dev_id);
3231 	} else {
3232 		ret = -ENODEV;
3233 	}
3234 
3235 	mutex_unlock(&zram_index_mutex);
3236 	return ret ? ret : count;
3237 }
3238 static CLASS_ATTR_WO(hot_remove);
3239 
3240 static struct attribute *zram_control_class_attrs[] = {
3241 	&class_attr_hot_add.attr,
3242 	&class_attr_hot_remove.attr,
3243 	NULL,
3244 };
3245 ATTRIBUTE_GROUPS(zram_control_class);
3246 
3247 static struct class zram_control_class = {
3248 	.name		= "zram-control",
3249 	.class_groups	= zram_control_class_groups,
3250 };
3251 
3252 static int zram_remove_cb(int id, void *ptr, void *data)
3253 {
3254 	WARN_ON_ONCE(zram_remove(ptr));
3255 	return 0;
3256 }
3257 
3258 static void destroy_devices(void)
3259 {
3260 	class_unregister(&zram_control_class);
3261 	idr_for_each(&zram_index_idr, &zram_remove_cb, NULL);
3262 	zram_debugfs_destroy();
3263 	idr_destroy(&zram_index_idr);
3264 	unregister_blkdev(zram_major, "zram");
3265 	cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
3266 }
3267 
3268 static int __init zram_init(void)
3269 {
3270 	struct zram_table_entry zram_te;
3271 	int ret;
3272 
3273 	BUILD_BUG_ON(__NR_ZRAM_PAGEFLAGS > sizeof(zram_te.attr.flags) * 8);
3274 
3275 	ret = cpuhp_setup_state_multi(CPUHP_ZCOMP_PREPARE, "block/zram:prepare",
3276 				      zcomp_cpu_up_prepare, zcomp_cpu_dead);
3277 	if (ret < 0)
3278 		return ret;
3279 
3280 	ret = class_register(&zram_control_class);
3281 	if (ret) {
3282 		pr_err("Unable to register zram-control class\n");
3283 		cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
3284 		return ret;
3285 	}
3286 
3287 	zram_debugfs_create();
3288 	zram_major = register_blkdev(0, "zram");
3289 	if (zram_major <= 0) {
3290 		pr_err("Unable to get major number\n");
3291 		class_unregister(&zram_control_class);
3292 		cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
3293 		return -EBUSY;
3294 	}
3295 
3296 	while (num_devices != 0) {
3297 		mutex_lock(&zram_index_mutex);
3298 		ret = zram_add();
3299 		mutex_unlock(&zram_index_mutex);
3300 		if (ret < 0)
3301 			goto out_error;
3302 		num_devices--;
3303 	}
3304 
3305 	return 0;
3306 
3307 out_error:
3308 	destroy_devices();
3309 	return ret;
3310 }
3311 
3312 static void __exit zram_exit(void)
3313 {
3314 	destroy_devices();
3315 }
3316 
3317 module_init(zram_init);
3318 module_exit(zram_exit);
3319 
3320 module_param(num_devices, uint, 0);
3321 MODULE_PARM_DESC(num_devices, "Number of pre-created zram devices");
3322 
3323 MODULE_LICENSE("Dual BSD/GPL");
3324 MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
3325 MODULE_DESCRIPTION("Compressed RAM Block Device");
3326