xref: /linux/drivers/md/dm-writecache.c (revision 33eded29319d41fcba5d0257b126a48b449aad47)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2018 Red Hat. All rights reserved.
4  *
5  * This file is released under the GPL.
6  */
7 
8 #include <linux/device-mapper.h>
9 #include <linux/module.h>
10 #include <linux/init.h>
11 #include <linux/vmalloc.h>
12 #include <linux/kthread.h>
13 #include <linux/dm-io.h>
14 #include <linux/dm-kcopyd.h>
15 #include <linux/dax.h>
16 #include <linux/libnvdimm.h>
17 #include <linux/delay.h>
18 #include "dm-io-tracker.h"
19 
20 #define DM_MSG_PREFIX "writecache"
21 
22 #define HIGH_WATERMARK			50
23 #define LOW_WATERMARK			45
24 #define MAX_WRITEBACK_JOBS		min(0x10000000 / PAGE_SIZE, totalram_pages() / 16)
25 #define ENDIO_LATENCY			16
26 #define WRITEBACK_LATENCY		64
27 #define AUTOCOMMIT_BLOCKS_SSD		65536
28 #define AUTOCOMMIT_BLOCKS_PMEM		64
29 #define AUTOCOMMIT_MSEC			1000
30 #define MAX_AGE_DIV			16
31 #define MAX_AGE_UNSPECIFIED		-1UL
32 #define PAUSE_WRITEBACK			(HZ * 3)
33 
34 #define BITMAP_GRANULARITY	65536
35 #if BITMAP_GRANULARITY < PAGE_SIZE
36 #undef BITMAP_GRANULARITY
37 #define BITMAP_GRANULARITY	PAGE_SIZE
38 #endif
39 
40 #if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_FS_DAX)
41 #define DM_WRITECACHE_HAS_PMEM
42 #endif
43 
44 #ifdef DM_WRITECACHE_HAS_PMEM
45 #define pmem_assign(dest, src)					\
46 do {								\
47 	typeof(dest) uniq = (src);				\
48 	memcpy_flushcache(&(dest), &uniq, sizeof(dest));	\
49 } while (0)
50 #else
51 #define pmem_assign(dest, src)	((dest) = (src))
52 #endif
53 
54 #if IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC) && defined(DM_WRITECACHE_HAS_PMEM)
55 #define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
56 #endif
57 
58 #define MEMORY_SUPERBLOCK_MAGIC		0x23489321
59 #define MEMORY_SUPERBLOCK_VERSION	1
60 
/*
 * On-media per-block metadata record: the original device sector cached in
 * this block and the sequence number of the commit that made it valid.
 * Stored little-endian in persistent memory (or the SSD metadata area).
 */
struct wc_memory_entry {
	__le64 original_sector;
	__le64 seq_count;
};
65 
/*
 * On-media superblock, padded to 64 bytes via the union, followed
 * immediately by the array of per-block wc_memory_entry records.
 */
struct wc_memory_superblock {
	union {
		struct {
			__le32 magic;		/* MEMORY_SUPERBLOCK_MAGIC */
			__le32 version;		/* MEMORY_SUPERBLOCK_VERSION */
			__le32 block_size;	/* cache block size in bytes */
			__le32 pad;
			__le64 n_blocks;	/* number of cache blocks */
			__le64 seq_count;	/* last committed sequence number */
		};
		__le64 padding[8];
	};
	struct wc_memory_entry entries[];
};
80 
/*
 * In-core descriptor of one cache block.  On 64-bit kernels the block
 * index is packed into a 47-bit bitfield next to the write_in_progress
 * flag to keep the structure compact.
 */
struct wc_entry {
	struct rb_node rb_node;		/* in wc->tree (or wc->freetree) */
	struct list_head lru;		/* in wc->lru or wc->freelist */
	unsigned short wc_list_contiguous;
#if BITS_PER_LONG == 64
	bool write_in_progress : 1;
	unsigned long index : 47;	/* index into wc->entries / data area */
#else
	bool write_in_progress;
	unsigned long index;
#endif
	unsigned long age;		/* jiffies when inserted into the lru */
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	/* shadow copies so lookups never have to read possibly-failing pmem */
	uint64_t original_sector;
	uint64_t seq_count;
#endif
};
98 
99 #ifdef DM_WRITECACHE_HAS_PMEM
100 #define WC_MODE_PMEM(wc)			((wc)->pmem_mode)
101 #define WC_MODE_FUA(wc)				((wc)->writeback_fua)
102 #else
103 #define WC_MODE_PMEM(wc)			false
104 #define WC_MODE_FUA(wc)				false
105 #endif
106 #define WC_MODE_SORT_FREELIST(wc)		(!WC_MODE_PMEM(wc))
107 
/*
 * Per-target state.  wc->lock serializes access to the rb-tree, the LRU
 * list, the freelist and the counters below; the endio list is protected
 * by its own raw spinlock.
 */
struct dm_writecache {
	struct mutex lock;
	struct list_head lru;
	union {
		/* pmem mode: simple FIFO freelist */
		struct list_head freelist;
		/* SSD mode: freelist kept sorted, see WC_MODE_SORT_FREELIST() */
		struct {
			struct rb_root freetree;
			struct wc_entry *current_free;
		};
	};
	struct rb_root tree;	/* cached entries, sorted by original sector */

	size_t freelist_size;
	size_t writeback_size;
	size_t freelist_high_watermark;	/* writeback queued at or below this */
	size_t freelist_low_watermark;
	unsigned long max_age;
	unsigned long pause;

	unsigned int uncommitted_blocks;
	unsigned int autocommit_blocks;
	unsigned int max_writeback_jobs;

	int error;	/* first error only, set via writecache_error() */

	unsigned long autocommit_jiffies;
	struct timer_list autocommit_timer;
	struct wait_queue_head freelist_wait;

	struct timer_list max_age_timer;

	atomic_t bio_in_progress[2];		/* indexed by READ/WRITE */
	struct wait_queue_head bio_in_progress_wait[2];

	struct dm_target *ti;
	struct dm_dev *dev;		/* origin (data) device */
	struct dm_dev *ssd_dev;		/* cache device */
	sector_t start_sector;
	void *memory_map;	/* DAX mapping (pmem) or metadata buffer (SSD) */
	uint64_t memory_map_size;
	size_t metadata_sectors;
	size_t n_blocks;
	uint64_t seq_count;
	sector_t data_device_sectors;
	void *block_start;	/* start of cache data blocks in memory_map */
	struct wc_entry *entries;
	unsigned int block_size;
	unsigned char block_size_bits;

	bool pmem_mode:1;
	bool writeback_fua:1;

	bool overwrote_committed:1;
	bool memory_vmapped:1;

	/*
	 * NOTE(review): the *_set flags appear to record which optional table
	 * arguments were supplied explicitly — confirm against the status path.
	 */
	bool start_sector_set:1;
	bool high_wm_percent_set:1;
	bool low_wm_percent_set:1;
	bool max_writeback_jobs_set:1;
	bool autocommit_blocks_set:1;
	bool autocommit_time_set:1;
	bool max_age_set:1;
	bool writeback_fua_set:1;
	bool flush_on_suspend:1;
	bool cleaner:1;
	bool cleaner_set:1;
	bool metadata_only:1;
	bool pause_set:1;

	unsigned int high_wm_percent_value;
	unsigned int low_wm_percent_value;
	unsigned int autocommit_time_value;
	unsigned int max_age_value;
	unsigned int pause_value;

	unsigned int writeback_all;
	struct workqueue_struct *writeback_wq;
	struct work_struct writeback_work;
	struct work_struct flush_work;

	struct dm_io_tracker iot;

	struct dm_io_client *dm_io;

	raw_spinlock_t endio_list_lock;
	struct list_head endio_list;
	struct task_struct *endio_thread;

	struct task_struct *flush_thread;
	struct bio_list flush_list;

	struct dm_kcopyd_client *dm_kcopyd;
	unsigned long *dirty_bitmap;	/* SSD mode: dirty metadata regions */
	unsigned int dirty_bitmap_size;

	struct bio_set bio_set;
	mempool_t copy_pool;

	struct {
		unsigned long long reads;
		unsigned long long read_hits;
		unsigned long long writes;
		unsigned long long write_hits_uncommitted;
		unsigned long long write_hits_committed;
		unsigned long long writes_around;
		unsigned long long writes_allocate;
		unsigned long long writes_blocked_on_freelist;
		unsigned long long flushes;
		unsigned long long discards;
	} stats;
};
219 
220 #define WB_LIST_INLINE		16
221 
/*
 * Context for one writeback bio: the wc_entries covered by @bio, kept
 * inline for up to WB_LIST_INLINE entries to avoid an allocation.
 */
struct writeback_struct {
	struct list_head endio_entry;	/* linked on wc->endio_list */
	struct dm_writecache *wc;
	struct wc_entry **wc_list;	/* points at wc_list_inline or a heap array */
	unsigned int wc_list_n;
	struct wc_entry *wc_list_inline[WB_LIST_INLINE];
	struct bio bio;			/* must be last: bio is variable-sized */
};
230 
/*
 * Context for one kcopyd-based writeback copy (allocated from
 * wc->copy_pool), completed on wc->endio_list.
 */
struct copy_struct {
	struct list_head endio_entry;
	struct dm_writecache *wc;
	struct wc_entry *e;		/* first entry of the copied run */
	unsigned int n_entries;
	int error;
};
238 
239 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(dm_writecache_throttle,
240 					    "A percentage of time allocated for data copying");
241 
/* Take the per-target mutex guarding all in-core writecache state. */
static void wc_lock(struct dm_writecache *wc)
{
	mutex_lock(&wc->lock);
}
246 
/* Release the per-target mutex taken by wc_lock(). */
static void wc_unlock(struct dm_writecache *wc)
{
	mutex_unlock(&wc->lock);
}
251 
#ifdef DM_WRITECACHE_HAS_PMEM
/*
 * Map the whole cache device into the kernel address space via DAX.
 * Ideally dax_direct_access() hands back the entire range as one linear
 * mapping; otherwise fall back to collecting every struct page and
 * building a virtually contiguous mapping with vmap() (tracked by
 * wc->memory_vmapped).  On success wc->memory_map/memory_map_size are
 * adjusted to skip the start_sector offset.  Returns 0 or -errno.
 */
static int persistent_memory_claim(struct dm_writecache *wc)
{
	int r;
	loff_t s;
	long p, da;
	unsigned long pfn;
	int id;
	struct page **pages;
	sector_t offset;

	wc->memory_vmapped = false;

	s = wc->memory_map_size;
	p = s >> PAGE_SHIFT;
	if (!p) {
		r = -EINVAL;
		goto err1;
	}
	/* the page count was truncated when assigned to a long */
	if (p != s >> PAGE_SHIFT) {
		r = -EOVERFLOW;
		goto err1;
	}

	/* the device/partition must start on a page boundary */
	offset = get_start_sect(wc->ssd_dev->bdev);
	if (offset & (PAGE_SIZE / 512 - 1)) {
		r = -EINVAL;
		goto err1;
	}
	offset >>= PAGE_SHIFT - 9;	/* 512-byte sectors -> pages */

	id = dax_read_lock();

	da = dax_direct_access(wc->ssd_dev->dax_dev, offset, p, DAX_ACCESS,
			&wc->memory_map, &pfn);
	if (da < 0) {
		wc->memory_map = NULL;
		r = da;
		goto err2;
	}
	if (!pfn_valid(pfn)) {
		wc->memory_map = NULL;
		r = -EOPNOTSUPP;
		goto err2;
	}
	if (da != p) {
		/*
		 * DAX could not give us the whole range in one mapping:
		 * gather all pages and vmap() them instead.
		 */
		long i;

		wc->memory_map = NULL;
		pages = vmalloc_array(p, sizeof(struct page *));
		if (!pages) {
			r = -ENOMEM;
			goto err2;
		}
		i = 0;
		do {
			long daa;

			daa = dax_direct_access(wc->ssd_dev->dax_dev, offset + i,
					p - i, DAX_ACCESS, NULL, &pfn);
			if (daa <= 0) {
				r = daa ? daa : -EINVAL;
				goto err3;
			}
			if (!pfn_valid(pfn)) {
				r = -EOPNOTSUPP;
				goto err3;
			}
			/* pfns within one dax_direct_access chunk are contiguous */
			while (daa-- && i < p) {
				pages[i++] = pfn_to_page(pfn);
				pfn++;
				if (!(i & 15))
					cond_resched();
			}
		} while (i < p);
		wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL);
		if (!wc->memory_map) {
			r = -ENOMEM;
			goto err3;
		}
		vfree(pages);
		wc->memory_vmapped = true;
	}

	dax_read_unlock(id);

	/* skip the reserved area in front of the metadata */
	wc->memory_map += (size_t)wc->start_sector << SECTOR_SHIFT;
	wc->memory_map_size -= (size_t)wc->start_sector << SECTOR_SHIFT;

	return 0;
err3:
	vfree(pages);
err2:
	dax_read_unlock(id);
err1:
	return r;
}
#else
static int persistent_memory_claim(struct dm_writecache *wc)
{
	return -EOPNOTSUPP;
}
#endif
355 
/*
 * Undo persistent_memory_claim(): only a vmap()ed fallback mapping needs
 * explicit teardown; note memory_map was advanced past start_sector, so
 * subtract that offset back before vunmap().
 */
static void persistent_memory_release(struct dm_writecache *wc)
{
	if (wc->memory_vmapped)
		vunmap(wc->memory_map - ((size_t)wc->start_sector << SECTOR_SHIFT));
}
361 
/*
 * Translate a persistent-memory kernel address to its struct page.
 * A vmap()ed fallback mapping needs the vmalloc lookup; a linear DAX
 * mapping translates directly.
 */
static struct page *persistent_memory_page(void *addr)
{
	return is_vmalloc_addr(addr) ? vmalloc_to_page(addr) :
				       virt_to_page(addr);
}
369 
/* Byte offset of @addr within its page. */
static unsigned int persistent_memory_page_offset(void *addr)
{
	return (unsigned long)addr & (PAGE_SIZE - 1);
}
374 
/*
 * Flush CPU caches for a range before it is read through another mapping;
 * only needed for the vmap()ed case (architectures with aliasing caches).
 */
static void persistent_memory_flush_cache(void *ptr, size_t size)
{
	if (is_vmalloc_addr(ptr))
		flush_kernel_vmap_range(ptr, size);
}
380 
/*
 * Invalidate CPU caches for a range that may have been written through
 * another mapping; only needed for the vmap()ed case.
 */
static void persistent_memory_invalidate_cache(void *ptr, size_t size)
{
	if (is_vmalloc_addr(ptr))
		invalidate_kernel_vmap_range(ptr, size);
}
386 
/* The on-media superblock lives at the start of the mapped metadata. */
static struct wc_memory_superblock *sb(struct dm_writecache *wc)
{
	return wc->memory_map;
}
391 
/* On-media metadata record for entry @e. */
static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e)
{
	return &sb(wc)->entries[e->index];
}
396 
397 static void *memory_data(struct dm_writecache *wc, struct wc_entry *e)
398 {
399 	return (char *)wc->block_start + (e->index << wc->block_size_bits);
400 }
401 
/*
 * Sector on the cache device holding entry @e's data: past the reserved
 * start area and the metadata, then index scaled to sectors per block.
 */
static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e)
{
	return wc->start_sector + wc->metadata_sectors +
		((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT));
}
407 
/*
 * Original-device sector cached in @e.  With hardware-error handling the
 * in-core shadow copy is used so lookups never read possibly-failing pmem.
 */
static uint64_t read_original_sector(struct dm_writecache *wc, struct wc_entry *e)
{
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	return e->original_sector;
#else
	return le64_to_cpu(memory_entry(wc, e)->original_sector);
#endif
}
416 
/*
 * Sequence number under which @e was written; from the in-core shadow
 * copy when hardware-error handling is enabled, else from media.
 */
static uint64_t read_seq_count(struct dm_writecache *wc, struct wc_entry *e)
{
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	return e->seq_count;
#else
	return le64_to_cpu(memory_entry(wc, e)->seq_count);
#endif
}
425 
/*
 * Invalidate @e by writing the sentinel seq_count of -1 (both to the
 * in-core shadow, if present, and to the media copy).
 */
static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e)
{
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	e->seq_count = -1;
#endif
	pmem_assign(memory_entry(wc, e)->seq_count, cpu_to_le64(-1));
}
433 
/*
 * Record @original_sector/@seq_count for entry @e.  The media copy is
 * written as one 16-byte pmem_assign so both fields land together; the
 * in-core shadow (if enabled) is updated first.
 */
static void write_original_sector_seq_count(struct dm_writecache *wc, struct wc_entry *e,
					    uint64_t original_sector, uint64_t seq_count)
{
	struct wc_memory_entry me;
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	e->original_sector = original_sector;
	e->seq_count = seq_count;
#endif
	me.original_sector = cpu_to_le64(original_sector);
	me.seq_count = cpu_to_le64(seq_count);
	pmem_assign(*memory_entry(wc, e), me);
}
446 
/*
 * Latch the first error into wc->error (cmpxchg ensures only the first
 * caller logs) and wake any writers sleeping on the freelist so they can
 * observe it.
 */
#define writecache_error(wc, err, msg, arg...)				\
do {									\
	if (!cmpxchg(&(wc)->error, 0, err))				\
		DMERR(msg, ##arg);					\
	wake_up(&(wc)->freelist_wait);					\
} while (0)

#define writecache_has_error(wc)	(unlikely(READ_ONCE((wc)->error)))
455 
/*
 * SSD mode: mark every metadata region dirty so the next commit rewrites
 * all of it.  A no-op in pmem mode, where stores hit media directly.
 */
static void writecache_flush_all_metadata(struct dm_writecache *wc)
{
	if (!WC_MODE_PMEM(wc))
		memset(wc->dirty_bitmap, -1, wc->dirty_bitmap_size);
}
461 
/*
 * SSD mode: mark the BITMAP_GRANULARITY-sized region containing @ptr
 * dirty for the next ssd_commit_flushed().  No-op in pmem mode.
 * NOTE(review): only the first byte's region is marked; callers appear to
 * pass ranges that never straddle a region boundary — confirm.
 */
static void writecache_flush_region(struct dm_writecache *wc, void *ptr, size_t size)
{
	if (!WC_MODE_PMEM(wc))
		__set_bit(((char *)ptr - (char *)wc->memory_map) / BITMAP_GRANULARITY,
			  wc->dirty_bitmap);
}
468 
469 static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev);
470 
/*
 * Completion context for the async metadata writes issued from
 * ssd_commit_flushed(); @c fires when @count drops to zero.
 */
struct io_notify {
	struct dm_writecache *wc;
	struct completion c;
	atomic_t count;
};
476 
/*
 * dm-io completion callback for one metadata write: record any error and
 * complete the waiter when the last outstanding write finishes.
 */
static void writecache_notify_io(unsigned long error, void *context)
{
	struct io_notify *endio = context;

	if (unlikely(error != 0))
		writecache_error(endio->wc, -EIO, "error writing metadata");
	BUG_ON(atomic_read(&endio->count) <= 0);
	if (atomic_dec_and_test(&endio->count))
		complete(&endio->c);
}
487 
/* Wait until all in-flight bios of @direction (READ or WRITE) finish. */
static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
{
	wait_event(wc->bio_in_progress_wait[direction],
		   !atomic_read(&wc->bio_in_progress[direction]));
}
493 
/*
 * SSD-mode commit: write every dirty metadata region from the in-core
 * buffer to the cache device, wait for the writes, optionally wait for
 * in-flight WRITE bios, then issue a disk flush and clear the bitmap.
 * endio.count starts at 1; the final writecache_notify_io(0, ...) drops
 * that reference so the completion cannot fire early.
 */
static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
{
	struct dm_io_region region;
	struct dm_io_request req;
	struct io_notify endio = {
		wc,
		COMPLETION_INITIALIZER_ONSTACK(endio.c),
		ATOMIC_INIT(1),
	};
	unsigned int bitmap_bits = wc->dirty_bitmap_size * 8;
	unsigned int i = 0;

	while (1) {
		unsigned int j;

		/* [i, j) is the next run of consecutive dirty regions */
		i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i);
		if (unlikely(i == bitmap_bits))
			break;
		j = find_next_zero_bit(wc->dirty_bitmap, bitmap_bits, i);

		region.bdev = wc->ssd_dev->bdev;
		region.sector = (sector_t)i * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
		region.count = (sector_t)(j - i) * (BITMAP_GRANULARITY >> SECTOR_SHIFT);

		/* clamp to the metadata area */
		if (unlikely(region.sector >= wc->metadata_sectors))
			break;
		if (unlikely(region.sector + region.count > wc->metadata_sectors))
			region.count = wc->metadata_sectors - region.sector;

		region.sector += wc->start_sector;
		atomic_inc(&endio.count);
		req.bi_opf = REQ_OP_WRITE | REQ_SYNC;
		req.mem.type = DM_IO_VMA;
		req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
		req.client = wc->dm_io;
		req.notify.fn = writecache_notify_io;
		req.notify.context = &endio;

		/* writing via async dm-io (implied by notify.fn above) won't return an error */
		(void) dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
		i = j;
	}

	/* drop the initial reference and wait for all writes */
	writecache_notify_io(0, &endio);
	wait_for_completion_io(&endio.c);

	if (wait_for_ios)
		writecache_wait_for_ios(wc, WRITE);

	writecache_disk_flush(wc, wc->ssd_dev);

	memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
}
547 
/*
 * SSD-mode: synchronously rewrite the superblock area (at least 4 KiB or
 * one cache block) with FUA so the new seq_count is durable.
 */
static void ssd_commit_superblock(struct dm_writecache *wc)
{
	int r;
	struct dm_io_region region;
	struct dm_io_request req;

	region.bdev = wc->ssd_dev->bdev;
	region.sector = 0;
	region.count = max(4096U, wc->block_size) >> SECTOR_SHIFT;

	if (unlikely(region.sector + region.count > wc->metadata_sectors))
		region.count = wc->metadata_sectors - region.sector;

	region.sector += wc->start_sector;

	req.bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_FUA;
	req.mem.type = DM_IO_VMA;
	req.mem.ptr.vma = (char *)wc->memory_map;
	req.client = wc->dm_io;
	req.notify.fn = NULL;		/* synchronous dm_io */
	req.notify.context = NULL;

	r = dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
	if (unlikely(r))
		writecache_error(wc, r, "error writing superblock");
}
574 
/*
 * Make previously flushed metadata durable: a persistent-memory write
 * barrier in pmem mode, or writing the dirty regions out in SSD mode.
 */
static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
{
	if (WC_MODE_PMEM(wc))
		pmem_wmb();
	else
		ssd_commit_flushed(wc, wait_for_ios);
}
582 
/*
 * Issue an empty PREFLUSH bio (region.count == 0) to @dev to flush its
 * volatile write cache; errors are latched via writecache_error().
 */
static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
{
	int r;
	struct dm_io_region region;
	struct dm_io_request req;

	region.bdev = dev->bdev;
	region.sector = 0;
	region.count = 0;
	req.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
	req.mem.type = DM_IO_KMEM;
	req.mem.ptr.addr = NULL;
	req.client = wc->dm_io;
	req.notify.fn = NULL;

	r = dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
	if (unlikely(r))
		writecache_error(wc, r, "error flushing metadata: %d", r);
}
602 
603 #define WFE_RETURN_FOLLOWING	1
604 #define WFE_LOWEST_SEQ		2
605 
/*
 * Look up the cache entry for original sector @block.
 * With WFE_RETURN_FOLLOWING, a miss returns the entry with the next
 * higher original sector instead of NULL (used for range scans).
 * Several entries may exist for the same sector (old + rewritten data);
 * they are adjacent in the tree, and the second loop walks to the end of
 * that run: toward rb_prev for WFE_LOWEST_SEQ, else toward rb_next.
 */
static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
					      uint64_t block, int flags)
{
	struct wc_entry *e;
	struct rb_node *node = wc->tree.rb_node;

	if (unlikely(!node))
		return NULL;

	while (1) {
		e = container_of(node, struct wc_entry, rb_node);
		if (read_original_sector(wc, e) == block)
			break;

		node = (read_original_sector(wc, e) >= block ?
			e->rb_node.rb_left : e->rb_node.rb_right);
		if (unlikely(!node)) {
			if (!(flags & WFE_RETURN_FOLLOWING))
				return NULL;
			if (read_original_sector(wc, e) >= block)
				return e;

			/* e is below @block: the successor is the answer */
			node = rb_next(&e->rb_node);
			if (unlikely(!node))
				return NULL;

			e = container_of(node, struct wc_entry, rb_node);
			return e;
		}
	}

	while (1) {
		struct wc_entry *e2;

		if (flags & WFE_LOWEST_SEQ)
			node = rb_prev(&e->rb_node);
		else
			node = rb_next(&e->rb_node);
		if (unlikely(!node))
			return e;
		e2 = container_of(node, struct wc_entry, rb_node);
		if (read_original_sector(wc, e2) != block)
			return e;
		e = e2;
	}
}
652 
/*
 * Insert @ins into the lookup tree (sorted by original sector; equal
 * sectors go right, so duplicates stay adjacent) and onto the head of
 * the LRU list, stamping its age for the max_age machinery.
 */
static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *ins)
{
	struct wc_entry *e;
	struct rb_node **node = &wc->tree.rb_node, *parent = NULL;

	while (*node) {
		e = container_of(*node, struct wc_entry, rb_node);
		parent = &e->rb_node;
		if (read_original_sector(wc, e) > read_original_sector(wc, ins))
			node = &parent->rb_left;
		else
			node = &parent->rb_right;
	}
	rb_link_node(&ins->rb_node, parent, node);
	rb_insert_color(&ins->rb_node, &wc->tree);
	list_add(&ins->lru, &wc->lru);
	ins->age = jiffies;
}
671 
672 static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e)
673 {
674 	list_del(&e->lru);
675 	rb_erase(&e->rb_node, &wc->tree);
676 }
677 
/*
 * Return @e to the freelist.  SSD mode keeps free entries in an rb-tree
 * ordered by entry address (which mirrors cache-sector order, so
 * allocations come out sequential); pmem mode uses a plain FIFO list.
 */
static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry *e)
{
	if (WC_MODE_SORT_FREELIST(wc)) {
		struct rb_node **node = &wc->freetree.rb_node, *parent = NULL;

		/* first free entry becomes the allocation cursor */
		if (unlikely(!*node))
			wc->current_free = e;
		while (*node) {
			parent = *node;
			if (&e->rb_node < *node)
				node = &parent->rb_left;
			else
				node = &parent->rb_right;
		}
		rb_link_node(&e->rb_node, parent, node);
		rb_insert_color(&e->rb_node, &wc->freetree);
	} else {
		list_add_tail(&e->lru, &wc->freelist);
	}
	wc->freelist_size++;
}
699 
/*
 * Kick the writeback worker when the number of free (or freeable, i.e.
 * already under writeback) blocks falls to the high watermark.
 */
static inline void writecache_verify_watermark(struct dm_writecache *wc)
{
	if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark))
		queue_work(wc->writeback_wq, &wc->writeback_work);
}
705 
/*
 * Periodic timer (period max_age / MAX_AGE_DIV): trigger writeback of
 * aged entries and re-arm, unless suspended or errored.
 */
static void writecache_max_age_timer(struct timer_list *t)
{
	struct dm_writecache *wc = timer_container_of(wc, t, max_age_timer);

	if (!dm_suspended(wc->ti) && !writecache_has_error(wc)) {
		queue_work(wc->writeback_wq, &wc->writeback_work);
		mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);
	}
}
715 
/*
 * Allocate a free entry.  If @expected_sector is not (sector_t)-1, only
 * succeed when the candidate's cache sector matches it (used to allocate
 * blocks contiguous with a previous one).  Returns NULL when empty or on
 * a sector mismatch.  Must be called with wc->lock held.
 */
static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector)
{
	struct wc_entry *e;

	if (WC_MODE_SORT_FREELIST(wc)) {
		struct rb_node *next;

		if (unlikely(!wc->current_free))
			return NULL;
		e = wc->current_free;
		if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
			return NULL;
		next = rb_next(&e->rb_node);
		rb_erase(&e->rb_node, &wc->freetree);
		/* cursor wraps to the lowest free entry */
		if (unlikely(!next))
			next = rb_first(&wc->freetree);
		wc->current_free = next ? container_of(next, struct wc_entry, rb_node) : NULL;
	} else {
		if (unlikely(list_empty(&wc->freelist)))
			return NULL;
		e = container_of(wc->freelist.next, struct wc_entry, lru);
		if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
			return NULL;
		list_del(&e->lru);
	}
	wc->freelist_size--;

	writecache_verify_watermark(wc);

	return e;
}
747 
/*
 * Invalidate @e and return it to the freelist: unlink from tree/LRU,
 * clear its on-media seq_count, mark the metadata dirty and wake anyone
 * blocked waiting for a free block.
 */
static void writecache_free_entry(struct dm_writecache *wc, struct wc_entry *e)
{
	writecache_unlink(wc, e);
	writecache_add_to_freelist(wc, e);
	clear_seq_count(wc, e);
	writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
	if (unlikely(waitqueue_active(&wc->freelist_wait)))
		wake_up(&wc->freelist_wait);
}
757 
/*
 * Sleep until a free entry may be available.  prepare_to_wait() is done
 * before dropping wc->lock so a wake_up between unlock and schedule
 * cannot be lost.  Reacquires wc->lock before returning.
 */
static void writecache_wait_on_freelist(struct dm_writecache *wc)
{
	DEFINE_WAIT(wait);

	prepare_to_wait(&wc->freelist_wait, &wait, TASK_UNINTERRUPTIBLE);
	wc_unlock(wc);
	io_schedule();
	finish_wait(&wc->freelist_wait, &wait);
	wc_lock(wc);
}
768 
static void writecache_poison_lists(struct dm_writecache *wc)
{
	/*
	 * Catch incorrect access to these values while the device is suspended.
	 * They are rebuilt from scratch in writecache_resume().
	 */
	memset(&wc->tree, -1, sizeof(wc->tree));
	wc->lru.next = LIST_POISON1;
	wc->lru.prev = LIST_POISON2;
	wc->freelist.next = LIST_POISON1;
	wc->freelist.prev = LIST_POISON2;
}
780 
/*
 * Mark entry @e's metadata (and, in pmem mode, its data block) for the
 * next commit.
 */
static void writecache_flush_entry(struct dm_writecache *wc, struct wc_entry *e)
{
	writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
	if (WC_MODE_PMEM(wc))
		writecache_flush_region(wc, memory_data(wc, e), wc->block_size);
}
787 
/*
 * An entry is committed when it was written under an earlier sequence
 * number than the current (not yet committed) one.
 */
static bool writecache_entry_is_committed(struct dm_writecache *wc, struct wc_entry *e)
{
	return read_seq_count(wc, e) < wc->seq_count;
}
792 
/*
 * Commit all uncommitted entries: flush their metadata/data, commit,
 * bump and persist seq_count (which atomically makes them "committed"),
 * then free superseded duplicates of the same original sectors.
 * Called with wc->lock held.
 */
static void writecache_flush(struct dm_writecache *wc)
{
	struct wc_entry *e, *e2;
	bool need_flush_after_free;

	wc->uncommitted_blocks = 0;
	timer_delete(&wc->autocommit_timer);

	if (list_empty(&wc->lru))
		return;

	/* newest entry is at the head; if it's committed, everything is */
	e = container_of(wc->lru.next, struct wc_entry, lru);
	if (writecache_entry_is_committed(wc, e)) {
		if (wc->overwrote_committed) {
			writecache_wait_for_ios(wc, WRITE);
			writecache_disk_flush(wc, wc->ssd_dev);
			wc->overwrote_committed = false;
		}
		return;
	}
	/* flush every uncommitted entry (they are contiguous at the head) */
	while (1) {
		writecache_flush_entry(wc, e);
		if (unlikely(e->lru.next == &wc->lru))
			break;
		e2 = container_of(e->lru.next, struct wc_entry, lru);
		if (writecache_entry_is_committed(wc, e2))
			break;
		e = e2;
		cond_resched();
	}
	writecache_commit_flushed(wc, true);

	/* persist the new seq_count: this is the commit point */
	wc->seq_count++;
	pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
	if (WC_MODE_PMEM(wc))
		writecache_commit_flushed(wc, false);
	else
		ssd_commit_superblock(wc);

	wc->overwrote_committed = false;

	need_flush_after_free = false;
	while (1) {
		/* Free another committed entry with lower seq-count */
		struct rb_node *rb_node = rb_prev(&e->rb_node);

		if (rb_node) {
			e2 = container_of(rb_node, struct wc_entry, rb_node);
			if (read_original_sector(wc, e2) == read_original_sector(wc, e) &&
			    likely(!e2->write_in_progress)) {
				writecache_free_entry(wc, e2);
				need_flush_after_free = true;
			}
		}
		if (unlikely(e->lru.prev == &wc->lru))
			break;
		e = container_of(e->lru.prev, struct wc_entry, lru);
		cond_resched();
	}

	if (need_flush_after_free)
		writecache_commit_flushed(wc, false);
}
856 
/* Workqueue wrapper: take the lock and commit all pending entries. */
static void writecache_flush_work(struct work_struct *work)
{
	struct dm_writecache *wc = container_of(work, struct dm_writecache, flush_work);

	wc_lock(wc);
	writecache_flush(wc);
	wc_unlock(wc);
}
865 
/* Autocommit timer: schedule a flush unless the device has errored. */
static void writecache_autocommit_timer(struct timer_list *t)
{
	struct dm_writecache *wc = timer_container_of(wc, t, autocommit_timer);

	if (!writecache_has_error(wc))
		queue_work(wc->writeback_wq, &wc->flush_work);
}
873 
/* Arm the autocommit timer if it is not already pending. */
static void writecache_schedule_autocommit(struct dm_writecache *wc)
{
	if (!timer_pending(&wc->autocommit_timer))
		mod_timer(&wc->autocommit_timer, jiffies + wc->autocommit_jiffies);
}
879 
/*
 * Drop all cache entries covering original sectors [start, end).  Before
 * freeing the first entry in SSD mode, wait for in-flight bios so no I/O
 * still targets the blocks being recycled.  Called with wc->lock held.
 */
static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_t end)
{
	struct wc_entry *e;
	bool discarded_something = false;

	e = writecache_find_entry(wc, start, WFE_RETURN_FOLLOWING | WFE_LOWEST_SEQ);
	if (unlikely(!e))
		return;

	while (read_original_sector(wc, e) < end) {
		struct rb_node *node = rb_next(&e->rb_node);

		if (likely(!e->write_in_progress)) {
			if (!discarded_something) {
				if (!WC_MODE_PMEM(wc)) {
					writecache_wait_for_ios(wc, READ);
					writecache_wait_for_ios(wc, WRITE);
				}
				discarded_something = true;
			}
			if (!writecache_entry_is_committed(wc, e))
				wc->uncommitted_blocks--;
			writecache_free_entry(wc, e);
		}

		if (unlikely(!node))
			break;

		e = container_of(node, struct wc_entry, rb_node);
	}

	if (discarded_something)
		writecache_commit_flushed(wc, false);
}
914 
915 static bool writecache_wait_for_writeback(struct dm_writecache *wc)
916 {
917 	if (wc->writeback_size) {
918 		writecache_wait_on_freelist(wc);
919 		return true;
920 	}
921 	return false;
922 }
923 
/*
 * Target suspend: stop the timers, commit everything, optionally write
 * everything back (flush_on_suspend message), drain the workqueue, wait
 * for outstanding writeback and finally poison the in-core lists so any
 * stray access while suspended is caught.
 */
static void writecache_suspend(struct dm_target *ti)
{
	struct dm_writecache *wc = ti->private;
	bool flush_on_suspend;

	timer_delete_sync(&wc->autocommit_timer);
	timer_delete_sync(&wc->max_age_timer);

	wc_lock(wc);
	writecache_flush(wc);
	flush_on_suspend = wc->flush_on_suspend;
	if (flush_on_suspend) {
		wc->flush_on_suspend = false;
		/* writeback_all forces the worker to drain every entry */
		wc->writeback_all++;
		queue_work(wc->writeback_wq, &wc->writeback_work);
	}
	wc_unlock(wc);

	drain_workqueue(wc->writeback_wq);

	wc_lock(wc);
	if (flush_on_suspend)
		wc->writeback_all--;
	while (writecache_wait_for_writeback(wc))
		;

	if (WC_MODE_PMEM(wc))
		persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);

	writecache_poison_lists(wc);

	wc_unlock(wc);
}
957 
/*
 * Allocate and initialize the in-core wc_entry array (idempotent: a
 * no-op if already allocated).  Returns 0 or -ENOMEM.
 */
static int writecache_alloc_entries(struct dm_writecache *wc)
{
	size_t b;

	if (wc->entries)
		return 0;
	wc->entries = vmalloc_array(wc->n_blocks, sizeof(struct wc_entry));
	if (!wc->entries)
		return -ENOMEM;
	for (b = 0; b < wc->n_blocks; b++) {
		struct wc_entry *e = &wc->entries[b];

		e->index = b;
		e->write_in_progress = false;
		cond_resched();
	}

	return 0;
}
977 
/*
 * SSD mode: synchronously read @n_sectors of on-disk metadata into the
 * in-core buffer at wc->memory_map.  Returns the dm_io result.
 */
static int writecache_read_metadata(struct dm_writecache *wc, sector_t n_sectors)
{
	struct dm_io_region region;
	struct dm_io_request req;

	region.bdev = wc->ssd_dev->bdev;
	region.sector = wc->start_sector;
	region.count = n_sectors;
	req.bi_opf = REQ_OP_READ | REQ_SYNC;
	req.mem.type = DM_IO_VMA;
	req.mem.ptr.vma = (char *)wc->memory_map;
	req.client = wc->dm_io;
	req.notify.fn = NULL;

	return dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
}
994 
/*
 * Target resume: (re)load the metadata and rebuild all in-core state.
 * Every on-media entry is classified: uncommitted entries go to the
 * freelist (and are erased if they carry a stale seq_count); committed
 * entries are inserted into the lookup tree, resolving duplicates of the
 * same original sector by keeping the one with the higher seq_count.
 */
static void writecache_resume(struct dm_target *ti)
{
	struct dm_writecache *wc = ti->private;
	size_t b;
	bool need_flush = false;
	__le64 sb_seq_count;
	int r;

	wc_lock(wc);

	wc->data_device_sectors = bdev_nr_sectors(wc->dev->bdev);

	if (WC_MODE_PMEM(wc)) {
		persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size);
	} else {
		r = writecache_read_metadata(wc, wc->metadata_sectors);
		if (r) {
			size_t sb_entries_offset;

			/* unreadable metadata: poison the entries so all are invalid */
			writecache_error(wc, r, "unable to read metadata: %d", r);
			sb_entries_offset = offsetof(struct wc_memory_superblock, entries);
			memset((char *)wc->memory_map + sb_entries_offset, -1,
			       (wc->metadata_sectors << SECTOR_SHIFT) - sb_entries_offset);
		}
	}

	/* rebuild the structures poisoned at suspend time */
	wc->tree = RB_ROOT;
	INIT_LIST_HEAD(&wc->lru);
	if (WC_MODE_SORT_FREELIST(wc)) {
		wc->freetree = RB_ROOT;
		wc->current_free = NULL;
	} else {
		INIT_LIST_HEAD(&wc->freelist);
	}
	wc->freelist_size = 0;

	r = copy_mc_to_kernel(&sb_seq_count, &sb(wc)->seq_count,
			      sizeof(uint64_t));
	if (r) {
		writecache_error(wc, r, "hardware memory error when reading superblock: %d", r);
		sb_seq_count = cpu_to_le64(0);
	}
	wc->seq_count = le64_to_cpu(sb_seq_count);

#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	/* populate the in-core shadow copies, tolerating media errors */
	for (b = 0; b < wc->n_blocks; b++) {
		struct wc_entry *e = &wc->entries[b];
		struct wc_memory_entry wme;

		if (writecache_has_error(wc)) {
			e->original_sector = -1;
			e->seq_count = -1;
			continue;
		}
		r = copy_mc_to_kernel(&wme, memory_entry(wc, e),
				      sizeof(struct wc_memory_entry));
		if (r) {
			writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d",
					 (unsigned long)b, r);
			e->original_sector = -1;
			e->seq_count = -1;
		} else {
			e->original_sector = le64_to_cpu(wme.original_sector);
			e->seq_count = le64_to_cpu(wme.seq_count);
		}
		cond_resched();
	}
#endif
	for (b = 0; b < wc->n_blocks; b++) {
		struct wc_entry *e = &wc->entries[b];

		if (!writecache_entry_is_committed(wc, e)) {
			if (read_seq_count(wc, e) != -1) {
erase_this:
				clear_seq_count(wc, e);
				need_flush = true;
			}
			writecache_add_to_freelist(wc, e);
		} else {
			struct wc_entry *old;

			old = writecache_find_entry(wc, read_original_sector(wc, e), 0);
			if (!old) {
				writecache_insert_entry(wc, e);
			} else {
				if (read_seq_count(wc, old) == read_seq_count(wc, e)) {
					writecache_error(wc, -EINVAL,
						 "two identical entries, position %llu, sector %llu, sequence %llu",
						 (unsigned long long)b, (unsigned long long)read_original_sector(wc, e),
						 (unsigned long long)read_seq_count(wc, e));
				}
				if (read_seq_count(wc, old) > read_seq_count(wc, e)) {
					/* the existing entry is newer: drop this one */
					goto erase_this;
				} else {
					writecache_free_entry(wc, old);
					writecache_insert_entry(wc, e);
					need_flush = true;
				}
			}
		}
		cond_resched();
	}

	if (need_flush) {
		writecache_flush_all_metadata(wc);
		writecache_commit_flushed(wc, false);
	}

	writecache_verify_watermark(wc);

	if (wc->max_age != MAX_AGE_UNSPECIFIED)
		mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);

	wc_unlock(wc);
}
1110 
/*
 * "flush" message: commit everything, force a full writeback pass
 * (writeback_all) and wait for it to drain.  Returns 0, -EBUSY while
 * suspended, -EIO after an error, -EINVAL on bad arguments.
 */
static int process_flush_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
{
	if (argc != 1)
		return -EINVAL;

	wc_lock(wc);
	if (dm_suspended(wc->ti)) {
		wc_unlock(wc);
		return -EBUSY;
	}
	if (writecache_has_error(wc)) {
		wc_unlock(wc);
		return -EIO;
	}

	writecache_flush(wc);
	wc->writeback_all++;
	queue_work(wc->writeback_wq, &wc->writeback_work);
	wc_unlock(wc);

	flush_workqueue(wc->writeback_wq);

	wc_lock(wc);
	wc->writeback_all--;
	if (writecache_has_error(wc)) {
		wc_unlock(wc);
		return -EIO;
	}
	wc_unlock(wc);

	return 0;
}
1143 
1144 static int process_flush_on_suspend_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
1145 {
1146 	if (argc != 1)
1147 		return -EINVAL;
1148 
1149 	wc_lock(wc);
1150 	wc->flush_on_suspend = true;
1151 	wc_unlock(wc);
1152 
1153 	return 0;
1154 }
1155 
1156 static void activate_cleaner(struct dm_writecache *wc)
1157 {
1158 	wc->flush_on_suspend = true;
1159 	wc->cleaner = true;
1160 	wc->freelist_high_watermark = wc->n_blocks;
1161 	wc->freelist_low_watermark = wc->n_blocks;
1162 }
1163 
1164 static int process_cleaner_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
1165 {
1166 	if (argc != 1)
1167 		return -EINVAL;
1168 
1169 	wc_lock(wc);
1170 	activate_cleaner(wc);
1171 	if (!dm_suspended(wc->ti))
1172 		writecache_verify_watermark(wc);
1173 	wc_unlock(wc);
1174 
1175 	return 0;
1176 }
1177 
1178 static int process_clear_stats_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
1179 {
1180 	if (argc != 1)
1181 		return -EINVAL;
1182 
1183 	wc_lock(wc);
1184 	memset(&wc->stats, 0, sizeof(wc->stats));
1185 	wc_unlock(wc);
1186 
1187 	return 0;
1188 }
1189 
1190 static int writecache_message(struct dm_target *ti, unsigned int argc, char **argv,
1191 			      char *result, unsigned int maxlen)
1192 {
1193 	int r = -EINVAL;
1194 	struct dm_writecache *wc = ti->private;
1195 
1196 	if (!strcasecmp(argv[0], "flush"))
1197 		r = process_flush_mesg(argc, argv, wc);
1198 	else if (!strcasecmp(argv[0], "flush_on_suspend"))
1199 		r = process_flush_on_suspend_mesg(argc, argv, wc);
1200 	else if (!strcasecmp(argv[0], "cleaner"))
1201 		r = process_cleaner_mesg(argc, argv, wc);
1202 	else if (!strcasecmp(argv[0], "clear_stats"))
1203 		r = process_clear_stats_mesg(argc, argv, wc);
1204 	else
1205 		DMERR("unrecognised message received: %s", argv[0]);
1206 
1207 	return r;
1208 }
1209 
/*
 * Copy @size bytes from @source to persistent memory at @dest, ensuring
 * the data is flushed from the CPU cache.  On x86 with clflushopt and a
 * 64-byte cache line, large copies (>= 768 bytes) use a memcpy+clflushopt
 * loop, which benchmarks faster than non-temporal stores at that size;
 * everything else falls back to memcpy_flushcache().
 */
static void memcpy_flushcache_optimized(void *dest, void *source, size_t size)
{
	/*
	 * clflushopt performs better with block size 1024, 2048, 4096
	 * non-temporal stores perform better with block size 512
	 *
	 * block size   512             1024            2048            4096
	 * movnti       496 MB/s        642 MB/s        725 MB/s        744 MB/s
	 * clflushopt   373 MB/s        688 MB/s        1.1 GB/s        1.2 GB/s
	 *
	 * We see that movnti performs better for 512-byte blocks, and
	 * clflushopt performs better for 1024-byte and larger blocks. So, we
	 * prefer clflushopt for sizes >= 768.
	 *
	 * NOTE: this happens to be the case now (with dm-writecache's single
	 * threaded model) but re-evaluate this once memcpy_flushcache() is
	 * enabled to use movdir64b which might invalidate this performance
	 * advantage seen with cache-allocating-writes plus flushing.
	 */
#ifdef CONFIG_X86
	if (static_cpu_has(X86_FEATURE_CLFLUSHOPT) &&
	    likely(boot_cpu_data.x86_clflush_size == 64) &&
	    likely(size >= 768)) {
		/* Copy and flush one cache line (64 bytes) at a time. */
		do {
			memcpy((void *)dest, (void *)source, 64);
			clflushopt((void *)dest);
			dest += 64;
			source += 64;
			size -= 64;
		} while (size >= 64);
		/*
		 * NOTE(review): any remainder < 64 bytes is not copied here;
		 * callers appear to pass multiples of 64 — confirm before
		 * relying on this for arbitrary sizes.
		 */
		return;
	}
#endif
	memcpy_flushcache(dest, source, size);
}
1245 
/*
 * Copy one cache block between a bio's pages and persistent memory at
 * @data, advancing the bio as it goes.  For reads, data flows pmem->bio
 * via copy_mc_to_kernel() so hardware memory errors are caught instead
 * of crashing; for writes, data flows bio->pmem with cache flushing.
 */
static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data)
{
	void *buf;
	unsigned int size;
	int rw = bio_data_dir(bio);
	unsigned int remaining_size = wc->block_size;

	do {
		/* Map the current bio segment; it may be smaller than a block. */
		struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);

		buf = bvec_kmap_local(&bv);
		size = bv.bv_len;
		if (unlikely(size > remaining_size))
			size = remaining_size;

		if (rw == READ) {
			int r;

			/* Machine-check-safe copy: r != 0 means a hardware memory error. */
			r = copy_mc_to_kernel(buf, data, size);
			flush_dcache_page(bio_page(bio));
			if (unlikely(r)) {
				writecache_error(wc, r, "hardware memory error when reading data: %d", r);
				bio->bi_status = BLK_STS_IOERR;
			}
		} else {
			flush_dcache_page(bio_page(bio));
			memcpy_flushcache_optimized(data, buf, size);
		}

		kunmap_local(buf);

		data = (char *)data + size;
		remaining_size -= size;
		bio_advance(bio, size);
	} while (unlikely(remaining_size));
}
1282 
/*
 * Kernel thread that services bios offloaded by writecache_offload_bio()
 * (SSD-mode flushes and discards).  It pops bios from wc->flush_list
 * under wc->lock, processes them, and sleeps when the list is empty.
 */
static int writecache_flush_thread(void *data)
{
	struct dm_writecache *wc = data;

	while (1) {
		struct bio *bio;

		wc_lock(wc);
		bio = bio_list_pop(&wc->flush_list);
		if (!bio) {
			/*
			 * Set TASK_INTERRUPTIBLE before dropping the lock so a
			 * concurrent wake_up_process() is not lost.
			 */
			set_current_state(TASK_INTERRUPTIBLE);
			wc_unlock(wc);

			if (unlikely(kthread_should_stop())) {
				set_current_state(TASK_RUNNING);
				break;
			}

			schedule();
			continue;
		}

		if (bio_op(bio) == REQ_OP_DISCARD) {
			/* Drop cached entries in the range, then pass the discard to the origin. */
			writecache_discard(wc, bio->bi_iter.bi_sector,
					   bio_end_sector(bio));
			wc_unlock(wc);
			bio_set_dev(bio, wc->dev->bdev);
			submit_bio_noacct(bio);
		} else {
			/* Flush request: commit metadata, then complete the bio. */
			writecache_flush(wc);
			wc_unlock(wc);
			if (writecache_has_error(wc))
				bio->bi_status = BLK_STS_IOERR;
			bio_endio(bio);
		}
	}

	return 0;
}
1322 
/*
 * Queue a bio for the flush thread.  Called with wc->lock held, so the
 * woken thread cannot consume the list until we release the lock; waking
 * before the add is therefore safe and only needed when the list was empty.
 */
static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio)
{
	if (bio_list_empty(&wc->flush_list))
		wake_up_process(wc->flush_thread);
	bio_list_add(&wc->flush_list, bio);
}
1329 
/* Outcome of mapping a bio; translated to DM_MAPIO_* in writecache_map(). */
enum wc_map_op {
	WC_MAP_SUBMIT,		/* bio fully handled here; end it with success */
	WC_MAP_REMAP,		/* remapped to the cache (SSD) device */
	WC_MAP_REMAP_ORIGIN,	/* remapped to the origin device */
	WC_MAP_RETURN,		/* ownership passed elsewhere (flush thread) */
	WC_MAP_ERROR,		/* fail the bio with an I/O error */
};
1337 
1338 static void writecache_map_remap_origin(struct dm_writecache *wc, struct bio *bio,
1339 					struct wc_entry *e)
1340 {
1341 	if (e) {
1342 		sector_t next_boundary =
1343 			read_original_sector(wc, e) - bio->bi_iter.bi_sector;
1344 		if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT)
1345 			dm_accept_partial_bio(bio, next_boundary);
1346 	}
1347 }
1348 
/*
 * Map a READ bio.  Cache hits are served from persistent memory (copied
 * inline, possibly looping over several consecutive hit blocks) or
 * remapped to the SSD; misses are remapped to the origin device, clipped
 * at the next cached entry.
 */
static enum wc_map_op writecache_map_read(struct dm_writecache *wc, struct bio *bio)
{
	enum wc_map_op map_op;
	struct wc_entry *e;

read_next_block:
	wc->stats.reads++;
	e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
	if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) {
		wc->stats.read_hits++;
		if (WC_MODE_PMEM(wc)) {
			bio_copy_block(wc, bio, memory_data(wc, e));
			/* bio_copy_block advanced the bio; continue with the next block. */
			if (bio->bi_iter.bi_size)
				goto read_next_block;
			map_op = WC_MAP_SUBMIT;
		} else {
			/* SSD mode: remap one block to the cache device. */
			dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
			bio_set_dev(bio, wc->ssd_dev->bdev);
			bio->bi_iter.bi_sector = cache_sector(wc, e);
			/* Uncommitted data may still be in flight to the SSD. */
			if (!writecache_entry_is_committed(wc, e))
				writecache_wait_for_ios(wc, WRITE);
			map_op = WC_MAP_REMAP;
		}
	} else {
		/* Miss: read from the origin, truncated at the next cached entry. */
		writecache_map_remap_origin(wc, bio, e);
		/* Count each block of the remapped portion (one was counted above). */
		wc->stats.reads += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits;
		map_op = WC_MAP_REMAP_ORIGIN;
	}

	return map_op;
}
1380 
/*
 * SSD mode: grow a write starting at entry @e to cover as many
 * consecutive cache blocks as possible, then remap the bio to the cache
 * device.  With @search_used set, it extends over already-allocated
 * adjacent entries; otherwise it allocates fresh entries from the
 * freelist at consecutive cache sectors.
 */
static void writecache_bio_copy_ssd(struct dm_writecache *wc, struct bio *bio,
				    struct wc_entry *e, bool search_used)
{
	unsigned int bio_size = wc->block_size;
	sector_t start_cache_sec = cache_sector(wc, e);
	sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);

	while (bio_size < bio->bi_iter.bi_size) {
		if (!search_used) {
			/* Ask for a free entry at exactly the next cache sector. */
			struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);

			if (!f)
				break;
			write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
							(bio_size >> SECTOR_SHIFT), wc->seq_count);
			writecache_insert_entry(wc, f);
			wc->uncommitted_blocks++;
		} else {
			/* Extend only over the physically-next entry with the next original sector. */
			struct wc_entry *f;
			struct rb_node *next = rb_next(&e->rb_node);

			if (!next)
				break;
			f = container_of(next, struct wc_entry, rb_node);
			if (f != e + 1)
				break;
			if (read_original_sector(wc, f) !=
			    read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
				break;
			if (unlikely(f->write_in_progress))
				break;
			if (writecache_entry_is_committed(wc, f))
				wc->overwrote_committed = true;
			e = f;
		}
		bio_size += wc->block_size;
		current_cache_sec += wc->block_size >> SECTOR_SHIFT;
	}

	bio_set_dev(bio, wc->ssd_dev->bdev);
	bio->bi_iter.bi_sector = start_cache_sec;
	dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);

	wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
	wc->stats.writes_allocate += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits;

	/* Commit eagerly when enough uncommitted blocks pile up; otherwise arm the timer. */
	if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
		wc->uncommitted_blocks = 0;
		queue_work(wc->writeback_wq, &wc->flush_work);
	} else {
		writecache_schedule_autocommit(wc);
	}
}
1434 
/*
 * Map a WRITE bio.  Loops over the bio one cache block at a time:
 * overwrite an existing entry where possible, otherwise allocate from
 * the freelist; when the freelist is empty either write around to the
 * origin (SSD mode) or wait for writeback to free entries.
 */
static enum wc_map_op writecache_map_write(struct dm_writecache *wc, struct bio *bio)
{
	struct wc_entry *e;

	do {
		bool found_entry = false;
		bool search_used = false;

		if (writecache_has_error(wc)) {
			wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
			return WC_MAP_ERROR;
		}
		e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
		if (e) {
			/* Uncommitted entries can always be overwritten in place. */
			if (!writecache_entry_is_committed(wc, e)) {
				wc->stats.write_hits_uncommitted++;
				search_used = true;
				goto bio_copy;
			}
			wc->stats.write_hits_committed++;
			/* SSD mode may overwrite a committed entry not under writeback. */
			if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
				wc->overwrote_committed = true;
				search_used = true;
				goto bio_copy;
			}
			found_entry = true;
		} else {
			/* Cleaner/metadata-only modes bypass the cache for new data. */
			if (unlikely(wc->cleaner) ||
			    (wc->metadata_only && !(bio->bi_opf & REQ_META)))
				goto direct_write;
		}
		e = writecache_pop_from_freelist(wc, (sector_t)-1);
		if (unlikely(!e)) {
			/* Freelist exhausted: write around (SSD) or wait for free entries. */
			if (!WC_MODE_PMEM(wc) && !found_entry) {
direct_write:
				e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
				writecache_map_remap_origin(wc, bio, e);
				wc->stats.writes_around += bio->bi_iter.bi_size >> wc->block_size_bits;
				wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
				return WC_MAP_REMAP_ORIGIN;
			}
			wc->stats.writes_blocked_on_freelist++;
			writecache_wait_on_freelist(wc);
			continue;
		}
		/* Fresh entry: record its original sector and sequence number. */
		write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count);
		writecache_insert_entry(wc, e);
		wc->uncommitted_blocks++;
		wc->stats.writes_allocate++;
bio_copy:
		if (WC_MODE_PMEM(wc)) {
			bio_copy_block(wc, bio, memory_data(wc, e));
			wc->stats.writes++;
		} else {
			/* SSD mode remaps the bio; the rest of it is handled by the clone. */
			writecache_bio_copy_ssd(wc, bio, e, search_used);
			return WC_MAP_REMAP;
		}
	} while (bio->bi_iter.bi_size);

	/* FUA writes and full autocommit batches are committed immediately. */
	if (unlikely(bio->bi_opf & REQ_FUA || wc->uncommitted_blocks >= wc->autocommit_blocks))
		writecache_flush(wc);
	else
		writecache_schedule_autocommit(wc);

	return WC_MAP_SUBMIT;
}
1501 
/*
 * Map a REQ_PREFLUSH bio.  PMEM mode flushes synchronously under the
 * lock; SSD mode offloads the flush to the flush thread (except the
 * extra target bio, which is sent straight to the origin).
 */
static enum wc_map_op writecache_map_flush(struct dm_writecache *wc, struct bio *bio)
{
	if (writecache_has_error(wc))
		return WC_MAP_ERROR;

	if (WC_MODE_PMEM(wc)) {
		wc->stats.flushes++;
		writecache_flush(wc);
		if (writecache_has_error(wc))
			return WC_MAP_ERROR;
		/* In cleaner/metadata-only modes data also lives on the origin; flush it too. */
		else if (unlikely(wc->cleaner) || unlikely(wc->metadata_only))
			return WC_MAP_REMAP_ORIGIN;
		return WC_MAP_SUBMIT;
	}
	/* SSD: */
	if (dm_bio_get_target_bio_nr(bio))
		return WC_MAP_REMAP_ORIGIN;
	wc->stats.flushes++;
	writecache_offload_bio(wc, bio);
	return WC_MAP_RETURN;
}
1523 
1524 static enum wc_map_op writecache_map_discard(struct dm_writecache *wc, struct bio *bio)
1525 {
1526 	wc->stats.discards += bio->bi_iter.bi_size >> wc->block_size_bits;
1527 
1528 	if (writecache_has_error(wc))
1529 		return WC_MAP_ERROR;
1530 
1531 	if (WC_MODE_PMEM(wc)) {
1532 		writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio));
1533 		return WC_MAP_REMAP_ORIGIN;
1534 	}
1535 	/* SSD: */
1536 	writecache_offload_bio(wc, bio);
1537 	return WC_MAP_RETURN;
1538 }
1539 
/*
 * Device-mapper map callback.  Classifies the bio (flush, discard, read
 * or write), delegates to the matching helper under wc->lock, then
 * converts the resulting wc_map_op into a DM_MAPIO_* return value.
 * bio->bi_private encodes how writecache_end_io must account the bio:
 * (void *)1 = cache remap, (void *)2 = origin write tracked by dm-iot.
 */
static int writecache_map(struct dm_target *ti, struct bio *bio)
{
	struct dm_writecache *wc = ti->private;
	enum wc_map_op map_op;

	bio->bi_private = NULL;

	wc_lock(wc);

	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
		map_op = writecache_map_flush(wc, bio);
		goto done;
	}

	bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);

	/* Reject bios that are not aligned to the cache block size. */
	if (unlikely((((unsigned int)bio->bi_iter.bi_sector | bio_sectors(bio)) &
				(wc->block_size / 512 - 1)) != 0)) {
		DMERR("I/O is not aligned, sector %llu, size %u, block size %u",
		      (unsigned long long)bio->bi_iter.bi_sector,
		      bio->bi_iter.bi_size, wc->block_size);
		map_op = WC_MAP_ERROR;
		goto done;
	}

	if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
		map_op = writecache_map_discard(wc, bio);
		goto done;
	}

	if (bio_data_dir(bio) == READ)
		map_op = writecache_map_read(wc, bio);
	else
		map_op = writecache_map_write(wc, bio);
done:
	switch (map_op) {
	case WC_MAP_REMAP_ORIGIN:
		/* Track origin writes for the idle-time pause heuristic. */
		if (likely(wc->pause != 0)) {
			if (bio_op(bio) == REQ_OP_WRITE) {
				dm_iot_io_begin(&wc->iot, 1);
				bio->bi_private = (void *)2;
			}
		}
		bio_set_dev(bio, wc->dev->bdev);
		wc_unlock(wc);
		return DM_MAPIO_REMAPPED;

	case WC_MAP_REMAP:
		/* make sure that writecache_end_io decrements bio_in_progress: */
		bio->bi_private = (void *)1;
		atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]);
		wc_unlock(wc);
		return DM_MAPIO_REMAPPED;

	case WC_MAP_SUBMIT:
		wc_unlock(wc);
		bio_endio(bio);
		return DM_MAPIO_SUBMITTED;

	case WC_MAP_RETURN:
		/* Bio now owned by the flush thread; do not touch it. */
		wc_unlock(wc);
		return DM_MAPIO_SUBMITTED;

	case WC_MAP_ERROR:
		wc_unlock(wc);
		bio_io_error(bio);
		return DM_MAPIO_SUBMITTED;

	default:
		BUG();
		wc_unlock(wc);
		return DM_MAPIO_KILL;
	}
}
1614 
1615 static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status)
1616 {
1617 	struct dm_writecache *wc = ti->private;
1618 
1619 	if (bio->bi_private == (void *)1) {
1620 		int dir = bio_data_dir(bio);
1621 
1622 		if (atomic_dec_and_test(&wc->bio_in_progress[dir]))
1623 			if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir])))
1624 				wake_up(&wc->bio_in_progress_wait[dir]);
1625 	} else if (bio->bi_private == (void *)2) {
1626 		dm_iot_io_end(&wc->iot, 1);
1627 	}
1628 	return 0;
1629 }
1630 
/*
 * Device-mapper iterate_devices callback: report only the origin device
 * (the one whose size bounds the target).
 */
static int writecache_iterate_devices(struct dm_target *ti,
				      iterate_devices_callout_fn fn, void *data)
{
	struct dm_writecache *wc = ti->private;

	return fn(ti, wc->dev, 0, ti->len, data);
}
1638 
/*
 * Device-mapper io_hints callback: stack queue limits based on the
 * cache block size.
 */
static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct dm_writecache *wc = ti->private;

	dm_stack_bs_limits(limits, wc->block_size);
}
1645 
/*
 * bi_end_io for PMEM-mode writeback bios.  Runs in interrupt context:
 * queue the completed writeback_struct for the endio thread and wake it
 * if the list was empty (the thread only sleeps when the list is empty).
 */
static void writecache_writeback_endio(struct bio *bio)
{
	struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio);
	struct dm_writecache *wc = wb->wc;
	unsigned long flags;

	raw_spin_lock_irqsave(&wc->endio_list_lock, flags);
	if (unlikely(list_empty(&wc->endio_list)))
		wake_up_process(wc->endio_thread);
	list_add_tail(&wb->endio_entry, &wc->endio_list);
	raw_spin_unlock_irqrestore(&wc->endio_list_lock, flags);
}
1658 
/*
 * kcopyd completion callback for SSD-mode writeback.  Records the copy
 * result and hands the copy_struct to the endio thread, waking it if
 * the list was empty.
 */
static void writecache_copy_endio(int read_err, unsigned long write_err, void *ptr)
{
	struct copy_struct *c = ptr;
	struct dm_writecache *wc = c->wc;

	/* Collapse read/write failure into a single -EIO. */
	c->error = likely(!(read_err | write_err)) ? 0 : -EIO;

	raw_spin_lock_irq(&wc->endio_list_lock);
	if (unlikely(list_empty(&wc->endio_list)))
		wake_up_process(wc->endio_thread);
	list_add_tail(&c->endio_entry, &wc->endio_list);
	raw_spin_unlock_irq(&wc->endio_list_lock);
}
1672 
/*
 * Process completed PMEM-mode writeback bios from @list.  For each bio,
 * free all entries it covered; wc->lock is held by the caller and is
 * dropped/retaken every ENDIO_LATENCY entries to bound lock hold time.
 */
static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list)
{
	unsigned int i;
	struct writeback_struct *wb;
	struct wc_entry *e;
	unsigned long n_walked = 0;

	do {
		wb = list_entry(list->next, struct writeback_struct, endio_entry);
		list_del(&wb->endio_entry);

		if (unlikely(wb->bio.bi_status != BLK_STS_OK))
			writecache_error(wc, blk_status_to_errno(wb->bio.bi_status),
					"write error %d", wb->bio.bi_status);
		i = 0;
		do {
			e = wb->wc_list[i];
			BUG_ON(!e->write_in_progress);
			e->write_in_progress = false;
			INIT_LIST_HEAD(&e->lru);
			/* On error, keep entries so data is not lost from the cache. */
			if (!writecache_has_error(wc))
				writecache_free_entry(wc, e);
			BUG_ON(!wc->writeback_size);
			wc->writeback_size--;
			n_walked++;
			/* Periodically commit and drop the lock to limit latency. */
			if (unlikely(n_walked >= ENDIO_LATENCY)) {
				writecache_commit_flushed(wc, false);
				wc_unlock(wc);
				wc_lock(wc);
				n_walked = 0;
			}
		} while (++i < wb->wc_list_n);

		if (wb->wc_list != wb->wc_list_inline)
			kfree(wb->wc_list);
		bio_put(&wb->bio);
	} while (!list_empty(list));
}
1711 
/*
 * Process completed SSD-mode kcopyd copies from @list.  Each copy_struct
 * covers c->n_entries physically-consecutive entries starting at c->e;
 * free them all.  Called with wc->lock held.
 */
static void __writecache_endio_ssd(struct dm_writecache *wc, struct list_head *list)
{
	struct copy_struct *c;
	struct wc_entry *e;

	do {
		c = list_entry(list->next, struct copy_struct, endio_entry);
		list_del(&c->endio_entry);

		if (unlikely(c->error))
			writecache_error(wc, c->error, "copy error");

		e = c->e;
		do {
			BUG_ON(!e->write_in_progress);
			e->write_in_progress = false;
			INIT_LIST_HEAD(&e->lru);
			/* On error, keep entries so data is not lost from the cache. */
			if (!writecache_has_error(wc))
				writecache_free_entry(wc, e);

			BUG_ON(!wc->writeback_size);
			wc->writeback_size--;
			e++;
		} while (--c->n_entries);
		mempool_free(c, &wc->copy_pool);
	} while (!list_empty(list));
}
1739 
/*
 * Kernel thread that completes writeback: it drains wc->endio_list in
 * batches, flushes the origin device (unless FUA writes were used),
 * frees the written-back entries and commits the metadata.
 */
static int writecache_endio_thread(void *data)
{
	struct dm_writecache *wc = data;

	while (1) {
		struct list_head list;

		raw_spin_lock_irq(&wc->endio_list_lock);
		if (!list_empty(&wc->endio_list))
			goto pop_from_list;
		/* Set state before unlocking so a concurrent wake-up is not lost. */
		set_current_state(TASK_INTERRUPTIBLE);
		raw_spin_unlock_irq(&wc->endio_list_lock);

		if (unlikely(kthread_should_stop())) {
			set_current_state(TASK_RUNNING);
			break;
		}

		schedule();

		continue;

pop_from_list:
		/* Splice the whole list onto the stack, leaving wc->endio_list empty. */
		list = wc->endio_list;
		list.next->prev = list.prev->next = &list;
		INIT_LIST_HEAD(&wc->endio_list);
		raw_spin_unlock_irq(&wc->endio_list_lock);

		/* Data must be durable on the origin before entries are freed. */
		if (!WC_MODE_FUA(wc))
			writecache_disk_flush(wc, wc->dev);

		wc_lock(wc);

		if (WC_MODE_PMEM(wc)) {
			__writecache_endio_pmem(wc, &list);
		} else {
			__writecache_endio_ssd(wc, &list);
			writecache_wait_for_ios(wc, READ);
		}

		writecache_commit_flushed(wc, false);

		wc_unlock(wc);
	}

	return 0;
}
1787 
/*
 * Add entry @e's persistent-memory block to the writeback bio.  Returns
 * true on success.  If the bio already extends past the end of the data
 * device, pretend success without adding the page; the trailing blocks
 * are handled by the zero-sector bio_endio path in the caller.
 */
static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e)
{
	struct dm_writecache *wc = wb->wc;
	unsigned int block_size = wc->block_size;
	void *address = memory_data(wc, e);

	persistent_memory_flush_cache(address, block_size);

	if (unlikely(bio_end_sector(&wb->bio) >= wc->data_device_sectors))
		return true;

	return bio_add_page(&wb->bio, persistent_memory_page(address),
			    block_size, persistent_memory_page_offset(address)) != 0;
}
1802 
/* Batch of wc_entry objects (linked via e->lru) selected for writeback. */
struct writeback_list {
	struct list_head list;	/* entries queued for writeback */
	size_t size;		/* number of entries on the list */
};
1807 
/*
 * Throttle writeback submission: if max_writeback_jobs is configured and
 * too many writebacks are already in flight (excluding the not-yet-
 * submitted entries still on @wbl), block until some complete.
 */
static void __writeback_throttle(struct dm_writecache *wc, struct writeback_list *wbl)
{
	if (unlikely(wc->max_writeback_jobs)) {
		/* Lockless pre-check; re-checked under the lock before sleeping. */
		if (READ_ONCE(wc->writeback_size) - wbl->size >= wc->max_writeback_jobs) {
			wc_lock(wc);
			while (wc->writeback_size - wbl->size >= wc->max_writeback_jobs)
				writecache_wait_on_freelist(wc);
			wc_unlock(wc);
		}
	}
	cond_resched();
}
1820 
/*
 * PMEM-mode writeback: consume @wbl from the tail, building one bio per
 * run of entries whose original sectors are consecutive, and submit it
 * to the origin device.  Each bio's writeback_struct records the entries
 * it covers so the endio path can free them.
 */
static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeback_list *wbl)
{
	struct wc_entry *e, *f;
	struct bio *bio;
	struct writeback_struct *wb;
	unsigned int max_pages;

	while (wbl->size) {
		wbl->size--;
		e = container_of(wbl->list.prev, struct wc_entry, lru);
		list_del(&e->lru);

		max_pages = e->wc_list_contiguous;

		bio = bio_alloc_bioset(wc->dev->bdev, max_pages, REQ_OP_WRITE,
				       GFP_NOIO, &wc->bio_set);
		wb = container_of(bio, struct writeback_struct, bio);
		wb->wc = wc;
		bio->bi_end_io = writecache_writeback_endio;
		bio->bi_iter.bi_sector = read_original_sector(wc, e);

		/* Entry list larger than the inline array: try a heap allocation. */
		if (unlikely(max_pages > WB_LIST_INLINE))
			wb->wc_list = kmalloc_objs(struct wc_entry *, max_pages,
						   GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);

		/* Fall back to the inline array (and a shorter run) if it failed. */
		if (likely(max_pages <= WB_LIST_INLINE) || unlikely(!wb->wc_list)) {
			wb->wc_list = wb->wc_list_inline;
			max_pages = WB_LIST_INLINE;
		}

		BUG_ON(!wc_add_block(wb, e));

		wb->wc_list[0] = e;
		wb->wc_list_n = 1;

		/* Extend the bio over further entries with consecutive original sectors. */
		while (wbl->size && wb->wc_list_n < max_pages) {
			f = container_of(wbl->list.prev, struct wc_entry, lru);
			if (read_original_sector(wc, f) !=
			    read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
				break;
			if (!wc_add_block(wb, f))
				break;
			wbl->size--;
			list_del(&f->lru);
			wb->wc_list[wb->wc_list_n++] = f;
			e = f;
		}
		if (WC_MODE_FUA(wc))
			bio->bi_opf |= REQ_FUA;
		if (writecache_has_error(wc)) {
			bio->bi_status = BLK_STS_IOERR;
			bio_endio(bio);
		} else if (unlikely(!bio_sectors(bio))) {
			/* All blocks lay past the end of the data device; complete as success. */
			bio->bi_status = BLK_STS_OK;
			bio_endio(bio);
		} else {
			submit_bio(bio);
		}

		__writeback_throttle(wc, wbl);
	}
}
1883 
/*
 * SSD-mode writeback: consume @wbl from the tail, issuing one kcopyd
 * copy (cache device -> origin device) per run of wc_list_contiguous
 * entries; writecache_copy_endio() frees the entries on completion.
 */
static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writeback_list *wbl)
{
	struct wc_entry *e, *f;
	struct dm_io_region from, to;
	struct copy_struct *c;

	while (wbl->size) {
		unsigned int n_sectors;

		wbl->size--;
		e = container_of(wbl->list.prev, struct wc_entry, lru);
		list_del(&e->lru);

		n_sectors = e->wc_list_contiguous << (wc->block_size_bits - SECTOR_SHIFT);

		from.bdev = wc->ssd_dev->bdev;
		from.sector = cache_sector(wc, e);
		from.count = n_sectors;
		to.bdev = wc->dev->bdev;
		to.sector = read_original_sector(wc, e);
		to.count = n_sectors;

		c = mempool_alloc(&wc->copy_pool, GFP_NOIO);
		c->wc = wc;
		c->e = e;
		c->n_entries = e->wc_list_contiguous;

		/* Pop the remaining entries of the run; they must be physically adjacent. */
		while ((n_sectors -= wc->block_size >> SECTOR_SHIFT)) {
			wbl->size--;
			f = container_of(wbl->list.prev, struct wc_entry, lru);
			BUG_ON(f != e + 1);
			list_del(&f->lru);
			e = f;
		}

		/* Clamp (or skip) copies that would run past the end of the origin device. */
		if (unlikely(to.sector + to.count > wc->data_device_sectors)) {
			if (to.sector >= wc->data_device_sectors) {
				writecache_copy_endio(0, 0, c);
				continue;
			}
			from.count = to.count = wc->data_device_sectors - to.sector;
		}

		dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c);

		__writeback_throttle(wc, wbl);
	}
}
1932 
/*
 * Writeback work item.  Optionally pauses while the origin device is
 * busy, then walks the LRU (or, in writeback_all mode, the rb-tree in
 * sector order) collecting entries into a writeback_list, merging runs
 * of consecutive original sectors, and hands the list to the PMEM or
 * SSD submission helper.
 */
static void writecache_writeback(struct work_struct *work)
{
	struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work);
	struct blk_plug plug;
	struct wc_entry *f, *g, *e = NULL;
	struct rb_node *node, *next_node;
	struct list_head skipped;
	struct writeback_list wbl;
	unsigned long n_walked;

	if (!WC_MODE_PMEM(wc)) {
		/* Wait for any active kcopyd work on behalf of ssd writeback */
		dm_kcopyd_client_flush(wc->dm_kcopyd);
	}

	/* Delay writeback until the origin has been idle for wc->pause jiffies. */
	if (likely(wc->pause != 0)) {
		while (1) {
			unsigned long idle;

			if (unlikely(wc->cleaner) || unlikely(wc->writeback_all) ||
			    unlikely(dm_suspended(wc->ti)))
				break;
			idle = dm_iot_idle_time(&wc->iot);
			if (idle >= wc->pause)
				break;
			idle = wc->pause - idle;
			if (idle > HZ)
				idle = HZ;
			schedule_timeout_idle(idle);
		}
	}

	wc_lock(wc);
restart:
	if (writecache_has_error(wc)) {
		wc_unlock(wc);
		return;
	}

	if (unlikely(wc->writeback_all)) {
		if (writecache_wait_for_writeback(wc))
			goto restart;
	}

	if (wc->overwrote_committed)
		writecache_wait_for_ios(wc, WRITE);

	n_walked = 0;
	INIT_LIST_HEAD(&skipped);
	INIT_LIST_HEAD(&wbl.list);
	wbl.size = 0;
	/* Collect entries while below the low watermark, aged out, or draining all. */
	while (!list_empty(&wc->lru) &&
	       (wc->writeback_all ||
		wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark ||
		(jiffies - container_of(wc->lru.prev, struct wc_entry, lru)->age >=
		 wc->max_age - wc->max_age / MAX_AGE_DIV))) {

		n_walked++;
		/* Bound time under the lock; requeue ourselves to continue later. */
		if (unlikely(n_walked > WRITEBACK_LATENCY) &&
		    likely(!wc->writeback_all)) {
			if (likely(!dm_suspended(wc->ti)))
				queue_work(wc->writeback_wq, &wc->writeback_work);
			break;
		}

		if (unlikely(wc->writeback_all)) {
			/* Drain mode walks the rb-tree in original-sector order. */
			if (unlikely(!e)) {
				writecache_flush(wc);
				e = container_of(rb_first(&wc->tree), struct wc_entry, rb_node);
			} else
				e = g;
		} else
			e = container_of(wc->lru.prev, struct wc_entry, lru);
		BUG_ON(e->write_in_progress);
		if (unlikely(!writecache_entry_is_committed(wc, e)))
			writecache_flush(wc);

		/*
		 * A previous entry with the same original sector is still being
		 * written back; skip this one until that write completes.
		 */
		node = rb_prev(&e->rb_node);
		if (node) {
			f = container_of(node, struct wc_entry, rb_node);
			if (unlikely(read_original_sector(wc, f) ==
				     read_original_sector(wc, e))) {
				BUG_ON(!f->write_in_progress);
				list_move(&e->lru, &skipped);
				cond_resched();
				continue;
			}
		}
		wc->writeback_size++;
		list_move(&e->lru, &wbl.list);
		wbl.size++;
		e->write_in_progress = true;
		e->wc_list_contiguous = 1;

		f = e;

		/* Greedily extend the run over entries with consecutive original sectors. */
		while (1) {
			next_node = rb_next(&f->rb_node);
			if (unlikely(!next_node))
				break;
			g = container_of(next_node, struct wc_entry, rb_node);
			if (unlikely(read_original_sector(wc, g) ==
			    read_original_sector(wc, f))) {
				f = g;
				continue;
			}
			if (read_original_sector(wc, g) !=
			    read_original_sector(wc, f) + (wc->block_size >> SECTOR_SHIFT))
				break;
			if (unlikely(g->write_in_progress))
				break;
			if (unlikely(!writecache_entry_is_committed(wc, g)))
				break;

			/* SSD mode additionally needs physically adjacent entries. */
			if (!WC_MODE_PMEM(wc)) {
				if (g != f + 1)
					break;
			}

			n_walked++;
			//if (unlikely(n_walked > WRITEBACK_LATENCY) && likely(!wc->writeback_all))
			//	break;

			wc->writeback_size++;
			list_move(&g->lru, &wbl.list);
			wbl.size++;
			g->write_in_progress = true;
			g->wc_list_contiguous = BIO_MAX_VECS;
			f = g;
			e->wc_list_contiguous++;
			if (unlikely(e->wc_list_contiguous == BIO_MAX_VECS)) {
				/* Drain mode: remember where to resume on the next outer iteration. */
				if (unlikely(wc->writeback_all)) {
					next_node = rb_next(&f->rb_node);
					if (likely(next_node))
						g = container_of(next_node, struct wc_entry, rb_node);
				}
				break;
			}
		}
		cond_resched();
	}

	if (!list_empty(&skipped)) {
		list_splice_tail(&skipped, &wc->lru);
		/*
		 * If we didn't do any progress, we must wait until some
		 * writeback finishes to avoid burning CPU in a loop
		 */
		if (unlikely(!wbl.size))
			writecache_wait_for_writeback(wc);
	}

	wc_unlock(wc);

	blk_start_plug(&plug);

	if (WC_MODE_PMEM(wc))
		__writecache_writeback_pmem(wc, &wbl);
	else
		__writecache_writeback_ssd(wc, &wbl);

	blk_finish_plug(&plug);

	/* Drain mode waits here until everything queued above has completed. */
	if (unlikely(wc->writeback_all)) {
		wc_lock(wc);
		while (writecache_wait_for_writeback(wc))
			;
		wc_unlock(wc);
	}
}
2103 
/*
 * Compute how many cache blocks fit in a persistent-memory region of
 * @device_size bytes, accounting for the superblock and the per-block
 * metadata entries that precede the data area.
 *
 * On success, stores the block count in *n_blocks_p and the number of
 * blocks occupied by metadata in *n_metadata_blocks_p (either may be
 * NULL).  Returns 0, or -ENOSPC if nothing fits, or -EFBIG if the count
 * overflows the entries[] offset calculation or the wc_entry index field.
 */
static int calculate_memory_size(uint64_t device_size, unsigned int block_size,
				 size_t *n_blocks_p, size_t *n_metadata_blocks_p)
{
	uint64_t n_blocks, offset;
	struct wc_entry e;

	/* First estimate: each block costs block_size bytes plus one metadata entry. */
	n_blocks = device_size;
	do_div(n_blocks, block_size + sizeof(struct wc_memory_entry));

	/* Shrink until metadata (rounded up to block_size) plus data fit the device. */
	while (1) {
		if (!n_blocks)
			return -ENOSPC;
		/* Verify the following entries[n_blocks] won't overflow */
		if (n_blocks >= ((size_t)-sizeof(struct wc_memory_superblock) /
				 sizeof(struct wc_memory_entry)))
			return -EFBIG;
		offset = offsetof(struct wc_memory_superblock, entries[n_blocks]);
		offset = (offset + block_size - 1) & ~(uint64_t)(block_size - 1);
		if (offset + n_blocks * block_size <= device_size)
			break;
		n_blocks--;
	}

	/* check if the bit field overflows */
	e.index = n_blocks;
	if (e.index != n_blocks)
		return -EFBIG;

	if (n_blocks_p)
		*n_blocks_p = n_blocks;
	if (n_metadata_blocks_p)
		*n_metadata_blocks_p = offset >> __ffs(block_size);
	return 0;
}
2138 
/*
 * Initialize a fresh persistent-memory cache: size it, allocate the
 * in-core entry array, write the superblock fields and mark every block
 * entry as free.  The superblock magic is written last, after everything
 * else is flushed, so a torn initialization is never mistaken for a
 * valid cache.
 */
static int init_memory(struct dm_writecache *wc)
{
	size_t b;
	int r;

	r = calculate_memory_size(wc->memory_map_size, wc->block_size, &wc->n_blocks, NULL);
	if (r)
		return r;

	r = writecache_alloc_entries(wc);
	if (r)
		return r;

	for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++)
		pmem_assign(sb(wc)->padding[b], cpu_to_le64(0));
	pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION));
	pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size));
	pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks));
	pmem_assign(sb(wc)->seq_count, cpu_to_le64(0));

	/* Mark every entry free (original_sector == seq_count == -1). */
	for (b = 0; b < wc->n_blocks; b++) {
		write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
		cond_resched();
	}

	writecache_flush_all_metadata(wc);
	writecache_commit_flushed(wc, false);
	/* Only now make the cache recognizable as initialized. */
	pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
	writecache_flush_region(wc, &sb(wc)->magic, sizeof(sb(wc)->magic));
	writecache_commit_flushed(wc, false);

	return 0;
}
2172 
/*
 * Target destructor.  Also reached from the constructor's error path,
 * so every teardown step is guarded against resources that were never
 * allocated.  Threads are stopped first so nothing touches the state
 * being freed below.
 */
static void writecache_dtr(struct dm_target *ti)
{
	struct dm_writecache *wc = ti->private;

	if (!wc)
		return;

	if (wc->endio_thread)
		kthread_stop(wc->endio_thread);

	if (wc->flush_thread)
		kthread_stop(wc->flush_thread);

	bioset_exit(&wc->bio_set);

	mempool_exit(&wc->copy_pool);

	if (wc->writeback_wq)
		destroy_workqueue(wc->writeback_wq);

	if (wc->dev)
		dm_put_device(ti, wc->dev);

	if (wc->ssd_dev)
		dm_put_device(ti, wc->ssd_dev);

	vfree(wc->entries);

	/* The mapping was created differently depending on PMEM vs SSD mode. */
	if (wc->memory_map) {
		if (WC_MODE_PMEM(wc))
			persistent_memory_release(wc);
		else
			vfree(wc->memory_map);
	}

	if (wc->dm_kcopyd)
		dm_kcopyd_client_destroy(wc->dm_kcopyd);

	if (wc->dm_io)
		dm_io_client_destroy(wc->dm_io);

	vfree(wc->dirty_bitmap);

	kfree(wc);
}
2218 
/*
 * Constructor.  Table line:
 *   <p|s> <origin data dev> <cache dev> <block size> [<#feature args> <arg>...]
 * where 'p' selects persistent-memory mode and 's' SSD mode.
 *
 * After argument parsing, the cache metadata area is mapped (pmem) or read
 * into a vmalloc'ed shadow copy (SSD), the superblock is validated (an
 * all-zero superblock triggers formatting via init_memory()), and the
 * in-core sizes, watermarks and entry table are set up.
 *
 * All failure paths 'goto bad', which calls writecache_dtr(); the destructor
 * tolerates the partially-constructed state.
 */
static int writecache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct dm_writecache *wc;
	struct dm_arg_set as;
	const char *string;
	unsigned int opt_params;
	size_t offset, data_size;
	int i, r;
	char dummy;
	int high_wm_percent = HIGH_WATERMARK;
	int low_wm_percent = LOW_WATERMARK;
	uint64_t x;
	struct wc_memory_superblock s;

	static struct dm_arg _args[] = {
		{0, 18, "Invalid number of feature args"},
	};

	as.argc = argc;
	as.argv = argv;

	wc = kzalloc_obj(struct dm_writecache);
	if (!wc) {
		ti->error = "Cannot allocate writecache structure";
		r = -ENOMEM;
		goto bad;
	}
	ti->private = wc;
	wc->ti = ti;

	mutex_init(&wc->lock);
	/* -1UL sentinel: no max_age set unless the "max_age" feature arg appears. */
	wc->max_age = MAX_AGE_UNSPECIFIED;
	writecache_poison_lists(wc);
	init_waitqueue_head(&wc->freelist_wait);
	timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0);
	timer_setup(&wc->max_age_timer, writecache_max_age_timer, 0);

	/* Per-direction (read/write) in-flight bio accounting. */
	for (i = 0; i < 2; i++) {
		atomic_set(&wc->bio_in_progress[i], 0);
		init_waitqueue_head(&wc->bio_in_progress_wait[i]);
	}

	wc->dm_io = dm_io_client_create();
	if (IS_ERR(wc->dm_io)) {
		r = PTR_ERR(wc->dm_io);
		ti->error = "Unable to allocate dm-io client";
		wc->dm_io = NULL;
		goto bad;
	}

	wc->writeback_wq = alloc_workqueue("writecache-writeback",
					   WQ_MEM_RECLAIM | WQ_PERCPU, 1);
	if (!wc->writeback_wq) {
		r = -ENOMEM;
		ti->error = "Could not allocate writeback workqueue";
		goto bad;
	}
	INIT_WORK(&wc->writeback_work, writecache_writeback);
	INIT_WORK(&wc->flush_work, writecache_flush_work);

	dm_iot_init(&wc->iot);

	raw_spin_lock_init(&wc->endio_list_lock);
	INIT_LIST_HEAD(&wc->endio_list);
	wc->endio_thread = kthread_run(writecache_endio_thread, wc, "writecache_endio");
	if (IS_ERR(wc->endio_thread)) {
		r = PTR_ERR(wc->endio_thread);
		wc->endio_thread = NULL;
		ti->error = "Couldn't spawn endio thread";
		goto bad;
	}

	/*
	 * Parse the mode (pmem or ssd)
	 */
	string = dm_shift_arg(&as);
	if (!string)
		goto bad_arguments;

	if (!strcasecmp(string, "s")) {
		wc->pmem_mode = false;
	} else if (!strcasecmp(string, "p")) {
#ifdef DM_WRITECACHE_HAS_PMEM
		wc->pmem_mode = true;
		wc->writeback_fua = true;
#else
		/*
		 * If the architecture doesn't support persistent memory or
		 * the kernel doesn't support any DAX drivers, this driver can
		 * only be used in SSD-only mode.
		 */
		r = -EOPNOTSUPP;
		ti->error = "Persistent memory or DAX not supported on this system";
		goto bad;
#endif
	} else {
		goto bad_arguments;
	}

	/* Mode-specific pools: pmem writeback uses bios, SSD writeback uses kcopyd. */
	if (WC_MODE_PMEM(wc)) {
		r = bioset_init(&wc->bio_set, BIO_POOL_SIZE,
				offsetof(struct writeback_struct, bio),
				BIOSET_NEED_BVECS);
		if (r) {
			ti->error = "Could not allocate bio set";
			goto bad;
		}
	} else {
		wc->pause = PAUSE_WRITEBACK;
		r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct));
		if (r) {
			ti->error = "Could not allocate mempool";
			goto bad;
		}
	}

	/*
	 * Parse the origin data device
	 */
	string = dm_shift_arg(&as);
	if (!string)
		goto bad_arguments;
	r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->dev);
	if (r) {
		ti->error = "Origin data device lookup failed";
		goto bad;
	}

	/*
	 * Parse cache data device (be it pmem or ssd)
	 */
	string = dm_shift_arg(&as);
	if (!string)
		goto bad_arguments;

	r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->ssd_dev);
	if (r) {
		ti->error = "Cache data device lookup failed";
		goto bad;
	}
	wc->memory_map_size = bdev_nr_bytes(wc->ssd_dev->bdev);

	/*
	 * Parse the cache block size
	 */
	string = dm_shift_arg(&as);
	if (!string)
		goto bad_arguments;
	/* Block size must be a power of two in [512, PAGE_SIZE]. */
	if (sscanf(string, "%u%c", &wc->block_size, &dummy) != 1 ||
	    wc->block_size < 512 || wc->block_size > PAGE_SIZE ||
	    (wc->block_size & (wc->block_size - 1))) {
		r = -EINVAL;
		ti->error = "Invalid block size";
		goto bad;
	}
	if (wc->block_size < bdev_logical_block_size(wc->dev->bdev) ||
	    wc->block_size < bdev_logical_block_size(wc->ssd_dev->bdev)) {
		r = -EINVAL;
		ti->error = "Block size is smaller than device logical block size";
		goto bad;
	}
	wc->block_size_bits = __ffs(wc->block_size);

	wc->max_writeback_jobs = MAX_WRITEBACK_JOBS;
	wc->autocommit_blocks = !WC_MODE_PMEM(wc) ? AUTOCOMMIT_BLOCKS_SSD : AUTOCOMMIT_BLOCKS_PMEM;
	wc->autocommit_jiffies = msecs_to_jiffies(AUTOCOMMIT_MSEC);

	/*
	 * Parse optional arguments
	 */
	r = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
	if (r)
		goto bad;

	while (opt_params) {
		string = dm_shift_arg(&as), opt_params--;
		if (!strcasecmp(string, "start_sector") && opt_params >= 1) {
			unsigned long long start_sector;

			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1)
				goto invalid_optional;
			wc->start_sector = start_sector;
			wc->start_sector_set = true;
			/*
			 * Re-compare after assignment to detect truncation if
			 * wc->start_sector is narrower than unsigned long long,
			 * and reject offsets at or beyond the cache device end.
			 */
			if (wc->start_sector != start_sector ||
			    wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT)
				goto invalid_optional;
		} else if (!strcasecmp(string, "high_watermark") && opt_params >= 1) {
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1)
				goto invalid_optional;
			if (high_wm_percent < 0 || high_wm_percent > 100)
				goto invalid_optional;
			wc->high_wm_percent_value = high_wm_percent;
			wc->high_wm_percent_set = true;
		} else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) {
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%d%c", &low_wm_percent, &dummy) != 1)
				goto invalid_optional;
			if (low_wm_percent < 0 || low_wm_percent > 100)
				goto invalid_optional;
			wc->low_wm_percent_value = low_wm_percent;
			wc->low_wm_percent_set = true;
		} else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) {
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &wc->max_writeback_jobs, &dummy) != 1)
				goto invalid_optional;
			wc->max_writeback_jobs_set = true;
		} else if (!strcasecmp(string, "autocommit_blocks") && opt_params >= 1) {
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &wc->autocommit_blocks, &dummy) != 1)
				goto invalid_optional;
			wc->autocommit_blocks_set = true;
		} else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) {
			unsigned int autocommit_msecs;

			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1)
				goto invalid_optional;
			/* cap at one hour */
			if (autocommit_msecs > 3600000)
				goto invalid_optional;
			wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
			wc->autocommit_time_value = autocommit_msecs;
			wc->autocommit_time_set = true;
		} else if (!strcasecmp(string, "max_age") && opt_params >= 1) {
			unsigned int max_age_msecs;

			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &max_age_msecs, &dummy) != 1)
				goto invalid_optional;
			/* cap at one day */
			if (max_age_msecs > 86400000)
				goto invalid_optional;
			wc->max_age = msecs_to_jiffies(max_age_msecs);
			wc->max_age_set = true;
			wc->max_age_value = max_age_msecs;
		} else if (!strcasecmp(string, "cleaner")) {
			wc->cleaner_set = true;
			wc->cleaner = true;
		} else if (!strcasecmp(string, "fua")) {
			/* fua/nofua only make sense in pmem mode. */
			if (WC_MODE_PMEM(wc)) {
				wc->writeback_fua = true;
				wc->writeback_fua_set = true;
			} else
				goto invalid_optional;
		} else if (!strcasecmp(string, "nofua")) {
			if (WC_MODE_PMEM(wc)) {
				wc->writeback_fua = false;
				wc->writeback_fua_set = true;
			} else
				goto invalid_optional;
		} else if (!strcasecmp(string, "metadata_only")) {
			wc->metadata_only = true;
		} else if (!strcasecmp(string, "pause_writeback") && opt_params >= 1) {
			unsigned int pause_msecs;

			/* pause_writeback is SSD-mode only. */
			if (WC_MODE_PMEM(wc))
				goto invalid_optional;
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &pause_msecs, &dummy) != 1)
				goto invalid_optional;
			/* cap at one minute */
			if (pause_msecs > 60000)
				goto invalid_optional;
			wc->pause = msecs_to_jiffies(pause_msecs);
			wc->pause_set = true;
			wc->pause_value = pause_msecs;
		} else {
invalid_optional:
			r = -EINVAL;
			ti->error = "Invalid optional argument";
			goto bad;
		}
	}

	if (high_wm_percent < low_wm_percent) {
		r = -EINVAL;
		ti->error = "High watermark must be greater than or equal to low watermark";
		goto bad;
	}

	if (WC_MODE_PMEM(wc)) {
		if (!dax_synchronous(wc->ssd_dev->dax_dev)) {
			r = -EOPNOTSUPP;
			ti->error = "Asynchronous persistent memory not supported as pmem cache";
			goto bad;
		}

		r = persistent_memory_claim(wc);
		if (r) {
			ti->error = "Unable to map persistent memory for cache";
			goto bad;
		}
	} else {
		size_t n_blocks, n_metadata_blocks;
		uint64_t n_bitmap_bits;

		wc->memory_map_size -= (uint64_t)wc->start_sector << SECTOR_SHIFT;

		bio_list_init(&wc->flush_list);
		wc->flush_thread = kthread_run(writecache_flush_thread, wc, "dm_writecache_flush");
		if (IS_ERR(wc->flush_thread)) {
			r = PTR_ERR(wc->flush_thread);
			wc->flush_thread = NULL;
			ti->error = "Couldn't spawn flush thread";
			goto bad;
		}

		r = calculate_memory_size(wc->memory_map_size, wc->block_size,
					  &n_blocks, &n_metadata_blocks);
		if (r) {
			ti->error = "Invalid device size";
			goto bad;
		}

		/* One dirty bit per BITMAP_GRANULARITY bytes of metadata. */
		n_bitmap_bits = (((uint64_t)n_metadata_blocks << wc->block_size_bits) +
				 BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY;
		/* this is limitation of test_bit functions */
		if (n_bitmap_bits > 1U << 31) {
			r = -EFBIG;
			ti->error = "Invalid device size";
			goto bad;
		}

		/* In SSD mode the metadata lives in a vmalloc'ed shadow copy. */
		wc->memory_map = vmalloc(n_metadata_blocks << wc->block_size_bits);
		if (!wc->memory_map) {
			r = -ENOMEM;
			ti->error = "Unable to allocate memory for metadata";
			goto bad;
		}

		wc->dm_kcopyd = dm_kcopyd_client_create(&dm_kcopyd_throttle);
		if (IS_ERR(wc->dm_kcopyd)) {
			r = PTR_ERR(wc->dm_kcopyd);
			ti->error = "Unable to allocate dm-kcopyd client";
			wc->dm_kcopyd = NULL;
			goto bad;
		}

		wc->metadata_sectors = n_metadata_blocks << (wc->block_size_bits - SECTOR_SHIFT);
		wc->dirty_bitmap_size = (n_bitmap_bits + BITS_PER_LONG - 1) /
			BITS_PER_LONG * sizeof(unsigned long);
		wc->dirty_bitmap = vzalloc(wc->dirty_bitmap_size);
		if (!wc->dirty_bitmap) {
			r = -ENOMEM;
			ti->error = "Unable to allocate dirty bitmap";
			goto bad;
		}

		r = writecache_read_metadata(wc, wc->block_size >> SECTOR_SHIFT);
		if (r) {
			ti->error = "Unable to read first block of metadata";
			goto bad;
		}
	}

	/* copy_mc_to_kernel reports hardware (machine-check) errors instead of crashing. */
	r = copy_mc_to_kernel(&s, sb(wc), sizeof(struct wc_memory_superblock));
	if (r) {
		ti->error = "Hardware memory error when reading superblock";
		goto bad;
	}
	/* All-zero magic and version: device not yet formatted — initialize it. */
	if (!le32_to_cpu(s.magic) && !le32_to_cpu(s.version)) {
		r = init_memory(wc);
		if (r) {
			ti->error = "Unable to initialize device";
			goto bad;
		}
		r = copy_mc_to_kernel(&s, sb(wc),
				      sizeof(struct wc_memory_superblock));
		if (r) {
			ti->error = "Hardware memory error when reading superblock";
			goto bad;
		}
	}

	if (le32_to_cpu(s.magic) != MEMORY_SUPERBLOCK_MAGIC) {
		ti->error = "Invalid magic in the superblock";
		r = -EINVAL;
		goto bad;
	}

	if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) {
		ti->error = "Invalid version in the superblock";
		r = -EINVAL;
		goto bad;
	}

	if (le32_to_cpu(s.block_size) != wc->block_size) {
		ti->error = "Block size does not match superblock";
		r = -EINVAL;
		goto bad;
	}

	wc->n_blocks = le64_to_cpu(s.n_blocks);

	/*
	 * Validate the on-disk n_blocks: every multiplication/addition below is
	 * re-checked for overflow since the value comes from the device.
	 */
	offset = wc->n_blocks * sizeof(struct wc_memory_entry);
	if (offset / sizeof(struct wc_memory_entry) != le64_to_cpu(sb(wc)->n_blocks)) {
overflow:
		ti->error = "Overflow in size calculation";
		r = -EINVAL;
		goto bad;
	}
	offset += sizeof(struct wc_memory_superblock);
	if (offset < sizeof(struct wc_memory_superblock))
		goto overflow;
	/* Round the data area start up to a block boundary. */
	offset = (offset + wc->block_size - 1) & ~(size_t)(wc->block_size - 1);
	data_size = wc->n_blocks * (size_t)wc->block_size;
	if (!offset || (data_size / wc->block_size != wc->n_blocks) ||
	    (offset + data_size < offset))
		goto overflow;
	if (offset + data_size > wc->memory_map_size) {
		ti->error = "Memory area is too small";
		r = -EINVAL;
		goto bad;
	}

	wc->metadata_sectors = offset >> SECTOR_SHIFT;
	wc->block_start = (char *)sb(wc) + offset;

	/* Freelist watermarks: percentage of blocks that must stay free, rounded. */
	x = (uint64_t)wc->n_blocks * (100 - high_wm_percent);
	x += 50;
	do_div(x, 100);
	wc->freelist_high_watermark = x;
	x = (uint64_t)wc->n_blocks * (100 - low_wm_percent);
	x += 50;
	do_div(x, 100);
	wc->freelist_low_watermark = x;

	if (wc->cleaner)
		activate_cleaner(wc);

	r = writecache_alloc_entries(wc);
	if (r) {
		ti->error = "Cannot allocate memory";
		goto bad;
	}

	/* SSD mode needs a second flush bio for the cache device. */
	ti->num_flush_bios = WC_MODE_PMEM(wc) ? 1 : 2;
	ti->flush_supported = true;
	ti->num_discard_bios = 1;

	if (WC_MODE_PMEM(wc))
		persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);

	return 0;

bad_arguments:
	r = -EINVAL;
	ti->error = "Bad arguments";
bad:
	writecache_dtr(ti);
	return r;
}
2670 
/*
 * Report target status.  STATUSTYPE_INFO emits the error flag, block and
 * freelist accounting, and the I/O statistics counters.  STATUSTYPE_TABLE
 * reconstructs the constructor line: mode, devices, block size, and only
 * those feature arguments that were explicitly set (value args count as 2
 * words, flag args as 1).  STATUSTYPE_IMA emits an empty string.  The output
 * format is consumed by userspace (dmsetup) and must not change.
 */
static void writecache_status(struct dm_target *ti, status_type_t type,
			      unsigned int status_flags, char *result, unsigned int maxlen)
{
	struct dm_writecache *wc = ti->private;
	unsigned int extra_args;
	unsigned int sz = 0;

	switch (type) {
	case STATUSTYPE_INFO:
		DMEMIT("%ld %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
		       writecache_has_error(wc),
		       (unsigned long long)wc->n_blocks, (unsigned long long)wc->freelist_size,
		       (unsigned long long)wc->writeback_size,
		       wc->stats.reads,
		       wc->stats.read_hits,
		       wc->stats.writes,
		       wc->stats.write_hits_uncommitted,
		       wc->stats.write_hits_committed,
		       wc->stats.writes_around,
		       wc->stats.writes_allocate,
		       wc->stats.writes_blocked_on_freelist,
		       wc->stats.flushes,
		       wc->stats.discards);
		break;
	case STATUSTYPE_TABLE:
		DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
				wc->dev->name, wc->ssd_dev->name, wc->block_size);
		/* First count the feature-arg words, then emit count + args. */
		extra_args = 0;
		if (wc->start_sector_set)
			extra_args += 2;
		if (wc->high_wm_percent_set)
			extra_args += 2;
		if (wc->low_wm_percent_set)
			extra_args += 2;
		if (wc->max_writeback_jobs_set)
			extra_args += 2;
		if (wc->autocommit_blocks_set)
			extra_args += 2;
		if (wc->autocommit_time_set)
			extra_args += 2;
		if (wc->max_age_set)
			extra_args += 2;
		if (wc->cleaner_set)
			extra_args++;
		if (wc->writeback_fua_set)
			extra_args++;
		if (wc->metadata_only)
			extra_args++;
		if (wc->pause_set)
			extra_args += 2;

		DMEMIT("%u", extra_args);
		if (wc->start_sector_set)
			DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector);
		if (wc->high_wm_percent_set)
			DMEMIT(" high_watermark %u", wc->high_wm_percent_value);
		if (wc->low_wm_percent_set)
			DMEMIT(" low_watermark %u", wc->low_wm_percent_value);
		if (wc->max_writeback_jobs_set)
			DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs);
		if (wc->autocommit_blocks_set)
			DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
		if (wc->autocommit_time_set)
			DMEMIT(" autocommit_time %u", wc->autocommit_time_value);
		if (wc->max_age_set)
			DMEMIT(" max_age %u", wc->max_age_value);
		if (wc->cleaner_set)
			DMEMIT(" cleaner");
		if (wc->writeback_fua_set)
			DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
		if (wc->metadata_only)
			DMEMIT(" metadata_only");
		if (wc->pause_set)
			DMEMIT(" pause_writeback %u", wc->pause_value);
		break;
	case STATUSTYPE_IMA:
		*result = '\0';
		break;
	}
}
2751 
/* device-mapper target registration table and module boilerplate */
static struct target_type writecache_target = {
	.name			= "writecache",
	.version		= {1, 6, 0},
	.module			= THIS_MODULE,
	.ctr			= writecache_ctr,
	.dtr			= writecache_dtr,
	.status			= writecache_status,
	.postsuspend		= writecache_suspend,
	.resume			= writecache_resume,
	.message		= writecache_message,
	.map			= writecache_map,
	.end_io			= writecache_end_io,
	.iterate_devices	= writecache_iterate_devices,
	.io_hints		= writecache_io_hints,
};
module_dm(writecache);

MODULE_DESCRIPTION(DM_NAME " writecache target");
MODULE_AUTHOR("Mikulas Patocka <dm-devel@lists.linux.dev>");
MODULE_LICENSE("GPL");
2772