// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2018 Red Hat. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include <linux/device-mapper.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/kthread.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/dax.h>
#include <linux/libnvdimm.h>
#include <linux/delay.h>
#include "dm-io-tracker.h"

#define DM_MSG_PREFIX "writecache"

#define HIGH_WATERMARK			50
#define LOW_WATERMARK			45
#define MAX_WRITEBACK_JOBS		min(0x10000000 / PAGE_SIZE, totalram_pages() / 16)
#define ENDIO_LATENCY			16
#define WRITEBACK_LATENCY		64
#define AUTOCOMMIT_BLOCKS_SSD		65536
#define AUTOCOMMIT_BLOCKS_PMEM		64
#define AUTOCOMMIT_MSEC			1000
#define MAX_AGE_DIV			16
#define MAX_AGE_UNSPECIFIED		-1UL
#define PAUSE_WRITEBACK			(HZ * 3)

#define BITMAP_GRANULARITY	65536
#if BITMAP_GRANULARITY < PAGE_SIZE
#undef BITMAP_GRANULARITY
#define BITMAP_GRANULARITY	PAGE_SIZE
#endif

#if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_FS_DAX)
#define DM_WRITECACHE_HAS_PMEM
#endif

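/*
 * pmem_assign() stores a value into persistent memory: the source is copied
 * into a local temporary and written with memcpy_flushcache() so the store
 * is pushed out of the CPU cache. Without PMEM support it degrades to a
 * plain assignment.
 */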
#ifdef DM_WRITECACHE_HAS_PMEM
#define pmem_assign(dest, src)					\
do {								\
	typeof(dest) uniq = (src);				\
	memcpy_flushcache(&(dest), &uniq, sizeof(dest));	\
} while (0)
#else
#define pmem_assign(dest, src)	((dest) = (src))
#endif

#if IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC) && defined(DM_WRITECACHE_HAS_PMEM)
#define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
#endif

#define MEMORY_SUPERBLOCK_MAGIC		0x23489321
#define MEMORY_SUPERBLOCK_VERSION	1

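/*
 * On-media metadata layout: one wc_memory_superblock (its header padded to
 * 64 bytes by the union with padding[8]) followed by a wc_memory_entry for
 * each cache block. All fields are stored little-endian.
 */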
struct wc_memory_entry {
	__le64 original_sector;
	__le64 seq_count;
};

struct wc_memory_superblock {
	union {
		struct {
			__le32 magic;
			__le32 version;
			__le32 block_size;
			__le32 pad;
			__le64 n_blocks;
			__le64 seq_count;
		};
		__le64 padding[8];
	};
	struct wc_memory_entry entries[];
};

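/*
 * In-core descriptor of a single cache block. On 64-bit systems the index is
 * packed into a 47-bit bitfield to keep the structure small. When hardware
 * error handling is enabled, original_sector and seq_count are shadowed in
 * DRAM so lookups do not have to re-read possibly failing persistent memory.
 */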
struct wc_entry {
	struct rb_node rb_node;
	struct list_head lru;
	unsigned short wc_list_contiguous;
#if BITS_PER_LONG == 64
	bool write_in_progress : 1;
	unsigned long index : 47;
#else
	bool write_in_progress;
	unsigned long index;
#endif
	unsigned long age;
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	uint64_t original_sector;
	uint64_t seq_count;
#endif
};

#ifdef DM_WRITECACHE_HAS_PMEM
#define WC_MODE_PMEM(wc)			((wc)->pmem_mode)
#define WC_MODE_FUA(wc)				((wc)->writeback_fua)
#else
#define WC_MODE_PMEM(wc)			false
#define WC_MODE_FUA(wc)				false
#endif
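/*
 * In SSD mode the free list is kept as an rbtree sorted by entry address
 * (which matches cache-device order), so a write bio can be extended with
 * the free block that immediately follows it on the cache device.
 */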
#define WC_MODE_SORT_FREELIST(wc)		(!WC_MODE_PMEM(wc))

struct dm_writecache {
	struct mutex lock;
	struct list_head lru;
	union {
		struct list_head freelist;
		struct {
			struct rb_root freetree;
			struct wc_entry *current_free;
		};
	};
	struct rb_root tree;

	size_t freelist_size;
	size_t writeback_size;
	size_t freelist_high_watermark;
	size_t freelist_low_watermark;
	unsigned long max_age;
	unsigned long pause;

	unsigned int uncommitted_blocks;
	unsigned int autocommit_blocks;
	unsigned int max_writeback_jobs;

	int error;

	unsigned long autocommit_jiffies;
	struct timer_list autocommit_timer;
	struct wait_queue_head freelist_wait;

	struct timer_list max_age_timer;

	atomic_t bio_in_progress[2];
	struct wait_queue_head bio_in_progress_wait[2];

	struct dm_target *ti;
	struct dm_dev *dev;
	struct dm_dev *ssd_dev;
	sector_t start_sector;
	void *memory_map;
	uint64_t memory_map_size;
	size_t metadata_sectors;
	size_t n_blocks;
	uint64_t seq_count;
	sector_t data_device_sectors;
	void *block_start;
	struct wc_entry *entries;
	unsigned int block_size;
	unsigned char block_size_bits;

	bool pmem_mode:1;
	bool writeback_fua:1;

	bool overwrote_committed:1;
	bool memory_vmapped:1;

	bool start_sector_set:1;
	bool high_wm_percent_set:1;
	bool low_wm_percent_set:1;
	bool max_writeback_jobs_set:1;
	bool autocommit_blocks_set:1;
	bool autocommit_time_set:1;
	bool max_age_set:1;
	bool writeback_fua_set:1;
	bool flush_on_suspend:1;
	bool cleaner:1;
	bool cleaner_set:1;
	bool metadata_only:1;
	bool pause_set:1;

	unsigned int high_wm_percent_value;
	unsigned int low_wm_percent_value;
	unsigned int autocommit_time_value;
	unsigned int max_age_value;
	unsigned int pause_value;

	unsigned int writeback_all;
	struct workqueue_struct *writeback_wq;
	struct work_struct writeback_work;
	struct work_struct flush_work;

	struct dm_io_tracker iot;

	struct dm_io_client *dm_io;

	raw_spinlock_t endio_list_lock;
	struct list_head endio_list;
	struct task_struct *endio_thread;

	struct task_struct *flush_thread;
	struct bio_list flush_list;

	struct dm_kcopyd_client *dm_kcopyd;
	unsigned long *dirty_bitmap;
	unsigned int dirty_bitmap_size;

	struct bio_set bio_set;
	mempool_t copy_pool;

	struct {
		unsigned long long reads;
		unsigned long long read_hits;
		unsigned long long writes;
		unsigned long long write_hits_uncommitted;
		unsigned long long write_hits_committed;
		unsigned long long writes_around;
		unsigned long long writes_allocate;
		unsigned long long writes_blocked_on_freelist;
		unsigned long long flushes;
		unsigned long long discards;
	} stats;
};

#define WB_LIST_INLINE		16

struct writeback_struct {
	struct list_head endio_entry;
	struct dm_writecache *wc;
	struct wc_entry **wc_list;
	unsigned int wc_list_n;
	struct wc_entry *wc_list_inline[WB_LIST_INLINE];
	struct bio bio;
};

struct copy_struct {
	struct list_head endio_entry;
	struct dm_writecache *wc;
	struct wc_entry *e;
	unsigned int n_entries;
	int error;
};

DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(dm_writecache_throttle,
					    "A percentage of time allocated for data copying");

static void wc_lock(struct dm_writecache *wc)
{
	mutex_lock(&wc->lock);
}

static void wc_unlock(struct dm_writecache *wc)
{
	mutex_unlock(&wc->lock);
}

#ifdef DM_WRITECACHE_HAS_PMEM
static int persistent_memory_claim(struct dm_writecache *wc)
{
	int r;
	loff_t s;
	long p, da;
	unsigned long pfn;
	int id;
	struct page **pages;
	sector_t offset;

	wc->memory_vmapped = false;

	s = wc->memory_map_size;
	p = s >> PAGE_SHIFT;
	if (!p) {
		r = -EINVAL;
		goto err1;
	}
	if (p != s >> PAGE_SHIFT) {
		r = -EOVERFLOW;
		goto err1;
	}

	offset = get_start_sect(wc->ssd_dev->bdev);
	if (offset & (PAGE_SIZE / 512 - 1)) {
		r = -EINVAL;
		goto err1;
	}
	offset >>= PAGE_SHIFT - 9;

	id = dax_read_lock();

	da = dax_direct_access(wc->ssd_dev->dax_dev, offset, p, DAX_ACCESS,
			&wc->memory_map, &pfn);
	if (da < 0) {
		wc->memory_map = NULL;
		r = da;
		goto err2;
	}
	if (!pfn_valid(pfn)) {
		wc->memory_map = NULL;
		r = -EOPNOTSUPP;
		goto err2;
	}
	if (da != p) {
		long i;

		wc->memory_map = NULL;
		pages = vmalloc_array(p, sizeof(struct page *));
		if (!pages) {
			r = -ENOMEM;
			goto err2;
		}
		i = 0;
		do {
			long daa;

			daa = dax_direct_access(wc->ssd_dev->dax_dev, offset + i,
					p - i, DAX_ACCESS, NULL, &pfn);
			if (daa <= 0) {
				r = daa ? daa : -EINVAL;
				goto err3;
			}
			if (!pfn_valid(pfn)) {
				r = -EOPNOTSUPP;
				goto err3;
			}
			while (daa-- && i < p) {
				pages[i++] = pfn_to_page(pfn);
				pfn++;
				if (!(i & 15))
					cond_resched();
			}
		} while (i < p);
		wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL);
		if (!wc->memory_map) {
			r = -ENOMEM;
			goto err3;
		}
		vfree(pages);
		wc->memory_vmapped = true;
	}

	dax_read_unlock(id);

	wc->memory_map += (size_t)wc->start_sector << SECTOR_SHIFT;
	wc->memory_map_size -= (size_t)wc->start_sector << SECTOR_SHIFT;

	return 0;
err3:
	vfree(pages);
err2:
	dax_read_unlock(id);
err1:
	return r;
}
#else
static int persistent_memory_claim(struct dm_writecache *wc)
{
	return -EOPNOTSUPP;
}
#endif

static void persistent_memory_release(struct dm_writecache *wc)
{
	if (wc->memory_vmapped)
		vunmap(wc->memory_map - ((size_t)wc->start_sector << SECTOR_SHIFT));
}

static struct page *persistent_memory_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	else
		return virt_to_page(addr);
}

static unsigned int persistent_memory_page_offset(void *addr)
{
	return (unsigned long)addr & (PAGE_SIZE - 1);
}

static void persistent_memory_flush_cache(void *ptr, size_t size)
{
	if (is_vmalloc_addr(ptr))
		flush_kernel_vmap_range(ptr, size);
}

static void persistent_memory_invalidate_cache(void *ptr, size_t size)
{
	if (is_vmalloc_addr(ptr))
		invalidate_kernel_vmap_range(ptr, size);
}

static struct wc_memory_superblock *sb(struct dm_writecache *wc)
{
	return wc->memory_map;
}

static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e)
{
	return &sb(wc)->entries[e->index];
}

static void *memory_data(struct dm_writecache *wc, struct wc_entry *e)
{
	return (char *)wc->block_start + (e->index << wc->block_size_bits);
}

static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e)
{
	return wc->start_sector + wc->metadata_sectors +
		((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT));
}

static uint64_t read_original_sector(struct dm_writecache *wc, struct wc_entry *e)
{
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	return e->original_sector;
#else
	return le64_to_cpu(memory_entry(wc, e)->original_sector);
#endif
}

static uint64_t read_seq_count(struct dm_writecache *wc, struct wc_entry *e)
{
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	return e->seq_count;
#else
	return le64_to_cpu(memory_entry(wc, e)->seq_count);
#endif
}

static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e)
{
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	e->seq_count = -1;
#endif
	pmem_assign(memory_entry(wc, e)->seq_count, cpu_to_le64(-1));
}

static void write_original_sector_seq_count(struct dm_writecache *wc, struct wc_entry *e,
					    uint64_t original_sector, uint64_t seq_count)
{
	struct wc_memory_entry me;
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	e->original_sector = original_sector;
	e->seq_count = seq_count;
#endif
	me.original_sector = cpu_to_le64(original_sector);
	me.seq_count = cpu_to_le64(seq_count);
	pmem_assign(*memory_entry(wc, e), me);
}

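/*
 * Record only the first error (cmpxchg() keeps an already stored error code)
 * and wake up anyone sleeping on the free list so the error is noticed.
 */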
#define writecache_error(wc, err, msg, arg...)				\
do {									\
	if (!cmpxchg(&(wc)->error, 0, err))				\
		DMERR(msg, ##arg);					\
	wake_up(&(wc)->freelist_wait);					\
} while (0)

#define writecache_has_error(wc)	(unlikely(READ_ONCE((wc)->error)))

static void writecache_flush_all_metadata(struct dm_writecache *wc)
{
	if (!WC_MODE_PMEM(wc))
		memset(wc->dirty_bitmap, -1, wc->dirty_bitmap_size);
}

static void writecache_flush_region(struct dm_writecache *wc, void *ptr, size_t size)
{
	if (!WC_MODE_PMEM(wc))
		__set_bit(((char *)ptr - (char *)wc->memory_map) / BITMAP_GRANULARITY,
			  wc->dirty_bitmap);
}

static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev);

struct io_notify {
	struct dm_writecache *wc;
	struct completion c;
	atomic_t count;
};

static void writecache_notify_io(unsigned long error, void *context)
{
	struct io_notify *endio = context;

	if (unlikely(error != 0))
		writecache_error(endio->wc, -EIO, "error writing metadata");
	BUG_ON(atomic_read(&endio->count) <= 0);
	if (atomic_dec_and_test(&endio->count))
		complete(&endio->c);
}

static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
{
	wait_event(wc->bio_in_progress_wait[direction],
		   !atomic_read(&wc->bio_in_progress[direction]));
}

static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
{
	struct dm_io_region region;
	struct dm_io_request req;
	struct io_notify endio = {
		wc,
		COMPLETION_INITIALIZER_ONSTACK(endio.c),
		ATOMIC_INIT(1),
	};
	unsigned int bitmap_bits = wc->dirty_bitmap_size * 8;
	unsigned int i = 0;

	while (1) {
		unsigned int j;

		i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i);
		if (unlikely(i == bitmap_bits))
			break;
		j = find_next_zero_bit(wc->dirty_bitmap, bitmap_bits, i);

		region.bdev = wc->ssd_dev->bdev;
		region.sector = (sector_t)i * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
		region.count = (sector_t)(j - i) * (BITMAP_GRANULARITY >> SECTOR_SHIFT);

		if (unlikely(region.sector >= wc->metadata_sectors))
			break;
		if (unlikely(region.sector + region.count > wc->metadata_sectors))
			region.count = wc->metadata_sectors - region.sector;

		region.sector += wc->start_sector;
		atomic_inc(&endio.count);
		req.bi_opf = REQ_OP_WRITE | REQ_SYNC;
		req.mem.type = DM_IO_VMA;
		req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
		req.client = wc->dm_io;
		req.notify.fn = writecache_notify_io;
		req.notify.context = &endio;

		/* writing via async dm-io (implied by notify.fn above) won't return an error */
		(void) dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
		i = j;
	}

	writecache_notify_io(0, &endio);
	wait_for_completion_io(&endio.c);

	if (wait_for_ios)
		writecache_wait_for_ios(wc, WRITE);

	writecache_disk_flush(wc, wc->ssd_dev);

	memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
}

static void ssd_commit_superblock(struct dm_writecache *wc)
{
	int r;
	struct dm_io_region region;
	struct dm_io_request req;

	region.bdev = wc->ssd_dev->bdev;
	region.sector = 0;
	region.count = max(4096U, wc->block_size) >> SECTOR_SHIFT;

	if (unlikely(region.sector + region.count > wc->metadata_sectors))
		region.count = wc->metadata_sectors - region.sector;

	region.sector += wc->start_sector;

	req.bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_FUA;
	req.mem.type = DM_IO_VMA;
	req.mem.ptr.vma = (char *)wc->memory_map;
	req.client = wc->dm_io;
	req.notify.fn = NULL;
	req.notify.context = NULL;

	r = dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
	if (unlikely(r))
		writecache_error(wc, r, "error writing superblock");
}

static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
{
	if (WC_MODE_PMEM(wc))
		pmem_wmb();
	else
		ssd_commit_flushed(wc, wait_for_ios);
}

static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
{
	int r;
	struct dm_io_region region;
	struct dm_io_request req;

	region.bdev = dev->bdev;
	region.sector = 0;
	region.count = 0;
	req.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
	req.mem.type = DM_IO_KMEM;
	req.mem.ptr.addr = NULL;
	req.client = wc->dm_io;
	req.notify.fn = NULL;

	r = dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
	if (unlikely(r))
		writecache_error(wc, r, "error flushing metadata: %d", r);
}

#define WFE_RETURN_FOLLOWING	1
#define WFE_LOWEST_SEQ		2

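/*
 * Look up the cache entry for @block. With WFE_RETURN_FOLLOWING, a miss
 * returns the next entry above @block instead of NULL. If several entries
 * exist for the same block, WFE_LOWEST_SEQ selects the one with the lowest
 * sequence count, otherwise the newest one is returned.
 */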
static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
					      uint64_t block, int flags)
{
	struct wc_entry *e;
	struct rb_node *node = wc->tree.rb_node;

	if (unlikely(!node))
		return NULL;

	while (1) {
		e = container_of(node, struct wc_entry, rb_node);
		if (read_original_sector(wc, e) == block)
			break;

		node = (read_original_sector(wc, e) >= block ?
			e->rb_node.rb_left : e->rb_node.rb_right);
		if (unlikely(!node)) {
			if (!(flags & WFE_RETURN_FOLLOWING))
				return NULL;
			if (read_original_sector(wc, e) >= block)
				return e;

			node = rb_next(&e->rb_node);
			if (unlikely(!node))
				return NULL;

			e = container_of(node, struct wc_entry, rb_node);
			return e;
		}
	}

	while (1) {
		struct wc_entry *e2;

		if (flags & WFE_LOWEST_SEQ)
			node = rb_prev(&e->rb_node);
		else
			node = rb_next(&e->rb_node);
		if (unlikely(!node))
			return e;
		e2 = container_of(node, struct wc_entry, rb_node);
		if (read_original_sector(wc, e2) != block)
			return e;
		e = e2;
	}
}

static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *ins)
{
	struct wc_entry *e;
	struct rb_node **node = &wc->tree.rb_node, *parent = NULL;

	while (*node) {
		e = container_of(*node, struct wc_entry, rb_node);
		parent = &e->rb_node;
		if (read_original_sector(wc, e) > read_original_sector(wc, ins))
			node = &parent->rb_left;
		else
			node = &parent->rb_right;
	}
	rb_link_node(&ins->rb_node, parent, node);
	rb_insert_color(&ins->rb_node, &wc->tree);
	list_add(&ins->lru, &wc->lru);
	ins->age = jiffies;
}

static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e)
{
	list_del(&e->lru);
	rb_erase(&e->rb_node, &wc->tree);
}

static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry *e)
{
	if (WC_MODE_SORT_FREELIST(wc)) {
		struct rb_node **node = &wc->freetree.rb_node, *parent = NULL;

		if (unlikely(!*node))
			wc->current_free = e;
		while (*node) {
			parent = *node;
			if (&e->rb_node < *node)
				node = &parent->rb_left;
			else
				node = &parent->rb_right;
		}
		rb_link_node(&e->rb_node, parent, node);
		rb_insert_color(&e->rb_node, &wc->freetree);
	} else {
		list_add_tail(&e->lru, &wc->freelist);
	}
	wc->freelist_size++;
}

static inline void writecache_verify_watermark(struct dm_writecache *wc)
{
	if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark))
		queue_work(wc->writeback_wq, &wc->writeback_work);
}

static void writecache_max_age_timer(struct timer_list *t)
{
	struct dm_writecache *wc = timer_container_of(wc, t, max_age_timer);

	if (!dm_suspended(wc->ti) && !writecache_has_error(wc)) {
		queue_work(wc->writeback_wq, &wc->writeback_work);
		mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);
	}
}

static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector)
{
	struct wc_entry *e;

	if (WC_MODE_SORT_FREELIST(wc)) {
		struct rb_node *next;

		if (unlikely(!wc->current_free))
			return NULL;
		e = wc->current_free;
		if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
			return NULL;
		next = rb_next(&e->rb_node);
		rb_erase(&e->rb_node, &wc->freetree);
		if (unlikely(!next))
			next = rb_first(&wc->freetree);
		wc->current_free = next ? container_of(next, struct wc_entry, rb_node) : NULL;
	} else {
		if (unlikely(list_empty(&wc->freelist)))
			return NULL;
		e = container_of(wc->freelist.next, struct wc_entry, lru);
		if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
			return NULL;
		list_del(&e->lru);
	}
	wc->freelist_size--;

	writecache_verify_watermark(wc);

	return e;
}

static void writecache_free_entry(struct dm_writecache *wc, struct wc_entry *e)
{
	writecache_unlink(wc, e);
	writecache_add_to_freelist(wc, e);
	clear_seq_count(wc, e);
	writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
	if (unlikely(waitqueue_active(&wc->freelist_wait)))
		wake_up(&wc->freelist_wait);
}

static void writecache_wait_on_freelist(struct dm_writecache *wc)
{
	DEFINE_WAIT(wait);

	prepare_to_wait(&wc->freelist_wait, &wait, TASK_UNINTERRUPTIBLE);
	wc_unlock(wc);
	io_schedule();
	finish_wait(&wc->freelist_wait, &wait);
	wc_lock(wc);
}

static void writecache_poison_lists(struct dm_writecache *wc)
{
	/*
	 * Catch incorrect access to these values while the device is suspended.
	 */
	memset(&wc->tree, -1, sizeof(wc->tree));
	wc->lru.next = LIST_POISON1;
	wc->lru.prev = LIST_POISON2;
	wc->freelist.next = LIST_POISON1;
	wc->freelist.prev = LIST_POISON2;
}

static void writecache_flush_entry(struct dm_writecache *wc, struct wc_entry *e)
{
	writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
	if (WC_MODE_PMEM(wc))
		writecache_flush_region(wc, memory_data(wc, e), wc->block_size);
}

static bool writecache_entry_is_committed(struct dm_writecache *wc, struct wc_entry *e)
{
	return read_seq_count(wc, e) < wc->seq_count;
}

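/*
 * Commit all uncommitted entries: flush their metadata (and, for PMEM, their
 * data), increment and persist the superblock sequence count, then free any
 * older entries that the newly committed ones supersede.
 */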
static void writecache_flush(struct dm_writecache *wc)
{
	struct wc_entry *e, *e2;
	bool need_flush_after_free;

	wc->uncommitted_blocks = 0;
	timer_delete(&wc->autocommit_timer);

	if (list_empty(&wc->lru))
		return;

	e = container_of(wc->lru.next, struct wc_entry, lru);
	if (writecache_entry_is_committed(wc, e)) {
		if (wc->overwrote_committed) {
			writecache_wait_for_ios(wc, WRITE);
			writecache_disk_flush(wc, wc->ssd_dev);
			wc->overwrote_committed = false;
		}
		return;
	}
	while (1) {
		writecache_flush_entry(wc, e);
		if (unlikely(e->lru.next == &wc->lru))
			break;
		e2 = container_of(e->lru.next, struct wc_entry, lru);
		if (writecache_entry_is_committed(wc, e2))
			break;
		e = e2;
		cond_resched();
	}
	writecache_commit_flushed(wc, true);

	wc->seq_count++;
	pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
	if (WC_MODE_PMEM(wc))
		writecache_commit_flushed(wc, false);
	else
		ssd_commit_superblock(wc);

	wc->overwrote_committed = false;

	need_flush_after_free = false;
	while (1) {
		/* Free another committed entry with lower seq-count */
		struct rb_node *rb_node = rb_prev(&e->rb_node);

		if (rb_node) {
			e2 = container_of(rb_node, struct wc_entry, rb_node);
			if (read_original_sector(wc, e2) == read_original_sector(wc, e) &&
			    likely(!e2->write_in_progress)) {
				writecache_free_entry(wc, e2);
				need_flush_after_free = true;
			}
		}
		if (unlikely(e->lru.prev == &wc->lru))
			break;
		e = container_of(e->lru.prev, struct wc_entry, lru);
		cond_resched();
	}

	if (need_flush_after_free)
		writecache_commit_flushed(wc, false);
}

static void writecache_flush_work(struct work_struct *work)
{
	struct dm_writecache *wc = container_of(work, struct dm_writecache, flush_work);

	wc_lock(wc);
	writecache_flush(wc);
	wc_unlock(wc);
}

static void writecache_autocommit_timer(struct timer_list *t)
{
	struct dm_writecache *wc = timer_container_of(wc, t, autocommit_timer);

	if (!writecache_has_error(wc))
		queue_work(wc->writeback_wq, &wc->flush_work);
}

static void writecache_schedule_autocommit(struct dm_writecache *wc)
{
	if (!timer_pending(&wc->autocommit_timer))
		mod_timer(&wc->autocommit_timer, jiffies + wc->autocommit_jiffies);
}

static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_t end)
{
	struct wc_entry *e;
	bool discarded_something = false;

	e = writecache_find_entry(wc, start, WFE_RETURN_FOLLOWING | WFE_LOWEST_SEQ);
	if (unlikely(!e))
		return;

	while (read_original_sector(wc, e) < end) {
		struct rb_node *node = rb_next(&e->rb_node);

		if (likely(!e->write_in_progress)) {
			if (!discarded_something) {
				if (!WC_MODE_PMEM(wc)) {
					writecache_wait_for_ios(wc, READ);
					writecache_wait_for_ios(wc, WRITE);
				}
				discarded_something = true;
			}
			if (!writecache_entry_is_committed(wc, e))
				wc->uncommitted_blocks--;
			writecache_free_entry(wc, e);
		}

		if (unlikely(!node))
			break;

		e = container_of(node, struct wc_entry, rb_node);
	}

	if (discarded_something)
		writecache_commit_flushed(wc, false);
}

static bool writecache_wait_for_writeback(struct dm_writecache *wc)
{
	if (wc->writeback_size) {
		writecache_wait_on_freelist(wc);
		return true;
	}
	return false;
}

static void writecache_suspend(struct dm_target *ti)
{
	struct dm_writecache *wc = ti->private;
	bool flush_on_suspend;

	timer_delete_sync(&wc->autocommit_timer);
	timer_delete_sync(&wc->max_age_timer);

	wc_lock(wc);
	writecache_flush(wc);
	flush_on_suspend = wc->flush_on_suspend;
	if (flush_on_suspend) {
		wc->flush_on_suspend = false;
		wc->writeback_all++;
		queue_work(wc->writeback_wq, &wc->writeback_work);
	}
	wc_unlock(wc);

	drain_workqueue(wc->writeback_wq);

	wc_lock(wc);
	if (flush_on_suspend)
		wc->writeback_all--;
	while (writecache_wait_for_writeback(wc))
		;

	if (WC_MODE_PMEM(wc))
		persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);

	writecache_poison_lists(wc);

	wc_unlock(wc);
}

static int writecache_alloc_entries(struct dm_writecache *wc)
{
	size_t b;

	if (wc->entries)
		return 0;
	wc->entries = vmalloc_array(wc->n_blocks, sizeof(struct wc_entry));
	if (!wc->entries)
		return -ENOMEM;
	for (b = 0; b < wc->n_blocks; b++) {
		struct wc_entry *e = &wc->entries[b];

		e->index = b;
		e->write_in_progress = false;
		cond_resched();
	}

	return 0;
}

static int writecache_read_metadata(struct dm_writecache *wc, sector_t n_sectors)
{
	struct dm_io_region region;
	struct dm_io_request req;

	region.bdev = wc->ssd_dev->bdev;
	region.sector = wc->start_sector;
	region.count = n_sectors;
	req.bi_opf = REQ_OP_READ | REQ_SYNC;
	req.mem.type = DM_IO_VMA;
	req.mem.ptr.vma = (char *)wc->memory_map;
	req.client = wc->dm_io;
	req.notify.fn = NULL;

	return dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
}

static void writecache_resume(struct dm_target *ti)
{
	struct dm_writecache *wc = ti->private;
	size_t b;
	bool need_flush = false;
	__le64 sb_seq_count;
	int r;

	wc_lock(wc);

	wc->data_device_sectors = bdev_nr_sectors(wc->dev->bdev);

	if (WC_MODE_PMEM(wc)) {
		persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size);
	} else {
		r = writecache_read_metadata(wc, wc->metadata_sectors);
		if (r) {
			size_t sb_entries_offset;

			writecache_error(wc, r, "unable to read metadata: %d", r);
			sb_entries_offset = offsetof(struct wc_memory_superblock, entries);
			memset((char *)wc->memory_map + sb_entries_offset, -1,
			       (wc->metadata_sectors << SECTOR_SHIFT) - sb_entries_offset);
		}
	}

	wc->tree = RB_ROOT;
	INIT_LIST_HEAD(&wc->lru);
	if (WC_MODE_SORT_FREELIST(wc)) {
		wc->freetree = RB_ROOT;
		wc->current_free = NULL;
	} else {
		INIT_LIST_HEAD(&wc->freelist);
	}
	wc->freelist_size = 0;

	r = copy_mc_to_kernel(&sb_seq_count, &sb(wc)->seq_count,
			      sizeof(uint64_t));
	if (r) {
		writecache_error(wc, r, "hardware memory error when reading superblock: %d", r);
		sb_seq_count = cpu_to_le64(0);
	}
	wc->seq_count = le64_to_cpu(sb_seq_count);

#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	for (b = 0; b < wc->n_blocks; b++) {
		struct wc_entry *e = &wc->entries[b];
		struct wc_memory_entry wme;

		if (writecache_has_error(wc)) {
			e->original_sector = -1;
			e->seq_count = -1;
			continue;
		}
		r = copy_mc_to_kernel(&wme, memory_entry(wc, e),
				      sizeof(struct wc_memory_entry));
		if (r) {
			writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d",
					 (unsigned long)b, r);
			e->original_sector = -1;
			e->seq_count = -1;
		} else {
			e->original_sector = le64_to_cpu(wme.original_sector);
			e->seq_count = le64_to_cpu(wme.seq_count);
		}
		cond_resched();
	}
#endif
	for (b = 0; b < wc->n_blocks; b++) {
		struct wc_entry *e = &wc->entries[b];

		if (!writecache_entry_is_committed(wc, e)) {
			if (read_seq_count(wc, e) != -1) {
erase_this:
				clear_seq_count(wc, e);
				need_flush = true;
			}
			writecache_add_to_freelist(wc, e);
		} else {
			struct wc_entry *old;

			old = writecache_find_entry(wc, read_original_sector(wc, e), 0);
			if (!old) {
				writecache_insert_entry(wc, e);
			} else {
				if (read_seq_count(wc, old) == read_seq_count(wc, e)) {
					writecache_error(wc, -EINVAL,
						 "two identical entries, position %llu, sector %llu, sequence %llu",
						 (unsigned long long)b, (unsigned long long)read_original_sector(wc, e),
						 (unsigned long long)read_seq_count(wc, e));
				}
				if (read_seq_count(wc, old) > read_seq_count(wc, e)) {
					goto erase_this;
				} else {
					writecache_free_entry(wc, old);
					writecache_insert_entry(wc, e);
					need_flush = true;
				}
			}
		}
		cond_resched();
	}

	if (need_flush) {
		writecache_flush_all_metadata(wc);
		writecache_commit_flushed(wc, false);
	}

	writecache_verify_watermark(wc);

	if (wc->max_age != MAX_AGE_UNSPECIFIED)
		mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);

	wc_unlock(wc);
}

static int process_flush_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
{
	if (argc != 1)
		return -EINVAL;

	wc_lock(wc);
	if (dm_suspended(wc->ti)) {
		wc_unlock(wc);
		return -EBUSY;
	}
	if (writecache_has_error(wc)) {
		wc_unlock(wc);
		return -EIO;
	}

	writecache_flush(wc);
	wc->writeback_all++;
	queue_work(wc->writeback_wq, &wc->writeback_work);
	wc_unlock(wc);

	flush_workqueue(wc->writeback_wq);

	wc_lock(wc);
	wc->writeback_all--;
	if (writecache_has_error(wc)) {
		wc_unlock(wc);
		return -EIO;
	}
	wc_unlock(wc);

	return 0;
}

static int process_flush_on_suspend_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
{
	if (argc != 1)
		return -EINVAL;

	wc_lock(wc);
	wc->flush_on_suspend = true;
	wc_unlock(wc);

	return 0;
}

static void activate_cleaner(struct dm_writecache *wc)
{
	wc->flush_on_suspend = true;
	wc->cleaner = true;
	wc->freelist_high_watermark = wc->n_blocks;
	wc->freelist_low_watermark = wc->n_blocks;
}

static int process_cleaner_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
{
	if (argc != 1)
		return -EINVAL;

	wc_lock(wc);
	activate_cleaner(wc);
	if (!dm_suspended(wc->ti))
		writecache_verify_watermark(wc);
	wc_unlock(wc);

	return 0;
}

static int process_clear_stats_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
{
	if (argc != 1)
		return -EINVAL;

	wc_lock(wc);
	memset(&wc->stats, 0, sizeof(wc->stats));
	wc_unlock(wc);

	return 0;
}

static int writecache_message(struct dm_target *ti, unsigned int argc, char **argv,
			      char *result, unsigned int maxlen)
{
	int r = -EINVAL;
	struct dm_writecache *wc = ti->private;

	if (!strcasecmp(argv[0], "flush"))
		r = process_flush_mesg(argc, argv, wc);
	else if (!strcasecmp(argv[0], "flush_on_suspend"))
		r = process_flush_on_suspend_mesg(argc, argv, wc);
	else if (!strcasecmp(argv[0], "cleaner"))
		r = process_cleaner_mesg(argc, argv, wc);
	else if (!strcasecmp(argv[0], "clear_stats"))
		r = process_clear_stats_mesg(argc, argv, wc);
	else
		DMERR("unrecognised message received: %s", argv[0]);

	return r;
}

static void memcpy_flushcache_optimized(void *dest, void *source, size_t size)
{
	/*
	 * clflushopt performs better with block size 1024, 2048, 4096
	 * non-temporal stores perform better with block size 512
	 *
	 * block size   512             1024            2048            4096
	 * movnti       496 MB/s        642 MB/s        725 MB/s        744 MB/s
	 * clflushopt   373 MB/s        688 MB/s        1.1 GB/s        1.2 GB/s
	 *
	 * We see that movnti performs better for 512-byte blocks, and
	 * clflushopt performs better for 1024-byte and larger blocks. So, we
	 * prefer clflushopt for sizes >= 768.
	 *
	 * NOTE: this happens to be the case now (with dm-writecache's single
	 * threaded model) but re-evaluate this once memcpy_flushcache() is
	 * enabled to use movdir64b which might invalidate this performance
	 * advantage seen with cache-allocating-writes plus flushing.
	 */
#ifdef CONFIG_X86
	if (static_cpu_has(X86_FEATURE_CLFLUSHOPT) &&
	    likely(boot_cpu_data.x86_clflush_size == 64) &&
	    likely(size >= 768)) {
		do {
			memcpy((void *)dest, (void *)source, 64);
			clflushopt((void *)dest);
			dest += 64;
			source += 64;
			size -= 64;
		} while (size >= 64);
		return;
	}
#endif
	memcpy_flushcache(dest, source, size);
}

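/*
 * Copy one cache block between the bio and persistent memory. Reads go
 * through copy_mc_to_kernel() so a machine check is reported as an I/O error
 * rather than crashing; writes use the flushing memcpy above.
 */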
static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data)
{
	void *buf;
	unsigned int size;
	int rw = bio_data_dir(bio);
	unsigned int remaining_size = wc->block_size;

	do {
		struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);

		buf = bvec_kmap_local(&bv);
		size = bv.bv_len;
		if (unlikely(size > remaining_size))
			size = remaining_size;

		if (rw == READ) {
			int r;

			r = copy_mc_to_kernel(buf, data, size);
			flush_dcache_page(bio_page(bio));
			if (unlikely(r)) {
				writecache_error(wc, r, "hardware memory error when reading data: %d", r);
				bio->bi_status = BLK_STS_IOERR;
			}
		} else {
			flush_dcache_page(bio_page(bio));
			memcpy_flushcache_optimized(data, buf, size);
		}

		kunmap_local(buf);

		data = (char *)data + size;
		remaining_size -= size;
		bio_advance(bio, size);
	} while (unlikely(remaining_size));
}

static int writecache_flush_thread(void *data)
{
	struct dm_writecache *wc = data;

	while (1) {
		struct bio *bio;

		wc_lock(wc);
		bio = bio_list_pop(&wc->flush_list);
		if (!bio) {
			set_current_state(TASK_INTERRUPTIBLE);
			wc_unlock(wc);

			if (unlikely(kthread_should_stop())) {
				set_current_state(TASK_RUNNING);
				break;
			}

			schedule();
			continue;
		}

		if (bio_op(bio) == REQ_OP_DISCARD) {
			writecache_discard(wc, bio->bi_iter.bi_sector,
					   bio_end_sector(bio));
			wc_unlock(wc);
			bio_set_dev(bio, wc->dev->bdev);
			submit_bio_noacct(bio);
		} else {
			writecache_flush(wc);
			wc_unlock(wc);
			if (writecache_has_error(wc))
				bio->bi_status = BLK_STS_IOERR;
			bio_endio(bio);
		}
	}

	return 0;
}

static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio)
{
	if (bio_list_empty(&wc->flush_list))
		wake_up_process(wc->flush_thread);
	bio_list_add(&wc->flush_list, bio);
}

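/*
 * Result of mapping a bio: WC_MAP_SUBMIT completes the bio here,
 * WC_MAP_REMAP sends it to the cache device, WC_MAP_REMAP_ORIGIN sends it
 * to the origin device, WC_MAP_RETURN means it was queued elsewhere (the
 * flush thread), and WC_MAP_ERROR fails it.
 */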
enum wc_map_op {
	WC_MAP_SUBMIT,
	WC_MAP_REMAP,
	WC_MAP_REMAP_ORIGIN,
	WC_MAP_RETURN,
	WC_MAP_ERROR,
};

static void writecache_map_remap_origin(struct dm_writecache *wc, struct bio *bio,
					struct wc_entry *e)
{
	if (e) {
		sector_t next_boundary =
			read_original_sector(wc, e) - bio->bi_iter.bi_sector;
		if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT)
			dm_accept_partial_bio(bio, next_boundary);
	}
}

static enum wc_map_op writecache_map_read(struct dm_writecache *wc, struct bio *bio)
{
	enum wc_map_op map_op;
	struct wc_entry *e;

read_next_block:
	wc->stats.reads++;
	e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
	if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) {
		wc->stats.read_hits++;
		if (WC_MODE_PMEM(wc)) {
			bio_copy_block(wc, bio, memory_data(wc, e));
			if (bio->bi_iter.bi_size)
				goto read_next_block;
			map_op = WC_MAP_SUBMIT;
		} else {
			dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
			bio_set_dev(bio, wc->ssd_dev->bdev);
			bio->bi_iter.bi_sector = cache_sector(wc, e);
			if (!writecache_entry_is_committed(wc, e))
				writecache_wait_for_ios(wc, WRITE);
			map_op = WC_MAP_REMAP;
		}
	} else {
		writecache_map_remap_origin(wc, bio, e);
		wc->stats.reads += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits;
		map_op = WC_MAP_REMAP_ORIGIN;
	}

	return map_op;
}

static void writecache_bio_copy_ssd(struct dm_writecache *wc, struct bio *bio,
				    struct wc_entry *e, bool search_used)
{
	unsigned int bio_size = wc->block_size;
	sector_t start_cache_sec = cache_sector(wc, e);
	sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);

	while (bio_size < bio->bi_iter.bi_size) {
		if (!search_used) {
			struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);

			if (!f)
				break;
			write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
							(bio_size >> SECTOR_SHIFT), wc->seq_count);
			writecache_insert_entry(wc, f);
			wc->uncommitted_blocks++;
		} else {
			struct wc_entry *f;
			struct rb_node *next = rb_next(&e->rb_node);

			if (!next)
				break;
			f = container_of(next, struct wc_entry, rb_node);
			if (f != e + 1)
				break;
			if (read_original_sector(wc, f) !=
			    read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
				break;
			if (unlikely(f->write_in_progress))
				break;
			if (writecache_entry_is_committed(wc, f))
				wc->overwrote_committed = true;
			e = f;
		}
		bio_size += wc->block_size;
		current_cache_sec += wc->block_size >> SECTOR_SHIFT;
	}

	bio_set_dev(bio, wc->ssd_dev->bdev);
	bio->bi_iter.bi_sector = start_cache_sec;
	dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);

	wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
	wc->stats.writes_allocate += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits;

	if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
		wc->uncommitted_blocks = 0;
		queue_work(wc->writeback_wq, &wc->flush_work);
	} else {
		writecache_schedule_autocommit(wc);
	}
}

static enum wc_map_op writecache_map_write(struct dm_writecache *wc, struct bio *bio)
{
	struct wc_entry *e;

	do {
		bool found_entry = false;
		bool search_used = false;

		if (writecache_has_error(wc)) {
			wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
			return WC_MAP_ERROR;
		}
		e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
		if (e) {
			if (!writecache_entry_is_committed(wc, e)) {
				wc->stats.write_hits_uncommitted++;
				search_used = true;
				goto bio_copy;
			}
			wc->stats.write_hits_committed++;
			if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
				wc->overwrote_committed = true;
				search_used = true;
				goto bio_copy;
			}
			found_entry = true;
		} else {
			if (unlikely(wc->cleaner) ||
			    (wc->metadata_only && !(bio->bi_opf & REQ_META)))
				goto direct_write;
		}
		e = writecache_pop_from_freelist(wc, (sector_t)-1);
		if (unlikely(!e)) {
			if (!WC_MODE_PMEM(wc) && !found_entry) {
direct_write:
				e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
				writecache_map_remap_origin(wc, bio, e);
				wc->stats.writes_around += bio->bi_iter.bi_size >> wc->block_size_bits;
				wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
				return WC_MAP_REMAP_ORIGIN;
			}
			wc->stats.writes_blocked_on_freelist++;
			writecache_wait_on_freelist(wc);
			continue;
		}
		write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count);
		writecache_insert_entry(wc, e);
		wc->uncommitted_blocks++;
		wc->stats.writes_allocate++;
bio_copy:
		if (WC_MODE_PMEM(wc)) {
			bio_copy_block(wc, bio, memory_data(wc, e));
			wc->stats.writes++;
		} else {
			writecache_bio_copy_ssd(wc, bio, e, search_used);
			return WC_MAP_REMAP;
		}
	} while (bio->bi_iter.bi_size);

	if (unlikely(bio->bi_opf & REQ_FUA || wc->uncommitted_blocks >= wc->autocommit_blocks))
		writecache_flush(wc);
	else
		writecache_schedule_autocommit(wc);

	return WC_MAP_SUBMIT;
}

static enum wc_map_op writecache_map_flush(struct dm_writecache *wc, struct bio *bio)
{
	if (writecache_has_error(wc))
		return WC_MAP_ERROR;

	if (WC_MODE_PMEM(wc)) {
		wc->stats.flushes++;
		writecache_flush(wc);
		if (writecache_has_error(wc))
			return WC_MAP_ERROR;
		else if (unlikely(wc->cleaner) || unlikely(wc->metadata_only))
			return WC_MAP_REMAP_ORIGIN;
		return WC_MAP_SUBMIT;
	}
	/* SSD: */
	if (dm_bio_get_target_bio_nr(bio))
		return WC_MAP_REMAP_ORIGIN;
	wc->stats.flushes++;
	writecache_offload_bio(wc, bio);
	return WC_MAP_RETURN;
}

static enum wc_map_op writecache_map_discard(struct dm_writecache *wc, struct bio *bio)
{
	wc->stats.discards += bio->bi_iter.bi_size >> wc->block_size_bits;

	if (writecache_has_error(wc))
		return WC_MAP_ERROR;

	if (WC_MODE_PMEM(wc)) {
		writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio));
		return WC_MAP_REMAP_ORIGIN;
	}
	/* SSD: */
	writecache_offload_bio(wc, bio);
	return WC_MAP_RETURN;
}

static int writecache_map(struct dm_target *ti, struct bio *bio)
{
	struct dm_writecache *wc = ti->private;
	enum wc_map_op map_op;

	bio->bi_private = NULL;

	wc_lock(wc);

	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
		map_op = writecache_map_flush(wc, bio);
		goto done;
	}

	bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);

	if (unlikely((((unsigned int)bio->bi_iter.bi_sector | bio_sectors(bio)) &
				(wc->block_size / 512 - 1)) != 0)) {
		DMERR("I/O is not aligned, sector %llu, size %u, block size %u",
		      (unsigned long long)bio->bi_iter.bi_sector,
		      bio->bi_iter.bi_size, wc->block_size);
		map_op = WC_MAP_ERROR;
		goto done;
	}

	if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
		map_op = writecache_map_discard(wc, bio);
		goto done;
	}

	if (bio_data_dir(bio) == READ)
		map_op = writecache_map_read(wc, bio);
	else
		map_op = writecache_map_write(wc, bio);
done:
	switch (map_op) {
	case WC_MAP_REMAP_ORIGIN:
		if (likely(wc->pause != 0)) {
			if (bio_op(bio) == REQ_OP_WRITE) {
				dm_iot_io_begin(&wc->iot, 1);
				bio->bi_private = (void *)2;
			}
		}
		bio_set_dev(bio, wc->dev->bdev);
		wc_unlock(wc);
		return DM_MAPIO_REMAPPED;

	case WC_MAP_REMAP:
		/* make sure that writecache_end_io decrements bio_in_progress: */
		bio->bi_private = (void *)1;
		atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]);
		wc_unlock(wc);
		return DM_MAPIO_REMAPPED;

	case WC_MAP_SUBMIT:
		wc_unlock(wc);
		bio_endio(bio);
		return DM_MAPIO_SUBMITTED;

	case WC_MAP_RETURN:
		wc_unlock(wc);
		return DM_MAPIO_SUBMITTED;

	case WC_MAP_ERROR:
		wc_unlock(wc);
		bio_io_error(bio);
		return DM_MAPIO_SUBMITTED;

	default:
		BUG();
		wc_unlock(wc);
		return DM_MAPIO_KILL;
	}
}

static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status)
{
	struct dm_writecache *wc = ti->private;

	if (bio->bi_private == (void *)1) {
		int dir = bio_data_dir(bio);

		if (atomic_dec_and_test(&wc->bio_in_progress[dir]))
			if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir])))
				wake_up(&wc->bio_in_progress_wait[dir]);
	} else if (bio->bi_private == (void *)2) {
		dm_iot_io_end(&wc->iot, 1);
	}
	return 0;
}

static int writecache_iterate_devices(struct dm_target *ti,
				      iterate_devices_callout_fn fn, void *data)
{
	struct dm_writecache *wc = ti->private;

	return fn(ti, wc->dev, 0, ti->len, data);
}

static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct dm_writecache *wc = ti->private;

	if (limits->logical_block_size < wc->block_size)
		limits->logical_block_size = wc->block_size;

	if (limits->physical_block_size < wc->block_size)
		limits->physical_block_size = wc->block_size;

	if (limits->io_min < wc->block_size)
		limits->io_min = wc->block_size;
}


static void writecache_writeback_endio(struct bio *bio)
{
	struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio);
	struct dm_writecache *wc = wb->wc;
	unsigned long flags;

	raw_spin_lock_irqsave(&wc->endio_list_lock, flags);
	if (unlikely(list_empty(&wc->endio_list)))
		wake_up_process(wc->endio_thread);
	list_add_tail(&wb->endio_entry, &wc->endio_list);
	raw_spin_unlock_irqrestore(&wc->endio_list_lock, flags);
}

static void writecache_copy_endio(int read_err, unsigned long write_err, void *ptr)
{
	struct copy_struct *c = ptr;
	struct dm_writecache *wc = c->wc;

	c->error = likely(!(read_err | write_err)) ? 0 : -EIO;

	raw_spin_lock_irq(&wc->endio_list_lock);
	if (unlikely(list_empty(&wc->endio_list)))
		wake_up_process(wc->endio_thread);
	list_add_tail(&c->endio_entry, &wc->endio_list);
	raw_spin_unlock_irq(&wc->endio_list_lock);
}

static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list)
{
	unsigned int i;
	struct writeback_struct *wb;
	struct wc_entry *e;
	unsigned long n_walked = 0;

	do {
		wb = list_entry(list->next, struct writeback_struct, endio_entry);
		list_del(&wb->endio_entry);

		if (unlikely(wb->bio.bi_status != BLK_STS_OK))
			writecache_error(wc, blk_status_to_errno(wb->bio.bi_status),
					"write error %d", wb->bio.bi_status);
		i = 0;
		do {
			e = wb->wc_list[i];
			BUG_ON(!e->write_in_progress);
			e->write_in_progress = false;
			INIT_LIST_HEAD(&e->lru);
			if (!writecache_has_error(wc))
				writecache_free_entry(wc, e);
			BUG_ON(!wc->writeback_size);
			wc->writeback_size--;
			n_walked++;
			if (unlikely(n_walked >= ENDIO_LATENCY)) {
				writecache_commit_flushed(wc, false);
				wc_unlock(wc);
				wc_lock(wc);
				n_walked = 0;
			}
		} while (++i < wb->wc_list_n);

		if (wb->wc_list != wb->wc_list_inline)
			kfree(wb->wc_list);
		bio_put(&wb->bio);
	} while (!list_empty(list));
}

static void __writecache_endio_ssd(struct dm_writecache *wc, struct list_head *list)
{
	struct copy_struct *c;
	struct wc_entry *e;

	do {
		c = list_entry(list->next, struct copy_struct, endio_entry);
		list_del(&c->endio_entry);

		if (unlikely(c->error))
			writecache_error(wc, c->error, "copy error");

		e = c->e;
		do {
			BUG_ON(!e->write_in_progress);
			e->write_in_progress = false;
			INIT_LIST_HEAD(&e->lru);
			if (!writecache_has_error(wc))
				writecache_free_entry(wc, e);

			BUG_ON(!wc->writeback_size);
			wc->writeback_size--;
			e++;
		} while (--c->n_entries);
		mempool_free(c, &wc->copy_pool);
	} while (!list_empty(list));
}

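/*
 * The endio thread collects completed writeback requests in batches: unless
 * FUA writes were used it issues a flush to the origin device first, then
 * releases the written-back entries and commits the metadata changes.
 */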
writecache_endio_thread(void * data)1748 static int writecache_endio_thread(void *data)
1749 {
1750 	struct dm_writecache *wc = data;
1751 
1752 	while (1) {
1753 		struct list_head list;
1754 
1755 		raw_spin_lock_irq(&wc->endio_list_lock);
1756 		if (!list_empty(&wc->endio_list))
1757 			goto pop_from_list;
1758 		set_current_state(TASK_INTERRUPTIBLE);
1759 		raw_spin_unlock_irq(&wc->endio_list_lock);
1760 
1761 		if (unlikely(kthread_should_stop())) {
1762 			set_current_state(TASK_RUNNING);
1763 			break;
1764 		}
1765 
1766 		schedule();
1767 
1768 		continue;
1769 
1770 pop_from_list:
1771 		list = wc->endio_list;
1772 		list.next->prev = list.prev->next = &list;
1773 		INIT_LIST_HEAD(&wc->endio_list);
1774 		raw_spin_unlock_irq(&wc->endio_list_lock);
1775 
1776 		if (!WC_MODE_FUA(wc))
1777 			writecache_disk_flush(wc, wc->dev);
1778 
1779 		wc_lock(wc);
1780 
1781 		if (WC_MODE_PMEM(wc)) {
1782 			__writecache_endio_pmem(wc, &list);
1783 		} else {
1784 			__writecache_endio_ssd(wc, &list);
1785 			writecache_wait_for_ios(wc, READ);
1786 		}
1787 
1788 		writecache_commit_flushed(wc, false);
1789 
1790 		wc_unlock(wc);
1791 	}
1792 
1793 	return 0;
1794 }
1795 
wc_add_block(struct writeback_struct * wb,struct wc_entry * e)1796 static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e)
1797 {
1798 	struct dm_writecache *wc = wb->wc;
1799 	unsigned int block_size = wc->block_size;
1800 	void *address = memory_data(wc, e);
1801 
1802 	persistent_memory_flush_cache(address, block_size);
1803 
1804 	if (unlikely(bio_end_sector(&wb->bio) >= wc->data_device_sectors))
1805 		return true;
1806 
1807 	return bio_add_page(&wb->bio, persistent_memory_page(address),
1808 			    block_size, persistent_memory_page_offset(address)) != 0;
1809 }
1810 
1811 struct writeback_list {
1812 	struct list_head list;
1813 	size_t size;
1814 };
1815 
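/*
 * Limit the number of in-flight writeback jobs. A max_writeback_jobs of zero
 * disables the check; otherwise the worker sleeps on the freelist waitqueue
 * until enough previously issued writeback has completed.
 */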
1816 static void __writeback_throttle(struct dm_writecache *wc, struct writeback_list *wbl)
1817 {
1818 	if (unlikely(wc->max_writeback_jobs)) {
1819 		if (READ_ONCE(wc->writeback_size) - wbl->size >= wc->max_writeback_jobs) {
1820 			wc_lock(wc);
1821 			while (wc->writeback_size - wbl->size >= wc->max_writeback_jobs)
1822 				writecache_wait_on_freelist(wc);
1823 			wc_unlock(wc);
1824 		}
1825 	}
1826 	cond_resched();
1827 }
1828 
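/*
 * Pmem writeback: build bios directly from the persistent-memory pages,
 * coalescing entries whose original sectors are contiguous, up to the
 * wc_list_contiguous count computed by writecache_writeback().
 */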
1829 static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeback_list *wbl)
1830 {
1831 	struct wc_entry *e, *f;
1832 	struct bio *bio;
1833 	struct writeback_struct *wb;
1834 	unsigned int max_pages;
1835 
1836 	while (wbl->size) {
1837 		wbl->size--;
1838 		e = container_of(wbl->list.prev, struct wc_entry, lru);
1839 		list_del(&e->lru);
1840 
1841 		max_pages = e->wc_list_contiguous;
1842 
1843 		bio = bio_alloc_bioset(wc->dev->bdev, max_pages, REQ_OP_WRITE,
1844 				       GFP_NOIO, &wc->bio_set);
1845 		wb = container_of(bio, struct writeback_struct, bio);
1846 		wb->wc = wc;
1847 		bio->bi_end_io = writecache_writeback_endio;
1848 		bio->bi_iter.bi_sector = read_original_sector(wc, e);
1849 
1850 		if (unlikely(max_pages > WB_LIST_INLINE))
1851 			wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *),
1852 						    GFP_NOIO | __GFP_NORETRY |
1853 						    __GFP_NOMEMALLOC | __GFP_NOWARN);
1854 
1855 		if (likely(max_pages <= WB_LIST_INLINE) || unlikely(!wb->wc_list)) {
1856 			wb->wc_list = wb->wc_list_inline;
1857 			max_pages = WB_LIST_INLINE;
1858 		}
1859 
1860 		BUG_ON(!wc_add_block(wb, e));
1861 
1862 		wb->wc_list[0] = e;
1863 		wb->wc_list_n = 1;
1864 
1865 		while (wbl->size && wb->wc_list_n < max_pages) {
1866 			f = container_of(wbl->list.prev, struct wc_entry, lru);
1867 			if (read_original_sector(wc, f) !=
1868 			    read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
1869 				break;
1870 			if (!wc_add_block(wb, f))
1871 				break;
1872 			wbl->size--;
1873 			list_del(&f->lru);
1874 			wb->wc_list[wb->wc_list_n++] = f;
1875 			e = f;
1876 		}
1877 		if (WC_MODE_FUA(wc))
1878 			bio->bi_opf |= REQ_FUA;
1879 		if (writecache_has_error(wc)) {
1880 			bio->bi_status = BLK_STS_IOERR;
1881 			bio_endio(bio);
1882 		} else if (unlikely(!bio_sectors(bio))) {
1883 			bio->bi_status = BLK_STS_OK;
1884 			bio_endio(bio);
1885 		} else {
1886 			submit_bio(bio);
1887 		}
1888 
1889 		__writeback_throttle(wc, wbl);
1890 	}
1891 }
1892 
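/*
 * SSD writeback: hand a contiguous run of cache blocks to dm-kcopyd as a
 * single copy from the cache device to the origin device, clamping the copy
 * so it does not run past the end of the origin device.
 */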
1893 static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writeback_list *wbl)
1894 {
1895 	struct wc_entry *e, *f;
1896 	struct dm_io_region from, to;
1897 	struct copy_struct *c;
1898 
1899 	while (wbl->size) {
1900 		unsigned int n_sectors;
1901 
1902 		wbl->size--;
1903 		e = container_of(wbl->list.prev, struct wc_entry, lru);
1904 		list_del(&e->lru);
1905 
1906 		n_sectors = e->wc_list_contiguous << (wc->block_size_bits - SECTOR_SHIFT);
1907 
1908 		from.bdev = wc->ssd_dev->bdev;
1909 		from.sector = cache_sector(wc, e);
1910 		from.count = n_sectors;
1911 		to.bdev = wc->dev->bdev;
1912 		to.sector = read_original_sector(wc, e);
1913 		to.count = n_sectors;
1914 
1915 		c = mempool_alloc(&wc->copy_pool, GFP_NOIO);
1916 		c->wc = wc;
1917 		c->e = e;
1918 		c->n_entries = e->wc_list_contiguous;
1919 
1920 		while ((n_sectors -= wc->block_size >> SECTOR_SHIFT)) {
1921 			wbl->size--;
1922 			f = container_of(wbl->list.prev, struct wc_entry, lru);
1923 			BUG_ON(f != e + 1);
1924 			list_del(&f->lru);
1925 			e = f;
1926 		}
1927 
1928 		if (unlikely(to.sector + to.count > wc->data_device_sectors)) {
1929 			if (to.sector >= wc->data_device_sectors) {
1930 				writecache_copy_endio(0, 0, c);
1931 				continue;
1932 			}
1933 			from.count = to.count = wc->data_device_sectors - to.sector;
1934 		}
1935 
1936 		dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c);
1937 
1938 		__writeback_throttle(wc, wbl);
1939 	}
1940 }
1941 
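/*
 * Writeback worker. Unless writeback of everything was requested, it may
 * first pause while the origin device is busy, then it moves entries from the
 * tail of the LRU to a local writeback list while the free list is below the
 * low watermark, entries exceed max_age, or wc->writeback_all is set.
 */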
1942 static void writecache_writeback(struct work_struct *work)
1943 {
1944 	struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work);
1945 	struct blk_plug plug;
1946 	struct wc_entry *f, *g, *e = NULL;
1947 	struct rb_node *node, *next_node;
1948 	struct list_head skipped;
1949 	struct writeback_list wbl;
1950 	unsigned long n_walked;
1951 
1952 	if (!WC_MODE_PMEM(wc)) {
1953 		/* Wait for any active kcopyd work on behalf of ssd writeback */
1954 		dm_kcopyd_client_flush(wc->dm_kcopyd);
1955 	}
1956 
1957 	if (likely(wc->pause != 0)) {
1958 		while (1) {
1959 			unsigned long idle;
1960 
1961 			if (unlikely(wc->cleaner) || unlikely(wc->writeback_all) ||
1962 			    unlikely(dm_suspended(wc->ti)))
1963 				break;
1964 			idle = dm_iot_idle_time(&wc->iot);
1965 			if (idle >= wc->pause)
1966 				break;
1967 			idle = wc->pause - idle;
1968 			if (idle > HZ)
1969 				idle = HZ;
1970 			schedule_timeout_idle(idle);
1971 		}
1972 	}
1973 
1974 	wc_lock(wc);
1975 restart:
1976 	if (writecache_has_error(wc)) {
1977 		wc_unlock(wc);
1978 		return;
1979 	}
1980 
1981 	if (unlikely(wc->writeback_all)) {
1982 		if (writecache_wait_for_writeback(wc))
1983 			goto restart;
1984 	}
1985 
1986 	if (wc->overwrote_committed)
1987 		writecache_wait_for_ios(wc, WRITE);
1988 
1989 	n_walked = 0;
1990 	INIT_LIST_HEAD(&skipped);
1991 	INIT_LIST_HEAD(&wbl.list);
1992 	wbl.size = 0;
1993 	while (!list_empty(&wc->lru) &&
1994 	       (wc->writeback_all ||
1995 		wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark ||
1996 		(jiffies - container_of(wc->lru.prev, struct wc_entry, lru)->age >=
1997 		 wc->max_age - wc->max_age / MAX_AGE_DIV))) {
1998 
1999 		n_walked++;
2000 		if (unlikely(n_walked > WRITEBACK_LATENCY) &&
2001 		    likely(!wc->writeback_all)) {
2002 			if (likely(!dm_suspended(wc->ti)))
2003 				queue_work(wc->writeback_wq, &wc->writeback_work);
2004 			break;
2005 		}
2006 
2007 		if (unlikely(wc->writeback_all)) {
2008 			if (unlikely(!e)) {
2009 				writecache_flush(wc);
2010 				e = container_of(rb_first(&wc->tree), struct wc_entry, rb_node);
2011 			} else
2012 				e = g;
2013 		} else
2014 			e = container_of(wc->lru.prev, struct wc_entry, lru);
2015 		BUG_ON(e->write_in_progress);
2016 		if (unlikely(!writecache_entry_is_committed(wc, e)))
2017 			writecache_flush(wc);
2018 
2019 		node = rb_prev(&e->rb_node);
2020 		if (node) {
2021 			f = container_of(node, struct wc_entry, rb_node);
2022 			if (unlikely(read_original_sector(wc, f) ==
2023 				     read_original_sector(wc, e))) {
2024 				BUG_ON(!f->write_in_progress);
2025 				list_move(&e->lru, &skipped);
2026 				cond_resched();
2027 				continue;
2028 			}
2029 		}
2030 		wc->writeback_size++;
2031 		list_move(&e->lru, &wbl.list);
2032 		wbl.size++;
2033 		e->write_in_progress = true;
2034 		e->wc_list_contiguous = 1;
2035 
2036 		f = e;
2037 
2038 		while (1) {
2039 			next_node = rb_next(&f->rb_node);
2040 			if (unlikely(!next_node))
2041 				break;
2042 			g = container_of(next_node, struct wc_entry, rb_node);
2043 			if (unlikely(read_original_sector(wc, g) ==
2044 			    read_original_sector(wc, f))) {
2045 				f = g;
2046 				continue;
2047 			}
2048 			if (read_original_sector(wc, g) !=
2049 			    read_original_sector(wc, f) + (wc->block_size >> SECTOR_SHIFT))
2050 				break;
2051 			if (unlikely(g->write_in_progress))
2052 				break;
2053 			if (unlikely(!writecache_entry_is_committed(wc, g)))
2054 				break;
2055 
2056 			if (!WC_MODE_PMEM(wc)) {
2057 				if (g != f + 1)
2058 					break;
2059 			}
2060 
2061 			n_walked++;
2062 			//if (unlikely(n_walked > WRITEBACK_LATENCY) && likely(!wc->writeback_all))
2063 			//	break;
2064 
2065 			wc->writeback_size++;
2066 			list_move(&g->lru, &wbl.list);
2067 			wbl.size++;
2068 			g->write_in_progress = true;
2069 			g->wc_list_contiguous = BIO_MAX_VECS;
2070 			f = g;
2071 			e->wc_list_contiguous++;
2072 			if (unlikely(e->wc_list_contiguous == BIO_MAX_VECS)) {
2073 				if (unlikely(wc->writeback_all)) {
2074 					next_node = rb_next(&f->rb_node);
2075 					if (likely(next_node))
2076 						g = container_of(next_node, struct wc_entry, rb_node);
2077 				}
2078 				break;
2079 			}
2080 		}
2081 		cond_resched();
2082 	}
2083 
2084 	if (!list_empty(&skipped)) {
2085 		list_splice_tail(&skipped, &wc->lru);
2086 		/*
2087 		 * If we didn't make any progress, we must wait until some
2088 		 * writeback finishes to avoid burning CPU in a loop
2089 		 */
2090 		if (unlikely(!wbl.size))
2091 			writecache_wait_for_writeback(wc);
2092 	}
2093 
2094 	wc_unlock(wc);
2095 
2096 	blk_start_plug(&plug);
2097 
2098 	if (WC_MODE_PMEM(wc))
2099 		__writecache_writeback_pmem(wc, &wbl);
2100 	else
2101 		__writecache_writeback_ssd(wc, &wbl);
2102 
2103 	blk_finish_plug(&plug);
2104 
2105 	if (unlikely(wc->writeback_all)) {
2106 		wc_lock(wc);
2107 		while (writecache_wait_for_writeback(wc))
2108 			;
2109 		wc_unlock(wc);
2110 	}
2111 }
2112 
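/*
 * Rough illustrative sizing (hypothetical numbers): each cached block costs
 * block_size bytes of data plus sizeof(struct wc_memory_entry) = 16 bytes of
 * metadata, so a 1 GiB cache device with 4096-byte blocks starts from
 * 1073741824 / (4096 + 16) = 261123 blocks, which the loop below then trims
 * until the metadata area (rounded up to a block boundary) and the data
 * blocks both fit.
 */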
2113 static int calculate_memory_size(uint64_t device_size, unsigned int block_size,
2114 				 size_t *n_blocks_p, size_t *n_metadata_blocks_p)
2115 {
2116 	uint64_t n_blocks, offset;
2117 	struct wc_entry e;
2118 
2119 	n_blocks = device_size;
2120 	do_div(n_blocks, block_size + sizeof(struct wc_memory_entry));
2121 
2122 	while (1) {
2123 		if (!n_blocks)
2124 			return -ENOSPC;
2125 		/* Verify the following entries[n_blocks] won't overflow */
2126 		if (n_blocks >= ((size_t)-sizeof(struct wc_memory_superblock) /
2127 				 sizeof(struct wc_memory_entry)))
2128 			return -EFBIG;
2129 		offset = offsetof(struct wc_memory_superblock, entries[n_blocks]);
2130 		offset = (offset + block_size - 1) & ~(uint64_t)(block_size - 1);
2131 		if (offset + n_blocks * block_size <= device_size)
2132 			break;
2133 		n_blocks--;
2134 	}
2135 
2136 	/* check if the bit field overflows */
2137 	e.index = n_blocks;
2138 	if (e.index != n_blocks)
2139 		return -EFBIG;
2140 
2141 	if (n_blocks_p)
2142 		*n_blocks_p = n_blocks;
2143 	if (n_metadata_blocks_p)
2144 		*n_metadata_blocks_p = offset >> __ffs(block_size);
2145 	return 0;
2146 }
2147 
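/*
 * Initialize a fresh metadata area. The superblock magic is written and
 * flushed last, after all other metadata has been committed, so a crash
 * during initialization leaves the area unrecognized rather than half valid.
 */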
2148 static int init_memory(struct dm_writecache *wc)
2149 {
2150 	size_t b;
2151 	int r;
2152 
2153 	r = calculate_memory_size(wc->memory_map_size, wc->block_size, &wc->n_blocks, NULL);
2154 	if (r)
2155 		return r;
2156 
2157 	r = writecache_alloc_entries(wc);
2158 	if (r)
2159 		return r;
2160 
2161 	for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++)
2162 		pmem_assign(sb(wc)->padding[b], cpu_to_le64(0));
2163 	pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION));
2164 	pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size));
2165 	pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks));
2166 	pmem_assign(sb(wc)->seq_count, cpu_to_le64(0));
2167 
2168 	for (b = 0; b < wc->n_blocks; b++) {
2169 		write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
2170 		cond_resched();
2171 	}
2172 
2173 	writecache_flush_all_metadata(wc);
2174 	writecache_commit_flushed(wc, false);
2175 	pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
2176 	writecache_flush_region(wc, &sb(wc)->magic, sizeof(sb(wc)->magic));
2177 	writecache_commit_flushed(wc, false);
2178 
2179 	return 0;
2180 }
2181 
2182 static void writecache_dtr(struct dm_target *ti)
2183 {
2184 	struct dm_writecache *wc = ti->private;
2185 
2186 	if (!wc)
2187 		return;
2188 
2189 	if (wc->endio_thread)
2190 		kthread_stop(wc->endio_thread);
2191 
2192 	if (wc->flush_thread)
2193 		kthread_stop(wc->flush_thread);
2194 
2195 	bioset_exit(&wc->bio_set);
2196 
2197 	mempool_exit(&wc->copy_pool);
2198 
2199 	if (wc->writeback_wq)
2200 		destroy_workqueue(wc->writeback_wq);
2201 
2202 	if (wc->dev)
2203 		dm_put_device(ti, wc->dev);
2204 
2205 	if (wc->ssd_dev)
2206 		dm_put_device(ti, wc->ssd_dev);
2207 
2208 	vfree(wc->entries);
2209 
2210 	if (wc->memory_map) {
2211 		if (WC_MODE_PMEM(wc))
2212 			persistent_memory_release(wc);
2213 		else
2214 			vfree(wc->memory_map);
2215 	}
2216 
2217 	if (wc->dm_kcopyd)
2218 		dm_kcopyd_client_destroy(wc->dm_kcopyd);
2219 
2220 	if (wc->dm_io)
2221 		dm_io_client_destroy(wc->dm_io);
2222 
2223 	vfree(wc->dirty_bitmap);
2224 
2225 	kfree(wc);
2226 }
2227 
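/*
 * Constructor arguments: <p|s> <origin dev> <cache dev> <block size>
 * <#optional args> [optional args...]. An illustrative table line (device
 * names and sizes are hypothetical):
 *
 *   0 409600 writecache s /dev/vg/origin /dev/vg/cache 4096 4 high_watermark 60 low_watermark 50
 */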
2228 static int writecache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
2229 {
2230 	struct dm_writecache *wc;
2231 	struct dm_arg_set as;
2232 	const char *string;
2233 	unsigned int opt_params;
2234 	size_t offset, data_size;
2235 	int i, r;
2236 	char dummy;
2237 	int high_wm_percent = HIGH_WATERMARK;
2238 	int low_wm_percent = LOW_WATERMARK;
2239 	uint64_t x;
2240 	struct wc_memory_superblock s;
2241 
2242 	static struct dm_arg _args[] = {
2243 		{0, 18, "Invalid number of feature args"},
2244 	};
2245 
2246 	as.argc = argc;
2247 	as.argv = argv;
2248 
2249 	wc = kzalloc(sizeof(struct dm_writecache), GFP_KERNEL);
2250 	if (!wc) {
2251 		ti->error = "Cannot allocate writecache structure";
2252 		r = -ENOMEM;
2253 		goto bad;
2254 	}
2255 	ti->private = wc;
2256 	wc->ti = ti;
2257 
2258 	mutex_init(&wc->lock);
2259 	wc->max_age = MAX_AGE_UNSPECIFIED;
2260 	writecache_poison_lists(wc);
2261 	init_waitqueue_head(&wc->freelist_wait);
2262 	timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0);
2263 	timer_setup(&wc->max_age_timer, writecache_max_age_timer, 0);
2264 
2265 	for (i = 0; i < 2; i++) {
2266 		atomic_set(&wc->bio_in_progress[i], 0);
2267 		init_waitqueue_head(&wc->bio_in_progress_wait[i]);
2268 	}
2269 
2270 	wc->dm_io = dm_io_client_create();
2271 	if (IS_ERR(wc->dm_io)) {
2272 		r = PTR_ERR(wc->dm_io);
2273 		ti->error = "Unable to allocate dm-io client";
2274 		wc->dm_io = NULL;
2275 		goto bad;
2276 	}
2277 
2278 	wc->writeback_wq = alloc_workqueue("writecache-writeback", WQ_MEM_RECLAIM, 1);
2279 	if (!wc->writeback_wq) {
2280 		r = -ENOMEM;
2281 		ti->error = "Could not allocate writeback workqueue";
2282 		goto bad;
2283 	}
2284 	INIT_WORK(&wc->writeback_work, writecache_writeback);
2285 	INIT_WORK(&wc->flush_work, writecache_flush_work);
2286 
2287 	dm_iot_init(&wc->iot);
2288 
2289 	raw_spin_lock_init(&wc->endio_list_lock);
2290 	INIT_LIST_HEAD(&wc->endio_list);
2291 	wc->endio_thread = kthread_run(writecache_endio_thread, wc, "writecache_endio");
2292 	if (IS_ERR(wc->endio_thread)) {
2293 		r = PTR_ERR(wc->endio_thread);
2294 		wc->endio_thread = NULL;
2295 		ti->error = "Couldn't spawn endio thread";
2296 		goto bad;
2297 	}
2298 
2299 	/*
2300 	 * Parse the mode (pmem or ssd)
2301 	 */
2302 	string = dm_shift_arg(&as);
2303 	if (!string)
2304 		goto bad_arguments;
2305 
2306 	if (!strcasecmp(string, "s")) {
2307 		wc->pmem_mode = false;
2308 	} else if (!strcasecmp(string, "p")) {
2309 #ifdef DM_WRITECACHE_HAS_PMEM
2310 		wc->pmem_mode = true;
2311 		wc->writeback_fua = true;
2312 #else
2313 		/*
2314 		 * If the architecture doesn't support persistent memory or
2315 		 * the kernel doesn't support any DAX drivers, this driver can
2316 		 * only be used in SSD-only mode.
2317 		 */
2318 		r = -EOPNOTSUPP;
2319 		ti->error = "Persistent memory or DAX not supported on this system";
2320 		goto bad;
2321 #endif
2322 	} else {
2323 		goto bad_arguments;
2324 	}
2325 
2326 	if (WC_MODE_PMEM(wc)) {
2327 		r = bioset_init(&wc->bio_set, BIO_POOL_SIZE,
2328 				offsetof(struct writeback_struct, bio),
2329 				BIOSET_NEED_BVECS);
2330 		if (r) {
2331 			ti->error = "Could not allocate bio set";
2332 			goto bad;
2333 		}
2334 	} else {
2335 		wc->pause = PAUSE_WRITEBACK;
2336 		r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct));
2337 		if (r) {
2338 			ti->error = "Could not allocate mempool";
2339 			goto bad;
2340 		}
2341 	}
2342 
2343 	/*
2344 	 * Parse the origin data device
2345 	 */
2346 	string = dm_shift_arg(&as);
2347 	if (!string)
2348 		goto bad_arguments;
2349 	r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->dev);
2350 	if (r) {
2351 		ti->error = "Origin data device lookup failed";
2352 		goto bad;
2353 	}
2354 
2355 	/*
2356 	 * Parse cache data device (be it pmem or ssd)
2357 	 */
2358 	string = dm_shift_arg(&as);
2359 	if (!string)
2360 		goto bad_arguments;
2361 
2362 	r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->ssd_dev);
2363 	if (r) {
2364 		ti->error = "Cache data device lookup failed";
2365 		goto bad;
2366 	}
2367 	wc->memory_map_size = bdev_nr_bytes(wc->ssd_dev->bdev);
2368 
2369 	/*
2370 	 * Parse the cache block size
2371 	 */
2372 	string = dm_shift_arg(&as);
2373 	if (!string)
2374 		goto bad_arguments;
2375 	if (sscanf(string, "%u%c", &wc->block_size, &dummy) != 1 ||
2376 	    wc->block_size < 512 || wc->block_size > PAGE_SIZE ||
2377 	    (wc->block_size & (wc->block_size - 1))) {
2378 		r = -EINVAL;
2379 		ti->error = "Invalid block size";
2380 		goto bad;
2381 	}
2382 	if (wc->block_size < bdev_logical_block_size(wc->dev->bdev) ||
2383 	    wc->block_size < bdev_logical_block_size(wc->ssd_dev->bdev)) {
2384 		r = -EINVAL;
2385 		ti->error = "Block size is smaller than device logical block size";
2386 		goto bad;
2387 	}
2388 	wc->block_size_bits = __ffs(wc->block_size);
2389 
2390 	wc->max_writeback_jobs = MAX_WRITEBACK_JOBS;
2391 	wc->autocommit_blocks = !WC_MODE_PMEM(wc) ? AUTOCOMMIT_BLOCKS_SSD : AUTOCOMMIT_BLOCKS_PMEM;
2392 	wc->autocommit_jiffies = msecs_to_jiffies(AUTOCOMMIT_MSEC);
2393 
2394 	/*
2395 	 * Parse optional arguments
2396 	 */
2397 	r = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
2398 	if (r)
2399 		goto bad;
2400 
2401 	while (opt_params) {
2402 		string = dm_shift_arg(&as), opt_params--;
2403 		if (!strcasecmp(string, "start_sector") && opt_params >= 1) {
2404 			unsigned long long start_sector;
2405 
2406 			string = dm_shift_arg(&as), opt_params--;
2407 			if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1)
2408 				goto invalid_optional;
2409 			wc->start_sector = start_sector;
2410 			wc->start_sector_set = true;
2411 			if (wc->start_sector != start_sector ||
2412 			    wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT)
2413 				goto invalid_optional;
2414 		} else if (!strcasecmp(string, "high_watermark") && opt_params >= 1) {
2415 			string = dm_shift_arg(&as), opt_params--;
2416 			if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1)
2417 				goto invalid_optional;
2418 			if (high_wm_percent < 0 || high_wm_percent > 100)
2419 				goto invalid_optional;
2420 			wc->high_wm_percent_value = high_wm_percent;
2421 			wc->high_wm_percent_set = true;
2422 		} else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) {
2423 			string = dm_shift_arg(&as), opt_params--;
2424 			if (sscanf(string, "%d%c", &low_wm_percent, &dummy) != 1)
2425 				goto invalid_optional;
2426 			if (low_wm_percent < 0 || low_wm_percent > 100)
2427 				goto invalid_optional;
2428 			wc->low_wm_percent_value = low_wm_percent;
2429 			wc->low_wm_percent_set = true;
2430 		} else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) {
2431 			string = dm_shift_arg(&as), opt_params--;
2432 			if (sscanf(string, "%u%c", &wc->max_writeback_jobs, &dummy) != 1)
2433 				goto invalid_optional;
2434 			wc->max_writeback_jobs_set = true;
2435 		} else if (!strcasecmp(string, "autocommit_blocks") && opt_params >= 1) {
2436 			string = dm_shift_arg(&as), opt_params--;
2437 			if (sscanf(string, "%u%c", &wc->autocommit_blocks, &dummy) != 1)
2438 				goto invalid_optional;
2439 			wc->autocommit_blocks_set = true;
2440 		} else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) {
2441 			unsigned int autocommit_msecs;
2442 
2443 			string = dm_shift_arg(&as), opt_params--;
2444 			if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1)
2445 				goto invalid_optional;
2446 			if (autocommit_msecs > 3600000)
2447 				goto invalid_optional;
2448 			wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
2449 			wc->autocommit_time_value = autocommit_msecs;
2450 			wc->autocommit_time_set = true;
2451 		} else if (!strcasecmp(string, "max_age") && opt_params >= 1) {
2452 			unsigned int max_age_msecs;
2453 
2454 			string = dm_shift_arg(&as), opt_params--;
2455 			if (sscanf(string, "%u%c", &max_age_msecs, &dummy) != 1)
2456 				goto invalid_optional;
2457 			if (max_age_msecs > 86400000)
2458 				goto invalid_optional;
2459 			wc->max_age = msecs_to_jiffies(max_age_msecs);
2460 			wc->max_age_set = true;
2461 			wc->max_age_value = max_age_msecs;
2462 		} else if (!strcasecmp(string, "cleaner")) {
2463 			wc->cleaner_set = true;
2464 			wc->cleaner = true;
2465 		} else if (!strcasecmp(string, "fua")) {
2466 			if (WC_MODE_PMEM(wc)) {
2467 				wc->writeback_fua = true;
2468 				wc->writeback_fua_set = true;
2469 			} else
2470 				goto invalid_optional;
2471 		} else if (!strcasecmp(string, "nofua")) {
2472 			if (WC_MODE_PMEM(wc)) {
2473 				wc->writeback_fua = false;
2474 				wc->writeback_fua_set = true;
2475 			} else
2476 				goto invalid_optional;
2477 		} else if (!strcasecmp(string, "metadata_only")) {
2478 			wc->metadata_only = true;
2479 		} else if (!strcasecmp(string, "pause_writeback") && opt_params >= 1) {
2480 			unsigned int pause_msecs;
2481 
2482 			if (WC_MODE_PMEM(wc))
2483 				goto invalid_optional;
2484 			string = dm_shift_arg(&as), opt_params--;
2485 			if (sscanf(string, "%u%c", &pause_msecs, &dummy) != 1)
2486 				goto invalid_optional;
2487 			if (pause_msecs > 60000)
2488 				goto invalid_optional;
2489 			wc->pause = msecs_to_jiffies(pause_msecs);
2490 			wc->pause_set = true;
2491 			wc->pause_value = pause_msecs;
2492 		} else {
2493 invalid_optional:
2494 			r = -EINVAL;
2495 			ti->error = "Invalid optional argument";
2496 			goto bad;
2497 		}
2498 	}
2499 
2500 	if (high_wm_percent < low_wm_percent) {
2501 		r = -EINVAL;
2502 		ti->error = "High watermark must be greater than or equal to low watermark";
2503 		goto bad;
2504 	}
2505 
2506 	if (WC_MODE_PMEM(wc)) {
2507 		if (!dax_synchronous(wc->ssd_dev->dax_dev)) {
2508 			r = -EOPNOTSUPP;
2509 			ti->error = "Asynchronous persistent memory not supported as pmem cache";
2510 			goto bad;
2511 		}
2512 
2513 		r = persistent_memory_claim(wc);
2514 		if (r) {
2515 			ti->error = "Unable to map persistent memory for cache";
2516 			goto bad;
2517 		}
2518 	} else {
2519 		size_t n_blocks, n_metadata_blocks;
2520 		uint64_t n_bitmap_bits;
2521 
2522 		wc->memory_map_size -= (uint64_t)wc->start_sector << SECTOR_SHIFT;
2523 
2524 		bio_list_init(&wc->flush_list);
2525 		wc->flush_thread = kthread_run(writecache_flush_thread, wc, "dm_writecache_flush");
2526 		if (IS_ERR(wc->flush_thread)) {
2527 			r = PTR_ERR(wc->flush_thread);
2528 			wc->flush_thread = NULL;
2529 			ti->error = "Couldn't spawn flush thread";
2530 			goto bad;
2531 		}
2532 
2533 		r = calculate_memory_size(wc->memory_map_size, wc->block_size,
2534 					  &n_blocks, &n_metadata_blocks);
2535 		if (r) {
2536 			ti->error = "Invalid device size";
2537 			goto bad;
2538 		}
2539 
2540 		n_bitmap_bits = (((uint64_t)n_metadata_blocks << wc->block_size_bits) +
2541 				 BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY;
2542 		/* this is a limitation of the test_bit functions */
2543 		if (n_bitmap_bits > 1U << 31) {
2544 			r = -EFBIG;
2545 			ti->error = "Invalid device size";
2546 			goto bad;
2547 		}
2548 
2549 		wc->memory_map = vmalloc(n_metadata_blocks << wc->block_size_bits);
2550 		if (!wc->memory_map) {
2551 			r = -ENOMEM;
2552 			ti->error = "Unable to allocate memory for metadata";
2553 			goto bad;
2554 		}
2555 
2556 		wc->dm_kcopyd = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2557 		if (IS_ERR(wc->dm_kcopyd)) {
2558 			r = PTR_ERR(wc->dm_kcopyd);
2559 			ti->error = "Unable to allocate dm-kcopyd client";
2560 			wc->dm_kcopyd = NULL;
2561 			goto bad;
2562 		}
2563 
2564 		wc->metadata_sectors = n_metadata_blocks << (wc->block_size_bits - SECTOR_SHIFT);
2565 		wc->dirty_bitmap_size = (n_bitmap_bits + BITS_PER_LONG - 1) /
2566 			BITS_PER_LONG * sizeof(unsigned long);
2567 		wc->dirty_bitmap = vzalloc(wc->dirty_bitmap_size);
2568 		if (!wc->dirty_bitmap) {
2569 			r = -ENOMEM;
2570 			ti->error = "Unable to allocate dirty bitmap";
2571 			goto bad;
2572 		}
2573 
2574 		r = writecache_read_metadata(wc, wc->block_size >> SECTOR_SHIFT);
2575 		if (r) {
2576 			ti->error = "Unable to read first block of metadata";
2577 			goto bad;
2578 		}
2579 	}
2580 
2581 	r = copy_mc_to_kernel(&s, sb(wc), sizeof(struct wc_memory_superblock));
2582 	if (r) {
2583 		ti->error = "Hardware memory error when reading superblock";
2584 		goto bad;
2585 	}
2586 	if (!le32_to_cpu(s.magic) && !le32_to_cpu(s.version)) {
2587 		r = init_memory(wc);
2588 		if (r) {
2589 			ti->error = "Unable to initialize device";
2590 			goto bad;
2591 		}
2592 		r = copy_mc_to_kernel(&s, sb(wc),
2593 				      sizeof(struct wc_memory_superblock));
2594 		if (r) {
2595 			ti->error = "Hardware memory error when reading superblock";
2596 			goto bad;
2597 		}
2598 	}
2599 
2600 	if (le32_to_cpu(s.magic) != MEMORY_SUPERBLOCK_MAGIC) {
2601 		ti->error = "Invalid magic in the superblock";
2602 		r = -EINVAL;
2603 		goto bad;
2604 	}
2605 
2606 	if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) {
2607 		ti->error = "Invalid version in the superblock";
2608 		r = -EINVAL;
2609 		goto bad;
2610 	}
2611 
2612 	if (le32_to_cpu(s.block_size) != wc->block_size) {
2613 		ti->error = "Block size does not match superblock";
2614 		r = -EINVAL;
2615 		goto bad;
2616 	}
2617 
2618 	wc->n_blocks = le64_to_cpu(s.n_blocks);
2619 
2620 	offset = wc->n_blocks * sizeof(struct wc_memory_entry);
2621 	if (offset / sizeof(struct wc_memory_entry) != le64_to_cpu(sb(wc)->n_blocks)) {
2622 overflow:
2623 		ti->error = "Overflow in size calculation";
2624 		r = -EINVAL;
2625 		goto bad;
2626 	}
2627 	offset += sizeof(struct wc_memory_superblock);
2628 	if (offset < sizeof(struct wc_memory_superblock))
2629 		goto overflow;
2630 	offset = (offset + wc->block_size - 1) & ~(size_t)(wc->block_size - 1);
2631 	data_size = wc->n_blocks * (size_t)wc->block_size;
2632 	if (!offset || (data_size / wc->block_size != wc->n_blocks) ||
2633 	    (offset + data_size < offset))
2634 		goto overflow;
2635 	if (offset + data_size > wc->memory_map_size) {
2636 		ti->error = "Memory area is too small";
2637 		r = -EINVAL;
2638 		goto bad;
2639 	}
2640 
2641 	wc->metadata_sectors = offset >> SECTOR_SHIFT;
2642 	wc->block_start = (char *)sb(wc) + offset;
2643 
2644 	x = (uint64_t)wc->n_blocks * (100 - high_wm_percent);
2645 	x += 50;
2646 	do_div(x, 100);
2647 	wc->freelist_high_watermark = x;
2648 	x = (uint64_t)wc->n_blocks * (100 - low_wm_percent);
2649 	x += 50;
2650 	do_div(x, 100);
2651 	wc->freelist_low_watermark = x;
2652 
2653 	if (wc->cleaner)
2654 		activate_cleaner(wc);
2655 
2656 	r = writecache_alloc_entries(wc);
2657 	if (r) {
2658 		ti->error = "Cannot allocate memory";
2659 		goto bad;
2660 	}
2661 
2662 	ti->num_flush_bios = WC_MODE_PMEM(wc) ? 1 : 2;
2663 	ti->flush_supported = true;
2664 	ti->num_discard_bios = 1;
2665 
2666 	if (WC_MODE_PMEM(wc))
2667 		persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
2668 
2669 	return 0;
2670 
2671 bad_arguments:
2672 	r = -EINVAL;
2673 	ti->error = "Bad arguments";
2674 bad:
2675 	writecache_dtr(ti);
2676 	return r;
2677 }
2678 
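/*
 * Illustrative status lines (values are hypothetical). STATUSTYPE_TABLE
 * reproduces the constructor arguments, e.g.
 *
 *   s /dev/vg/origin /dev/vg/cache 4096 4 high_watermark 60 low_watermark 50
 *
 * STATUSTYPE_INFO reports the error flag, n_blocks, freelist_size,
 * writeback_size and the statistics counters in the order of the DMEMIT call
 * below.
 */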
2679 static void writecache_status(struct dm_target *ti, status_type_t type,
2680 			      unsigned int status_flags, char *result, unsigned int maxlen)
2681 {
2682 	struct dm_writecache *wc = ti->private;
2683 	unsigned int extra_args;
2684 	unsigned int sz = 0;
2685 
2686 	switch (type) {
2687 	case STATUSTYPE_INFO:
2688 		DMEMIT("%ld %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
2689 		       writecache_has_error(wc),
2690 		       (unsigned long long)wc->n_blocks, (unsigned long long)wc->freelist_size,
2691 		       (unsigned long long)wc->writeback_size,
2692 		       wc->stats.reads,
2693 		       wc->stats.read_hits,
2694 		       wc->stats.writes,
2695 		       wc->stats.write_hits_uncommitted,
2696 		       wc->stats.write_hits_committed,
2697 		       wc->stats.writes_around,
2698 		       wc->stats.writes_allocate,
2699 		       wc->stats.writes_blocked_on_freelist,
2700 		       wc->stats.flushes,
2701 		       wc->stats.discards);
2702 		break;
2703 	case STATUSTYPE_TABLE:
2704 		DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
2705 				wc->dev->name, wc->ssd_dev->name, wc->block_size);
2706 		extra_args = 0;
2707 		if (wc->start_sector_set)
2708 			extra_args += 2;
2709 		if (wc->high_wm_percent_set)
2710 			extra_args += 2;
2711 		if (wc->low_wm_percent_set)
2712 			extra_args += 2;
2713 		if (wc->max_writeback_jobs_set)
2714 			extra_args += 2;
2715 		if (wc->autocommit_blocks_set)
2716 			extra_args += 2;
2717 		if (wc->autocommit_time_set)
2718 			extra_args += 2;
2719 		if (wc->max_age_set)
2720 			extra_args += 2;
2721 		if (wc->cleaner_set)
2722 			extra_args++;
2723 		if (wc->writeback_fua_set)
2724 			extra_args++;
2725 		if (wc->metadata_only)
2726 			extra_args++;
2727 		if (wc->pause_set)
2728 			extra_args += 2;
2729 
2730 		DMEMIT("%u", extra_args);
2731 		if (wc->start_sector_set)
2732 			DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector);
2733 		if (wc->high_wm_percent_set)
2734 			DMEMIT(" high_watermark %u", wc->high_wm_percent_value);
2735 		if (wc->low_wm_percent_set)
2736 			DMEMIT(" low_watermark %u", wc->low_wm_percent_value);
2737 		if (wc->max_writeback_jobs_set)
2738 			DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs);
2739 		if (wc->autocommit_blocks_set)
2740 			DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
2741 		if (wc->autocommit_time_set)
2742 			DMEMIT(" autocommit_time %u", wc->autocommit_time_value);
2743 		if (wc->max_age_set)
2744 			DMEMIT(" max_age %u", wc->max_age_value);
2745 		if (wc->cleaner_set)
2746 			DMEMIT(" cleaner");
2747 		if (wc->writeback_fua_set)
2748 			DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
2749 		if (wc->metadata_only)
2750 			DMEMIT(" metadata_only");
2751 		if (wc->pause_set)
2752 			DMEMIT(" pause_writeback %u", wc->pause_value);
2753 		break;
2754 	case STATUSTYPE_IMA:
2755 		*result = '\0';
2756 		break;
2757 	}
2758 }
2759 
2760 static struct target_type writecache_target = {
2761 	.name			= "writecache",
2762 	.version		= {1, 6, 0},
2763 	.module			= THIS_MODULE,
2764 	.ctr			= writecache_ctr,
2765 	.dtr			= writecache_dtr,
2766 	.status			= writecache_status,
2767 	.postsuspend		= writecache_suspend,
2768 	.resume			= writecache_resume,
2769 	.message		= writecache_message,
2770 	.map			= writecache_map,
2771 	.end_io			= writecache_end_io,
2772 	.iterate_devices	= writecache_iterate_devices,
2773 	.io_hints		= writecache_io_hints,
2774 };
2775 module_dm(writecache);
2776 
2777 MODULE_DESCRIPTION(DM_NAME " writecache target");
2778 MODULE_AUTHOR("Mikulas Patocka <dm-devel@lists.linux.dev>");
2779 MODULE_LICENSE("GPL");
2780