1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright (C) 2018 Red Hat. All rights reserved.
4 *
5 * This file is released under the GPL.
6 */
7
8 #include <linux/device-mapper.h>
9 #include <linux/module.h>
10 #include <linux/init.h>
11 #include <linux/vmalloc.h>
12 #include <linux/kthread.h>
13 #include <linux/dm-io.h>
14 #include <linux/dm-kcopyd.h>
15 #include <linux/dax.h>
16 #include <linux/libnvdimm.h>
17 #include <linux/delay.h>
18 #include "dm-io-tracker.h"
19
20 #define DM_MSG_PREFIX "writecache"
21
22 #define HIGH_WATERMARK 50
23 #define LOW_WATERMARK 45
24 #define MAX_WRITEBACK_JOBS min(0x10000000 / PAGE_SIZE, totalram_pages() / 16)
25 #define ENDIO_LATENCY 16
26 #define WRITEBACK_LATENCY 64
27 #define AUTOCOMMIT_BLOCKS_SSD 65536
28 #define AUTOCOMMIT_BLOCKS_PMEM 64
29 #define AUTOCOMMIT_MSEC 1000
30 #define MAX_AGE_DIV 16
31 #define MAX_AGE_UNSPECIFIED -1UL
32 #define PAUSE_WRITEBACK (HZ * 3)
33
34 #define BITMAP_GRANULARITY 65536
35 #if BITMAP_GRANULARITY < PAGE_SIZE
36 #undef BITMAP_GRANULARITY
37 #define BITMAP_GRANULARITY PAGE_SIZE
38 #endif
39
40 #if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_FS_DAX)
41 #define DM_WRITECACHE_HAS_PMEM
42 #endif
43
44 #ifdef DM_WRITECACHE_HAS_PMEM
45 #define pmem_assign(dest, src) \
46 do { \
47 typeof(dest) uniq = (src); \
48 memcpy_flushcache(&(dest), &uniq, sizeof(dest)); \
49 } while (0)
50 #else
51 #define pmem_assign(dest, src) ((dest) = (src))
52 #endif
53
54 #if IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC) && defined(DM_WRITECACHE_HAS_PMEM)
55 #define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
56 #endif
57
58 #define MEMORY_SUPERBLOCK_MAGIC 0x23489321
59 #define MEMORY_SUPERBLOCK_VERSION 1
60
61 struct wc_memory_entry {
62 __le64 original_sector;
63 __le64 seq_count;
64 };
65
66 struct wc_memory_superblock {
67 union {
68 struct {
69 __le32 magic;
70 __le32 version;
71 __le32 block_size;
72 __le32 pad;
73 __le64 n_blocks;
74 __le64 seq_count;
75 };
76 __le64 padding[8];
77 };
78 struct wc_memory_entry entries[];
79 };
80
81 struct wc_entry {
82 struct rb_node rb_node;
83 struct list_head lru;
84 unsigned short wc_list_contiguous;
85 #if BITS_PER_LONG == 64
86 bool write_in_progress : 1;
87 unsigned long index : 47;
88 #else
89 bool write_in_progress;
90 unsigned long index;
91 #endif
92 unsigned long age;
93 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
94 uint64_t original_sector;
95 uint64_t seq_count;
96 #endif
97 };
98
99 #ifdef DM_WRITECACHE_HAS_PMEM
100 #define WC_MODE_PMEM(wc) ((wc)->pmem_mode)
101 #define WC_MODE_FUA(wc) ((wc)->writeback_fua)
102 #else
103 #define WC_MODE_PMEM(wc) false
104 #define WC_MODE_FUA(wc) false
105 #endif
106 #define WC_MODE_SORT_FREELIST(wc) (!WC_MODE_PMEM(wc))
107
108 struct dm_writecache {
109 struct mutex lock;
110 struct list_head lru;
111 union {
112 struct list_head freelist;
113 struct {
114 struct rb_root freetree;
115 struct wc_entry *current_free;
116 };
117 };
118 struct rb_root tree;
119
120 size_t freelist_size;
121 size_t writeback_size;
122 size_t freelist_high_watermark;
123 size_t freelist_low_watermark;
124 unsigned long max_age;
125 unsigned long pause;
126
127 unsigned int uncommitted_blocks;
128 unsigned int autocommit_blocks;
129 unsigned int max_writeback_jobs;
130
131 int error;
132
133 unsigned long autocommit_jiffies;
134 struct timer_list autocommit_timer;
135 struct wait_queue_head freelist_wait;
136
137 struct timer_list max_age_timer;
138
139 atomic_t bio_in_progress[2];
140 struct wait_queue_head bio_in_progress_wait[2];
141
142 struct dm_target *ti;
143 struct dm_dev *dev;
144 struct dm_dev *ssd_dev;
145 sector_t start_sector;
146 void *memory_map;
147 uint64_t memory_map_size;
148 size_t metadata_sectors;
149 size_t n_blocks;
150 uint64_t seq_count;
151 sector_t data_device_sectors;
152 void *block_start;
153 struct wc_entry *entries;
154 unsigned int block_size;
155 unsigned char block_size_bits;
156
157 bool pmem_mode:1;
158 bool writeback_fua:1;
159
160 bool overwrote_committed:1;
161 bool memory_vmapped:1;
162
163 bool start_sector_set:1;
164 bool high_wm_percent_set:1;
165 bool low_wm_percent_set:1;
166 bool max_writeback_jobs_set:1;
167 bool autocommit_blocks_set:1;
168 bool autocommit_time_set:1;
169 bool max_age_set:1;
170 bool writeback_fua_set:1;
171 bool flush_on_suspend:1;
172 bool cleaner:1;
173 bool cleaner_set:1;
174 bool metadata_only:1;
175 bool pause_set:1;
176
177 unsigned int high_wm_percent_value;
178 unsigned int low_wm_percent_value;
179 unsigned int autocommit_time_value;
180 unsigned int max_age_value;
181 unsigned int pause_value;
182
183 unsigned int writeback_all;
184 struct workqueue_struct *writeback_wq;
185 struct work_struct writeback_work;
186 struct work_struct flush_work;
187
188 struct dm_io_tracker iot;
189
190 struct dm_io_client *dm_io;
191
192 raw_spinlock_t endio_list_lock;
193 struct list_head endio_list;
194 struct task_struct *endio_thread;
195
196 struct task_struct *flush_thread;
197 struct bio_list flush_list;
198
199 struct dm_kcopyd_client *dm_kcopyd;
200 unsigned long *dirty_bitmap;
201 unsigned int dirty_bitmap_size;
202
203 struct bio_set bio_set;
204 mempool_t copy_pool;
205
206 struct {
207 unsigned long long reads;
208 unsigned long long read_hits;
209 unsigned long long writes;
210 unsigned long long write_hits_uncommitted;
211 unsigned long long write_hits_committed;
212 unsigned long long writes_around;
213 unsigned long long writes_allocate;
214 unsigned long long writes_blocked_on_freelist;
215 unsigned long long flushes;
216 unsigned long long discards;
217 } stats;
218 };
219
220 #define WB_LIST_INLINE 16
221
222 struct writeback_struct {
223 struct list_head endio_entry;
224 struct dm_writecache *wc;
225 struct wc_entry **wc_list;
226 unsigned int wc_list_n;
227 struct wc_entry *wc_list_inline[WB_LIST_INLINE];
228 struct bio bio;
229 };
230
231 struct copy_struct {
232 struct list_head endio_entry;
233 struct dm_writecache *wc;
234 struct wc_entry *e;
235 unsigned int n_entries;
236 int error;
237 };
238
239 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(dm_writecache_throttle,
240 "A percentage of time allocated for data copying");
241
242 static void wc_lock(struct dm_writecache *wc)
243 {
244 mutex_lock(&wc->lock);
245 }
246
247 static void wc_unlock(struct dm_writecache *wc)
248 {
249 mutex_unlock(&wc->lock);
250 }
251
252 #ifdef DM_WRITECACHE_HAS_PMEM
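/*
 * Map the DAX-capable cache device into the kernel address space. If
 * dax_direct_access() cannot return the whole range contiguously, fall back
 * to collecting the pages and vmap()-ing them; memory_vmapped records this so
 * persistent_memory_release() knows to vunmap.
 */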
253 static int persistent_memory_claim(struct dm_writecache *wc)
254 {
255 int r;
256 loff_t s;
257 long p, da;
258 unsigned long pfn;
259 int id;
260 struct page **pages;
261 sector_t offset;
262
263 wc->memory_vmapped = false;
264
265 s = wc->memory_map_size;
266 p = s >> PAGE_SHIFT;
267 if (!p) {
268 r = -EINVAL;
269 goto err1;
270 }
271 if (p != s >> PAGE_SHIFT) {
272 r = -EOVERFLOW;
273 goto err1;
274 }
275
276 offset = get_start_sect(wc->ssd_dev->bdev);
277 if (offset & (PAGE_SIZE / 512 - 1)) {
278 r = -EINVAL;
279 goto err1;
280 }
281 offset >>= PAGE_SHIFT - 9;
282
283 id = dax_read_lock();
284
285 da = dax_direct_access(wc->ssd_dev->dax_dev, offset, p, DAX_ACCESS,
286 &wc->memory_map, &pfn);
287 if (da < 0) {
288 wc->memory_map = NULL;
289 r = da;
290 goto err2;
291 }
292 if (!pfn_valid(pfn)) {
293 wc->memory_map = NULL;
294 r = -EOPNOTSUPP;
295 goto err2;
296 }
297 if (da != p) {
298 long i;
299
300 wc->memory_map = NULL;
301 pages = vmalloc_array(p, sizeof(struct page *));
302 if (!pages) {
303 r = -ENOMEM;
304 goto err2;
305 }
306 i = 0;
307 do {
308 long daa;
309
310 daa = dax_direct_access(wc->ssd_dev->dax_dev, offset + i,
311 p - i, DAX_ACCESS, NULL, &pfn);
312 if (daa <= 0) {
313 r = daa ? daa : -EINVAL;
314 goto err3;
315 }
316 if (!pfn_valid(pfn)) {
317 r = -EOPNOTSUPP;
318 goto err3;
319 }
320 while (daa-- && i < p) {
321 pages[i++] = pfn_to_page(pfn);
322 pfn++;
323 if (!(i & 15))
324 cond_resched();
325 }
326 } while (i < p);
327 wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL);
328 if (!wc->memory_map) {
329 r = -ENOMEM;
330 goto err3;
331 }
332 vfree(pages);
333 wc->memory_vmapped = true;
334 }
335
336 dax_read_unlock(id);
337
338 wc->memory_map += (size_t)wc->start_sector << SECTOR_SHIFT;
339 wc->memory_map_size -= (size_t)wc->start_sector << SECTOR_SHIFT;
340
341 return 0;
342 err3:
343 vfree(pages);
344 err2:
345 dax_read_unlock(id);
346 err1:
347 return r;
348 }
349 #else
350 static int persistent_memory_claim(struct dm_writecache *wc)
351 {
352 return -EOPNOTSUPP;
353 }
354 #endif
355
356 static void persistent_memory_release(struct dm_writecache *wc)
357 {
358 if (wc->memory_vmapped)
359 vunmap(wc->memory_map - ((size_t)wc->start_sector << SECTOR_SHIFT));
360 }
361
362 static struct page *persistent_memory_page(void *addr)
363 {
364 if (is_vmalloc_addr(addr))
365 return vmalloc_to_page(addr);
366 else
367 return virt_to_page(addr);
368 }
369
370 static unsigned int persistent_memory_page_offset(void *addr)
371 {
372 return (unsigned long)addr & (PAGE_SIZE - 1);
373 }
374
375 static void persistent_memory_flush_cache(void *ptr, size_t size)
376 {
377 if (is_vmalloc_addr(ptr))
378 flush_kernel_vmap_range(ptr, size);
379 }
380
381 static void persistent_memory_invalidate_cache(void *ptr, size_t size)
382 {
383 if (is_vmalloc_addr(ptr))
384 invalidate_kernel_vmap_range(ptr, size);
385 }
386
387 static struct wc_memory_superblock *sb(struct dm_writecache *wc)
388 {
389 return wc->memory_map;
390 }
391
392 static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e)
393 {
394 return &sb(wc)->entries[e->index];
395 }
396
397 static void *memory_data(struct dm_writecache *wc, struct wc_entry *e)
398 {
399 return (char *)wc->block_start + (e->index << wc->block_size_bits);
400 }
401
402 static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e)
403 {
404 return wc->start_sector + wc->metadata_sectors +
405 ((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT));
406 }
407
408 static uint64_t read_original_sector(struct dm_writecache *wc, struct wc_entry *e)
409 {
410 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
411 return e->original_sector;
412 #else
413 return le64_to_cpu(memory_entry(wc, e)->original_sector);
414 #endif
415 }
416
417 static uint64_t read_seq_count(struct dm_writecache *wc, struct wc_entry *e)
418 {
419 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
420 return e->seq_count;
421 #else
422 return le64_to_cpu(memory_entry(wc, e)->seq_count);
423 #endif
424 }
425
426 static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e)
427 {
428 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
429 e->seq_count = -1;
430 #endif
431 pmem_assign(memory_entry(wc, e)->seq_count, cpu_to_le64(-1));
432 }
433
434 static void write_original_sector_seq_count(struct dm_writecache *wc, struct wc_entry *e,
435 uint64_t original_sector, uint64_t seq_count)
436 {
437 struct wc_memory_entry me;
438 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
439 e->original_sector = original_sector;
440 e->seq_count = seq_count;
441 #endif
442 me.original_sector = cpu_to_le64(original_sector);
443 me.seq_count = cpu_to_le64(seq_count);
444 pmem_assign(*memory_entry(wc, e), me);
445 }
446
447 #define writecache_error(wc, err, msg, arg...) \
448 do { \
449 if (!cmpxchg(&(wc)->error, 0, err)) \
450 DMERR(msg, ##arg); \
451 wake_up(&(wc)->freelist_wait); \
452 } while (0)
453
454 #define writecache_has_error(wc) (unlikely(READ_ONCE((wc)->error)))
455
456 static void writecache_flush_all_metadata(struct dm_writecache *wc)
457 {
458 if (!WC_MODE_PMEM(wc))
459 memset(wc->dirty_bitmap, -1, wc->dirty_bitmap_size);
460 }
461
462 static void writecache_flush_region(struct dm_writecache *wc, void *ptr, size_t size)
463 {
464 if (!WC_MODE_PMEM(wc))
465 __set_bit(((char *)ptr - (char *)wc->memory_map) / BITMAP_GRANULARITY,
466 wc->dirty_bitmap);
467 }
468
469 static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev);
470
471 struct io_notify {
472 struct dm_writecache *wc;
473 struct completion c;
474 atomic_t count;
475 };
476
477 static void writecache_notify_io(unsigned long error, void *context)
478 {
479 struct io_notify *endio = context;
480
481 if (unlikely(error != 0))
482 writecache_error(endio->wc, -EIO, "error writing metadata");
483 BUG_ON(atomic_read(&endio->count) <= 0);
484 if (atomic_dec_and_test(&endio->count))
485 complete(&endio->c);
486 }
487
488 static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
489 {
490 wait_event(wc->bio_in_progress_wait[direction],
491 !atomic_read(&wc->bio_in_progress[direction]));
492 }
493
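/*
 * Write all metadata regions marked in dirty_bitmap to the SSD, optionally
 * wait for in-flight writes, then issue a disk flush and clear the bitmap.
 */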
494 static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
495 {
496 struct dm_io_region region;
497 struct dm_io_request req;
498 struct io_notify endio = {
499 wc,
500 COMPLETION_INITIALIZER_ONSTACK(endio.c),
501 ATOMIC_INIT(1),
502 };
503 unsigned int bitmap_bits = wc->dirty_bitmap_size * 8;
504 unsigned int i = 0;
505
506 while (1) {
507 unsigned int j;
508
509 i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i);
510 if (unlikely(i == bitmap_bits))
511 break;
512 j = find_next_zero_bit(wc->dirty_bitmap, bitmap_bits, i);
513
514 region.bdev = wc->ssd_dev->bdev;
515 region.sector = (sector_t)i * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
516 region.count = (sector_t)(j - i) * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
517
518 if (unlikely(region.sector >= wc->metadata_sectors))
519 break;
520 if (unlikely(region.sector + region.count > wc->metadata_sectors))
521 region.count = wc->metadata_sectors - region.sector;
522
523 region.sector += wc->start_sector;
524 atomic_inc(&endio.count);
525 req.bi_opf = REQ_OP_WRITE | REQ_SYNC;
526 req.mem.type = DM_IO_VMA;
527 req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
528 req.client = wc->dm_io;
529 req.notify.fn = writecache_notify_io;
530 req.notify.context = &endio;
531
532 /* writing via async dm-io (implied by notify.fn above) won't return an error */
533 (void) dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
534 i = j;
535 }
536
537 writecache_notify_io(0, &endio);
538 wait_for_completion_io(&endio.c);
539
540 if (wait_for_ios)
541 writecache_wait_for_ios(wc, WRITE);
542
543 writecache_disk_flush(wc, wc->ssd_dev);
544
545 memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
546 }
547
548 static void ssd_commit_superblock(struct dm_writecache *wc)
549 {
550 int r;
551 struct dm_io_region region;
552 struct dm_io_request req;
553
554 region.bdev = wc->ssd_dev->bdev;
555 region.sector = 0;
556 region.count = max(4096U, wc->block_size) >> SECTOR_SHIFT;
557
558 if (unlikely(region.sector + region.count > wc->metadata_sectors))
559 region.count = wc->metadata_sectors - region.sector;
560
561 region.sector += wc->start_sector;
562
563 req.bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_FUA;
564 req.mem.type = DM_IO_VMA;
565 req.mem.ptr.vma = (char *)wc->memory_map;
566 req.client = wc->dm_io;
567 req.notify.fn = NULL;
568 req.notify.context = NULL;
569
570 r = dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
571 if (unlikely(r))
572 writecache_error(wc, r, "error writing superblock");
573 }
574
575 static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
576 {
577 if (WC_MODE_PMEM(wc))
578 pmem_wmb();
579 else
580 ssd_commit_flushed(wc, wait_for_ios);
581 }
582
583 static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
584 {
585 int r;
586 struct dm_io_region region;
587 struct dm_io_request req;
588
589 region.bdev = dev->bdev;
590 region.sector = 0;
591 region.count = 0;
592 req.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
593 req.mem.type = DM_IO_KMEM;
594 req.mem.ptr.addr = NULL;
595 req.client = wc->dm_io;
596 req.notify.fn = NULL;
597
598 r = dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
599 if (unlikely(r))
600 writecache_error(wc, r, "error flushing metadata: %d", r);
601 }
602
603 #define WFE_RETURN_FOLLOWING 1
604 #define WFE_LOWEST_SEQ 2
605
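/*
 * Look up a cache entry by original sector in the rb-tree. With
 * WFE_RETURN_FOLLOWING, a miss returns the next higher entry instead of NULL.
 * When several entries map the same block, WFE_LOWEST_SEQ selects the one
 * with the lowest seq_count, otherwise the newest one is returned.
 */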
606 static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
607 uint64_t block, int flags)
608 {
609 struct wc_entry *e;
610 struct rb_node *node = wc->tree.rb_node;
611
612 if (unlikely(!node))
613 return NULL;
614
615 while (1) {
616 e = container_of(node, struct wc_entry, rb_node);
617 if (read_original_sector(wc, e) == block)
618 break;
619
620 node = (read_original_sector(wc, e) >= block ?
621 e->rb_node.rb_left : e->rb_node.rb_right);
622 if (unlikely(!node)) {
623 if (!(flags & WFE_RETURN_FOLLOWING))
624 return NULL;
625 if (read_original_sector(wc, e) >= block)
626 return e;
627
628 node = rb_next(&e->rb_node);
629 if (unlikely(!node))
630 return NULL;
631
632 e = container_of(node, struct wc_entry, rb_node);
633 return e;
634 }
635 }
636
637 while (1) {
638 struct wc_entry *e2;
639
640 if (flags & WFE_LOWEST_SEQ)
641 node = rb_prev(&e->rb_node);
642 else
643 node = rb_next(&e->rb_node);
644 if (unlikely(!node))
645 return e;
646 e2 = container_of(node, struct wc_entry, rb_node);
647 if (read_original_sector(wc, e2) != block)
648 return e;
649 e = e2;
650 }
651 }
652
653 static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *ins)
654 {
655 struct wc_entry *e;
656 struct rb_node **node = &wc->tree.rb_node, *parent = NULL;
657
658 while (*node) {
659 e = container_of(*node, struct wc_entry, rb_node);
660 parent = &e->rb_node;
661 if (read_original_sector(wc, e) > read_original_sector(wc, ins))
662 node = &parent->rb_left;
663 else
664 node = &parent->rb_right;
665 }
666 rb_link_node(&ins->rb_node, parent, node);
667 rb_insert_color(&ins->rb_node, &wc->tree);
668 list_add(&ins->lru, &wc->lru);
669 ins->age = jiffies;
670 }
671
672 static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e)
673 {
674 list_del(&e->lru);
675 rb_erase(&e->rb_node, &wc->tree);
676 }
677
678 static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry *e)
679 {
680 if (WC_MODE_SORT_FREELIST(wc)) {
681 struct rb_node **node = &wc->freetree.rb_node, *parent = NULL;
682
683 if (unlikely(!*node))
684 wc->current_free = e;
685 while (*node) {
686 parent = *node;
687 if (&e->rb_node < *node)
688 node = &parent->rb_left;
689 else
690 node = &parent->rb_right;
691 }
692 rb_link_node(&e->rb_node, parent, node);
693 rb_insert_color(&e->rb_node, &wc->freetree);
694 } else {
695 list_add_tail(&e->lru, &wc->freelist);
696 }
697 wc->freelist_size++;
698 }
699
700 static inline void writecache_verify_watermark(struct dm_writecache *wc)
701 {
702 if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark))
703 queue_work(wc->writeback_wq, &wc->writeback_work);
704 }
705
706 static void writecache_max_age_timer(struct timer_list *t)
707 {
708 struct dm_writecache *wc = timer_container_of(wc, t, max_age_timer);
709
710 if (!dm_suspended(wc->ti) && !writecache_has_error(wc)) {
711 queue_work(wc->writeback_wq, &wc->writeback_work);
712 mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);
713 }
714 }
715
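/*
 * Take an entry from the freelist. In SSD mode the freelist is kept sorted as
 * an rb-tree; if expected_sector is not -1, the allocation only succeeds when
 * the entry's cache sector matches it, which lets writecache_bio_copy_ssd()
 * build physically contiguous writes.
 */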
716 static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector)
717 {
718 struct wc_entry *e;
719
720 if (WC_MODE_SORT_FREELIST(wc)) {
721 struct rb_node *next;
722
723 if (unlikely(!wc->current_free))
724 return NULL;
725 e = wc->current_free;
726 if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
727 return NULL;
728 next = rb_next(&e->rb_node);
729 rb_erase(&e->rb_node, &wc->freetree);
730 if (unlikely(!next))
731 next = rb_first(&wc->freetree);
732 wc->current_free = next ? container_of(next, struct wc_entry, rb_node) : NULL;
733 } else {
734 if (unlikely(list_empty(&wc->freelist)))
735 return NULL;
736 e = container_of(wc->freelist.next, struct wc_entry, lru);
737 if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
738 return NULL;
739 list_del(&e->lru);
740 }
741 wc->freelist_size--;
742
743 writecache_verify_watermark(wc);
744
745 return e;
746 }
747
748 static void writecache_free_entry(struct dm_writecache *wc, struct wc_entry *e)
749 {
750 writecache_unlink(wc, e);
751 writecache_add_to_freelist(wc, e);
752 clear_seq_count(wc, e);
753 writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
754 if (unlikely(waitqueue_active(&wc->freelist_wait)))
755 wake_up(&wc->freelist_wait);
756 }
757
758 static void writecache_wait_on_freelist(struct dm_writecache *wc)
759 {
760 DEFINE_WAIT(wait);
761
762 prepare_to_wait(&wc->freelist_wait, &wait, TASK_UNINTERRUPTIBLE);
763 wc_unlock(wc);
764 io_schedule();
765 finish_wait(&wc->freelist_wait, &wait);
766 wc_lock(wc);
767 }
768
769 static void writecache_poison_lists(struct dm_writecache *wc)
770 {
771 /*
772 * Catch incorrect access to these values while the device is suspended.
773 */
774 memset(&wc->tree, -1, sizeof(wc->tree));
775 wc->lru.next = LIST_POISON1;
776 wc->lru.prev = LIST_POISON2;
777 wc->freelist.next = LIST_POISON1;
778 wc->freelist.prev = LIST_POISON2;
779 }
780
781 static void writecache_flush_entry(struct dm_writecache *wc, struct wc_entry *e)
782 {
783 writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
784 if (WC_MODE_PMEM(wc))
785 writecache_flush_region(wc, memory_data(wc, e), wc->block_size);
786 }
787
788 static bool writecache_entry_is_committed(struct dm_writecache *wc, struct wc_entry *e)
789 {
790 return read_seq_count(wc, e) < wc->seq_count;
791 }
792
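/*
 * Commit all uncommitted entries: flush their metadata (and data in pmem
 * mode), bump and persist seq_count in the superblock, then free any older
 * entries that were superseded by the ones just committed.
 */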
793 static void writecache_flush(struct dm_writecache *wc)
794 {
795 struct wc_entry *e, *e2;
796 bool need_flush_after_free;
797
798 wc->uncommitted_blocks = 0;
799 timer_delete(&wc->autocommit_timer);
800
801 if (list_empty(&wc->lru))
802 return;
803
804 e = container_of(wc->lru.next, struct wc_entry, lru);
805 if (writecache_entry_is_committed(wc, e)) {
806 if (wc->overwrote_committed) {
807 writecache_wait_for_ios(wc, WRITE);
808 writecache_disk_flush(wc, wc->ssd_dev);
809 wc->overwrote_committed = false;
810 }
811 return;
812 }
813 while (1) {
814 writecache_flush_entry(wc, e);
815 if (unlikely(e->lru.next == &wc->lru))
816 break;
817 e2 = container_of(e->lru.next, struct wc_entry, lru);
818 if (writecache_entry_is_committed(wc, e2))
819 break;
820 e = e2;
821 cond_resched();
822 }
823 writecache_commit_flushed(wc, true);
824
825 wc->seq_count++;
826 pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
827 if (WC_MODE_PMEM(wc))
828 writecache_commit_flushed(wc, false);
829 else
830 ssd_commit_superblock(wc);
831
832 wc->overwrote_committed = false;
833
834 need_flush_after_free = false;
835 while (1) {
836 /* Free another committed entry with lower seq-count */
837 struct rb_node *rb_node = rb_prev(&e->rb_node);
838
839 if (rb_node) {
840 e2 = container_of(rb_node, struct wc_entry, rb_node);
841 if (read_original_sector(wc, e2) == read_original_sector(wc, e) &&
842 likely(!e2->write_in_progress)) {
843 writecache_free_entry(wc, e2);
844 need_flush_after_free = true;
845 }
846 }
847 if (unlikely(e->lru.prev == &wc->lru))
848 break;
849 e = container_of(e->lru.prev, struct wc_entry, lru);
850 cond_resched();
851 }
852
853 if (need_flush_after_free)
854 writecache_commit_flushed(wc, false);
855 }
856
857 static void writecache_flush_work(struct work_struct *work)
858 {
859 struct dm_writecache *wc = container_of(work, struct dm_writecache, flush_work);
860
861 wc_lock(wc);
862 writecache_flush(wc);
863 wc_unlock(wc);
864 }
865
866 static void writecache_autocommit_timer(struct timer_list *t)
867 {
868 struct dm_writecache *wc = timer_container_of(wc, t, autocommit_timer);
869
870 if (!writecache_has_error(wc))
871 queue_work(wc->writeback_wq, &wc->flush_work);
872 }
873
874 static void writecache_schedule_autocommit(struct dm_writecache *wc)
875 {
876 if (!timer_pending(&wc->autocommit_timer))
877 mod_timer(&wc->autocommit_timer, jiffies + wc->autocommit_jiffies);
878 }
879
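/*
 * Drop all cache entries that fall inside the discarded range [start, end),
 * waiting for in-flight I/O first in SSD mode, and commit the freed metadata.
 */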
880 static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_t end)
881 {
882 struct wc_entry *e;
883 bool discarded_something = false;
884
885 e = writecache_find_entry(wc, start, WFE_RETURN_FOLLOWING | WFE_LOWEST_SEQ);
886 if (unlikely(!e))
887 return;
888
889 while (read_original_sector(wc, e) < end) {
890 struct rb_node *node = rb_next(&e->rb_node);
891
892 if (likely(!e->write_in_progress)) {
893 if (!discarded_something) {
894 if (!WC_MODE_PMEM(wc)) {
895 writecache_wait_for_ios(wc, READ);
896 writecache_wait_for_ios(wc, WRITE);
897 }
898 discarded_something = true;
899 }
900 if (!writecache_entry_is_committed(wc, e))
901 wc->uncommitted_blocks--;
902 writecache_free_entry(wc, e);
903 }
904
905 if (unlikely(!node))
906 break;
907
908 e = container_of(node, struct wc_entry, rb_node);
909 }
910
911 if (discarded_something)
912 writecache_commit_flushed(wc, false);
913 }
914
915 static bool writecache_wait_for_writeback(struct dm_writecache *wc)
916 {
917 if (wc->writeback_size) {
918 writecache_wait_on_freelist(wc);
919 return true;
920 }
921 return false;
922 }
923
924 static void writecache_suspend(struct dm_target *ti)
925 {
926 struct dm_writecache *wc = ti->private;
927 bool flush_on_suspend;
928
929 timer_delete_sync(&wc->autocommit_timer);
930 timer_delete_sync(&wc->max_age_timer);
931
932 wc_lock(wc);
933 writecache_flush(wc);
934 flush_on_suspend = wc->flush_on_suspend;
935 if (flush_on_suspend) {
936 wc->flush_on_suspend = false;
937 wc->writeback_all++;
938 queue_work(wc->writeback_wq, &wc->writeback_work);
939 }
940 wc_unlock(wc);
941
942 drain_workqueue(wc->writeback_wq);
943
944 wc_lock(wc);
945 if (flush_on_suspend)
946 wc->writeback_all--;
947 while (writecache_wait_for_writeback(wc))
948 ;
949
950 if (WC_MODE_PMEM(wc))
951 persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
952
953 writecache_poison_lists(wc);
954
955 wc_unlock(wc);
956 }
957
958 static int writecache_alloc_entries(struct dm_writecache *wc)
959 {
960 size_t b;
961
962 if (wc->entries)
963 return 0;
964 wc->entries = vmalloc_array(wc->n_blocks, sizeof(struct wc_entry));
965 if (!wc->entries)
966 return -ENOMEM;
967 for (b = 0; b < wc->n_blocks; b++) {
968 struct wc_entry *e = &wc->entries[b];
969
970 e->index = b;
971 e->write_in_progress = false;
972 cond_resched();
973 }
974
975 return 0;
976 }
977
978 static int writecache_read_metadata(struct dm_writecache *wc, sector_t n_sectors)
979 {
980 struct dm_io_region region;
981 struct dm_io_request req;
982
983 region.bdev = wc->ssd_dev->bdev;
984 region.sector = wc->start_sector;
985 region.count = n_sectors;
986 req.bi_opf = REQ_OP_READ | REQ_SYNC;
987 req.mem.type = DM_IO_VMA;
988 req.mem.ptr.vma = (char *)wc->memory_map;
989 req.client = wc->dm_io;
990 req.notify.fn = NULL;
991
992 return dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
993 }
994
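/*
 * Rebuild the in-core state (rb-tree, LRU and freelist) from the metadata in
 * persistent memory or read back from the SSD, resolving duplicate entries by
 * sequence count and erasing entries that were never committed.
 */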
995 static void writecache_resume(struct dm_target *ti)
996 {
997 struct dm_writecache *wc = ti->private;
998 size_t b;
999 bool need_flush = false;
1000 __le64 sb_seq_count;
1001 int r;
1002
1003 wc_lock(wc);
1004
1005 wc->data_device_sectors = bdev_nr_sectors(wc->dev->bdev);
1006
1007 if (WC_MODE_PMEM(wc)) {
1008 persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size);
1009 } else {
1010 r = writecache_read_metadata(wc, wc->metadata_sectors);
1011 if (r) {
1012 size_t sb_entries_offset;
1013
1014 writecache_error(wc, r, "unable to read metadata: %d", r);
1015 sb_entries_offset = offsetof(struct wc_memory_superblock, entries);
1016 memset((char *)wc->memory_map + sb_entries_offset, -1,
1017 (wc->metadata_sectors << SECTOR_SHIFT) - sb_entries_offset);
1018 }
1019 }
1020
1021 wc->tree = RB_ROOT;
1022 INIT_LIST_HEAD(&wc->lru);
1023 if (WC_MODE_SORT_FREELIST(wc)) {
1024 wc->freetree = RB_ROOT;
1025 wc->current_free = NULL;
1026 } else {
1027 INIT_LIST_HEAD(&wc->freelist);
1028 }
1029 wc->freelist_size = 0;
1030
1031 r = copy_mc_to_kernel(&sb_seq_count, &sb(wc)->seq_count,
1032 sizeof(uint64_t));
1033 if (r) {
1034 writecache_error(wc, r, "hardware memory error when reading superblock: %d", r);
1035 sb_seq_count = cpu_to_le64(0);
1036 }
1037 wc->seq_count = le64_to_cpu(sb_seq_count);
1038
1039 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
1040 for (b = 0; b < wc->n_blocks; b++) {
1041 struct wc_entry *e = &wc->entries[b];
1042 struct wc_memory_entry wme;
1043
1044 if (writecache_has_error(wc)) {
1045 e->original_sector = -1;
1046 e->seq_count = -1;
1047 continue;
1048 }
1049 r = copy_mc_to_kernel(&wme, memory_entry(wc, e),
1050 sizeof(struct wc_memory_entry));
1051 if (r) {
1052 writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d",
1053 (unsigned long)b, r);
1054 e->original_sector = -1;
1055 e->seq_count = -1;
1056 } else {
1057 e->original_sector = le64_to_cpu(wme.original_sector);
1058 e->seq_count = le64_to_cpu(wme.seq_count);
1059 }
1060 cond_resched();
1061 }
1062 #endif
1063 for (b = 0; b < wc->n_blocks; b++) {
1064 struct wc_entry *e = &wc->entries[b];
1065
1066 if (!writecache_entry_is_committed(wc, e)) {
1067 if (read_seq_count(wc, e) != -1) {
1068 erase_this:
1069 clear_seq_count(wc, e);
1070 need_flush = true;
1071 }
1072 writecache_add_to_freelist(wc, e);
1073 } else {
1074 struct wc_entry *old;
1075
1076 old = writecache_find_entry(wc, read_original_sector(wc, e), 0);
1077 if (!old) {
1078 writecache_insert_entry(wc, e);
1079 } else {
1080 if (read_seq_count(wc, old) == read_seq_count(wc, e)) {
1081 writecache_error(wc, -EINVAL,
1082 "two identical entries, position %llu, sector %llu, sequence %llu",
1083 (unsigned long long)b, (unsigned long long)read_original_sector(wc, e),
1084 (unsigned long long)read_seq_count(wc, e));
1085 }
1086 if (read_seq_count(wc, old) > read_seq_count(wc, e)) {
1087 goto erase_this;
1088 } else {
1089 writecache_free_entry(wc, old);
1090 writecache_insert_entry(wc, e);
1091 need_flush = true;
1092 }
1093 }
1094 }
1095 cond_resched();
1096 }
1097
1098 if (need_flush) {
1099 writecache_flush_all_metadata(wc);
1100 writecache_commit_flushed(wc, false);
1101 }
1102
1103 writecache_verify_watermark(wc);
1104
1105 if (wc->max_age != MAX_AGE_UNSPECIFIED)
1106 mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);
1107
1108 wc_unlock(wc);
1109 }
1110
1111 static int process_flush_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
1112 {
1113 if (argc != 1)
1114 return -EINVAL;
1115
1116 wc_lock(wc);
1117 if (dm_suspended(wc->ti)) {
1118 wc_unlock(wc);
1119 return -EBUSY;
1120 }
1121 if (writecache_has_error(wc)) {
1122 wc_unlock(wc);
1123 return -EIO;
1124 }
1125
1126 writecache_flush(wc);
1127 wc->writeback_all++;
1128 queue_work(wc->writeback_wq, &wc->writeback_work);
1129 wc_unlock(wc);
1130
1131 flush_workqueue(wc->writeback_wq);
1132
1133 wc_lock(wc);
1134 wc->writeback_all--;
1135 if (writecache_has_error(wc)) {
1136 wc_unlock(wc);
1137 return -EIO;
1138 }
1139 wc_unlock(wc);
1140
1141 return 0;
1142 }
1143
1144 static int process_flush_on_suspend_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
1145 {
1146 if (argc != 1)
1147 return -EINVAL;
1148
1149 wc_lock(wc);
1150 wc->flush_on_suspend = true;
1151 wc_unlock(wc);
1152
1153 return 0;
1154 }
1155
1156 static void activate_cleaner(struct dm_writecache *wc)
1157 {
1158 wc->flush_on_suspend = true;
1159 wc->cleaner = true;
1160 wc->freelist_high_watermark = wc->n_blocks;
1161 wc->freelist_low_watermark = wc->n_blocks;
1162 }
1163
1164 static int process_cleaner_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
1165 {
1166 if (argc != 1)
1167 return -EINVAL;
1168
1169 wc_lock(wc);
1170 activate_cleaner(wc);
1171 if (!dm_suspended(wc->ti))
1172 writecache_verify_watermark(wc);
1173 wc_unlock(wc);
1174
1175 return 0;
1176 }
1177
1178 static int process_clear_stats_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
1179 {
1180 if (argc != 1)
1181 return -EINVAL;
1182
1183 wc_lock(wc);
1184 memset(&wc->stats, 0, sizeof(wc->stats));
1185 wc_unlock(wc);
1186
1187 return 0;
1188 }
1189
1190 static int writecache_message(struct dm_target *ti, unsigned int argc, char **argv,
1191 char *result, unsigned int maxlen)
1192 {
1193 int r = -EINVAL;
1194 struct dm_writecache *wc = ti->private;
1195
1196 if (!strcasecmp(argv[0], "flush"))
1197 r = process_flush_mesg(argc, argv, wc);
1198 else if (!strcasecmp(argv[0], "flush_on_suspend"))
1199 r = process_flush_on_suspend_mesg(argc, argv, wc);
1200 else if (!strcasecmp(argv[0], "cleaner"))
1201 r = process_cleaner_mesg(argc, argv, wc);
1202 else if (!strcasecmp(argv[0], "clear_stats"))
1203 r = process_clear_stats_mesg(argc, argv, wc);
1204 else
1205 DMERR("unrecognised message received: %s", argv[0]);
1206
1207 return r;
1208 }
1209
1210 static void memcpy_flushcache_optimized(void *dest, void *source, size_t size)
1211 {
1212 /*
1213 * clflushopt performs better with block size 1024, 2048, 4096
1214 * non-temporal stores perform better with block size 512
1215 *
1216 * block size 512 1024 2048 4096
1217 * movnti 496 MB/s 642 MB/s 725 MB/s 744 MB/s
1218 * clflushopt 373 MB/s 688 MB/s 1.1 GB/s 1.2 GB/s
1219 *
1220 * We see that movnti performs better for 512-byte blocks, and
1221 * clflushopt performs better for 1024-byte and larger blocks. So, we
1222 * prefer clflushopt for sizes >= 768.
1223 *
1224 * NOTE: this happens to be the case now (with dm-writecache's single
1225 * threaded model) but re-evaluate this once memcpy_flushcache() is
1226 * enabled to use movdir64b which might invalidate this performance
1227 * advantage seen with cache-allocating-writes plus flushing.
1228 */
1229 #ifdef CONFIG_X86
1230 if (static_cpu_has(X86_FEATURE_CLFLUSHOPT) &&
1231 likely(boot_cpu_data.x86_clflush_size == 64) &&
1232 likely(size >= 768)) {
1233 do {
1234 memcpy((void *)dest, (void *)source, 64);
1235 clflushopt((void *)dest);
1236 dest += 64;
1237 source += 64;
1238 size -= 64;
1239 } while (size >= 64);
1240 return;
1241 }
1242 #endif
1243 memcpy_flushcache(dest, source, size);
1244 }
1245
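/*
 * Copy one cache block between the bio's pages and persistent memory, using
 * machine-check-safe reads and cache-flushing writes.
 */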
1246 static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data)
1247 {
1248 void *buf;
1249 unsigned int size;
1250 int rw = bio_data_dir(bio);
1251 unsigned int remaining_size = wc->block_size;
1252
1253 do {
1254 struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);
1255
1256 buf = bvec_kmap_local(&bv);
1257 size = bv.bv_len;
1258 if (unlikely(size > remaining_size))
1259 size = remaining_size;
1260
1261 if (rw == READ) {
1262 int r;
1263
1264 r = copy_mc_to_kernel(buf, data, size);
1265 flush_dcache_page(bio_page(bio));
1266 if (unlikely(r)) {
1267 writecache_error(wc, r, "hardware memory error when reading data: %d", r);
1268 bio->bi_status = BLK_STS_IOERR;
1269 }
1270 } else {
1271 flush_dcache_page(bio_page(bio));
1272 memcpy_flushcache_optimized(data, buf, size);
1273 }
1274
1275 kunmap_local(buf);
1276
1277 data = (char *)data + size;
1278 remaining_size -= size;
1279 bio_advance(bio, size);
1280 } while (unlikely(remaining_size));
1281 }
1282
1283 static int writecache_flush_thread(void *data)
1284 {
1285 struct dm_writecache *wc = data;
1286
1287 while (1) {
1288 struct bio *bio;
1289
1290 wc_lock(wc);
1291 bio = bio_list_pop(&wc->flush_list);
1292 if (!bio) {
1293 set_current_state(TASK_INTERRUPTIBLE);
1294 wc_unlock(wc);
1295
1296 if (unlikely(kthread_should_stop())) {
1297 set_current_state(TASK_RUNNING);
1298 break;
1299 }
1300
1301 schedule();
1302 continue;
1303 }
1304
1305 if (bio_op(bio) == REQ_OP_DISCARD) {
1306 writecache_discard(wc, bio->bi_iter.bi_sector,
1307 bio_end_sector(bio));
1308 wc_unlock(wc);
1309 bio_set_dev(bio, wc->dev->bdev);
1310 submit_bio_noacct(bio);
1311 } else {
1312 writecache_flush(wc);
1313 wc_unlock(wc);
1314 if (writecache_has_error(wc))
1315 bio->bi_status = BLK_STS_IOERR;
1316 bio_endio(bio);
1317 }
1318 }
1319
1320 return 0;
1321 }
1322
1323 static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio)
1324 {
1325 if (bio_list_empty(&wc->flush_list))
1326 wake_up_process(wc->flush_thread);
1327 bio_list_add(&wc->flush_list, bio);
1328 }
1329
1330 enum wc_map_op {
1331 WC_MAP_SUBMIT,
1332 WC_MAP_REMAP,
1333 WC_MAP_REMAP_ORIGIN,
1334 WC_MAP_RETURN,
1335 WC_MAP_ERROR,
1336 };
1337
1338 static void writecache_map_remap_origin(struct dm_writecache *wc, struct bio *bio,
1339 struct wc_entry *e)
1340 {
1341 if (e) {
1342 sector_t next_boundary =
1343 read_original_sector(wc, e) - bio->bi_iter.bi_sector;
1344 if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT)
1345 dm_accept_partial_bio(bio, next_boundary);
1346 }
1347 }
1348
1349 static enum wc_map_op writecache_map_read(struct dm_writecache *wc, struct bio *bio)
1350 {
1351 enum wc_map_op map_op;
1352 struct wc_entry *e;
1353
1354 read_next_block:
1355 wc->stats.reads++;
1356 e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
1357 if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) {
1358 wc->stats.read_hits++;
1359 if (WC_MODE_PMEM(wc)) {
1360 bio_copy_block(wc, bio, memory_data(wc, e));
1361 if (bio->bi_iter.bi_size)
1362 goto read_next_block;
1363 map_op = WC_MAP_SUBMIT;
1364 } else {
1365 dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
1366 bio_set_dev(bio, wc->ssd_dev->bdev);
1367 bio->bi_iter.bi_sector = cache_sector(wc, e);
1368 if (!writecache_entry_is_committed(wc, e))
1369 writecache_wait_for_ios(wc, WRITE);
1370 map_op = WC_MAP_REMAP;
1371 }
1372 } else {
1373 writecache_map_remap_origin(wc, bio, e);
1374 wc->stats.reads += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits;
1375 map_op = WC_MAP_REMAP_ORIGIN;
1376 }
1377
1378 return map_op;
1379 }
1380
1381 static void writecache_bio_copy_ssd(struct dm_writecache *wc, struct bio *bio,
1382 struct wc_entry *e, bool search_used)
1383 {
1384 unsigned int bio_size = wc->block_size;
1385 sector_t start_cache_sec = cache_sector(wc, e);
1386 sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);
1387
1388 while (bio_size < bio->bi_iter.bi_size) {
1389 if (!search_used) {
1390 struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
1391
1392 if (!f)
1393 break;
1394 write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
1395 (bio_size >> SECTOR_SHIFT), wc->seq_count);
1396 writecache_insert_entry(wc, f);
1397 wc->uncommitted_blocks++;
1398 } else {
1399 struct wc_entry *f;
1400 struct rb_node *next = rb_next(&e->rb_node);
1401
1402 if (!next)
1403 break;
1404 f = container_of(next, struct wc_entry, rb_node);
1405 if (f != e + 1)
1406 break;
1407 if (read_original_sector(wc, f) !=
1408 read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
1409 break;
1410 if (unlikely(f->write_in_progress))
1411 break;
1412 if (writecache_entry_is_committed(wc, f))
1413 wc->overwrote_committed = true;
1414 e = f;
1415 }
1416 bio_size += wc->block_size;
1417 current_cache_sec += wc->block_size >> SECTOR_SHIFT;
1418 }
1419
1420 bio_set_dev(bio, wc->ssd_dev->bdev);
1421 bio->bi_iter.bi_sector = start_cache_sec;
1422 dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);
1423
1424 wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
1425 wc->stats.writes_allocate += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits;
1426
1427 if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
1428 wc->uncommitted_blocks = 0;
1429 queue_work(wc->writeback_wq, &wc->flush_work);
1430 } else {
1431 writecache_schedule_autocommit(wc);
1432 }
1433 }
1434
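/*
 * Handle a WRITE bio: reuse an uncommitted or overwritable entry when one
 * exists, otherwise allocate from the freelist. In cleaner or metadata_only
 * mode, or when the freelist is exhausted in SSD mode, the bio is remapped to
 * the origin device instead.
 */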
1435 static enum wc_map_op writecache_map_write(struct dm_writecache *wc, struct bio *bio)
1436 {
1437 struct wc_entry *e;
1438
1439 do {
1440 bool found_entry = false;
1441 bool search_used = false;
1442
1443 if (writecache_has_error(wc)) {
1444 wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
1445 return WC_MAP_ERROR;
1446 }
1447 e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
1448 if (e) {
1449 if (!writecache_entry_is_committed(wc, e)) {
1450 wc->stats.write_hits_uncommitted++;
1451 search_used = true;
1452 goto bio_copy;
1453 }
1454 wc->stats.write_hits_committed++;
1455 if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
1456 wc->overwrote_committed = true;
1457 search_used = true;
1458 goto bio_copy;
1459 }
1460 found_entry = true;
1461 } else {
1462 if (unlikely(wc->cleaner) ||
1463 (wc->metadata_only && !(bio->bi_opf & REQ_META)))
1464 goto direct_write;
1465 }
1466 e = writecache_pop_from_freelist(wc, (sector_t)-1);
1467 if (unlikely(!e)) {
1468 if (!WC_MODE_PMEM(wc) && !found_entry) {
1469 direct_write:
1470 e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
1471 writecache_map_remap_origin(wc, bio, e);
1472 wc->stats.writes_around += bio->bi_iter.bi_size >> wc->block_size_bits;
1473 wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
1474 return WC_MAP_REMAP_ORIGIN;
1475 }
1476 wc->stats.writes_blocked_on_freelist++;
1477 writecache_wait_on_freelist(wc);
1478 continue;
1479 }
1480 write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count);
1481 writecache_insert_entry(wc, e);
1482 wc->uncommitted_blocks++;
1483 wc->stats.writes_allocate++;
1484 bio_copy:
1485 if (WC_MODE_PMEM(wc)) {
1486 bio_copy_block(wc, bio, memory_data(wc, e));
1487 wc->stats.writes++;
1488 } else {
1489 writecache_bio_copy_ssd(wc, bio, e, search_used);
1490 return WC_MAP_REMAP;
1491 }
1492 } while (bio->bi_iter.bi_size);
1493
1494 if (unlikely(bio->bi_opf & REQ_FUA || wc->uncommitted_blocks >= wc->autocommit_blocks))
1495 writecache_flush(wc);
1496 else
1497 writecache_schedule_autocommit(wc);
1498
1499 return WC_MAP_SUBMIT;
1500 }
1501
1502 static enum wc_map_op writecache_map_flush(struct dm_writecache *wc, struct bio *bio)
1503 {
1504 if (writecache_has_error(wc))
1505 return WC_MAP_ERROR;
1506
1507 if (WC_MODE_PMEM(wc)) {
1508 wc->stats.flushes++;
1509 writecache_flush(wc);
1510 if (writecache_has_error(wc))
1511 return WC_MAP_ERROR;
1512 else if (unlikely(wc->cleaner) || unlikely(wc->metadata_only))
1513 return WC_MAP_REMAP_ORIGIN;
1514 return WC_MAP_SUBMIT;
1515 }
1516 /* SSD: */
1517 if (dm_bio_get_target_bio_nr(bio))
1518 return WC_MAP_REMAP_ORIGIN;
1519 wc->stats.flushes++;
1520 writecache_offload_bio(wc, bio);
1521 return WC_MAP_RETURN;
1522 }
1523
1524 static enum wc_map_op writecache_map_discard(struct dm_writecache *wc, struct bio *bio)
1525 {
1526 wc->stats.discards += bio->bi_iter.bi_size >> wc->block_size_bits;
1527
1528 if (writecache_has_error(wc))
1529 return WC_MAP_ERROR;
1530
1531 if (WC_MODE_PMEM(wc)) {
1532 writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio));
1533 return WC_MAP_REMAP_ORIGIN;
1534 }
1535 /* SSD: */
1536 writecache_offload_bio(wc, bio);
1537 return WC_MAP_RETURN;
1538 }
1539
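/*
 * Target map function: dispatch the bio to the flush, discard, read or write
 * handler and translate the resulting wc_map_op into a device-mapper return
 * code.
 */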
1540 static int writecache_map(struct dm_target *ti, struct bio *bio)
1541 {
1542 struct dm_writecache *wc = ti->private;
1543 enum wc_map_op map_op;
1544
1545 bio->bi_private = NULL;
1546
1547 wc_lock(wc);
1548
1549 if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
1550 map_op = writecache_map_flush(wc, bio);
1551 goto done;
1552 }
1553
1554 bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
1555
1556 if (unlikely((((unsigned int)bio->bi_iter.bi_sector | bio_sectors(bio)) &
1557 (wc->block_size / 512 - 1)) != 0)) {
1558 DMERR("I/O is not aligned, sector %llu, size %u, block size %u",
1559 (unsigned long long)bio->bi_iter.bi_sector,
1560 bio->bi_iter.bi_size, wc->block_size);
1561 map_op = WC_MAP_ERROR;
1562 goto done;
1563 }
1564
1565 if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
1566 map_op = writecache_map_discard(wc, bio);
1567 goto done;
1568 }
1569
1570 if (bio_data_dir(bio) == READ)
1571 map_op = writecache_map_read(wc, bio);
1572 else
1573 map_op = writecache_map_write(wc, bio);
1574 done:
1575 switch (map_op) {
1576 case WC_MAP_REMAP_ORIGIN:
1577 if (likely(wc->pause != 0)) {
1578 if (bio_op(bio) == REQ_OP_WRITE) {
1579 dm_iot_io_begin(&wc->iot, 1);
1580 bio->bi_private = (void *)2;
1581 }
1582 }
1583 bio_set_dev(bio, wc->dev->bdev);
1584 wc_unlock(wc);
1585 return DM_MAPIO_REMAPPED;
1586
1587 case WC_MAP_REMAP:
1588 /* make sure that writecache_end_io decrements bio_in_progress: */
1589 bio->bi_private = (void *)1;
1590 atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]);
1591 wc_unlock(wc);
1592 return DM_MAPIO_REMAPPED;
1593
1594 case WC_MAP_SUBMIT:
1595 wc_unlock(wc);
1596 bio_endio(bio);
1597 return DM_MAPIO_SUBMITTED;
1598
1599 case WC_MAP_RETURN:
1600 wc_unlock(wc);
1601 return DM_MAPIO_SUBMITTED;
1602
1603 case WC_MAP_ERROR:
1604 wc_unlock(wc);
1605 bio_io_error(bio);
1606 return DM_MAPIO_SUBMITTED;
1607
1608 default:
1609 BUG();
1610 wc_unlock(wc);
1611 return DM_MAPIO_KILL;
1612 }
1613 }
1614
1615 static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status)
1616 {
1617 struct dm_writecache *wc = ti->private;
1618
1619 if (bio->bi_private == (void *)1) {
1620 int dir = bio_data_dir(bio);
1621
1622 if (atomic_dec_and_test(&wc->bio_in_progress[dir]))
1623 if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir])))
1624 wake_up(&wc->bio_in_progress_wait[dir]);
1625 } else if (bio->bi_private == (void *)2) {
1626 dm_iot_io_end(&wc->iot, 1);
1627 }
1628 return 0;
1629 }
1630
1631 static int writecache_iterate_devices(struct dm_target *ti,
1632 iterate_devices_callout_fn fn, void *data)
1633 {
1634 struct dm_writecache *wc = ti->private;
1635
1636 return fn(ti, wc->dev, 0, ti->len, data);
1637 }
1638
1639 static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limits)
1640 {
1641 struct dm_writecache *wc = ti->private;
1642
1643 if (limits->logical_block_size < wc->block_size)
1644 limits->logical_block_size = wc->block_size;
1645
1646 if (limits->physical_block_size < wc->block_size)
1647 limits->physical_block_size = wc->block_size;
1648
1649 if (limits->io_min < wc->block_size)
1650 limits->io_min = wc->block_size;
1651 }
1652
1653
1654 static void writecache_writeback_endio(struct bio *bio)
1655 {
1656 struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio);
1657 struct dm_writecache *wc = wb->wc;
1658 unsigned long flags;
1659
1660 raw_spin_lock_irqsave(&wc->endio_list_lock, flags);
1661 if (unlikely(list_empty(&wc->endio_list)))
1662 wake_up_process(wc->endio_thread);
1663 list_add_tail(&wb->endio_entry, &wc->endio_list);
1664 raw_spin_unlock_irqrestore(&wc->endio_list_lock, flags);
1665 }
1666
1667 static void writecache_copy_endio(int read_err, unsigned long write_err, void *ptr)
1668 {
1669 struct copy_struct *c = ptr;
1670 struct dm_writecache *wc = c->wc;
1671
1672 c->error = likely(!(read_err | write_err)) ? 0 : -EIO;
1673
1674 raw_spin_lock_irq(&wc->endio_list_lock);
1675 if (unlikely(list_empty(&wc->endio_list)))
1676 wake_up_process(wc->endio_thread);
1677 list_add_tail(&c->endio_entry, &wc->endio_list);
1678 raw_spin_unlock_irq(&wc->endio_list_lock);
1679 }
1680
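/*
 * Completion path for pmem writeback bios: mark the entries as no longer in
 * progress, return them to the freelist, and periodically drop the lock to
 * bound latency (ENDIO_LATENCY).
 */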
1681 static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list)
1682 {
1683 unsigned int i;
1684 struct writeback_struct *wb;
1685 struct wc_entry *e;
1686 unsigned long n_walked = 0;
1687
1688 do {
1689 wb = list_entry(list->next, struct writeback_struct, endio_entry);
1690 list_del(&wb->endio_entry);
1691
1692 if (unlikely(wb->bio.bi_status != BLK_STS_OK))
1693 writecache_error(wc, blk_status_to_errno(wb->bio.bi_status),
1694 "write error %d", wb->bio.bi_status);
1695 i = 0;
1696 do {
1697 e = wb->wc_list[i];
1698 BUG_ON(!e->write_in_progress);
1699 e->write_in_progress = false;
1700 INIT_LIST_HEAD(&e->lru);
1701 if (!writecache_has_error(wc))
1702 writecache_free_entry(wc, e);
1703 BUG_ON(!wc->writeback_size);
1704 wc->writeback_size--;
1705 n_walked++;
1706 if (unlikely(n_walked >= ENDIO_LATENCY)) {
1707 writecache_commit_flushed(wc, false);
1708 wc_unlock(wc);
1709 wc_lock(wc);
1710 n_walked = 0;
1711 }
1712 } while (++i < wb->wc_list_n);
1713
1714 if (wb->wc_list != wb->wc_list_inline)
1715 kfree(wb->wc_list);
1716 bio_put(&wb->bio);
1717 } while (!list_empty(list));
1718 }
1719
1720 static void __writecache_endio_ssd(struct dm_writecache *wc, struct list_head *list)
1721 {
1722 struct copy_struct *c;
1723 struct wc_entry *e;
1724
1725 do {
1726 c = list_entry(list->next, struct copy_struct, endio_entry);
1727 list_del(&c->endio_entry);
1728
1729 if (unlikely(c->error))
1730 writecache_error(wc, c->error, "copy error");
1731
1732 e = c->e;
1733 do {
1734 BUG_ON(!e->write_in_progress);
1735 e->write_in_progress = false;
1736 INIT_LIST_HEAD(&e->lru);
1737 if (!writecache_has_error(wc))
1738 writecache_free_entry(wc, e);
1739
1740 BUG_ON(!wc->writeback_size);
1741 wc->writeback_size--;
1742 e++;
1743 } while (--c->n_entries);
1744 mempool_free(c, &wc->copy_pool);
1745 } while (!list_empty(list));
1746 }
1747
1748 static int writecache_endio_thread(void *data)
1749 {
1750 struct dm_writecache *wc = data;
1751
1752 while (1) {
1753 struct list_head list;
1754
1755 raw_spin_lock_irq(&wc->endio_list_lock);
1756 if (!list_empty(&wc->endio_list))
1757 goto pop_from_list;
1758 set_current_state(TASK_INTERRUPTIBLE);
1759 raw_spin_unlock_irq(&wc->endio_list_lock);
1760
1761 if (unlikely(kthread_should_stop())) {
1762 set_current_state(TASK_RUNNING);
1763 break;
1764 }
1765
1766 schedule();
1767
1768 continue;
1769
1770 pop_from_list:
1771 list = wc->endio_list;
1772 list.next->prev = list.prev->next = &list;
1773 INIT_LIST_HEAD(&wc->endio_list);
1774 raw_spin_unlock_irq(&wc->endio_list_lock);
1775
1776 if (!WC_MODE_FUA(wc))
1777 writecache_disk_flush(wc, wc->dev);
1778
1779 wc_lock(wc);
1780
1781 if (WC_MODE_PMEM(wc)) {
1782 __writecache_endio_pmem(wc, &list);
1783 } else {
1784 __writecache_endio_ssd(wc, &list);
1785 writecache_wait_for_ios(wc, READ);
1786 }
1787
1788 writecache_commit_flushed(wc, false);
1789
1790 wc_unlock(wc);
1791 }
1792
1793 return 0;
1794 }
1795
1796 static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e)
1797 {
1798 struct dm_writecache *wc = wb->wc;
1799 unsigned int block_size = wc->block_size;
1800 void *address = memory_data(wc, e);
1801
1802 persistent_memory_flush_cache(address, block_size);
1803
1804 if (unlikely(bio_end_sector(&wb->bio) >= wc->data_device_sectors))
1805 return true;
1806
1807 return bio_add_page(&wb->bio, persistent_memory_page(address),
1808 block_size, persistent_memory_page_offset(address)) != 0;
1809 }
1810
1811 struct writeback_list {
1812 struct list_head list;
1813 size_t size;
1814 };
1815
1816 static void __writeback_throttle(struct dm_writecache *wc, struct writeback_list *wbl)
1817 {
1818 if (unlikely(wc->max_writeback_jobs)) {
1819 if (READ_ONCE(wc->writeback_size) - wbl->size >= wc->max_writeback_jobs) {
1820 wc_lock(wc);
1821 while (wc->writeback_size - wbl->size >= wc->max_writeback_jobs)
1822 writecache_wait_on_freelist(wc);
1823 wc_unlock(wc);
1824 }
1825 }
1826 cond_resched();
1827 }
1828
1829 static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeback_list *wbl)
1830 {
1831 struct wc_entry *e, *f;
1832 struct bio *bio;
1833 struct writeback_struct *wb;
1834 unsigned int max_pages;
1835
1836 while (wbl->size) {
1837 wbl->size--;
1838 e = container_of(wbl->list.prev, struct wc_entry, lru);
1839 list_del(&e->lru);
1840
1841 max_pages = e->wc_list_contiguous;
1842
1843 bio = bio_alloc_bioset(wc->dev->bdev, max_pages, REQ_OP_WRITE,
1844 GFP_NOIO, &wc->bio_set);
1845 wb = container_of(bio, struct writeback_struct, bio);
1846 wb->wc = wc;
1847 bio->bi_end_io = writecache_writeback_endio;
1848 bio->bi_iter.bi_sector = read_original_sector(wc, e);
1849
1850 if (unlikely(max_pages > WB_LIST_INLINE))
1851 wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *),
1852 GFP_NOIO | __GFP_NORETRY |
1853 __GFP_NOMEMALLOC | __GFP_NOWARN);
1854
1855 if (likely(max_pages <= WB_LIST_INLINE) || unlikely(!wb->wc_list)) {
1856 wb->wc_list = wb->wc_list_inline;
1857 max_pages = WB_LIST_INLINE;
1858 }
1859
1860 BUG_ON(!wc_add_block(wb, e));
1861
1862 wb->wc_list[0] = e;
1863 wb->wc_list_n = 1;
1864
1865 while (wbl->size && wb->wc_list_n < max_pages) {
1866 f = container_of(wbl->list.prev, struct wc_entry, lru);
1867 if (read_original_sector(wc, f) !=
1868 read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
1869 break;
1870 if (!wc_add_block(wb, f))
1871 break;
1872 wbl->size--;
1873 list_del(&f->lru);
1874 wb->wc_list[wb->wc_list_n++] = f;
1875 e = f;
1876 }
1877 if (WC_MODE_FUA(wc))
1878 bio->bi_opf |= REQ_FUA;
1879 if (writecache_has_error(wc)) {
1880 bio->bi_status = BLK_STS_IOERR;
1881 bio_endio(bio);
1882 } else if (unlikely(!bio_sectors(bio))) {
1883 bio->bi_status = BLK_STS_OK;
1884 bio_endio(bio);
1885 } else {
1886 submit_bio(bio);
1887 }
1888
1889 __writeback_throttle(wc, wbl);
1890 }
1891 }
1892
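/*
 * Writeback path for SSD mode: pop a run of entries that are contiguous both
 * in the cache and on the origin device and hand it to dm-kcopyd as a single
 * copy from the cache device to the origin device. Copies are clamped at the
 * end of the origin device; writecache_copy_endio() runs on completion.
 */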
1893 static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writeback_list *wbl)
1894 {
1895 struct wc_entry *e, *f;
1896 struct dm_io_region from, to;
1897 struct copy_struct *c;
1898
1899 while (wbl->size) {
1900 unsigned int n_sectors;
1901
1902 wbl->size--;
1903 e = container_of(wbl->list.prev, struct wc_entry, lru);
1904 list_del(&e->lru);
1905
1906 n_sectors = e->wc_list_contiguous << (wc->block_size_bits - SECTOR_SHIFT);
1907
1908 from.bdev = wc->ssd_dev->bdev;
1909 from.sector = cache_sector(wc, e);
1910 from.count = n_sectors;
1911 to.bdev = wc->dev->bdev;
1912 to.sector = read_original_sector(wc, e);
1913 to.count = n_sectors;
1914
1915 c = mempool_alloc(&wc->copy_pool, GFP_NOIO);
1916 c->wc = wc;
1917 c->e = e;
1918 c->n_entries = e->wc_list_contiguous;
1919
1920 while ((n_sectors -= wc->block_size >> SECTOR_SHIFT)) {
1921 wbl->size--;
1922 f = container_of(wbl->list.prev, struct wc_entry, lru);
1923 BUG_ON(f != e + 1);
1924 list_del(&f->lru);
1925 e = f;
1926 }
1927
1928 if (unlikely(to.sector + to.count > wc->data_device_sectors)) {
1929 if (to.sector >= wc->data_device_sectors) {
1930 writecache_copy_endio(0, 0, c);
1931 continue;
1932 }
1933 from.count = to.count = wc->data_device_sectors - to.sector;
1934 }
1935
1936 dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c);
1937
1938 __writeback_throttle(wc, wbl);
1939 }
1940 }
1941
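/*
 * Writeback work function. In SSD mode, flush pending kcopyd work first.
 * Unless pause is zero, wait until the origin device has been idle for
 * wc->pause jiffies (skipped for cleaner mode, writeback_all and suspend).
 * Then, under the lock, pick committed entries that are not already under
 * writeback from the LRU tail (or walk the tree in sector order when
 * writeback_all is set), coalesce runs of contiguous blocks and dispatch the
 * resulting list to the pmem or SSD writeback path under a block plug.
 */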
1942 static void writecache_writeback(struct work_struct *work)
1943 {
1944 struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work);
1945 struct blk_plug plug;
1946 struct wc_entry *f, *g, *e = NULL;
1947 struct rb_node *node, *next_node;
1948 struct list_head skipped;
1949 struct writeback_list wbl;
1950 unsigned long n_walked;
1951
1952 if (!WC_MODE_PMEM(wc)) {
1953 /* Wait for any active kcopyd work on behalf of ssd writeback */
1954 dm_kcopyd_client_flush(wc->dm_kcopyd);
1955 }
1956
1957 if (likely(wc->pause != 0)) {
1958 while (1) {
1959 unsigned long idle;
1960
1961 if (unlikely(wc->cleaner) || unlikely(wc->writeback_all) ||
1962 unlikely(dm_suspended(wc->ti)))
1963 break;
1964 idle = dm_iot_idle_time(&wc->iot);
1965 if (idle >= wc->pause)
1966 break;
1967 idle = wc->pause - idle;
1968 if (idle > HZ)
1969 idle = HZ;
1970 schedule_timeout_idle(idle);
1971 }
1972 }
1973
1974 wc_lock(wc);
1975 restart:
1976 if (writecache_has_error(wc)) {
1977 wc_unlock(wc);
1978 return;
1979 }
1980
1981 if (unlikely(wc->writeback_all)) {
1982 if (writecache_wait_for_writeback(wc))
1983 goto restart;
1984 }
1985
1986 if (wc->overwrote_committed)
1987 writecache_wait_for_ios(wc, WRITE);
1988
1989 n_walked = 0;
1990 INIT_LIST_HEAD(&skipped);
1991 INIT_LIST_HEAD(&wbl.list);
1992 wbl.size = 0;
1993 while (!list_empty(&wc->lru) &&
1994 (wc->writeback_all ||
1995 wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark ||
1996 (jiffies - container_of(wc->lru.prev, struct wc_entry, lru)->age >=
1997 wc->max_age - wc->max_age / MAX_AGE_DIV))) {
1998
1999 n_walked++;
2000 if (unlikely(n_walked > WRITEBACK_LATENCY) &&
2001 likely(!wc->writeback_all)) {
2002 if (likely(!dm_suspended(wc->ti)))
2003 queue_work(wc->writeback_wq, &wc->writeback_work);
2004 break;
2005 }
2006
2007 if (unlikely(wc->writeback_all)) {
2008 if (unlikely(!e)) {
2009 writecache_flush(wc);
2010 e = container_of(rb_first(&wc->tree), struct wc_entry, rb_node);
2011 } else
2012 e = g;
2013 } else
2014 e = container_of(wc->lru.prev, struct wc_entry, lru);
2015 BUG_ON(e->write_in_progress);
2016 if (unlikely(!writecache_entry_is_committed(wc, e)))
2017 writecache_flush(wc);
2018
2019 node = rb_prev(&e->rb_node);
2020 if (node) {
2021 f = container_of(node, struct wc_entry, rb_node);
2022 if (unlikely(read_original_sector(wc, f) ==
2023 read_original_sector(wc, e))) {
2024 BUG_ON(!f->write_in_progress);
2025 list_move(&e->lru, &skipped);
2026 cond_resched();
2027 continue;
2028 }
2029 }
2030 wc->writeback_size++;
2031 list_move(&e->lru, &wbl.list);
2032 wbl.size++;
2033 e->write_in_progress = true;
2034 e->wc_list_contiguous = 1;
2035
2036 f = e;
2037
2038 while (1) {
2039 next_node = rb_next(&f->rb_node);
2040 if (unlikely(!next_node))
2041 break;
2042 g = container_of(next_node, struct wc_entry, rb_node);
2043 if (unlikely(read_original_sector(wc, g) ==
2044 read_original_sector(wc, f))) {
2045 f = g;
2046 continue;
2047 }
2048 if (read_original_sector(wc, g) !=
2049 read_original_sector(wc, f) + (wc->block_size >> SECTOR_SHIFT))
2050 break;
2051 if (unlikely(g->write_in_progress))
2052 break;
2053 if (unlikely(!writecache_entry_is_committed(wc, g)))
2054 break;
2055
2056 if (!WC_MODE_PMEM(wc)) {
2057 if (g != f + 1)
2058 break;
2059 }
2060
2061 n_walked++;
2062 //if (unlikely(n_walked > WRITEBACK_LATENCY) && likely(!wc->writeback_all))
2063 // break;
2064
2065 wc->writeback_size++;
2066 list_move(&g->lru, &wbl.list);
2067 wbl.size++;
2068 g->write_in_progress = true;
2069 g->wc_list_contiguous = BIO_MAX_VECS;
2070 f = g;
2071 e->wc_list_contiguous++;
2072 if (unlikely(e->wc_list_contiguous == BIO_MAX_VECS)) {
2073 if (unlikely(wc->writeback_all)) {
2074 next_node = rb_next(&f->rb_node);
2075 if (likely(next_node))
2076 g = container_of(next_node, struct wc_entry, rb_node);
2077 }
2078 break;
2079 }
2080 }
2081 cond_resched();
2082 }
2083
2084 if (!list_empty(&skipped)) {
2085 list_splice_tail(&skipped, &wc->lru);
2086 /*
2087 * If we didn't make any progress, we must wait until some
2088 * writeback finishes to avoid burning CPU in a loop
2089 */
2090 if (unlikely(!wbl.size))
2091 writecache_wait_for_writeback(wc);
2092 }
2093
2094 wc_unlock(wc);
2095
2096 blk_start_plug(&plug);
2097
2098 if (WC_MODE_PMEM(wc))
2099 __writecache_writeback_pmem(wc, &wbl);
2100 else
2101 __writecache_writeback_ssd(wc, &wbl);
2102
2103 blk_finish_plug(&plug);
2104
2105 if (unlikely(wc->writeback_all)) {
2106 wc_lock(wc);
2107 while (writecache_wait_for_writeback(wc))
2108 ;
2109 wc_unlock(wc);
2110 }
2111 }
2112
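/*
 * Compute how many cache blocks fit on a cache device of the given size,
 * accounting for one struct wc_memory_entry of superblock metadata per block.
 * The estimate is decremented until the metadata area (rounded up to a block
 * boundary) plus the data area fits in the device, and the result must also
 * fit in the wc_entry::index bit field. Returns the number of data blocks
 * and the number of metadata blocks.
 */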
2113 static int calculate_memory_size(uint64_t device_size, unsigned int block_size,
2114 size_t *n_blocks_p, size_t *n_metadata_blocks_p)
2115 {
2116 uint64_t n_blocks, offset;
2117 struct wc_entry e;
2118
2119 n_blocks = device_size;
2120 do_div(n_blocks, block_size + sizeof(struct wc_memory_entry));
2121
2122 while (1) {
2123 if (!n_blocks)
2124 return -ENOSPC;
2125 /* Verify the following entries[n_blocks] won't overflow */
2126 if (n_blocks >= ((size_t)-sizeof(struct wc_memory_superblock) /
2127 sizeof(struct wc_memory_entry)))
2128 return -EFBIG;
2129 offset = offsetof(struct wc_memory_superblock, entries[n_blocks]);
2130 offset = (offset + block_size - 1) & ~(uint64_t)(block_size - 1);
2131 if (offset + n_blocks * block_size <= device_size)
2132 break;
2133 n_blocks--;
2134 }
2135
2136 /* check if the bit field overflows */
2137 e.index = n_blocks;
2138 if (e.index != n_blocks)
2139 return -EFBIG;
2140
2141 if (n_blocks_p)
2142 *n_blocks_p = n_blocks;
2143 if (n_metadata_blocks_p)
2144 *n_metadata_blocks_p = offset >> __ffs(block_size);
2145 return 0;
2146 }
2147
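/*
 * Format a new cache: allocate the in-core entries, write a superblock with
 * the version, block size, block count and a zero sequence count, mark every
 * entry as free, and flush and commit all metadata. The magic number is
 * written and committed last, so an interrupted initialization is not
 * mistaken for a valid cache.
 */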
2148 static int init_memory(struct dm_writecache *wc)
2149 {
2150 size_t b;
2151 int r;
2152
2153 r = calculate_memory_size(wc->memory_map_size, wc->block_size, &wc->n_blocks, NULL);
2154 if (r)
2155 return r;
2156
2157 r = writecache_alloc_entries(wc);
2158 if (r)
2159 return r;
2160
2161 for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++)
2162 pmem_assign(sb(wc)->padding[b], cpu_to_le64(0));
2163 pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION));
2164 pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size));
2165 pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks));
2166 pmem_assign(sb(wc)->seq_count, cpu_to_le64(0));
2167
2168 for (b = 0; b < wc->n_blocks; b++) {
2169 write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
2170 cond_resched();
2171 }
2172
2173 writecache_flush_all_metadata(wc);
2174 writecache_commit_flushed(wc, false);
2175 pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
2176 writecache_flush_region(wc, &sb(wc)->magic, sizeof(sb(wc)->magic));
2177 writecache_commit_flushed(wc, false);
2178
2179 return 0;
2180 }
2181
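/*
 * Destructor: stop the endio and flush threads and release the bio set,
 * mempool, workqueue, devices, memory map, kcopyd and dm-io clients, the
 * dirty bitmap and the writecache structure itself. It tolerates a partially
 * constructed target, so it also serves as the error path of writecache_ctr().
 */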
2182 static void writecache_dtr(struct dm_target *ti)
2183 {
2184 struct dm_writecache *wc = ti->private;
2185
2186 if (!wc)
2187 return;
2188
2189 if (wc->endio_thread)
2190 kthread_stop(wc->endio_thread);
2191
2192 if (wc->flush_thread)
2193 kthread_stop(wc->flush_thread);
2194
2195 bioset_exit(&wc->bio_set);
2196
2197 mempool_exit(&wc->copy_pool);
2198
2199 if (wc->writeback_wq)
2200 destroy_workqueue(wc->writeback_wq);
2201
2202 if (wc->dev)
2203 dm_put_device(ti, wc->dev);
2204
2205 if (wc->ssd_dev)
2206 dm_put_device(ti, wc->ssd_dev);
2207
2208 vfree(wc->entries);
2209
2210 if (wc->memory_map) {
2211 if (WC_MODE_PMEM(wc))
2212 persistent_memory_release(wc);
2213 else
2214 vfree(wc->memory_map);
2215 }
2216
2217 if (wc->dm_kcopyd)
2218 dm_kcopyd_client_destroy(wc->dm_kcopyd);
2219
2220 if (wc->dm_io)
2221 dm_io_client_destroy(wc->dm_io);
2222
2223 vfree(wc->dirty_bitmap);
2224
2225 kfree(wc);
2226 }
2227
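/*
 * Constructor. The table line has the form
 *	<p|s> <origin dev> <cache dev> <block size> <#opt args> [<opt arg>...]
 * for example (illustrative device names only):
 *	s /dev/sdX /dev/sdY 4096 2 high_watermark 60
 * It allocates the per-target resources, maps the persistent memory or reads
 * the on-SSD metadata, validates (or initializes) the superblock and computes
 * the freelist watermarks.
 */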
2228 static int writecache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
2229 {
2230 struct dm_writecache *wc;
2231 struct dm_arg_set as;
2232 const char *string;
2233 unsigned int opt_params;
2234 size_t offset, data_size;
2235 int i, r;
2236 char dummy;
2237 int high_wm_percent = HIGH_WATERMARK;
2238 int low_wm_percent = LOW_WATERMARK;
2239 uint64_t x;
2240 struct wc_memory_superblock s;
2241
2242 static struct dm_arg _args[] = {
2243 {0, 18, "Invalid number of feature args"},
2244 };
2245
2246 as.argc = argc;
2247 as.argv = argv;
2248
2249 wc = kzalloc(sizeof(struct dm_writecache), GFP_KERNEL);
2250 if (!wc) {
2251 ti->error = "Cannot allocate writecache structure";
2252 r = -ENOMEM;
2253 goto bad;
2254 }
2255 ti->private = wc;
2256 wc->ti = ti;
2257
2258 mutex_init(&wc->lock);
2259 wc->max_age = MAX_AGE_UNSPECIFIED;
2260 writecache_poison_lists(wc);
2261 init_waitqueue_head(&wc->freelist_wait);
2262 timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0);
2263 timer_setup(&wc->max_age_timer, writecache_max_age_timer, 0);
2264
2265 for (i = 0; i < 2; i++) {
2266 atomic_set(&wc->bio_in_progress[i], 0);
2267 init_waitqueue_head(&wc->bio_in_progress_wait[i]);
2268 }
2269
2270 wc->dm_io = dm_io_client_create();
2271 if (IS_ERR(wc->dm_io)) {
2272 r = PTR_ERR(wc->dm_io);
2273 ti->error = "Unable to allocate dm-io client";
2274 wc->dm_io = NULL;
2275 goto bad;
2276 }
2277
2278 wc->writeback_wq = alloc_workqueue("writecache-writeback", WQ_MEM_RECLAIM, 1);
2279 if (!wc->writeback_wq) {
2280 r = -ENOMEM;
2281 ti->error = "Could not allocate writeback workqueue";
2282 goto bad;
2283 }
2284 INIT_WORK(&wc->writeback_work, writecache_writeback);
2285 INIT_WORK(&wc->flush_work, writecache_flush_work);
2286
2287 dm_iot_init(&wc->iot);
2288
2289 raw_spin_lock_init(&wc->endio_list_lock);
2290 INIT_LIST_HEAD(&wc->endio_list);
2291 wc->endio_thread = kthread_run(writecache_endio_thread, wc, "writecache_endio");
2292 if (IS_ERR(wc->endio_thread)) {
2293 r = PTR_ERR(wc->endio_thread);
2294 wc->endio_thread = NULL;
2295 ti->error = "Couldn't spawn endio thread";
2296 goto bad;
2297 }
2298
2299 /*
2300 * Parse the mode (pmem or ssd)
2301 */
2302 string = dm_shift_arg(&as);
2303 if (!string)
2304 goto bad_arguments;
2305
2306 if (!strcasecmp(string, "s")) {
2307 wc->pmem_mode = false;
2308 } else if (!strcasecmp(string, "p")) {
2309 #ifdef DM_WRITECACHE_HAS_PMEM
2310 wc->pmem_mode = true;
2311 wc->writeback_fua = true;
2312 #else
2313 /*
2314 * If the architecture doesn't support persistent memory or
2315 * the kernel doesn't support any DAX drivers, this driver can
2316 * only be used in SSD-only mode.
2317 */
2318 r = -EOPNOTSUPP;
2319 ti->error = "Persistent memory or DAX not supported on this system";
2320 goto bad;
2321 #endif
2322 } else {
2323 goto bad_arguments;
2324 }
2325
2326 if (WC_MODE_PMEM(wc)) {
2327 r = bioset_init(&wc->bio_set, BIO_POOL_SIZE,
2328 offsetof(struct writeback_struct, bio),
2329 BIOSET_NEED_BVECS);
2330 if (r) {
2331 ti->error = "Could not allocate bio set";
2332 goto bad;
2333 }
2334 } else {
2335 wc->pause = PAUSE_WRITEBACK;
2336 r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct));
2337 if (r) {
2338 ti->error = "Could not allocate mempool";
2339 goto bad;
2340 }
2341 }
2342
2343 /*
2344 * Parse the origin data device
2345 */
2346 string = dm_shift_arg(&as);
2347 if (!string)
2348 goto bad_arguments;
2349 r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->dev);
2350 if (r) {
2351 ti->error = "Origin data device lookup failed";
2352 goto bad;
2353 }
2354
2355 /*
2356 * Parse cache data device (be it pmem or ssd)
2357 */
2358 string = dm_shift_arg(&as);
2359 if (!string)
2360 goto bad_arguments;
2361
2362 r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->ssd_dev);
2363 if (r) {
2364 ti->error = "Cache data device lookup failed";
2365 goto bad;
2366 }
2367 wc->memory_map_size = bdev_nr_bytes(wc->ssd_dev->bdev);
2368
2369 /*
2370 * Parse the cache block size
2371 */
2372 string = dm_shift_arg(&as);
2373 if (!string)
2374 goto bad_arguments;
2375 if (sscanf(string, "%u%c", &wc->block_size, &dummy) != 1 ||
2376 wc->block_size < 512 || wc->block_size > PAGE_SIZE ||
2377 (wc->block_size & (wc->block_size - 1))) {
2378 r = -EINVAL;
2379 ti->error = "Invalid block size";
2380 goto bad;
2381 }
2382 if (wc->block_size < bdev_logical_block_size(wc->dev->bdev) ||
2383 wc->block_size < bdev_logical_block_size(wc->ssd_dev->bdev)) {
2384 r = -EINVAL;
2385 ti->error = "Block size is smaller than device logical block size";
2386 goto bad;
2387 }
2388 wc->block_size_bits = __ffs(wc->block_size);
2389
2390 wc->max_writeback_jobs = MAX_WRITEBACK_JOBS;
2391 wc->autocommit_blocks = !WC_MODE_PMEM(wc) ? AUTOCOMMIT_BLOCKS_SSD : AUTOCOMMIT_BLOCKS_PMEM;
2392 wc->autocommit_jiffies = msecs_to_jiffies(AUTOCOMMIT_MSEC);
2393
2394 /*
2395 * Parse optional arguments
2396 */
2397 r = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
2398 if (r)
2399 goto bad;
2400
2401 while (opt_params) {
2402 string = dm_shift_arg(&as), opt_params--;
2403 if (!strcasecmp(string, "start_sector") && opt_params >= 1) {
2404 unsigned long long start_sector;
2405
2406 string = dm_shift_arg(&as), opt_params--;
2407 if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1)
2408 goto invalid_optional;
2409 wc->start_sector = start_sector;
2410 wc->start_sector_set = true;
2411 if (wc->start_sector != start_sector ||
2412 wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT)
2413 goto invalid_optional;
2414 } else if (!strcasecmp(string, "high_watermark") && opt_params >= 1) {
2415 string = dm_shift_arg(&as), opt_params--;
2416 if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1)
2417 goto invalid_optional;
2418 if (high_wm_percent < 0 || high_wm_percent > 100)
2419 goto invalid_optional;
2420 wc->high_wm_percent_value = high_wm_percent;
2421 wc->high_wm_percent_set = true;
2422 } else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) {
2423 string = dm_shift_arg(&as), opt_params--;
2424 if (sscanf(string, "%d%c", &low_wm_percent, &dummy) != 1)
2425 goto invalid_optional;
2426 if (low_wm_percent < 0 || low_wm_percent > 100)
2427 goto invalid_optional;
2428 wc->low_wm_percent_value = low_wm_percent;
2429 wc->low_wm_percent_set = true;
2430 } else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) {
2431 string = dm_shift_arg(&as), opt_params--;
2432 if (sscanf(string, "%u%c", &wc->max_writeback_jobs, &dummy) != 1)
2433 goto invalid_optional;
2434 wc->max_writeback_jobs_set = true;
2435 } else if (!strcasecmp(string, "autocommit_blocks") && opt_params >= 1) {
2436 string = dm_shift_arg(&as), opt_params--;
2437 if (sscanf(string, "%u%c", &wc->autocommit_blocks, &dummy) != 1)
2438 goto invalid_optional;
2439 wc->autocommit_blocks_set = true;
2440 } else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) {
2441 unsigned int autocommit_msecs;
2442
2443 string = dm_shift_arg(&as), opt_params--;
2444 if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1)
2445 goto invalid_optional;
2446 if (autocommit_msecs > 3600000)
2447 goto invalid_optional;
2448 wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
2449 wc->autocommit_time_value = autocommit_msecs;
2450 wc->autocommit_time_set = true;
2451 } else if (!strcasecmp(string, "max_age") && opt_params >= 1) {
2452 unsigned int max_age_msecs;
2453
2454 string = dm_shift_arg(&as), opt_params--;
2455 if (sscanf(string, "%u%c", &max_age_msecs, &dummy) != 1)
2456 goto invalid_optional;
2457 if (max_age_msecs > 86400000)
2458 goto invalid_optional;
2459 wc->max_age = msecs_to_jiffies(max_age_msecs);
2460 wc->max_age_set = true;
2461 wc->max_age_value = max_age_msecs;
2462 } else if (!strcasecmp(string, "cleaner")) {
2463 wc->cleaner_set = true;
2464 wc->cleaner = true;
2465 } else if (!strcasecmp(string, "fua")) {
2466 if (WC_MODE_PMEM(wc)) {
2467 wc->writeback_fua = true;
2468 wc->writeback_fua_set = true;
2469 } else
2470 goto invalid_optional;
2471 } else if (!strcasecmp(string, "nofua")) {
2472 if (WC_MODE_PMEM(wc)) {
2473 wc->writeback_fua = false;
2474 wc->writeback_fua_set = true;
2475 } else
2476 goto invalid_optional;
2477 } else if (!strcasecmp(string, "metadata_only")) {
2478 wc->metadata_only = true;
2479 } else if (!strcasecmp(string, "pause_writeback") && opt_params >= 1) {
2480 unsigned int pause_msecs;
2481
2482 if (WC_MODE_PMEM(wc))
2483 goto invalid_optional;
2484 string = dm_shift_arg(&as), opt_params--;
2485 if (sscanf(string, "%u%c", &pause_msecs, &dummy) != 1)
2486 goto invalid_optional;
2487 if (pause_msecs > 60000)
2488 goto invalid_optional;
2489 wc->pause = msecs_to_jiffies(pause_msecs);
2490 wc->pause_set = true;
2491 wc->pause_value = pause_msecs;
2492 } else {
2493 invalid_optional:
2494 r = -EINVAL;
2495 ti->error = "Invalid optional argument";
2496 goto bad;
2497 }
2498 }
2499
2500 if (high_wm_percent < low_wm_percent) {
2501 r = -EINVAL;
2502 ti->error = "High watermark must be greater than or equal to low watermark";
2503 goto bad;
2504 }
2505
2506 if (WC_MODE_PMEM(wc)) {
2507 if (!dax_synchronous(wc->ssd_dev->dax_dev)) {
2508 r = -EOPNOTSUPP;
2509 ti->error = "Asynchronous persistent memory not supported as pmem cache";
2510 goto bad;
2511 }
2512
2513 r = persistent_memory_claim(wc);
2514 if (r) {
2515 ti->error = "Unable to map persistent memory for cache";
2516 goto bad;
2517 }
2518 } else {
2519 size_t n_blocks, n_metadata_blocks;
2520 uint64_t n_bitmap_bits;
2521
2522 wc->memory_map_size -= (uint64_t)wc->start_sector << SECTOR_SHIFT;
2523
2524 bio_list_init(&wc->flush_list);
2525 wc->flush_thread = kthread_run(writecache_flush_thread, wc, "dm_writecache_flush");
2526 if (IS_ERR(wc->flush_thread)) {
2527 r = PTR_ERR(wc->flush_thread);
2528 wc->flush_thread = NULL;
2529 ti->error = "Couldn't spawn flush thread";
2530 goto bad;
2531 }
2532
2533 r = calculate_memory_size(wc->memory_map_size, wc->block_size,
2534 &n_blocks, &n_metadata_blocks);
2535 if (r) {
2536 ti->error = "Invalid device size";
2537 goto bad;
2538 }
2539
2540 n_bitmap_bits = (((uint64_t)n_metadata_blocks << wc->block_size_bits) +
2541 BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY;
2542 /* this is a limitation of the test_bit functions */
2543 if (n_bitmap_bits > 1U << 31) {
2544 r = -EFBIG;
2545 ti->error = "Invalid device size";
2546 goto bad;
2547 }
2548
2549 wc->memory_map = vmalloc(n_metadata_blocks << wc->block_size_bits);
2550 if (!wc->memory_map) {
2551 r = -ENOMEM;
2552 ti->error = "Unable to allocate memory for metadata";
2553 goto bad;
2554 }
2555
2556 wc->dm_kcopyd = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2557 if (IS_ERR(wc->dm_kcopyd)) {
2558 r = PTR_ERR(wc->dm_kcopyd);
2559 ti->error = "Unable to allocate dm-kcopyd client";
2560 wc->dm_kcopyd = NULL;
2561 goto bad;
2562 }
2563
2564 wc->metadata_sectors = n_metadata_blocks << (wc->block_size_bits - SECTOR_SHIFT);
2565 wc->dirty_bitmap_size = (n_bitmap_bits + BITS_PER_LONG - 1) /
2566 BITS_PER_LONG * sizeof(unsigned long);
2567 wc->dirty_bitmap = vzalloc(wc->dirty_bitmap_size);
2568 if (!wc->dirty_bitmap) {
2569 r = -ENOMEM;
2570 ti->error = "Unable to allocate dirty bitmap";
2571 goto bad;
2572 }
2573
2574 r = writecache_read_metadata(wc, wc->block_size >> SECTOR_SHIFT);
2575 if (r) {
2576 ti->error = "Unable to read first block of metadata";
2577 goto bad;
2578 }
2579 }
2580
2581 r = copy_mc_to_kernel(&s, sb(wc), sizeof(struct wc_memory_superblock));
2582 if (r) {
2583 ti->error = "Hardware memory error when reading superblock";
2584 goto bad;
2585 }
2586 if (!le32_to_cpu(s.magic) && !le32_to_cpu(s.version)) {
2587 r = init_memory(wc);
2588 if (r) {
2589 ti->error = "Unable to initialize device";
2590 goto bad;
2591 }
2592 r = copy_mc_to_kernel(&s, sb(wc),
2593 sizeof(struct wc_memory_superblock));
2594 if (r) {
2595 ti->error = "Hardware memory error when reading superblock";
2596 goto bad;
2597 }
2598 }
2599
2600 if (le32_to_cpu(s.magic) != MEMORY_SUPERBLOCK_MAGIC) {
2601 ti->error = "Invalid magic in the superblock";
2602 r = -EINVAL;
2603 goto bad;
2604 }
2605
2606 if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) {
2607 ti->error = "Invalid version in the superblock";
2608 r = -EINVAL;
2609 goto bad;
2610 }
2611
2612 if (le32_to_cpu(s.block_size) != wc->block_size) {
2613 ti->error = "Block size does not match superblock";
2614 r = -EINVAL;
2615 goto bad;
2616 }
2617
2618 wc->n_blocks = le64_to_cpu(s.n_blocks);
2619
2620 offset = wc->n_blocks * sizeof(struct wc_memory_entry);
2621 if (offset / sizeof(struct wc_memory_entry) != le64_to_cpu(sb(wc)->n_blocks)) {
2622 overflow:
2623 ti->error = "Overflow in size calculation";
2624 r = -EINVAL;
2625 goto bad;
2626 }
2627 offset += sizeof(struct wc_memory_superblock);
2628 if (offset < sizeof(struct wc_memory_superblock))
2629 goto overflow;
2630 offset = (offset + wc->block_size - 1) & ~(size_t)(wc->block_size - 1);
2631 data_size = wc->n_blocks * (size_t)wc->block_size;
2632 if (!offset || (data_size / wc->block_size != wc->n_blocks) ||
2633 (offset + data_size < offset))
2634 goto overflow;
2635 if (offset + data_size > wc->memory_map_size) {
2636 ti->error = "Memory area is too small";
2637 r = -EINVAL;
2638 goto bad;
2639 }
2640
2641 wc->metadata_sectors = offset >> SECTOR_SHIFT;
2642 wc->block_start = (char *)sb(wc) + offset;
2643
2644 x = (uint64_t)wc->n_blocks * (100 - high_wm_percent);
2645 x += 50;
2646 do_div(x, 100);
2647 wc->freelist_high_watermark = x;
2648 x = (uint64_t)wc->n_blocks * (100 - low_wm_percent);
2649 x += 50;
2650 do_div(x, 100);
2651 wc->freelist_low_watermark = x;
2652
2653 if (wc->cleaner)
2654 activate_cleaner(wc);
2655
2656 r = writecache_alloc_entries(wc);
2657 if (r) {
2658 ti->error = "Cannot allocate memory";
2659 goto bad;
2660 }
2661
2662 ti->num_flush_bios = WC_MODE_PMEM(wc) ? 1 : 2;
2663 ti->flush_supported = true;
2664 ti->num_discard_bios = 1;
2665
2666 if (WC_MODE_PMEM(wc))
2667 persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
2668
2669 return 0;
2670
2671 bad_arguments:
2672 r = -EINVAL;
2673 ti->error = "Bad arguments";
2674 bad:
2675 writecache_dtr(ti);
2676 return r;
2677 }
2678
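/*
 * Report target status: STATUSTYPE_INFO emits the error flag, block counts
 * and the statistics counters; STATUSTYPE_TABLE re-emits the constructor
 * arguments including any optional arguments that were explicitly set;
 * STATUSTYPE_IMA emits an empty string.
 */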
2679 static void writecache_status(struct dm_target *ti, status_type_t type,
2680 unsigned int status_flags, char *result, unsigned int maxlen)
2681 {
2682 struct dm_writecache *wc = ti->private;
2683 unsigned int extra_args;
2684 unsigned int sz = 0;
2685
2686 switch (type) {
2687 case STATUSTYPE_INFO:
2688 DMEMIT("%ld %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
2689 writecache_has_error(wc),
2690 (unsigned long long)wc->n_blocks, (unsigned long long)wc->freelist_size,
2691 (unsigned long long)wc->writeback_size,
2692 wc->stats.reads,
2693 wc->stats.read_hits,
2694 wc->stats.writes,
2695 wc->stats.write_hits_uncommitted,
2696 wc->stats.write_hits_committed,
2697 wc->stats.writes_around,
2698 wc->stats.writes_allocate,
2699 wc->stats.writes_blocked_on_freelist,
2700 wc->stats.flushes,
2701 wc->stats.discards);
2702 break;
2703 case STATUSTYPE_TABLE:
2704 DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
2705 wc->dev->name, wc->ssd_dev->name, wc->block_size);
2706 extra_args = 0;
2707 if (wc->start_sector_set)
2708 extra_args += 2;
2709 if (wc->high_wm_percent_set)
2710 extra_args += 2;
2711 if (wc->low_wm_percent_set)
2712 extra_args += 2;
2713 if (wc->max_writeback_jobs_set)
2714 extra_args += 2;
2715 if (wc->autocommit_blocks_set)
2716 extra_args += 2;
2717 if (wc->autocommit_time_set)
2718 extra_args += 2;
2719 if (wc->max_age_set)
2720 extra_args += 2;
2721 if (wc->cleaner_set)
2722 extra_args++;
2723 if (wc->writeback_fua_set)
2724 extra_args++;
2725 if (wc->metadata_only)
2726 extra_args++;
2727 if (wc->pause_set)
2728 extra_args += 2;
2729
2730 DMEMIT("%u", extra_args);
2731 if (wc->start_sector_set)
2732 DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector);
2733 if (wc->high_wm_percent_set)
2734 DMEMIT(" high_watermark %u", wc->high_wm_percent_value);
2735 if (wc->low_wm_percent_set)
2736 DMEMIT(" low_watermark %u", wc->low_wm_percent_value);
2737 if (wc->max_writeback_jobs_set)
2738 DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs);
2739 if (wc->autocommit_blocks_set)
2740 DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
2741 if (wc->autocommit_time_set)
2742 DMEMIT(" autocommit_time %u", wc->autocommit_time_value);
2743 if (wc->max_age_set)
2744 DMEMIT(" max_age %u", wc->max_age_value);
2745 if (wc->cleaner_set)
2746 DMEMIT(" cleaner");
2747 if (wc->writeback_fua_set)
2748 DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
2749 if (wc->metadata_only)
2750 DMEMIT(" metadata_only");
2751 if (wc->pause_set)
2752 DMEMIT(" pause_writeback %u", wc->pause_value);
2753 break;
2754 case STATUSTYPE_IMA:
2755 *result = '\0';
2756 break;
2757 }
2758 }
2759
2760 static struct target_type writecache_target = {
2761 .name = "writecache",
2762 .version = {1, 6, 0},
2763 .module = THIS_MODULE,
2764 .ctr = writecache_ctr,
2765 .dtr = writecache_dtr,
2766 .status = writecache_status,
2767 .postsuspend = writecache_suspend,
2768 .resume = writecache_resume,
2769 .message = writecache_message,
2770 .map = writecache_map,
2771 .end_io = writecache_end_io,
2772 .iterate_devices = writecache_iterate_devices,
2773 .io_hints = writecache_io_hints,
2774 };
2775 module_dm(writecache);
2776
2777 MODULE_DESCRIPTION(DM_NAME " writecache target");
2778 MODULE_AUTHOR("Mikulas Patocka <dm-devel@lists.linux.dev>");
2779 MODULE_LICENSE("GPL");
2780