1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2018 Red Hat. All rights reserved. 4 * 5 * This file is released under the GPL. 6 */ 7 8 #include <linux/device-mapper.h> 9 #include <linux/module.h> 10 #include <linux/init.h> 11 #include <linux/vmalloc.h> 12 #include <linux/kthread.h> 13 #include <linux/dm-io.h> 14 #include <linux/dm-kcopyd.h> 15 #include <linux/dax.h> 16 #include <linux/pfn_t.h> 17 #include <linux/libnvdimm.h> 18 #include <linux/delay.h> 19 #include "dm-io-tracker.h" 20 21 #define DM_MSG_PREFIX "writecache" 22 23 #define HIGH_WATERMARK 50 24 #define LOW_WATERMARK 45 25 #define MAX_WRITEBACK_JOBS min(0x10000000 / PAGE_SIZE, totalram_pages() / 16) 26 #define ENDIO_LATENCY 16 27 #define WRITEBACK_LATENCY 64 28 #define AUTOCOMMIT_BLOCKS_SSD 65536 29 #define AUTOCOMMIT_BLOCKS_PMEM 64 30 #define AUTOCOMMIT_MSEC 1000 31 #define MAX_AGE_DIV 16 32 #define MAX_AGE_UNSPECIFIED -1UL 33 #define PAUSE_WRITEBACK (HZ * 3) 34 35 #define BITMAP_GRANULARITY 65536 36 #if BITMAP_GRANULARITY < PAGE_SIZE 37 #undef BITMAP_GRANULARITY 38 #define BITMAP_GRANULARITY PAGE_SIZE 39 #endif 40 41 #if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_FS_DAX) 42 #define DM_WRITECACHE_HAS_PMEM 43 #endif 44 45 #ifdef DM_WRITECACHE_HAS_PMEM 46 #define pmem_assign(dest, src) \ 47 do { \ 48 typeof(dest) uniq = (src); \ 49 memcpy_flushcache(&(dest), &uniq, sizeof(dest)); \ 50 } while (0) 51 #else 52 #define pmem_assign(dest, src) ((dest) = (src)) 53 #endif 54 55 #if IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC) && defined(DM_WRITECACHE_HAS_PMEM) 56 #define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 57 #endif 58 59 #define MEMORY_SUPERBLOCK_MAGIC 0x23489321 60 #define MEMORY_SUPERBLOCK_VERSION 1 61 62 struct wc_memory_entry { 63 __le64 original_sector; 64 __le64 seq_count; 65 }; 66 67 struct wc_memory_superblock { 68 union { 69 struct { 70 __le32 magic; 71 __le32 version; 72 __le32 block_size; 73 __le32 pad; 74 __le64 n_blocks; 75 __le64 seq_count; 76 }; 77 __le64 padding[8]; 78 }; 79 struct wc_memory_entry entries[]; 80 }; 81 82 struct wc_entry { 83 struct rb_node rb_node; 84 struct list_head lru; 85 unsigned short wc_list_contiguous; 86 #if BITS_PER_LONG == 64 87 bool write_in_progress : 1; 88 unsigned long index : 47; 89 #else 90 bool write_in_progress; 91 unsigned long index; 92 #endif 93 unsigned long age; 94 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 95 uint64_t original_sector; 96 uint64_t seq_count; 97 #endif 98 }; 99 100 #ifdef DM_WRITECACHE_HAS_PMEM 101 #define WC_MODE_PMEM(wc) ((wc)->pmem_mode) 102 #define WC_MODE_FUA(wc) ((wc)->writeback_fua) 103 #else 104 #define WC_MODE_PMEM(wc) false 105 #define WC_MODE_FUA(wc) false 106 #endif 107 #define WC_MODE_SORT_FREELIST(wc) (!WC_MODE_PMEM(wc)) 108 109 struct dm_writecache { 110 struct mutex lock; 111 struct list_head lru; 112 union { 113 struct list_head freelist; 114 struct { 115 struct rb_root freetree; 116 struct wc_entry *current_free; 117 }; 118 }; 119 struct rb_root tree; 120 121 size_t freelist_size; 122 size_t writeback_size; 123 size_t freelist_high_watermark; 124 size_t freelist_low_watermark; 125 unsigned long max_age; 126 unsigned long pause; 127 128 unsigned int uncommitted_blocks; 129 unsigned int autocommit_blocks; 130 unsigned int max_writeback_jobs; 131 132 int error; 133 134 unsigned long autocommit_jiffies; 135 struct timer_list autocommit_timer; 136 struct wait_queue_head freelist_wait; 137 138 struct timer_list max_age_timer; 139 140 atomic_t bio_in_progress[2]; 141 struct wait_queue_head bio_in_progress_wait[2]; 142 143 struct dm_target *ti; 144 struct dm_dev *dev; 145 struct dm_dev *ssd_dev; 146 sector_t start_sector; 147 void *memory_map; 148 uint64_t memory_map_size; 149 size_t metadata_sectors; 150 size_t n_blocks; 151 uint64_t seq_count; 152 sector_t data_device_sectors; 153 void *block_start; 154 struct wc_entry *entries; 155 unsigned int block_size; 156 unsigned char block_size_bits; 157 158 bool pmem_mode:1; 159 bool writeback_fua:1; 160 161 bool overwrote_committed:1; 162 bool memory_vmapped:1; 163 164 bool start_sector_set:1; 165 bool high_wm_percent_set:1; 166 bool low_wm_percent_set:1; 167 bool max_writeback_jobs_set:1; 168 bool autocommit_blocks_set:1; 169 bool autocommit_time_set:1; 170 bool max_age_set:1; 171 bool writeback_fua_set:1; 172 bool flush_on_suspend:1; 173 bool cleaner:1; 174 bool cleaner_set:1; 175 bool metadata_only:1; 176 bool pause_set:1; 177 178 unsigned int high_wm_percent_value; 179 unsigned int low_wm_percent_value; 180 unsigned int autocommit_time_value; 181 unsigned int max_age_value; 182 unsigned int pause_value; 183 184 unsigned int writeback_all; 185 struct workqueue_struct *writeback_wq; 186 struct work_struct writeback_work; 187 struct work_struct flush_work; 188 189 struct dm_io_tracker iot; 190 191 struct dm_io_client *dm_io; 192 193 raw_spinlock_t endio_list_lock; 194 struct list_head endio_list; 195 struct task_struct *endio_thread; 196 197 struct task_struct *flush_thread; 198 struct bio_list flush_list; 199 200 struct dm_kcopyd_client *dm_kcopyd; 201 unsigned long *dirty_bitmap; 202 unsigned int dirty_bitmap_size; 203 204 struct bio_set bio_set; 205 mempool_t copy_pool; 206 207 struct { 208 unsigned long long reads; 209 unsigned long long read_hits; 210 unsigned long long writes; 211 unsigned long long write_hits_uncommitted; 212 unsigned long long write_hits_committed; 213 unsigned long long writes_around; 214 unsigned long long writes_allocate; 215 unsigned long long writes_blocked_on_freelist; 216 unsigned long long flushes; 217 unsigned long long discards; 218 } stats; 219 }; 220 221 #define WB_LIST_INLINE 16 222 223 struct writeback_struct { 224 struct list_head endio_entry; 225 struct dm_writecache *wc; 226 struct wc_entry **wc_list; 227 unsigned int wc_list_n; 228 struct wc_entry *wc_list_inline[WB_LIST_INLINE]; 229 struct bio bio; 230 }; 231 232 struct copy_struct { 233 struct list_head endio_entry; 234 struct dm_writecache *wc; 235 struct wc_entry *e; 236 unsigned int n_entries; 237 int error; 238 }; 239 240 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(dm_writecache_throttle, 241 "A percentage of time allocated for data copying"); 242 243 static void wc_lock(struct dm_writecache *wc) 244 { 245 mutex_lock(&wc->lock); 246 } 247 248 static void wc_unlock(struct dm_writecache *wc) 249 { 250 mutex_unlock(&wc->lock); 251 } 252 253 #ifdef DM_WRITECACHE_HAS_PMEM 254 static int persistent_memory_claim(struct dm_writecache *wc) 255 { 256 int r; 257 loff_t s; 258 long p, da; 259 pfn_t pfn; 260 int id; 261 struct page **pages; 262 sector_t offset; 263 264 wc->memory_vmapped = false; 265 266 s = wc->memory_map_size; 267 p = s >> PAGE_SHIFT; 268 if (!p) { 269 r = -EINVAL; 270 goto err1; 271 } 272 if (p != s >> PAGE_SHIFT) { 273 r = -EOVERFLOW; 274 goto err1; 275 } 276 277 offset = get_start_sect(wc->ssd_dev->bdev); 278 if (offset & (PAGE_SIZE / 512 - 1)) { 279 r = -EINVAL; 280 goto err1; 281 } 282 offset >>= PAGE_SHIFT - 9; 283 284 id = dax_read_lock(); 285 286 da = dax_direct_access(wc->ssd_dev->dax_dev, offset, p, DAX_ACCESS, 287 &wc->memory_map, &pfn); 288 if (da < 0) { 289 wc->memory_map = NULL; 290 r = da; 291 goto err2; 292 } 293 if (!pfn_t_has_page(pfn)) { 294 wc->memory_map = NULL; 295 r = -EOPNOTSUPP; 296 goto err2; 297 } 298 if (da != p) { 299 long i; 300 301 wc->memory_map = NULL; 302 pages = kvmalloc_array(p, sizeof(struct page *), GFP_KERNEL); 303 if (!pages) { 304 r = -ENOMEM; 305 goto err2; 306 } 307 i = 0; 308 do { 309 long daa; 310 311 daa = dax_direct_access(wc->ssd_dev->dax_dev, offset + i, 312 p - i, DAX_ACCESS, NULL, &pfn); 313 if (daa <= 0) { 314 r = daa ? daa : -EINVAL; 315 goto err3; 316 } 317 if (!pfn_t_has_page(pfn)) { 318 r = -EOPNOTSUPP; 319 goto err3; 320 } 321 while (daa-- && i < p) { 322 pages[i++] = pfn_t_to_page(pfn); 323 pfn.val++; 324 if (!(i & 15)) 325 cond_resched(); 326 } 327 } while (i < p); 328 wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL); 329 if (!wc->memory_map) { 330 r = -ENOMEM; 331 goto err3; 332 } 333 kvfree(pages); 334 wc->memory_vmapped = true; 335 } 336 337 dax_read_unlock(id); 338 339 wc->memory_map += (size_t)wc->start_sector << SECTOR_SHIFT; 340 wc->memory_map_size -= (size_t)wc->start_sector << SECTOR_SHIFT; 341 342 return 0; 343 err3: 344 kvfree(pages); 345 err2: 346 dax_read_unlock(id); 347 err1: 348 return r; 349 } 350 #else 351 static int persistent_memory_claim(struct dm_writecache *wc) 352 { 353 return -EOPNOTSUPP; 354 } 355 #endif 356 357 static void persistent_memory_release(struct dm_writecache *wc) 358 { 359 if (wc->memory_vmapped) 360 vunmap(wc->memory_map - ((size_t)wc->start_sector << SECTOR_SHIFT)); 361 } 362 363 static struct page *persistent_memory_page(void *addr) 364 { 365 if (is_vmalloc_addr(addr)) 366 return vmalloc_to_page(addr); 367 else 368 return virt_to_page(addr); 369 } 370 371 static unsigned int persistent_memory_page_offset(void *addr) 372 { 373 return (unsigned long)addr & (PAGE_SIZE - 1); 374 } 375 376 static void persistent_memory_flush_cache(void *ptr, size_t size) 377 { 378 if (is_vmalloc_addr(ptr)) 379 flush_kernel_vmap_range(ptr, size); 380 } 381 382 static void persistent_memory_invalidate_cache(void *ptr, size_t size) 383 { 384 if (is_vmalloc_addr(ptr)) 385 invalidate_kernel_vmap_range(ptr, size); 386 } 387 388 static struct wc_memory_superblock *sb(struct dm_writecache *wc) 389 { 390 return wc->memory_map; 391 } 392 393 static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e) 394 { 395 return &sb(wc)->entries[e->index]; 396 } 397 398 static void *memory_data(struct dm_writecache *wc, struct wc_entry *e) 399 { 400 return (char *)wc->block_start + (e->index << wc->block_size_bits); 401 } 402 403 static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e) 404 { 405 return wc->start_sector + wc->metadata_sectors + 406 ((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT)); 407 } 408 409 static uint64_t read_original_sector(struct dm_writecache *wc, struct wc_entry *e) 410 { 411 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 412 return e->original_sector; 413 #else 414 return le64_to_cpu(memory_entry(wc, e)->original_sector); 415 #endif 416 } 417 418 static uint64_t read_seq_count(struct dm_writecache *wc, struct wc_entry *e) 419 { 420 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 421 return e->seq_count; 422 #else 423 return le64_to_cpu(memory_entry(wc, e)->seq_count); 424 #endif 425 } 426 427 static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e) 428 { 429 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 430 e->seq_count = -1; 431 #endif 432 pmem_assign(memory_entry(wc, e)->seq_count, cpu_to_le64(-1)); 433 } 434 435 static void write_original_sector_seq_count(struct dm_writecache *wc, struct wc_entry *e, 436 uint64_t original_sector, uint64_t seq_count) 437 { 438 struct wc_memory_entry me; 439 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 440 e->original_sector = original_sector; 441 e->seq_count = seq_count; 442 #endif 443 me.original_sector = cpu_to_le64(original_sector); 444 me.seq_count = cpu_to_le64(seq_count); 445 pmem_assign(*memory_entry(wc, e), me); 446 } 447 448 #define writecache_error(wc, err, msg, arg...) \ 449 do { \ 450 if (!cmpxchg(&(wc)->error, 0, err)) \ 451 DMERR(msg, ##arg); \ 452 wake_up(&(wc)->freelist_wait); \ 453 } while (0) 454 455 #define writecache_has_error(wc) (unlikely(READ_ONCE((wc)->error))) 456 457 static void writecache_flush_all_metadata(struct dm_writecache *wc) 458 { 459 if (!WC_MODE_PMEM(wc)) 460 memset(wc->dirty_bitmap, -1, wc->dirty_bitmap_size); 461 } 462 463 static void writecache_flush_region(struct dm_writecache *wc, void *ptr, size_t size) 464 { 465 if (!WC_MODE_PMEM(wc)) 466 __set_bit(((char *)ptr - (char *)wc->memory_map) / BITMAP_GRANULARITY, 467 wc->dirty_bitmap); 468 } 469 470 static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev); 471 472 struct io_notify { 473 struct dm_writecache *wc; 474 struct completion c; 475 atomic_t count; 476 }; 477 478 static void writecache_notify_io(unsigned long error, void *context) 479 { 480 struct io_notify *endio = context; 481 482 if (unlikely(error != 0)) 483 writecache_error(endio->wc, -EIO, "error writing metadata"); 484 BUG_ON(atomic_read(&endio->count) <= 0); 485 if (atomic_dec_and_test(&endio->count)) 486 complete(&endio->c); 487 } 488 489 static void writecache_wait_for_ios(struct dm_writecache *wc, int direction) 490 { 491 wait_event(wc->bio_in_progress_wait[direction], 492 !atomic_read(&wc->bio_in_progress[direction])); 493 } 494 495 static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios) 496 { 497 struct dm_io_region region; 498 struct dm_io_request req; 499 struct io_notify endio = { 500 wc, 501 COMPLETION_INITIALIZER_ONSTACK(endio.c), 502 ATOMIC_INIT(1), 503 }; 504 unsigned int bitmap_bits = wc->dirty_bitmap_size * 8; 505 unsigned int i = 0; 506 507 while (1) { 508 unsigned int j; 509 510 i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i); 511 if (unlikely(i == bitmap_bits)) 512 break; 513 j = find_next_zero_bit(wc->dirty_bitmap, bitmap_bits, i); 514 515 region.bdev = wc->ssd_dev->bdev; 516 region.sector = (sector_t)i * (BITMAP_GRANULARITY >> SECTOR_SHIFT); 517 region.count = (sector_t)(j - i) * (BITMAP_GRANULARITY >> SECTOR_SHIFT); 518 519 if (unlikely(region.sector >= wc->metadata_sectors)) 520 break; 521 if (unlikely(region.sector + region.count > wc->metadata_sectors)) 522 region.count = wc->metadata_sectors - region.sector; 523 524 region.sector += wc->start_sector; 525 atomic_inc(&endio.count); 526 req.bi_opf = REQ_OP_WRITE | REQ_SYNC; 527 req.mem.type = DM_IO_VMA; 528 req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY; 529 req.client = wc->dm_io; 530 req.notify.fn = writecache_notify_io; 531 req.notify.context = &endio; 532 533 /* writing via async dm-io (implied by notify.fn above) won't return an error */ 534 (void) dm_io(&req, 1, ®ion, NULL); 535 i = j; 536 } 537 538 writecache_notify_io(0, &endio); 539 wait_for_completion_io(&endio.c); 540 541 if (wait_for_ios) 542 writecache_wait_for_ios(wc, WRITE); 543 544 writecache_disk_flush(wc, wc->ssd_dev); 545 546 memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size); 547 } 548 549 static void ssd_commit_superblock(struct dm_writecache *wc) 550 { 551 int r; 552 struct dm_io_region region; 553 struct dm_io_request req; 554 555 region.bdev = wc->ssd_dev->bdev; 556 region.sector = 0; 557 region.count = max(4096U, wc->block_size) >> SECTOR_SHIFT; 558 559 if (unlikely(region.sector + region.count > wc->metadata_sectors)) 560 region.count = wc->metadata_sectors - region.sector; 561 562 region.sector += wc->start_sector; 563 564 req.bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_FUA; 565 req.mem.type = DM_IO_VMA; 566 req.mem.ptr.vma = (char *)wc->memory_map; 567 req.client = wc->dm_io; 568 req.notify.fn = NULL; 569 req.notify.context = NULL; 570 571 r = dm_io(&req, 1, ®ion, NULL); 572 if (unlikely(r)) 573 writecache_error(wc, r, "error writing superblock"); 574 } 575 576 static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios) 577 { 578 if (WC_MODE_PMEM(wc)) 579 pmem_wmb(); 580 else 581 ssd_commit_flushed(wc, wait_for_ios); 582 } 583 584 static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev) 585 { 586 int r; 587 struct dm_io_region region; 588 struct dm_io_request req; 589 590 region.bdev = dev->bdev; 591 region.sector = 0; 592 region.count = 0; 593 req.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; 594 req.mem.type = DM_IO_KMEM; 595 req.mem.ptr.addr = NULL; 596 req.client = wc->dm_io; 597 req.notify.fn = NULL; 598 599 r = dm_io(&req, 1, ®ion, NULL); 600 if (unlikely(r)) 601 writecache_error(wc, r, "error flushing metadata: %d", r); 602 } 603 604 #define WFE_RETURN_FOLLOWING 1 605 #define WFE_LOWEST_SEQ 2 606 607 static struct wc_entry *writecache_find_entry(struct dm_writecache *wc, 608 uint64_t block, int flags) 609 { 610 struct wc_entry *e; 611 struct rb_node *node = wc->tree.rb_node; 612 613 if (unlikely(!node)) 614 return NULL; 615 616 while (1) { 617 e = container_of(node, struct wc_entry, rb_node); 618 if (read_original_sector(wc, e) == block) 619 break; 620 621 node = (read_original_sector(wc, e) >= block ? 622 e->rb_node.rb_left : e->rb_node.rb_right); 623 if (unlikely(!node)) { 624 if (!(flags & WFE_RETURN_FOLLOWING)) 625 return NULL; 626 if (read_original_sector(wc, e) >= block) 627 return e; 628 629 node = rb_next(&e->rb_node); 630 if (unlikely(!node)) 631 return NULL; 632 633 e = container_of(node, struct wc_entry, rb_node); 634 return e; 635 } 636 } 637 638 while (1) { 639 struct wc_entry *e2; 640 641 if (flags & WFE_LOWEST_SEQ) 642 node = rb_prev(&e->rb_node); 643 else 644 node = rb_next(&e->rb_node); 645 if (unlikely(!node)) 646 return e; 647 e2 = container_of(node, struct wc_entry, rb_node); 648 if (read_original_sector(wc, e2) != block) 649 return e; 650 e = e2; 651 } 652 } 653 654 static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *ins) 655 { 656 struct wc_entry *e; 657 struct rb_node **node = &wc->tree.rb_node, *parent = NULL; 658 659 while (*node) { 660 e = container_of(*node, struct wc_entry, rb_node); 661 parent = &e->rb_node; 662 if (read_original_sector(wc, e) > read_original_sector(wc, ins)) 663 node = &parent->rb_left; 664 else 665 node = &parent->rb_right; 666 } 667 rb_link_node(&ins->rb_node, parent, node); 668 rb_insert_color(&ins->rb_node, &wc->tree); 669 list_add(&ins->lru, &wc->lru); 670 ins->age = jiffies; 671 } 672 673 static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e) 674 { 675 list_del(&e->lru); 676 rb_erase(&e->rb_node, &wc->tree); 677 } 678 679 static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry *e) 680 { 681 if (WC_MODE_SORT_FREELIST(wc)) { 682 struct rb_node **node = &wc->freetree.rb_node, *parent = NULL; 683 684 if (unlikely(!*node)) 685 wc->current_free = e; 686 while (*node) { 687 parent = *node; 688 if (&e->rb_node < *node) 689 node = &parent->rb_left; 690 else 691 node = &parent->rb_right; 692 } 693 rb_link_node(&e->rb_node, parent, node); 694 rb_insert_color(&e->rb_node, &wc->freetree); 695 } else { 696 list_add_tail(&e->lru, &wc->freelist); 697 } 698 wc->freelist_size++; 699 } 700 701 static inline void writecache_verify_watermark(struct dm_writecache *wc) 702 { 703 if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark)) 704 queue_work(wc->writeback_wq, &wc->writeback_work); 705 } 706 707 static void writecache_max_age_timer(struct timer_list *t) 708 { 709 struct dm_writecache *wc = from_timer(wc, t, max_age_timer); 710 711 if (!dm_suspended(wc->ti) && !writecache_has_error(wc)) { 712 queue_work(wc->writeback_wq, &wc->writeback_work); 713 mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV); 714 } 715 } 716 717 static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector) 718 { 719 struct wc_entry *e; 720 721 if (WC_MODE_SORT_FREELIST(wc)) { 722 struct rb_node *next; 723 724 if (unlikely(!wc->current_free)) 725 return NULL; 726 e = wc->current_free; 727 if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector)) 728 return NULL; 729 next = rb_next(&e->rb_node); 730 rb_erase(&e->rb_node, &wc->freetree); 731 if (unlikely(!next)) 732 next = rb_first(&wc->freetree); 733 wc->current_free = next ? container_of(next, struct wc_entry, rb_node) : NULL; 734 } else { 735 if (unlikely(list_empty(&wc->freelist))) 736 return NULL; 737 e = container_of(wc->freelist.next, struct wc_entry, lru); 738 if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector)) 739 return NULL; 740 list_del(&e->lru); 741 } 742 wc->freelist_size--; 743 744 writecache_verify_watermark(wc); 745 746 return e; 747 } 748 749 static void writecache_free_entry(struct dm_writecache *wc, struct wc_entry *e) 750 { 751 writecache_unlink(wc, e); 752 writecache_add_to_freelist(wc, e); 753 clear_seq_count(wc, e); 754 writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry)); 755 if (unlikely(waitqueue_active(&wc->freelist_wait))) 756 wake_up(&wc->freelist_wait); 757 } 758 759 static void writecache_wait_on_freelist(struct dm_writecache *wc) 760 { 761 DEFINE_WAIT(wait); 762 763 prepare_to_wait(&wc->freelist_wait, &wait, TASK_UNINTERRUPTIBLE); 764 wc_unlock(wc); 765 io_schedule(); 766 finish_wait(&wc->freelist_wait, &wait); 767 wc_lock(wc); 768 } 769 770 static void writecache_poison_lists(struct dm_writecache *wc) 771 { 772 /* 773 * Catch incorrect access to these values while the device is suspended. 774 */ 775 memset(&wc->tree, -1, sizeof(wc->tree)); 776 wc->lru.next = LIST_POISON1; 777 wc->lru.prev = LIST_POISON2; 778 wc->freelist.next = LIST_POISON1; 779 wc->freelist.prev = LIST_POISON2; 780 } 781 782 static void writecache_flush_entry(struct dm_writecache *wc, struct wc_entry *e) 783 { 784 writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry)); 785 if (WC_MODE_PMEM(wc)) 786 writecache_flush_region(wc, memory_data(wc, e), wc->block_size); 787 } 788 789 static bool writecache_entry_is_committed(struct dm_writecache *wc, struct wc_entry *e) 790 { 791 return read_seq_count(wc, e) < wc->seq_count; 792 } 793 794 static void writecache_flush(struct dm_writecache *wc) 795 { 796 struct wc_entry *e, *e2; 797 bool need_flush_after_free; 798 799 wc->uncommitted_blocks = 0; 800 del_timer(&wc->autocommit_timer); 801 802 if (list_empty(&wc->lru)) 803 return; 804 805 e = container_of(wc->lru.next, struct wc_entry, lru); 806 if (writecache_entry_is_committed(wc, e)) { 807 if (wc->overwrote_committed) { 808 writecache_wait_for_ios(wc, WRITE); 809 writecache_disk_flush(wc, wc->ssd_dev); 810 wc->overwrote_committed = false; 811 } 812 return; 813 } 814 while (1) { 815 writecache_flush_entry(wc, e); 816 if (unlikely(e->lru.next == &wc->lru)) 817 break; 818 e2 = container_of(e->lru.next, struct wc_entry, lru); 819 if (writecache_entry_is_committed(wc, e2)) 820 break; 821 e = e2; 822 cond_resched(); 823 } 824 writecache_commit_flushed(wc, true); 825 826 wc->seq_count++; 827 pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count)); 828 if (WC_MODE_PMEM(wc)) 829 writecache_commit_flushed(wc, false); 830 else 831 ssd_commit_superblock(wc); 832 833 wc->overwrote_committed = false; 834 835 need_flush_after_free = false; 836 while (1) { 837 /* Free another committed entry with lower seq-count */ 838 struct rb_node *rb_node = rb_prev(&e->rb_node); 839 840 if (rb_node) { 841 e2 = container_of(rb_node, struct wc_entry, rb_node); 842 if (read_original_sector(wc, e2) == read_original_sector(wc, e) && 843 likely(!e2->write_in_progress)) { 844 writecache_free_entry(wc, e2); 845 need_flush_after_free = true; 846 } 847 } 848 if (unlikely(e->lru.prev == &wc->lru)) 849 break; 850 e = container_of(e->lru.prev, struct wc_entry, lru); 851 cond_resched(); 852 } 853 854 if (need_flush_after_free) 855 writecache_commit_flushed(wc, false); 856 } 857 858 static void writecache_flush_work(struct work_struct *work) 859 { 860 struct dm_writecache *wc = container_of(work, struct dm_writecache, flush_work); 861 862 wc_lock(wc); 863 writecache_flush(wc); 864 wc_unlock(wc); 865 } 866 867 static void writecache_autocommit_timer(struct timer_list *t) 868 { 869 struct dm_writecache *wc = from_timer(wc, t, autocommit_timer); 870 871 if (!writecache_has_error(wc)) 872 queue_work(wc->writeback_wq, &wc->flush_work); 873 } 874 875 static void writecache_schedule_autocommit(struct dm_writecache *wc) 876 { 877 if (!timer_pending(&wc->autocommit_timer)) 878 mod_timer(&wc->autocommit_timer, jiffies + wc->autocommit_jiffies); 879 } 880 881 static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_t end) 882 { 883 struct wc_entry *e; 884 bool discarded_something = false; 885 886 e = writecache_find_entry(wc, start, WFE_RETURN_FOLLOWING | WFE_LOWEST_SEQ); 887 if (unlikely(!e)) 888 return; 889 890 while (read_original_sector(wc, e) < end) { 891 struct rb_node *node = rb_next(&e->rb_node); 892 893 if (likely(!e->write_in_progress)) { 894 if (!discarded_something) { 895 if (!WC_MODE_PMEM(wc)) { 896 writecache_wait_for_ios(wc, READ); 897 writecache_wait_for_ios(wc, WRITE); 898 } 899 discarded_something = true; 900 } 901 if (!writecache_entry_is_committed(wc, e)) 902 wc->uncommitted_blocks--; 903 writecache_free_entry(wc, e); 904 } 905 906 if (unlikely(!node)) 907 break; 908 909 e = container_of(node, struct wc_entry, rb_node); 910 } 911 912 if (discarded_something) 913 writecache_commit_flushed(wc, false); 914 } 915 916 static bool writecache_wait_for_writeback(struct dm_writecache *wc) 917 { 918 if (wc->writeback_size) { 919 writecache_wait_on_freelist(wc); 920 return true; 921 } 922 return false; 923 } 924 925 static void writecache_suspend(struct dm_target *ti) 926 { 927 struct dm_writecache *wc = ti->private; 928 bool flush_on_suspend; 929 930 del_timer_sync(&wc->autocommit_timer); 931 del_timer_sync(&wc->max_age_timer); 932 933 wc_lock(wc); 934 writecache_flush(wc); 935 flush_on_suspend = wc->flush_on_suspend; 936 if (flush_on_suspend) { 937 wc->flush_on_suspend = false; 938 wc->writeback_all++; 939 queue_work(wc->writeback_wq, &wc->writeback_work); 940 } 941 wc_unlock(wc); 942 943 drain_workqueue(wc->writeback_wq); 944 945 wc_lock(wc); 946 if (flush_on_suspend) 947 wc->writeback_all--; 948 while (writecache_wait_for_writeback(wc)) 949 ; 950 951 if (WC_MODE_PMEM(wc)) 952 persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size); 953 954 writecache_poison_lists(wc); 955 956 wc_unlock(wc); 957 } 958 959 static int writecache_alloc_entries(struct dm_writecache *wc) 960 { 961 size_t b; 962 963 if (wc->entries) 964 return 0; 965 wc->entries = vmalloc(array_size(sizeof(struct wc_entry), wc->n_blocks)); 966 if (!wc->entries) 967 return -ENOMEM; 968 for (b = 0; b < wc->n_blocks; b++) { 969 struct wc_entry *e = &wc->entries[b]; 970 971 e->index = b; 972 e->write_in_progress = false; 973 cond_resched(); 974 } 975 976 return 0; 977 } 978 979 static int writecache_read_metadata(struct dm_writecache *wc, sector_t n_sectors) 980 { 981 struct dm_io_region region; 982 struct dm_io_request req; 983 984 region.bdev = wc->ssd_dev->bdev; 985 region.sector = wc->start_sector; 986 region.count = n_sectors; 987 req.bi_opf = REQ_OP_READ | REQ_SYNC; 988 req.mem.type = DM_IO_VMA; 989 req.mem.ptr.vma = (char *)wc->memory_map; 990 req.client = wc->dm_io; 991 req.notify.fn = NULL; 992 993 return dm_io(&req, 1, ®ion, NULL); 994 } 995 996 static void writecache_resume(struct dm_target *ti) 997 { 998 struct dm_writecache *wc = ti->private; 999 size_t b; 1000 bool need_flush = false; 1001 __le64 sb_seq_count; 1002 int r; 1003 1004 wc_lock(wc); 1005 1006 wc->data_device_sectors = bdev_nr_sectors(wc->dev->bdev); 1007 1008 if (WC_MODE_PMEM(wc)) { 1009 persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size); 1010 } else { 1011 r = writecache_read_metadata(wc, wc->metadata_sectors); 1012 if (r) { 1013 size_t sb_entries_offset; 1014 1015 writecache_error(wc, r, "unable to read metadata: %d", r); 1016 sb_entries_offset = offsetof(struct wc_memory_superblock, entries); 1017 memset((char *)wc->memory_map + sb_entries_offset, -1, 1018 (wc->metadata_sectors << SECTOR_SHIFT) - sb_entries_offset); 1019 } 1020 } 1021 1022 wc->tree = RB_ROOT; 1023 INIT_LIST_HEAD(&wc->lru); 1024 if (WC_MODE_SORT_FREELIST(wc)) { 1025 wc->freetree = RB_ROOT; 1026 wc->current_free = NULL; 1027 } else { 1028 INIT_LIST_HEAD(&wc->freelist); 1029 } 1030 wc->freelist_size = 0; 1031 1032 r = copy_mc_to_kernel(&sb_seq_count, &sb(wc)->seq_count, 1033 sizeof(uint64_t)); 1034 if (r) { 1035 writecache_error(wc, r, "hardware memory error when reading superblock: %d", r); 1036 sb_seq_count = cpu_to_le64(0); 1037 } 1038 wc->seq_count = le64_to_cpu(sb_seq_count); 1039 1040 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 1041 for (b = 0; b < wc->n_blocks; b++) { 1042 struct wc_entry *e = &wc->entries[b]; 1043 struct wc_memory_entry wme; 1044 1045 if (writecache_has_error(wc)) { 1046 e->original_sector = -1; 1047 e->seq_count = -1; 1048 continue; 1049 } 1050 r = copy_mc_to_kernel(&wme, memory_entry(wc, e), 1051 sizeof(struct wc_memory_entry)); 1052 if (r) { 1053 writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d", 1054 (unsigned long)b, r); 1055 e->original_sector = -1; 1056 e->seq_count = -1; 1057 } else { 1058 e->original_sector = le64_to_cpu(wme.original_sector); 1059 e->seq_count = le64_to_cpu(wme.seq_count); 1060 } 1061 cond_resched(); 1062 } 1063 #endif 1064 for (b = 0; b < wc->n_blocks; b++) { 1065 struct wc_entry *e = &wc->entries[b]; 1066 1067 if (!writecache_entry_is_committed(wc, e)) { 1068 if (read_seq_count(wc, e) != -1) { 1069 erase_this: 1070 clear_seq_count(wc, e); 1071 need_flush = true; 1072 } 1073 writecache_add_to_freelist(wc, e); 1074 } else { 1075 struct wc_entry *old; 1076 1077 old = writecache_find_entry(wc, read_original_sector(wc, e), 0); 1078 if (!old) { 1079 writecache_insert_entry(wc, e); 1080 } else { 1081 if (read_seq_count(wc, old) == read_seq_count(wc, e)) { 1082 writecache_error(wc, -EINVAL, 1083 "two identical entries, position %llu, sector %llu, sequence %llu", 1084 (unsigned long long)b, (unsigned long long)read_original_sector(wc, e), 1085 (unsigned long long)read_seq_count(wc, e)); 1086 } 1087 if (read_seq_count(wc, old) > read_seq_count(wc, e)) { 1088 goto erase_this; 1089 } else { 1090 writecache_free_entry(wc, old); 1091 writecache_insert_entry(wc, e); 1092 need_flush = true; 1093 } 1094 } 1095 } 1096 cond_resched(); 1097 } 1098 1099 if (need_flush) { 1100 writecache_flush_all_metadata(wc); 1101 writecache_commit_flushed(wc, false); 1102 } 1103 1104 writecache_verify_watermark(wc); 1105 1106 if (wc->max_age != MAX_AGE_UNSPECIFIED) 1107 mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV); 1108 1109 wc_unlock(wc); 1110 } 1111 1112 static int process_flush_mesg(unsigned int argc, char **argv, struct dm_writecache *wc) 1113 { 1114 if (argc != 1) 1115 return -EINVAL; 1116 1117 wc_lock(wc); 1118 if (dm_suspended(wc->ti)) { 1119 wc_unlock(wc); 1120 return -EBUSY; 1121 } 1122 if (writecache_has_error(wc)) { 1123 wc_unlock(wc); 1124 return -EIO; 1125 } 1126 1127 writecache_flush(wc); 1128 wc->writeback_all++; 1129 queue_work(wc->writeback_wq, &wc->writeback_work); 1130 wc_unlock(wc); 1131 1132 flush_workqueue(wc->writeback_wq); 1133 1134 wc_lock(wc); 1135 wc->writeback_all--; 1136 if (writecache_has_error(wc)) { 1137 wc_unlock(wc); 1138 return -EIO; 1139 } 1140 wc_unlock(wc); 1141 1142 return 0; 1143 } 1144 1145 static int process_flush_on_suspend_mesg(unsigned int argc, char **argv, struct dm_writecache *wc) 1146 { 1147 if (argc != 1) 1148 return -EINVAL; 1149 1150 wc_lock(wc); 1151 wc->flush_on_suspend = true; 1152 wc_unlock(wc); 1153 1154 return 0; 1155 } 1156 1157 static void activate_cleaner(struct dm_writecache *wc) 1158 { 1159 wc->flush_on_suspend = true; 1160 wc->cleaner = true; 1161 wc->freelist_high_watermark = wc->n_blocks; 1162 wc->freelist_low_watermark = wc->n_blocks; 1163 } 1164 1165 static int process_cleaner_mesg(unsigned int argc, char **argv, struct dm_writecache *wc) 1166 { 1167 if (argc != 1) 1168 return -EINVAL; 1169 1170 wc_lock(wc); 1171 activate_cleaner(wc); 1172 if (!dm_suspended(wc->ti)) 1173 writecache_verify_watermark(wc); 1174 wc_unlock(wc); 1175 1176 return 0; 1177 } 1178 1179 static int process_clear_stats_mesg(unsigned int argc, char **argv, struct dm_writecache *wc) 1180 { 1181 if (argc != 1) 1182 return -EINVAL; 1183 1184 wc_lock(wc); 1185 memset(&wc->stats, 0, sizeof(wc->stats)); 1186 wc_unlock(wc); 1187 1188 return 0; 1189 } 1190 1191 static int writecache_message(struct dm_target *ti, unsigned int argc, char **argv, 1192 char *result, unsigned int maxlen) 1193 { 1194 int r = -EINVAL; 1195 struct dm_writecache *wc = ti->private; 1196 1197 if (!strcasecmp(argv[0], "flush")) 1198 r = process_flush_mesg(argc, argv, wc); 1199 else if (!strcasecmp(argv[0], "flush_on_suspend")) 1200 r = process_flush_on_suspend_mesg(argc, argv, wc); 1201 else if (!strcasecmp(argv[0], "cleaner")) 1202 r = process_cleaner_mesg(argc, argv, wc); 1203 else if (!strcasecmp(argv[0], "clear_stats")) 1204 r = process_clear_stats_mesg(argc, argv, wc); 1205 else 1206 DMERR("unrecognised message received: %s", argv[0]); 1207 1208 return r; 1209 } 1210 1211 static void memcpy_flushcache_optimized(void *dest, void *source, size_t size) 1212 { 1213 /* 1214 * clflushopt performs better with block size 1024, 2048, 4096 1215 * non-temporal stores perform better with block size 512 1216 * 1217 * block size 512 1024 2048 4096 1218 * movnti 496 MB/s 642 MB/s 725 MB/s 744 MB/s 1219 * clflushopt 373 MB/s 688 MB/s 1.1 GB/s 1.2 GB/s 1220 * 1221 * We see that movnti performs better for 512-byte blocks, and 1222 * clflushopt performs better for 1024-byte and larger blocks. So, we 1223 * prefer clflushopt for sizes >= 768. 1224 * 1225 * NOTE: this happens to be the case now (with dm-writecache's single 1226 * threaded model) but re-evaluate this once memcpy_flushcache() is 1227 * enabled to use movdir64b which might invalidate this performance 1228 * advantage seen with cache-allocating-writes plus flushing. 1229 */ 1230 #ifdef CONFIG_X86 1231 if (static_cpu_has(X86_FEATURE_CLFLUSHOPT) && 1232 likely(boot_cpu_data.x86_clflush_size == 64) && 1233 likely(size >= 768)) { 1234 do { 1235 memcpy((void *)dest, (void *)source, 64); 1236 clflushopt((void *)dest); 1237 dest += 64; 1238 source += 64; 1239 size -= 64; 1240 } while (size >= 64); 1241 return; 1242 } 1243 #endif 1244 memcpy_flushcache(dest, source, size); 1245 } 1246 1247 static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data) 1248 { 1249 void *buf; 1250 unsigned int size; 1251 int rw = bio_data_dir(bio); 1252 unsigned int remaining_size = wc->block_size; 1253 1254 do { 1255 struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter); 1256 1257 buf = bvec_kmap_local(&bv); 1258 size = bv.bv_len; 1259 if (unlikely(size > remaining_size)) 1260 size = remaining_size; 1261 1262 if (rw == READ) { 1263 int r; 1264 1265 r = copy_mc_to_kernel(buf, data, size); 1266 flush_dcache_page(bio_page(bio)); 1267 if (unlikely(r)) { 1268 writecache_error(wc, r, "hardware memory error when reading data: %d", r); 1269 bio->bi_status = BLK_STS_IOERR; 1270 } 1271 } else { 1272 flush_dcache_page(bio_page(bio)); 1273 memcpy_flushcache_optimized(data, buf, size); 1274 } 1275 1276 kunmap_local(buf); 1277 1278 data = (char *)data + size; 1279 remaining_size -= size; 1280 bio_advance(bio, size); 1281 } while (unlikely(remaining_size)); 1282 } 1283 1284 static int writecache_flush_thread(void *data) 1285 { 1286 struct dm_writecache *wc = data; 1287 1288 while (1) { 1289 struct bio *bio; 1290 1291 wc_lock(wc); 1292 bio = bio_list_pop(&wc->flush_list); 1293 if (!bio) { 1294 set_current_state(TASK_INTERRUPTIBLE); 1295 wc_unlock(wc); 1296 1297 if (unlikely(kthread_should_stop())) { 1298 set_current_state(TASK_RUNNING); 1299 break; 1300 } 1301 1302 schedule(); 1303 continue; 1304 } 1305 1306 if (bio_op(bio) == REQ_OP_DISCARD) { 1307 writecache_discard(wc, bio->bi_iter.bi_sector, 1308 bio_end_sector(bio)); 1309 wc_unlock(wc); 1310 bio_set_dev(bio, wc->dev->bdev); 1311 submit_bio_noacct(bio); 1312 } else { 1313 writecache_flush(wc); 1314 wc_unlock(wc); 1315 if (writecache_has_error(wc)) 1316 bio->bi_status = BLK_STS_IOERR; 1317 bio_endio(bio); 1318 } 1319 } 1320 1321 return 0; 1322 } 1323 1324 static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio) 1325 { 1326 if (bio_list_empty(&wc->flush_list)) 1327 wake_up_process(wc->flush_thread); 1328 bio_list_add(&wc->flush_list, bio); 1329 } 1330 1331 enum wc_map_op { 1332 WC_MAP_SUBMIT, 1333 WC_MAP_REMAP, 1334 WC_MAP_REMAP_ORIGIN, 1335 WC_MAP_RETURN, 1336 WC_MAP_ERROR, 1337 }; 1338 1339 static void writecache_map_remap_origin(struct dm_writecache *wc, struct bio *bio, 1340 struct wc_entry *e) 1341 { 1342 if (e) { 1343 sector_t next_boundary = 1344 read_original_sector(wc, e) - bio->bi_iter.bi_sector; 1345 if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) 1346 dm_accept_partial_bio(bio, next_boundary); 1347 } 1348 } 1349 1350 static enum wc_map_op writecache_map_read(struct dm_writecache *wc, struct bio *bio) 1351 { 1352 enum wc_map_op map_op; 1353 struct wc_entry *e; 1354 1355 read_next_block: 1356 wc->stats.reads++; 1357 e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING); 1358 if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) { 1359 wc->stats.read_hits++; 1360 if (WC_MODE_PMEM(wc)) { 1361 bio_copy_block(wc, bio, memory_data(wc, e)); 1362 if (bio->bi_iter.bi_size) 1363 goto read_next_block; 1364 map_op = WC_MAP_SUBMIT; 1365 } else { 1366 dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT); 1367 bio_set_dev(bio, wc->ssd_dev->bdev); 1368 bio->bi_iter.bi_sector = cache_sector(wc, e); 1369 if (!writecache_entry_is_committed(wc, e)) 1370 writecache_wait_for_ios(wc, WRITE); 1371 map_op = WC_MAP_REMAP; 1372 } 1373 } else { 1374 writecache_map_remap_origin(wc, bio, e); 1375 wc->stats.reads += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits; 1376 map_op = WC_MAP_REMAP_ORIGIN; 1377 } 1378 1379 return map_op; 1380 } 1381 1382 static void writecache_bio_copy_ssd(struct dm_writecache *wc, struct bio *bio, 1383 struct wc_entry *e, bool search_used) 1384 { 1385 unsigned int bio_size = wc->block_size; 1386 sector_t start_cache_sec = cache_sector(wc, e); 1387 sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT); 1388 1389 while (bio_size < bio->bi_iter.bi_size) { 1390 if (!search_used) { 1391 struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec); 1392 1393 if (!f) 1394 break; 1395 write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector + 1396 (bio_size >> SECTOR_SHIFT), wc->seq_count); 1397 writecache_insert_entry(wc, f); 1398 wc->uncommitted_blocks++; 1399 } else { 1400 struct wc_entry *f; 1401 struct rb_node *next = rb_next(&e->rb_node); 1402 1403 if (!next) 1404 break; 1405 f = container_of(next, struct wc_entry, rb_node); 1406 if (f != e + 1) 1407 break; 1408 if (read_original_sector(wc, f) != 1409 read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT)) 1410 break; 1411 if (unlikely(f->write_in_progress)) 1412 break; 1413 if (writecache_entry_is_committed(wc, f)) 1414 wc->overwrote_committed = true; 1415 e = f; 1416 } 1417 bio_size += wc->block_size; 1418 current_cache_sec += wc->block_size >> SECTOR_SHIFT; 1419 } 1420 1421 bio_set_dev(bio, wc->ssd_dev->bdev); 1422 bio->bi_iter.bi_sector = start_cache_sec; 1423 dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT); 1424 1425 wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits; 1426 wc->stats.writes_allocate += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits; 1427 1428 if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) { 1429 wc->uncommitted_blocks = 0; 1430 queue_work(wc->writeback_wq, &wc->flush_work); 1431 } else { 1432 writecache_schedule_autocommit(wc); 1433 } 1434 } 1435 1436 static enum wc_map_op writecache_map_write(struct dm_writecache *wc, struct bio *bio) 1437 { 1438 struct wc_entry *e; 1439 1440 do { 1441 bool found_entry = false; 1442 bool search_used = false; 1443 1444 if (writecache_has_error(wc)) { 1445 wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits; 1446 return WC_MAP_ERROR; 1447 } 1448 e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0); 1449 if (e) { 1450 if (!writecache_entry_is_committed(wc, e)) { 1451 wc->stats.write_hits_uncommitted++; 1452 search_used = true; 1453 goto bio_copy; 1454 } 1455 wc->stats.write_hits_committed++; 1456 if (!WC_MODE_PMEM(wc) && !e->write_in_progress) { 1457 wc->overwrote_committed = true; 1458 search_used = true; 1459 goto bio_copy; 1460 } 1461 found_entry = true; 1462 } else { 1463 if (unlikely(wc->cleaner) || 1464 (wc->metadata_only && !(bio->bi_opf & REQ_META))) 1465 goto direct_write; 1466 } 1467 e = writecache_pop_from_freelist(wc, (sector_t)-1); 1468 if (unlikely(!e)) { 1469 if (!WC_MODE_PMEM(wc) && !found_entry) { 1470 direct_write: 1471 e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING); 1472 writecache_map_remap_origin(wc, bio, e); 1473 wc->stats.writes_around += bio->bi_iter.bi_size >> wc->block_size_bits; 1474 wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits; 1475 return WC_MAP_REMAP_ORIGIN; 1476 } 1477 wc->stats.writes_blocked_on_freelist++; 1478 writecache_wait_on_freelist(wc); 1479 continue; 1480 } 1481 write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count); 1482 writecache_insert_entry(wc, e); 1483 wc->uncommitted_blocks++; 1484 wc->stats.writes_allocate++; 1485 bio_copy: 1486 if (WC_MODE_PMEM(wc)) { 1487 bio_copy_block(wc, bio, memory_data(wc, e)); 1488 wc->stats.writes++; 1489 } else { 1490 writecache_bio_copy_ssd(wc, bio, e, search_used); 1491 return WC_MAP_REMAP; 1492 } 1493 } while (bio->bi_iter.bi_size); 1494 1495 if (unlikely(bio->bi_opf & REQ_FUA || wc->uncommitted_blocks >= wc->autocommit_blocks)) 1496 writecache_flush(wc); 1497 else 1498 writecache_schedule_autocommit(wc); 1499 1500 return WC_MAP_SUBMIT; 1501 } 1502 1503 static enum wc_map_op writecache_map_flush(struct dm_writecache *wc, struct bio *bio) 1504 { 1505 if (writecache_has_error(wc)) 1506 return WC_MAP_ERROR; 1507 1508 if (WC_MODE_PMEM(wc)) { 1509 wc->stats.flushes++; 1510 writecache_flush(wc); 1511 if (writecache_has_error(wc)) 1512 return WC_MAP_ERROR; 1513 else if (unlikely(wc->cleaner) || unlikely(wc->metadata_only)) 1514 return WC_MAP_REMAP_ORIGIN; 1515 return WC_MAP_SUBMIT; 1516 } 1517 /* SSD: */ 1518 if (dm_bio_get_target_bio_nr(bio)) 1519 return WC_MAP_REMAP_ORIGIN; 1520 wc->stats.flushes++; 1521 writecache_offload_bio(wc, bio); 1522 return WC_MAP_RETURN; 1523 } 1524 1525 static enum wc_map_op writecache_map_discard(struct dm_writecache *wc, struct bio *bio) 1526 { 1527 wc->stats.discards += bio->bi_iter.bi_size >> wc->block_size_bits; 1528 1529 if (writecache_has_error(wc)) 1530 return WC_MAP_ERROR; 1531 1532 if (WC_MODE_PMEM(wc)) { 1533 writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio)); 1534 return WC_MAP_REMAP_ORIGIN; 1535 } 1536 /* SSD: */ 1537 writecache_offload_bio(wc, bio); 1538 return WC_MAP_RETURN; 1539 } 1540 1541 static int writecache_map(struct dm_target *ti, struct bio *bio) 1542 { 1543 struct dm_writecache *wc = ti->private; 1544 enum wc_map_op map_op; 1545 1546 bio->bi_private = NULL; 1547 1548 wc_lock(wc); 1549 1550 if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { 1551 map_op = writecache_map_flush(wc, bio); 1552 goto done; 1553 } 1554 1555 bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector); 1556 1557 if (unlikely((((unsigned int)bio->bi_iter.bi_sector | bio_sectors(bio)) & 1558 (wc->block_size / 512 - 1)) != 0)) { 1559 DMERR("I/O is not aligned, sector %llu, size %u, block size %u", 1560 (unsigned long long)bio->bi_iter.bi_sector, 1561 bio->bi_iter.bi_size, wc->block_size); 1562 map_op = WC_MAP_ERROR; 1563 goto done; 1564 } 1565 1566 if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) { 1567 map_op = writecache_map_discard(wc, bio); 1568 goto done; 1569 } 1570 1571 if (bio_data_dir(bio) == READ) 1572 map_op = writecache_map_read(wc, bio); 1573 else 1574 map_op = writecache_map_write(wc, bio); 1575 done: 1576 switch (map_op) { 1577 case WC_MAP_REMAP_ORIGIN: 1578 if (likely(wc->pause != 0)) { 1579 if (bio_op(bio) == REQ_OP_WRITE) { 1580 dm_iot_io_begin(&wc->iot, 1); 1581 bio->bi_private = (void *)2; 1582 } 1583 } 1584 bio_set_dev(bio, wc->dev->bdev); 1585 wc_unlock(wc); 1586 return DM_MAPIO_REMAPPED; 1587 1588 case WC_MAP_REMAP: 1589 /* make sure that writecache_end_io decrements bio_in_progress: */ 1590 bio->bi_private = (void *)1; 1591 atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]); 1592 wc_unlock(wc); 1593 return DM_MAPIO_REMAPPED; 1594 1595 case WC_MAP_SUBMIT: 1596 wc_unlock(wc); 1597 bio_endio(bio); 1598 return DM_MAPIO_SUBMITTED; 1599 1600 case WC_MAP_RETURN: 1601 wc_unlock(wc); 1602 return DM_MAPIO_SUBMITTED; 1603 1604 case WC_MAP_ERROR: 1605 wc_unlock(wc); 1606 bio_io_error(bio); 1607 return DM_MAPIO_SUBMITTED; 1608 1609 default: 1610 BUG(); 1611 wc_unlock(wc); 1612 return DM_MAPIO_KILL; 1613 } 1614 } 1615 1616 static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status) 1617 { 1618 struct dm_writecache *wc = ti->private; 1619 1620 if (bio->bi_private == (void *)1) { 1621 int dir = bio_data_dir(bio); 1622 1623 if (atomic_dec_and_test(&wc->bio_in_progress[dir])) 1624 if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir]))) 1625 wake_up(&wc->bio_in_progress_wait[dir]); 1626 } else if (bio->bi_private == (void *)2) { 1627 dm_iot_io_end(&wc->iot, 1); 1628 } 1629 return 0; 1630 } 1631 1632 static int writecache_iterate_devices(struct dm_target *ti, 1633 iterate_devices_callout_fn fn, void *data) 1634 { 1635 struct dm_writecache *wc = ti->private; 1636 1637 return fn(ti, wc->dev, 0, ti->len, data); 1638 } 1639 1640 static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limits) 1641 { 1642 struct dm_writecache *wc = ti->private; 1643 1644 if (limits->logical_block_size < wc->block_size) 1645 limits->logical_block_size = wc->block_size; 1646 1647 if (limits->physical_block_size < wc->block_size) 1648 limits->physical_block_size = wc->block_size; 1649 1650 if (limits->io_min < wc->block_size) 1651 limits->io_min = wc->block_size; 1652 } 1653 1654 1655 static void writecache_writeback_endio(struct bio *bio) 1656 { 1657 struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio); 1658 struct dm_writecache *wc = wb->wc; 1659 unsigned long flags; 1660 1661 raw_spin_lock_irqsave(&wc->endio_list_lock, flags); 1662 if (unlikely(list_empty(&wc->endio_list))) 1663 wake_up_process(wc->endio_thread); 1664 list_add_tail(&wb->endio_entry, &wc->endio_list); 1665 raw_spin_unlock_irqrestore(&wc->endio_list_lock, flags); 1666 } 1667 1668 static void writecache_copy_endio(int read_err, unsigned long write_err, void *ptr) 1669 { 1670 struct copy_struct *c = ptr; 1671 struct dm_writecache *wc = c->wc; 1672 1673 c->error = likely(!(read_err | write_err)) ? 0 : -EIO; 1674 1675 raw_spin_lock_irq(&wc->endio_list_lock); 1676 if (unlikely(list_empty(&wc->endio_list))) 1677 wake_up_process(wc->endio_thread); 1678 list_add_tail(&c->endio_entry, &wc->endio_list); 1679 raw_spin_unlock_irq(&wc->endio_list_lock); 1680 } 1681 1682 static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list) 1683 { 1684 unsigned int i; 1685 struct writeback_struct *wb; 1686 struct wc_entry *e; 1687 unsigned long n_walked = 0; 1688 1689 do { 1690 wb = list_entry(list->next, struct writeback_struct, endio_entry); 1691 list_del(&wb->endio_entry); 1692 1693 if (unlikely(wb->bio.bi_status != BLK_STS_OK)) 1694 writecache_error(wc, blk_status_to_errno(wb->bio.bi_status), 1695 "write error %d", wb->bio.bi_status); 1696 i = 0; 1697 do { 1698 e = wb->wc_list[i]; 1699 BUG_ON(!e->write_in_progress); 1700 e->write_in_progress = false; 1701 INIT_LIST_HEAD(&e->lru); 1702 if (!writecache_has_error(wc)) 1703 writecache_free_entry(wc, e); 1704 BUG_ON(!wc->writeback_size); 1705 wc->writeback_size--; 1706 n_walked++; 1707 if (unlikely(n_walked >= ENDIO_LATENCY)) { 1708 writecache_commit_flushed(wc, false); 1709 wc_unlock(wc); 1710 wc_lock(wc); 1711 n_walked = 0; 1712 } 1713 } while (++i < wb->wc_list_n); 1714 1715 if (wb->wc_list != wb->wc_list_inline) 1716 kfree(wb->wc_list); 1717 bio_put(&wb->bio); 1718 } while (!list_empty(list)); 1719 } 1720 1721 static void __writecache_endio_ssd(struct dm_writecache *wc, struct list_head *list) 1722 { 1723 struct copy_struct *c; 1724 struct wc_entry *e; 1725 1726 do { 1727 c = list_entry(list->next, struct copy_struct, endio_entry); 1728 list_del(&c->endio_entry); 1729 1730 if (unlikely(c->error)) 1731 writecache_error(wc, c->error, "copy error"); 1732 1733 e = c->e; 1734 do { 1735 BUG_ON(!e->write_in_progress); 1736 e->write_in_progress = false; 1737 INIT_LIST_HEAD(&e->lru); 1738 if (!writecache_has_error(wc)) 1739 writecache_free_entry(wc, e); 1740 1741 BUG_ON(!wc->writeback_size); 1742 wc->writeback_size--; 1743 e++; 1744 } while (--c->n_entries); 1745 mempool_free(c, &wc->copy_pool); 1746 } while (!list_empty(list)); 1747 } 1748 1749 static int writecache_endio_thread(void *data) 1750 { 1751 struct dm_writecache *wc = data; 1752 1753 while (1) { 1754 struct list_head list; 1755 1756 raw_spin_lock_irq(&wc->endio_list_lock); 1757 if (!list_empty(&wc->endio_list)) 1758 goto pop_from_list; 1759 set_current_state(TASK_INTERRUPTIBLE); 1760 raw_spin_unlock_irq(&wc->endio_list_lock); 1761 1762 if (unlikely(kthread_should_stop())) { 1763 set_current_state(TASK_RUNNING); 1764 break; 1765 } 1766 1767 schedule(); 1768 1769 continue; 1770 1771 pop_from_list: 1772 list = wc->endio_list; 1773 list.next->prev = list.prev->next = &list; 1774 INIT_LIST_HEAD(&wc->endio_list); 1775 raw_spin_unlock_irq(&wc->endio_list_lock); 1776 1777 if (!WC_MODE_FUA(wc)) 1778 writecache_disk_flush(wc, wc->dev); 1779 1780 wc_lock(wc); 1781 1782 if (WC_MODE_PMEM(wc)) { 1783 __writecache_endio_pmem(wc, &list); 1784 } else { 1785 __writecache_endio_ssd(wc, &list); 1786 writecache_wait_for_ios(wc, READ); 1787 } 1788 1789 writecache_commit_flushed(wc, false); 1790 1791 wc_unlock(wc); 1792 } 1793 1794 return 0; 1795 } 1796 1797 static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e) 1798 { 1799 struct dm_writecache *wc = wb->wc; 1800 unsigned int block_size = wc->block_size; 1801 void *address = memory_data(wc, e); 1802 1803 persistent_memory_flush_cache(address, block_size); 1804 1805 if (unlikely(bio_end_sector(&wb->bio) >= wc->data_device_sectors)) 1806 return true; 1807 1808 return bio_add_page(&wb->bio, persistent_memory_page(address), 1809 block_size, persistent_memory_page_offset(address)) != 0; 1810 } 1811 1812 struct writeback_list { 1813 struct list_head list; 1814 size_t size; 1815 }; 1816 1817 static void __writeback_throttle(struct dm_writecache *wc, struct writeback_list *wbl) 1818 { 1819 if (unlikely(wc->max_writeback_jobs)) { 1820 if (READ_ONCE(wc->writeback_size) - wbl->size >= wc->max_writeback_jobs) { 1821 wc_lock(wc); 1822 while (wc->writeback_size - wbl->size >= wc->max_writeback_jobs) 1823 writecache_wait_on_freelist(wc); 1824 wc_unlock(wc); 1825 } 1826 } 1827 cond_resched(); 1828 } 1829 1830 static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeback_list *wbl) 1831 { 1832 struct wc_entry *e, *f; 1833 struct bio *bio; 1834 struct writeback_struct *wb; 1835 unsigned int max_pages; 1836 1837 while (wbl->size) { 1838 wbl->size--; 1839 e = container_of(wbl->list.prev, struct wc_entry, lru); 1840 list_del(&e->lru); 1841 1842 max_pages = e->wc_list_contiguous; 1843 1844 bio = bio_alloc_bioset(wc->dev->bdev, max_pages, REQ_OP_WRITE, 1845 GFP_NOIO, &wc->bio_set); 1846 wb = container_of(bio, struct writeback_struct, bio); 1847 wb->wc = wc; 1848 bio->bi_end_io = writecache_writeback_endio; 1849 bio->bi_iter.bi_sector = read_original_sector(wc, e); 1850 1851 if (unlikely(max_pages > WB_LIST_INLINE)) 1852 wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *), 1853 GFP_NOIO | __GFP_NORETRY | 1854 __GFP_NOMEMALLOC | __GFP_NOWARN); 1855 1856 if (likely(max_pages <= WB_LIST_INLINE) || unlikely(!wb->wc_list)) { 1857 wb->wc_list = wb->wc_list_inline; 1858 max_pages = WB_LIST_INLINE; 1859 } 1860 1861 BUG_ON(!wc_add_block(wb, e)); 1862 1863 wb->wc_list[0] = e; 1864 wb->wc_list_n = 1; 1865 1866 while (wbl->size && wb->wc_list_n < max_pages) { 1867 f = container_of(wbl->list.prev, struct wc_entry, lru); 1868 if (read_original_sector(wc, f) != 1869 read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT)) 1870 break; 1871 if (!wc_add_block(wb, f)) 1872 break; 1873 wbl->size--; 1874 list_del(&f->lru); 1875 wb->wc_list[wb->wc_list_n++] = f; 1876 e = f; 1877 } 1878 if (WC_MODE_FUA(wc)) 1879 bio->bi_opf |= REQ_FUA; 1880 if (writecache_has_error(wc)) { 1881 bio->bi_status = BLK_STS_IOERR; 1882 bio_endio(bio); 1883 } else if (unlikely(!bio_sectors(bio))) { 1884 bio->bi_status = BLK_STS_OK; 1885 bio_endio(bio); 1886 } else { 1887 submit_bio(bio); 1888 } 1889 1890 __writeback_throttle(wc, wbl); 1891 } 1892 } 1893 1894 static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writeback_list *wbl) 1895 { 1896 struct wc_entry *e, *f; 1897 struct dm_io_region from, to; 1898 struct copy_struct *c; 1899 1900 while (wbl->size) { 1901 unsigned int n_sectors; 1902 1903 wbl->size--; 1904 e = container_of(wbl->list.prev, struct wc_entry, lru); 1905 list_del(&e->lru); 1906 1907 n_sectors = e->wc_list_contiguous << (wc->block_size_bits - SECTOR_SHIFT); 1908 1909 from.bdev = wc->ssd_dev->bdev; 1910 from.sector = cache_sector(wc, e); 1911 from.count = n_sectors; 1912 to.bdev = wc->dev->bdev; 1913 to.sector = read_original_sector(wc, e); 1914 to.count = n_sectors; 1915 1916 c = mempool_alloc(&wc->copy_pool, GFP_NOIO); 1917 c->wc = wc; 1918 c->e = e; 1919 c->n_entries = e->wc_list_contiguous; 1920 1921 while ((n_sectors -= wc->block_size >> SECTOR_SHIFT)) { 1922 wbl->size--; 1923 f = container_of(wbl->list.prev, struct wc_entry, lru); 1924 BUG_ON(f != e + 1); 1925 list_del(&f->lru); 1926 e = f; 1927 } 1928 1929 if (unlikely(to.sector + to.count > wc->data_device_sectors)) { 1930 if (to.sector >= wc->data_device_sectors) { 1931 writecache_copy_endio(0, 0, c); 1932 continue; 1933 } 1934 from.count = to.count = wc->data_device_sectors - to.sector; 1935 } 1936 1937 dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c); 1938 1939 __writeback_throttle(wc, wbl); 1940 } 1941 } 1942 1943 static void writecache_writeback(struct work_struct *work) 1944 { 1945 struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work); 1946 struct blk_plug plug; 1947 struct wc_entry *f, *g, *e = NULL; 1948 struct rb_node *node, *next_node; 1949 struct list_head skipped; 1950 struct writeback_list wbl; 1951 unsigned long n_walked; 1952 1953 if (!WC_MODE_PMEM(wc)) { 1954 /* Wait for any active kcopyd work on behalf of ssd writeback */ 1955 dm_kcopyd_client_flush(wc->dm_kcopyd); 1956 } 1957 1958 if (likely(wc->pause != 0)) { 1959 while (1) { 1960 unsigned long idle; 1961 1962 if (unlikely(wc->cleaner) || unlikely(wc->writeback_all) || 1963 unlikely(dm_suspended(wc->ti))) 1964 break; 1965 idle = dm_iot_idle_time(&wc->iot); 1966 if (idle >= wc->pause) 1967 break; 1968 idle = wc->pause - idle; 1969 if (idle > HZ) 1970 idle = HZ; 1971 schedule_timeout_idle(idle); 1972 } 1973 } 1974 1975 wc_lock(wc); 1976 restart: 1977 if (writecache_has_error(wc)) { 1978 wc_unlock(wc); 1979 return; 1980 } 1981 1982 if (unlikely(wc->writeback_all)) { 1983 if (writecache_wait_for_writeback(wc)) 1984 goto restart; 1985 } 1986 1987 if (wc->overwrote_committed) 1988 writecache_wait_for_ios(wc, WRITE); 1989 1990 n_walked = 0; 1991 INIT_LIST_HEAD(&skipped); 1992 INIT_LIST_HEAD(&wbl.list); 1993 wbl.size = 0; 1994 while (!list_empty(&wc->lru) && 1995 (wc->writeback_all || 1996 wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark || 1997 (jiffies - container_of(wc->lru.prev, struct wc_entry, lru)->age >= 1998 wc->max_age - wc->max_age / MAX_AGE_DIV))) { 1999 2000 n_walked++; 2001 if (unlikely(n_walked > WRITEBACK_LATENCY) && 2002 likely(!wc->writeback_all)) { 2003 if (likely(!dm_suspended(wc->ti))) 2004 queue_work(wc->writeback_wq, &wc->writeback_work); 2005 break; 2006 } 2007 2008 if (unlikely(wc->writeback_all)) { 2009 if (unlikely(!e)) { 2010 writecache_flush(wc); 2011 e = container_of(rb_first(&wc->tree), struct wc_entry, rb_node); 2012 } else 2013 e = g; 2014 } else 2015 e = container_of(wc->lru.prev, struct wc_entry, lru); 2016 BUG_ON(e->write_in_progress); 2017 if (unlikely(!writecache_entry_is_committed(wc, e))) 2018 writecache_flush(wc); 2019 2020 node = rb_prev(&e->rb_node); 2021 if (node) { 2022 f = container_of(node, struct wc_entry, rb_node); 2023 if (unlikely(read_original_sector(wc, f) == 2024 read_original_sector(wc, e))) { 2025 BUG_ON(!f->write_in_progress); 2026 list_move(&e->lru, &skipped); 2027 cond_resched(); 2028 continue; 2029 } 2030 } 2031 wc->writeback_size++; 2032 list_move(&e->lru, &wbl.list); 2033 wbl.size++; 2034 e->write_in_progress = true; 2035 e->wc_list_contiguous = 1; 2036 2037 f = e; 2038 2039 while (1) { 2040 next_node = rb_next(&f->rb_node); 2041 if (unlikely(!next_node)) 2042 break; 2043 g = container_of(next_node, struct wc_entry, rb_node); 2044 if (unlikely(read_original_sector(wc, g) == 2045 read_original_sector(wc, f))) { 2046 f = g; 2047 continue; 2048 } 2049 if (read_original_sector(wc, g) != 2050 read_original_sector(wc, f) + (wc->block_size >> SECTOR_SHIFT)) 2051 break; 2052 if (unlikely(g->write_in_progress)) 2053 break; 2054 if (unlikely(!writecache_entry_is_committed(wc, g))) 2055 break; 2056 2057 if (!WC_MODE_PMEM(wc)) { 2058 if (g != f + 1) 2059 break; 2060 } 2061 2062 n_walked++; 2063 //if (unlikely(n_walked > WRITEBACK_LATENCY) && likely(!wc->writeback_all)) 2064 // break; 2065 2066 wc->writeback_size++; 2067 list_move(&g->lru, &wbl.list); 2068 wbl.size++; 2069 g->write_in_progress = true; 2070 g->wc_list_contiguous = BIO_MAX_VECS; 2071 f = g; 2072 e->wc_list_contiguous++; 2073 if (unlikely(e->wc_list_contiguous == BIO_MAX_VECS)) { 2074 if (unlikely(wc->writeback_all)) { 2075 next_node = rb_next(&f->rb_node); 2076 if (likely(next_node)) 2077 g = container_of(next_node, struct wc_entry, rb_node); 2078 } 2079 break; 2080 } 2081 } 2082 cond_resched(); 2083 } 2084 2085 if (!list_empty(&skipped)) { 2086 list_splice_tail(&skipped, &wc->lru); 2087 /* 2088 * If we didn't do any progress, we must wait until some 2089 * writeback finishes to avoid burning CPU in a loop 2090 */ 2091 if (unlikely(!wbl.size)) 2092 writecache_wait_for_writeback(wc); 2093 } 2094 2095 wc_unlock(wc); 2096 2097 blk_start_plug(&plug); 2098 2099 if (WC_MODE_PMEM(wc)) 2100 __writecache_writeback_pmem(wc, &wbl); 2101 else 2102 __writecache_writeback_ssd(wc, &wbl); 2103 2104 blk_finish_plug(&plug); 2105 2106 if (unlikely(wc->writeback_all)) { 2107 wc_lock(wc); 2108 while (writecache_wait_for_writeback(wc)) 2109 ; 2110 wc_unlock(wc); 2111 } 2112 } 2113 2114 static int calculate_memory_size(uint64_t device_size, unsigned int block_size, 2115 size_t *n_blocks_p, size_t *n_metadata_blocks_p) 2116 { 2117 uint64_t n_blocks, offset; 2118 struct wc_entry e; 2119 2120 n_blocks = device_size; 2121 do_div(n_blocks, block_size + sizeof(struct wc_memory_entry)); 2122 2123 while (1) { 2124 if (!n_blocks) 2125 return -ENOSPC; 2126 /* Verify the following entries[n_blocks] won't overflow */ 2127 if (n_blocks >= ((size_t)-sizeof(struct wc_memory_superblock) / 2128 sizeof(struct wc_memory_entry))) 2129 return -EFBIG; 2130 offset = offsetof(struct wc_memory_superblock, entries[n_blocks]); 2131 offset = (offset + block_size - 1) & ~(uint64_t)(block_size - 1); 2132 if (offset + n_blocks * block_size <= device_size) 2133 break; 2134 n_blocks--; 2135 } 2136 2137 /* check if the bit field overflows */ 2138 e.index = n_blocks; 2139 if (e.index != n_blocks) 2140 return -EFBIG; 2141 2142 if (n_blocks_p) 2143 *n_blocks_p = n_blocks; 2144 if (n_metadata_blocks_p) 2145 *n_metadata_blocks_p = offset >> __ffs(block_size); 2146 return 0; 2147 } 2148 2149 static int init_memory(struct dm_writecache *wc) 2150 { 2151 size_t b; 2152 int r; 2153 2154 r = calculate_memory_size(wc->memory_map_size, wc->block_size, &wc->n_blocks, NULL); 2155 if (r) 2156 return r; 2157 2158 r = writecache_alloc_entries(wc); 2159 if (r) 2160 return r; 2161 2162 for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++) 2163 pmem_assign(sb(wc)->padding[b], cpu_to_le64(0)); 2164 pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION)); 2165 pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size)); 2166 pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks)); 2167 pmem_assign(sb(wc)->seq_count, cpu_to_le64(0)); 2168 2169 for (b = 0; b < wc->n_blocks; b++) { 2170 write_original_sector_seq_count(wc, &wc->entries[b], -1, -1); 2171 cond_resched(); 2172 } 2173 2174 writecache_flush_all_metadata(wc); 2175 writecache_commit_flushed(wc, false); 2176 pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC)); 2177 writecache_flush_region(wc, &sb(wc)->magic, sizeof(sb(wc)->magic)); 2178 writecache_commit_flushed(wc, false); 2179 2180 return 0; 2181 } 2182 2183 static void writecache_dtr(struct dm_target *ti) 2184 { 2185 struct dm_writecache *wc = ti->private; 2186 2187 if (!wc) 2188 return; 2189 2190 if (wc->endio_thread) 2191 kthread_stop(wc->endio_thread); 2192 2193 if (wc->flush_thread) 2194 kthread_stop(wc->flush_thread); 2195 2196 bioset_exit(&wc->bio_set); 2197 2198 mempool_exit(&wc->copy_pool); 2199 2200 if (wc->writeback_wq) 2201 destroy_workqueue(wc->writeback_wq); 2202 2203 if (wc->dev) 2204 dm_put_device(ti, wc->dev); 2205 2206 if (wc->ssd_dev) 2207 dm_put_device(ti, wc->ssd_dev); 2208 2209 vfree(wc->entries); 2210 2211 if (wc->memory_map) { 2212 if (WC_MODE_PMEM(wc)) 2213 persistent_memory_release(wc); 2214 else 2215 vfree(wc->memory_map); 2216 } 2217 2218 if (wc->dm_kcopyd) 2219 dm_kcopyd_client_destroy(wc->dm_kcopyd); 2220 2221 if (wc->dm_io) 2222 dm_io_client_destroy(wc->dm_io); 2223 2224 vfree(wc->dirty_bitmap); 2225 2226 kfree(wc); 2227 } 2228 2229 static int writecache_ctr(struct dm_target *ti, unsigned int argc, char **argv) 2230 { 2231 struct dm_writecache *wc; 2232 struct dm_arg_set as; 2233 const char *string; 2234 unsigned int opt_params; 2235 size_t offset, data_size; 2236 int i, r; 2237 char dummy; 2238 int high_wm_percent = HIGH_WATERMARK; 2239 int low_wm_percent = LOW_WATERMARK; 2240 uint64_t x; 2241 struct wc_memory_superblock s; 2242 2243 static struct dm_arg _args[] = { 2244 {0, 18, "Invalid number of feature args"}, 2245 }; 2246 2247 as.argc = argc; 2248 as.argv = argv; 2249 2250 wc = kzalloc(sizeof(struct dm_writecache), GFP_KERNEL); 2251 if (!wc) { 2252 ti->error = "Cannot allocate writecache structure"; 2253 r = -ENOMEM; 2254 goto bad; 2255 } 2256 ti->private = wc; 2257 wc->ti = ti; 2258 2259 mutex_init(&wc->lock); 2260 wc->max_age = MAX_AGE_UNSPECIFIED; 2261 writecache_poison_lists(wc); 2262 init_waitqueue_head(&wc->freelist_wait); 2263 timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0); 2264 timer_setup(&wc->max_age_timer, writecache_max_age_timer, 0); 2265 2266 for (i = 0; i < 2; i++) { 2267 atomic_set(&wc->bio_in_progress[i], 0); 2268 init_waitqueue_head(&wc->bio_in_progress_wait[i]); 2269 } 2270 2271 wc->dm_io = dm_io_client_create(); 2272 if (IS_ERR(wc->dm_io)) { 2273 r = PTR_ERR(wc->dm_io); 2274 ti->error = "Unable to allocate dm-io client"; 2275 wc->dm_io = NULL; 2276 goto bad; 2277 } 2278 2279 wc->writeback_wq = alloc_workqueue("writecache-writeback", WQ_MEM_RECLAIM, 1); 2280 if (!wc->writeback_wq) { 2281 r = -ENOMEM; 2282 ti->error = "Could not allocate writeback workqueue"; 2283 goto bad; 2284 } 2285 INIT_WORK(&wc->writeback_work, writecache_writeback); 2286 INIT_WORK(&wc->flush_work, writecache_flush_work); 2287 2288 dm_iot_init(&wc->iot); 2289 2290 raw_spin_lock_init(&wc->endio_list_lock); 2291 INIT_LIST_HEAD(&wc->endio_list); 2292 wc->endio_thread = kthread_run(writecache_endio_thread, wc, "writecache_endio"); 2293 if (IS_ERR(wc->endio_thread)) { 2294 r = PTR_ERR(wc->endio_thread); 2295 wc->endio_thread = NULL; 2296 ti->error = "Couldn't spawn endio thread"; 2297 goto bad; 2298 } 2299 2300 /* 2301 * Parse the mode (pmem or ssd) 2302 */ 2303 string = dm_shift_arg(&as); 2304 if (!string) 2305 goto bad_arguments; 2306 2307 if (!strcasecmp(string, "s")) { 2308 wc->pmem_mode = false; 2309 } else if (!strcasecmp(string, "p")) { 2310 #ifdef DM_WRITECACHE_HAS_PMEM 2311 wc->pmem_mode = true; 2312 wc->writeback_fua = true; 2313 #else 2314 /* 2315 * If the architecture doesn't support persistent memory or 2316 * the kernel doesn't support any DAX drivers, this driver can 2317 * only be used in SSD-only mode. 2318 */ 2319 r = -EOPNOTSUPP; 2320 ti->error = "Persistent memory or DAX not supported on this system"; 2321 goto bad; 2322 #endif 2323 } else { 2324 goto bad_arguments; 2325 } 2326 2327 if (WC_MODE_PMEM(wc)) { 2328 r = bioset_init(&wc->bio_set, BIO_POOL_SIZE, 2329 offsetof(struct writeback_struct, bio), 2330 BIOSET_NEED_BVECS); 2331 if (r) { 2332 ti->error = "Could not allocate bio set"; 2333 goto bad; 2334 } 2335 } else { 2336 wc->pause = PAUSE_WRITEBACK; 2337 r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct)); 2338 if (r) { 2339 ti->error = "Could not allocate mempool"; 2340 goto bad; 2341 } 2342 } 2343 2344 /* 2345 * Parse the origin data device 2346 */ 2347 string = dm_shift_arg(&as); 2348 if (!string) 2349 goto bad_arguments; 2350 r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->dev); 2351 if (r) { 2352 ti->error = "Origin data device lookup failed"; 2353 goto bad; 2354 } 2355 2356 /* 2357 * Parse cache data device (be it pmem or ssd) 2358 */ 2359 string = dm_shift_arg(&as); 2360 if (!string) 2361 goto bad_arguments; 2362 2363 r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->ssd_dev); 2364 if (r) { 2365 ti->error = "Cache data device lookup failed"; 2366 goto bad; 2367 } 2368 wc->memory_map_size = bdev_nr_bytes(wc->ssd_dev->bdev); 2369 2370 /* 2371 * Parse the cache block size 2372 */ 2373 string = dm_shift_arg(&as); 2374 if (!string) 2375 goto bad_arguments; 2376 if (sscanf(string, "%u%c", &wc->block_size, &dummy) != 1 || 2377 wc->block_size < 512 || wc->block_size > PAGE_SIZE || 2378 (wc->block_size & (wc->block_size - 1))) { 2379 r = -EINVAL; 2380 ti->error = "Invalid block size"; 2381 goto bad; 2382 } 2383 if (wc->block_size < bdev_logical_block_size(wc->dev->bdev) || 2384 wc->block_size < bdev_logical_block_size(wc->ssd_dev->bdev)) { 2385 r = -EINVAL; 2386 ti->error = "Block size is smaller than device logical block size"; 2387 goto bad; 2388 } 2389 wc->block_size_bits = __ffs(wc->block_size); 2390 2391 wc->max_writeback_jobs = MAX_WRITEBACK_JOBS; 2392 wc->autocommit_blocks = !WC_MODE_PMEM(wc) ? AUTOCOMMIT_BLOCKS_SSD : AUTOCOMMIT_BLOCKS_PMEM; 2393 wc->autocommit_jiffies = msecs_to_jiffies(AUTOCOMMIT_MSEC); 2394 2395 /* 2396 * Parse optional arguments 2397 */ 2398 r = dm_read_arg_group(_args, &as, &opt_params, &ti->error); 2399 if (r) 2400 goto bad; 2401 2402 while (opt_params) { 2403 string = dm_shift_arg(&as), opt_params--; 2404 if (!strcasecmp(string, "start_sector") && opt_params >= 1) { 2405 unsigned long long start_sector; 2406 2407 string = dm_shift_arg(&as), opt_params--; 2408 if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1) 2409 goto invalid_optional; 2410 wc->start_sector = start_sector; 2411 wc->start_sector_set = true; 2412 if (wc->start_sector != start_sector || 2413 wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT) 2414 goto invalid_optional; 2415 } else if (!strcasecmp(string, "high_watermark") && opt_params >= 1) { 2416 string = dm_shift_arg(&as), opt_params--; 2417 if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1) 2418 goto invalid_optional; 2419 if (high_wm_percent < 0 || high_wm_percent > 100) 2420 goto invalid_optional; 2421 wc->high_wm_percent_value = high_wm_percent; 2422 wc->high_wm_percent_set = true; 2423 } else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) { 2424 string = dm_shift_arg(&as), opt_params--; 2425 if (sscanf(string, "%d%c", &low_wm_percent, &dummy) != 1) 2426 goto invalid_optional; 2427 if (low_wm_percent < 0 || low_wm_percent > 100) 2428 goto invalid_optional; 2429 wc->low_wm_percent_value = low_wm_percent; 2430 wc->low_wm_percent_set = true; 2431 } else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) { 2432 string = dm_shift_arg(&as), opt_params--; 2433 if (sscanf(string, "%u%c", &wc->max_writeback_jobs, &dummy) != 1) 2434 goto invalid_optional; 2435 wc->max_writeback_jobs_set = true; 2436 } else if (!strcasecmp(string, "autocommit_blocks") && opt_params >= 1) { 2437 string = dm_shift_arg(&as), opt_params--; 2438 if (sscanf(string, "%u%c", &wc->autocommit_blocks, &dummy) != 1) 2439 goto invalid_optional; 2440 wc->autocommit_blocks_set = true; 2441 } else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) { 2442 unsigned int autocommit_msecs; 2443 2444 string = dm_shift_arg(&as), opt_params--; 2445 if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1) 2446 goto invalid_optional; 2447 if (autocommit_msecs > 3600000) 2448 goto invalid_optional; 2449 wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs); 2450 wc->autocommit_time_value = autocommit_msecs; 2451 wc->autocommit_time_set = true; 2452 } else if (!strcasecmp(string, "max_age") && opt_params >= 1) { 2453 unsigned int max_age_msecs; 2454 2455 string = dm_shift_arg(&as), opt_params--; 2456 if (sscanf(string, "%u%c", &max_age_msecs, &dummy) != 1) 2457 goto invalid_optional; 2458 if (max_age_msecs > 86400000) 2459 goto invalid_optional; 2460 wc->max_age = msecs_to_jiffies(max_age_msecs); 2461 wc->max_age_set = true; 2462 wc->max_age_value = max_age_msecs; 2463 } else if (!strcasecmp(string, "cleaner")) { 2464 wc->cleaner_set = true; 2465 wc->cleaner = true; 2466 } else if (!strcasecmp(string, "fua")) { 2467 if (WC_MODE_PMEM(wc)) { 2468 wc->writeback_fua = true; 2469 wc->writeback_fua_set = true; 2470 } else 2471 goto invalid_optional; 2472 } else if (!strcasecmp(string, "nofua")) { 2473 if (WC_MODE_PMEM(wc)) { 2474 wc->writeback_fua = false; 2475 wc->writeback_fua_set = true; 2476 } else 2477 goto invalid_optional; 2478 } else if (!strcasecmp(string, "metadata_only")) { 2479 wc->metadata_only = true; 2480 } else if (!strcasecmp(string, "pause_writeback") && opt_params >= 1) { 2481 unsigned int pause_msecs; 2482 2483 if (WC_MODE_PMEM(wc)) 2484 goto invalid_optional; 2485 string = dm_shift_arg(&as), opt_params--; 2486 if (sscanf(string, "%u%c", &pause_msecs, &dummy) != 1) 2487 goto invalid_optional; 2488 if (pause_msecs > 60000) 2489 goto invalid_optional; 2490 wc->pause = msecs_to_jiffies(pause_msecs); 2491 wc->pause_set = true; 2492 wc->pause_value = pause_msecs; 2493 } else { 2494 invalid_optional: 2495 r = -EINVAL; 2496 ti->error = "Invalid optional argument"; 2497 goto bad; 2498 } 2499 } 2500 2501 if (high_wm_percent < low_wm_percent) { 2502 r = -EINVAL; 2503 ti->error = "High watermark must be greater than or equal to low watermark"; 2504 goto bad; 2505 } 2506 2507 if (WC_MODE_PMEM(wc)) { 2508 if (!dax_synchronous(wc->ssd_dev->dax_dev)) { 2509 r = -EOPNOTSUPP; 2510 ti->error = "Asynchronous persistent memory not supported as pmem cache"; 2511 goto bad; 2512 } 2513 2514 r = persistent_memory_claim(wc); 2515 if (r) { 2516 ti->error = "Unable to map persistent memory for cache"; 2517 goto bad; 2518 } 2519 } else { 2520 size_t n_blocks, n_metadata_blocks; 2521 uint64_t n_bitmap_bits; 2522 2523 wc->memory_map_size -= (uint64_t)wc->start_sector << SECTOR_SHIFT; 2524 2525 bio_list_init(&wc->flush_list); 2526 wc->flush_thread = kthread_run(writecache_flush_thread, wc, "dm_writecache_flush"); 2527 if (IS_ERR(wc->flush_thread)) { 2528 r = PTR_ERR(wc->flush_thread); 2529 wc->flush_thread = NULL; 2530 ti->error = "Couldn't spawn flush thread"; 2531 goto bad; 2532 } 2533 2534 r = calculate_memory_size(wc->memory_map_size, wc->block_size, 2535 &n_blocks, &n_metadata_blocks); 2536 if (r) { 2537 ti->error = "Invalid device size"; 2538 goto bad; 2539 } 2540 2541 n_bitmap_bits = (((uint64_t)n_metadata_blocks << wc->block_size_bits) + 2542 BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY; 2543 /* this is limitation of test_bit functions */ 2544 if (n_bitmap_bits > 1U << 31) { 2545 r = -EFBIG; 2546 ti->error = "Invalid device size"; 2547 goto bad; 2548 } 2549 2550 wc->memory_map = vmalloc(n_metadata_blocks << wc->block_size_bits); 2551 if (!wc->memory_map) { 2552 r = -ENOMEM; 2553 ti->error = "Unable to allocate memory for metadata"; 2554 goto bad; 2555 } 2556 2557 wc->dm_kcopyd = dm_kcopyd_client_create(&dm_kcopyd_throttle); 2558 if (IS_ERR(wc->dm_kcopyd)) { 2559 r = PTR_ERR(wc->dm_kcopyd); 2560 ti->error = "Unable to allocate dm-kcopyd client"; 2561 wc->dm_kcopyd = NULL; 2562 goto bad; 2563 } 2564 2565 wc->metadata_sectors = n_metadata_blocks << (wc->block_size_bits - SECTOR_SHIFT); 2566 wc->dirty_bitmap_size = (n_bitmap_bits + BITS_PER_LONG - 1) / 2567 BITS_PER_LONG * sizeof(unsigned long); 2568 wc->dirty_bitmap = vzalloc(wc->dirty_bitmap_size); 2569 if (!wc->dirty_bitmap) { 2570 r = -ENOMEM; 2571 ti->error = "Unable to allocate dirty bitmap"; 2572 goto bad; 2573 } 2574 2575 r = writecache_read_metadata(wc, wc->block_size >> SECTOR_SHIFT); 2576 if (r) { 2577 ti->error = "Unable to read first block of metadata"; 2578 goto bad; 2579 } 2580 } 2581 2582 r = copy_mc_to_kernel(&s, sb(wc), sizeof(struct wc_memory_superblock)); 2583 if (r) { 2584 ti->error = "Hardware memory error when reading superblock"; 2585 goto bad; 2586 } 2587 if (!le32_to_cpu(s.magic) && !le32_to_cpu(s.version)) { 2588 r = init_memory(wc); 2589 if (r) { 2590 ti->error = "Unable to initialize device"; 2591 goto bad; 2592 } 2593 r = copy_mc_to_kernel(&s, sb(wc), 2594 sizeof(struct wc_memory_superblock)); 2595 if (r) { 2596 ti->error = "Hardware memory error when reading superblock"; 2597 goto bad; 2598 } 2599 } 2600 2601 if (le32_to_cpu(s.magic) != MEMORY_SUPERBLOCK_MAGIC) { 2602 ti->error = "Invalid magic in the superblock"; 2603 r = -EINVAL; 2604 goto bad; 2605 } 2606 2607 if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) { 2608 ti->error = "Invalid version in the superblock"; 2609 r = -EINVAL; 2610 goto bad; 2611 } 2612 2613 if (le32_to_cpu(s.block_size) != wc->block_size) { 2614 ti->error = "Block size does not match superblock"; 2615 r = -EINVAL; 2616 goto bad; 2617 } 2618 2619 wc->n_blocks = le64_to_cpu(s.n_blocks); 2620 2621 offset = wc->n_blocks * sizeof(struct wc_memory_entry); 2622 if (offset / sizeof(struct wc_memory_entry) != le64_to_cpu(sb(wc)->n_blocks)) { 2623 overflow: 2624 ti->error = "Overflow in size calculation"; 2625 r = -EINVAL; 2626 goto bad; 2627 } 2628 offset += sizeof(struct wc_memory_superblock); 2629 if (offset < sizeof(struct wc_memory_superblock)) 2630 goto overflow; 2631 offset = (offset + wc->block_size - 1) & ~(size_t)(wc->block_size - 1); 2632 data_size = wc->n_blocks * (size_t)wc->block_size; 2633 if (!offset || (data_size / wc->block_size != wc->n_blocks) || 2634 (offset + data_size < offset)) 2635 goto overflow; 2636 if (offset + data_size > wc->memory_map_size) { 2637 ti->error = "Memory area is too small"; 2638 r = -EINVAL; 2639 goto bad; 2640 } 2641 2642 wc->metadata_sectors = offset >> SECTOR_SHIFT; 2643 wc->block_start = (char *)sb(wc) + offset; 2644 2645 x = (uint64_t)wc->n_blocks * (100 - high_wm_percent); 2646 x += 50; 2647 do_div(x, 100); 2648 wc->freelist_high_watermark = x; 2649 x = (uint64_t)wc->n_blocks * (100 - low_wm_percent); 2650 x += 50; 2651 do_div(x, 100); 2652 wc->freelist_low_watermark = x; 2653 2654 if (wc->cleaner) 2655 activate_cleaner(wc); 2656 2657 r = writecache_alloc_entries(wc); 2658 if (r) { 2659 ti->error = "Cannot allocate memory"; 2660 goto bad; 2661 } 2662 2663 ti->num_flush_bios = WC_MODE_PMEM(wc) ? 1 : 2; 2664 ti->flush_supported = true; 2665 ti->num_discard_bios = 1; 2666 2667 if (WC_MODE_PMEM(wc)) 2668 persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size); 2669 2670 return 0; 2671 2672 bad_arguments: 2673 r = -EINVAL; 2674 ti->error = "Bad arguments"; 2675 bad: 2676 writecache_dtr(ti); 2677 return r; 2678 } 2679 2680 static void writecache_status(struct dm_target *ti, status_type_t type, 2681 unsigned int status_flags, char *result, unsigned int maxlen) 2682 { 2683 struct dm_writecache *wc = ti->private; 2684 unsigned int extra_args; 2685 unsigned int sz = 0; 2686 2687 switch (type) { 2688 case STATUSTYPE_INFO: 2689 DMEMIT("%ld %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu", 2690 writecache_has_error(wc), 2691 (unsigned long long)wc->n_blocks, (unsigned long long)wc->freelist_size, 2692 (unsigned long long)wc->writeback_size, 2693 wc->stats.reads, 2694 wc->stats.read_hits, 2695 wc->stats.writes, 2696 wc->stats.write_hits_uncommitted, 2697 wc->stats.write_hits_committed, 2698 wc->stats.writes_around, 2699 wc->stats.writes_allocate, 2700 wc->stats.writes_blocked_on_freelist, 2701 wc->stats.flushes, 2702 wc->stats.discards); 2703 break; 2704 case STATUSTYPE_TABLE: 2705 DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's', 2706 wc->dev->name, wc->ssd_dev->name, wc->block_size); 2707 extra_args = 0; 2708 if (wc->start_sector_set) 2709 extra_args += 2; 2710 if (wc->high_wm_percent_set) 2711 extra_args += 2; 2712 if (wc->low_wm_percent_set) 2713 extra_args += 2; 2714 if (wc->max_writeback_jobs_set) 2715 extra_args += 2; 2716 if (wc->autocommit_blocks_set) 2717 extra_args += 2; 2718 if (wc->autocommit_time_set) 2719 extra_args += 2; 2720 if (wc->max_age_set) 2721 extra_args += 2; 2722 if (wc->cleaner_set) 2723 extra_args++; 2724 if (wc->writeback_fua_set) 2725 extra_args++; 2726 if (wc->metadata_only) 2727 extra_args++; 2728 if (wc->pause_set) 2729 extra_args += 2; 2730 2731 DMEMIT("%u", extra_args); 2732 if (wc->start_sector_set) 2733 DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector); 2734 if (wc->high_wm_percent_set) 2735 DMEMIT(" high_watermark %u", wc->high_wm_percent_value); 2736 if (wc->low_wm_percent_set) 2737 DMEMIT(" low_watermark %u", wc->low_wm_percent_value); 2738 if (wc->max_writeback_jobs_set) 2739 DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs); 2740 if (wc->autocommit_blocks_set) 2741 DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks); 2742 if (wc->autocommit_time_set) 2743 DMEMIT(" autocommit_time %u", wc->autocommit_time_value); 2744 if (wc->max_age_set) 2745 DMEMIT(" max_age %u", wc->max_age_value); 2746 if (wc->cleaner_set) 2747 DMEMIT(" cleaner"); 2748 if (wc->writeback_fua_set) 2749 DMEMIT(" %sfua", wc->writeback_fua ? "" : "no"); 2750 if (wc->metadata_only) 2751 DMEMIT(" metadata_only"); 2752 if (wc->pause_set) 2753 DMEMIT(" pause_writeback %u", wc->pause_value); 2754 break; 2755 case STATUSTYPE_IMA: 2756 *result = '\0'; 2757 break; 2758 } 2759 } 2760 2761 static struct target_type writecache_target = { 2762 .name = "writecache", 2763 .version = {1, 6, 0}, 2764 .module = THIS_MODULE, 2765 .ctr = writecache_ctr, 2766 .dtr = writecache_dtr, 2767 .status = writecache_status, 2768 .postsuspend = writecache_suspend, 2769 .resume = writecache_resume, 2770 .message = writecache_message, 2771 .map = writecache_map, 2772 .end_io = writecache_end_io, 2773 .iterate_devices = writecache_iterate_devices, 2774 .io_hints = writecache_io_hints, 2775 }; 2776 module_dm(writecache); 2777 2778 MODULE_DESCRIPTION(DM_NAME " writecache target"); 2779 MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>"); 2780 MODULE_LICENSE("GPL"); 2781