1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2018 Red Hat. All rights reserved. 4 * 5 * This file is released under the GPL. 6 */ 7 8 #include <linux/device-mapper.h> 9 #include <linux/module.h> 10 #include <linux/init.h> 11 #include <linux/vmalloc.h> 12 #include <linux/kthread.h> 13 #include <linux/dm-io.h> 14 #include <linux/dm-kcopyd.h> 15 #include <linux/dax.h> 16 #include <linux/libnvdimm.h> 17 #include <linux/delay.h> 18 #include "dm-io-tracker.h" 19 20 #define DM_MSG_PREFIX "writecache" 21 22 #define HIGH_WATERMARK 50 23 #define LOW_WATERMARK 45 24 #define MAX_WRITEBACK_JOBS min(0x10000000 / PAGE_SIZE, totalram_pages() / 16) 25 #define ENDIO_LATENCY 16 26 #define WRITEBACK_LATENCY 64 27 #define AUTOCOMMIT_BLOCKS_SSD 65536 28 #define AUTOCOMMIT_BLOCKS_PMEM 64 29 #define AUTOCOMMIT_MSEC 1000 30 #define MAX_AGE_DIV 16 31 #define MAX_AGE_UNSPECIFIED -1UL 32 #define PAUSE_WRITEBACK (HZ * 3) 33 34 #define BITMAP_GRANULARITY 65536 35 #if BITMAP_GRANULARITY < PAGE_SIZE 36 #undef BITMAP_GRANULARITY 37 #define BITMAP_GRANULARITY PAGE_SIZE 38 #endif 39 40 #if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_FS_DAX) 41 #define DM_WRITECACHE_HAS_PMEM 42 #endif 43 44 #ifdef DM_WRITECACHE_HAS_PMEM 45 #define pmem_assign(dest, src) \ 46 do { \ 47 typeof(dest) uniq = (src); \ 48 memcpy_flushcache(&(dest), &uniq, sizeof(dest)); \ 49 } while (0) 50 #else 51 #define pmem_assign(dest, src) ((dest) = (src)) 52 #endif 53 54 #if IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC) && defined(DM_WRITECACHE_HAS_PMEM) 55 #define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 56 #endif 57 58 #define MEMORY_SUPERBLOCK_MAGIC 0x23489321 59 #define MEMORY_SUPERBLOCK_VERSION 1 60 61 struct wc_memory_entry { 62 __le64 original_sector; 63 __le64 seq_count; 64 }; 65 66 struct wc_memory_superblock { 67 union { 68 struct { 69 __le32 magic; 70 __le32 version; 71 __le32 block_size; 72 __le32 pad; 73 __le64 n_blocks; 74 __le64 seq_count; 75 }; 76 __le64 padding[8]; 77 }; 78 struct wc_memory_entry entries[]; 79 }; 80 81 struct wc_entry { 82 struct rb_node rb_node; 83 struct list_head lru; 84 unsigned short wc_list_contiguous; 85 #if BITS_PER_LONG == 64 86 bool write_in_progress : 1; 87 unsigned long index : 47; 88 #else 89 bool write_in_progress; 90 unsigned long index; 91 #endif 92 unsigned long age; 93 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 94 uint64_t original_sector; 95 uint64_t seq_count; 96 #endif 97 }; 98 99 #ifdef DM_WRITECACHE_HAS_PMEM 100 #define WC_MODE_PMEM(wc) ((wc)->pmem_mode) 101 #define WC_MODE_FUA(wc) ((wc)->writeback_fua) 102 #else 103 #define WC_MODE_PMEM(wc) false 104 #define WC_MODE_FUA(wc) false 105 #endif 106 #define WC_MODE_SORT_FREELIST(wc) (!WC_MODE_PMEM(wc)) 107 108 struct dm_writecache { 109 struct mutex lock; 110 struct list_head lru; 111 union { 112 struct list_head freelist; 113 struct { 114 struct rb_root freetree; 115 struct wc_entry *current_free; 116 }; 117 }; 118 struct rb_root tree; 119 120 size_t freelist_size; 121 size_t writeback_size; 122 size_t freelist_high_watermark; 123 size_t freelist_low_watermark; 124 unsigned long max_age; 125 unsigned long pause; 126 127 unsigned int uncommitted_blocks; 128 unsigned int autocommit_blocks; 129 unsigned int max_writeback_jobs; 130 131 int error; 132 133 unsigned long autocommit_jiffies; 134 struct timer_list autocommit_timer; 135 struct wait_queue_head freelist_wait; 136 137 struct timer_list max_age_timer; 138 139 atomic_t bio_in_progress[2]; 140 struct wait_queue_head bio_in_progress_wait[2]; 141 142 struct dm_target *ti; 143 struct 
dm_dev *dev; 144 struct dm_dev *ssd_dev; 145 sector_t start_sector; 146 void *memory_map; 147 uint64_t memory_map_size; 148 size_t metadata_sectors; 149 size_t n_blocks; 150 uint64_t seq_count; 151 sector_t data_device_sectors; 152 void *block_start; 153 struct wc_entry *entries; 154 unsigned int block_size; 155 unsigned char block_size_bits; 156 157 bool pmem_mode:1; 158 bool writeback_fua:1; 159 160 bool overwrote_committed:1; 161 bool memory_vmapped:1; 162 163 bool start_sector_set:1; 164 bool high_wm_percent_set:1; 165 bool low_wm_percent_set:1; 166 bool max_writeback_jobs_set:1; 167 bool autocommit_blocks_set:1; 168 bool autocommit_time_set:1; 169 bool max_age_set:1; 170 bool writeback_fua_set:1; 171 bool flush_on_suspend:1; 172 bool cleaner:1; 173 bool cleaner_set:1; 174 bool metadata_only:1; 175 bool pause_set:1; 176 177 unsigned int high_wm_percent_value; 178 unsigned int low_wm_percent_value; 179 unsigned int autocommit_time_value; 180 unsigned int max_age_value; 181 unsigned int pause_value; 182 183 unsigned int writeback_all; 184 struct workqueue_struct *writeback_wq; 185 struct work_struct writeback_work; 186 struct work_struct flush_work; 187 188 struct dm_io_tracker iot; 189 190 struct dm_io_client *dm_io; 191 192 raw_spinlock_t endio_list_lock; 193 struct list_head endio_list; 194 struct task_struct *endio_thread; 195 196 struct task_struct *flush_thread; 197 struct bio_list flush_list; 198 199 struct dm_kcopyd_client *dm_kcopyd; 200 unsigned long *dirty_bitmap; 201 unsigned int dirty_bitmap_size; 202 203 struct bio_set bio_set; 204 mempool_t copy_pool; 205 206 struct { 207 unsigned long long reads; 208 unsigned long long read_hits; 209 unsigned long long writes; 210 unsigned long long write_hits_uncommitted; 211 unsigned long long write_hits_committed; 212 unsigned long long writes_around; 213 unsigned long long writes_allocate; 214 unsigned long long writes_blocked_on_freelist; 215 unsigned long long flushes; 216 unsigned long long discards; 217 } stats; 218 }; 219 220 #define WB_LIST_INLINE 16 221 222 struct writeback_struct { 223 struct list_head endio_entry; 224 struct dm_writecache *wc; 225 struct wc_entry **wc_list; 226 unsigned int wc_list_n; 227 struct wc_entry *wc_list_inline[WB_LIST_INLINE]; 228 struct bio bio; 229 }; 230 231 struct copy_struct { 232 struct list_head endio_entry; 233 struct dm_writecache *wc; 234 struct wc_entry *e; 235 unsigned int n_entries; 236 int error; 237 }; 238 239 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(dm_writecache_throttle, 240 "A percentage of time allocated for data copying"); 241 242 static void wc_lock(struct dm_writecache *wc) 243 { 244 mutex_lock(&wc->lock); 245 } 246 247 static void wc_unlock(struct dm_writecache *wc) 248 { 249 mutex_unlock(&wc->lock); 250 } 251 252 #ifdef DM_WRITECACHE_HAS_PMEM 253 static int persistent_memory_claim(struct dm_writecache *wc) 254 { 255 int r; 256 loff_t s; 257 long p, da; 258 unsigned long pfn; 259 int id; 260 struct page **pages; 261 sector_t offset; 262 263 wc->memory_vmapped = false; 264 265 s = wc->memory_map_size; 266 p = s >> PAGE_SHIFT; 267 if (!p) { 268 r = -EINVAL; 269 goto err1; 270 } 271 if (p != s >> PAGE_SHIFT) { 272 r = -EOVERFLOW; 273 goto err1; 274 } 275 276 offset = get_start_sect(wc->ssd_dev->bdev); 277 if (offset & (PAGE_SIZE / 512 - 1)) { 278 r = -EINVAL; 279 goto err1; 280 } 281 offset >>= PAGE_SHIFT - 9; 282 283 id = dax_read_lock(); 284 285 da = dax_direct_access(wc->ssd_dev->dax_dev, offset, p, DAX_ACCESS, 286 &wc->memory_map, &pfn); 287 if (da < 0) { 288 
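		/*
		 * Note (summary of the DAX API as used here, not upstream text):
		 * dax_direct_access() returns a negative errno on failure, or the
		 * number of contiguously mapped pages starting at the offset.
		 */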
wc->memory_map = NULL; 289 r = da; 290 goto err2; 291 } 292 if (!pfn_valid(pfn)) { 293 wc->memory_map = NULL; 294 r = -EOPNOTSUPP; 295 goto err2; 296 } 297 if (da != p) { 298 long i; 299 300 wc->memory_map = NULL; 301 pages = vmalloc_array(p, sizeof(struct page *)); 302 if (!pages) { 303 r = -ENOMEM; 304 goto err2; 305 } 306 i = 0; 307 do { 308 long daa; 309 310 daa = dax_direct_access(wc->ssd_dev->dax_dev, offset + i, 311 p - i, DAX_ACCESS, NULL, &pfn); 312 if (daa <= 0) { 313 r = daa ? daa : -EINVAL; 314 goto err3; 315 } 316 if (!pfn_valid(pfn)) { 317 r = -EOPNOTSUPP; 318 goto err3; 319 } 320 while (daa-- && i < p) { 321 pages[i++] = pfn_to_page(pfn); 322 pfn++; 323 if (!(i & 15)) 324 cond_resched(); 325 } 326 } while (i < p); 327 wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL); 328 if (!wc->memory_map) { 329 r = -ENOMEM; 330 goto err3; 331 } 332 vfree(pages); 333 wc->memory_vmapped = true; 334 } 335 336 dax_read_unlock(id); 337 338 wc->memory_map += (size_t)wc->start_sector << SECTOR_SHIFT; 339 wc->memory_map_size -= (size_t)wc->start_sector << SECTOR_SHIFT; 340 341 return 0; 342 err3: 343 vfree(pages); 344 err2: 345 dax_read_unlock(id); 346 err1: 347 return r; 348 } 349 #else 350 static int persistent_memory_claim(struct dm_writecache *wc) 351 { 352 return -EOPNOTSUPP; 353 } 354 #endif 355 356 static void persistent_memory_release(struct dm_writecache *wc) 357 { 358 if (wc->memory_vmapped) 359 vunmap(wc->memory_map - ((size_t)wc->start_sector << SECTOR_SHIFT)); 360 } 361 362 static struct page *persistent_memory_page(void *addr) 363 { 364 if (is_vmalloc_addr(addr)) 365 return vmalloc_to_page(addr); 366 else 367 return virt_to_page(addr); 368 } 369 370 static unsigned int persistent_memory_page_offset(void *addr) 371 { 372 return (unsigned long)addr & (PAGE_SIZE - 1); 373 } 374 375 static void persistent_memory_flush_cache(void *ptr, size_t size) 376 { 377 if (is_vmalloc_addr(ptr)) 378 flush_kernel_vmap_range(ptr, size); 379 } 380 381 static void persistent_memory_invalidate_cache(void *ptr, size_t size) 382 { 383 if (is_vmalloc_addr(ptr)) 384 invalidate_kernel_vmap_range(ptr, size); 385 } 386 387 static struct wc_memory_superblock *sb(struct dm_writecache *wc) 388 { 389 return wc->memory_map; 390 } 391 392 static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e) 393 { 394 return &sb(wc)->entries[e->index]; 395 } 396 397 static void *memory_data(struct dm_writecache *wc, struct wc_entry *e) 398 { 399 return (char *)wc->block_start + (e->index << wc->block_size_bits); 400 } 401 402 static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e) 403 { 404 return wc->start_sector + wc->metadata_sectors + 405 ((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT)); 406 } 407 408 static uint64_t read_original_sector(struct dm_writecache *wc, struct wc_entry *e) 409 { 410 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 411 return e->original_sector; 412 #else 413 return le64_to_cpu(memory_entry(wc, e)->original_sector); 414 #endif 415 } 416 417 static uint64_t read_seq_count(struct dm_writecache *wc, struct wc_entry *e) 418 { 419 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 420 return e->seq_count; 421 #else 422 return le64_to_cpu(memory_entry(wc, e)->seq_count); 423 #endif 424 } 425 426 static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e) 427 { 428 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 429 e->seq_count = -1; 430 #endif 431 pmem_assign(memory_entry(wc, e)->seq_count, cpu_to_le64(-1)); 432 } 433 434 static void 
write_original_sector_seq_count(struct dm_writecache *wc, struct wc_entry *e,
				uint64_t original_sector, uint64_t seq_count)
{
	struct wc_memory_entry me;
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	e->original_sector = original_sector;
	e->seq_count = seq_count;
#endif
	me.original_sector = cpu_to_le64(original_sector);
	me.seq_count = cpu_to_le64(seq_count);
	pmem_assign(*memory_entry(wc, e), me);
}

#define writecache_error(wc, err, msg, arg...)				\
	do {								\
		if (!cmpxchg(&(wc)->error, 0, err))			\
			DMERR(msg, ##arg);				\
		wake_up(&(wc)->freelist_wait);				\
	} while (0)

#define writecache_has_error(wc)	(unlikely(READ_ONCE((wc)->error)))

static void writecache_flush_all_metadata(struct dm_writecache *wc)
{
	if (!WC_MODE_PMEM(wc))
		memset(wc->dirty_bitmap, -1, wc->dirty_bitmap_size);
}

static void writecache_flush_region(struct dm_writecache *wc, void *ptr, size_t size)
{
	if (!WC_MODE_PMEM(wc))
		__set_bit(((char *)ptr - (char *)wc->memory_map) / BITMAP_GRANULARITY,
			  wc->dirty_bitmap);
}

static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev);

struct io_notify {
	struct dm_writecache *wc;
	struct completion c;
	atomic_t count;
};

static void writecache_notify_io(unsigned long error, void *context)
{
	struct io_notify *endio = context;

	if (unlikely(error != 0))
		writecache_error(endio->wc, -EIO, "error writing metadata");
	BUG_ON(atomic_read(&endio->count) <= 0);
	if (atomic_dec_and_test(&endio->count))
		complete(&endio->c);
}

static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
{
	wait_event(wc->bio_in_progress_wait[direction],
		   !atomic_read(&wc->bio_in_progress[direction]));
}

static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
{
	struct dm_io_region region;
	struct dm_io_request req;
	struct io_notify endio = {
		wc,
		COMPLETION_INITIALIZER_ONSTACK(endio.c),
		ATOMIC_INIT(1),
	};
	unsigned int bitmap_bits = wc->dirty_bitmap_size * 8;
	unsigned int i = 0;

	while (1) {
		unsigned int j;

		i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i);
		if (unlikely(i == bitmap_bits))
			break;
		j = find_next_zero_bit(wc->dirty_bitmap, bitmap_bits, i);

		region.bdev = wc->ssd_dev->bdev;
		region.sector = (sector_t)i * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
		region.count = (sector_t)(j - i) * (BITMAP_GRANULARITY >> SECTOR_SHIFT);

		if (unlikely(region.sector >= wc->metadata_sectors))
			break;
		if (unlikely(region.sector + region.count > wc->metadata_sectors))
			region.count = wc->metadata_sectors - region.sector;

		region.sector += wc->start_sector;
		atomic_inc(&endio.count);
		req.bi_opf = REQ_OP_WRITE | REQ_SYNC;
		req.mem.type = DM_IO_VMA;
		req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
		req.client = wc->dm_io;
		req.notify.fn = writecache_notify_io;
		req.notify.context = &endio;

		/* writing via async dm-io (implied by notify.fn above) won't return an error */
		(void) dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
		i = j;
	}

	writecache_notify_io(0, &endio);
	wait_for_completion_io(&endio.c);

	if (wait_for_ios)
		writecache_wait_for_ios(wc, WRITE);

	writecache_disk_flush(wc, wc->ssd_dev);

	memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
}

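/*
 * Rough overview of the metadata commit path in SSD mode: callers mark the
 * metadata they touched with writecache_flush_region() and later call
 * writecache_commit_flushed(), which ends up in ssd_commit_flushed() above.
 * Each bit in wc->dirty_bitmap covers BITMAP_GRANULARITY bytes of the
 * metadata area, so with the default granularity of 65536 bytes one bit
 * stands for 128 512-byte sectors.  A typical sequence (compare
 * writecache_free_entry() and its callers) looks like:
 *
 *	writecache_flush_region(wc, memory_entry(wc, e),
 *				sizeof(struct wc_memory_entry));
 *	...
 *	writecache_commit_flushed(wc, false);
 *
 * In pmem mode writecache_flush_region() is a no-op and the commit reduces
 * to a persistent-memory write barrier, pmem_wmb().
 */
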
static void ssd_commit_superblock(struct dm_writecache *wc)
{
	int r;
	struct dm_io_region region;
	struct dm_io_request req;

	region.bdev = wc->ssd_dev->bdev;
	region.sector = 0;
	region.count = max(4096U, wc->block_size) >> SECTOR_SHIFT;

	if (unlikely(region.sector + region.count > wc->metadata_sectors))
		region.count = wc->metadata_sectors - region.sector;

	region.sector += wc->start_sector;

	req.bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_FUA;
	req.mem.type = DM_IO_VMA;
	req.mem.ptr.vma = (char *)wc->memory_map;
	req.client = wc->dm_io;
	req.notify.fn = NULL;
	req.notify.context = NULL;

	r = dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
	if (unlikely(r))
		writecache_error(wc, r, "error writing superblock");
}

static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
{
	if (WC_MODE_PMEM(wc))
		pmem_wmb();
	else
		ssd_commit_flushed(wc, wait_for_ios);
}

static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
{
	int r;
	struct dm_io_region region;
	struct dm_io_request req;

	region.bdev = dev->bdev;
	region.sector = 0;
	region.count = 0;
	req.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
	req.mem.type = DM_IO_KMEM;
	req.mem.ptr.addr = NULL;
	req.client = wc->dm_io;
	req.notify.fn = NULL;

	r = dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
	if (unlikely(r))
		writecache_error(wc, r, "error flushing metadata: %d", r);
}

#define WFE_RETURN_FOLLOWING	1
#define WFE_LOWEST_SEQ		2

static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
					      uint64_t block, int flags)
{
	struct wc_entry *e;
	struct rb_node *node = wc->tree.rb_node;

	if (unlikely(!node))
		return NULL;

	while (1) {
		e = container_of(node, struct wc_entry, rb_node);
		if (read_original_sector(wc, e) == block)
			break;

		node = (read_original_sector(wc, e) >= block ?
621 e->rb_node.rb_left : e->rb_node.rb_right); 622 if (unlikely(!node)) { 623 if (!(flags & WFE_RETURN_FOLLOWING)) 624 return NULL; 625 if (read_original_sector(wc, e) >= block) 626 return e; 627 628 node = rb_next(&e->rb_node); 629 if (unlikely(!node)) 630 return NULL; 631 632 e = container_of(node, struct wc_entry, rb_node); 633 return e; 634 } 635 } 636 637 while (1) { 638 struct wc_entry *e2; 639 640 if (flags & WFE_LOWEST_SEQ) 641 node = rb_prev(&e->rb_node); 642 else 643 node = rb_next(&e->rb_node); 644 if (unlikely(!node)) 645 return e; 646 e2 = container_of(node, struct wc_entry, rb_node); 647 if (read_original_sector(wc, e2) != block) 648 return e; 649 e = e2; 650 } 651 } 652 653 static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *ins) 654 { 655 struct wc_entry *e; 656 struct rb_node **node = &wc->tree.rb_node, *parent = NULL; 657 658 while (*node) { 659 e = container_of(*node, struct wc_entry, rb_node); 660 parent = &e->rb_node; 661 if (read_original_sector(wc, e) > read_original_sector(wc, ins)) 662 node = &parent->rb_left; 663 else 664 node = &parent->rb_right; 665 } 666 rb_link_node(&ins->rb_node, parent, node); 667 rb_insert_color(&ins->rb_node, &wc->tree); 668 list_add(&ins->lru, &wc->lru); 669 ins->age = jiffies; 670 } 671 672 static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e) 673 { 674 list_del(&e->lru); 675 rb_erase(&e->rb_node, &wc->tree); 676 } 677 678 static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry *e) 679 { 680 if (WC_MODE_SORT_FREELIST(wc)) { 681 struct rb_node **node = &wc->freetree.rb_node, *parent = NULL; 682 683 if (unlikely(!*node)) 684 wc->current_free = e; 685 while (*node) { 686 parent = *node; 687 if (&e->rb_node < *node) 688 node = &parent->rb_left; 689 else 690 node = &parent->rb_right; 691 } 692 rb_link_node(&e->rb_node, parent, node); 693 rb_insert_color(&e->rb_node, &wc->freetree); 694 } else { 695 list_add_tail(&e->lru, &wc->freelist); 696 } 697 wc->freelist_size++; 698 } 699 700 static inline void writecache_verify_watermark(struct dm_writecache *wc) 701 { 702 if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark)) 703 queue_work(wc->writeback_wq, &wc->writeback_work); 704 } 705 706 static void writecache_max_age_timer(struct timer_list *t) 707 { 708 struct dm_writecache *wc = timer_container_of(wc, t, max_age_timer); 709 710 if (!dm_suspended(wc->ti) && !writecache_has_error(wc)) { 711 queue_work(wc->writeback_wq, &wc->writeback_work); 712 mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV); 713 } 714 } 715 716 static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector) 717 { 718 struct wc_entry *e; 719 720 if (WC_MODE_SORT_FREELIST(wc)) { 721 struct rb_node *next; 722 723 if (unlikely(!wc->current_free)) 724 return NULL; 725 e = wc->current_free; 726 if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector)) 727 return NULL; 728 next = rb_next(&e->rb_node); 729 rb_erase(&e->rb_node, &wc->freetree); 730 if (unlikely(!next)) 731 next = rb_first(&wc->freetree); 732 wc->current_free = next ? 
container_of(next, struct wc_entry, rb_node) : NULL; 733 } else { 734 if (unlikely(list_empty(&wc->freelist))) 735 return NULL; 736 e = container_of(wc->freelist.next, struct wc_entry, lru); 737 if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector)) 738 return NULL; 739 list_del(&e->lru); 740 } 741 wc->freelist_size--; 742 743 writecache_verify_watermark(wc); 744 745 return e; 746 } 747 748 static void writecache_free_entry(struct dm_writecache *wc, struct wc_entry *e) 749 { 750 writecache_unlink(wc, e); 751 writecache_add_to_freelist(wc, e); 752 clear_seq_count(wc, e); 753 writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry)); 754 if (unlikely(waitqueue_active(&wc->freelist_wait))) 755 wake_up(&wc->freelist_wait); 756 } 757 758 static void writecache_wait_on_freelist(struct dm_writecache *wc) 759 { 760 DEFINE_WAIT(wait); 761 762 prepare_to_wait(&wc->freelist_wait, &wait, TASK_UNINTERRUPTIBLE); 763 wc_unlock(wc); 764 io_schedule(); 765 finish_wait(&wc->freelist_wait, &wait); 766 wc_lock(wc); 767 } 768 769 static void writecache_poison_lists(struct dm_writecache *wc) 770 { 771 /* 772 * Catch incorrect access to these values while the device is suspended. 773 */ 774 memset(&wc->tree, -1, sizeof(wc->tree)); 775 wc->lru.next = LIST_POISON1; 776 wc->lru.prev = LIST_POISON2; 777 wc->freelist.next = LIST_POISON1; 778 wc->freelist.prev = LIST_POISON2; 779 } 780 781 static void writecache_flush_entry(struct dm_writecache *wc, struct wc_entry *e) 782 { 783 writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry)); 784 if (WC_MODE_PMEM(wc)) 785 writecache_flush_region(wc, memory_data(wc, e), wc->block_size); 786 } 787 788 static bool writecache_entry_is_committed(struct dm_writecache *wc, struct wc_entry *e) 789 { 790 return read_seq_count(wc, e) < wc->seq_count; 791 } 792 793 static void writecache_flush(struct dm_writecache *wc) 794 { 795 struct wc_entry *e, *e2; 796 bool need_flush_after_free; 797 798 wc->uncommitted_blocks = 0; 799 timer_delete(&wc->autocommit_timer); 800 801 if (list_empty(&wc->lru)) 802 return; 803 804 e = container_of(wc->lru.next, struct wc_entry, lru); 805 if (writecache_entry_is_committed(wc, e)) { 806 if (wc->overwrote_committed) { 807 writecache_wait_for_ios(wc, WRITE); 808 writecache_disk_flush(wc, wc->ssd_dev); 809 wc->overwrote_committed = false; 810 } 811 return; 812 } 813 while (1) { 814 writecache_flush_entry(wc, e); 815 if (unlikely(e->lru.next == &wc->lru)) 816 break; 817 e2 = container_of(e->lru.next, struct wc_entry, lru); 818 if (writecache_entry_is_committed(wc, e2)) 819 break; 820 e = e2; 821 cond_resched(); 822 } 823 writecache_commit_flushed(wc, true); 824 825 wc->seq_count++; 826 pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count)); 827 if (WC_MODE_PMEM(wc)) 828 writecache_commit_flushed(wc, false); 829 else 830 ssd_commit_superblock(wc); 831 832 wc->overwrote_committed = false; 833 834 need_flush_after_free = false; 835 while (1) { 836 /* Free another committed entry with lower seq-count */ 837 struct rb_node *rb_node = rb_prev(&e->rb_node); 838 839 if (rb_node) { 840 e2 = container_of(rb_node, struct wc_entry, rb_node); 841 if (read_original_sector(wc, e2) == read_original_sector(wc, e) && 842 likely(!e2->write_in_progress)) { 843 writecache_free_entry(wc, e2); 844 need_flush_after_free = true; 845 } 846 } 847 if (unlikely(e->lru.prev == &wc->lru)) 848 break; 849 e = container_of(e->lru.prev, struct wc_entry, lru); 850 cond_resched(); 851 } 852 853 if 
(need_flush_after_free) 854 writecache_commit_flushed(wc, false); 855 } 856 857 static void writecache_flush_work(struct work_struct *work) 858 { 859 struct dm_writecache *wc = container_of(work, struct dm_writecache, flush_work); 860 861 wc_lock(wc); 862 writecache_flush(wc); 863 wc_unlock(wc); 864 } 865 866 static void writecache_autocommit_timer(struct timer_list *t) 867 { 868 struct dm_writecache *wc = timer_container_of(wc, t, autocommit_timer); 869 870 if (!writecache_has_error(wc)) 871 queue_work(wc->writeback_wq, &wc->flush_work); 872 } 873 874 static void writecache_schedule_autocommit(struct dm_writecache *wc) 875 { 876 if (!timer_pending(&wc->autocommit_timer)) 877 mod_timer(&wc->autocommit_timer, jiffies + wc->autocommit_jiffies); 878 } 879 880 static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_t end) 881 { 882 struct wc_entry *e; 883 bool discarded_something = false; 884 885 e = writecache_find_entry(wc, start, WFE_RETURN_FOLLOWING | WFE_LOWEST_SEQ); 886 if (unlikely(!e)) 887 return; 888 889 while (read_original_sector(wc, e) < end) { 890 struct rb_node *node = rb_next(&e->rb_node); 891 892 if (likely(!e->write_in_progress)) { 893 if (!discarded_something) { 894 if (!WC_MODE_PMEM(wc)) { 895 writecache_wait_for_ios(wc, READ); 896 writecache_wait_for_ios(wc, WRITE); 897 } 898 discarded_something = true; 899 } 900 if (!writecache_entry_is_committed(wc, e)) 901 wc->uncommitted_blocks--; 902 writecache_free_entry(wc, e); 903 } 904 905 if (unlikely(!node)) 906 break; 907 908 e = container_of(node, struct wc_entry, rb_node); 909 } 910 911 if (discarded_something) 912 writecache_commit_flushed(wc, false); 913 } 914 915 static bool writecache_wait_for_writeback(struct dm_writecache *wc) 916 { 917 if (wc->writeback_size) { 918 writecache_wait_on_freelist(wc); 919 return true; 920 } 921 return false; 922 } 923 924 static void writecache_suspend(struct dm_target *ti) 925 { 926 struct dm_writecache *wc = ti->private; 927 bool flush_on_suspend; 928 929 timer_delete_sync(&wc->autocommit_timer); 930 timer_delete_sync(&wc->max_age_timer); 931 932 wc_lock(wc); 933 writecache_flush(wc); 934 flush_on_suspend = wc->flush_on_suspend; 935 if (flush_on_suspend) { 936 wc->flush_on_suspend = false; 937 wc->writeback_all++; 938 queue_work(wc->writeback_wq, &wc->writeback_work); 939 } 940 wc_unlock(wc); 941 942 drain_workqueue(wc->writeback_wq); 943 944 wc_lock(wc); 945 if (flush_on_suspend) 946 wc->writeback_all--; 947 while (writecache_wait_for_writeback(wc)) 948 ; 949 950 if (WC_MODE_PMEM(wc)) 951 persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size); 952 953 writecache_poison_lists(wc); 954 955 wc_unlock(wc); 956 } 957 958 static int writecache_alloc_entries(struct dm_writecache *wc) 959 { 960 size_t b; 961 962 if (wc->entries) 963 return 0; 964 wc->entries = vmalloc_array(wc->n_blocks, sizeof(struct wc_entry)); 965 if (!wc->entries) 966 return -ENOMEM; 967 for (b = 0; b < wc->n_blocks; b++) { 968 struct wc_entry *e = &wc->entries[b]; 969 970 e->index = b; 971 e->write_in_progress = false; 972 cond_resched(); 973 } 974 975 return 0; 976 } 977 978 static int writecache_read_metadata(struct dm_writecache *wc, sector_t n_sectors) 979 { 980 struct dm_io_region region; 981 struct dm_io_request req; 982 983 region.bdev = wc->ssd_dev->bdev; 984 region.sector = wc->start_sector; 985 region.count = n_sectors; 986 req.bi_opf = REQ_OP_READ | REQ_SYNC; 987 req.mem.type = DM_IO_VMA; 988 req.mem.ptr.vma = (char *)wc->memory_map; 989 req.client = wc->dm_io; 990 
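	/* notify.fn is left NULL, so dm_io() completes synchronously and returns the result */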
	req.notify.fn = NULL;

	return dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
}

static void writecache_resume(struct dm_target *ti)
{
	struct dm_writecache *wc = ti->private;
	size_t b;
	bool need_flush = false;
	__le64 sb_seq_count;
	int r;

	wc_lock(wc);

	wc->data_device_sectors = bdev_nr_sectors(wc->dev->bdev);

	if (WC_MODE_PMEM(wc)) {
		persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size);
	} else {
		r = writecache_read_metadata(wc, wc->metadata_sectors);
		if (r) {
			size_t sb_entries_offset;

			writecache_error(wc, r, "unable to read metadata: %d", r);
			sb_entries_offset = offsetof(struct wc_memory_superblock, entries);
			memset((char *)wc->memory_map + sb_entries_offset, -1,
			       (wc->metadata_sectors << SECTOR_SHIFT) - sb_entries_offset);
		}
	}

	wc->tree = RB_ROOT;
	INIT_LIST_HEAD(&wc->lru);
	if (WC_MODE_SORT_FREELIST(wc)) {
		wc->freetree = RB_ROOT;
		wc->current_free = NULL;
	} else {
		INIT_LIST_HEAD(&wc->freelist);
	}
	wc->freelist_size = 0;

	r = copy_mc_to_kernel(&sb_seq_count, &sb(wc)->seq_count,
			      sizeof(uint64_t));
	if (r) {
		writecache_error(wc, r, "hardware memory error when reading superblock: %d", r);
		sb_seq_count = cpu_to_le64(0);
	}
	wc->seq_count = le64_to_cpu(sb_seq_count);

#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	for (b = 0; b < wc->n_blocks; b++) {
		struct wc_entry *e = &wc->entries[b];
		struct wc_memory_entry wme;

		if (writecache_has_error(wc)) {
			e->original_sector = -1;
			e->seq_count = -1;
			continue;
		}
		r = copy_mc_to_kernel(&wme, memory_entry(wc, e),
				      sizeof(struct wc_memory_entry));
		if (r) {
			writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d",
					 (unsigned long)b, r);
			e->original_sector = -1;
			e->seq_count = -1;
		} else {
			e->original_sector = le64_to_cpu(wme.original_sector);
			e->seq_count = le64_to_cpu(wme.seq_count);
		}
		cond_resched();
	}
#endif
	for (b = 0; b < wc->n_blocks; b++) {
		struct wc_entry *e = &wc->entries[b];

		if (!writecache_entry_is_committed(wc, e)) {
			if (read_seq_count(wc, e) != -1) {
erase_this:
				clear_seq_count(wc, e);
				need_flush = true;
			}
			writecache_add_to_freelist(wc, e);
		} else {
			struct wc_entry *old;

			old = writecache_find_entry(wc, read_original_sector(wc, e), 0);
			if (!old) {
				writecache_insert_entry(wc, e);
			} else {
				if (read_seq_count(wc, old) == read_seq_count(wc, e)) {
					writecache_error(wc, -EINVAL,
						 "two identical entries, position %llu, sector %llu, sequence %llu",
						 (unsigned long long)b, (unsigned long long)read_original_sector(wc, e),
						 (unsigned long long)read_seq_count(wc, e));
				}
				if (read_seq_count(wc, old) > read_seq_count(wc, e)) {
					goto erase_this;
				} else {
					writecache_free_entry(wc, old);
					writecache_insert_entry(wc, e);
					need_flush = true;
				}
			}
		}
		cond_resched();
	}

	if (need_flush) {
		writecache_flush_all_metadata(wc);
		writecache_commit_flushed(wc, false);
	}

	writecache_verify_watermark(wc);

	if (wc->max_age != MAX_AGE_UNSPECIFIED)
		mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);

	wc_unlock(wc);
}

static int process_flush_mesg(unsigned int argc, char
**argv, struct dm_writecache *wc) 1112 { 1113 if (argc != 1) 1114 return -EINVAL; 1115 1116 wc_lock(wc); 1117 if (dm_suspended(wc->ti)) { 1118 wc_unlock(wc); 1119 return -EBUSY; 1120 } 1121 if (writecache_has_error(wc)) { 1122 wc_unlock(wc); 1123 return -EIO; 1124 } 1125 1126 writecache_flush(wc); 1127 wc->writeback_all++; 1128 queue_work(wc->writeback_wq, &wc->writeback_work); 1129 wc_unlock(wc); 1130 1131 flush_workqueue(wc->writeback_wq); 1132 1133 wc_lock(wc); 1134 wc->writeback_all--; 1135 if (writecache_has_error(wc)) { 1136 wc_unlock(wc); 1137 return -EIO; 1138 } 1139 wc_unlock(wc); 1140 1141 return 0; 1142 } 1143 1144 static int process_flush_on_suspend_mesg(unsigned int argc, char **argv, struct dm_writecache *wc) 1145 { 1146 if (argc != 1) 1147 return -EINVAL; 1148 1149 wc_lock(wc); 1150 wc->flush_on_suspend = true; 1151 wc_unlock(wc); 1152 1153 return 0; 1154 } 1155 1156 static void activate_cleaner(struct dm_writecache *wc) 1157 { 1158 wc->flush_on_suspend = true; 1159 wc->cleaner = true; 1160 wc->freelist_high_watermark = wc->n_blocks; 1161 wc->freelist_low_watermark = wc->n_blocks; 1162 } 1163 1164 static int process_cleaner_mesg(unsigned int argc, char **argv, struct dm_writecache *wc) 1165 { 1166 if (argc != 1) 1167 return -EINVAL; 1168 1169 wc_lock(wc); 1170 activate_cleaner(wc); 1171 if (!dm_suspended(wc->ti)) 1172 writecache_verify_watermark(wc); 1173 wc_unlock(wc); 1174 1175 return 0; 1176 } 1177 1178 static int process_clear_stats_mesg(unsigned int argc, char **argv, struct dm_writecache *wc) 1179 { 1180 if (argc != 1) 1181 return -EINVAL; 1182 1183 wc_lock(wc); 1184 memset(&wc->stats, 0, sizeof(wc->stats)); 1185 wc_unlock(wc); 1186 1187 return 0; 1188 } 1189 1190 static int writecache_message(struct dm_target *ti, unsigned int argc, char **argv, 1191 char *result, unsigned int maxlen) 1192 { 1193 int r = -EINVAL; 1194 struct dm_writecache *wc = ti->private; 1195 1196 if (!strcasecmp(argv[0], "flush")) 1197 r = process_flush_mesg(argc, argv, wc); 1198 else if (!strcasecmp(argv[0], "flush_on_suspend")) 1199 r = process_flush_on_suspend_mesg(argc, argv, wc); 1200 else if (!strcasecmp(argv[0], "cleaner")) 1201 r = process_cleaner_mesg(argc, argv, wc); 1202 else if (!strcasecmp(argv[0], "clear_stats")) 1203 r = process_clear_stats_mesg(argc, argv, wc); 1204 else 1205 DMERR("unrecognised message received: %s", argv[0]); 1206 1207 return r; 1208 } 1209 1210 static void memcpy_flushcache_optimized(void *dest, void *source, size_t size) 1211 { 1212 /* 1213 * clflushopt performs better with block size 1024, 2048, 4096 1214 * non-temporal stores perform better with block size 512 1215 * 1216 * block size 512 1024 2048 4096 1217 * movnti 496 MB/s 642 MB/s 725 MB/s 744 MB/s 1218 * clflushopt 373 MB/s 688 MB/s 1.1 GB/s 1.2 GB/s 1219 * 1220 * We see that movnti performs better for 512-byte blocks, and 1221 * clflushopt performs better for 1024-byte and larger blocks. So, we 1222 * prefer clflushopt for sizes >= 768. 1223 * 1224 * NOTE: this happens to be the case now (with dm-writecache's single 1225 * threaded model) but re-evaluate this once memcpy_flushcache() is 1226 * enabled to use movdir64b which might invalidate this performance 1227 * advantage seen with cache-allocating-writes plus flushing. 
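 *
 * (The "size >= 768" cutoff used below appears to be roughly the midpoint
 * of the measured 512-byte and 1024-byte block sizes.)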
1228 */ 1229 #ifdef CONFIG_X86 1230 if (static_cpu_has(X86_FEATURE_CLFLUSHOPT) && 1231 likely(boot_cpu_data.x86_clflush_size == 64) && 1232 likely(size >= 768)) { 1233 do { 1234 memcpy((void *)dest, (void *)source, 64); 1235 clflushopt((void *)dest); 1236 dest += 64; 1237 source += 64; 1238 size -= 64; 1239 } while (size >= 64); 1240 return; 1241 } 1242 #endif 1243 memcpy_flushcache(dest, source, size); 1244 } 1245 1246 static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data) 1247 { 1248 void *buf; 1249 unsigned int size; 1250 int rw = bio_data_dir(bio); 1251 unsigned int remaining_size = wc->block_size; 1252 1253 do { 1254 struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter); 1255 1256 buf = bvec_kmap_local(&bv); 1257 size = bv.bv_len; 1258 if (unlikely(size > remaining_size)) 1259 size = remaining_size; 1260 1261 if (rw == READ) { 1262 int r; 1263 1264 r = copy_mc_to_kernel(buf, data, size); 1265 flush_dcache_page(bio_page(bio)); 1266 if (unlikely(r)) { 1267 writecache_error(wc, r, "hardware memory error when reading data: %d", r); 1268 bio->bi_status = BLK_STS_IOERR; 1269 } 1270 } else { 1271 flush_dcache_page(bio_page(bio)); 1272 memcpy_flushcache_optimized(data, buf, size); 1273 } 1274 1275 kunmap_local(buf); 1276 1277 data = (char *)data + size; 1278 remaining_size -= size; 1279 bio_advance(bio, size); 1280 } while (unlikely(remaining_size)); 1281 } 1282 1283 static int writecache_flush_thread(void *data) 1284 { 1285 struct dm_writecache *wc = data; 1286 1287 while (1) { 1288 struct bio *bio; 1289 1290 wc_lock(wc); 1291 bio = bio_list_pop(&wc->flush_list); 1292 if (!bio) { 1293 set_current_state(TASK_INTERRUPTIBLE); 1294 wc_unlock(wc); 1295 1296 if (unlikely(kthread_should_stop())) { 1297 set_current_state(TASK_RUNNING); 1298 break; 1299 } 1300 1301 schedule(); 1302 continue; 1303 } 1304 1305 if (bio_op(bio) == REQ_OP_DISCARD) { 1306 writecache_discard(wc, bio->bi_iter.bi_sector, 1307 bio_end_sector(bio)); 1308 wc_unlock(wc); 1309 bio_set_dev(bio, wc->dev->bdev); 1310 submit_bio_noacct(bio); 1311 } else { 1312 writecache_flush(wc); 1313 wc_unlock(wc); 1314 if (writecache_has_error(wc)) 1315 bio->bi_status = BLK_STS_IOERR; 1316 bio_endio(bio); 1317 } 1318 } 1319 1320 return 0; 1321 } 1322 1323 static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio) 1324 { 1325 if (bio_list_empty(&wc->flush_list)) 1326 wake_up_process(wc->flush_thread); 1327 bio_list_add(&wc->flush_list, bio); 1328 } 1329 1330 enum wc_map_op { 1331 WC_MAP_SUBMIT, 1332 WC_MAP_REMAP, 1333 WC_MAP_REMAP_ORIGIN, 1334 WC_MAP_RETURN, 1335 WC_MAP_ERROR, 1336 }; 1337 1338 static void writecache_map_remap_origin(struct dm_writecache *wc, struct bio *bio, 1339 struct wc_entry *e) 1340 { 1341 if (e) { 1342 sector_t next_boundary = 1343 read_original_sector(wc, e) - bio->bi_iter.bi_sector; 1344 if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) 1345 dm_accept_partial_bio(bio, next_boundary); 1346 } 1347 } 1348 1349 static enum wc_map_op writecache_map_read(struct dm_writecache *wc, struct bio *bio) 1350 { 1351 enum wc_map_op map_op; 1352 struct wc_entry *e; 1353 1354 read_next_block: 1355 wc->stats.reads++; 1356 e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING); 1357 if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) { 1358 wc->stats.read_hits++; 1359 if (WC_MODE_PMEM(wc)) { 1360 bio_copy_block(wc, bio, memory_data(wc, e)); 1361 if (bio->bi_iter.bi_size) 1362 goto read_next_block; 1363 map_op = WC_MAP_SUBMIT; 1364 } else { 1365 
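			/*
			 * SSD-mode read hit: clip the bio to a single cache block
			 * and remap it to the cache device; if the block is not
			 * committed yet, wait for in-flight writes first.
			 */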
dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT); 1366 bio_set_dev(bio, wc->ssd_dev->bdev); 1367 bio->bi_iter.bi_sector = cache_sector(wc, e); 1368 if (!writecache_entry_is_committed(wc, e)) 1369 writecache_wait_for_ios(wc, WRITE); 1370 map_op = WC_MAP_REMAP; 1371 } 1372 } else { 1373 writecache_map_remap_origin(wc, bio, e); 1374 wc->stats.reads += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits; 1375 map_op = WC_MAP_REMAP_ORIGIN; 1376 } 1377 1378 return map_op; 1379 } 1380 1381 static void writecache_bio_copy_ssd(struct dm_writecache *wc, struct bio *bio, 1382 struct wc_entry *e, bool search_used) 1383 { 1384 unsigned int bio_size = wc->block_size; 1385 sector_t start_cache_sec = cache_sector(wc, e); 1386 sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT); 1387 1388 while (bio_size < bio->bi_iter.bi_size) { 1389 if (!search_used) { 1390 struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec); 1391 1392 if (!f) 1393 break; 1394 write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector + 1395 (bio_size >> SECTOR_SHIFT), wc->seq_count); 1396 writecache_insert_entry(wc, f); 1397 wc->uncommitted_blocks++; 1398 } else { 1399 struct wc_entry *f; 1400 struct rb_node *next = rb_next(&e->rb_node); 1401 1402 if (!next) 1403 break; 1404 f = container_of(next, struct wc_entry, rb_node); 1405 if (f != e + 1) 1406 break; 1407 if (read_original_sector(wc, f) != 1408 read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT)) 1409 break; 1410 if (unlikely(f->write_in_progress)) 1411 break; 1412 if (writecache_entry_is_committed(wc, f)) 1413 wc->overwrote_committed = true; 1414 e = f; 1415 } 1416 bio_size += wc->block_size; 1417 current_cache_sec += wc->block_size >> SECTOR_SHIFT; 1418 } 1419 1420 bio_set_dev(bio, wc->ssd_dev->bdev); 1421 bio->bi_iter.bi_sector = start_cache_sec; 1422 dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT); 1423 1424 wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits; 1425 wc->stats.writes_allocate += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits; 1426 1427 if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) { 1428 wc->uncommitted_blocks = 0; 1429 queue_work(wc->writeback_wq, &wc->flush_work); 1430 } else { 1431 writecache_schedule_autocommit(wc); 1432 } 1433 } 1434 1435 static enum wc_map_op writecache_map_write(struct dm_writecache *wc, struct bio *bio) 1436 { 1437 struct wc_entry *e; 1438 1439 do { 1440 bool found_entry = false; 1441 bool search_used = false; 1442 1443 if (writecache_has_error(wc)) { 1444 wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits; 1445 return WC_MAP_ERROR; 1446 } 1447 e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0); 1448 if (e) { 1449 if (!writecache_entry_is_committed(wc, e)) { 1450 wc->stats.write_hits_uncommitted++; 1451 search_used = true; 1452 goto bio_copy; 1453 } 1454 wc->stats.write_hits_committed++; 1455 if (!WC_MODE_PMEM(wc) && !e->write_in_progress) { 1456 wc->overwrote_committed = true; 1457 search_used = true; 1458 goto bio_copy; 1459 } 1460 found_entry = true; 1461 } else { 1462 if (unlikely(wc->cleaner) || 1463 (wc->metadata_only && !(bio->bi_opf & REQ_META))) 1464 goto direct_write; 1465 } 1466 e = writecache_pop_from_freelist(wc, (sector_t)-1); 1467 if (unlikely(!e)) { 1468 if (!WC_MODE_PMEM(wc) && !found_entry) { 1469 direct_write: 1470 e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING); 1471 writecache_map_remap_origin(wc, bio, e); 1472 wc->stats.writes_around += 
bio->bi_iter.bi_size >> wc->block_size_bits; 1473 wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits; 1474 return WC_MAP_REMAP_ORIGIN; 1475 } 1476 wc->stats.writes_blocked_on_freelist++; 1477 writecache_wait_on_freelist(wc); 1478 continue; 1479 } 1480 write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count); 1481 writecache_insert_entry(wc, e); 1482 wc->uncommitted_blocks++; 1483 wc->stats.writes_allocate++; 1484 bio_copy: 1485 if (WC_MODE_PMEM(wc)) { 1486 bio_copy_block(wc, bio, memory_data(wc, e)); 1487 wc->stats.writes++; 1488 } else { 1489 writecache_bio_copy_ssd(wc, bio, e, search_used); 1490 return WC_MAP_REMAP; 1491 } 1492 } while (bio->bi_iter.bi_size); 1493 1494 if (unlikely(bio->bi_opf & REQ_FUA || wc->uncommitted_blocks >= wc->autocommit_blocks)) 1495 writecache_flush(wc); 1496 else 1497 writecache_schedule_autocommit(wc); 1498 1499 return WC_MAP_SUBMIT; 1500 } 1501 1502 static enum wc_map_op writecache_map_flush(struct dm_writecache *wc, struct bio *bio) 1503 { 1504 if (writecache_has_error(wc)) 1505 return WC_MAP_ERROR; 1506 1507 if (WC_MODE_PMEM(wc)) { 1508 wc->stats.flushes++; 1509 writecache_flush(wc); 1510 if (writecache_has_error(wc)) 1511 return WC_MAP_ERROR; 1512 else if (unlikely(wc->cleaner) || unlikely(wc->metadata_only)) 1513 return WC_MAP_REMAP_ORIGIN; 1514 return WC_MAP_SUBMIT; 1515 } 1516 /* SSD: */ 1517 if (dm_bio_get_target_bio_nr(bio)) 1518 return WC_MAP_REMAP_ORIGIN; 1519 wc->stats.flushes++; 1520 writecache_offload_bio(wc, bio); 1521 return WC_MAP_RETURN; 1522 } 1523 1524 static enum wc_map_op writecache_map_discard(struct dm_writecache *wc, struct bio *bio) 1525 { 1526 wc->stats.discards += bio->bi_iter.bi_size >> wc->block_size_bits; 1527 1528 if (writecache_has_error(wc)) 1529 return WC_MAP_ERROR; 1530 1531 if (WC_MODE_PMEM(wc)) { 1532 writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio)); 1533 return WC_MAP_REMAP_ORIGIN; 1534 } 1535 /* SSD: */ 1536 writecache_offload_bio(wc, bio); 1537 return WC_MAP_RETURN; 1538 } 1539 1540 static int writecache_map(struct dm_target *ti, struct bio *bio) 1541 { 1542 struct dm_writecache *wc = ti->private; 1543 enum wc_map_op map_op; 1544 1545 bio->bi_private = NULL; 1546 1547 wc_lock(wc); 1548 1549 if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { 1550 map_op = writecache_map_flush(wc, bio); 1551 goto done; 1552 } 1553 1554 bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector); 1555 1556 if (unlikely((((unsigned int)bio->bi_iter.bi_sector | bio_sectors(bio)) & 1557 (wc->block_size / 512 - 1)) != 0)) { 1558 DMERR("I/O is not aligned, sector %llu, size %u, block size %u", 1559 (unsigned long long)bio->bi_iter.bi_sector, 1560 bio->bi_iter.bi_size, wc->block_size); 1561 map_op = WC_MAP_ERROR; 1562 goto done; 1563 } 1564 1565 if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) { 1566 map_op = writecache_map_discard(wc, bio); 1567 goto done; 1568 } 1569 1570 if (bio_data_dir(bio) == READ) 1571 map_op = writecache_map_read(wc, bio); 1572 else 1573 map_op = writecache_map_write(wc, bio); 1574 done: 1575 switch (map_op) { 1576 case WC_MAP_REMAP_ORIGIN: 1577 if (likely(wc->pause != 0)) { 1578 if (bio_op(bio) == REQ_OP_WRITE) { 1579 dm_iot_io_begin(&wc->iot, 1); 1580 bio->bi_private = (void *)2; 1581 } 1582 } 1583 bio_set_dev(bio, wc->dev->bdev); 1584 wc_unlock(wc); 1585 return DM_MAPIO_REMAPPED; 1586 1587 case WC_MAP_REMAP: 1588 /* make sure that writecache_end_io decrements bio_in_progress: */ 1589 bio->bi_private = (void *)1; 1590 
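		/*
		 * bi_private doubles as a tag for writecache_end_io():
		 * (void *)1 - drop the bio_in_progress count below,
		 * (void *)2 - finish the dm_iot accounting started above,
		 * NULL      - nothing to do.
		 */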
atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]); 1591 wc_unlock(wc); 1592 return DM_MAPIO_REMAPPED; 1593 1594 case WC_MAP_SUBMIT: 1595 wc_unlock(wc); 1596 bio_endio(bio); 1597 return DM_MAPIO_SUBMITTED; 1598 1599 case WC_MAP_RETURN: 1600 wc_unlock(wc); 1601 return DM_MAPIO_SUBMITTED; 1602 1603 case WC_MAP_ERROR: 1604 wc_unlock(wc); 1605 bio_io_error(bio); 1606 return DM_MAPIO_SUBMITTED; 1607 1608 default: 1609 BUG(); 1610 wc_unlock(wc); 1611 return DM_MAPIO_KILL; 1612 } 1613 } 1614 1615 static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status) 1616 { 1617 struct dm_writecache *wc = ti->private; 1618 1619 if (bio->bi_private == (void *)1) { 1620 int dir = bio_data_dir(bio); 1621 1622 if (atomic_dec_and_test(&wc->bio_in_progress[dir])) 1623 if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir]))) 1624 wake_up(&wc->bio_in_progress_wait[dir]); 1625 } else if (bio->bi_private == (void *)2) { 1626 dm_iot_io_end(&wc->iot, 1); 1627 } 1628 return 0; 1629 } 1630 1631 static int writecache_iterate_devices(struct dm_target *ti, 1632 iterate_devices_callout_fn fn, void *data) 1633 { 1634 struct dm_writecache *wc = ti->private; 1635 1636 return fn(ti, wc->dev, 0, ti->len, data); 1637 } 1638 1639 static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limits) 1640 { 1641 struct dm_writecache *wc = ti->private; 1642 1643 if (limits->logical_block_size < wc->block_size) 1644 limits->logical_block_size = wc->block_size; 1645 1646 if (limits->physical_block_size < wc->block_size) 1647 limits->physical_block_size = wc->block_size; 1648 1649 if (limits->io_min < wc->block_size) 1650 limits->io_min = wc->block_size; 1651 } 1652 1653 1654 static void writecache_writeback_endio(struct bio *bio) 1655 { 1656 struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio); 1657 struct dm_writecache *wc = wb->wc; 1658 unsigned long flags; 1659 1660 raw_spin_lock_irqsave(&wc->endio_list_lock, flags); 1661 if (unlikely(list_empty(&wc->endio_list))) 1662 wake_up_process(wc->endio_thread); 1663 list_add_tail(&wb->endio_entry, &wc->endio_list); 1664 raw_spin_unlock_irqrestore(&wc->endio_list_lock, flags); 1665 } 1666 1667 static void writecache_copy_endio(int read_err, unsigned long write_err, void *ptr) 1668 { 1669 struct copy_struct *c = ptr; 1670 struct dm_writecache *wc = c->wc; 1671 1672 c->error = likely(!(read_err | write_err)) ? 
0 : -EIO; 1673 1674 raw_spin_lock_irq(&wc->endio_list_lock); 1675 if (unlikely(list_empty(&wc->endio_list))) 1676 wake_up_process(wc->endio_thread); 1677 list_add_tail(&c->endio_entry, &wc->endio_list); 1678 raw_spin_unlock_irq(&wc->endio_list_lock); 1679 } 1680 1681 static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list) 1682 { 1683 unsigned int i; 1684 struct writeback_struct *wb; 1685 struct wc_entry *e; 1686 unsigned long n_walked = 0; 1687 1688 do { 1689 wb = list_entry(list->next, struct writeback_struct, endio_entry); 1690 list_del(&wb->endio_entry); 1691 1692 if (unlikely(wb->bio.bi_status != BLK_STS_OK)) 1693 writecache_error(wc, blk_status_to_errno(wb->bio.bi_status), 1694 "write error %d", wb->bio.bi_status); 1695 i = 0; 1696 do { 1697 e = wb->wc_list[i]; 1698 BUG_ON(!e->write_in_progress); 1699 e->write_in_progress = false; 1700 INIT_LIST_HEAD(&e->lru); 1701 if (!writecache_has_error(wc)) 1702 writecache_free_entry(wc, e); 1703 BUG_ON(!wc->writeback_size); 1704 wc->writeback_size--; 1705 n_walked++; 1706 if (unlikely(n_walked >= ENDIO_LATENCY)) { 1707 writecache_commit_flushed(wc, false); 1708 wc_unlock(wc); 1709 wc_lock(wc); 1710 n_walked = 0; 1711 } 1712 } while (++i < wb->wc_list_n); 1713 1714 if (wb->wc_list != wb->wc_list_inline) 1715 kfree(wb->wc_list); 1716 bio_put(&wb->bio); 1717 } while (!list_empty(list)); 1718 } 1719 1720 static void __writecache_endio_ssd(struct dm_writecache *wc, struct list_head *list) 1721 { 1722 struct copy_struct *c; 1723 struct wc_entry *e; 1724 1725 do { 1726 c = list_entry(list->next, struct copy_struct, endio_entry); 1727 list_del(&c->endio_entry); 1728 1729 if (unlikely(c->error)) 1730 writecache_error(wc, c->error, "copy error"); 1731 1732 e = c->e; 1733 do { 1734 BUG_ON(!e->write_in_progress); 1735 e->write_in_progress = false; 1736 INIT_LIST_HEAD(&e->lru); 1737 if (!writecache_has_error(wc)) 1738 writecache_free_entry(wc, e); 1739 1740 BUG_ON(!wc->writeback_size); 1741 wc->writeback_size--; 1742 e++; 1743 } while (--c->n_entries); 1744 mempool_free(c, &wc->copy_pool); 1745 } while (!list_empty(list)); 1746 } 1747 1748 static int writecache_endio_thread(void *data) 1749 { 1750 struct dm_writecache *wc = data; 1751 1752 while (1) { 1753 struct list_head list; 1754 1755 raw_spin_lock_irq(&wc->endio_list_lock); 1756 if (!list_empty(&wc->endio_list)) 1757 goto pop_from_list; 1758 set_current_state(TASK_INTERRUPTIBLE); 1759 raw_spin_unlock_irq(&wc->endio_list_lock); 1760 1761 if (unlikely(kthread_should_stop())) { 1762 set_current_state(TASK_RUNNING); 1763 break; 1764 } 1765 1766 schedule(); 1767 1768 continue; 1769 1770 pop_from_list: 1771 list = wc->endio_list; 1772 list.next->prev = list.prev->next = &list; 1773 INIT_LIST_HEAD(&wc->endio_list); 1774 raw_spin_unlock_irq(&wc->endio_list_lock); 1775 1776 if (!WC_MODE_FUA(wc)) 1777 writecache_disk_flush(wc, wc->dev); 1778 1779 wc_lock(wc); 1780 1781 if (WC_MODE_PMEM(wc)) { 1782 __writecache_endio_pmem(wc, &list); 1783 } else { 1784 __writecache_endio_ssd(wc, &list); 1785 writecache_wait_for_ios(wc, READ); 1786 } 1787 1788 writecache_commit_flushed(wc, false); 1789 1790 wc_unlock(wc); 1791 } 1792 1793 return 0; 1794 } 1795 1796 static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e) 1797 { 1798 struct dm_writecache *wc = wb->wc; 1799 unsigned int block_size = wc->block_size; 1800 void *address = memory_data(wc, e); 1801 1802 persistent_memory_flush_cache(address, block_size); 1803 1804 if (unlikely(bio_end_sector(&wb->bio) >= 
wc->data_device_sectors)) 1805 return true; 1806 1807 return bio_add_page(&wb->bio, persistent_memory_page(address), 1808 block_size, persistent_memory_page_offset(address)) != 0; 1809 } 1810 1811 struct writeback_list { 1812 struct list_head list; 1813 size_t size; 1814 }; 1815 1816 static void __writeback_throttle(struct dm_writecache *wc, struct writeback_list *wbl) 1817 { 1818 if (unlikely(wc->max_writeback_jobs)) { 1819 if (READ_ONCE(wc->writeback_size) - wbl->size >= wc->max_writeback_jobs) { 1820 wc_lock(wc); 1821 while (wc->writeback_size - wbl->size >= wc->max_writeback_jobs) 1822 writecache_wait_on_freelist(wc); 1823 wc_unlock(wc); 1824 } 1825 } 1826 cond_resched(); 1827 } 1828 1829 static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeback_list *wbl) 1830 { 1831 struct wc_entry *e, *f; 1832 struct bio *bio; 1833 struct writeback_struct *wb; 1834 unsigned int max_pages; 1835 1836 while (wbl->size) { 1837 wbl->size--; 1838 e = container_of(wbl->list.prev, struct wc_entry, lru); 1839 list_del(&e->lru); 1840 1841 max_pages = e->wc_list_contiguous; 1842 1843 bio = bio_alloc_bioset(wc->dev->bdev, max_pages, REQ_OP_WRITE, 1844 GFP_NOIO, &wc->bio_set); 1845 wb = container_of(bio, struct writeback_struct, bio); 1846 wb->wc = wc; 1847 bio->bi_end_io = writecache_writeback_endio; 1848 bio->bi_iter.bi_sector = read_original_sector(wc, e); 1849 1850 if (unlikely(max_pages > WB_LIST_INLINE)) 1851 wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *), 1852 GFP_NOIO | __GFP_NORETRY | 1853 __GFP_NOMEMALLOC | __GFP_NOWARN); 1854 1855 if (likely(max_pages <= WB_LIST_INLINE) || unlikely(!wb->wc_list)) { 1856 wb->wc_list = wb->wc_list_inline; 1857 max_pages = WB_LIST_INLINE; 1858 } 1859 1860 BUG_ON(!wc_add_block(wb, e)); 1861 1862 wb->wc_list[0] = e; 1863 wb->wc_list_n = 1; 1864 1865 while (wbl->size && wb->wc_list_n < max_pages) { 1866 f = container_of(wbl->list.prev, struct wc_entry, lru); 1867 if (read_original_sector(wc, f) != 1868 read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT)) 1869 break; 1870 if (!wc_add_block(wb, f)) 1871 break; 1872 wbl->size--; 1873 list_del(&f->lru); 1874 wb->wc_list[wb->wc_list_n++] = f; 1875 e = f; 1876 } 1877 if (WC_MODE_FUA(wc)) 1878 bio->bi_opf |= REQ_FUA; 1879 if (writecache_has_error(wc)) { 1880 bio->bi_status = BLK_STS_IOERR; 1881 bio_endio(bio); 1882 } else if (unlikely(!bio_sectors(bio))) { 1883 bio->bi_status = BLK_STS_OK; 1884 bio_endio(bio); 1885 } else { 1886 submit_bio(bio); 1887 } 1888 1889 __writeback_throttle(wc, wbl); 1890 } 1891 } 1892 1893 static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writeback_list *wbl) 1894 { 1895 struct wc_entry *e, *f; 1896 struct dm_io_region from, to; 1897 struct copy_struct *c; 1898 1899 while (wbl->size) { 1900 unsigned int n_sectors; 1901 1902 wbl->size--; 1903 e = container_of(wbl->list.prev, struct wc_entry, lru); 1904 list_del(&e->lru); 1905 1906 n_sectors = e->wc_list_contiguous << (wc->block_size_bits - SECTOR_SHIFT); 1907 1908 from.bdev = wc->ssd_dev->bdev; 1909 from.sector = cache_sector(wc, e); 1910 from.count = n_sectors; 1911 to.bdev = wc->dev->bdev; 1912 to.sector = read_original_sector(wc, e); 1913 to.count = n_sectors; 1914 1915 c = mempool_alloc(&wc->copy_pool, GFP_NOIO); 1916 c->wc = wc; 1917 c->e = e; 1918 c->n_entries = e->wc_list_contiguous; 1919 1920 while ((n_sectors -= wc->block_size >> SECTOR_SHIFT)) { 1921 wbl->size--; 1922 f = container_of(wbl->list.prev, struct wc_entry, lru); 1923 BUG_ON(f != e + 1); 1924 list_del(&f->lru); 
1925 e = f; 1926 } 1927 1928 if (unlikely(to.sector + to.count > wc->data_device_sectors)) { 1929 if (to.sector >= wc->data_device_sectors) { 1930 writecache_copy_endio(0, 0, c); 1931 continue; 1932 } 1933 from.count = to.count = wc->data_device_sectors - to.sector; 1934 } 1935 1936 dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c); 1937 1938 __writeback_throttle(wc, wbl); 1939 } 1940 } 1941 1942 static void writecache_writeback(struct work_struct *work) 1943 { 1944 struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work); 1945 struct blk_plug plug; 1946 struct wc_entry *f, *g, *e = NULL; 1947 struct rb_node *node, *next_node; 1948 struct list_head skipped; 1949 struct writeback_list wbl; 1950 unsigned long n_walked; 1951 1952 if (!WC_MODE_PMEM(wc)) { 1953 /* Wait for any active kcopyd work on behalf of ssd writeback */ 1954 dm_kcopyd_client_flush(wc->dm_kcopyd); 1955 } 1956 1957 if (likely(wc->pause != 0)) { 1958 while (1) { 1959 unsigned long idle; 1960 1961 if (unlikely(wc->cleaner) || unlikely(wc->writeback_all) || 1962 unlikely(dm_suspended(wc->ti))) 1963 break; 1964 idle = dm_iot_idle_time(&wc->iot); 1965 if (idle >= wc->pause) 1966 break; 1967 idle = wc->pause - idle; 1968 if (idle > HZ) 1969 idle = HZ; 1970 schedule_timeout_idle(idle); 1971 } 1972 } 1973 1974 wc_lock(wc); 1975 restart: 1976 if (writecache_has_error(wc)) { 1977 wc_unlock(wc); 1978 return; 1979 } 1980 1981 if (unlikely(wc->writeback_all)) { 1982 if (writecache_wait_for_writeback(wc)) 1983 goto restart; 1984 } 1985 1986 if (wc->overwrote_committed) 1987 writecache_wait_for_ios(wc, WRITE); 1988 1989 n_walked = 0; 1990 INIT_LIST_HEAD(&skipped); 1991 INIT_LIST_HEAD(&wbl.list); 1992 wbl.size = 0; 1993 while (!list_empty(&wc->lru) && 1994 (wc->writeback_all || 1995 wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark || 1996 (jiffies - container_of(wc->lru.prev, struct wc_entry, lru)->age >= 1997 wc->max_age - wc->max_age / MAX_AGE_DIV))) { 1998 1999 n_walked++; 2000 if (unlikely(n_walked > WRITEBACK_LATENCY) && 2001 likely(!wc->writeback_all)) { 2002 if (likely(!dm_suspended(wc->ti))) 2003 queue_work(wc->writeback_wq, &wc->writeback_work); 2004 break; 2005 } 2006 2007 if (unlikely(wc->writeback_all)) { 2008 if (unlikely(!e)) { 2009 writecache_flush(wc); 2010 e = container_of(rb_first(&wc->tree), struct wc_entry, rb_node); 2011 } else 2012 e = g; 2013 } else 2014 e = container_of(wc->lru.prev, struct wc_entry, lru); 2015 BUG_ON(e->write_in_progress); 2016 if (unlikely(!writecache_entry_is_committed(wc, e))) 2017 writecache_flush(wc); 2018 2019 node = rb_prev(&e->rb_node); 2020 if (node) { 2021 f = container_of(node, struct wc_entry, rb_node); 2022 if (unlikely(read_original_sector(wc, f) == 2023 read_original_sector(wc, e))) { 2024 BUG_ON(!f->write_in_progress); 2025 list_move(&e->lru, &skipped); 2026 cond_resched(); 2027 continue; 2028 } 2029 } 2030 wc->writeback_size++; 2031 list_move(&e->lru, &wbl.list); 2032 wbl.size++; 2033 e->write_in_progress = true; 2034 e->wc_list_contiguous = 1; 2035 2036 f = e; 2037 2038 while (1) { 2039 next_node = rb_next(&f->rb_node); 2040 if (unlikely(!next_node)) 2041 break; 2042 g = container_of(next_node, struct wc_entry, rb_node); 2043 if (unlikely(read_original_sector(wc, g) == 2044 read_original_sector(wc, f))) { 2045 f = g; 2046 continue; 2047 } 2048 if (read_original_sector(wc, g) != 2049 read_original_sector(wc, f) + (wc->block_size >> SECTOR_SHIFT)) 2050 break; 2051 if (unlikely(g->write_in_progress)) 2052 break; 
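			/* only idle, committed entries may join a writeback batch */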
			if (unlikely(!writecache_entry_is_committed(wc, g)))
				break;

			if (!WC_MODE_PMEM(wc)) {
				if (g != f + 1)
					break;
			}

			n_walked++;
			//if (unlikely(n_walked > WRITEBACK_LATENCY) && likely(!wc->writeback_all))
			//	break;

			wc->writeback_size++;
			list_move(&g->lru, &wbl.list);
			wbl.size++;
			g->write_in_progress = true;
			g->wc_list_contiguous = BIO_MAX_VECS;
			f = g;
			e->wc_list_contiguous++;
			if (unlikely(e->wc_list_contiguous == BIO_MAX_VECS)) {
				if (unlikely(wc->writeback_all)) {
					next_node = rb_next(&f->rb_node);
					if (likely(next_node))
						g = container_of(next_node, struct wc_entry, rb_node);
				}
				break;
			}
		}
		cond_resched();
	}

	if (!list_empty(&skipped)) {
		list_splice_tail(&skipped, &wc->lru);
		/*
		 * If we didn't do any progress, we must wait until some
		 * writeback finishes to avoid burning CPU in a loop
		 */
		if (unlikely(!wbl.size))
			writecache_wait_for_writeback(wc);
	}

	wc_unlock(wc);

	blk_start_plug(&plug);

	if (WC_MODE_PMEM(wc))
		__writecache_writeback_pmem(wc, &wbl);
	else
		__writecache_writeback_ssd(wc, &wbl);

	blk_finish_plug(&plug);

	if (unlikely(wc->writeback_all)) {
		wc_lock(wc);
		while (writecache_wait_for_writeback(wc))
			;
		wc_unlock(wc);
	}
}

static int calculate_memory_size(uint64_t device_size, unsigned int block_size,
				 size_t *n_blocks_p, size_t *n_metadata_blocks_p)
{
	uint64_t n_blocks, offset;
	struct wc_entry e;

	n_blocks = device_size;
	do_div(n_blocks, block_size + sizeof(struct wc_memory_entry));

	while (1) {
		if (!n_blocks)
			return -ENOSPC;
		/* Verify the following entries[n_blocks] won't overflow */
		if (n_blocks >= ((size_t)-sizeof(struct wc_memory_superblock) /
				 sizeof(struct wc_memory_entry)))
			return -EFBIG;
		offset = offsetof(struct wc_memory_superblock, entries[n_blocks]);
		offset = (offset + block_size - 1) & ~(uint64_t)(block_size - 1);
		if (offset + n_blocks * block_size <= device_size)
			break;
		n_blocks--;
	}

	/* check if the bit field overflows */
	e.index = n_blocks;
	if (e.index != n_blocks)
		return -EFBIG;

	if (n_blocks_p)
		*n_blocks_p = n_blocks;
	if (n_metadata_blocks_p)
		*n_metadata_blocks_p = offset >> __ffs(block_size);
	return 0;
}

static int init_memory(struct dm_writecache *wc)
{
	size_t b;
	int r;

	r = calculate_memory_size(wc->memory_map_size, wc->block_size, &wc->n_blocks, NULL);
	if (r)
		return r;

	r = writecache_alloc_entries(wc);
	if (r)
		return r;

	for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++)
		pmem_assign(sb(wc)->padding[b], cpu_to_le64(0));
	pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION));
	pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size));
	pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks));
	pmem_assign(sb(wc)->seq_count, cpu_to_le64(0));

	for (b = 0; b < wc->n_blocks; b++) {
		write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
		cond_resched();
	}

	writecache_flush_all_metadata(wc);
	writecache_commit_flushed(wc, false);
	pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
	writecache_flush_region(wc, &sb(wc)->magic, sizeof(sb(wc)->magic));
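	/*
	 * The magic was assigned and flushed only after all other metadata was
	 * committed above; the final commit below makes it durable, so a
	 * superblock with a valid magic is known to be fully initialized.
	 */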
	writecache_commit_flushed(wc, false);

	return 0;
}

static void writecache_dtr(struct dm_target *ti)
{
	struct dm_writecache *wc = ti->private;

	if (!wc)
		return;

	if (wc->endio_thread)
		kthread_stop(wc->endio_thread);

	if (wc->flush_thread)
		kthread_stop(wc->flush_thread);

	bioset_exit(&wc->bio_set);

	mempool_exit(&wc->copy_pool);

	if (wc->writeback_wq)
		destroy_workqueue(wc->writeback_wq);

	if (wc->dev)
		dm_put_device(ti, wc->dev);

	if (wc->ssd_dev)
		dm_put_device(ti, wc->ssd_dev);

	vfree(wc->entries);

	if (wc->memory_map) {
		if (WC_MODE_PMEM(wc))
			persistent_memory_release(wc);
		else
			vfree(wc->memory_map);
	}

	if (wc->dm_kcopyd)
		dm_kcopyd_client_destroy(wc->dm_kcopyd);

	if (wc->dm_io)
		dm_io_client_destroy(wc->dm_io);

	vfree(wc->dirty_bitmap);

	kfree(wc);
}

static int writecache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct dm_writecache *wc;
	struct dm_arg_set as;
	const char *string;
	unsigned int opt_params;
	size_t offset, data_size;
	int i, r;
	char dummy;
	int high_wm_percent = HIGH_WATERMARK;
	int low_wm_percent = LOW_WATERMARK;
	uint64_t x;
	struct wc_memory_superblock s;

	static struct dm_arg _args[] = {
		{0, 18, "Invalid number of feature args"},
	};

	as.argc = argc;
	as.argv = argv;

	wc = kzalloc(sizeof(struct dm_writecache), GFP_KERNEL);
	if (!wc) {
		ti->error = "Cannot allocate writecache structure";
		r = -ENOMEM;
		goto bad;
	}
	ti->private = wc;
	wc->ti = ti;

	mutex_init(&wc->lock);
	wc->max_age = MAX_AGE_UNSPECIFIED;
	writecache_poison_lists(wc);
	init_waitqueue_head(&wc->freelist_wait);
	timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0);
	timer_setup(&wc->max_age_timer, writecache_max_age_timer, 0);

	for (i = 0; i < 2; i++) {
		atomic_set(&wc->bio_in_progress[i], 0);
		init_waitqueue_head(&wc->bio_in_progress_wait[i]);
	}

	wc->dm_io = dm_io_client_create();
	if (IS_ERR(wc->dm_io)) {
		r = PTR_ERR(wc->dm_io);
		ti->error = "Unable to allocate dm-io client";
		wc->dm_io = NULL;
		goto bad;
	}

	wc->writeback_wq = alloc_workqueue("writecache-writeback", WQ_MEM_RECLAIM, 1);
	if (!wc->writeback_wq) {
		r = -ENOMEM;
		ti->error = "Could not allocate writeback workqueue";
		goto bad;
	}
	INIT_WORK(&wc->writeback_work, writecache_writeback);
	INIT_WORK(&wc->flush_work, writecache_flush_work);

	dm_iot_init(&wc->iot);

	raw_spin_lock_init(&wc->endio_list_lock);
	INIT_LIST_HEAD(&wc->endio_list);
	wc->endio_thread = kthread_run(writecache_endio_thread, wc, "writecache_endio");
	if (IS_ERR(wc->endio_thread)) {
		r = PTR_ERR(wc->endio_thread);
		wc->endio_thread = NULL;
		ti->error = "Couldn't spawn endio thread";
		goto bad;
	}

	/*
	 * Parse the mode (pmem or ssd)
	 */
	string = dm_shift_arg(&as);
	if (!string)
		goto bad_arguments;

	if (!strcasecmp(string, "s")) {
		wc->pmem_mode = false;
	} else if (!strcasecmp(string, "p")) {
#ifdef DM_WRITECACHE_HAS_PMEM
		wc->pmem_mode = true;
		wc->writeback_fua = true;
#else
		/*
		 * If the architecture doesn't support persistent memory or
		 * the kernel doesn't support any DAX drivers, this driver can
		 * only be used in SSD-only mode.
		 */
		r = -EOPNOTSUPP;
		ti->error = "Persistent memory or DAX not supported on this system";
		goto bad;
#endif
	} else {
		goto bad_arguments;
	}

	if (WC_MODE_PMEM(wc)) {
		r = bioset_init(&wc->bio_set, BIO_POOL_SIZE,
				offsetof(struct writeback_struct, bio),
				BIOSET_NEED_BVECS);
		if (r) {
			ti->error = "Could not allocate bio set";
			goto bad;
		}
	} else {
		wc->pause = PAUSE_WRITEBACK;
		r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct));
		if (r) {
			ti->error = "Could not allocate mempool";
			goto bad;
		}
	}

	/*
	 * Parse the origin data device
	 */
	string = dm_shift_arg(&as);
	if (!string)
		goto bad_arguments;
	r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->dev);
	if (r) {
		ti->error = "Origin data device lookup failed";
		goto bad;
	}

	/*
	 * Parse cache data device (be it pmem or ssd)
	 */
	string = dm_shift_arg(&as);
	if (!string)
		goto bad_arguments;

	r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->ssd_dev);
	if (r) {
		ti->error = "Cache data device lookup failed";
		goto bad;
	}
	wc->memory_map_size = bdev_nr_bytes(wc->ssd_dev->bdev);

	/*
	 * Parse the cache block size
	 */
	string = dm_shift_arg(&as);
	if (!string)
		goto bad_arguments;
	if (sscanf(string, "%u%c", &wc->block_size, &dummy) != 1 ||
	    wc->block_size < 512 || wc->block_size > PAGE_SIZE ||
	    (wc->block_size & (wc->block_size - 1))) {
		r = -EINVAL;
		ti->error = "Invalid block size";
		goto bad;
	}
	if (wc->block_size < bdev_logical_block_size(wc->dev->bdev) ||
	    wc->block_size < bdev_logical_block_size(wc->ssd_dev->bdev)) {
		r = -EINVAL;
		ti->error = "Block size is smaller than device logical block size";
		goto bad;
	}
	wc->block_size_bits = __ffs(wc->block_size);

	wc->max_writeback_jobs = MAX_WRITEBACK_JOBS;
	wc->autocommit_blocks = !WC_MODE_PMEM(wc) ? AUTOCOMMIT_BLOCKS_SSD : AUTOCOMMIT_BLOCKS_PMEM;
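	/* Default commit interval; may be overridden by the "autocommit_time" option below. */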
	wc->autocommit_jiffies = msecs_to_jiffies(AUTOCOMMIT_MSEC);

	/*
	 * Parse optional arguments
	 */
	r = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
	if (r)
		goto bad;

	while (opt_params) {
		string = dm_shift_arg(&as), opt_params--;
		if (!strcasecmp(string, "start_sector") && opt_params >= 1) {
			unsigned long long start_sector;

			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1)
				goto invalid_optional;
			wc->start_sector = start_sector;
			wc->start_sector_set = true;
			if (wc->start_sector != start_sector ||
			    wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT)
				goto invalid_optional;
		} else if (!strcasecmp(string, "high_watermark") && opt_params >= 1) {
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1)
				goto invalid_optional;
			if (high_wm_percent < 0 || high_wm_percent > 100)
				goto invalid_optional;
			wc->high_wm_percent_value = high_wm_percent;
			wc->high_wm_percent_set = true;
		} else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) {
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%d%c", &low_wm_percent, &dummy) != 1)
				goto invalid_optional;
			if (low_wm_percent < 0 || low_wm_percent > 100)
				goto invalid_optional;
			wc->low_wm_percent_value = low_wm_percent;
			wc->low_wm_percent_set = true;
		} else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) {
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &wc->max_writeback_jobs, &dummy) != 1)
				goto invalid_optional;
			wc->max_writeback_jobs_set = true;
		} else if (!strcasecmp(string, "autocommit_blocks") && opt_params >= 1) {
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &wc->autocommit_blocks, &dummy) != 1)
				goto invalid_optional;
			wc->autocommit_blocks_set = true;
		} else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) {
			unsigned int autocommit_msecs;

			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1)
				goto invalid_optional;
			if (autocommit_msecs > 3600000)
				goto invalid_optional;
			wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
			wc->autocommit_time_value = autocommit_msecs;
			wc->autocommit_time_set = true;
		} else if (!strcasecmp(string, "max_age") && opt_params >= 1) {
			unsigned int max_age_msecs;

			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &max_age_msecs, &dummy) != 1)
				goto invalid_optional;
			if (max_age_msecs > 86400000)
				goto invalid_optional;
			wc->max_age = msecs_to_jiffies(max_age_msecs);
			wc->max_age_set = true;
			wc->max_age_value = max_age_msecs;
		} else if (!strcasecmp(string, "cleaner")) {
			wc->cleaner_set = true;
			wc->cleaner = true;
		} else if (!strcasecmp(string, "fua")) {
			if (WC_MODE_PMEM(wc)) {
				wc->writeback_fua = true;
				wc->writeback_fua_set = true;
			} else
				goto invalid_optional;
		} else if (!strcasecmp(string, "nofua")) {
			if (WC_MODE_PMEM(wc)) {
				wc->writeback_fua = false;
				wc->writeback_fua_set = true;
			} else
				goto invalid_optional;
		} else if (!strcasecmp(string, "metadata_only")) {
			wc->metadata_only = true;
		} else if (!strcasecmp(string, "pause_writeback") && opt_params >= 1) {
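			/*
			 * "pause_writeback" postpones background writeback
			 * until the device has been idle for the given time;
			 * it is only valid in SSD mode (see the WC_MODE_PMEM
			 * check below).
			 */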
			unsigned int pause_msecs;

			if (WC_MODE_PMEM(wc))
				goto invalid_optional;
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &pause_msecs, &dummy) != 1)
				goto invalid_optional;
			if (pause_msecs > 60000)
				goto invalid_optional;
			wc->pause = msecs_to_jiffies(pause_msecs);
			wc->pause_set = true;
			wc->pause_value = pause_msecs;
		} else {
invalid_optional:
			r = -EINVAL;
			ti->error = "Invalid optional argument";
			goto bad;
		}
	}

	if (high_wm_percent < low_wm_percent) {
		r = -EINVAL;
		ti->error = "High watermark must be greater than or equal to low watermark";
		goto bad;
	}

	if (WC_MODE_PMEM(wc)) {
		if (!dax_synchronous(wc->ssd_dev->dax_dev)) {
			r = -EOPNOTSUPP;
			ti->error = "Asynchronous persistent memory not supported as pmem cache";
			goto bad;
		}

		r = persistent_memory_claim(wc);
		if (r) {
			ti->error = "Unable to map persistent memory for cache";
			goto bad;
		}
	} else {
		size_t n_blocks, n_metadata_blocks;
		uint64_t n_bitmap_bits;

		wc->memory_map_size -= (uint64_t)wc->start_sector << SECTOR_SHIFT;

		bio_list_init(&wc->flush_list);
		wc->flush_thread = kthread_run(writecache_flush_thread, wc, "dm_writecache_flush");
		if (IS_ERR(wc->flush_thread)) {
			r = PTR_ERR(wc->flush_thread);
			wc->flush_thread = NULL;
			ti->error = "Couldn't spawn flush thread";
			goto bad;
		}

		r = calculate_memory_size(wc->memory_map_size, wc->block_size,
					  &n_blocks, &n_metadata_blocks);
		if (r) {
			ti->error = "Invalid device size";
			goto bad;
		}

		n_bitmap_bits = (((uint64_t)n_metadata_blocks << wc->block_size_bits) +
				 BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY;
		/* this is limitation of test_bit functions */
		if (n_bitmap_bits > 1U << 31) {
			r = -EFBIG;
			ti->error = "Invalid device size";
			goto bad;
		}

		wc->memory_map = vmalloc(n_metadata_blocks << wc->block_size_bits);
		if (!wc->memory_map) {
			r = -ENOMEM;
			ti->error = "Unable to allocate memory for metadata";
			goto bad;
		}

		wc->dm_kcopyd = dm_kcopyd_client_create(&dm_kcopyd_throttle);
		if (IS_ERR(wc->dm_kcopyd)) {
			r = PTR_ERR(wc->dm_kcopyd);
			ti->error = "Unable to allocate dm-kcopyd client";
			wc->dm_kcopyd = NULL;
			goto bad;
		}

		wc->metadata_sectors = n_metadata_blocks << (wc->block_size_bits - SECTOR_SHIFT);
		wc->dirty_bitmap_size = (n_bitmap_bits + BITS_PER_LONG - 1) /
					BITS_PER_LONG * sizeof(unsigned long);
		wc->dirty_bitmap = vzalloc(wc->dirty_bitmap_size);
		if (!wc->dirty_bitmap) {
			r = -ENOMEM;
			ti->error = "Unable to allocate dirty bitmap";
			goto bad;
		}

		r = writecache_read_metadata(wc, wc->block_size >> SECTOR_SHIFT);
		if (r) {
			ti->error = "Unable to read first block of metadata";
			goto bad;
		}
	}

	r = copy_mc_to_kernel(&s, sb(wc), sizeof(struct wc_memory_superblock));
	if (r) {
		ti->error = "Hardware memory error when reading superblock";
		goto bad;
	}
	if (!le32_to_cpu(s.magic) && !le32_to_cpu(s.version)) {
		r = init_memory(wc);
		if (r) {
			ti->error = "Unable to initialize device";
			goto bad;
		}
		r = copy_mc_to_kernel(&s, sb(wc),
				      sizeof(struct wc_memory_superblock));
		if (r) {
			ti->error = "Hardware memory error when reading superblock";
			goto bad;
		}
	}

	if (le32_to_cpu(s.magic) != MEMORY_SUPERBLOCK_MAGIC) {
		ti->error = "Invalid magic in the superblock";
		r = -EINVAL;
		goto bad;
	}

	if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) {
		ti->error = "Invalid version in the superblock";
		r = -EINVAL;
		goto bad;
	}

	if (le32_to_cpu(s.block_size) != wc->block_size) {
		ti->error = "Block size does not match superblock";
		r = -EINVAL;
		goto bad;
	}

	wc->n_blocks = le64_to_cpu(s.n_blocks);

	offset = wc->n_blocks * sizeof(struct wc_memory_entry);
	if (offset / sizeof(struct wc_memory_entry) != le64_to_cpu(sb(wc)->n_blocks)) {
overflow:
		ti->error = "Overflow in size calculation";
		r = -EINVAL;
		goto bad;
	}
	offset += sizeof(struct wc_memory_superblock);
	if (offset < sizeof(struct wc_memory_superblock))
		goto overflow;
	offset = (offset + wc->block_size - 1) & ~(size_t)(wc->block_size - 1);
	data_size = wc->n_blocks * (size_t)wc->block_size;
	if (!offset || (data_size / wc->block_size != wc->n_blocks) ||
	    (offset + data_size < offset))
		goto overflow;
	if (offset + data_size > wc->memory_map_size) {
		ti->error = "Memory area is too small";
		r = -EINVAL;
		goto bad;
	}

	wc->metadata_sectors = offset >> SECTOR_SHIFT;
	wc->block_start = (char *)sb(wc) + offset;

	x = (uint64_t)wc->n_blocks * (100 - high_wm_percent);
	x += 50;
	do_div(x, 100);
	wc->freelist_high_watermark = x;
	x = (uint64_t)wc->n_blocks * (100 - low_wm_percent);
	x += 50;
	do_div(x, 100);
	wc->freelist_low_watermark = x;

	if (wc->cleaner)
		activate_cleaner(wc);

	r = writecache_alloc_entries(wc);
	if (r) {
		ti->error = "Cannot allocate memory";
		goto bad;
	}

	ti->num_flush_bios = WC_MODE_PMEM(wc) ? 1 : 2;
	ti->flush_supported = true;
	ti->num_discard_bios = 1;

	if (WC_MODE_PMEM(wc))
		persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);

	return 0;

bad_arguments:
	r = -EINVAL;
	ti->error = "Bad arguments";
bad:
	writecache_dtr(ti);
	return r;
}

static void writecache_status(struct dm_target *ti, status_type_t type,
			      unsigned int status_flags, char *result, unsigned int maxlen)
{
	struct dm_writecache *wc = ti->private;
	unsigned int extra_args;
	unsigned int sz = 0;

	switch (type) {
	case STATUSTYPE_INFO:
		DMEMIT("%ld %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
		       writecache_has_error(wc),
		       (unsigned long long)wc->n_blocks, (unsigned long long)wc->freelist_size,
		       (unsigned long long)wc->writeback_size,
		       wc->stats.reads,
		       wc->stats.read_hits,
		       wc->stats.writes,
		       wc->stats.write_hits_uncommitted,
		       wc->stats.write_hits_committed,
		       wc->stats.writes_around,
		       wc->stats.writes_allocate,
		       wc->stats.writes_blocked_on_freelist,
		       wc->stats.flushes,
		       wc->stats.discards);
		break;
	case STATUSTYPE_TABLE:
		DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
		       wc->dev->name, wc->ssd_dev->name, wc->block_size);
		extra_args = 0;
		if (wc->start_sector_set)
			extra_args += 2;
		if (wc->high_wm_percent_set)
			extra_args += 2;
		if (wc->low_wm_percent_set)
			extra_args += 2;
		if (wc->max_writeback_jobs_set)
			extra_args += 2;
		if (wc->autocommit_blocks_set)
			extra_args += 2;
		if (wc->autocommit_time_set)
			extra_args += 2;
		if (wc->max_age_set)
			extra_args += 2;
		if (wc->cleaner_set)
			extra_args++;
		if (wc->writeback_fua_set)
			extra_args++;
		if (wc->metadata_only)
			extra_args++;
		if (wc->pause_set)
			extra_args += 2;

		DMEMIT("%u", extra_args);
		if (wc->start_sector_set)
			DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector);
		if (wc->high_wm_percent_set)
			DMEMIT(" high_watermark %u", wc->high_wm_percent_value);
		if (wc->low_wm_percent_set)
			DMEMIT(" low_watermark %u", wc->low_wm_percent_value);
		if (wc->max_writeback_jobs_set)
			DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs);
		if (wc->autocommit_blocks_set)
			DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
		if (wc->autocommit_time_set)
			DMEMIT(" autocommit_time %u", wc->autocommit_time_value);
		if (wc->max_age_set)
			DMEMIT(" max_age %u", wc->max_age_value);
		if (wc->cleaner_set)
			DMEMIT(" cleaner");
		if (wc->writeback_fua_set)
			DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
		if (wc->metadata_only)
			DMEMIT(" metadata_only");
		if (wc->pause_set)
			DMEMIT(" pause_writeback %u", wc->pause_value);
		break;
	case STATUSTYPE_IMA:
		*result = '\0';
		break;
	}
}

static struct target_type writecache_target = {
	.name			= "writecache",
	.version		= {1, 6, 0},
	.module			= THIS_MODULE,
	.ctr			= writecache_ctr,
	.dtr			= writecache_dtr,
	.status			= writecache_status,
	.postsuspend		= writecache_suspend,
	.resume			= writecache_resume,
	.message		= writecache_message,
	.map			= writecache_map,
	.end_io			= writecache_end_io,
	.iterate_devices	= writecache_iterate_devices,
	.io_hints		= writecache_io_hints,
};
module_dm(writecache);

MODULE_DESCRIPTION(DM_NAME " writecache target");
MODULE_AUTHOR("Mikulas Patocka <dm-devel@lists.linux.dev>");
MODULE_LICENSE("GPL");
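/*
 * Illustrative constructor usage (device paths and option values are
 * placeholders, not part of this file): an SSD-mode table line as parsed by
 * writecache_ctr() above, with a 4096-byte cache block size and two optional
 * arguments (4 feature-argument words in total):
 *
 *   echo "0 <origin sectors> writecache s /dev/origin /dev/fast 4096 \
 *         4 high_watermark 60 writeback_jobs 1024" | dmsetup create wc
 */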