1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2018 Red Hat. All rights reserved. 4 * 5 * This file is released under the GPL. 6 */ 7 8 #include <linux/device-mapper.h> 9 #include <linux/module.h> 10 #include <linux/init.h> 11 #include <linux/vmalloc.h> 12 #include <linux/kthread.h> 13 #include <linux/dm-io.h> 14 #include <linux/dm-kcopyd.h> 15 #include <linux/dax.h> 16 #include <linux/libnvdimm.h> 17 #include <linux/delay.h> 18 #include "dm-io-tracker.h" 19 20 #define DM_MSG_PREFIX "writecache" 21 22 #define HIGH_WATERMARK 50 23 #define LOW_WATERMARK 45 24 #define MAX_WRITEBACK_JOBS min(0x10000000 / PAGE_SIZE, totalram_pages() / 16) 25 #define ENDIO_LATENCY 16 26 #define WRITEBACK_LATENCY 64 27 #define AUTOCOMMIT_BLOCKS_SSD 65536 28 #define AUTOCOMMIT_BLOCKS_PMEM 64 29 #define AUTOCOMMIT_MSEC 1000 30 #define MAX_AGE_DIV 16 31 #define MAX_AGE_UNSPECIFIED -1UL 32 #define PAUSE_WRITEBACK (HZ * 3) 33 34 #define BITMAP_GRANULARITY 65536 35 #if BITMAP_GRANULARITY < PAGE_SIZE 36 #undef BITMAP_GRANULARITY 37 #define BITMAP_GRANULARITY PAGE_SIZE 38 #endif 39 40 #if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_FS_DAX) 41 #define DM_WRITECACHE_HAS_PMEM 42 #endif 43 44 #ifdef DM_WRITECACHE_HAS_PMEM 45 #define pmem_assign(dest, src) \ 46 do { \ 47 typeof(dest) uniq = (src); \ 48 memcpy_flushcache(&(dest), &uniq, sizeof(dest)); \ 49 } while (0) 50 #else 51 #define pmem_assign(dest, src) ((dest) = (src)) 52 #endif 53 54 #if IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC) && defined(DM_WRITECACHE_HAS_PMEM) 55 #define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 56 #endif 57 58 #define MEMORY_SUPERBLOCK_MAGIC 0x23489321 59 #define MEMORY_SUPERBLOCK_VERSION 1 60 61 struct wc_memory_entry { 62 __le64 original_sector; 63 __le64 seq_count; 64 }; 65 66 struct wc_memory_superblock { 67 union { 68 struct { 69 __le32 magic; 70 __le32 version; 71 __le32 block_size; 72 __le32 pad; 73 __le64 n_blocks; 74 __le64 seq_count; 75 }; 76 __le64 padding[8]; 77 }; 78 struct 
wc_memory_entry entries[]; 79 }; 80 81 struct wc_entry { 82 struct rb_node rb_node; 83 struct list_head lru; 84 unsigned short wc_list_contiguous; 85 #if BITS_PER_LONG == 64 86 bool write_in_progress : 1; 87 unsigned long index : 47; 88 #else 89 bool write_in_progress; 90 unsigned long index; 91 #endif 92 unsigned long age; 93 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 94 uint64_t original_sector; 95 uint64_t seq_count; 96 #endif 97 }; 98 99 #ifdef DM_WRITECACHE_HAS_PMEM 100 #define WC_MODE_PMEM(wc) ((wc)->pmem_mode) 101 #define WC_MODE_FUA(wc) ((wc)->writeback_fua) 102 #else 103 #define WC_MODE_PMEM(wc) false 104 #define WC_MODE_FUA(wc) false 105 #endif 106 #define WC_MODE_SORT_FREELIST(wc) (!WC_MODE_PMEM(wc)) 107 108 struct dm_writecache { 109 struct mutex lock; 110 struct list_head lru; 111 union { 112 struct list_head freelist; 113 struct { 114 struct rb_root freetree; 115 struct wc_entry *current_free; 116 }; 117 }; 118 struct rb_root tree; 119 120 size_t freelist_size; 121 size_t writeback_size; 122 size_t freelist_high_watermark; 123 size_t freelist_low_watermark; 124 unsigned long max_age; 125 unsigned long pause; 126 127 unsigned int uncommitted_blocks; 128 unsigned int autocommit_blocks; 129 unsigned int max_writeback_jobs; 130 131 int error; 132 133 unsigned long autocommit_jiffies; 134 struct timer_list autocommit_timer; 135 struct wait_queue_head freelist_wait; 136 137 struct timer_list max_age_timer; 138 139 atomic_t bio_in_progress[2]; 140 struct wait_queue_head bio_in_progress_wait[2]; 141 142 struct dm_target *ti; 143 struct dm_dev *dev; 144 struct dm_dev *ssd_dev; 145 sector_t start_sector; 146 void *memory_map; 147 uint64_t memory_map_size; 148 size_t metadata_sectors; 149 size_t n_blocks; 150 uint64_t seq_count; 151 sector_t data_device_sectors; 152 void *block_start; 153 struct wc_entry *entries; 154 unsigned int block_size; 155 unsigned char block_size_bits; 156 157 bool pmem_mode:1; 158 bool writeback_fua:1; 159 160 bool 
overwrote_committed:1; 161 bool memory_vmapped:1; 162 163 bool start_sector_set:1; 164 bool high_wm_percent_set:1; 165 bool low_wm_percent_set:1; 166 bool max_writeback_jobs_set:1; 167 bool autocommit_blocks_set:1; 168 bool autocommit_time_set:1; 169 bool max_age_set:1; 170 bool writeback_fua_set:1; 171 bool flush_on_suspend:1; 172 bool cleaner:1; 173 bool cleaner_set:1; 174 bool metadata_only:1; 175 bool pause_set:1; 176 177 unsigned int high_wm_percent_value; 178 unsigned int low_wm_percent_value; 179 unsigned int autocommit_time_value; 180 unsigned int max_age_value; 181 unsigned int pause_value; 182 183 unsigned int writeback_all; 184 struct workqueue_struct *writeback_wq; 185 struct work_struct writeback_work; 186 struct work_struct flush_work; 187 188 struct dm_io_tracker iot; 189 190 struct dm_io_client *dm_io; 191 192 raw_spinlock_t endio_list_lock; 193 struct list_head endio_list; 194 struct task_struct *endio_thread; 195 196 struct task_struct *flush_thread; 197 struct bio_list flush_list; 198 199 struct dm_kcopyd_client *dm_kcopyd; 200 unsigned long *dirty_bitmap; 201 unsigned int dirty_bitmap_size; 202 203 struct bio_set bio_set; 204 mempool_t copy_pool; 205 206 struct { 207 unsigned long long reads; 208 unsigned long long read_hits; 209 unsigned long long writes; 210 unsigned long long write_hits_uncommitted; 211 unsigned long long write_hits_committed; 212 unsigned long long writes_around; 213 unsigned long long writes_allocate; 214 unsigned long long writes_blocked_on_freelist; 215 unsigned long long flushes; 216 unsigned long long discards; 217 } stats; 218 }; 219 220 #define WB_LIST_INLINE 16 221 222 struct writeback_struct { 223 struct list_head endio_entry; 224 struct dm_writecache *wc; 225 struct wc_entry **wc_list; 226 unsigned int wc_list_n; 227 struct wc_entry *wc_list_inline[WB_LIST_INLINE]; 228 struct bio bio; 229 }; 230 231 struct copy_struct { 232 struct list_head endio_entry; 233 struct dm_writecache *wc; 234 struct wc_entry *e; 235 
unsigned int n_entries; 236 int error; 237 }; 238 239 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(dm_writecache_throttle, 240 "A percentage of time allocated for data copying"); 241 242 static void wc_lock(struct dm_writecache *wc) 243 { 244 mutex_lock(&wc->lock); 245 } 246 247 static void wc_unlock(struct dm_writecache *wc) 248 { 249 mutex_unlock(&wc->lock); 250 } 251 252 #ifdef DM_WRITECACHE_HAS_PMEM 253 static int persistent_memory_claim(struct dm_writecache *wc) 254 { 255 int r; 256 loff_t s; 257 long p, da; 258 unsigned long pfn; 259 int id; 260 struct page **pages; 261 sector_t offset; 262 263 wc->memory_vmapped = false; 264 265 s = wc->memory_map_size; 266 p = s >> PAGE_SHIFT; 267 if (!p) { 268 r = -EINVAL; 269 goto err1; 270 } 271 if (p != s >> PAGE_SHIFT) { 272 r = -EOVERFLOW; 273 goto err1; 274 } 275 276 offset = get_start_sect(wc->ssd_dev->bdev); 277 if (offset & (PAGE_SIZE / 512 - 1)) { 278 r = -EINVAL; 279 goto err1; 280 } 281 offset >>= PAGE_SHIFT - 9; 282 283 id = dax_read_lock(); 284 285 da = dax_direct_access(wc->ssd_dev->dax_dev, offset, p, DAX_ACCESS, 286 &wc->memory_map, &pfn); 287 if (da < 0) { 288 wc->memory_map = NULL; 289 r = da; 290 goto err2; 291 } 292 if (!pfn_valid(pfn)) { 293 wc->memory_map = NULL; 294 r = -EOPNOTSUPP; 295 goto err2; 296 } 297 if (da != p) { 298 long i; 299 300 wc->memory_map = NULL; 301 pages = vmalloc_array(p, sizeof(struct page *)); 302 if (!pages) { 303 r = -ENOMEM; 304 goto err2; 305 } 306 i = 0; 307 do { 308 long daa; 309 310 daa = dax_direct_access(wc->ssd_dev->dax_dev, offset + i, 311 p - i, DAX_ACCESS, NULL, &pfn); 312 if (daa <= 0) { 313 r = daa ? 
daa : -EINVAL; 314 goto err3; 315 } 316 if (!pfn_valid(pfn)) { 317 r = -EOPNOTSUPP; 318 goto err3; 319 } 320 while (daa-- && i < p) { 321 pages[i++] = pfn_to_page(pfn); 322 pfn++; 323 if (!(i & 15)) 324 cond_resched(); 325 } 326 } while (i < p); 327 wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL); 328 if (!wc->memory_map) { 329 r = -ENOMEM; 330 goto err3; 331 } 332 vfree(pages); 333 wc->memory_vmapped = true; 334 } 335 336 dax_read_unlock(id); 337 338 wc->memory_map += (size_t)wc->start_sector << SECTOR_SHIFT; 339 wc->memory_map_size -= (size_t)wc->start_sector << SECTOR_SHIFT; 340 341 return 0; 342 err3: 343 vfree(pages); 344 err2: 345 dax_read_unlock(id); 346 err1: 347 return r; 348 } 349 #else 350 static int persistent_memory_claim(struct dm_writecache *wc) 351 { 352 return -EOPNOTSUPP; 353 } 354 #endif 355 356 static void persistent_memory_release(struct dm_writecache *wc) 357 { 358 if (wc->memory_vmapped) 359 vunmap(wc->memory_map - ((size_t)wc->start_sector << SECTOR_SHIFT)); 360 } 361 362 static struct page *persistent_memory_page(void *addr) 363 { 364 if (is_vmalloc_addr(addr)) 365 return vmalloc_to_page(addr); 366 else 367 return virt_to_page(addr); 368 } 369 370 static unsigned int persistent_memory_page_offset(void *addr) 371 { 372 return (unsigned long)addr & (PAGE_SIZE - 1); 373 } 374 375 static void persistent_memory_flush_cache(void *ptr, size_t size) 376 { 377 if (is_vmalloc_addr(ptr)) 378 flush_kernel_vmap_range(ptr, size); 379 } 380 381 static void persistent_memory_invalidate_cache(void *ptr, size_t size) 382 { 383 if (is_vmalloc_addr(ptr)) 384 invalidate_kernel_vmap_range(ptr, size); 385 } 386 387 static struct wc_memory_superblock *sb(struct dm_writecache *wc) 388 { 389 return wc->memory_map; 390 } 391 392 static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e) 393 { 394 return &sb(wc)->entries[e->index]; 395 } 396 397 static void *memory_data(struct dm_writecache *wc, struct wc_entry *e) 398 { 399 return 
(char *)wc->block_start + (e->index << wc->block_size_bits); 400 } 401 402 static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e) 403 { 404 return wc->start_sector + wc->metadata_sectors + 405 ((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT)); 406 } 407 408 static uint64_t read_original_sector(struct dm_writecache *wc, struct wc_entry *e) 409 { 410 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 411 return e->original_sector; 412 #else 413 return le64_to_cpu(memory_entry(wc, e)->original_sector); 414 #endif 415 } 416 417 static uint64_t read_seq_count(struct dm_writecache *wc, struct wc_entry *e) 418 { 419 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 420 return e->seq_count; 421 #else 422 return le64_to_cpu(memory_entry(wc, e)->seq_count); 423 #endif 424 } 425 426 static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e) 427 { 428 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 429 e->seq_count = -1; 430 #endif 431 pmem_assign(memory_entry(wc, e)->seq_count, cpu_to_le64(-1)); 432 } 433 434 static void write_original_sector_seq_count(struct dm_writecache *wc, struct wc_entry *e, 435 uint64_t original_sector, uint64_t seq_count) 436 { 437 struct wc_memory_entry me; 438 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 439 e->original_sector = original_sector; 440 e->seq_count = seq_count; 441 #endif 442 me.original_sector = cpu_to_le64(original_sector); 443 me.seq_count = cpu_to_le64(seq_count); 444 pmem_assign(*memory_entry(wc, e), me); 445 } 446 447 #define writecache_error(wc, err, msg, arg...) 
\ 448 do { \ 449 if (!cmpxchg(&(wc)->error, 0, err)) \ 450 DMERR(msg, ##arg); \ 451 wake_up(&(wc)->freelist_wait); \ 452 } while (0) 453 454 #define writecache_has_error(wc) (unlikely(READ_ONCE((wc)->error))) 455 456 static void writecache_flush_all_metadata(struct dm_writecache *wc) 457 { 458 if (!WC_MODE_PMEM(wc)) 459 memset(wc->dirty_bitmap, -1, wc->dirty_bitmap_size); 460 } 461 462 static void writecache_flush_region(struct dm_writecache *wc, void *ptr, size_t size) 463 { 464 if (!WC_MODE_PMEM(wc)) 465 __set_bit(((char *)ptr - (char *)wc->memory_map) / BITMAP_GRANULARITY, 466 wc->dirty_bitmap); 467 } 468 469 static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev); 470 471 struct io_notify { 472 struct dm_writecache *wc; 473 struct completion c; 474 atomic_t count; 475 }; 476 477 static void writecache_notify_io(unsigned long error, void *context) 478 { 479 struct io_notify *endio = context; 480 481 if (unlikely(error != 0)) 482 writecache_error(endio->wc, -EIO, "error writing metadata"); 483 BUG_ON(atomic_read(&endio->count) <= 0); 484 if (atomic_dec_and_test(&endio->count)) 485 complete(&endio->c); 486 } 487 488 static void writecache_wait_for_ios(struct dm_writecache *wc, int direction) 489 { 490 wait_event(wc->bio_in_progress_wait[direction], 491 !atomic_read(&wc->bio_in_progress[direction])); 492 } 493 494 static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios) 495 { 496 struct dm_io_region region; 497 struct dm_io_request req; 498 struct io_notify endio = { 499 wc, 500 COMPLETION_INITIALIZER_ONSTACK(endio.c), 501 ATOMIC_INIT(1), 502 }; 503 unsigned int bitmap_bits = wc->dirty_bitmap_size * 8; 504 unsigned int i = 0; 505 506 while (1) { 507 unsigned int j; 508 509 i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i); 510 if (unlikely(i == bitmap_bits)) 511 break; 512 j = find_next_zero_bit(wc->dirty_bitmap, bitmap_bits, i); 513 514 region.bdev = wc->ssd_dev->bdev; 515 region.sector = (sector_t)i * 
(BITMAP_GRANULARITY >> SECTOR_SHIFT); 516 region.count = (sector_t)(j - i) * (BITMAP_GRANULARITY >> SECTOR_SHIFT); 517 518 if (unlikely(region.sector >= wc->metadata_sectors)) 519 break; 520 if (unlikely(region.sector + region.count > wc->metadata_sectors)) 521 region.count = wc->metadata_sectors - region.sector; 522 523 region.sector += wc->start_sector; 524 atomic_inc(&endio.count); 525 req.bi_opf = REQ_OP_WRITE | REQ_SYNC; 526 req.mem.type = DM_IO_VMA; 527 req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY; 528 req.client = wc->dm_io; 529 req.notify.fn = writecache_notify_io; 530 req.notify.context = &endio; 531 532 /* writing via async dm-io (implied by notify.fn above) won't return an error */ 533 (void) dm_io(&req, 1, ®ion, NULL, IOPRIO_DEFAULT); 534 i = j; 535 } 536 537 writecache_notify_io(0, &endio); 538 wait_for_completion_io(&endio.c); 539 540 if (wait_for_ios) 541 writecache_wait_for_ios(wc, WRITE); 542 543 writecache_disk_flush(wc, wc->ssd_dev); 544 545 memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size); 546 } 547 548 static void ssd_commit_superblock(struct dm_writecache *wc) 549 { 550 int r; 551 struct dm_io_region region; 552 struct dm_io_request req; 553 554 region.bdev = wc->ssd_dev->bdev; 555 region.sector = 0; 556 region.count = max(4096U, wc->block_size) >> SECTOR_SHIFT; 557 558 if (unlikely(region.sector + region.count > wc->metadata_sectors)) 559 region.count = wc->metadata_sectors - region.sector; 560 561 region.sector += wc->start_sector; 562 563 req.bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_FUA; 564 req.mem.type = DM_IO_VMA; 565 req.mem.ptr.vma = (char *)wc->memory_map; 566 req.client = wc->dm_io; 567 req.notify.fn = NULL; 568 req.notify.context = NULL; 569 570 r = dm_io(&req, 1, ®ion, NULL, IOPRIO_DEFAULT); 571 if (unlikely(r)) 572 writecache_error(wc, r, "error writing superblock"); 573 } 574 575 static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios) 576 { 577 if (WC_MODE_PMEM(wc)) 578 
pmem_wmb(); 579 else 580 ssd_commit_flushed(wc, wait_for_ios); 581 } 582 583 static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev) 584 { 585 int r; 586 struct dm_io_region region; 587 struct dm_io_request req; 588 589 region.bdev = dev->bdev; 590 region.sector = 0; 591 region.count = 0; 592 req.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; 593 req.mem.type = DM_IO_KMEM; 594 req.mem.ptr.addr = NULL; 595 req.client = wc->dm_io; 596 req.notify.fn = NULL; 597 598 r = dm_io(&req, 1, ®ion, NULL, IOPRIO_DEFAULT); 599 if (unlikely(r)) 600 writecache_error(wc, r, "error flushing metadata: %d", r); 601 } 602 603 #define WFE_RETURN_FOLLOWING 1 604 #define WFE_LOWEST_SEQ 2 605 606 static struct wc_entry *writecache_find_entry(struct dm_writecache *wc, 607 uint64_t block, int flags) 608 { 609 struct wc_entry *e; 610 struct rb_node *node = wc->tree.rb_node; 611 612 if (unlikely(!node)) 613 return NULL; 614 615 while (1) { 616 e = container_of(node, struct wc_entry, rb_node); 617 if (read_original_sector(wc, e) == block) 618 break; 619 620 node = (read_original_sector(wc, e) >= block ? 
621 e->rb_node.rb_left : e->rb_node.rb_right); 622 if (unlikely(!node)) { 623 if (!(flags & WFE_RETURN_FOLLOWING)) 624 return NULL; 625 if (read_original_sector(wc, e) >= block) 626 return e; 627 628 node = rb_next(&e->rb_node); 629 if (unlikely(!node)) 630 return NULL; 631 632 e = container_of(node, struct wc_entry, rb_node); 633 return e; 634 } 635 } 636 637 while (1) { 638 struct wc_entry *e2; 639 640 if (flags & WFE_LOWEST_SEQ) 641 node = rb_prev(&e->rb_node); 642 else 643 node = rb_next(&e->rb_node); 644 if (unlikely(!node)) 645 return e; 646 e2 = container_of(node, struct wc_entry, rb_node); 647 if (read_original_sector(wc, e2) != block) 648 return e; 649 e = e2; 650 } 651 } 652 653 static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *ins) 654 { 655 struct wc_entry *e; 656 struct rb_node **node = &wc->tree.rb_node, *parent = NULL; 657 658 while (*node) { 659 e = container_of(*node, struct wc_entry, rb_node); 660 parent = &e->rb_node; 661 if (read_original_sector(wc, e) > read_original_sector(wc, ins)) 662 node = &parent->rb_left; 663 else 664 node = &parent->rb_right; 665 } 666 rb_link_node(&ins->rb_node, parent, node); 667 rb_insert_color(&ins->rb_node, &wc->tree); 668 list_add(&ins->lru, &wc->lru); 669 ins->age = jiffies; 670 } 671 672 static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e) 673 { 674 list_del(&e->lru); 675 rb_erase(&e->rb_node, &wc->tree); 676 } 677 678 static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry *e) 679 { 680 if (WC_MODE_SORT_FREELIST(wc)) { 681 struct rb_node **node = &wc->freetree.rb_node, *parent = NULL; 682 683 if (unlikely(!*node)) 684 wc->current_free = e; 685 while (*node) { 686 parent = *node; 687 if (&e->rb_node < *node) 688 node = &parent->rb_left; 689 else 690 node = &parent->rb_right; 691 } 692 rb_link_node(&e->rb_node, parent, node); 693 rb_insert_color(&e->rb_node, &wc->freetree); 694 } else { 695 list_add_tail(&e->lru, &wc->freelist); 696 } 
697 wc->freelist_size++; 698 } 699 700 static inline void writecache_verify_watermark(struct dm_writecache *wc) 701 { 702 if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark)) 703 queue_work(wc->writeback_wq, &wc->writeback_work); 704 } 705 706 static void writecache_max_age_timer(struct timer_list *t) 707 { 708 struct dm_writecache *wc = timer_container_of(wc, t, max_age_timer); 709 710 if (!dm_suspended(wc->ti) && !writecache_has_error(wc)) { 711 queue_work(wc->writeback_wq, &wc->writeback_work); 712 mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV); 713 } 714 } 715 716 static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector) 717 { 718 struct wc_entry *e; 719 720 if (WC_MODE_SORT_FREELIST(wc)) { 721 struct rb_node *next; 722 723 if (unlikely(!wc->current_free)) 724 return NULL; 725 e = wc->current_free; 726 if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector)) 727 return NULL; 728 next = rb_next(&e->rb_node); 729 rb_erase(&e->rb_node, &wc->freetree); 730 if (unlikely(!next)) 731 next = rb_first(&wc->freetree); 732 wc->current_free = next ? 
container_of(next, struct wc_entry, rb_node) : NULL; 733 } else { 734 if (unlikely(list_empty(&wc->freelist))) 735 return NULL; 736 e = container_of(wc->freelist.next, struct wc_entry, lru); 737 if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector)) 738 return NULL; 739 list_del(&e->lru); 740 } 741 wc->freelist_size--; 742 743 writecache_verify_watermark(wc); 744 745 return e; 746 } 747 748 static void writecache_free_entry(struct dm_writecache *wc, struct wc_entry *e) 749 { 750 writecache_unlink(wc, e); 751 writecache_add_to_freelist(wc, e); 752 clear_seq_count(wc, e); 753 writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry)); 754 if (unlikely(waitqueue_active(&wc->freelist_wait))) 755 wake_up(&wc->freelist_wait); 756 } 757 758 static void writecache_wait_on_freelist(struct dm_writecache *wc) 759 { 760 DEFINE_WAIT(wait); 761 762 prepare_to_wait(&wc->freelist_wait, &wait, TASK_UNINTERRUPTIBLE); 763 wc_unlock(wc); 764 io_schedule(); 765 finish_wait(&wc->freelist_wait, &wait); 766 wc_lock(wc); 767 } 768 769 static void writecache_poison_lists(struct dm_writecache *wc) 770 { 771 /* 772 * Catch incorrect access to these values while the device is suspended. 
773 */ 774 memset(&wc->tree, -1, sizeof(wc->tree)); 775 wc->lru.next = LIST_POISON1; 776 wc->lru.prev = LIST_POISON2; 777 wc->freelist.next = LIST_POISON1; 778 wc->freelist.prev = LIST_POISON2; 779 } 780 781 static void writecache_flush_entry(struct dm_writecache *wc, struct wc_entry *e) 782 { 783 writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry)); 784 if (WC_MODE_PMEM(wc)) 785 writecache_flush_region(wc, memory_data(wc, e), wc->block_size); 786 } 787 788 static bool writecache_entry_is_committed(struct dm_writecache *wc, struct wc_entry *e) 789 { 790 return read_seq_count(wc, e) < wc->seq_count; 791 } 792 793 static void writecache_flush(struct dm_writecache *wc) 794 { 795 struct wc_entry *e, *e2; 796 bool need_flush_after_free; 797 798 wc->uncommitted_blocks = 0; 799 timer_delete(&wc->autocommit_timer); 800 801 if (list_empty(&wc->lru)) 802 return; 803 804 e = container_of(wc->lru.next, struct wc_entry, lru); 805 if (writecache_entry_is_committed(wc, e)) { 806 if (wc->overwrote_committed) { 807 writecache_wait_for_ios(wc, WRITE); 808 writecache_disk_flush(wc, wc->ssd_dev); 809 wc->overwrote_committed = false; 810 } 811 return; 812 } 813 while (1) { 814 writecache_flush_entry(wc, e); 815 if (unlikely(e->lru.next == &wc->lru)) 816 break; 817 e2 = container_of(e->lru.next, struct wc_entry, lru); 818 if (writecache_entry_is_committed(wc, e2)) 819 break; 820 e = e2; 821 cond_resched(); 822 } 823 writecache_commit_flushed(wc, true); 824 825 wc->seq_count++; 826 pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count)); 827 if (WC_MODE_PMEM(wc)) 828 writecache_commit_flushed(wc, false); 829 else 830 ssd_commit_superblock(wc); 831 832 wc->overwrote_committed = false; 833 834 need_flush_after_free = false; 835 while (1) { 836 /* Free another committed entry with lower seq-count */ 837 struct rb_node *rb_node = rb_prev(&e->rb_node); 838 839 if (rb_node) { 840 e2 = container_of(rb_node, struct wc_entry, rb_node); 841 if 
(read_original_sector(wc, e2) == read_original_sector(wc, e) && 842 likely(!e2->write_in_progress)) { 843 writecache_free_entry(wc, e2); 844 need_flush_after_free = true; 845 } 846 } 847 if (unlikely(e->lru.prev == &wc->lru)) 848 break; 849 e = container_of(e->lru.prev, struct wc_entry, lru); 850 cond_resched(); 851 } 852 853 if (need_flush_after_free) 854 writecache_commit_flushed(wc, false); 855 } 856 857 static void writecache_flush_work(struct work_struct *work) 858 { 859 struct dm_writecache *wc = container_of(work, struct dm_writecache, flush_work); 860 861 wc_lock(wc); 862 writecache_flush(wc); 863 wc_unlock(wc); 864 } 865 866 static void writecache_autocommit_timer(struct timer_list *t) 867 { 868 struct dm_writecache *wc = timer_container_of(wc, t, autocommit_timer); 869 870 if (!writecache_has_error(wc)) 871 queue_work(wc->writeback_wq, &wc->flush_work); 872 } 873 874 static void writecache_schedule_autocommit(struct dm_writecache *wc) 875 { 876 if (!timer_pending(&wc->autocommit_timer)) 877 mod_timer(&wc->autocommit_timer, jiffies + wc->autocommit_jiffies); 878 } 879 880 static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_t end) 881 { 882 struct wc_entry *e; 883 bool discarded_something = false; 884 885 e = writecache_find_entry(wc, start, WFE_RETURN_FOLLOWING | WFE_LOWEST_SEQ); 886 if (unlikely(!e)) 887 return; 888 889 while (read_original_sector(wc, e) < end) { 890 struct rb_node *node = rb_next(&e->rb_node); 891 892 if (likely(!e->write_in_progress)) { 893 if (!discarded_something) { 894 if (!WC_MODE_PMEM(wc)) { 895 writecache_wait_for_ios(wc, READ); 896 writecache_wait_for_ios(wc, WRITE); 897 } 898 discarded_something = true; 899 } 900 if (!writecache_entry_is_committed(wc, e)) 901 wc->uncommitted_blocks--; 902 writecache_free_entry(wc, e); 903 } 904 905 if (unlikely(!node)) 906 break; 907 908 e = container_of(node, struct wc_entry, rb_node); 909 } 910 911 if (discarded_something) 912 writecache_commit_flushed(wc, false); 
913 } 914 915 static bool writecache_wait_for_writeback(struct dm_writecache *wc) 916 { 917 if (wc->writeback_size) { 918 writecache_wait_on_freelist(wc); 919 return true; 920 } 921 return false; 922 } 923 924 static void writecache_suspend(struct dm_target *ti) 925 { 926 struct dm_writecache *wc = ti->private; 927 bool flush_on_suspend; 928 929 timer_delete_sync(&wc->autocommit_timer); 930 timer_delete_sync(&wc->max_age_timer); 931 932 wc_lock(wc); 933 writecache_flush(wc); 934 flush_on_suspend = wc->flush_on_suspend; 935 if (flush_on_suspend) { 936 wc->flush_on_suspend = false; 937 wc->writeback_all++; 938 queue_work(wc->writeback_wq, &wc->writeback_work); 939 } 940 wc_unlock(wc); 941 942 drain_workqueue(wc->writeback_wq); 943 944 wc_lock(wc); 945 if (flush_on_suspend) 946 wc->writeback_all--; 947 while (writecache_wait_for_writeback(wc)) 948 ; 949 950 if (WC_MODE_PMEM(wc)) 951 persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size); 952 953 writecache_poison_lists(wc); 954 955 wc_unlock(wc); 956 } 957 958 static int writecache_alloc_entries(struct dm_writecache *wc) 959 { 960 size_t b; 961 962 if (wc->entries) 963 return 0; 964 wc->entries = vmalloc_array(wc->n_blocks, sizeof(struct wc_entry)); 965 if (!wc->entries) 966 return -ENOMEM; 967 for (b = 0; b < wc->n_blocks; b++) { 968 struct wc_entry *e = &wc->entries[b]; 969 970 e->index = b; 971 e->write_in_progress = false; 972 cond_resched(); 973 } 974 975 return 0; 976 } 977 978 static int writecache_read_metadata(struct dm_writecache *wc, sector_t n_sectors) 979 { 980 struct dm_io_region region; 981 struct dm_io_request req; 982 983 region.bdev = wc->ssd_dev->bdev; 984 region.sector = wc->start_sector; 985 region.count = n_sectors; 986 req.bi_opf = REQ_OP_READ | REQ_SYNC; 987 req.mem.type = DM_IO_VMA; 988 req.mem.ptr.vma = (char *)wc->memory_map; 989 req.client = wc->dm_io; 990 req.notify.fn = NULL; 991 992 return dm_io(&req, 1, ®ion, NULL, IOPRIO_DEFAULT); 993 } 994 995 static void 
writecache_resume(struct dm_target *ti) 996 { 997 struct dm_writecache *wc = ti->private; 998 size_t b; 999 bool need_flush = false; 1000 __le64 sb_seq_count; 1001 int r; 1002 1003 wc_lock(wc); 1004 1005 wc->data_device_sectors = bdev_nr_sectors(wc->dev->bdev); 1006 1007 if (WC_MODE_PMEM(wc)) { 1008 persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size); 1009 } else { 1010 r = writecache_read_metadata(wc, wc->metadata_sectors); 1011 if (r) { 1012 size_t sb_entries_offset; 1013 1014 writecache_error(wc, r, "unable to read metadata: %d", r); 1015 sb_entries_offset = offsetof(struct wc_memory_superblock, entries); 1016 memset((char *)wc->memory_map + sb_entries_offset, -1, 1017 (wc->metadata_sectors << SECTOR_SHIFT) - sb_entries_offset); 1018 } 1019 } 1020 1021 wc->tree = RB_ROOT; 1022 INIT_LIST_HEAD(&wc->lru); 1023 if (WC_MODE_SORT_FREELIST(wc)) { 1024 wc->freetree = RB_ROOT; 1025 wc->current_free = NULL; 1026 } else { 1027 INIT_LIST_HEAD(&wc->freelist); 1028 } 1029 wc->freelist_size = 0; 1030 1031 r = copy_mc_to_kernel(&sb_seq_count, &sb(wc)->seq_count, 1032 sizeof(uint64_t)); 1033 if (r) { 1034 writecache_error(wc, r, "hardware memory error when reading superblock: %d", r); 1035 sb_seq_count = cpu_to_le64(0); 1036 } 1037 wc->seq_count = le64_to_cpu(sb_seq_count); 1038 1039 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 1040 for (b = 0; b < wc->n_blocks; b++) { 1041 struct wc_entry *e = &wc->entries[b]; 1042 struct wc_memory_entry wme; 1043 1044 if (writecache_has_error(wc)) { 1045 e->original_sector = -1; 1046 e->seq_count = -1; 1047 continue; 1048 } 1049 r = copy_mc_to_kernel(&wme, memory_entry(wc, e), 1050 sizeof(struct wc_memory_entry)); 1051 if (r) { 1052 writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d", 1053 (unsigned long)b, r); 1054 e->original_sector = -1; 1055 e->seq_count = -1; 1056 } else { 1057 e->original_sector = le64_to_cpu(wme.original_sector); 1058 e->seq_count = le64_to_cpu(wme.seq_count); 1059 } 
1060 cond_resched(); 1061 } 1062 #endif 1063 for (b = 0; b < wc->n_blocks; b++) { 1064 struct wc_entry *e = &wc->entries[b]; 1065 1066 if (!writecache_entry_is_committed(wc, e)) { 1067 if (read_seq_count(wc, e) != -1) { 1068 erase_this: 1069 clear_seq_count(wc, e); 1070 need_flush = true; 1071 } 1072 writecache_add_to_freelist(wc, e); 1073 } else { 1074 struct wc_entry *old; 1075 1076 old = writecache_find_entry(wc, read_original_sector(wc, e), 0); 1077 if (!old) { 1078 writecache_insert_entry(wc, e); 1079 } else { 1080 if (read_seq_count(wc, old) == read_seq_count(wc, e)) { 1081 writecache_error(wc, -EINVAL, 1082 "two identical entries, position %llu, sector %llu, sequence %llu", 1083 (unsigned long long)b, (unsigned long long)read_original_sector(wc, e), 1084 (unsigned long long)read_seq_count(wc, e)); 1085 } 1086 if (read_seq_count(wc, old) > read_seq_count(wc, e)) { 1087 goto erase_this; 1088 } else { 1089 writecache_free_entry(wc, old); 1090 writecache_insert_entry(wc, e); 1091 need_flush = true; 1092 } 1093 } 1094 } 1095 cond_resched(); 1096 } 1097 1098 if (need_flush) { 1099 writecache_flush_all_metadata(wc); 1100 writecache_commit_flushed(wc, false); 1101 } 1102 1103 writecache_verify_watermark(wc); 1104 1105 if (wc->max_age != MAX_AGE_UNSPECIFIED) 1106 mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV); 1107 1108 wc_unlock(wc); 1109 } 1110 1111 static int process_flush_mesg(unsigned int argc, char **argv, struct dm_writecache *wc) 1112 { 1113 if (argc != 1) 1114 return -EINVAL; 1115 1116 wc_lock(wc); 1117 if (dm_suspended(wc->ti)) { 1118 wc_unlock(wc); 1119 return -EBUSY; 1120 } 1121 if (writecache_has_error(wc)) { 1122 wc_unlock(wc); 1123 return -EIO; 1124 } 1125 1126 writecache_flush(wc); 1127 wc->writeback_all++; 1128 queue_work(wc->writeback_wq, &wc->writeback_work); 1129 wc_unlock(wc); 1130 1131 flush_workqueue(wc->writeback_wq); 1132 1133 wc_lock(wc); 1134 wc->writeback_all--; 1135 if (writecache_has_error(wc)) { 1136 wc_unlock(wc); 
1137 return -EIO; 1138 } 1139 wc_unlock(wc); 1140 1141 return 0; 1142 } 1143 1144 static int process_flush_on_suspend_mesg(unsigned int argc, char **argv, struct dm_writecache *wc) 1145 { 1146 if (argc != 1) 1147 return -EINVAL; 1148 1149 wc_lock(wc); 1150 wc->flush_on_suspend = true; 1151 wc_unlock(wc); 1152 1153 return 0; 1154 } 1155 1156 static void activate_cleaner(struct dm_writecache *wc) 1157 { 1158 wc->flush_on_suspend = true; 1159 wc->cleaner = true; 1160 wc->freelist_high_watermark = wc->n_blocks; 1161 wc->freelist_low_watermark = wc->n_blocks; 1162 } 1163 1164 static int process_cleaner_mesg(unsigned int argc, char **argv, struct dm_writecache *wc) 1165 { 1166 if (argc != 1) 1167 return -EINVAL; 1168 1169 wc_lock(wc); 1170 activate_cleaner(wc); 1171 if (!dm_suspended(wc->ti)) 1172 writecache_verify_watermark(wc); 1173 wc_unlock(wc); 1174 1175 return 0; 1176 } 1177 1178 static int process_clear_stats_mesg(unsigned int argc, char **argv, struct dm_writecache *wc) 1179 { 1180 if (argc != 1) 1181 return -EINVAL; 1182 1183 wc_lock(wc); 1184 memset(&wc->stats, 0, sizeof(wc->stats)); 1185 wc_unlock(wc); 1186 1187 return 0; 1188 } 1189 1190 static int writecache_message(struct dm_target *ti, unsigned int argc, char **argv, 1191 char *result, unsigned int maxlen) 1192 { 1193 int r = -EINVAL; 1194 struct dm_writecache *wc = ti->private; 1195 1196 if (!strcasecmp(argv[0], "flush")) 1197 r = process_flush_mesg(argc, argv, wc); 1198 else if (!strcasecmp(argv[0], "flush_on_suspend")) 1199 r = process_flush_on_suspend_mesg(argc, argv, wc); 1200 else if (!strcasecmp(argv[0], "cleaner")) 1201 r = process_cleaner_mesg(argc, argv, wc); 1202 else if (!strcasecmp(argv[0], "clear_stats")) 1203 r = process_clear_stats_mesg(argc, argv, wc); 1204 else 1205 DMERR("unrecognised message received: %s", argv[0]); 1206 1207 return r; 1208 } 1209 1210 static void memcpy_flushcache_optimized(void *dest, void *source, size_t size) 1211 { 1212 /* 1213 * clflushopt performs better with 
	 * block size 1024, 2048, 4096
	 * non-temporal stores perform better with block size 512
	 *
	 * block size	512		1024		2048		4096
	 * movnti	496 MB/s	642 MB/s	725 MB/s	744 MB/s
	 * clflushopt	373 MB/s	688 MB/s	1.1 GB/s	1.2 GB/s
	 *
	 * We see that movnti performs better for 512-byte blocks, and
	 * clflushopt performs better for 1024-byte and larger blocks. So, we
	 * prefer clflushopt for sizes >= 768.
	 *
	 * NOTE: this happens to be the case now (with dm-writecache's single
	 * threaded model) but re-evaluate this once memcpy_flushcache() is
	 * enabled to use movdir64b which might invalidate this performance
	 * advantage seen with cache-allocating-writes plus flushing.
	 */
#ifdef CONFIG_X86
	if (static_cpu_has(X86_FEATURE_CLFLUSHOPT) &&
	    likely(boot_cpu_data.x86_clflush_size == 64) &&
	    likely(size >= 768)) {
		/* Copy and flush one 64-byte cacheline at a time. */
		do {
			memcpy((void *)dest, (void *)source, 64);
			clflushopt((void *)dest);
			dest += 64;
			source += 64;
			size -= 64;
		} while (size >= 64);
		return;
	}
#endif
	memcpy_flushcache(dest, source, size);
}

/*
 * Copy wc->block_size bytes between a bio's data and persistent memory,
 * advancing the bio as we go.  Reads use copy_mc_to_kernel() so a hardware
 * memory error is reported instead of crashing the machine.
 */
static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data)
{
	void *buf;
	unsigned int size;
	int rw = bio_data_dir(bio);
	unsigned int remaining_size = wc->block_size;

	do {
		struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);

		buf = bvec_kmap_local(&bv);
		size = bv.bv_len;
		if (unlikely(size > remaining_size))
			size = remaining_size;

		if (rw == READ) {
			int r;

			r = copy_mc_to_kernel(buf, data, size);
			flush_dcache_page(bio_page(bio));
			if (unlikely(r)) {
				writecache_error(wc, r, "hardware memory error when reading data: %d", r);
				bio->bi_status = BLK_STS_IOERR;
			}
		} else {
			flush_dcache_page(bio_page(bio));
			memcpy_flushcache_optimized(data, buf, size);
		}

		kunmap_local(buf);

		data = (char *)data + size;
		remaining_size -= size;
		bio_advance(bio, size);
	} while (unlikely(remaining_size));
}

/*
 * Kernel thread that services flush and discard bios offloaded by
 * writecache_offload_bio() (SSD mode).
 */
static int writecache_flush_thread(void *data)
{
	struct dm_writecache *wc = data;

	while (1) {
		struct bio *bio;

		wc_lock(wc);
		bio = bio_list_pop(&wc->flush_list);
		if (!bio) {
			/* Nothing queued: sleep until woken or asked to stop. */
			set_current_state(TASK_INTERRUPTIBLE);
			wc_unlock(wc);

			if (unlikely(kthread_should_stop())) {
				set_current_state(TASK_RUNNING);
				break;
			}

			schedule();
			continue;
		}

		if (bio_op(bio) == REQ_OP_DISCARD) {
			writecache_discard(wc, bio->bi_iter.bi_sector,
					   bio_end_sector(bio));
			wc_unlock(wc);
			/* Pass the discard down to the origin device. */
			bio_set_dev(bio, wc->dev->bdev);
			submit_bio_noacct(bio);
		} else {
			writecache_flush(wc);
			wc_unlock(wc);
			if (writecache_has_error(wc))
				bio->bi_status = BLK_STS_IOERR;
			bio_endio(bio);
		}
	}

	return 0;
}

/* Queue a bio for the flush thread; wake it only on an empty->non-empty edge. */
static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio)
{
	if (bio_list_empty(&wc->flush_list))
		wake_up_process(wc->flush_thread);
	bio_list_add(&wc->flush_list, bio);
}

/* What writecache_map() should do with a bio after the mode handlers ran. */
enum wc_map_op {
	WC_MAP_SUBMIT,
	WC_MAP_REMAP,
	WC_MAP_REMAP_ORIGIN,
	WC_MAP_RETURN,
	WC_MAP_ERROR,
};

/*
 * Prepare a bio for remapping to the origin device: if a cached entry
 * begins within the bio's range, split the bio at that boundary so the
 * remapped part does not overlap the cached block.
 */
static void writecache_map_remap_origin(struct dm_writecache *wc, struct bio *bio,
					struct wc_entry *e)
{
	if (e) {
		sector_t next_boundary =
			read_original_sector(wc, e) - bio->bi_iter.bi_sector;
		if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT)
			dm_accept_partial_bio(bio, next_boundary);
	}
}

/* Handle a read bio: serve hits from the cache, misses from the origin. */
static enum wc_map_op writecache_map_read(struct dm_writecache *wc, struct bio *bio)
{
	enum wc_map_op map_op;
	struct wc_entry *e;

read_next_block:
	wc->stats.reads++;
	e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
	if (e &&
	    read_original_sector(wc, e) == bio->bi_iter.bi_sector) {
		/* Read hit: serve this block from the cache. */
		wc->stats.read_hits++;
		if (WC_MODE_PMEM(wc)) {
			bio_copy_block(wc, bio, memory_data(wc, e));
			/* bio_copy_block() advanced the bio; loop for the rest. */
			if (bio->bi_iter.bi_size)
				goto read_next_block;
			map_op = WC_MAP_SUBMIT;
		} else {
			dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
			bio_set_dev(bio, wc->ssd_dev->bdev);
			bio->bi_iter.bi_sector = cache_sector(wc, e);
			if (!writecache_entry_is_committed(wc, e))
				writecache_wait_for_ios(wc, WRITE);
			map_op = WC_MAP_REMAP;
		}
	} else {
		writecache_map_remap_origin(wc, bio, e);
		wc->stats.reads += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits;
		map_op = WC_MAP_REMAP_ORIGIN;
	}

	return map_op;
}

/*
 * SSD-mode write path: grow the bio's mapping over as many physically
 * consecutive cache blocks as possible (allocating fresh entries from the
 * freelist, or — when search_used — reusing adjacent existing entries),
 * then remap the bio to the cache device.
 */
static void writecache_bio_copy_ssd(struct dm_writecache *wc, struct bio *bio,
				    struct wc_entry *e, bool search_used)
{
	unsigned int bio_size = wc->block_size;
	sector_t start_cache_sec = cache_sector(wc, e);
	sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);

	while (bio_size < bio->bi_iter.bi_size) {
		if (!search_used) {
			/* Allocate a new entry at the required cache sector. */
			struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);

			if (!f)
				break;
			write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
							(bio_size >> SECTOR_SHIFT), wc->seq_count);
			writecache_insert_entry(wc, f);
			wc->uncommitted_blocks++;
		} else {
			/* Reuse the next entry only if it is logically and
			 * physically contiguous and not under writeback. */
			struct wc_entry *f;
			struct rb_node *next = rb_next(&e->rb_node);

			if (!next)
				break;
			f = container_of(next, struct wc_entry, rb_node);
			if (f != e + 1)
				break;
			if (read_original_sector(wc, f) !=
			    read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
				break;
			if (unlikely(f->write_in_progress))
				break;
			if (writecache_entry_is_committed(wc, f))
				wc->overwrote_committed = true;
			e = f;
		}
		bio_size += wc->block_size;
		current_cache_sec += wc->block_size >> SECTOR_SHIFT;
	}

	bio_set_dev(bio, wc->ssd_dev->bdev);
	bio->bi_iter.bi_sector = start_cache_sec;
	dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);

	wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
	wc->stats.writes_allocate += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits;

	if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
		wc->uncommitted_blocks = 0;
		queue_work(wc->writeback_wq, &wc->flush_work);
	} else {
		writecache_schedule_autocommit(wc);
	}
}

/* Handle a write bio: copy into the cache, or write around it. */
static enum wc_map_op writecache_map_write(struct dm_writecache *wc, struct bio *bio)
{
	struct wc_entry *e;

	do {
		bool found_entry = false;
		bool search_used = false;

		if (writecache_has_error(wc)) {
			wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
			return WC_MAP_ERROR;
		}
		e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
		if (e) {
			if (!writecache_entry_is_committed(wc, e)) {
				/* Uncommitted hit: overwrite in place. */
				wc->stats.write_hits_uncommitted++;
				search_used = true;
				goto bio_copy;
			}
			wc->stats.write_hits_committed++;
			if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
				wc->overwrote_committed = true;
				search_used = true;
				goto bio_copy;
			}
			found_entry = true;
		} else {
			/* In cleaner/metadata-only mode, bypass the cache. */
			if (unlikely(wc->cleaner) ||
			    (wc->metadata_only && !(bio->bi_opf & REQ_META)))
				goto direct_write;
		}
		e = writecache_pop_from_freelist(wc, (sector_t)-1);
		if (unlikely(!e)) {
			if (!WC_MODE_PMEM(wc) && !found_entry) {
direct_write:
				e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
				writecache_map_remap_origin(wc, bio, e);
				wc->stats.writes_around += bio->bi_iter.bi_size >> wc->block_size_bits;
				wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
				return WC_MAP_REMAP_ORIGIN;
			}
			/* No free entry: wait for writeback to release one, retry. */
			wc->stats.writes_blocked_on_freelist++;
			writecache_wait_on_freelist(wc);
			continue;
		}
		write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count);
		writecache_insert_entry(wc, e);
		wc->uncommitted_blocks++;
		wc->stats.writes_allocate++;
bio_copy:
		if (WC_MODE_PMEM(wc)) {
			bio_copy_block(wc, bio, memory_data(wc, e));
			wc->stats.writes++;
		} else {
			writecache_bio_copy_ssd(wc, bio, e, search_used);
			return WC_MAP_REMAP;
		}
	} while (bio->bi_iter.bi_size);

	/* FUA or too many uncommitted blocks force a commit now. */
	if (unlikely(bio->bi_opf & REQ_FUA || wc->uncommitted_blocks >= wc->autocommit_blocks))
		writecache_flush(wc);
	else
		writecache_schedule_autocommit(wc);

	return WC_MAP_SUBMIT;
}

/* Handle a REQ_PREFLUSH bio. */
static enum wc_map_op writecache_map_flush(struct dm_writecache *wc, struct bio *bio)
{
	if (writecache_has_error(wc))
		return WC_MAP_ERROR;

	if (WC_MODE_PMEM(wc)) {
		wc->stats.flushes++;
		writecache_flush(wc);
		if (writecache_has_error(wc))
			return WC_MAP_ERROR;
		else if (unlikely(wc->cleaner) || unlikely(wc->metadata_only))
			return WC_MAP_REMAP_ORIGIN;
		return WC_MAP_SUBMIT;
	}
	/* SSD: */
	if (dm_bio_get_target_bio_nr(bio))
		return WC_MAP_REMAP_ORIGIN;
	wc->stats.flushes++;
	/* Flushing the SSD metadata may block, so do it on the flush thread. */
	writecache_offload_bio(wc, bio);
	return WC_MAP_RETURN;
}

/* Handle a REQ_OP_DISCARD bio. */
static enum wc_map_op writecache_map_discard(struct dm_writecache *wc, struct bio *bio)
{
	wc->stats.discards += bio->bi_iter.bi_size >> wc->block_size_bits;

	if (writecache_has_error(wc))
		return WC_MAP_ERROR;

	if (WC_MODE_PMEM(wc)) {
		writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio));
		return WC_MAP_REMAP_ORIGIN;
	}
	/* SSD: */
	writecache_offload_bio(wc, bio);
	return WC_MAP_RETURN;
}

/*
 * Main device-mapper map function: classify the bio, run the matching
 * handler under wc->lock, then act on the returned wc_map_op.
 */
static int writecache_map(struct dm_target *ti, struct bio *bio)
{
	struct dm_writecache *wc = ti->private;
	enum wc_map_op map_op;

	bio->bi_private = NULL;

	wc_lock(wc);

	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
		map_op = writecache_map_flush(wc, bio);
		goto done;
	}

	bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);

	/* Reject I/O that is not aligned to the cache block size. */
	if (unlikely((((unsigned int)bio->bi_iter.bi_sector | bio_sectors(bio)) &
				(wc->block_size / 512 - 1)) != 0)) {
		DMERR("I/O is not aligned, sector %llu, size %u, block size %u",
		      (unsigned long long)bio->bi_iter.bi_sector,
		      bio->bi_iter.bi_size, wc->block_size);
		map_op = WC_MAP_ERROR;
		goto done;
	}

	if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
		map_op = writecache_map_discard(wc, bio);
		goto done;
	}

	if (bio_data_dir(bio) == READ)
		map_op = writecache_map_read(wc, bio);
	else
		map_op = writecache_map_write(wc, bio);
done:
	switch (map_op) {
	case WC_MAP_REMAP_ORIGIN:
		if (likely(wc->pause != 0)) {
			if (bio_op(bio) == REQ_OP_WRITE) {
				dm_iot_io_begin(&wc->iot, 1);
				/* bi_private == 2: end_io must call dm_iot_io_end(). */
				bio->bi_private = (void *)2;
			}
		}
		bio_set_dev(bio, wc->dev->bdev);
		wc_unlock(wc);
		return DM_MAPIO_REMAPPED;

	case WC_MAP_REMAP:
		/* make sure that writecache_end_io decrements bio_in_progress: */
		bio->bi_private = (void *)1;
		atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]);
		wc_unlock(wc);
		return DM_MAPIO_REMAPPED;

	case WC_MAP_SUBMIT:
		wc_unlock(wc);
		bio_endio(bio);
		return DM_MAPIO_SUBMITTED;

	case WC_MAP_RETURN:
		/* Ownership passed to the flush thread. */
		wc_unlock(wc);
		return DM_MAPIO_SUBMITTED;

	case WC_MAP_ERROR:
		wc_unlock(wc);
		bio_io_error(bio);
		return DM_MAPIO_SUBMITTED;

	default:
		BUG();
		wc_unlock(wc);
		return DM_MAPIO_KILL;
	}
}

/* Completion hook: undo the bookkeeping set up by writecache_map(). */
static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status)
{
	struct dm_writecache *wc = ti->private;

	if
	    (bio->bi_private == (void *)1) {
		/* Bio was remapped to the cache: drop the in-progress count. */
		int dir = bio_data_dir(bio);

		if (atomic_dec_and_test(&wc->bio_in_progress[dir]))
			if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir])))
				wake_up(&wc->bio_in_progress_wait[dir]);
	} else if (bio->bi_private == (void *)2) {
		/* Bio was remapped to the origin with I/O tracking enabled. */
		dm_iot_io_end(&wc->iot, 1);
	}
	return 0;
}

static int writecache_iterate_devices(struct dm_target *ti,
				      iterate_devices_callout_fn fn, void *data)
{
	struct dm_writecache *wc = ti->private;

	/* Only the origin device is exposed to the iterator. */
	return fn(ti, wc->dev, 0, ti->len, data);
}

static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct dm_writecache *wc = ti->private;

	dm_stack_bs_limits(limits, wc->block_size);
}

/* Bio completion for pmem writeback: queue the wb struct for the endio thread. */
static void writecache_writeback_endio(struct bio *bio)
{
	struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio);
	struct dm_writecache *wc = wb->wc;
	unsigned long flags;

	raw_spin_lock_irqsave(&wc->endio_list_lock, flags);
	/* Wake the endio thread only on an empty->non-empty transition. */
	if (unlikely(list_empty(&wc->endio_list)))
		wake_up_process(wc->endio_thread);
	list_add_tail(&wb->endio_entry, &wc->endio_list);
	raw_spin_unlock_irqrestore(&wc->endio_list_lock, flags);
}

/* kcopyd completion for SSD writeback: queue the copy struct for the endio thread. */
static void writecache_copy_endio(int read_err, unsigned long write_err, void *ptr)
{
	struct copy_struct *c = ptr;
	struct dm_writecache *wc = c->wc;

	c->error = likely(!(read_err | write_err)) ? 0 : -EIO;

	raw_spin_lock_irq(&wc->endio_list_lock);
	if (unlikely(list_empty(&wc->endio_list)))
		wake_up_process(wc->endio_thread);
	list_add_tail(&c->endio_entry, &wc->endio_list);
	raw_spin_unlock_irq(&wc->endio_list_lock);
}

/*
 * Process completed pmem writeback bios: free the written-back entries,
 * periodically committing and dropping wc->lock to bound latency.
 * Called with wc->lock held.
 */
static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list)
{
	unsigned int i;
	struct writeback_struct *wb;
	struct wc_entry *e;
	unsigned long n_walked = 0;

	do {
		wb = list_entry(list->next, struct writeback_struct, endio_entry);
		list_del(&wb->endio_entry);

		if (unlikely(wb->bio.bi_status != BLK_STS_OK))
			writecache_error(wc, blk_status_to_errno(wb->bio.bi_status),
					 "write error %d", wb->bio.bi_status);
		i = 0;
		do {
			e = wb->wc_list[i];
			BUG_ON(!e->write_in_progress);
			e->write_in_progress = false;
			INIT_LIST_HEAD(&e->lru);
			if (!writecache_has_error(wc))
				writecache_free_entry(wc, e);
			BUG_ON(!wc->writeback_size);
			wc->writeback_size--;
			n_walked++;
			/* Bound the time spent holding wc->lock. */
			if (unlikely(n_walked >= ENDIO_LATENCY)) {
				writecache_commit_flushed(wc, false);
				wc_unlock(wc);
				wc_lock(wc);
				n_walked = 0;
			}
		} while (++i < wb->wc_list_n);

		if (wb->wc_list != wb->wc_list_inline)
			kfree(wb->wc_list);
		bio_put(&wb->bio);
	} while (!list_empty(list));
}

/*
 * Process completed SSD (kcopyd) writeback jobs: free the entries of each
 * contiguous run.  Called with wc->lock held.
 */
static void __writecache_endio_ssd(struct dm_writecache *wc, struct list_head *list)
{
	struct copy_struct *c;
	struct wc_entry *e;

	do {
		c = list_entry(list->next, struct copy_struct, endio_entry);
		list_del(&c->endio_entry);

		if (unlikely(c->error))
			writecache_error(wc, c->error, "copy error");

		e = c->e;
		do {
			BUG_ON(!e->write_in_progress);
			e->write_in_progress = false;
			INIT_LIST_HEAD(&e->lru);
			if (!writecache_has_error(wc))
				writecache_free_entry(wc, e);

			BUG_ON(!wc->writeback_size);
			wc->writeback_size--;
			e++;
		} while (--c->n_entries);
		mempool_free(c, &wc->copy_pool);
	} while (!list_empty(list));
}

/*
 * Kernel thread that drains wc->endio_list: flushes the origin device
 * (unless FUA writes were used) and frees the written-back cache entries.
 */
static int writecache_endio_thread(void *data)
{
	struct dm_writecache *wc = data;

	while (1) {
		struct list_head list;

		raw_spin_lock_irq(&wc->endio_list_lock);
		if (!list_empty(&wc->endio_list))
			goto pop_from_list;
		set_current_state(TASK_INTERRUPTIBLE);
		raw_spin_unlock_irq(&wc->endio_list_lock);

		if (unlikely(kthread_should_stop())) {
			set_current_state(TASK_RUNNING);
			break;
		}

		schedule();

		continue;

pop_from_list:
		/* Detach the whole list so the spinlock can be released. */
		list = wc->endio_list;
		list.next->prev = list.prev->next = &list;
		INIT_LIST_HEAD(&wc->endio_list);
		raw_spin_unlock_irq(&wc->endio_list_lock);

		if (!WC_MODE_FUA(wc))
			writecache_disk_flush(wc, wc->dev);

		wc_lock(wc);

		if (WC_MODE_PMEM(wc)) {
			__writecache_endio_pmem(wc, &list);
		} else {
			__writecache_endio_ssd(wc, &list);
			writecache_wait_for_ios(wc, READ);
		}

		writecache_commit_flushed(wc, false);

		wc_unlock(wc);
	}

	return 0;
}

/*
 * Add one cache block's persistent-memory page to the writeback bio,
 * flushing the data from CPU caches first.  Returns true on success and
 * also when the block lies past the end of the origin device (it is then
 * intentionally not added, so nothing is written there).
 */
static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e)
{
	struct dm_writecache *wc = wb->wc;
	unsigned int block_size = wc->block_size;
	void *address = memory_data(wc, e);

	persistent_memory_flush_cache(address, block_size);

	if (unlikely(bio_end_sector(&wb->bio) >= wc->data_device_sectors))
		return true;

	return bio_add_page(&wb->bio, persistent_memory_page(address),
			    block_size, persistent_memory_page_offset(address)) != 0;
}

/* Batch of entries (taken from wc->lru) selected for one writeback pass. */
struct writeback_list {
	struct list_head list;
	size_t size;
};

/* Throttle writeback submission against max_writeback_jobs. */
static void __writeback_throttle(struct dm_writecache *wc, struct writeback_list *wbl)
{
	if (unlikely(wc->max_writeback_jobs)) {
		/* Cheap unlocked check first; recheck under the lock. */
		if (READ_ONCE(wc->writeback_size) - wbl->size >= wc->max_writeback_jobs) {
			wc_lock(wc);
			while (wc->writeback_size - wbl->size >= wc->max_writeback_jobs)
				writecache_wait_on_freelist(wc);
			wc_unlock(wc);
		}
	}
	cond_resched();
}

/*
 * Issue writeback bios for pmem mode, merging consecutive entries of the
 * batch into a single bio where possible.
 */
static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeback_list *wbl)
{
	struct wc_entry *e, *f;
	struct bio *bio;
	struct writeback_struct *wb;
	unsigned int max_pages;

	while (wbl->size) {
		wbl->size--;
		e = container_of(wbl->list.prev, struct wc_entry, lru);
		list_del(&e->lru);

		max_pages = e->wc_list_contiguous;

		bio = bio_alloc_bioset(wc->dev->bdev, max_pages, REQ_OP_WRITE,
				       GFP_NOIO, &wc->bio_set);
		wb = container_of(bio, struct writeback_struct, bio);
		wb->wc = wc;
		bio->bi_end_io = writecache_writeback_endio;
		bio->bi_iter.bi_sector = read_original_sector(wc, e);

		if (unlikely(max_pages > WB_LIST_INLINE))
			wb->wc_list = kmalloc_objs(struct wc_entry *, max_pages,
						   GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);

		/* Fall back to the inline list if the allocation failed. */
		if (likely(max_pages <= WB_LIST_INLINE) || unlikely(!wb->wc_list)) {
			wb->wc_list = wb->wc_list_inline;
			max_pages = WB_LIST_INLINE;
		}

		BUG_ON(!wc_add_block(wb, e));

		wb->wc_list[0] = e;
		wb->wc_list_n = 1;

		/* Merge logically consecutive entries into the same bio. */
		while (wbl->size && wb->wc_list_n < max_pages) {
			f = container_of(wbl->list.prev, struct wc_entry, lru);
			if (read_original_sector(wc, f) !=
			    read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
				break;
			if (!wc_add_block(wb, f))
				break;
			wbl->size--;
			list_del(&f->lru);
			wb->wc_list[wb->wc_list_n++] = f;
			e = f;
		}
		if (WC_MODE_FUA(wc))
			bio->bi_opf |= REQ_FUA;
		if (writecache_has_error(wc)) {
			bio->bi_status = BLK_STS_IOERR;
			bio_endio(bio);
		} else if (unlikely(!bio_sectors(bio))) {
			/* All blocks were past the end of the origin device. */
			bio->bi_status = BLK_STS_OK;
			bio_endio(bio);
		} else {
			submit_bio(bio);
		}

		__writeback_throttle(wc, wbl);
	}
}

/*
 * Issue writeback for SSD mode: one dm-kcopyd copy job per contiguous run
 * of entries in the batch.
 */
static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writeback_list *wbl)
{
	struct wc_entry *e, *f;
	struct dm_io_region from, to;
	struct copy_struct *c;

	while (wbl->size) {
		unsigned int n_sectors;

		wbl->size--;
		e = container_of(wbl->list.prev, struct wc_entry, lru);
		list_del(&e->lru);

		n_sectors = e->wc_list_contiguous << (wc->block_size_bits - SECTOR_SHIFT);

		from.bdev = wc->ssd_dev->bdev;
		from.sector = cache_sector(wc, e);
		from.count = n_sectors;
		to.bdev = wc->dev->bdev;
		to.sector = read_original_sector(wc, e);
		to.count = n_sectors;

		c = mempool_alloc(&wc->copy_pool, GFP_NOIO);
		c->wc = wc;
		c->e = e;
		c->n_entries = e->wc_list_contiguous;

		/* Consume the rest of the contiguous run from the batch. */
		while ((n_sectors -= wc->block_size >> SECTOR_SHIFT)) {
			wbl->size--;
			f = container_of(wbl->list.prev, struct wc_entry, lru);
			BUG_ON(f != e + 1);
			list_del(&f->lru);
			e = f;
		}

		/* Clip (or skip) copies that extend past the origin device. */
		if (unlikely(to.sector + to.count > wc->data_device_sectors)) {
			if (to.sector >= wc->data_device_sectors) {
				writecache_copy_endio(0, 0, c);
				continue;
			}
			from.count = to.count = wc->data_device_sectors - to.sector;
		}

		dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c);

		__writeback_throttle(wc, wbl);
	}
}

/*
 * Writeback worker: pick a batch of committed entries from the LRU (or the
 * whole tree when draining), group contiguous blocks, then submit them via
 * the pmem or SSD path.
 */
static void writecache_writeback(struct work_struct *work)
{
	struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work);
	struct blk_plug plug;
	struct wc_entry *f, *g, *e = NULL;
	struct rb_node *node, *next_node;
	struct list_head skipped;
	struct writeback_list wbl;
	unsigned long n_walked;

	if (!WC_MODE_PMEM(wc)) {
		/* Wait for any active kcopyd work on behalf of ssd writeback */
		dm_kcopyd_client_flush(wc->dm_kcopyd);
	}

	/* Optionally pause writeback until the origin device has been idle. */
	if (likely(wc->pause != 0)) {
		while (1) {
			unsigned long idle;

			if (unlikely(wc->cleaner) || unlikely(wc->writeback_all) ||
			    unlikely(dm_suspended(wc->ti)))
				break;
			idle = dm_iot_idle_time(&wc->iot);
			if (idle >= wc->pause)
				break;
			idle = wc->pause - idle;
			if (idle > HZ)
				idle = HZ;
			schedule_timeout_idle(idle);
		}
	}

	wc_lock(wc);
restart:
	if (writecache_has_error(wc)) {
		wc_unlock(wc);
		return;
	}

	if (unlikely(wc->writeback_all)) {
		if (writecache_wait_for_writeback(wc))
			goto restart;
	}

	if (wc->overwrote_committed)
		writecache_wait_for_ios(wc, WRITE);

	n_walked = 0;
	INIT_LIST_HEAD(&skipped);
	INIT_LIST_HEAD(&wbl.list);
	wbl.size = 0;
	/* Collect entries while draining, below the low watermark, or aged out. */
	while (!list_empty(&wc->lru) &&
	       (wc->writeback_all ||
		wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark ||
		(jiffies - container_of(wc->lru.prev, struct wc_entry, lru)->age >=
		 wc->max_age - wc->max_age / MAX_AGE_DIV))) {

		n_walked++;
		if (unlikely(n_walked > WRITEBACK_LATENCY) &&
		    likely(!wc->writeback_all)) {
			/* Enough for this pass; requeue ourselves and stop. */
			if (likely(!dm_suspended(wc->ti)))
				queue_work(wc->writeback_wq, &wc->writeback_work);
			break;
		}

		if (unlikely(wc->writeback_all)) {
			if (unlikely(!e)) {
				writecache_flush(wc);
				e = container_of(rb_first(&wc->tree), struct wc_entry, rb_node);
			} else
				e = g;
		} else
			e = container_of(wc->lru.prev, struct wc_entry, lru);
		BUG_ON(e->write_in_progress);
		if (unlikely(!writecache_entry_is_committed(wc, e)))
			writecache_flush(wc);

		/*
		 * Skip this entry if an older copy of the same origin sector
		 * is still being written back (ordering must be preserved).
		 */
		node = rb_prev(&e->rb_node);
		if (node) {
			f = container_of(node, struct wc_entry, rb_node);
			if (unlikely(read_original_sector(wc, f) ==
				     read_original_sector(wc, e))) {
				BUG_ON(!f->write_in_progress);
				list_move(&e->lru, &skipped);
				cond_resched();
				continue;
			}
		}
		wc->writeback_size++;
		list_move(&e->lru, &wbl.list);
		wbl.size++;
		e->write_in_progress = true;
		e->wc_list_contiguous = 1;

		f = e;

		/* Extend the run with consecutive committed entries. */
		while (1) {
			next_node = rb_next(&f->rb_node);
			if (unlikely(!next_node))
				break;
			g = container_of(next_node, struct wc_entry, rb_node);
			if (unlikely(read_original_sector(wc, g) ==
				     read_original_sector(wc, f))) {
				f = g;
				continue;
			}
			if (read_original_sector(wc, g) !=
			    read_original_sector(wc, f) + (wc->block_size >> SECTOR_SHIFT))
				break;
			if (unlikely(g->write_in_progress))
				break;
			if (unlikely(!writecache_entry_is_committed(wc, g)))
				break;

			if (!WC_MODE_PMEM(wc)) {
				/* SSD mode also requires physical contiguity. */
				if (g != f + 1)
					break;
			}

			n_walked++;
			//if (unlikely(n_walked > WRITEBACK_LATENCY) && likely(!wc->writeback_all))
			//	break;

			wc->writeback_size++;
			list_move(&g->lru, &wbl.list);
			wbl.size++;
			g->write_in_progress = true;
			g->wc_list_contiguous = BIO_MAX_VECS;
			f = g;
			e->wc_list_contiguous++;
			if (unlikely(e->wc_list_contiguous == BIO_MAX_VECS)) {
				if (unlikely(wc->writeback_all)) {
					next_node = rb_next(&f->rb_node);
					if (likely(next_node))
						g = container_of(next_node, struct wc_entry, rb_node);
				}
				break;
			}
		}
		cond_resched();
	}

	if (!list_empty(&skipped)) {
		list_splice_tail(&skipped, &wc->lru);
		/*
		 * If we didn't do any progress, we must wait until some
		 * writeback finishes to avoid burning CPU in a loop
		 */
		if (unlikely(!wbl.size))
			writecache_wait_for_writeback(wc);
	}

	wc_unlock(wc);

	blk_start_plug(&plug);

	if (WC_MODE_PMEM(wc))
		__writecache_writeback_pmem(wc, &wbl);
	else
		__writecache_writeback_ssd(wc, &wbl);

	blk_finish_plug(&plug);

	if (unlikely(wc->writeback_all)) {
		wc_lock(wc);
		/* Draining: loop until no entry remains under writeback. */
		while (writecache_wait_for_writeback(wc))
			;
		wc_unlock(wc);
	}
}

/*
 * Compute how many cache blocks (and metadata blocks) fit on a cache
 * device of device_size bytes with the given block_size, accounting for
 * the superblock and the per-block metadata entries.
 */
static int calculate_memory_size(uint64_t device_size, unsigned int block_size,
				 size_t *n_blocks_p, size_t *n_metadata_blocks_p)
{
	uint64_t n_blocks, offset;
	struct wc_entry e;

	n_blocks = device_size;
	do_div(n_blocks, block_size + sizeof(struct wc_memory_entry));

	while (1) {
		if (!n_blocks)
			return -ENOSPC;
		/* Verify the following entries[n_blocks] won't overflow */
		if (n_blocks >= ((size_t)-sizeof(struct wc_memory_superblock) /
				 sizeof(struct wc_memory_entry)))
			return -EFBIG;
		offset = offsetof(struct wc_memory_superblock, entries[n_blocks]);
		/* Round the metadata area up to a block boundary. */
		offset = (offset + block_size - 1) & ~(uint64_t)(block_size - 1);
		if (offset + n_blocks * block_size <= device_size)
			break;
		n_blocks--;
	}

	/* check if the bit field overflows */
	e.index = n_blocks;
	if (e.index != n_blocks)
		return -EFBIG;

	if (n_blocks_p)
		*n_blocks_p = n_blocks;
	if (n_metadata_blocks_p)
		*n_metadata_blocks_p = offset >> __ffs(block_size);
	return 0;
}

/*
 * Initialize fresh cache metadata.  The superblock magic is written last,
 * after everything else has been flushed, so a crash mid-initialization
 * leaves the device recognizably uninitialized.
 */
static int init_memory(struct dm_writecache *wc)
{
	size_t b;
	int r;

	r = calculate_memory_size(wc->memory_map_size, wc->block_size, &wc->n_blocks, NULL);
	if (r)
		return r;

	r = writecache_alloc_entries(wc);
	if (r)
		return r;

	for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++)
		pmem_assign(sb(wc)->padding[b], cpu_to_le64(0));
	pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION));
	pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size));
	pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks));
	pmem_assign(sb(wc)->seq_count, cpu_to_le64(0));

	/* Mark every entry as empty (seq_count of -1). */
	for (b = 0; b < wc->n_blocks; b++) {
		write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
		cond_resched();
	}

	writecache_flush_all_metadata(wc);
	writecache_commit_flushed(wc, false);
	pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
	writecache_flush_region(wc, &sb(wc)->magic, sizeof(sb(wc)->magic));
	writecache_commit_flushed(wc, false);

	return 0;
}

/*
 * Destructor: stop the worker threads and release every resource the
 * constructor may have acquired.  Safe on partially constructed targets —
 * every teardown step is guarded or NULL-safe.
 */
static void writecache_dtr(struct dm_target *ti)
{
	struct dm_writecache *wc = ti->private;

	if (!wc)
		return;

	if (wc->endio_thread)
		kthread_stop(wc->endio_thread);

	if (wc->flush_thread)
		kthread_stop(wc->flush_thread);

	bioset_exit(&wc->bio_set);

	mempool_exit(&wc->copy_pool);

	if (wc->writeback_wq)
		destroy_workqueue(wc->writeback_wq);

	if (wc->dev)
		dm_put_device(ti, wc->dev);

	if (wc->ssd_dev)
		dm_put_device(ti, wc->ssd_dev);

	vfree(wc->entries);

	if (wc->memory_map) {
		if (WC_MODE_PMEM(wc))
			persistent_memory_release(wc);
		else
			vfree(wc->memory_map);
	}

	if (wc->dm_kcopyd)
		dm_kcopyd_client_destroy(wc->dm_kcopyd);

	if (wc->dm_io)
		dm_io_client_destroy(wc->dm_io);

	vfree(wc->dirty_bitmap);

	kfree(wc);
}

/*
 * Constructor.  Target arguments:
 *   <pmem|ssd mode: p|s> <origin dev> <cache dev> <block size> [<#feature args> <args...>]
 * (continues beyond this chunk)
 */
static int writecache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct dm_writecache *wc;
	struct dm_arg_set as;
	const char *string;
	unsigned int opt_params;
	size_t offset, data_size;
	int i, r;
	char dummy;
	int high_wm_percent = HIGH_WATERMARK;
	int low_wm_percent = LOW_WATERMARK;
	uint64_t x;
	struct wc_memory_superblock s;

	static struct dm_arg _args[] = {
		{0, 18, "Invalid number of feature args"},
	};

	as.argc = argc;
	as.argv = argv;

	wc = kzalloc_obj(struct dm_writecache);
	if (!wc) {
		ti->error = "Cannot allocate writecache structure";
		r = -ENOMEM;
		goto bad;
	}
	ti->private = wc;
	wc->ti = ti;
mutex_init(&wc->lock); 2250 wc->max_age = MAX_AGE_UNSPECIFIED; 2251 writecache_poison_lists(wc); 2252 init_waitqueue_head(&wc->freelist_wait); 2253 timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0); 2254 timer_setup(&wc->max_age_timer, writecache_max_age_timer, 0); 2255 2256 for (i = 0; i < 2; i++) { 2257 atomic_set(&wc->bio_in_progress[i], 0); 2258 init_waitqueue_head(&wc->bio_in_progress_wait[i]); 2259 } 2260 2261 wc->dm_io = dm_io_client_create(); 2262 if (IS_ERR(wc->dm_io)) { 2263 r = PTR_ERR(wc->dm_io); 2264 ti->error = "Unable to allocate dm-io client"; 2265 wc->dm_io = NULL; 2266 goto bad; 2267 } 2268 2269 wc->writeback_wq = alloc_workqueue("writecache-writeback", 2270 WQ_MEM_RECLAIM | WQ_PERCPU, 1); 2271 if (!wc->writeback_wq) { 2272 r = -ENOMEM; 2273 ti->error = "Could not allocate writeback workqueue"; 2274 goto bad; 2275 } 2276 INIT_WORK(&wc->writeback_work, writecache_writeback); 2277 INIT_WORK(&wc->flush_work, writecache_flush_work); 2278 2279 dm_iot_init(&wc->iot); 2280 2281 raw_spin_lock_init(&wc->endio_list_lock); 2282 INIT_LIST_HEAD(&wc->endio_list); 2283 wc->endio_thread = kthread_run(writecache_endio_thread, wc, "writecache_endio"); 2284 if (IS_ERR(wc->endio_thread)) { 2285 r = PTR_ERR(wc->endio_thread); 2286 wc->endio_thread = NULL; 2287 ti->error = "Couldn't spawn endio thread"; 2288 goto bad; 2289 } 2290 2291 /* 2292 * Parse the mode (pmem or ssd) 2293 */ 2294 string = dm_shift_arg(&as); 2295 if (!string) 2296 goto bad_arguments; 2297 2298 if (!strcasecmp(string, "s")) { 2299 wc->pmem_mode = false; 2300 } else if (!strcasecmp(string, "p")) { 2301 #ifdef DM_WRITECACHE_HAS_PMEM 2302 wc->pmem_mode = true; 2303 wc->writeback_fua = true; 2304 #else 2305 /* 2306 * If the architecture doesn't support persistent memory or 2307 * the kernel doesn't support any DAX drivers, this driver can 2308 * only be used in SSD-only mode. 
2309 */ 2310 r = -EOPNOTSUPP; 2311 ti->error = "Persistent memory or DAX not supported on this system"; 2312 goto bad; 2313 #endif 2314 } else { 2315 goto bad_arguments; 2316 } 2317 2318 if (WC_MODE_PMEM(wc)) { 2319 r = bioset_init(&wc->bio_set, BIO_POOL_SIZE, 2320 offsetof(struct writeback_struct, bio), 2321 BIOSET_NEED_BVECS); 2322 if (r) { 2323 ti->error = "Could not allocate bio set"; 2324 goto bad; 2325 } 2326 } else { 2327 wc->pause = PAUSE_WRITEBACK; 2328 r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct)); 2329 if (r) { 2330 ti->error = "Could not allocate mempool"; 2331 goto bad; 2332 } 2333 } 2334 2335 /* 2336 * Parse the origin data device 2337 */ 2338 string = dm_shift_arg(&as); 2339 if (!string) 2340 goto bad_arguments; 2341 r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->dev); 2342 if (r) { 2343 ti->error = "Origin data device lookup failed"; 2344 goto bad; 2345 } 2346 2347 /* 2348 * Parse cache data device (be it pmem or ssd) 2349 */ 2350 string = dm_shift_arg(&as); 2351 if (!string) 2352 goto bad_arguments; 2353 2354 r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->ssd_dev); 2355 if (r) { 2356 ti->error = "Cache data device lookup failed"; 2357 goto bad; 2358 } 2359 wc->memory_map_size = bdev_nr_bytes(wc->ssd_dev->bdev); 2360 2361 /* 2362 * Parse the cache block size 2363 */ 2364 string = dm_shift_arg(&as); 2365 if (!string) 2366 goto bad_arguments; 2367 if (sscanf(string, "%u%c", &wc->block_size, &dummy) != 1 || 2368 wc->block_size < 512 || wc->block_size > PAGE_SIZE || 2369 (wc->block_size & (wc->block_size - 1))) { 2370 r = -EINVAL; 2371 ti->error = "Invalid block size"; 2372 goto bad; 2373 } 2374 if (wc->block_size < bdev_logical_block_size(wc->dev->bdev) || 2375 wc->block_size < bdev_logical_block_size(wc->ssd_dev->bdev)) { 2376 r = -EINVAL; 2377 ti->error = "Block size is smaller than device logical block size"; 2378 goto bad; 2379 } 2380 wc->block_size_bits = __ffs(wc->block_size); 
2381 2382 wc->max_writeback_jobs = MAX_WRITEBACK_JOBS; 2383 wc->autocommit_blocks = !WC_MODE_PMEM(wc) ? AUTOCOMMIT_BLOCKS_SSD : AUTOCOMMIT_BLOCKS_PMEM; 2384 wc->autocommit_jiffies = msecs_to_jiffies(AUTOCOMMIT_MSEC); 2385 2386 /* 2387 * Parse optional arguments 2388 */ 2389 r = dm_read_arg_group(_args, &as, &opt_params, &ti->error); 2390 if (r) 2391 goto bad; 2392 2393 while (opt_params) { 2394 string = dm_shift_arg(&as), opt_params--; 2395 if (!strcasecmp(string, "start_sector") && opt_params >= 1) { 2396 unsigned long long start_sector; 2397 2398 string = dm_shift_arg(&as), opt_params--; 2399 if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1) 2400 goto invalid_optional; 2401 wc->start_sector = start_sector; 2402 wc->start_sector_set = true; 2403 if (wc->start_sector != start_sector || 2404 wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT) 2405 goto invalid_optional; 2406 } else if (!strcasecmp(string, "high_watermark") && opt_params >= 1) { 2407 string = dm_shift_arg(&as), opt_params--; 2408 if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1) 2409 goto invalid_optional; 2410 if (high_wm_percent < 0 || high_wm_percent > 100) 2411 goto invalid_optional; 2412 wc->high_wm_percent_value = high_wm_percent; 2413 wc->high_wm_percent_set = true; 2414 } else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) { 2415 string = dm_shift_arg(&as), opt_params--; 2416 if (sscanf(string, "%d%c", &low_wm_percent, &dummy) != 1) 2417 goto invalid_optional; 2418 if (low_wm_percent < 0 || low_wm_percent > 100) 2419 goto invalid_optional; 2420 wc->low_wm_percent_value = low_wm_percent; 2421 wc->low_wm_percent_set = true; 2422 } else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) { 2423 string = dm_shift_arg(&as), opt_params--; 2424 if (sscanf(string, "%u%c", &wc->max_writeback_jobs, &dummy) != 1) 2425 goto invalid_optional; 2426 wc->max_writeback_jobs_set = true; 2427 } else if (!strcasecmp(string, "autocommit_blocks") && opt_params >= 1) { 
2428 string = dm_shift_arg(&as), opt_params--; 2429 if (sscanf(string, "%u%c", &wc->autocommit_blocks, &dummy) != 1) 2430 goto invalid_optional; 2431 wc->autocommit_blocks_set = true; 2432 } else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) { 2433 unsigned int autocommit_msecs; 2434 2435 string = dm_shift_arg(&as), opt_params--; 2436 if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1) 2437 goto invalid_optional; 2438 if (autocommit_msecs > 3600000) 2439 goto invalid_optional; 2440 wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs); 2441 wc->autocommit_time_value = autocommit_msecs; 2442 wc->autocommit_time_set = true; 2443 } else if (!strcasecmp(string, "max_age") && opt_params >= 1) { 2444 unsigned int max_age_msecs; 2445 2446 string = dm_shift_arg(&as), opt_params--; 2447 if (sscanf(string, "%u%c", &max_age_msecs, &dummy) != 1) 2448 goto invalid_optional; 2449 if (max_age_msecs > 86400000) 2450 goto invalid_optional; 2451 wc->max_age = msecs_to_jiffies(max_age_msecs); 2452 wc->max_age_set = true; 2453 wc->max_age_value = max_age_msecs; 2454 } else if (!strcasecmp(string, "cleaner")) { 2455 wc->cleaner_set = true; 2456 wc->cleaner = true; 2457 } else if (!strcasecmp(string, "fua")) { 2458 if (WC_MODE_PMEM(wc)) { 2459 wc->writeback_fua = true; 2460 wc->writeback_fua_set = true; 2461 } else 2462 goto invalid_optional; 2463 } else if (!strcasecmp(string, "nofua")) { 2464 if (WC_MODE_PMEM(wc)) { 2465 wc->writeback_fua = false; 2466 wc->writeback_fua_set = true; 2467 } else 2468 goto invalid_optional; 2469 } else if (!strcasecmp(string, "metadata_only")) { 2470 wc->metadata_only = true; 2471 } else if (!strcasecmp(string, "pause_writeback") && opt_params >= 1) { 2472 unsigned int pause_msecs; 2473 2474 if (WC_MODE_PMEM(wc)) 2475 goto invalid_optional; 2476 string = dm_shift_arg(&as), opt_params--; 2477 if (sscanf(string, "%u%c", &pause_msecs, &dummy) != 1) 2478 goto invalid_optional; 2479 if (pause_msecs > 60000) 2480 goto 
invalid_optional; 2481 wc->pause = msecs_to_jiffies(pause_msecs); 2482 wc->pause_set = true; 2483 wc->pause_value = pause_msecs; 2484 } else { 2485 invalid_optional: 2486 r = -EINVAL; 2487 ti->error = "Invalid optional argument"; 2488 goto bad; 2489 } 2490 } 2491 2492 if (high_wm_percent < low_wm_percent) { 2493 r = -EINVAL; 2494 ti->error = "High watermark must be greater than or equal to low watermark"; 2495 goto bad; 2496 } 2497 2498 if (WC_MODE_PMEM(wc)) { 2499 if (!dax_synchronous(wc->ssd_dev->dax_dev)) { 2500 r = -EOPNOTSUPP; 2501 ti->error = "Asynchronous persistent memory not supported as pmem cache"; 2502 goto bad; 2503 } 2504 2505 r = persistent_memory_claim(wc); 2506 if (r) { 2507 ti->error = "Unable to map persistent memory for cache"; 2508 goto bad; 2509 } 2510 } else { 2511 size_t n_blocks, n_metadata_blocks; 2512 uint64_t n_bitmap_bits; 2513 2514 wc->memory_map_size -= (uint64_t)wc->start_sector << SECTOR_SHIFT; 2515 2516 bio_list_init(&wc->flush_list); 2517 wc->flush_thread = kthread_run(writecache_flush_thread, wc, "dm_writecache_flush"); 2518 if (IS_ERR(wc->flush_thread)) { 2519 r = PTR_ERR(wc->flush_thread); 2520 wc->flush_thread = NULL; 2521 ti->error = "Couldn't spawn flush thread"; 2522 goto bad; 2523 } 2524 2525 r = calculate_memory_size(wc->memory_map_size, wc->block_size, 2526 &n_blocks, &n_metadata_blocks); 2527 if (r) { 2528 ti->error = "Invalid device size"; 2529 goto bad; 2530 } 2531 2532 n_bitmap_bits = (((uint64_t)n_metadata_blocks << wc->block_size_bits) + 2533 BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY; 2534 /* this is limitation of test_bit functions */ 2535 if (n_bitmap_bits > 1U << 31) { 2536 r = -EFBIG; 2537 ti->error = "Invalid device size"; 2538 goto bad; 2539 } 2540 2541 wc->memory_map = vmalloc(n_metadata_blocks << wc->block_size_bits); 2542 if (!wc->memory_map) { 2543 r = -ENOMEM; 2544 ti->error = "Unable to allocate memory for metadata"; 2545 goto bad; 2546 } 2547 2548 wc->dm_kcopyd = 
dm_kcopyd_client_create(&dm_kcopyd_throttle); 2549 if (IS_ERR(wc->dm_kcopyd)) { 2550 r = PTR_ERR(wc->dm_kcopyd); 2551 ti->error = "Unable to allocate dm-kcopyd client"; 2552 wc->dm_kcopyd = NULL; 2553 goto bad; 2554 } 2555 2556 wc->metadata_sectors = n_metadata_blocks << (wc->block_size_bits - SECTOR_SHIFT); 2557 wc->dirty_bitmap_size = (n_bitmap_bits + BITS_PER_LONG - 1) / 2558 BITS_PER_LONG * sizeof(unsigned long); 2559 wc->dirty_bitmap = vzalloc(wc->dirty_bitmap_size); 2560 if (!wc->dirty_bitmap) { 2561 r = -ENOMEM; 2562 ti->error = "Unable to allocate dirty bitmap"; 2563 goto bad; 2564 } 2565 2566 r = writecache_read_metadata(wc, wc->block_size >> SECTOR_SHIFT); 2567 if (r) { 2568 ti->error = "Unable to read first block of metadata"; 2569 goto bad; 2570 } 2571 } 2572 2573 r = copy_mc_to_kernel(&s, sb(wc), sizeof(struct wc_memory_superblock)); 2574 if (r) { 2575 ti->error = "Hardware memory error when reading superblock"; 2576 goto bad; 2577 } 2578 if (!le32_to_cpu(s.magic) && !le32_to_cpu(s.version)) { 2579 r = init_memory(wc); 2580 if (r) { 2581 ti->error = "Unable to initialize device"; 2582 goto bad; 2583 } 2584 r = copy_mc_to_kernel(&s, sb(wc), 2585 sizeof(struct wc_memory_superblock)); 2586 if (r) { 2587 ti->error = "Hardware memory error when reading superblock"; 2588 goto bad; 2589 } 2590 } 2591 2592 if (le32_to_cpu(s.magic) != MEMORY_SUPERBLOCK_MAGIC) { 2593 ti->error = "Invalid magic in the superblock"; 2594 r = -EINVAL; 2595 goto bad; 2596 } 2597 2598 if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) { 2599 ti->error = "Invalid version in the superblock"; 2600 r = -EINVAL; 2601 goto bad; 2602 } 2603 2604 if (le32_to_cpu(s.block_size) != wc->block_size) { 2605 ti->error = "Block size does not match superblock"; 2606 r = -EINVAL; 2607 goto bad; 2608 } 2609 2610 wc->n_blocks = le64_to_cpu(s.n_blocks); 2611 2612 offset = wc->n_blocks * sizeof(struct wc_memory_entry); 2613 if (offset / sizeof(struct wc_memory_entry) != le64_to_cpu(sb(wc)->n_blocks)) 
{ 2614 overflow: 2615 ti->error = "Overflow in size calculation"; 2616 r = -EINVAL; 2617 goto bad; 2618 } 2619 offset += sizeof(struct wc_memory_superblock); 2620 if (offset < sizeof(struct wc_memory_superblock)) 2621 goto overflow; 2622 offset = (offset + wc->block_size - 1) & ~(size_t)(wc->block_size - 1); 2623 data_size = wc->n_blocks * (size_t)wc->block_size; 2624 if (!offset || (data_size / wc->block_size != wc->n_blocks) || 2625 (offset + data_size < offset)) 2626 goto overflow; 2627 if (offset + data_size > wc->memory_map_size) { 2628 ti->error = "Memory area is too small"; 2629 r = -EINVAL; 2630 goto bad; 2631 } 2632 2633 wc->metadata_sectors = offset >> SECTOR_SHIFT; 2634 wc->block_start = (char *)sb(wc) + offset; 2635 2636 x = (uint64_t)wc->n_blocks * (100 - high_wm_percent); 2637 x += 50; 2638 do_div(x, 100); 2639 wc->freelist_high_watermark = x; 2640 x = (uint64_t)wc->n_blocks * (100 - low_wm_percent); 2641 x += 50; 2642 do_div(x, 100); 2643 wc->freelist_low_watermark = x; 2644 2645 if (wc->cleaner) 2646 activate_cleaner(wc); 2647 2648 r = writecache_alloc_entries(wc); 2649 if (r) { 2650 ti->error = "Cannot allocate memory"; 2651 goto bad; 2652 } 2653 2654 ti->num_flush_bios = WC_MODE_PMEM(wc) ? 
1 : 2;
	ti->flush_supported = true;
	ti->num_discard_bios = 1;

	/*
	 * In pmem mode the cache contents live in the memory map; flush the
	 * CPU caches over the whole mapped range so everything is persistent
	 * before the target goes live.
	 */
	if (WC_MODE_PMEM(wc))
		persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);

	return 0;

bad_arguments:
	r = -EINVAL;
	ti->error = "Bad arguments";
bad:
	/* writecache_dtr() tears down everything allocated so far; the
	 * constructor NULLs each resource pointer on its failure path so
	 * the destructor can run safely on a partially built target. */
	writecache_dtr(ti);
	return r;
}

/*
 * writecache_status - report target status to device-mapper core.
 * @ti:           the writecache target instance
 * @type:         which status variant userspace asked for
 * @status_flags: unused here
 * @result:       output buffer (filled via DMEMIT)
 * @maxlen:       size of @result
 *
 * STATUSTYPE_INFO emits the error flag, block/freelist/writeback counters
 * and the per-operation statistics. STATUSTYPE_TABLE reconstructs the
 * constructor line: mode, devices, block size, then only those optional
 * arguments that were explicitly set (the *_set flags), preceded by their
 * count. STATUSTYPE_IMA emits nothing.
 */
static void writecache_status(struct dm_target *ti, status_type_t type,
			      unsigned int status_flags, char *result, unsigned int maxlen)
{
	struct dm_writecache *wc = ti->private;
	unsigned int extra_args;
	unsigned int sz = 0;	/* running offset into @result, used implicitly by DMEMIT */

	switch (type) {
	case STATUSTYPE_INFO:
		DMEMIT("%ld %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
		       writecache_has_error(wc),
		       (unsigned long long)wc->n_blocks, (unsigned long long)wc->freelist_size,
		       (unsigned long long)wc->writeback_size,
		       wc->stats.reads,
		       wc->stats.read_hits,
		       wc->stats.writes,
		       wc->stats.write_hits_uncommitted,
		       wc->stats.write_hits_committed,
		       wc->stats.writes_around,
		       wc->stats.writes_allocate,
		       wc->stats.writes_blocked_on_freelist,
		       wc->stats.flushes,
		       wc->stats.discards);
		break;
	case STATUSTYPE_TABLE:
		DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
		       wc->dev->name, wc->ssd_dev->name, wc->block_size);
		/*
		 * Count the optional arguments first: the table format is
		 * "<count> <args...>", so this tally must stay in lockstep
		 * with the DMEMIT calls below (2 per key+value option,
		 * 1 per bare flag).
		 */
		extra_args = 0;
		if (wc->start_sector_set)
			extra_args += 2;
		if (wc->high_wm_percent_set)
			extra_args += 2;
		if (wc->low_wm_percent_set)
			extra_args += 2;
		if (wc->max_writeback_jobs_set)
			extra_args += 2;
		if (wc->autocommit_blocks_set)
			extra_args += 2;
		if (wc->autocommit_time_set)
			extra_args += 2;
		if (wc->max_age_set)
			extra_args += 2;
		if (wc->cleaner_set)
			extra_args++;
		if (wc->writeback_fua_set)
			extra_args++;
		if (wc->metadata_only)
			extra_args++;
		if (wc->pause_set)
			extra_args += 2;

		DMEMIT("%u", extra_args);
		if (wc->start_sector_set)
			DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector);
		if (wc->high_wm_percent_set)
			DMEMIT(" high_watermark %u", wc->high_wm_percent_value);
		if (wc->low_wm_percent_set)
			DMEMIT(" low_watermark %u", wc->low_wm_percent_value);
		if (wc->max_writeback_jobs_set)
			DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs);
		if (wc->autocommit_blocks_set)
			DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
		if (wc->autocommit_time_set)
			/* report the user-supplied msec value, not the jiffies conversion */
			DMEMIT(" autocommit_time %u", wc->autocommit_time_value);
		if (wc->max_age_set)
			DMEMIT(" max_age %u", wc->max_age_value);
		if (wc->cleaner_set)
			DMEMIT(" cleaner");
		if (wc->writeback_fua_set)
			/* emits "fua" or "nofua" depending on the current setting */
			DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
		if (wc->metadata_only)
			DMEMIT(" metadata_only");
		if (wc->pause_set)
			DMEMIT(" pause_writeback %u", wc->pause_value);
		break;
	case STATUSTYPE_IMA:
		/* no IMA measurement data for this target */
		*result = '\0';
		break;
	}
}

/* device-mapper target registration table */
static struct target_type writecache_target = {
	.name = "writecache",
	.version = {1, 6, 0},
	.module = THIS_MODULE,
	.ctr = writecache_ctr,
	.dtr = writecache_dtr,
	.status = writecache_status,
	.postsuspend = writecache_suspend,
	.resume = writecache_resume,
	.message = writecache_message,
	.map = writecache_map,
	.end_io = writecache_end_io,
	.iterate_devices = writecache_iterate_devices,
	.io_hints = writecache_io_hints,
};
module_dm(writecache);

MODULE_DESCRIPTION(DM_NAME " writecache target");
MODULE_AUTHOR("Mikulas Patocka <dm-devel@lists.linux.dev>");
MODULE_LICENSE("GPL");