1 // SPDX-License-Identifier: GPL-2.0-or-later 2 3 #include <linux/blkdev.h> 4 #include <linux/module.h> 5 #include <linux/errno.h> 6 #include <linux/slab.h> 7 #include <linux/init.h> 8 #include <linux/timer.h> 9 #include <linux/sched.h> 10 #include <linux/list.h> 11 #include <linux/file.h> 12 #include <linux/seq_file.h> 13 #include <trace/events/block.h> 14 15 #include "md.h" 16 #include "md-bitmap.h" 17 18 /* 19 * #### Background 20 * 21 * Redundant data is used to enhance data fault tolerance, and the storage 22 * methods for redundant data vary depending on the RAID levels. And it's 23 * important to maintain the consistency of redundant data. 24 * 25 * Bitmap is used to record which data blocks have been synchronized and which 26 * ones need to be resynchronized or recovered. Each bit in the bitmap 27 * represents a segment of data in the array. When a bit is set, it indicates 28 * that the multiple redundant copies of that data segment may not be 29 * consistent. Data synchronization can be performed based on the bitmap after 30 * power failure or readding a disk. If there is no bitmap, a full disk 31 * synchronization is required. 32 * 33 * #### Key Features 34 * 35 * - IO fastpath is lockless, if user issues lots of write IO to the same 36 * bitmap bit in a short time, only the first write has additional overhead 37 * to update bitmap bit, no additional overhead for the following writes; 38 * - support only resync or recover written data, means in the case creating 39 * new array or replacing with a new disk, there is no need to do a full disk 40 * resync/recovery; 41 * 42 * #### Key Concept 43 * 44 * ##### State Machine 45 * 46 * Each bit is one byte, contain 6 different states, see llbitmap_state. 
And 47 * there are total 8 different actions, see llbitmap_action, can change state: 48 * 49 * llbitmap state machine: transitions between states 50 * 51 * | | Startwrite | Startsync | Endsync | Abortsync| 52 * | --------- | ---------- | --------- | ------- | ------- | 53 * | Unwritten | Dirty | x | x | x | 54 * | Clean | Dirty | x | x | x | 55 * | Dirty | x | x | x | x | 56 * | NeedSync | x | Syncing | x | x | 57 * | Syncing | x | Syncing | Dirty | NeedSync | 58 * 59 * | | Reload | Daemon | Discard | Stale | 60 * | --------- | -------- | ------ | --------- | --------- | 61 * | Unwritten | x | x | x | x | 62 * | Clean | x | x | Unwritten | NeedSync | 63 * | Dirty | NeedSync | Clean | Unwritten | NeedSync | 64 * | NeedSync | x | x | Unwritten | x | 65 * | Syncing | NeedSync | x | Unwritten | NeedSync | 66 * 67 * Typical scenarios: 68 * 69 * 1) Create new array 70 * All bits will be set to Unwritten by default, if --assume-clean is set, 71 * all bits will be set to Clean instead. 72 * 73 * 2) write data, raid1/raid10 have full copy of data, while raid456 doesn't and 74 * rely on xor data 75 * 76 * 2.1) write new data to raid1/raid10: 77 * Unwritten --StartWrite--> Dirty 78 * 79 * 2.2) write new data to raid456: 80 * Unwritten --StartWrite--> NeedSync 81 * 82 * Because the initial recover for raid456 is skipped, the xor data is not built 83 * yet, the bit must be set to NeedSync first and after lazy initial recover is 84 * finished, the bit will finally set to Dirty(see 5.1 and 5.4); 85 * 86 * 2.3) cover write 87 * Clean --StartWrite--> Dirty 88 * 89 * 3) daemon, if the array is not degraded: 90 * Dirty --Daemon--> Clean 91 * 92 * 4) discard 93 * {Clean, Dirty, NeedSync, Syncing} --Discard--> Unwritten 94 * 95 * 5) resync and recover 96 * 97 * 5.1) common process 98 * NeedSync --Startsync--> Syncing --Endsync--> Dirty --Daemon--> Clean 99 * 100 * 5.2) resync after power failure 101 * Dirty --Reload--> NeedSync 102 * 103 * 5.3) recover while replacing with a new disk 
104 * By default, the old bitmap framework will recover all data, and llbitmap 105 * implements this by a new helper, see llbitmap_skip_sync_blocks: 106 * 107 * skip recover for bits other than dirty or clean; 108 * 109 * 5.4) lazy initial recover for raid5: 110 * By default, the old bitmap framework will only allow new recover when there 111 * are spares(new disk), a new recovery flag MD_RECOVERY_LAZY_RECOVER is added 112 * to perform raid456 lazy recover for set bits(from 2.2). 113 * 114 * 6. special handling for degraded array: 115 * 116 * - Dirty bits will never be cleared, daemon will just do nothing, so that if 117 * a disk is readded, Clean bits can be skipped with recovery; 118 * - Dirty bits will convert to Syncing from start write, to do data recovery 119 * for new added disks; 120 * - New write will convert bits to NeedSync directly; 121 * 122 * ##### Bitmap IO 123 * 124 * ##### Chunksize 125 * 126 * The default bitmap size is 128k, incluing 1k bitmap super block, and 127 * the default size of segment of data in the array each bit(chunksize) is 64k, 128 * and chunksize will adjust to twice the old size each time if the total number 129 * bits is not less than 127k.(see llbitmap_init) 130 * 131 * ##### READ 132 * 133 * While creating bitmap, all pages will be allocated and read for llbitmap, 134 * there won't be read afterwards 135 * 136 * ##### WRITE 137 * 138 * WRITE IO is divided into logical_block_size of the array, the dirty state 139 * of each block is tracked independently, for example: 140 * 141 * each page is 4k, contain 8 blocks; each block is 512 bytes contain 512 bit; 142 * 143 * | page0 | page1 | ... | page 31 | 144 * | | 145 * | \-----------------------\ 146 * | | 147 * | block0 | block1 | ... | block 8| 148 * | | 149 * | \-----------------\ 150 * | | 151 * | bit0 | bit1 | ... 
| bit511 | 152 * 153 * From IO path, if one bit is changed to Dirty or NeedSync, the corresponding 154 * subpage will be marked dirty, such block must write first before the IO is 155 * issued. This behaviour will affect IO performance, to reduce the impact, if 156 * multiple bits are changed in the same block in a short time, all bits in this 157 * block will be changed to Dirty/NeedSync, so that there won't be any overhead 158 * until daemon clears dirty bits. 159 * 160 * ##### Dirty Bits synchronization 161 * 162 * IO fast path will set bits to dirty, and those dirty bits will be cleared 163 * by daemon after IO is done. llbitmap_page_ctl is used to synchronize between 164 * IO path and daemon; 165 * 166 * IO path: 167 * 1) try to grab a reference, if succeed, set expire time after 5s and return; 168 * 2) if failed to grab a reference, wait for daemon to finish clearing dirty 169 * bits; 170 * 171 * Daemon (Daemon will be woken up every daemon_sleep seconds): 172 * For each page: 173 * 1) check if page expired, if not skip this page; for expired page: 174 * 2) suspend the page and wait for inflight write IO to be done; 175 * 3) change dirty page to clean; 176 * 4) resume the page; 177 */ 178 179 #define BITMAP_DATA_OFFSET 1024 180 181 /* 64k is the max IO size of sync IO for raid1/raid10 */ 182 #define MIN_CHUNK_SIZE (64 * 2) 183 184 /* By default, daemon will be woken up every 30s */ 185 #define DEFAULT_DAEMON_SLEEP 30 186 187 /* 188 * Dirtied bits that have not been accessed for more than 5s will be cleared 189 * by daemon. 
 */
#define DEFAULT_BARRIER_IDLE 5

/*
 * Per-bit state. Each bit occupies one byte in the on-disk bitmap data,
 * see the state machine table and the design notes at the top of this file.
 */
enum llbitmap_state {
	/* No valid data, init state after assemble the array */
	BitUnwritten = 0,
	/* data is consistent */
	BitClean,
	/* data will be consistent after IO is done, set directly for writes */
	BitDirty,
	/*
	 * data need to be resynchronized:
	 * 1) set directly for writes if array is degraded, prevent full disk
	 * synchronization after readding a disk;
	 * 2) reassemble the array after power failure, and dirty bits are
	 * found after reloading the bitmap;
	 * 3) set for first write for raid5, to build initial xor data lazily
	 */
	BitNeedSync,
	/* data is synchronizing */
	BitSyncing,
	/*
	 * Proactive sync requested for unwritten region (raid456 only).
	 * Triggered via sysfs when user wants to pre-build XOR parity
	 * for regions that have never been written.
	 */
	BitNeedSyncUnwritten,
	/* Proactive sync in progress for unwritten region */
	BitSyncingUnwritten,
	/*
	 * XOR parity has been pre-built for a region that has never had
	 * user data written. When user writes to this region, it transitions
	 * to BitDirty.
	 */
	BitCleanUnwritten,
	/* number of valid states, used to size the state machine table */
	BitStateCount,
	/* sentinel: "no state transition", never stored in the bitmap */
	BitNone = 0xff,
};

/*
 * Events that drive state transitions, see state_machine[][] below for the
 * full transition table.
 */
enum llbitmap_action {
	/* User write new data, this is the only action from IO fast path */
	BitmapActionStartwrite = 0,
	/* Start recovery */
	BitmapActionStartsync,
	/* Finish recovery */
	BitmapActionEndsync,
	/* Failed recovery */
	BitmapActionAbortsync,
	/* Reassemble the array */
	BitmapActionReload,
	/* Daemon thread is trying to clear dirty bits */
	BitmapActionDaemon,
	/* Data is deleted */
	BitmapActionDiscard,
	/*
	 * Bitmap is stale, mark all bits in addition to BitUnwritten to
	 * BitNeedSync.
	 */
	BitmapActionStale,
	/*
	 * Proactive sync trigger for raid456 - builds XOR parity for
	 * Unwritten regions without requiring user data write first.
	 */
	BitmapActionProactiveSync,
	/* Drop a pre-built (CleanUnwritten) region back to Unwritten */
	BitmapActionClearUnwritten,
	/* number of table-driven actions */
	BitmapActionCount,
	/* Init state is BitUnwritten */
	BitmapActionInit,
};

/* Per-page flag bits stored in llbitmap_page_ctl->flags */
enum llbitmap_page_state {
	/* force flush of this page regardless of expire time */
	LLPageFlush = 0,
	/* page contains dirty bits not yet written back */
	LLPageDirty,
};

/* Runtime control structure for one cached bitmap page */
struct llbitmap_page_ctl {
	/* kmapped address of @page: one state byte per bit */
	char *state;
	struct page *page;
	/* jiffies after which daemon may clear this page's dirty bits */
	unsigned long expire;
	/* see enum llbitmap_page_state */
	unsigned long flags;
	/* waiters for the percpu_ref barrier below */
	wait_queue_head_t wait;
	/* IO-path reference; killed by daemon to suspend the page */
	struct percpu_ref active;
	/* Per block size dirty state, maximum 64k page / 1 sector = 128 */
	unsigned long dirty[];
};

/* Top-level lockless bitmap instance, hung off mddev->bitmap */
struct llbitmap {
	struct mddev *mddev;
	struct llbitmap_page_ctl **pctl;

	unsigned int nr_pages;
	/* logical block size of the array; granularity of dirty tracking */
	unsigned int io_size;
	unsigned int blocks_per_page;

	/* shift of one chunk */
	unsigned long chunkshift;
	/* size of one chunk in sector */
	unsigned long chunksize;
	/* total number of chunks */
	unsigned long chunks;
	unsigned long last_end_sync;
	/*
	 * time in seconds that dirty bits will be cleared if the page is not
	 * accessed.
	 */
	unsigned long barrier_idle;
	/* fires on first BitDirty state */
	struct timer_list pending_timer;
	struct work_struct daemon_work;

	unsigned long flags;
	__u64 events_cleared;

	/* for slow disks */
	atomic_t behind_writes;
	wait_queue_head_t behind_wait;
};

/* Context for flushing dirty pages from an unplug callback */
struct llbitmap_unplug_work {
	struct work_struct work;
	struct llbitmap *llbitmap;
	struct completion *done;
};

static struct workqueue_struct *md_llbitmap_io_wq;
static struct workqueue_struct *md_llbitmap_unplug_wq;

/*
 * Transition table: state_machine[current_state][action] yields the new
 * state, or BitNone when the action does not change this state. Mirrors
 * the tables in the header comment of this file.
 */
static char state_machine[BitStateCount][BitmapActionCount] = {
	[BitUnwritten] = {
		[BitmapActionStartwrite] = BitDirty,
		[BitmapActionStartsync] = BitNone,
		[BitmapActionEndsync] = BitNone,
		[BitmapActionAbortsync] = BitNone,
		[BitmapActionReload] = BitNone,
		[BitmapActionDaemon] = BitNone,
		[BitmapActionDiscard] = BitNone,
		[BitmapActionStale] = BitNone,
		[BitmapActionProactiveSync] = BitNeedSyncUnwritten,
		[BitmapActionClearUnwritten] = BitNone,
	},
	[BitClean] = {
		[BitmapActionStartwrite] = BitDirty,
		[BitmapActionStartsync] = BitNone,
		[BitmapActionEndsync] = BitNone,
		[BitmapActionAbortsync] = BitNone,
		[BitmapActionReload] = BitNone,
		[BitmapActionDaemon] = BitNone,
		[BitmapActionDiscard] = BitUnwritten,
		[BitmapActionStale] = BitNeedSync,
		[BitmapActionProactiveSync] = BitNone,
		[BitmapActionClearUnwritten] = BitNone,
	},
	[BitDirty] = {
		[BitmapActionStartwrite] = BitNone,
		[BitmapActionStartsync] = BitNone,
		[BitmapActionEndsync] = BitNone,
		[BitmapActionAbortsync] = BitNone,
		[BitmapActionReload] = BitNeedSync,
		[BitmapActionDaemon] = BitClean,
		[BitmapActionDiscard] = BitUnwritten,
		[BitmapActionStale] = BitNeedSync,
		[BitmapActionProactiveSync] = BitNone,
		[BitmapActionClearUnwritten] = BitNone,
	},
	[BitNeedSync] = {
		[BitmapActionStartwrite] = BitNone,
		[BitmapActionStartsync] = BitSyncing,
		[BitmapActionEndsync] = BitNone,
		[BitmapActionAbortsync] = BitNone,
		[BitmapActionReload] = BitNone,
		[BitmapActionDaemon] = BitNone,
		[BitmapActionDiscard] = BitUnwritten,
		[BitmapActionStale] = BitNone,
		[BitmapActionProactiveSync] = BitNone,
		[BitmapActionClearUnwritten] = BitNone,
	},
	[BitSyncing] = {
		[BitmapActionStartwrite] = BitNone,
		[BitmapActionStartsync] = BitSyncing,
		[BitmapActionEndsync] = BitDirty,
		[BitmapActionAbortsync] = BitNeedSync,
		[BitmapActionReload] = BitNeedSync,
		[BitmapActionDaemon] = BitNone,
		[BitmapActionDiscard] = BitUnwritten,
		[BitmapActionStale] = BitNeedSync,
		[BitmapActionProactiveSync] = BitNone,
		[BitmapActionClearUnwritten] = BitNone,
	},
	[BitNeedSyncUnwritten] = {
		/* user data arrived before proactive sync started */
		[BitmapActionStartwrite] = BitNeedSync,
		[BitmapActionStartsync] = BitSyncingUnwritten,
		[BitmapActionEndsync] = BitNone,
		[BitmapActionAbortsync] = BitUnwritten,
		[BitmapActionReload] = BitUnwritten,
		[BitmapActionDaemon] = BitNone,
		[BitmapActionDiscard] = BitUnwritten,
		[BitmapActionStale] = BitUnwritten,
		[BitmapActionProactiveSync] = BitNone,
		[BitmapActionClearUnwritten] = BitUnwritten,
	},
	[BitSyncingUnwritten] = {
		/* user data arrived mid proactive sync: treat as real sync */
		[BitmapActionStartwrite] = BitSyncing,
		[BitmapActionStartsync] = BitSyncingUnwritten,
		[BitmapActionEndsync] = BitCleanUnwritten,
		[BitmapActionAbortsync] = BitUnwritten,
		[BitmapActionReload] = BitUnwritten,
		[BitmapActionDaemon] = BitNone,
		[BitmapActionDiscard] = BitUnwritten,
		[BitmapActionStale] = BitUnwritten,
		[BitmapActionProactiveSync] = BitNone,
		[BitmapActionClearUnwritten] = BitUnwritten,
	},
	[BitCleanUnwritten] = {
		/* parity already pre-built, first write goes straight to Dirty */
		[BitmapActionStartwrite] = BitDirty,
		[BitmapActionStartsync] = BitNone,
		[BitmapActionEndsync] = BitNone,
		[BitmapActionAbortsync] = BitNone,
		[BitmapActionReload] = BitNone,
		[BitmapActionDaemon] = BitNone,
		[BitmapActionDiscard] = BitUnwritten,
		[BitmapActionStale] = BitUnwritten,
		[BitmapActionProactiveSync] = BitNone,
		[BitmapActionClearUnwritten] = BitUnwritten,
	},
};

static void __llbitmap_flush(struct mddev *mddev);

/*
 * Read the state byte of chunk @pos. Bitmap data starts after the 1k
 * super block (BITMAP_DATA_OFFSET), one byte per chunk.
 */
static enum llbitmap_state llbitmap_read(struct llbitmap *llbitmap, loff_t pos)
{
	unsigned int idx;
	unsigned int offset;

	pos += BITMAP_DATA_OFFSET;
	idx = pos >> PAGE_SHIFT;
	offset = offset_in_page(pos);

	return llbitmap->pctl[idx]->state[offset];
}

/* set all the bits in the subpage as dirty */
static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap,
				       struct llbitmap_page_ctl *pctl,
				       unsigned int block)
{
	bool level_456 = raid_is_456(llbitmap->mddev);
	unsigned int io_size = llbitmap->io_size;
	int pos;

	for (pos = block * io_size; pos < (block + 1) * io_size; pos++) {
		switch (pctl->state[pos]) {
		case BitUnwritten:
			/* raid456 has no xor data yet, must resync first */
			pctl->state[pos] = level_456 ? BitNeedSync : BitDirty;
			break;
		case BitClean:
		case BitCleanUnwritten:
			pctl->state[pos] = BitDirty;
			break;
		}
	}
}

/*
 * Mark the io_size-sized block containing @offset of page @idx as dirty so
 * llbitmap_write_page() will write it back. With @infect, repeated writes
 * to the same block dirty the whole block at once (see comment below).
 */
static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx,
				    int offset, bool infect)
{
	struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];
	unsigned int io_size = llbitmap->io_size;
	int block = offset / io_size;
	int pos;

	if (!test_bit(LLPageDirty, &pctl->flags))
		set_bit(LLPageDirty, &pctl->flags);

	/*
	 * For degraded array, dirty bits will never be cleared, and we must
	 * resync all the dirty bits, hence skip infect new dirty bits to
	 * prevent resync unnecessary data.
	 */
	if (llbitmap->mddev->degraded || !infect) {
		set_bit(block, pctl->dirty);
		return;
	}

	/*
	 * The subpage usually contains a total of 512 bits. If any single bit
	 * within the subpage is marked as dirty, the entire sector will be
	 * written.
	 * To avoid impacting write performance, when multiple bits
	 * within the same sector are modified within llbitmap->barrier_idle,
	 * all bits in the sector will be collectively marked as dirty at once.
	 */
	if (test_and_set_bit(block, pctl->dirty)) {
		llbitmap_infect_dirty_bits(llbitmap, pctl, block);
		return;
	}

	for (pos = block * io_size; pos < (block + 1) * io_size; pos++) {
		if (pos == offset)
			continue;
		if (pctl->state[pos] == BitDirty ||
		    pctl->state[pos] == BitNeedSync) {
			llbitmap_infect_dirty_bits(llbitmap, pctl, block);
			return;
		}
	}
}

/*
 * Store @state for chunk @pos and mark the containing block dirty for the
 * states that must reach disk before user IO is issued.
 */
static void llbitmap_write(struct llbitmap *llbitmap, enum llbitmap_state state,
			   loff_t pos)
{
	unsigned int idx;
	unsigned int bit;

	pos += BITMAP_DATA_OFFSET;
	idx = pos >> PAGE_SHIFT;
	bit = offset_in_page(pos);

	llbitmap->pctl[idx]->state[bit] = state;
	if (state == BitDirty || state == BitNeedSync)
		llbitmap_set_page_dirty(llbitmap, idx, bit, true);
	else if (state == BitNeedSyncUnwritten)
		/* proactive sync: persist the state but don't infect peers */
		llbitmap_set_page_dirty(llbitmap, idx, bit, false);
}

/*
 * Return the cached page @idx, or read it from the first healthy in-sync
 * rdev. Failing members get md_error(); -EIO if no member could serve it.
 * Note: the caller owns storing the page into pctl on the alloc path.
 */
static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx)
{
	struct mddev *mddev = llbitmap->mddev;
	struct page *page = NULL;
	struct md_rdev *rdev;

	if (llbitmap->pctl && llbitmap->pctl[idx])
		page = llbitmap->pctl[idx]->page;
	if (page)
		return page;

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page)
		return ERR_PTR(-ENOMEM);

	rdev_for_each(rdev, mddev) {
		sector_t sector;

		if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags) ||
		    !test_bit(In_sync, &rdev->flags))
			continue;

		sector = mddev->bitmap_info.offset +
			 (idx << PAGE_SECTORS_SHIFT);

		/* metadata read: sync_page_io() offsets by rdev->sb_start */
		if (sync_page_io(rdev, sector, PAGE_SIZE, page, REQ_OP_READ,
				 true))
			return page;

		md_error(mddev, rdev);
	}

	__free_page(page);
	return ERR_PTR(-EIO);
}

/*
 * Write back every dirty io_size block of page @idx to all active rdevs.
 * Dirty block bits are consumed (test_and_clear) as they are written.
 */
static void llbitmap_write_page(struct llbitmap *llbitmap, int idx)
{
	struct page *page = llbitmap->pctl[idx]->page;
	struct mddev *mddev = llbitmap->mddev;
	struct md_rdev *rdev;
	int block;

	for (block = 0; block < llbitmap->blocks_per_page; block++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];

		if (!test_and_clear_bit(block, pctl->dirty))
			continue;

		rdev_for_each(rdev, mddev) {
			sector_t sector;
			sector_t bit_sector = llbitmap->io_size >> SECTOR_SHIFT;

			if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
				continue;

			sector = mddev->bitmap_info.offset + rdev->sb_start +
				 (idx << PAGE_SECTORS_SHIFT) +
				 block * bit_sector;
			md_write_metadata(mddev, rdev, sector,
					  llbitmap->io_size, page,
					  block * llbitmap->io_size);
		}
	}
}

/* percpu_ref release callback: wake daemon waiting in suspend_timeout */
static void active_release(struct percpu_ref *ref)
{
	struct llbitmap_page_ctl *pctl =
		container_of(ref, struct llbitmap_page_ctl, active);

	wake_up(&pctl->wait);
}

/* Tear down everything llbitmap_cache_pages() built */
static void llbitmap_free_pages(struct llbitmap *llbitmap)
{
	int i;

	if (!llbitmap->pctl)
		return;

	for (i = 0; i < llbitmap->nr_pages; i++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];

		/* pages are populated in order; first hole means we're done */
		if (!pctl || !pctl->page)
			break;

		__free_page(pctl->page);
		percpu_ref_exit(&pctl->active);
	}

	/* all pctl structs come from one allocation anchored at [0] */
	kfree(llbitmap->pctl[0]);
	kfree(llbitmap->pctl);
	llbitmap->pctl = NULL;
}

/*
 * Allocate the pctl array plus one cache-line-aligned llbitmap_page_ctl per
 * bitmap page, and read all bitmap pages from disk up front (there are no
 * reads afterwards, see the "READ" section of the file header).
 */
static int llbitmap_cache_pages(struct llbitmap *llbitmap)
{
	struct llbitmap_page_ctl *pctl;
	unsigned int nr_pages = DIV_ROUND_UP(llbitmap->chunks +
					     BITMAP_DATA_OFFSET, PAGE_SIZE);
	unsigned int size = struct_size(pctl, dirty, BITS_TO_LONGS(
						llbitmap->blocks_per_page));
	int i;

	llbitmap->pctl = kmalloc_array(nr_pages, sizeof(void *),
				       GFP_KERNEL | __GFP_ZERO);
	if (!llbitmap->pctl)
		return -ENOMEM;

	size = round_up(size, cache_line_size());
	pctl =
kmalloc_array(nr_pages, size, GFP_KERNEL | __GFP_ZERO); 625 if (!pctl) { 626 kfree(llbitmap->pctl); 627 return -ENOMEM; 628 } 629 630 llbitmap->nr_pages = nr_pages; 631 632 for (i = 0; i < nr_pages; i++, pctl = (void *)pctl + size) { 633 struct page *page = llbitmap_read_page(llbitmap, i); 634 635 llbitmap->pctl[i] = pctl; 636 637 if (IS_ERR(page)) { 638 llbitmap_free_pages(llbitmap); 639 return PTR_ERR(page); 640 } 641 642 if (percpu_ref_init(&pctl->active, active_release, 643 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { 644 __free_page(page); 645 llbitmap_free_pages(llbitmap); 646 return -ENOMEM; 647 } 648 649 pctl->page = page; 650 pctl->state = page_address(page); 651 init_waitqueue_head(&pctl->wait); 652 } 653 654 return 0; 655 } 656 657 /* 658 * Check if all underlying disks support write_zeroes with unmap. 659 */ 660 static bool llbitmap_all_disks_support_wzeroes_unmap(struct llbitmap *llbitmap) 661 { 662 struct mddev *mddev = llbitmap->mddev; 663 struct md_rdev *rdev; 664 665 rdev_for_each(rdev, mddev) { 666 if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) 667 continue; 668 669 if (bdev_write_zeroes_unmap_sectors(rdev->bdev) == 0) 670 return false; 671 } 672 673 return true; 674 } 675 676 /* 677 * Issue write_zeroes to all underlying disks to zero their data regions. 678 * This ensures parity consistency for RAID-456 (0 XOR 0 = 0). 679 * Returns true if all disks were successfully zeroed. 
 */
static bool llbitmap_zero_all_disks(struct llbitmap *llbitmap)
{
	struct mddev *mddev = llbitmap->mddev;
	struct md_rdev *rdev;
	sector_t dev_sectors = mddev->dev_sectors;
	int ret;

	rdev_for_each(rdev, mddev) {
		if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
			continue;

		ret = blkdev_issue_zeroout(rdev->bdev,
					   rdev->data_offset,
					   dev_sectors,
					   GFP_KERNEL, 0);
		if (ret) {
			pr_warn("md/llbitmap: failed to zero disk %pg: %d\n",
				rdev->bdev, ret);
			return false;
		}
	}

	return true;
}

/*
 * Choose and write the initial state of every chunk when the bitmap is
 * first created: BitClean with --assume-clean, BitCleanUnwritten when all
 * raid456 members could be zeroed up front, BitUnwritten otherwise.
 */
static void llbitmap_init_state(struct llbitmap *llbitmap)
{
	struct mddev *mddev = llbitmap->mddev;
	enum llbitmap_state state = BitUnwritten;
	unsigned long i;

	if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags)) {
		state = BitClean;
	} else if (raid_is_456(mddev) &&
		   llbitmap_all_disks_support_wzeroes_unmap(llbitmap)) {
		/*
		 * All disks support write_zeroes with unmap. Zero all disks
		 * to ensure parity consistency, then set BitCleanUnwritten
		 * to skip initial sync.
		 */
		if (llbitmap_zero_all_disks(llbitmap))
			state = BitCleanUnwritten;
	}

	for (i = 0; i < llbitmap->chunks; i++)
		llbitmap_write(llbitmap, state, i);
}

/* The return value is only used from resync, where @start == @end.
 */
static enum llbitmap_state llbitmap_state_machine(struct llbitmap *llbitmap,
						  unsigned long start,
						  unsigned long end,
						  enum llbitmap_action action)
{
	struct mddev *mddev = llbitmap->mddev;
	enum llbitmap_state state = BitNone;
	bool level_456 = raid_is_456(llbitmap->mddev);
	bool need_resync = false;
	bool need_recovery = false;

	if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags))
		return BitNone;

	if (action == BitmapActionInit) {
		llbitmap_init_state(llbitmap);
		return BitNone;
	}

	while (start <= end) {
		enum llbitmap_state c = llbitmap_read(llbitmap, start);

		/*
		 * Corrupt state byte (e.g. BitNone 0xff reads back as <0 or
		 * >= BitStateCount depending on char signedness): force the
		 * chunk to resync rather than trusting it.
		 */
		if (c < 0 || c >= BitStateCount) {
			pr_err("%s: invalid bit %lu state %d action %d, forcing resync\n",
			       __func__, start, c, action);
			state = BitNeedSync;
			goto write_bitmap;
		}

		if (c == BitNeedSync || c == BitNeedSyncUnwritten)
			need_resync = !mddev->degraded;

		state = state_machine[c][action];

write_bitmap:
		if (unlikely(mddev->degraded)) {
			/* For degraded array, mark new data as need sync. */
			if (state == BitDirty &&
			    action == BitmapActionStartwrite)
				state = BitNeedSync;
			/*
			 * For degraded array, resync dirty data as well, noted
			 * if array is still degraded after resync is done, all
			 * new data will still be dirty until array is clean.
			 */
			else if (c == BitDirty &&
				 action == BitmapActionStartsync)
				state = BitSyncing;
		} else if (c == BitUnwritten && state == BitDirty &&
			   action == BitmapActionStartwrite && level_456) {
			/* Delay raid456 initial recovery to first write. */
			state = BitNeedSync;
		}

		if (state == BitNone) {
			start++;
			continue;
		}

		llbitmap_write(llbitmap, state, start);
		if (state == BitNeedSync || state == BitNeedSyncUnwritten)
			need_resync = !mddev->degraded;
		else if (state == BitDirty &&
			 !timer_pending(&llbitmap->pending_timer))
			/* arm daemon to clear this dirty bit later */
			mod_timer(&llbitmap->pending_timer,
				  jiffies + mddev->bitmap_info.daemon_sleep * HZ);

		start++;
	}

	/* raid456 uses lazy recovery instead of a plain resync (see 5.4) */
	if (need_resync && level_456)
		need_recovery = true;

	if (need_recovery) {
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		set_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	} else if (need_resync) {
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	}

	return state;
}

/*
 * IO fastpath barrier: take a reference on the page so the daemon cannot
 * clear its dirty bits underneath us; refresh the idle-expiry stamp. If the
 * daemon is mid-clear (ref dying), wait for it to resume the page.
 */
static void llbitmap_raise_barrier(struct llbitmap *llbitmap, int page_idx)
{
	struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];

retry:
	if (likely(percpu_ref_tryget_live(&pctl->active))) {
		WRITE_ONCE(pctl->expire, jiffies + llbitmap->barrier_idle * HZ);
		return;
	}

	wait_event(pctl->wait, !percpu_ref_is_dying(&pctl->active));
	goto retry;
}

static void llbitmap_release_barrier(struct llbitmap *llbitmap, int page_idx)
{
	struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];

	percpu_ref_put(&pctl->active);
}

/*
 * Daemon side: kill the ref and wait (up to daemon_sleep seconds) for
 * in-flight IO holding barriers to drain. On timeout the ref is revived
 * and -ETIMEDOUT returned, the caller skips this page.
 */
static int llbitmap_suspend_timeout(struct llbitmap *llbitmap, int page_idx)
{
	struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];

	percpu_ref_kill(&pctl->active);

	if (!wait_event_timeout(pctl->wait, percpu_ref_is_zero(&pctl->active),
				llbitmap->mddev->bitmap_info.daemon_sleep * HZ)) {
		percpu_ref_resurrect(&pctl->active);
		return -ETIMEDOUT;
	}

	return 0;
}

/* Re-enable IO on a page the daemon suspended */
static void llbitmap_resume(struct llbitmap *llbitmap, int page_idx)
{
	struct
llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx]; 854 855 pctl->expire = LONG_MAX; 856 percpu_ref_resurrect(&pctl->active); 857 wake_up(&pctl->wait); 858 } 859 860 static int llbitmap_check_support(struct mddev *mddev) 861 { 862 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 863 pr_notice("md/llbitmap: %s: array with journal cannot have bitmap\n", 864 mdname(mddev)); 865 return -EBUSY; 866 } 867 868 if (mddev->bitmap_info.space == 0) { 869 if (mddev->bitmap_info.default_space == 0) { 870 pr_notice("md/llbitmap: %s: no space for bitmap\n", 871 mdname(mddev)); 872 return -ENOSPC; 873 } 874 } 875 876 if (!mddev->persistent) { 877 pr_notice("md/llbitmap: %s: array must be persistent\n", 878 mdname(mddev)); 879 return -EOPNOTSUPP; 880 } 881 882 if (mddev->bitmap_info.file) { 883 pr_notice("md/llbitmap: %s: doesn't support bitmap file\n", 884 mdname(mddev)); 885 return -EOPNOTSUPP; 886 } 887 888 if (mddev->bitmap_info.external) { 889 pr_notice("md/llbitmap: %s: doesn't support external metadata\n", 890 mdname(mddev)); 891 return -EOPNOTSUPP; 892 } 893 894 if (mddev_is_dm(mddev)) { 895 pr_notice("md/llbitmap: %s: doesn't support dm-raid\n", 896 mdname(mddev)); 897 return -EOPNOTSUPP; 898 } 899 900 return 0; 901 } 902 903 static int llbitmap_init(struct llbitmap *llbitmap) 904 { 905 struct mddev *mddev = llbitmap->mddev; 906 sector_t blocks = mddev->resync_max_sectors; 907 unsigned long chunksize = MIN_CHUNK_SIZE; 908 unsigned long chunks = DIV_ROUND_UP(blocks, chunksize); 909 unsigned long space = mddev->bitmap_info.space << SECTOR_SHIFT; 910 int ret; 911 912 while (chunks > space) { 913 chunksize = chunksize << 1; 914 chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize); 915 } 916 917 llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE; 918 llbitmap->chunkshift = ffz(~chunksize); 919 llbitmap->chunksize = chunksize; 920 llbitmap->chunks = chunks; 921 mddev->bitmap_info.daemon_sleep = DEFAULT_DAEMON_SLEEP; 922 923 ret = llbitmap_cache_pages(llbitmap); 924 if (ret) 925 return 
ret; 926 927 llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, 928 BitmapActionInit); 929 /* flush initial llbitmap to disk */ 930 __llbitmap_flush(mddev); 931 932 return 0; 933 } 934 935 static int llbitmap_read_sb(struct llbitmap *llbitmap) 936 { 937 struct mddev *mddev = llbitmap->mddev; 938 unsigned long daemon_sleep; 939 unsigned long chunksize; 940 unsigned long events; 941 struct page *sb_page; 942 bitmap_super_t *sb; 943 int ret = -EINVAL; 944 945 if (!mddev->bitmap_info.offset) { 946 pr_err("md/llbitmap: %s: no super block found", mdname(mddev)); 947 return -EINVAL; 948 } 949 950 sb_page = llbitmap_read_page(llbitmap, 0); 951 if (IS_ERR(sb_page)) { 952 pr_err("md/llbitmap: %s: read super block failed", 953 mdname(mddev)); 954 return -EIO; 955 } 956 957 sb = kmap_local_page(sb_page); 958 if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) { 959 pr_err("md/llbitmap: %s: invalid super block magic number", 960 mdname(mddev)); 961 goto out_put_page; 962 } 963 964 if (sb->version != cpu_to_le32(BITMAP_MAJOR_LOCKLESS)) { 965 pr_err("md/llbitmap: %s: invalid super block version", 966 mdname(mddev)); 967 goto out_put_page; 968 } 969 970 if (memcmp(sb->uuid, mddev->uuid, 16)) { 971 pr_err("md/llbitmap: %s: bitmap superblock UUID mismatch\n", 972 mdname(mddev)); 973 goto out_put_page; 974 } 975 976 if (mddev->bitmap_info.space == 0) { 977 int room = le32_to_cpu(sb->sectors_reserved); 978 979 if (room) 980 mddev->bitmap_info.space = room; 981 else 982 mddev->bitmap_info.space = mddev->bitmap_info.default_space; 983 } 984 llbitmap->flags = le32_to_cpu(sb->state); 985 if (test_and_clear_bit(BITMAP_FIRST_USE, &llbitmap->flags)) { 986 ret = llbitmap_init(llbitmap); 987 goto out_put_page; 988 } 989 990 chunksize = le32_to_cpu(sb->chunksize); 991 if (!is_power_of_2(chunksize)) { 992 pr_err("md/llbitmap: %s: chunksize not a power of 2", 993 mdname(mddev)); 994 goto out_put_page; 995 } 996 997 if (chunksize < DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors, 998 
mddev->bitmap_info.space << SECTOR_SHIFT)) { 999 pr_err("md/llbitmap: %s: chunksize too small %lu < %llu / %lu", 1000 mdname(mddev), chunksize, mddev->resync_max_sectors, 1001 mddev->bitmap_info.space); 1002 goto out_put_page; 1003 } 1004 1005 daemon_sleep = le32_to_cpu(sb->daemon_sleep); 1006 if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT / HZ) { 1007 pr_err("md/llbitmap: %s: daemon sleep %lu period out of range", 1008 mdname(mddev), daemon_sleep); 1009 goto out_put_page; 1010 } 1011 1012 events = le64_to_cpu(sb->events); 1013 if (events < mddev->events) { 1014 pr_warn("md/llbitmap :%s: bitmap file is out of date (%lu < %llu) -- forcing full recovery", 1015 mdname(mddev), events, mddev->events); 1016 set_bit(BITMAP_STALE, &llbitmap->flags); 1017 } 1018 1019 sb->sync_size = cpu_to_le64(mddev->resync_max_sectors); 1020 mddev->bitmap_info.chunksize = chunksize; 1021 mddev->bitmap_info.daemon_sleep = daemon_sleep; 1022 1023 llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE; 1024 llbitmap->chunksize = chunksize; 1025 llbitmap->chunks = DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors, chunksize); 1026 llbitmap->chunkshift = ffz(~chunksize); 1027 ret = llbitmap_cache_pages(llbitmap); 1028 1029 out_put_page: 1030 __free_page(sb_page); 1031 kunmap_local(sb); 1032 return ret; 1033 } 1034 1035 static void llbitmap_pending_timer_fn(struct timer_list *pending_timer) 1036 { 1037 struct llbitmap *llbitmap = 1038 container_of(pending_timer, struct llbitmap, pending_timer); 1039 1040 if (work_busy(&llbitmap->daemon_work)) { 1041 pr_warn("md/llbitmap: %s daemon_work not finished in %lu seconds\n", 1042 mdname(llbitmap->mddev), 1043 llbitmap->mddev->bitmap_info.daemon_sleep); 1044 set_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags); 1045 return; 1046 } 1047 1048 queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work); 1049 } 1050 1051 static void md_llbitmap_daemon_fn(struct work_struct *work) 1052 { 1053 struct llbitmap *llbitmap = 1054 container_of(work, struct llbitmap, 
daemon_work); 1055 unsigned long start; 1056 unsigned long end; 1057 bool restart; 1058 int idx; 1059 1060 if (llbitmap->mddev->degraded) 1061 return; 1062 retry: 1063 start = 0; 1064 end = min(llbitmap->chunks, PAGE_SIZE - BITMAP_DATA_OFFSET) - 1; 1065 restart = false; 1066 1067 for (idx = 0; idx < llbitmap->nr_pages; idx++) { 1068 struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx]; 1069 1070 if (idx > 0) { 1071 start = end + 1; 1072 end = min(end + PAGE_SIZE, llbitmap->chunks - 1); 1073 } 1074 1075 if (!test_bit(LLPageFlush, &pctl->flags) && 1076 time_before(jiffies, pctl->expire)) { 1077 restart = true; 1078 continue; 1079 } 1080 1081 if (llbitmap_suspend_timeout(llbitmap, idx) < 0) { 1082 pr_warn("md/llbitmap: %s: %s waiting for page %d timeout\n", 1083 mdname(llbitmap->mddev), __func__, idx); 1084 continue; 1085 } 1086 1087 llbitmap_state_machine(llbitmap, start, end, BitmapActionDaemon); 1088 llbitmap_resume(llbitmap, idx); 1089 } 1090 1091 /* 1092 * If the daemon took a long time to finish, retry to prevent missing 1093 * clearing dirty bits. 
1094 */ 1095 if (test_and_clear_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags)) 1096 goto retry; 1097 1098 /* If some page is dirty but not expired, setup timer again */ 1099 if (restart) 1100 mod_timer(&llbitmap->pending_timer, 1101 jiffies + llbitmap->mddev->bitmap_info.daemon_sleep * HZ); 1102 } 1103 1104 static int llbitmap_create(struct mddev *mddev) 1105 { 1106 struct llbitmap *llbitmap; 1107 int ret; 1108 1109 ret = llbitmap_check_support(mddev); 1110 if (ret) 1111 return ret; 1112 1113 llbitmap = kzalloc_obj(*llbitmap); 1114 if (!llbitmap) 1115 return -ENOMEM; 1116 1117 llbitmap->mddev = mddev; 1118 llbitmap->io_size = bdev_logical_block_size(mddev->gendisk->part0); 1119 llbitmap->blocks_per_page = PAGE_SIZE / llbitmap->io_size; 1120 1121 timer_setup(&llbitmap->pending_timer, llbitmap_pending_timer_fn, 0); 1122 INIT_WORK(&llbitmap->daemon_work, md_llbitmap_daemon_fn); 1123 atomic_set(&llbitmap->behind_writes, 0); 1124 init_waitqueue_head(&llbitmap->behind_wait); 1125 1126 mutex_lock(&mddev->bitmap_info.mutex); 1127 mddev->bitmap = llbitmap; 1128 ret = llbitmap_read_sb(llbitmap); 1129 mutex_unlock(&mddev->bitmap_info.mutex); 1130 if (ret) { 1131 kfree(llbitmap); 1132 mddev->bitmap = NULL; 1133 } 1134 1135 return ret; 1136 } 1137 1138 static int llbitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize) 1139 { 1140 struct llbitmap *llbitmap = mddev->bitmap; 1141 unsigned long chunks; 1142 1143 if (chunksize == 0) 1144 chunksize = llbitmap->chunksize; 1145 1146 /* If there is enough space, leave the chunksize unchanged. 
*/ 1147 chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize); 1148 while (chunks > mddev->bitmap_info.space << SECTOR_SHIFT) { 1149 chunksize = chunksize << 1; 1150 chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize); 1151 } 1152 1153 llbitmap->chunkshift = ffz(~chunksize); 1154 llbitmap->chunksize = chunksize; 1155 llbitmap->chunks = chunks; 1156 1157 return 0; 1158 } 1159 1160 static int llbitmap_load(struct mddev *mddev) 1161 { 1162 enum llbitmap_action action = BitmapActionReload; 1163 struct llbitmap *llbitmap = mddev->bitmap; 1164 1165 if (test_and_clear_bit(BITMAP_STALE, &llbitmap->flags)) 1166 action = BitmapActionStale; 1167 1168 llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, action); 1169 return 0; 1170 } 1171 1172 static void llbitmap_destroy(struct mddev *mddev) 1173 { 1174 struct llbitmap *llbitmap = mddev->bitmap; 1175 1176 if (!llbitmap) 1177 return; 1178 1179 mutex_lock(&mddev->bitmap_info.mutex); 1180 1181 timer_delete_sync(&llbitmap->pending_timer); 1182 flush_workqueue(md_llbitmap_io_wq); 1183 flush_workqueue(md_llbitmap_unplug_wq); 1184 1185 mddev->bitmap = NULL; 1186 llbitmap_free_pages(llbitmap); 1187 kfree(llbitmap); 1188 mutex_unlock(&mddev->bitmap_info.mutex); 1189 } 1190 1191 static void llbitmap_start_write(struct mddev *mddev, sector_t offset, 1192 unsigned long sectors) 1193 { 1194 struct llbitmap *llbitmap = mddev->bitmap; 1195 unsigned long start = offset >> llbitmap->chunkshift; 1196 unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; 1197 int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; 1198 int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; 1199 1200 while (page_start <= page_end) { 1201 llbitmap_raise_barrier(llbitmap, page_start); 1202 page_start++; 1203 } 1204 1205 llbitmap_state_machine(llbitmap, start, end, BitmapActionStartwrite); 1206 } 1207 1208 static void llbitmap_end_write(struct mddev *mddev, sector_t offset, 1209 unsigned long sectors) 1210 { 1211 struct llbitmap *llbitmap = 
mddev->bitmap; 1212 unsigned long start = offset >> llbitmap->chunkshift; 1213 unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; 1214 int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; 1215 int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; 1216 1217 while (page_start <= page_end) { 1218 llbitmap_release_barrier(llbitmap, page_start); 1219 page_start++; 1220 } 1221 } 1222 1223 static void llbitmap_start_discard(struct mddev *mddev, sector_t offset, 1224 unsigned long sectors) 1225 { 1226 struct llbitmap *llbitmap = mddev->bitmap; 1227 unsigned long start = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize); 1228 unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; 1229 int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; 1230 int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; 1231 1232 while (page_start <= page_end) { 1233 llbitmap_raise_barrier(llbitmap, page_start); 1234 page_start++; 1235 } 1236 1237 llbitmap_state_machine(llbitmap, start, end, BitmapActionDiscard); 1238 } 1239 1240 static void llbitmap_end_discard(struct mddev *mddev, sector_t offset, 1241 unsigned long sectors) 1242 { 1243 struct llbitmap *llbitmap = mddev->bitmap; 1244 unsigned long start = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize); 1245 unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; 1246 int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; 1247 int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; 1248 1249 while (page_start <= page_end) { 1250 llbitmap_release_barrier(llbitmap, page_start); 1251 page_start++; 1252 } 1253 } 1254 1255 static void llbitmap_unplug_fn(struct work_struct *work) 1256 { 1257 struct llbitmap_unplug_work *unplug_work = 1258 container_of(work, struct llbitmap_unplug_work, work); 1259 struct llbitmap *llbitmap = unplug_work->llbitmap; 1260 struct blk_plug plug; 1261 int i; 1262 1263 blk_start_plug(&plug); 1264 1265 for (i = 0; i < llbitmap->nr_pages; i++) { 1266 if 
(!test_bit(LLPageDirty, &llbitmap->pctl[i]->flags) || 1267 !test_and_clear_bit(LLPageDirty, &llbitmap->pctl[i]->flags)) 1268 continue; 1269 1270 llbitmap_write_page(llbitmap, i); 1271 } 1272 1273 blk_finish_plug(&plug); 1274 md_super_wait(llbitmap->mddev); 1275 complete(unplug_work->done); 1276 } 1277 1278 static bool llbitmap_dirty(struct llbitmap *llbitmap) 1279 { 1280 int i; 1281 1282 for (i = 0; i < llbitmap->nr_pages; i++) 1283 if (test_bit(LLPageDirty, &llbitmap->pctl[i]->flags)) 1284 return true; 1285 1286 return false; 1287 } 1288 1289 static void llbitmap_unplug(struct mddev *mddev, bool sync) 1290 { 1291 DECLARE_COMPLETION_ONSTACK(done); 1292 struct llbitmap *llbitmap = mddev->bitmap; 1293 struct llbitmap_unplug_work unplug_work = { 1294 .llbitmap = llbitmap, 1295 .done = &done, 1296 }; 1297 1298 if (!llbitmap_dirty(llbitmap)) 1299 return; 1300 1301 /* 1302 * Issue new bitmap IO under submit_bio() context will deadlock: 1303 * - the bio will wait for bitmap bio to be done, before it can be 1304 * issued; 1305 * - bitmap bio will be added to current->bio_list and wait for this 1306 * bio to be issued; 1307 */ 1308 INIT_WORK_ONSTACK(&unplug_work.work, llbitmap_unplug_fn); 1309 queue_work(md_llbitmap_unplug_wq, &unplug_work.work); 1310 wait_for_completion(&done); 1311 destroy_work_on_stack(&unplug_work.work); 1312 } 1313 1314 /* 1315 * Force to write all bitmap pages to disk, called when stopping the array, or 1316 * every daemon_sleep seconds when sync_thread is running. 
1317 */ 1318 static void __llbitmap_flush(struct mddev *mddev) 1319 { 1320 struct llbitmap *llbitmap = mddev->bitmap; 1321 struct blk_plug plug; 1322 int i; 1323 1324 blk_start_plug(&plug); 1325 for (i = 0; i < llbitmap->nr_pages; i++) { 1326 struct llbitmap_page_ctl *pctl = llbitmap->pctl[i]; 1327 1328 /* mark all blocks as dirty */ 1329 set_bit(LLPageDirty, &pctl->flags); 1330 bitmap_fill(pctl->dirty, llbitmap->blocks_per_page); 1331 llbitmap_write_page(llbitmap, i); 1332 } 1333 blk_finish_plug(&plug); 1334 md_super_wait(llbitmap->mddev); 1335 } 1336 1337 static void llbitmap_flush(struct mddev *mddev) 1338 { 1339 struct llbitmap *llbitmap = mddev->bitmap; 1340 int i; 1341 1342 for (i = 0; i < llbitmap->nr_pages; i++) 1343 set_bit(LLPageFlush, &llbitmap->pctl[i]->flags); 1344 1345 timer_delete_sync(&llbitmap->pending_timer); 1346 queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work); 1347 flush_work(&llbitmap->daemon_work); 1348 1349 __llbitmap_flush(mddev); 1350 } 1351 1352 /* This is used for raid5 lazy initial recovery */ 1353 static bool llbitmap_blocks_synced(struct mddev *mddev, sector_t offset) 1354 { 1355 struct llbitmap *llbitmap = mddev->bitmap; 1356 unsigned long p = offset >> llbitmap->chunkshift; 1357 enum llbitmap_state c = llbitmap_read(llbitmap, p); 1358 1359 return c == BitClean || c == BitDirty || c == BitCleanUnwritten; 1360 } 1361 1362 static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset) 1363 { 1364 struct llbitmap *llbitmap = mddev->bitmap; 1365 unsigned long p = offset >> llbitmap->chunkshift; 1366 int blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1)); 1367 enum llbitmap_state c = llbitmap_read(llbitmap, p); 1368 1369 /* always skip unwritten blocks */ 1370 if (c == BitUnwritten) 1371 return blocks; 1372 1373 /* Skip CleanUnwritten - no user data, will be reset after recovery */ 1374 if (c == BitCleanUnwritten) 1375 return blocks; 1376 1377 /* For degraded array, don't skip */ 1378 if 
(mddev->degraded) 1379 return 0; 1380 1381 /* For resync also skip clean/dirty blocks */ 1382 if ((c == BitClean || c == BitDirty) && 1383 test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 1384 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 1385 return blocks; 1386 1387 return 0; 1388 } 1389 1390 static bool llbitmap_start_sync(struct mddev *mddev, sector_t offset, 1391 sector_t *blocks, bool degraded) 1392 { 1393 struct llbitmap *llbitmap = mddev->bitmap; 1394 unsigned long p = offset >> llbitmap->chunkshift; 1395 enum llbitmap_state state; 1396 1397 /* 1398 * Before recovery starts, convert CleanUnwritten to Unwritten. 1399 * This ensures the new disk won't have stale parity data. 1400 */ 1401 if (offset == 0 && test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && 1402 !test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery)) 1403 llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, 1404 BitmapActionClearUnwritten); 1405 1406 1407 /* 1408 * Handle one bit at a time, this is much simpler. And it doesn't matter 1409 * if md_do_sync() loop more times. 
1410 */ 1411 *blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1)); 1412 state = llbitmap_state_machine(llbitmap, p, p, BitmapActionStartsync); 1413 return state == BitSyncing || state == BitSyncingUnwritten; 1414 } 1415 1416 /* Something is wrong, sync_thread stop at @offset */ 1417 static void llbitmap_end_sync(struct mddev *mddev, sector_t offset, 1418 sector_t *blocks) 1419 { 1420 struct llbitmap *llbitmap = mddev->bitmap; 1421 unsigned long p = offset >> llbitmap->chunkshift; 1422 1423 *blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1)); 1424 llbitmap_state_machine(llbitmap, p, llbitmap->chunks - 1, 1425 BitmapActionAbortsync); 1426 } 1427 1428 /* A full sync_thread is finished */ 1429 static void llbitmap_close_sync(struct mddev *mddev) 1430 { 1431 struct llbitmap *llbitmap = mddev->bitmap; 1432 int i; 1433 1434 for (i = 0; i < llbitmap->nr_pages; i++) { 1435 struct llbitmap_page_ctl *pctl = llbitmap->pctl[i]; 1436 1437 /* let daemon_fn clear dirty bits immediately */ 1438 WRITE_ONCE(pctl->expire, jiffies); 1439 } 1440 1441 llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, 1442 BitmapActionEndsync); 1443 } 1444 1445 /* 1446 * sync_thread have reached @sector, update metadata every daemon_sleep seconds, 1447 * just in case sync_thread have to restart after power failure. 
1448 */ 1449 static void llbitmap_cond_end_sync(struct mddev *mddev, sector_t sector, 1450 bool force) 1451 { 1452 struct llbitmap *llbitmap = mddev->bitmap; 1453 1454 if (sector == 0) { 1455 llbitmap->last_end_sync = jiffies; 1456 return; 1457 } 1458 1459 if (time_before(jiffies, llbitmap->last_end_sync + 1460 HZ * mddev->bitmap_info.daemon_sleep)) 1461 return; 1462 1463 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 1464 1465 mddev->curr_resync_completed = sector; 1466 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 1467 llbitmap_state_machine(llbitmap, 0, sector >> llbitmap->chunkshift, 1468 BitmapActionEndsync); 1469 __llbitmap_flush(mddev); 1470 1471 llbitmap->last_end_sync = jiffies; 1472 sysfs_notify_dirent_safe(mddev->sysfs_completed); 1473 } 1474 1475 static bool llbitmap_enabled(void *data, bool flush) 1476 { 1477 struct llbitmap *llbitmap = data; 1478 1479 return llbitmap && !test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags); 1480 } 1481 1482 static void llbitmap_dirty_bits(struct mddev *mddev, unsigned long s, 1483 unsigned long e) 1484 { 1485 llbitmap_state_machine(mddev->bitmap, s, e, BitmapActionStartwrite); 1486 } 1487 1488 static void llbitmap_write_sb(struct llbitmap *llbitmap) 1489 { 1490 int nr_blocks = DIV_ROUND_UP(BITMAP_DATA_OFFSET, llbitmap->io_size); 1491 1492 bitmap_fill(llbitmap->pctl[0]->dirty, nr_blocks); 1493 llbitmap_write_page(llbitmap, 0); 1494 md_super_wait(llbitmap->mddev); 1495 } 1496 1497 static void llbitmap_update_sb(void *data) 1498 { 1499 struct llbitmap *llbitmap = data; 1500 struct mddev *mddev = llbitmap->mddev; 1501 struct page *sb_page; 1502 bitmap_super_t *sb; 1503 1504 if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) 1505 return; 1506 1507 sb_page = llbitmap_read_page(llbitmap, 0); 1508 if (IS_ERR(sb_page)) { 1509 pr_err("%s: %s: read super block failed", __func__, 1510 mdname(mddev)); 1511 set_bit(BITMAP_WRITE_ERROR, &llbitmap->flags); 1512 return; 1513 } 1514 1515 if (mddev->events < 
llbitmap->events_cleared) 1516 llbitmap->events_cleared = mddev->events; 1517 1518 sb = kmap_local_page(sb_page); 1519 sb->events = cpu_to_le64(mddev->events); 1520 sb->state = cpu_to_le32(llbitmap->flags); 1521 sb->chunksize = cpu_to_le32(llbitmap->chunksize); 1522 sb->sync_size = cpu_to_le64(mddev->resync_max_sectors); 1523 sb->events_cleared = cpu_to_le64(llbitmap->events_cleared); 1524 sb->sectors_reserved = cpu_to_le32(mddev->bitmap_info.space); 1525 sb->daemon_sleep = cpu_to_le32(mddev->bitmap_info.daemon_sleep); 1526 1527 kunmap_local(sb); 1528 llbitmap_write_sb(llbitmap); 1529 } 1530 1531 static int llbitmap_get_stats(void *data, struct md_bitmap_stats *stats) 1532 { 1533 struct llbitmap *llbitmap = data; 1534 1535 memset(stats, 0, sizeof(*stats)); 1536 1537 stats->missing_pages = 0; 1538 stats->pages = llbitmap->nr_pages; 1539 stats->file_pages = llbitmap->nr_pages; 1540 1541 stats->behind_writes = atomic_read(&llbitmap->behind_writes); 1542 stats->behind_wait = wq_has_sleeper(&llbitmap->behind_wait); 1543 stats->events_cleared = llbitmap->events_cleared; 1544 1545 return 0; 1546 } 1547 1548 /* just flag all pages as needing to be written */ 1549 static void llbitmap_write_all(struct mddev *mddev) 1550 { 1551 int i; 1552 struct llbitmap *llbitmap = mddev->bitmap; 1553 1554 for (i = 0; i < llbitmap->nr_pages; i++) { 1555 struct llbitmap_page_ctl *pctl = llbitmap->pctl[i]; 1556 1557 set_bit(LLPageDirty, &pctl->flags); 1558 bitmap_fill(pctl->dirty, llbitmap->blocks_per_page); 1559 } 1560 } 1561 1562 static void llbitmap_start_behind_write(struct mddev *mddev) 1563 { 1564 struct llbitmap *llbitmap = mddev->bitmap; 1565 1566 atomic_inc(&llbitmap->behind_writes); 1567 } 1568 1569 static void llbitmap_end_behind_write(struct mddev *mddev) 1570 { 1571 struct llbitmap *llbitmap = mddev->bitmap; 1572 1573 if (atomic_dec_and_test(&llbitmap->behind_writes)) 1574 wake_up(&llbitmap->behind_wait); 1575 } 1576 1577 static void llbitmap_wait_behind_writes(struct mddev 
*mddev) 1578 { 1579 struct llbitmap *llbitmap = mddev->bitmap; 1580 1581 if (!llbitmap) 1582 return; 1583 1584 wait_event(llbitmap->behind_wait, 1585 atomic_read(&llbitmap->behind_writes) == 0); 1586 1587 } 1588 1589 static ssize_t bits_show(struct mddev *mddev, char *page) 1590 { 1591 struct llbitmap *llbitmap; 1592 int bits[BitStateCount] = {0}; 1593 loff_t start = 0; 1594 1595 mutex_lock(&mddev->bitmap_info.mutex); 1596 llbitmap = mddev->bitmap; 1597 if (!llbitmap || !llbitmap->pctl) { 1598 mutex_unlock(&mddev->bitmap_info.mutex); 1599 return sprintf(page, "no bitmap\n"); 1600 } 1601 1602 if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) { 1603 mutex_unlock(&mddev->bitmap_info.mutex); 1604 return sprintf(page, "bitmap io error\n"); 1605 } 1606 1607 while (start < llbitmap->chunks) { 1608 enum llbitmap_state c = llbitmap_read(llbitmap, start); 1609 1610 if (c < 0 || c >= BitStateCount) 1611 pr_err("%s: invalid bit %llu state %d\n", 1612 __func__, start, c); 1613 else 1614 bits[c]++; 1615 start++; 1616 } 1617 1618 mutex_unlock(&mddev->bitmap_info.mutex); 1619 return sprintf(page, 1620 "unwritten %d\nclean %d\ndirty %d\n" 1621 "need sync %d\nsyncing %d\n" 1622 "need sync unwritten %d\nsyncing unwritten %d\n" 1623 "clean unwritten %d\n", 1624 bits[BitUnwritten], bits[BitClean], bits[BitDirty], 1625 bits[BitNeedSync], bits[BitSyncing], 1626 bits[BitNeedSyncUnwritten], bits[BitSyncingUnwritten], 1627 bits[BitCleanUnwritten]); 1628 } 1629 1630 static struct md_sysfs_entry llbitmap_bits = __ATTR_RO(bits); 1631 1632 static ssize_t metadata_show(struct mddev *mddev, char *page) 1633 { 1634 struct llbitmap *llbitmap; 1635 ssize_t ret; 1636 1637 mutex_lock(&mddev->bitmap_info.mutex); 1638 llbitmap = mddev->bitmap; 1639 if (!llbitmap) { 1640 mutex_unlock(&mddev->bitmap_info.mutex); 1641 return sprintf(page, "no bitmap\n"); 1642 } 1643 1644 ret = sprintf(page, "chunksize %lu\nchunkshift %lu\nchunks %lu\noffset %llu\ndaemon_sleep %lu\n", 1645 llbitmap->chunksize, 
llbitmap->chunkshift, 1646 llbitmap->chunks, mddev->bitmap_info.offset, 1647 llbitmap->mddev->bitmap_info.daemon_sleep); 1648 mutex_unlock(&mddev->bitmap_info.mutex); 1649 1650 return ret; 1651 } 1652 1653 static struct md_sysfs_entry llbitmap_metadata = __ATTR_RO(metadata); 1654 1655 static ssize_t 1656 daemon_sleep_show(struct mddev *mddev, char *page) 1657 { 1658 return sprintf(page, "%lu\n", mddev->bitmap_info.daemon_sleep); 1659 } 1660 1661 static ssize_t 1662 daemon_sleep_store(struct mddev *mddev, const char *buf, size_t len) 1663 { 1664 unsigned long timeout; 1665 int rv = kstrtoul(buf, 10, &timeout); 1666 1667 if (rv) 1668 return rv; 1669 1670 mddev->bitmap_info.daemon_sleep = timeout; 1671 return len; 1672 } 1673 1674 static struct md_sysfs_entry llbitmap_daemon_sleep = __ATTR_RW(daemon_sleep); 1675 1676 static ssize_t 1677 barrier_idle_show(struct mddev *mddev, char *page) 1678 { 1679 struct llbitmap *llbitmap = mddev->bitmap; 1680 1681 return sprintf(page, "%lu\n", llbitmap->barrier_idle); 1682 } 1683 1684 static ssize_t 1685 barrier_idle_store(struct mddev *mddev, const char *buf, size_t len) 1686 { 1687 struct llbitmap *llbitmap = mddev->bitmap; 1688 unsigned long timeout; 1689 int rv = kstrtoul(buf, 10, &timeout); 1690 1691 if (rv) 1692 return rv; 1693 1694 llbitmap->barrier_idle = timeout; 1695 return len; 1696 } 1697 1698 static struct md_sysfs_entry llbitmap_barrier_idle = __ATTR_RW(barrier_idle); 1699 1700 static ssize_t 1701 proactive_sync_store(struct mddev *mddev, const char *buf, size_t len) 1702 { 1703 struct llbitmap *llbitmap; 1704 1705 /* Only for RAID-456 */ 1706 if (!raid_is_456(mddev)) 1707 return -EINVAL; 1708 1709 mutex_lock(&mddev->bitmap_info.mutex); 1710 llbitmap = mddev->bitmap; 1711 if (!llbitmap || !llbitmap->pctl) { 1712 mutex_unlock(&mddev->bitmap_info.mutex); 1713 return -ENODEV; 1714 } 1715 1716 /* Trigger proactive sync on all Unwritten regions */ 1717 llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, 1718 
BitmapActionProactiveSync); 1719 1720 mutex_unlock(&mddev->bitmap_info.mutex); 1721 return len; 1722 } 1723 1724 static struct md_sysfs_entry llbitmap_proactive_sync = 1725 __ATTR(proactive_sync, 0200, NULL, proactive_sync_store); 1726 1727 static struct attribute *md_llbitmap_attrs[] = { 1728 &llbitmap_bits.attr, 1729 &llbitmap_metadata.attr, 1730 &llbitmap_daemon_sleep.attr, 1731 &llbitmap_barrier_idle.attr, 1732 &llbitmap_proactive_sync.attr, 1733 NULL 1734 }; 1735 1736 static struct attribute_group md_llbitmap_group = { 1737 .name = "llbitmap", 1738 .attrs = md_llbitmap_attrs, 1739 }; 1740 1741 static struct bitmap_operations llbitmap_ops = { 1742 .head = { 1743 .type = MD_BITMAP, 1744 .id = ID_LLBITMAP, 1745 .name = "llbitmap", 1746 }, 1747 1748 .enabled = llbitmap_enabled, 1749 .create = llbitmap_create, 1750 .resize = llbitmap_resize, 1751 .load = llbitmap_load, 1752 .destroy = llbitmap_destroy, 1753 1754 .start_write = llbitmap_start_write, 1755 .end_write = llbitmap_end_write, 1756 .start_discard = llbitmap_start_discard, 1757 .end_discard = llbitmap_end_discard, 1758 .unplug = llbitmap_unplug, 1759 .flush = llbitmap_flush, 1760 1761 .start_behind_write = llbitmap_start_behind_write, 1762 .end_behind_write = llbitmap_end_behind_write, 1763 .wait_behind_writes = llbitmap_wait_behind_writes, 1764 1765 .blocks_synced = llbitmap_blocks_synced, 1766 .skip_sync_blocks = llbitmap_skip_sync_blocks, 1767 .start_sync = llbitmap_start_sync, 1768 .end_sync = llbitmap_end_sync, 1769 .close_sync = llbitmap_close_sync, 1770 .cond_end_sync = llbitmap_cond_end_sync, 1771 1772 .update_sb = llbitmap_update_sb, 1773 .get_stats = llbitmap_get_stats, 1774 .dirty_bits = llbitmap_dirty_bits, 1775 .write_all = llbitmap_write_all, 1776 1777 .group = &md_llbitmap_group, 1778 }; 1779 1780 int md_llbitmap_init(void) 1781 { 1782 md_llbitmap_io_wq = alloc_workqueue("md_llbitmap_io", 1783 WQ_MEM_RECLAIM | WQ_UNBOUND, 0); 1784 if (!md_llbitmap_io_wq) 1785 return -ENOMEM; 1786 1787 
md_llbitmap_unplug_wq = alloc_workqueue("md_llbitmap_unplug", 1788 WQ_MEM_RECLAIM | WQ_UNBOUND, 0); 1789 if (!md_llbitmap_unplug_wq) { 1790 destroy_workqueue(md_llbitmap_io_wq); 1791 md_llbitmap_io_wq = NULL; 1792 return -ENOMEM; 1793 } 1794 1795 return register_md_submodule(&llbitmap_ops.head); 1796 } 1797 1798 void md_llbitmap_exit(void) 1799 { 1800 destroy_workqueue(md_llbitmap_io_wq); 1801 md_llbitmap_io_wq = NULL; 1802 destroy_workqueue(md_llbitmap_unplug_wq); 1803 md_llbitmap_unplug_wq = NULL; 1804 unregister_md_submodule(&llbitmap_ops.head); 1805 } 1806