// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/timer.h>
#include <linux/sched.h>
#include <linux/list.h>
#include <linux/file.h>
#include <linux/seq_file.h>
#include <trace/events/block.h>

#include "md.h"
#include "md-bitmap.h"
/*
 * #### Background
 *
 * Redundant data is used to enhance data fault tolerance, and the storage
 * method for redundant data varies depending on the RAID level. It is
 * important to maintain the consistency of redundant data.
 *
 * The bitmap is used to record which data blocks have been synchronized and
 * which ones need to be resynchronized or recovered. Each bit in the bitmap
 * represents a segment of data in the array. When a bit is set, it indicates
 * that the multiple redundant copies of that data segment may not be
 * consistent. Data synchronization can be performed based on the bitmap after
 * a power failure or after re-adding a disk. If there is no bitmap, a full
 * disk synchronization is required.
 *
 * #### Key Features
 *
 * - The IO fastpath is lockless; if a user issues lots of write IO to the
 * same bitmap bit in a short time, only the first write has the additional
 * overhead of updating the bitmap bit, and there is no additional overhead
 * for the following writes;
 * - Support resyncing or recovering only written data, meaning that when
 * creating a new array or replacing a disk with a new one, there is no need
 * to do a full disk resync/recovery;
 *
 * #### Key Concept
 *
 * ##### State Machine
 *
 * Each bit is one byte and holds 6 different states, see llbitmap_state.
 * There are 8 different actions in total, see llbitmap_action, that can
 * change the state:
 *
 * llbitmap state machine: transitions between states
 *
 * |           | Startwrite | Startsync | Endsync | Abortsync |
 * | --------- | ---------- | --------- | ------- | --------- |
 * | Unwritten | Dirty      | x         | x       | x         |
 * | Clean     | Dirty      | x         | x       | x         |
 * | Dirty     | x          | x         | x       | x         |
 * | NeedSync  | x          | Syncing   | x       | x         |
 * | Syncing   | x          | Syncing   | Dirty   | NeedSync  |
 *
 * |           | Reload   | Daemon | Discard   | Stale     |
 * | --------- | -------- | ------ | --------- | --------- |
 * | Unwritten | x        | x      | x         | x         |
 * | Clean     | x        | x      | Unwritten | NeedSync  |
 * | Dirty     | NeedSync | Clean  | Unwritten | NeedSync  |
 * | NeedSync  | x        | x      | Unwritten | x         |
 * | Syncing   | NeedSync | x      | Unwritten | NeedSync  |
 *
 * Typical scenarios:
 *
 * 1) Create new array
 * All bits are set to Unwritten by default; if --assume-clean is set, all
 * bits are set to Clean instead.
 *
 * 2) Write data. raid1/raid10 have a full copy of the data, while raid456
 * doesn't and relies on xor data.
 *
 * 2.1) write new data to raid1/raid10:
 * Unwritten --StartWrite--> Dirty
 *
 * 2.2) write new data to raid456:
 * Unwritten --StartWrite--> NeedSync
 *
 * Because the initial recovery for raid456 is skipped, the xor data is not
 * built yet; the bit must be set to NeedSync first, and after the lazy
 * initial recovery is finished, the bit is finally set to Dirty (see 5.1 and
 * 5.4);
 *
 * 2.3) overwrite existing data:
 * Clean --StartWrite--> Dirty
 *
 * 3) daemon, if the array is not degraded:
 * Dirty --Daemon--> Clean
 *
 * 4) discard
 * {Clean, Dirty, NeedSync, Syncing} --Discard--> Unwritten
 *
 * 5) resync and recover
 *
 * 5.1) common process
 * NeedSync --Startsync--> Syncing --Endsync--> Dirty --Daemon--> Clean
 *
 * 5.2) resync after power failure
 * Dirty --Reload--> NeedSync
 *
 * 5.3) recover while replacing with a new disk
 * By default, the old bitmap framework will recover all data, and llbitmap
 * implements this with a new helper, see llbitmap_skip_sync_blocks:
 *
 * skip recovery for bits other than Dirty or Clean;
 *
 * 5.4) lazy initial recovery for raid456:
 * By default, the old bitmap framework only allows a new recovery when there
 * are spares (new disks); a new recovery flag MD_RECOVERY_LAZY_RECOVER is
 * added to perform raid456 lazy recovery for set bits (from 2.2).
 *
 * 6) special handling for degraded array:
 *
 * - Dirty bits will never be cleared, the daemon will just do nothing, so
 * that if a disk is re-added, recovery can be skipped for Clean bits;
 * - Dirty bits will convert to Syncing when sync starts, to do data recovery
 * for newly added disks;
 * - New writes will convert bits to NeedSync directly;
 *
 * ##### Bitmap IO
 *
 * ##### Chunksize
 *
 * The default bitmap size is 128k, including a 1k bitmap super block, and
 * the default size of the segment of data covered by each bit (chunksize) is
 * 64k; the chunksize doubles each time the total number of bits would
 * otherwise exceed 127k (see llbitmap_init).
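 *
 * For example, a worked calculation (illustrative numbers, mirroring the
 * doubling loop in llbitmap_init()): for a 100GiB array, a 64k chunksize
 * would need 1638400 bits, far more than the 127k available, so the
 * chunksize doubles 64k -> 128k -> 256k -> 512k -> 1M, where 100GiB / 1M =
 * 102400 bits finally fits.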
 *
 * ##### READ
 *
 * While creating the bitmap, all pages will be allocated and read for
 * llbitmap; there won't be any read afterwards.
 *
 * ##### WRITE
 *
 * WRITE IO is divided into units of the logical_block_size of the array, and
 * the dirty state of each block is tracked independently. For example, with
 * 4k pages each page contains 8 blocks; each block is 512 bytes and contains
 * 512 bits:
 *
 * | page0 | page1 | ... | page 31 |
 * |       |
 * |        \-----------------------\
 * |                                 |
 * | block0 | block1 | ... | block 7 |
 * |        |
 * |         \-----------------\
 * |                            |
 * | bit0 | bit1 | ... | bit511 |
 *
 * From the IO path, if one bit is changed to Dirty or NeedSync, the
 * corresponding subpage will be marked dirty, and such a block must be
 * written first before the data IO is issued. This behaviour will affect IO
 * performance; to reduce the impact, if multiple bits are changed in the same
 * block in a short time, all bits in this block will be changed to
 * Dirty/NeedSync, so that there won't be any additional overhead until the
 * daemon clears the dirty bits.
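 *
 * A minimal sketch of the position math (mirroring llbitmap_read() and
 * llbitmap_set_page_dirty() below, assuming 4k pages and 512-byte blocks):
 *
 *	pos    = bit + BITMAP_DATA_OFFSET;	// skip the 1k super block
 *	page   = pos >> PAGE_SHIFT;		// one of 32 pages
 *	offset = offset_in_page(pos);		// byte offset inside the page
 *	block  = offset / io_size;		// one of 8 blocks per 4k page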
 *
 * ##### Dirty Bits synchronization
 *
 * The IO fast path will set bits to Dirty, and those dirty bits will be
 * cleared by the daemon after the IO is done. llbitmap_page_ctl is used to
 * synchronize between the IO path and the daemon;
 *
 * IO path:
 * 1) try to grab a reference; on success, set the expire time 5s from now and
 * return;
 * 2) if grabbing a reference failed, wait for the daemon to finish clearing
 * dirty bits;
 *
 * Daemon (the daemon will be woken up every daemon_sleep seconds):
 * For each page:
 * 1) check if the page expired; if not, skip this page; for expired pages:
 * 2) suspend the page and wait for inflight write IO to be done;
 * 3) change dirty pages to clean;
 * 4) resume the page;
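 *
 * A minimal sketch of this synchronization (based on llbitmap_raise_barrier(),
 * llbitmap_suspend_timeout() and llbitmap_resume() below, error handling and
 * timeouts omitted):
 *
 *	// IO path, per page
 *	while (!percpu_ref_tryget_live(&pctl->active))
 *		wait_event(pctl->wait, !percpu_ref_is_dying(&pctl->active));
 *	WRITE_ONCE(pctl->expire, jiffies + barrier_idle * HZ);
 *
 *	// daemon, per expired page
 *	percpu_ref_kill(&pctl->active);
 *	wait_event(pctl->wait, percpu_ref_is_zero(&pctl->active));
 *	// ... clear dirty bits and write the page back ...
 *	percpu_ref_resurrect(&pctl->active);
 *	wake_up(&pctl->wait);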
 */

#define BITMAP_DATA_OFFSET 1024

/* 64k is the max IO size of sync IO for raid1/raid10 */
#define MIN_CHUNK_SIZE (64 * 2)

/* By default, daemon will be woken up every 30s */
#define DEFAULT_DAEMON_SLEEP 30

/*
 * Dirtied bits that have not been accessed for more than 5s will be cleared
 * by the daemon.
 */
#define DEFAULT_BARRIER_IDLE 5

enum llbitmap_state {
	/* No valid data, init state after assembling the array */
	BitUnwritten = 0,
	/* data is consistent */
	BitClean,
	/* data will be consistent after IO is done, set directly for writes */
	BitDirty,
	/*
	 * data needs to be resynchronized:
	 * 1) set directly for writes if the array is degraded, to prevent
	 * full disk synchronization after re-adding a disk;
	 * 2) reassemble the array after power failure, and dirty bits are
	 * found after reloading the bitmap;
	 * 3) set on the first write for raid456, to build initial xor data
	 * lazily
	 */
	BitNeedSync,
	/* data is synchronizing */
	BitSyncing,
	BitStateCount,
	BitNone = 0xff,
};

enum llbitmap_action {
	/* User writes new data, this is the only action from the IO fast path */
	BitmapActionStartwrite = 0,
	/* Start recovery */
	BitmapActionStartsync,
	/* Finish recovery */
	BitmapActionEndsync,
	/* Failed recovery */
	BitmapActionAbortsync,
	/* Reassemble the array */
	BitmapActionReload,
	/* Daemon thread is trying to clear dirty bits */
	BitmapActionDaemon,
	/* Data is deleted */
	BitmapActionDiscard,
	/*
	 * Bitmap is stale, mark all bits except BitUnwritten as
	 * BitNeedSync.
	 */
	BitmapActionStale,
	BitmapActionCount,
	/* Init state is BitUnwritten */
	BitmapActionInit,
};

enum llbitmap_page_state {
	LLPageFlush = 0,
	LLPageDirty,
};

struct llbitmap_page_ctl {
	char *state;
	struct page *page;
	unsigned long expire;
	unsigned long flags;
	wait_queue_head_t wait;
	struct percpu_ref active;
	/* Per block size dirty state, maximum 64k page / 1 sector = 128 */
	unsigned long dirty[];
};

struct llbitmap {
	struct mddev *mddev;
	struct llbitmap_page_ctl **pctl;

	unsigned int nr_pages;
	unsigned int io_size;
	unsigned int blocks_per_page;

	/* shift of one chunk */
	unsigned long chunkshift;
	/* size of one chunk in sectors */
	unsigned long chunksize;
	/* total number of chunks */
	unsigned long chunks;
	unsigned long last_end_sync;
	/*
	 * time in seconds after which dirty bits will be cleared if the page
	 * is not accessed.
	 */
	unsigned long barrier_idle;
	/* fires on first BitDirty state */
	struct timer_list pending_timer;
	struct work_struct daemon_work;

	unsigned long flags;
	__u64 events_cleared;

	/* for slow disks */
	atomic_t behind_writes;
	wait_queue_head_t behind_wait;
};

struct llbitmap_unplug_work {
	struct work_struct work;
	struct llbitmap *llbitmap;
	struct completion *done;
};

static struct workqueue_struct *md_llbitmap_io_wq;
static struct workqueue_struct *md_llbitmap_unplug_wq;

static char state_machine[BitStateCount][BitmapActionCount] = {
	[BitUnwritten] = {
		[BitmapActionStartwrite] = BitDirty,
		[BitmapActionStartsync] = BitNone,
		[BitmapActionEndsync] = BitNone,
		[BitmapActionAbortsync] = BitNone,
		[BitmapActionReload] = BitNone,
		[BitmapActionDaemon] = BitNone,
		[BitmapActionDiscard] = BitNone,
		[BitmapActionStale] = BitNone,
	},
	[BitClean] = {
		[BitmapActionStartwrite] = BitDirty,
		[BitmapActionStartsync] = BitNone,
		[BitmapActionEndsync] = BitNone,
		[BitmapActionAbortsync] = BitNone,
		[BitmapActionReload] = BitNone,
		[BitmapActionDaemon] = BitNone,
		[BitmapActionDiscard] = BitUnwritten,
		[BitmapActionStale] = BitNeedSync,
	},
	[BitDirty] = {
		[BitmapActionStartwrite] = BitNone,
		[BitmapActionStartsync] = BitNone,
		[BitmapActionEndsync] = BitNone,
		[BitmapActionAbortsync] = BitNone,
		[BitmapActionReload] = BitNeedSync,
		[BitmapActionDaemon] = BitClean,
		[BitmapActionDiscard] = BitUnwritten,
		[BitmapActionStale] = BitNeedSync,
	},
	[BitNeedSync] = {
		[BitmapActionStartwrite] = BitNone,
		[BitmapActionStartsync] = BitSyncing,
		[BitmapActionEndsync] = BitNone,
		[BitmapActionAbortsync] = BitNone,
		[BitmapActionReload] = BitNone,
		[BitmapActionDaemon] = BitNone,
		[BitmapActionDiscard] = BitUnwritten,
		[BitmapActionStale] = BitNone,
	},
	[BitSyncing] = {
		[BitmapActionStartwrite] = BitNone,
		[BitmapActionStartsync] = BitSyncing,
		[BitmapActionEndsync] = BitDirty,
		[BitmapActionAbortsync] = BitNeedSync,
		[BitmapActionReload] = BitNeedSync,
		[BitmapActionDaemon] = BitNone,
		[BitmapActionDiscard] = BitUnwritten,
		[BitmapActionStale] = BitNeedSync,
	},
};

static void __llbitmap_flush(struct mddev *mddev);

static enum llbitmap_state llbitmap_read(struct llbitmap *llbitmap, loff_t pos)
{
	unsigned int idx;
	unsigned int offset;

	pos += BITMAP_DATA_OFFSET;
	idx = pos >> PAGE_SHIFT;
	offset = offset_in_page(pos);

	return llbitmap->pctl[idx]->state[offset];
}

/* set all the bits in the subpage as dirty */
static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap,
				       struct llbitmap_page_ctl *pctl,
				       unsigned int block)
{
	bool level_456 = raid_is_456(llbitmap->mddev);
	unsigned int io_size = llbitmap->io_size;
	int pos;

	for (pos = block * io_size; pos < (block + 1) * io_size; pos++) {
		switch (pctl->state[pos]) {
		case BitUnwritten:
			pctl->state[pos] = level_456 ? BitNeedSync : BitDirty;
			break;
		case BitClean:
			pctl->state[pos] = BitDirty;
			break;
		}
	}
}

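/*
 * Mark the block containing @offset as dirty so that it will be written back
 * before data IO is issued; if other bits in the block were dirtied recently,
 * all bits of the block are infected (see "Bitmap IO" above).
 */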
static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx,
				    int offset)
{
	struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];
	unsigned int io_size = llbitmap->io_size;
	int block = offset / io_size;
	int pos;

	if (!test_bit(LLPageDirty, &pctl->flags))
		set_bit(LLPageDirty, &pctl->flags);

	/*
	 * For a degraded array, dirty bits will never be cleared and we must
	 * resync all the dirty bits; hence skip infecting new dirty bits to
	 * prevent resyncing unnecessary data.
	 */
	if (llbitmap->mddev->degraded) {
		set_bit(block, pctl->dirty);
		return;
	}

	/*
	 * The subpage usually contains a total of 512 bits. If any single bit
	 * within the subpage is marked as dirty, the entire sector will be
	 * written. To avoid impacting write performance, when multiple bits
	 * within the same sector are modified within llbitmap->barrier_idle,
	 * all bits in the sector will be collectively marked as dirty at once.
	 */
	if (test_and_set_bit(block, pctl->dirty)) {
		llbitmap_infect_dirty_bits(llbitmap, pctl, block);
		return;
	}

	for (pos = block * io_size; pos < (block + 1) * io_size; pos++) {
		if (pos == offset)
			continue;
		if (pctl->state[pos] == BitDirty ||
		    pctl->state[pos] == BitNeedSync) {
			llbitmap_infect_dirty_bits(llbitmap, pctl, block);
			return;
		}
	}
}

static void llbitmap_write(struct llbitmap *llbitmap, enum llbitmap_state state,
			   loff_t pos)
{
	unsigned int idx;
	unsigned int bit;

	pos += BITMAP_DATA_OFFSET;
	idx = pos >> PAGE_SHIFT;
	bit = offset_in_page(pos);

	llbitmap->pctl[idx]->state[bit] = state;
	if (state == BitDirty || state == BitNeedSync)
		llbitmap_set_page_dirty(llbitmap, idx, bit);
}

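/*
 * Return the cached bitmap page at @idx, or read it from the first healthy
 * rdev; rdevs that fail the read are marked Faulty via md_error().
 */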
static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx)
{
	struct mddev *mddev = llbitmap->mddev;
	struct page *page = NULL;
	struct md_rdev *rdev;

	if (llbitmap->pctl && llbitmap->pctl[idx])
		page = llbitmap->pctl[idx]->page;
	if (page)
		return page;

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page)
		return ERR_PTR(-ENOMEM);

	rdev_for_each(rdev, mddev) {
		sector_t sector;

		if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
			continue;

		sector = mddev->bitmap_info.offset +
			 (idx << PAGE_SECTORS_SHIFT);

		if (sync_page_io(rdev, sector, PAGE_SIZE, page, REQ_OP_READ,
				 true))
			return page;

		md_error(mddev, rdev);
	}

	__free_page(page);
	return ERR_PTR(-EIO);
}

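/* Write each dirty block of the page at @idx to every non-faulty rdev. */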
static void llbitmap_write_page(struct llbitmap *llbitmap, int idx)
{
	struct page *page = llbitmap->pctl[idx]->page;
	struct mddev *mddev = llbitmap->mddev;
	struct md_rdev *rdev;
	int block;

	for (block = 0; block < llbitmap->blocks_per_page; block++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];

		if (!test_and_clear_bit(block, pctl->dirty))
			continue;

		rdev_for_each(rdev, mddev) {
			sector_t sector;
			sector_t bit_sector = llbitmap->io_size >> SECTOR_SHIFT;

			if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
				continue;

			sector = mddev->bitmap_info.offset + rdev->sb_start +
				 (idx << PAGE_SECTORS_SHIFT) +
				 block * bit_sector;
			md_write_metadata(mddev, rdev, sector,
					  llbitmap->io_size, page,
					  block * llbitmap->io_size);
		}
	}
}

static void active_release(struct percpu_ref *ref)
{
	struct llbitmap_page_ctl *pctl =
		container_of(ref, struct llbitmap_page_ctl, active);

	wake_up(&pctl->wait);
}

static void llbitmap_free_pages(struct llbitmap *llbitmap)
{
	int i;

	if (!llbitmap->pctl)
		return;

	for (i = 0; i < llbitmap->nr_pages; i++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];

		if (!pctl || !pctl->page)
			break;

		__free_page(pctl->page);
		percpu_ref_exit(&pctl->active);
	}

	kfree(llbitmap->pctl[0]);
	kfree(llbitmap->pctl);
	llbitmap->pctl = NULL;
}

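/*
 * Allocate one llbitmap_page_ctl per bitmap page from a single backing array
 * and read all bitmap pages from disk.
 */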
static int llbitmap_cache_pages(struct llbitmap *llbitmap)
{
	struct llbitmap_page_ctl *pctl;
	unsigned int nr_pages = DIV_ROUND_UP(llbitmap->chunks +
					     BITMAP_DATA_OFFSET, PAGE_SIZE);
	unsigned int size = struct_size(pctl, dirty, BITS_TO_LONGS(
						llbitmap->blocks_per_page));
	int i;

	llbitmap->pctl = kmalloc_array(nr_pages, sizeof(void *),
				       GFP_KERNEL | __GFP_ZERO);
	if (!llbitmap->pctl)
		return -ENOMEM;

	size = round_up(size, cache_line_size());
	pctl = kmalloc_array(nr_pages, size, GFP_KERNEL | __GFP_ZERO);
	if (!pctl) {
		kfree(llbitmap->pctl);
		return -ENOMEM;
	}

	llbitmap->nr_pages = nr_pages;

	for (i = 0; i < nr_pages; i++, pctl = (void *)pctl + size) {
		struct page *page = llbitmap_read_page(llbitmap, i);

		llbitmap->pctl[i] = pctl;

		if (IS_ERR(page)) {
			llbitmap_free_pages(llbitmap);
			return PTR_ERR(page);
		}

		if (percpu_ref_init(&pctl->active, active_release,
				    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
			__free_page(page);
			llbitmap_free_pages(llbitmap);
			return -ENOMEM;
		}

		pctl->page = page;
		pctl->state = page_address(page);
		init_waitqueue_head(&pctl->wait);
	}

	return 0;
}

static void llbitmap_init_state(struct llbitmap *llbitmap)
{
	enum llbitmap_state state = BitUnwritten;
	unsigned long i;

	if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags))
		state = BitClean;

	for (i = 0; i < llbitmap->chunks; i++)
		llbitmap_write(llbitmap, state, i);
}

/* The return value is only used from resync, where @start == @end. */
static enum llbitmap_state llbitmap_state_machine(struct llbitmap *llbitmap,
						  unsigned long start,
						  unsigned long end,
						  enum llbitmap_action action)
{
	struct mddev *mddev = llbitmap->mddev;
	enum llbitmap_state state = BitNone;
	bool level_456 = raid_is_456(llbitmap->mddev);
	bool need_resync = false;
	bool need_recovery = false;

	if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags))
		return BitNone;

	if (action == BitmapActionInit) {
		llbitmap_init_state(llbitmap);
		return BitNone;
	}

	while (start <= end) {
		enum llbitmap_state c = llbitmap_read(llbitmap, start);

		if (c < 0 || c >= BitStateCount) {
			pr_err("%s: invalid bit %lu state %d action %d, forcing resync\n",
			       __func__, start, c, action);
			state = BitNeedSync;
			goto write_bitmap;
		}

		if (c == BitNeedSync)
			need_resync = !mddev->degraded;

		state = state_machine[c][action];

write_bitmap:
		if (unlikely(mddev->degraded)) {
			/* For a degraded array, mark new data as need sync. */
			if (state == BitDirty &&
			    action == BitmapActionStartwrite)
				state = BitNeedSync;
			/*
			 * For a degraded array, resync dirty data as well.
			 * Note that if the array is still degraded after
			 * resync is done, all new data will still be dirty
			 * until the array is clean.
			 */
			else if (c == BitDirty &&
				 action == BitmapActionStartsync)
				state = BitSyncing;
		} else if (c == BitUnwritten && state == BitDirty &&
			   action == BitmapActionStartwrite && level_456) {
			/* Delay raid456 initial recovery to first write. */
			state = BitNeedSync;
		}

		if (state == BitNone) {
			start++;
			continue;
		}

		llbitmap_write(llbitmap, state, start);

		if (state == BitNeedSync)
			need_resync = !mddev->degraded;
		else if (state == BitDirty &&
			 !timer_pending(&llbitmap->pending_timer))
			mod_timer(&llbitmap->pending_timer,
				  jiffies + mddev->bitmap_info.daemon_sleep * HZ);

		start++;
	}

	if (need_resync && level_456)
		need_recovery = true;

	if (need_recovery) {
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		set_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	} else if (need_resync) {
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	}

	return state;
}

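/*
 * Grab a reference on the page so that the daemon cannot clear its dirty bits
 * while write IO is inflight; blocks while the daemon has the page suspended.
 */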
static void llbitmap_raise_barrier(struct llbitmap *llbitmap, int page_idx)
{
	struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];

retry:
	if (likely(percpu_ref_tryget_live(&pctl->active))) {
		WRITE_ONCE(pctl->expire, jiffies + llbitmap->barrier_idle * HZ);
		return;
	}

	wait_event(pctl->wait, !percpu_ref_is_dying(&pctl->active));
	goto retry;
}

static void llbitmap_release_barrier(struct llbitmap *llbitmap, int page_idx)
{
	struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];

	percpu_ref_put(&pctl->active);
}

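/*
 * Suspend the page and wait for inflight write IO to be done; bail out with
 * -ETIMEDOUT if the IO doesn't finish within daemon_sleep seconds.
 */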
static int llbitmap_suspend_timeout(struct llbitmap *llbitmap, int page_idx)
{
	struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];

	percpu_ref_kill(&pctl->active);

	if (!wait_event_timeout(pctl->wait, percpu_ref_is_zero(&pctl->active),
				llbitmap->mddev->bitmap_info.daemon_sleep * HZ)) {
		percpu_ref_resurrect(&pctl->active);
		return -ETIMEDOUT;
	}

	return 0;
}

static void llbitmap_resume(struct llbitmap *llbitmap, int page_idx)
{
	struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];

	pctl->expire = LONG_MAX;
	percpu_ref_resurrect(&pctl->active);
	wake_up(&pctl->wait);
}

static int llbitmap_check_support(struct mddev *mddev)
{
	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
		pr_notice("md/llbitmap: %s: array with journal cannot have bitmap\n",
			  mdname(mddev));
		return -EBUSY;
	}

	if (mddev->bitmap_info.space == 0) {
		if (mddev->bitmap_info.default_space == 0) {
			pr_notice("md/llbitmap: %s: no space for bitmap\n",
				  mdname(mddev));
			return -ENOSPC;
		}
	}

	if (!mddev->persistent) {
		pr_notice("md/llbitmap: %s: array must be persistent\n",
			  mdname(mddev));
		return -EOPNOTSUPP;
	}

	if (mddev->bitmap_info.file) {
		pr_notice("md/llbitmap: %s: doesn't support bitmap file\n",
			  mdname(mddev));
		return -EOPNOTSUPP;
	}

	if (mddev->bitmap_info.external) {
		pr_notice("md/llbitmap: %s: doesn't support external metadata\n",
			  mdname(mddev));
		return -EOPNOTSUPP;
	}

	if (mddev_is_dm(mddev)) {
		pr_notice("md/llbitmap: %s: doesn't support dm-raid\n",
			  mdname(mddev));
		return -EOPNOTSUPP;
	}

	return 0;
}

static int llbitmap_init(struct llbitmap *llbitmap)
{
	struct mddev *mddev = llbitmap->mddev;
	sector_t blocks = mddev->resync_max_sectors;
	unsigned long chunksize = MIN_CHUNK_SIZE;
	unsigned long chunks = DIV_ROUND_UP(blocks, chunksize);
	unsigned long space = mddev->bitmap_info.space << SECTOR_SHIFT;
	int ret;

	while (chunks > space) {
		chunksize = chunksize << 1;
		chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
	}

	llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE;
	llbitmap->chunkshift = ffz(~chunksize);
	llbitmap->chunksize = chunksize;
	llbitmap->chunks = chunks;
	mddev->bitmap_info.daemon_sleep = DEFAULT_DAEMON_SLEEP;

	ret = llbitmap_cache_pages(llbitmap);
	if (ret)
		return ret;

	llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
			       BitmapActionInit);
	/* flush initial llbitmap to disk */
	__llbitmap_flush(mddev);

	return 0;
}

static int llbitmap_read_sb(struct llbitmap *llbitmap)
{
	struct mddev *mddev = llbitmap->mddev;
	unsigned long daemon_sleep;
	unsigned long chunksize;
	unsigned long events;
	struct page *sb_page;
	bitmap_super_t *sb;
	int ret = -EINVAL;

	if (!mddev->bitmap_info.offset) {
		pr_err("md/llbitmap: %s: no super block found\n", mdname(mddev));
		return -EINVAL;
	}

	sb_page = llbitmap_read_page(llbitmap, 0);
	if (IS_ERR(sb_page)) {
		pr_err("md/llbitmap: %s: read super block failed\n",
		       mdname(mddev));
		return -EIO;
	}

	sb = kmap_local_page(sb_page);
	if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) {
		pr_err("md/llbitmap: %s: invalid super block magic number\n",
		       mdname(mddev));
		goto out_put_page;
	}

	if (sb->version != cpu_to_le32(BITMAP_MAJOR_LOCKLESS)) {
		pr_err("md/llbitmap: %s: invalid super block version\n",
		       mdname(mddev));
		goto out_put_page;
	}

	if (memcmp(sb->uuid, mddev->uuid, 16)) {
		pr_err("md/llbitmap: %s: bitmap superblock UUID mismatch\n",
		       mdname(mddev));
		goto out_put_page;
	}

	if (mddev->bitmap_info.space == 0) {
		int room = le32_to_cpu(sb->sectors_reserved);

		if (room)
			mddev->bitmap_info.space = room;
		else
			mddev->bitmap_info.space = mddev->bitmap_info.default_space;
	}
	llbitmap->flags = le32_to_cpu(sb->state);
	if (test_and_clear_bit(BITMAP_FIRST_USE, &llbitmap->flags)) {
		ret = llbitmap_init(llbitmap);
		goto out_put_page;
	}

	chunksize = le32_to_cpu(sb->chunksize);
	if (!is_power_of_2(chunksize)) {
		pr_err("md/llbitmap: %s: chunksize not a power of 2\n",
		       mdname(mddev));
		goto out_put_page;
	}

	if (chunksize < DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors,
					      mddev->bitmap_info.space << SECTOR_SHIFT)) {
		pr_err("md/llbitmap: %s: chunksize too small %lu < %llu / %lu\n",
		       mdname(mddev), chunksize, mddev->resync_max_sectors,
		       mddev->bitmap_info.space);
		goto out_put_page;
	}

	daemon_sleep = le32_to_cpu(sb->daemon_sleep);
	if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT / HZ) {
		pr_err("md/llbitmap: %s: daemon sleep %lu period out of range\n",
		       mdname(mddev), daemon_sleep);
		goto out_put_page;
	}

	events = le64_to_cpu(sb->events);
	if (events < mddev->events) {
		pr_warn("md/llbitmap: %s: bitmap file is out of date (%lu < %llu) -- forcing full recovery\n",
			mdname(mddev), events, mddev->events);
		set_bit(BITMAP_STALE, &llbitmap->flags);
	}

	sb->sync_size = cpu_to_le64(mddev->resync_max_sectors);
	mddev->bitmap_info.chunksize = chunksize;
	mddev->bitmap_info.daemon_sleep = daemon_sleep;

	llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE;
	llbitmap->chunksize = chunksize;
	llbitmap->chunks = DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors, chunksize);
	llbitmap->chunkshift = ffz(~chunksize);
	ret = llbitmap_cache_pages(llbitmap);

out_put_page:
	kunmap_local(sb);
	__free_page(sb_page);
	return ret;
}

static void llbitmap_pending_timer_fn(struct timer_list *pending_timer)
{
	struct llbitmap *llbitmap =
		container_of(pending_timer, struct llbitmap, pending_timer);

	if (work_busy(&llbitmap->daemon_work)) {
		pr_warn("md/llbitmap: %s daemon_work not finished in %lu seconds\n",
			mdname(llbitmap->mddev),
			llbitmap->mddev->bitmap_info.daemon_sleep);
		set_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags);
		return;
	}

	queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work);
}

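/*
 * Clear dirty bits of pages whose barrier_idle window has expired; kicked by
 * pending_timer every daemon_sleep seconds after the first bit goes dirty.
 */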
static void md_llbitmap_daemon_fn(struct work_struct *work)
{
	struct llbitmap *llbitmap =
		container_of(work, struct llbitmap, daemon_work);
	unsigned long start;
	unsigned long end;
	bool restart;
	int idx;

	if (llbitmap->mddev->degraded)
		return;
retry:
	start = 0;
	end = min(llbitmap->chunks, PAGE_SIZE - BITMAP_DATA_OFFSET) - 1;
	restart = false;

	for (idx = 0; idx < llbitmap->nr_pages; idx++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];

		if (idx > 0) {
			start = end + 1;
			end = min(end + PAGE_SIZE, llbitmap->chunks - 1);
		}

		if (!test_bit(LLPageFlush, &pctl->flags) &&
		    time_before(jiffies, pctl->expire)) {
			restart = true;
			continue;
		}

		if (llbitmap_suspend_timeout(llbitmap, idx) < 0) {
			pr_warn("md/llbitmap: %s: %s waiting for page %d timeout\n",
				mdname(llbitmap->mddev), __func__, idx);
			continue;
		}

		llbitmap_state_machine(llbitmap, start, end, BitmapActionDaemon);
		llbitmap_resume(llbitmap, idx);
	}

	/*
	 * If the daemon took a long time to finish, retry to prevent missing
	 * clearing dirty bits.
	 */
	if (test_and_clear_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags))
		goto retry;

	/* If some page is dirty but not expired, set up the timer again */
	if (restart)
		mod_timer(&llbitmap->pending_timer,
			  jiffies + llbitmap->mddev->bitmap_info.daemon_sleep * HZ);
}

static int llbitmap_create(struct mddev *mddev)
{
	struct llbitmap *llbitmap;
	int ret;

	ret = llbitmap_check_support(mddev);
	if (ret)
		return ret;

	llbitmap = kzalloc_obj(*llbitmap, GFP_KERNEL);
	if (!llbitmap)
		return -ENOMEM;

	llbitmap->mddev = mddev;
	llbitmap->io_size = bdev_logical_block_size(mddev->gendisk->part0);
	llbitmap->blocks_per_page = PAGE_SIZE / llbitmap->io_size;

	timer_setup(&llbitmap->pending_timer, llbitmap_pending_timer_fn, 0);
	INIT_WORK(&llbitmap->daemon_work, md_llbitmap_daemon_fn);
	atomic_set(&llbitmap->behind_writes, 0);
	init_waitqueue_head(&llbitmap->behind_wait);

	mutex_lock(&mddev->bitmap_info.mutex);
	mddev->bitmap = llbitmap;
	ret = llbitmap_read_sb(llbitmap);
	mutex_unlock(&mddev->bitmap_info.mutex);
	if (ret) {
		kfree(llbitmap);
		mddev->bitmap = NULL;
	}

	return ret;
}

static int llbitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long chunks;

	if (chunksize == 0)
		chunksize = llbitmap->chunksize;

	/* If there is enough space, leave the chunksize unchanged. */
	chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
	while (chunks > mddev->bitmap_info.space << SECTOR_SHIFT) {
		chunksize = chunksize << 1;
		chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
	}

	llbitmap->chunkshift = ffz(~chunksize);
	llbitmap->chunksize = chunksize;
	llbitmap->chunks = chunks;

	return 0;
}

static int llbitmap_load(struct mddev *mddev)
{
	enum llbitmap_action action = BitmapActionReload;
	struct llbitmap *llbitmap = mddev->bitmap;

	if (test_and_clear_bit(BITMAP_STALE, &llbitmap->flags))
		action = BitmapActionStale;

	llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, action);
	return 0;
}

static void llbitmap_destroy(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	if (!llbitmap)
		return;

	mutex_lock(&mddev->bitmap_info.mutex);

	timer_delete_sync(&llbitmap->pending_timer);
	flush_workqueue(md_llbitmap_io_wq);
	flush_workqueue(md_llbitmap_unplug_wq);

	mddev->bitmap = NULL;
	llbitmap_free_pages(llbitmap);
	kfree(llbitmap);
	mutex_unlock(&mddev->bitmap_info.mutex);
}

static void llbitmap_start_write(struct mddev *mddev, sector_t offset,
				 unsigned long sectors)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long start = offset >> llbitmap->chunkshift;
	unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
	int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
	int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;

	llbitmap_state_machine(llbitmap, start, end, BitmapActionStartwrite);

	while (page_start <= page_end) {
		llbitmap_raise_barrier(llbitmap, page_start);
		page_start++;
	}
}

static void llbitmap_end_write(struct mddev *mddev, sector_t offset,
			       unsigned long sectors)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long start = offset >> llbitmap->chunkshift;
	unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
	int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
	int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;

	while (page_start <= page_end) {
		llbitmap_release_barrier(llbitmap, page_start);
		page_start++;
	}
}

static void llbitmap_start_discard(struct mddev *mddev, sector_t offset,
				   unsigned long sectors)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long start = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize);
	unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
	int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
	int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;

	llbitmap_state_machine(llbitmap, start, end, BitmapActionDiscard);

	while (page_start <= page_end) {
		llbitmap_raise_barrier(llbitmap, page_start);
		page_start++;
	}
}

static void llbitmap_end_discard(struct mddev *mddev, sector_t offset,
				 unsigned long sectors)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long start = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize);
	unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
	int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
	int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;

	while (page_start <= page_end) {
		llbitmap_release_barrier(llbitmap, page_start);
		page_start++;
	}
}

static void llbitmap_unplug_fn(struct work_struct *work)
{
	struct llbitmap_unplug_work *unplug_work =
		container_of(work, struct llbitmap_unplug_work, work);
	struct llbitmap *llbitmap = unplug_work->llbitmap;
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);

	for (i = 0; i < llbitmap->nr_pages; i++) {
		if (!test_bit(LLPageDirty, &llbitmap->pctl[i]->flags) ||
		    !test_and_clear_bit(LLPageDirty, &llbitmap->pctl[i]->flags))
			continue;

		llbitmap_write_page(llbitmap, i);
	}

	blk_finish_plug(&plug);
	md_super_wait(llbitmap->mddev);
	complete(unplug_work->done);
}

static bool llbitmap_dirty(struct llbitmap *llbitmap)
{
	int i;

	for (i = 0; i < llbitmap->nr_pages; i++)
		if (test_bit(LLPageDirty, &llbitmap->pctl[i]->flags))
			return true;

	return false;
}

static void llbitmap_unplug(struct mddev *mddev, bool sync)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct llbitmap *llbitmap = mddev->bitmap;
	struct llbitmap_unplug_work unplug_work = {
		.llbitmap = llbitmap,
		.done = &done,
	};

	if (!llbitmap_dirty(llbitmap))
		return;

	/*
	 * Issuing new bitmap IO under the submit_bio() context will deadlock:
	 * - the bio will wait for the bitmap bio to be done, before it can be
	 * issued;
	 * - the bitmap bio will be added to current->bio_list and wait for
	 * this bio to be issued;
	 */
	INIT_WORK_ONSTACK(&unplug_work.work, llbitmap_unplug_fn);
	queue_work(md_llbitmap_unplug_wq, &unplug_work.work);
	wait_for_completion(&done);
	destroy_work_on_stack(&unplug_work.work);
}

/*
 * Force all bitmap pages to be written to disk; called when stopping the
 * array, or every daemon_sleep seconds while sync_thread is running.
 */
static void __llbitmap_flush(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);
	for (i = 0; i < llbitmap->nr_pages; i++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];

		/* mark all blocks as dirty */
		set_bit(LLPageDirty, &pctl->flags);
		bitmap_fill(pctl->dirty, llbitmap->blocks_per_page);
		llbitmap_write_page(llbitmap, i);
	}
	blk_finish_plug(&plug);
	md_super_wait(llbitmap->mddev);
}

static void llbitmap_flush(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	int i;

	for (i = 0; i < llbitmap->nr_pages; i++)
		set_bit(LLPageFlush, &llbitmap->pctl[i]->flags);

	timer_delete_sync(&llbitmap->pending_timer);
	queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work);
	flush_work(&llbitmap->daemon_work);

	__llbitmap_flush(mddev);
}

/* This is used for raid456 lazy initial recovery */
static bool llbitmap_blocks_synced(struct mddev *mddev, sector_t offset)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long p = offset >> llbitmap->chunkshift;
	enum llbitmap_state c = llbitmap_read(llbitmap, p);

	return c == BitClean || c == BitDirty;
}

static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long p = offset >> llbitmap->chunkshift;
	int blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
	enum llbitmap_state c = llbitmap_read(llbitmap, p);

	/* always skip unwritten blocks */
	if (c == BitUnwritten)
		return blocks;

	/* For a degraded array, don't skip */
	if (mddev->degraded)
		return 0;

	/* For resync, also skip clean/dirty blocks */
	if ((c == BitClean || c == BitDirty) &&
	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
		return blocks;

	return 0;
}

static bool llbitmap_start_sync(struct mddev *mddev, sector_t offset,
				sector_t *blocks, bool degraded)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long p = offset >> llbitmap->chunkshift;

	/*
	 * Handle one bit at a time, this is much simpler. And it doesn't
	 * matter if md_do_sync() loops more times.
	 */
	*blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
	return llbitmap_state_machine(llbitmap, p, p,
				      BitmapActionStartsync) == BitSyncing;
}

/* Something went wrong, sync_thread stopped at @offset */
static void llbitmap_end_sync(struct mddev *mddev, sector_t offset,
			      sector_t *blocks)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long p = offset >> llbitmap->chunkshift;

	*blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
	llbitmap_state_machine(llbitmap, p, llbitmap->chunks - 1,
			       BitmapActionAbortsync);
}

/* A full sync_thread is finished */
static void llbitmap_close_sync(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	int i;

	for (i = 0; i < llbitmap->nr_pages; i++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];

		/* let daemon_fn clear dirty bits immediately */
		WRITE_ONCE(pctl->expire, jiffies);
	}

	llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
			       BitmapActionEndsync);
}

/*
 * sync_thread has reached @sector; update metadata every daemon_sleep
 * seconds, just in case sync_thread has to restart after a power failure.
 */
static void llbitmap_cond_end_sync(struct mddev *mddev, sector_t sector,
				   bool force)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	if (sector == 0) {
		llbitmap->last_end_sync = jiffies;
		return;
	}

	if (time_before(jiffies, llbitmap->last_end_sync +
				 HZ * mddev->bitmap_info.daemon_sleep))
		return;

	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));

	mddev->curr_resync_completed = sector;
	set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
	llbitmap_state_machine(llbitmap, 0, sector >> llbitmap->chunkshift,
			       BitmapActionEndsync);
	__llbitmap_flush(mddev);

	llbitmap->last_end_sync = jiffies;
	sysfs_notify_dirent_safe(mddev->sysfs_completed);
}

static bool llbitmap_enabled(void *data, bool flush)
{
	struct llbitmap *llbitmap = data;

	return llbitmap && !test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags);
}

static void llbitmap_dirty_bits(struct mddev *mddev, unsigned long s,
				unsigned long e)
{
	llbitmap_state_machine(mddev->bitmap, s, e, BitmapActionStartwrite);
}

static void llbitmap_write_sb(struct llbitmap *llbitmap)
{
	int nr_blocks = DIV_ROUND_UP(BITMAP_DATA_OFFSET, llbitmap->io_size);

	bitmap_fill(llbitmap->pctl[0]->dirty, nr_blocks);
	llbitmap_write_page(llbitmap, 0);
	md_super_wait(llbitmap->mddev);
}

static void llbitmap_update_sb(void *data)
{
	struct llbitmap *llbitmap = data;
	struct mddev *mddev = llbitmap->mddev;
	struct page *sb_page;
	bitmap_super_t *sb;

	if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags))
		return;

	sb_page = llbitmap_read_page(llbitmap, 0);
	if (IS_ERR(sb_page)) {
		pr_err("%s: %s: read super block failed\n", __func__,
		       mdname(mddev));
		set_bit(BITMAP_WRITE_ERROR, &llbitmap->flags);
		return;
	}

	if (mddev->events < llbitmap->events_cleared)
		llbitmap->events_cleared = mddev->events;

	sb = kmap_local_page(sb_page);
	sb->events = cpu_to_le64(mddev->events);
	sb->state = cpu_to_le32(llbitmap->flags);
	sb->chunksize = cpu_to_le32(llbitmap->chunksize);
	sb->sync_size = cpu_to_le64(mddev->resync_max_sectors);
	sb->events_cleared = cpu_to_le64(llbitmap->events_cleared);
	sb->sectors_reserved = cpu_to_le32(mddev->bitmap_info.space);
	sb->daemon_sleep = cpu_to_le32(mddev->bitmap_info.daemon_sleep);

	kunmap_local(sb);
	llbitmap_write_sb(llbitmap);
}

static int llbitmap_get_stats(void *data, struct md_bitmap_stats *stats)
{
	struct llbitmap *llbitmap = data;

	memset(stats, 0, sizeof(*stats));

	stats->missing_pages = 0;
	stats->pages = llbitmap->nr_pages;
	stats->file_pages = llbitmap->nr_pages;

	stats->behind_writes = atomic_read(&llbitmap->behind_writes);
	stats->behind_wait = wq_has_sleeper(&llbitmap->behind_wait);
	stats->events_cleared = llbitmap->events_cleared;

	return 0;
}

/* just flag all pages as needing to be written */
static void llbitmap_write_all(struct mddev *mddev)
{
	int i;
	struct llbitmap *llbitmap = mddev->bitmap;

	for (i = 0; i < llbitmap->nr_pages; i++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];

		set_bit(LLPageDirty, &pctl->flags);
		bitmap_fill(pctl->dirty, llbitmap->blocks_per_page);
	}
}

static void llbitmap_start_behind_write(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	atomic_inc(&llbitmap->behind_writes);
}

static void llbitmap_end_behind_write(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	if (atomic_dec_and_test(&llbitmap->behind_writes))
		wake_up(&llbitmap->behind_wait);
}

static void llbitmap_wait_behind_writes(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	if (!llbitmap)
		return;

	wait_event(llbitmap->behind_wait,
		   atomic_read(&llbitmap->behind_writes) == 0);
}

static ssize_t bits_show(struct mddev *mddev, char *page)
{
	struct llbitmap *llbitmap;
	int bits[BitStateCount] = {0};
	loff_t start = 0;

	mutex_lock(&mddev->bitmap_info.mutex);
	llbitmap = mddev->bitmap;
	if (!llbitmap || !llbitmap->pctl) {
		mutex_unlock(&mddev->bitmap_info.mutex);
		return sprintf(page, "no bitmap\n");
	}

	if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) {
		mutex_unlock(&mddev->bitmap_info.mutex);
		return sprintf(page, "bitmap io error\n");
	}

	while (start < llbitmap->chunks) {
		enum llbitmap_state c = llbitmap_read(llbitmap, start);

		if (c < 0 || c >= BitStateCount)
			pr_err("%s: invalid bit %lld state %d\n",
			       __func__, start, c);
		else
			bits[c]++;
		start++;
	}

	mutex_unlock(&mddev->bitmap_info.mutex);
	return sprintf(page, "unwritten %d\nclean %d\ndirty %d\nneed sync %d\nsyncing %d\n",
		       bits[BitUnwritten], bits[BitClean], bits[BitDirty],
		       bits[BitNeedSync], bits[BitSyncing]);
}

static struct md_sysfs_entry llbitmap_bits = __ATTR_RO(bits);

static ssize_t metadata_show(struct mddev *mddev, char *page)
{
	struct llbitmap *llbitmap;
	ssize_t ret;

	mutex_lock(&mddev->bitmap_info.mutex);
	llbitmap = mddev->bitmap;
	if (!llbitmap) {
		mutex_unlock(&mddev->bitmap_info.mutex);
		return sprintf(page, "no bitmap\n");
	}

	ret = sprintf(page, "chunksize %lu\nchunkshift %lu\nchunks %lu\noffset %llu\ndaemon_sleep %lu\n",
		      llbitmap->chunksize, llbitmap->chunkshift,
		      llbitmap->chunks, mddev->bitmap_info.offset,
		      llbitmap->mddev->bitmap_info.daemon_sleep);
	mutex_unlock(&mddev->bitmap_info.mutex);

	return ret;
}

static struct md_sysfs_entry llbitmap_metadata = __ATTR_RO(metadata);

static ssize_t
daemon_sleep_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%lu\n", mddev->bitmap_info.daemon_sleep);
}

static ssize_t
daemon_sleep_store(struct mddev *mddev, const char *buf, size_t len)
{
	unsigned long timeout;
	int rv = kstrtoul(buf, 10, &timeout);

	if (rv)
		return rv;

	mddev->bitmap_info.daemon_sleep = timeout;
	return len;
}

static struct md_sysfs_entry llbitmap_daemon_sleep = __ATTR_RW(daemon_sleep);

static ssize_t
barrier_idle_show(struct mddev *mddev, char *page)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	return sprintf(page, "%lu\n", llbitmap->barrier_idle);
}

static ssize_t
barrier_idle_store(struct mddev *mddev, const char *buf, size_t len)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long timeout;
	int rv = kstrtoul(buf, 10, &timeout);

	if (rv)
		return rv;

	llbitmap->barrier_idle = timeout;
	return len;
}

static struct md_sysfs_entry llbitmap_barrier_idle = __ATTR_RW(barrier_idle);

static struct attribute *md_llbitmap_attrs[] = {
	&llbitmap_bits.attr,
	&llbitmap_metadata.attr,
	&llbitmap_daemon_sleep.attr,
	&llbitmap_barrier_idle.attr,
	NULL
};

static struct attribute_group md_llbitmap_group = {
	.name = "llbitmap",
	.attrs = md_llbitmap_attrs,
};

static struct bitmap_operations llbitmap_ops = {
	.head = {
		.type = MD_BITMAP,
		.id = ID_LLBITMAP,
		.name = "llbitmap",
	},

	.enabled = llbitmap_enabled,
	.create = llbitmap_create,
	.resize = llbitmap_resize,
	.load = llbitmap_load,
	.destroy = llbitmap_destroy,

	.start_write = llbitmap_start_write,
	.end_write = llbitmap_end_write,
	.start_discard = llbitmap_start_discard,
	.end_discard = llbitmap_end_discard,
	.unplug = llbitmap_unplug,
	.flush = llbitmap_flush,

	.start_behind_write = llbitmap_start_behind_write,
	.end_behind_write = llbitmap_end_behind_write,
	.wait_behind_writes = llbitmap_wait_behind_writes,

	.blocks_synced = llbitmap_blocks_synced,
	.skip_sync_blocks = llbitmap_skip_sync_blocks,
	.start_sync = llbitmap_start_sync,
	.end_sync = llbitmap_end_sync,
	.close_sync = llbitmap_close_sync,
	.cond_end_sync = llbitmap_cond_end_sync,

	.update_sb = llbitmap_update_sb,
	.get_stats = llbitmap_get_stats,
	.dirty_bits = llbitmap_dirty_bits,
	.write_all = llbitmap_write_all,

	.group = &md_llbitmap_group,
};

int md_llbitmap_init(void)
{
	md_llbitmap_io_wq = alloc_workqueue("md_llbitmap_io",
					    WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
	if (!md_llbitmap_io_wq)
		return -ENOMEM;

	md_llbitmap_unplug_wq = alloc_workqueue("md_llbitmap_unplug",
						WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
	if (!md_llbitmap_unplug_wq) {
		destroy_workqueue(md_llbitmap_io_wq);
		md_llbitmap_io_wq = NULL;
		return -ENOMEM;
	}

	return register_md_submodule(&llbitmap_ops.head);
}

void md_llbitmap_exit(void)
{
	destroy_workqueue(md_llbitmap_io_wq);
	md_llbitmap_io_wq = NULL;
	destroy_workqueue(md_llbitmap_unplug_wq);
	md_llbitmap_unplug_wq = NULL;
	unregister_md_submodule(&llbitmap_ops.head);
}