// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/timer.h>
#include <linux/sched.h>
#include <linux/list.h>
#include <linux/file.h>
#include <linux/seq_file.h>
#include <trace/events/block.h>

#include "md.h"
#include "md-bitmap.h"
/*
 * #### Background
 *
 * Redundant data is used to enhance data fault tolerance, and the storage
 * method for redundant data varies depending on the RAID level. It is
 * important to maintain the consistency of this redundant data.
 *
 * The bitmap records which data blocks have been synchronized and which
 * ones need to be resynchronized or recovered. Each bit in the bitmap
 * represents a segment of data in the array. When a bit is set, it indicates
 * that the multiple redundant copies of that data segment may not be
 * consistent. Data synchronization can be performed based on the bitmap after
 * a power failure or after re-adding a disk. Without a bitmap, a full disk
 * synchronization is required.
 *
 * #### Key Features
 *
 *  - The IO fastpath is lockless; if the user issues lots of write IO to the
 *  same bitmap bit in a short time, only the first write has the additional
 *  overhead of updating the bitmap bit, and there is no additional overhead
 *  for the following writes;
 *  - only written data is resynced or recovered, meaning that when creating
 *  a new array or replacing a disk with a new one, there is no need to do a
 *  full disk resync/recovery;
 *
 * #### Key Concepts
 *
 * ##### State Machine
 *
 * Each bit is one byte, which can hold 6 different states, see
 * llbitmap_state. There are 8 different actions in total, see
 * llbitmap_action, that can change the state:
 *
 * llbitmap state machine: transitions between states
 *
 * |           | Startwrite | Startsync | Endsync | Abortsync|
 * | --------- | ---------- | --------- | ------- | -------  |
 * | Unwritten | Dirty      | x         | x       | x        |
 * | Clean     | Dirty      | x         | x       | x        |
 * | Dirty     | x          | x         | x       | x        |
 * | NeedSync  | x          | Syncing   | x       | x        |
 * | Syncing   | x          | Syncing   | Dirty   | NeedSync |
 *
 * |           | Reload   | Daemon | Discard   | Stale     |
 * | --------- | -------- | ------ | --------- | --------- |
 * | Unwritten | x        | x      | x         | x         |
 * | Clean     | x        | x      | Unwritten | NeedSync  |
 * | Dirty     | NeedSync | Clean  | Unwritten | NeedSync  |
 * | NeedSync  | x        | x      | Unwritten | x         |
 * | Syncing   | NeedSync | x      | Unwritten | NeedSync  |
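 *
 * As an illustration, a transition is a plain table lookup; the names below
 * match the state_machine table defined later in this file, and 'x' in the
 * tables above corresponds to BitNone (no state change):
 *
 *	state_machine[BitClean][BitmapActionStartwrite] == BitDirty
 *	state_machine[BitDirty][BitmapActionDaemon]     == BitClean
 *	state_machine[BitDirty][BitmapActionReload]     == BitNeedSync
 *	state_machine[BitClean][BitmapActionStartsync]  == BitNone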
 *
 * Typical scenarios:
 *
 * 1) Create a new array
 * All bits will be set to Unwritten by default; if --assume-clean is set,
 * all bits will be set to Clean instead.
 *
 * 2) Write data: raid1/raid10 have a full copy of the data, while raid456
 * doesn't, and relies on xor data
 *
 * 2.1) write new data to raid1/raid10:
 * Unwritten --StartWrite--> Dirty
 *
 * 2.2) write new data to raid456:
 * Unwritten --StartWrite--> NeedSync
 *
 * Because the initial recovery for raid456 is skipped, the xor data is not
 * built yet; the bit must be set to NeedSync first, and after the lazy
 * initial recovery is finished, the bit will finally be set to Dirty
 * (see 5.1 and 5.4);
 *
 * 2.3) overwrite existing data
 * Clean --StartWrite--> Dirty
 *
 * 3) daemon, if the array is not degraded:
 * Dirty --Daemon--> Clean
 *
 * 4) discard
 * {Clean, Dirty, NeedSync, Syncing} --Discard--> Unwritten
 *
 * 5) resync and recovery
 *
 * 5.1) common process
 * NeedSync --Startsync--> Syncing --Endsync--> Dirty --Daemon--> Clean
 *
 * 5.2) resync after power failure
 * Dirty --Reload--> NeedSync
 *
 * 5.3) recovery while replacing with a new disk
 * By default, the old bitmap framework will recover all data; llbitmap
 * implements this with a new helper, see llbitmap_skip_sync_blocks:
 *
 * recovery is skipped for bits other than Dirty or Clean;
 *
 * 5.4) lazy initial recovery for raid5:
 * By default, the old bitmap framework only allows new recovery when there
 * are spares (new disks); a new recovery flag, MD_RECOVERY_LAZY_RECOVER, is
 * added to perform raid456 lazy recovery for set bits (from 2.2).
 *
 * 6) special handling for a degraded array:
 *
 * - Dirty bits will never be cleared; the daemon just does nothing, so that
 *   if a disk is re-added, Clean bits can be skipped by recovery;
 * - Dirty bits will convert to Syncing on Startsync, to do data recovery
 *   for newly added disks;
 * - New writes will convert bits to NeedSync directly;
 *
 * ##### Bitmap IO
 *
 * ##### Chunksize
 *
 * The default bitmap size is 128k, including a 1k bitmap super block, and
 * the default size of the data segment in the array covered by each bit
 * (chunksize) is 64k; the chunksize doubles each time the total number of
 * bits would otherwise be no less than 127k (see llbitmap_init).
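 *
 * A rough worked example, assuming the default 128k bitmap size above: for
 * a 1TiB array (2^31 sectors), the default 64k chunksize would need
 * 2^31 / 2^7 = 2^24 bits, far more than the bitmap can hold, so the
 * chunksize keeps doubling until the bit count fits; it settles around an
 * 8MiB chunksize, covering the array with roughly 128k bits.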
 *
 * ##### READ
 *
 * While creating the bitmap, all pages will be allocated and read into
 * llbitmap; there will be no reads afterwards.
 *
 * ##### WRITE
 *
 * WRITE IO is divided into units of the array's logical_block_size, and the
 * dirty state of each block is tracked independently. For example:
 *
 * each page is 4k and contains 8 blocks; each block is 512 bytes and
 * contains 512 bits;
 *
 * | page0 | page1 | ... | page 31 |
 * |       |
 * |        \-----------------------\
 * |                                |
 * | block0 | block1 | ... | block 7|
 * |        |
 * |         \-----------------\
 * |                            |
 * | bit0 | bit1 | ... | bit511 |
 *
 * On the IO path, if one bit is changed to Dirty or NeedSync, the
 * corresponding subpage will be marked dirty, and such a block must be
 * written first before the IO is issued. This behaviour will affect IO
 * performance; to reduce the impact, if multiple bits are changed in the
 * same block in a short time, all bits in this block will be changed to
 * Dirty/NeedSync, so that there won't be any overhead until the daemon
 * clears the dirty bits.
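 *
 * For illustration, with the 4k page and 512-byte block layout above
 * (io_size = 512), the arithmetic used by llbitmap_read()/llbitmap_write()
 * to locate chunk 10000 works out as (BITMAP_DATA_OFFSET is the 1k reserved
 * for the super block):
 *
 *	pos    = 10000 + 1024 = 11024
 *	page   = pos >> PAGE_SHIFT   = 2
 *	offset = offset_in_page(pos) = 2832
 *	block  = offset / io_size    = 5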
 *
 * ##### Dirty Bits Synchronization
 *
 * The IO fast path will set bits to Dirty, and those dirty bits will be
 * cleared by the daemon after IO is done. llbitmap_page_ctl is used to
 * synchronize between the IO path and the daemon;
 *
 * IO path:
 *  1) try to grab a reference; if this succeeds, set the expire time to 5s
 *  from now and return;
 *  2) if grabbing a reference fails, wait for the daemon to finish clearing
 *  dirty bits;
 *
 * Daemon (woken up every daemon_sleep seconds):
 * For each page:
 *  1) check if the page has expired; if not, skip this page; for an expired
 *  page:
 *  2) suspend the page and wait for inflight write IO to be done;
 *  3) change the dirty page to clean;
 *  4) resume the page;
 */
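
/*
 * Illustrative sketch (not a real entry point): how the write fast path and
 * the daemon cooperate on one bitmap page, using the helpers defined below.
 *
 *	llbitmap_raise_barrier(llbitmap, page_idx);
 *	llbitmap_state_machine(llbitmap, start, end, BitmapActionStartwrite);
 *	submit_bio(bio);				// issue the data IO
 *	...
 *	llbitmap_release_barrier(llbitmap, page_idx);	// when IO completes
 *
 * Later, once the page expires, the daemon suspends the page with
 * llbitmap_suspend_timeout(), clears dirty bits through BitmapActionDaemon,
 * and resumes the page.
 */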

#define BITMAP_DATA_OFFSET 1024

/* 64k is the max IO size of sync IO for raid1/raid10 */
#define MIN_CHUNK_SIZE (64 * 2)

/* By default, the daemon will be woken up every 30s */
#define DEFAULT_DAEMON_SLEEP 30

/*
 * Dirtied bits that have not been accessed for more than 5s will be cleared
 * by the daemon.
 */
#define DEFAULT_BARRIER_IDLE 5

enum llbitmap_state {
	/* No valid data, init state after assembling the array */
	BitUnwritten = 0,
	/* data is consistent */
	BitClean,
	/* data will be consistent after IO is done, set directly for writes */
	BitDirty,
	/*
	 * data needs to be resynchronized:
	 * 1) set directly for writes if the array is degraded, to prevent a
	 * full disk synchronization after re-adding a disk;
	 * 2) the array is reassembled after a power failure, and dirty bits
	 * are found while reloading the bitmap;
	 * 3) set on the first write for raid5, to build the initial xor data
	 * lazily
	 */
	BitNeedSync,
	/* data is synchronizing */
	BitSyncing,
	BitStateCount,
	BitNone = 0xff,
};

enum llbitmap_action {
	/* User writes new data, this is the only action from the IO fast path */
	BitmapActionStartwrite = 0,
	/* Start recovery */
	BitmapActionStartsync,
	/* Finish recovery */
	BitmapActionEndsync,
	/* Failed recovery */
	BitmapActionAbortsync,
	/* Reassemble the array */
	BitmapActionReload,
	/* Daemon thread is trying to clear dirty bits */
	BitmapActionDaemon,
	/* Data is deleted */
	BitmapActionDiscard,
	/*
	 * Bitmap is stale, mark all bits except BitUnwritten as
	 * BitNeedSync.
	 */
	BitmapActionStale,
	BitmapActionCount,
	/* Init state is BitUnwritten */
	BitmapActionInit,
};

enum llbitmap_page_state {
	LLPageFlush = 0,
	LLPageDirty,
};

struct llbitmap_page_ctl {
	char *state;
	struct page *page;
	unsigned long expire;
	unsigned long flags;
	wait_queue_head_t wait;
	struct percpu_ref active;
	/* Per block size dirty state, maximum 64k page / 1 sector = 128 */
	unsigned long dirty[];
};

struct llbitmap {
	struct mddev *mddev;
	struct llbitmap_page_ctl **pctl;

	unsigned int nr_pages;
	unsigned int io_size;
	unsigned int blocks_per_page;

	/* shift of one chunk */
	unsigned long chunkshift;
	/* size of one chunk in sectors */
	unsigned long chunksize;
	/* total number of chunks */
	unsigned long chunks;
	unsigned long last_end_sync;
	/*
	 * time in seconds after which dirty bits will be cleared if the page
	 * is not accessed.
	 */
	unsigned long barrier_idle;
	/* fires on the first BitDirty state */
	struct timer_list pending_timer;
	struct work_struct daemon_work;

	unsigned long flags;
	__u64	events_cleared;

	/* for slow disks */
	atomic_t behind_writes;
	wait_queue_head_t behind_wait;
};

struct llbitmap_unplug_work {
	struct work_struct work;
	struct llbitmap *llbitmap;
	struct completion *done;
};

static struct workqueue_struct *md_llbitmap_io_wq;
static struct workqueue_struct *md_llbitmap_unplug_wq;

static char state_machine[BitStateCount][BitmapActionCount] = {
	[BitUnwritten] = {
		[BitmapActionStartwrite]	= BitDirty,
		[BitmapActionStartsync]		= BitNone,
		[BitmapActionEndsync]		= BitNone,
		[BitmapActionAbortsync]		= BitNone,
		[BitmapActionReload]		= BitNone,
		[BitmapActionDaemon]		= BitNone,
		[BitmapActionDiscard]		= BitNone,
		[BitmapActionStale]		= BitNone,
	},
	[BitClean] = {
		[BitmapActionStartwrite]	= BitDirty,
		[BitmapActionStartsync]		= BitNone,
		[BitmapActionEndsync]		= BitNone,
		[BitmapActionAbortsync]		= BitNone,
		[BitmapActionReload]		= BitNone,
		[BitmapActionDaemon]		= BitNone,
		[BitmapActionDiscard]		= BitUnwritten,
		[BitmapActionStale]		= BitNeedSync,
	},
	[BitDirty] = {
		[BitmapActionStartwrite]	= BitNone,
		[BitmapActionStartsync]		= BitNone,
		[BitmapActionEndsync]		= BitNone,
		[BitmapActionAbortsync]		= BitNone,
		[BitmapActionReload]		= BitNeedSync,
		[BitmapActionDaemon]		= BitClean,
		[BitmapActionDiscard]		= BitUnwritten,
		[BitmapActionStale]		= BitNeedSync,
	},
	[BitNeedSync] = {
		[BitmapActionStartwrite]	= BitNone,
		[BitmapActionStartsync]		= BitSyncing,
		[BitmapActionEndsync]		= BitNone,
		[BitmapActionAbortsync]		= BitNone,
		[BitmapActionReload]		= BitNone,
		[BitmapActionDaemon]		= BitNone,
		[BitmapActionDiscard]		= BitUnwritten,
		[BitmapActionStale]		= BitNone,
	},
	[BitSyncing] = {
		[BitmapActionStartwrite]	= BitNone,
		[BitmapActionStartsync]		= BitSyncing,
		[BitmapActionEndsync]		= BitDirty,
		[BitmapActionAbortsync]		= BitNeedSync,
		[BitmapActionReload]		= BitNeedSync,
		[BitmapActionDaemon]		= BitNone,
		[BitmapActionDiscard]		= BitUnwritten,
		[BitmapActionStale]		= BitNeedSync,
	},
};

static void __llbitmap_flush(struct mddev *mddev);

static enum llbitmap_state llbitmap_read(struct llbitmap *llbitmap, loff_t pos)
{
	unsigned int idx;
	unsigned int offset;

	pos += BITMAP_DATA_OFFSET;
	idx = pos >> PAGE_SHIFT;
	offset = offset_in_page(pos);

	return llbitmap->pctl[idx]->state[offset];
}

/* set all the bits in the subpage as dirty */
static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap,
				       struct llbitmap_page_ctl *pctl,
				       unsigned int block)
{
	bool level_456 = raid_is_456(llbitmap->mddev);
	unsigned int io_size = llbitmap->io_size;
	int pos;

	for (pos = block * io_size; pos < (block + 1) * io_size; pos++) {
		switch (pctl->state[pos]) {
		case BitUnwritten:
			pctl->state[pos] = level_456 ? BitNeedSync : BitDirty;
			break;
		case BitClean:
			pctl->state[pos] = BitDirty;
			break;
		}
	}
}

static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx,
				    int offset)
{
	struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];
	unsigned int io_size = llbitmap->io_size;
	int block = offset / io_size;
	int pos;

	if (!test_bit(LLPageDirty, &pctl->flags))
		set_bit(LLPageDirty, &pctl->flags);

	/*
	 * For a degraded array, dirty bits will never be cleared, and we must
	 * resync all the dirty bits, hence skip infecting new dirty bits to
	 * prevent resyncing unnecessary data.
	 */
	if (llbitmap->mddev->degraded) {
		set_bit(block, pctl->dirty);
		return;
	}

	/*
	 * The subpage usually contains a total of 512 bits. If any single bit
	 * within the subpage is marked as dirty, the entire sector will be
	 * written. To avoid impacting write performance, when multiple bits
	 * within the same sector are modified within llbitmap->barrier_idle,
	 * all bits in the sector will be collectively marked as dirty at once.
	 */
	if (test_and_set_bit(block, pctl->dirty)) {
		llbitmap_infect_dirty_bits(llbitmap, pctl, block);
		return;
	}

	for (pos = block * io_size; pos < (block + 1) * io_size; pos++) {
		if (pos == offset)
			continue;
		if (pctl->state[pos] == BitDirty ||
		    pctl->state[pos] == BitNeedSync) {
			llbitmap_infect_dirty_bits(llbitmap, pctl, block);
			return;
		}
	}
}

static void llbitmap_write(struct llbitmap *llbitmap, enum llbitmap_state state,
			   loff_t pos)
{
	unsigned int idx;
	unsigned int bit;

	pos += BITMAP_DATA_OFFSET;
	idx = pos >> PAGE_SHIFT;
	bit = offset_in_page(pos);

	llbitmap->pctl[idx]->state[bit] = state;
	if (state == BitDirty || state == BitNeedSync)
		llbitmap_set_page_dirty(llbitmap, idx, bit);
}

static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx)
{
	struct mddev *mddev = llbitmap->mddev;
	struct page *page = NULL;
	struct md_rdev *rdev;

	if (llbitmap->pctl && llbitmap->pctl[idx])
		page = llbitmap->pctl[idx]->page;
	if (page)
		return page;

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page)
		return ERR_PTR(-ENOMEM);

	rdev_for_each(rdev, mddev) {
		sector_t sector;

		if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags) ||
		    !test_bit(In_sync, &rdev->flags))
			continue;

		sector = mddev->bitmap_info.offset +
			 (idx << PAGE_SECTORS_SHIFT);

		if (sync_page_io(rdev, sector, PAGE_SIZE, page, REQ_OP_READ,
				 true))
			return page;

		md_error(mddev, rdev);
	}

	__free_page(page);
	return ERR_PTR(-EIO);
}

static void llbitmap_write_page(struct llbitmap *llbitmap, int idx)
{
	struct page *page = llbitmap->pctl[idx]->page;
	struct mddev *mddev = llbitmap->mddev;
	struct md_rdev *rdev;
	int block;

	for (block = 0; block < llbitmap->blocks_per_page; block++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];

		if (!test_and_clear_bit(block, pctl->dirty))
			continue;

		rdev_for_each(rdev, mddev) {
			sector_t sector;
			sector_t bit_sector = llbitmap->io_size >> SECTOR_SHIFT;

			if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
				continue;

			sector = mddev->bitmap_info.offset + rdev->sb_start +
				 (idx << PAGE_SECTORS_SHIFT) +
				 block * bit_sector;
			md_write_metadata(mddev, rdev, sector,
					  llbitmap->io_size, page,
					  block * llbitmap->io_size);
		}
	}
}

static void active_release(struct percpu_ref *ref)
{
	struct llbitmap_page_ctl *pctl =
		container_of(ref, struct llbitmap_page_ctl, active);

	wake_up(&pctl->wait);
}

static void llbitmap_free_pages(struct llbitmap *llbitmap)
{
	int i;

	if (!llbitmap->pctl)
		return;

	for (i = 0; i < llbitmap->nr_pages; i++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];

		if (!pctl || !pctl->page)
			break;

		__free_page(pctl->page);
		percpu_ref_exit(&pctl->active);
	}

	kfree(llbitmap->pctl[0]);
	kfree(llbitmap->pctl);
	llbitmap->pctl = NULL;
}

static int llbitmap_cache_pages(struct llbitmap *llbitmap)
{
	struct llbitmap_page_ctl *pctl;
	unsigned int nr_pages = DIV_ROUND_UP(llbitmap->chunks +
					     BITMAP_DATA_OFFSET, PAGE_SIZE);
	unsigned int size = struct_size(pctl, dirty, BITS_TO_LONGS(
						llbitmap->blocks_per_page));
	int i;

	llbitmap->pctl = kmalloc_array(nr_pages, sizeof(void *),
				       GFP_KERNEL | __GFP_ZERO);
	if (!llbitmap->pctl)
		return -ENOMEM;

	size = round_up(size, cache_line_size());
	pctl = kmalloc_array(nr_pages, size, GFP_KERNEL | __GFP_ZERO);
	if (!pctl) {
		kfree(llbitmap->pctl);
		return -ENOMEM;
	}

	llbitmap->nr_pages = nr_pages;

	for (i = 0; i < nr_pages; i++, pctl = (void *)pctl + size) {
		struct page *page = llbitmap_read_page(llbitmap, i);

		llbitmap->pctl[i] = pctl;

		if (IS_ERR(page)) {
			llbitmap_free_pages(llbitmap);
			return PTR_ERR(page);
		}

		if (percpu_ref_init(&pctl->active, active_release,
				    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
			__free_page(page);
			llbitmap_free_pages(llbitmap);
			return -ENOMEM;
		}

		pctl->page = page;
		pctl->state = page_address(page);
		init_waitqueue_head(&pctl->wait);
	}

	return 0;
}

static void llbitmap_init_state(struct llbitmap *llbitmap)
{
	enum llbitmap_state state = BitUnwritten;
	unsigned long i;

	if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags))
		state = BitClean;

	for (i = 0; i < llbitmap->chunks; i++)
		llbitmap_write(llbitmap, state, i);
}

/* The return value is only used from resync, where @start == @end. */
static enum llbitmap_state llbitmap_state_machine(struct llbitmap *llbitmap,
						  unsigned long start,
						  unsigned long end,
						  enum llbitmap_action action)
{
	struct mddev *mddev = llbitmap->mddev;
	enum llbitmap_state state = BitNone;
	bool level_456 = raid_is_456(llbitmap->mddev);
	bool need_resync = false;
	bool need_recovery = false;

	if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags))
		return BitNone;

	if (action == BitmapActionInit) {
		llbitmap_init_state(llbitmap);
		return BitNone;
	}

	while (start <= end) {
		enum llbitmap_state c = llbitmap_read(llbitmap, start);

		if (c < 0 || c >= BitStateCount) {
			pr_err("%s: invalid bit %lu state %d action %d, forcing resync\n",
			       __func__, start, c, action);
			state = BitNeedSync;
			goto write_bitmap;
		}

		if (c == BitNeedSync)
			need_resync = !mddev->degraded;

		state = state_machine[c][action];

write_bitmap:
		if (unlikely(mddev->degraded)) {
			/* For a degraded array, mark new data as need sync. */
			if (state == BitDirty &&
			    action == BitmapActionStartwrite)
				state = BitNeedSync;
			/*
			 * For a degraded array, resync dirty data as well;
			 * note that if the array is still degraded after
			 * resync is done, all new data will still be dirty
			 * until the array is clean.
			 */
			else if (c == BitDirty &&
				action == BitmapActionStartsync)
				state = BitSyncing;
		} else if (c == BitUnwritten && state == BitDirty &&
			   action == BitmapActionStartwrite && level_456) {
			/* Delay raid456 initial recovery to the first write. */
			state = BitNeedSync;
		}

		if (state == BitNone) {
			start++;
			continue;
		}

		llbitmap_write(llbitmap, state, start);

		if (state == BitNeedSync)
			need_resync = !mddev->degraded;
		else if (state == BitDirty &&
			 !timer_pending(&llbitmap->pending_timer))
			mod_timer(&llbitmap->pending_timer,
				  jiffies + mddev->bitmap_info.daemon_sleep * HZ);

		start++;
	}

	if (need_resync && level_456)
		need_recovery = true;

	if (need_recovery) {
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		set_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	} else if (need_resync) {
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	}

	return state;
}

static void llbitmap_raise_barrier(struct llbitmap *llbitmap, int page_idx)
{
	struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];

retry:
	if (likely(percpu_ref_tryget_live(&pctl->active))) {
		WRITE_ONCE(pctl->expire, jiffies + llbitmap->barrier_idle * HZ);
		return;
	}

	wait_event(pctl->wait, !percpu_ref_is_dying(&pctl->active));
	goto retry;
}

static void llbitmap_release_barrier(struct llbitmap *llbitmap, int page_idx)
{
	struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];

	percpu_ref_put(&pctl->active);
}

static int llbitmap_suspend_timeout(struct llbitmap *llbitmap, int page_idx)
{
	struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];

	percpu_ref_kill(&pctl->active);

	if (!wait_event_timeout(pctl->wait, percpu_ref_is_zero(&pctl->active),
			llbitmap->mddev->bitmap_info.daemon_sleep * HZ)) {
		percpu_ref_resurrect(&pctl->active);
		return -ETIMEDOUT;
	}

	return 0;
}

static void llbitmap_resume(struct llbitmap *llbitmap, int page_idx)
{
	struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];

	pctl->expire = LONG_MAX;
	percpu_ref_resurrect(&pctl->active);
	wake_up(&pctl->wait);
}

static int llbitmap_check_support(struct mddev *mddev)
{
	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
		pr_notice("md/llbitmap: %s: array with journal cannot have bitmap\n",
			  mdname(mddev));
		return -EBUSY;
	}

	if (mddev->bitmap_info.space == 0) {
		if (mddev->bitmap_info.default_space == 0) {
			pr_notice("md/llbitmap: %s: no space for bitmap\n",
				  mdname(mddev));
			return -ENOSPC;
		}
	}

	if (!mddev->persistent) {
		pr_notice("md/llbitmap: %s: array must be persistent\n",
			  mdname(mddev));
		return -EOPNOTSUPP;
	}

	if (mddev->bitmap_info.file) {
		pr_notice("md/llbitmap: %s: doesn't support bitmap file\n",
			  mdname(mddev));
		return -EOPNOTSUPP;
	}

	if (mddev->bitmap_info.external) {
		pr_notice("md/llbitmap: %s: doesn't support external metadata\n",
			  mdname(mddev));
		return -EOPNOTSUPP;
	}

	if (mddev_is_dm(mddev)) {
		pr_notice("md/llbitmap: %s: doesn't support dm-raid\n",
			  mdname(mddev));
		return -EOPNOTSUPP;
	}

	return 0;
}

static int llbitmap_init(struct llbitmap *llbitmap)
{
	struct mddev *mddev = llbitmap->mddev;
	sector_t blocks = mddev->resync_max_sectors;
	unsigned long chunksize = MIN_CHUNK_SIZE;
	unsigned long chunks = DIV_ROUND_UP(blocks, chunksize);
	unsigned long space = mddev->bitmap_info.space << SECTOR_SHIFT;
	int ret;

	while (chunks > space) {
		chunksize = chunksize << 1;
		chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
	}

	llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE;
	llbitmap->chunkshift = ffz(~chunksize);
	llbitmap->chunksize = chunksize;
	llbitmap->chunks = chunks;
	mddev->bitmap_info.daemon_sleep = DEFAULT_DAEMON_SLEEP;

	ret = llbitmap_cache_pages(llbitmap);
	if (ret)
		return ret;

	llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
			       BitmapActionInit);
	/* flush initial llbitmap to disk */
	__llbitmap_flush(mddev);

	return 0;
}

static int llbitmap_read_sb(struct llbitmap *llbitmap)
{
	struct mddev *mddev = llbitmap->mddev;
	unsigned long daemon_sleep;
	unsigned long chunksize;
	unsigned long events;
	struct page *sb_page;
	bitmap_super_t *sb;
	int ret = -EINVAL;

	if (!mddev->bitmap_info.offset) {
		pr_err("md/llbitmap: %s: no super block found\n", mdname(mddev));
		return -EINVAL;
	}

	sb_page = llbitmap_read_page(llbitmap, 0);
	if (IS_ERR(sb_page)) {
		pr_err("md/llbitmap: %s: read super block failed\n",
		       mdname(mddev));
		return -EIO;
	}

	sb = kmap_local_page(sb_page);
	if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) {
		pr_err("md/llbitmap: %s: invalid super block magic number\n",
		       mdname(mddev));
		goto out_put_page;
	}

	if (sb->version != cpu_to_le32(BITMAP_MAJOR_LOCKLESS)) {
		pr_err("md/llbitmap: %s: invalid super block version\n",
		       mdname(mddev));
		goto out_put_page;
	}

	if (memcmp(sb->uuid, mddev->uuid, 16)) {
		pr_err("md/llbitmap: %s: bitmap superblock UUID mismatch\n",
		       mdname(mddev));
		goto out_put_page;
	}

	if (mddev->bitmap_info.space == 0) {
		int room = le32_to_cpu(sb->sectors_reserved);

		if (room)
			mddev->bitmap_info.space = room;
		else
			mddev->bitmap_info.space = mddev->bitmap_info.default_space;
	}
	llbitmap->flags = le32_to_cpu(sb->state);
	if (test_and_clear_bit(BITMAP_FIRST_USE, &llbitmap->flags)) {
		ret = llbitmap_init(llbitmap);
		goto out_put_page;
	}

	chunksize = le32_to_cpu(sb->chunksize);
	if (!is_power_of_2(chunksize)) {
		pr_err("md/llbitmap: %s: chunksize not a power of 2\n",
		       mdname(mddev));
		goto out_put_page;
	}

	if (chunksize < DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors,
					      mddev->bitmap_info.space << SECTOR_SHIFT)) {
		pr_err("md/llbitmap: %s: chunksize too small %lu < %llu / %lu\n",
		       mdname(mddev), chunksize, mddev->resync_max_sectors,
		       mddev->bitmap_info.space);
		goto out_put_page;
	}

	daemon_sleep = le32_to_cpu(sb->daemon_sleep);
	if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT / HZ) {
		pr_err("md/llbitmap: %s: daemon sleep %lu period out of range\n",
		       mdname(mddev), daemon_sleep);
		goto out_put_page;
	}

	events = le64_to_cpu(sb->events);
	if (events < mddev->events) {
		pr_warn("md/llbitmap: %s: bitmap file is out of date (%lu < %llu) -- forcing full recovery\n",
			mdname(mddev), events, mddev->events);
		set_bit(BITMAP_STALE, &llbitmap->flags);
	}

	sb->sync_size = cpu_to_le64(mddev->resync_max_sectors);
	mddev->bitmap_info.chunksize = chunksize;
	mddev->bitmap_info.daemon_sleep = daemon_sleep;

	llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE;
	llbitmap->chunksize = chunksize;
	llbitmap->chunks = DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors, chunksize);
	llbitmap->chunkshift = ffz(~chunksize);
	ret = llbitmap_cache_pages(llbitmap);

out_put_page:
	kunmap_local(sb);
	__free_page(sb_page);
	return ret;
}

static void llbitmap_pending_timer_fn(struct timer_list *pending_timer)
{
	struct llbitmap *llbitmap =
		container_of(pending_timer, struct llbitmap, pending_timer);

	if (work_busy(&llbitmap->daemon_work)) {
		pr_warn("md/llbitmap: %s daemon_work not finished in %lu seconds\n",
			mdname(llbitmap->mddev),
			llbitmap->mddev->bitmap_info.daemon_sleep);
		set_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags);
		return;
	}

	queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work);
}

static void md_llbitmap_daemon_fn(struct work_struct *work)
{
	struct llbitmap *llbitmap =
		container_of(work, struct llbitmap, daemon_work);
	unsigned long start;
	unsigned long end;
	bool restart;
	int idx;

	if (llbitmap->mddev->degraded)
		return;
retry:
	start = 0;
	end = min(llbitmap->chunks, PAGE_SIZE - BITMAP_DATA_OFFSET) - 1;
	restart = false;

	for (idx = 0; idx < llbitmap->nr_pages; idx++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];

		if (idx > 0) {
			start = end + 1;
			end = min(end + PAGE_SIZE, llbitmap->chunks - 1);
		}

		if (!test_bit(LLPageFlush, &pctl->flags) &&
		    time_before(jiffies, pctl->expire)) {
			restart = true;
			continue;
		}

		if (llbitmap_suspend_timeout(llbitmap, idx) < 0) {
			pr_warn("md/llbitmap: %s: %s waiting for page %d timeout\n",
				mdname(llbitmap->mddev), __func__, idx);
			continue;
		}

		llbitmap_state_machine(llbitmap, start, end, BitmapActionDaemon);
		llbitmap_resume(llbitmap, idx);
	}

	/*
	 * If the daemon took a long time to finish, retry so that clearing
	 * dirty bits is not missed.
	 */
	if (test_and_clear_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags))
		goto retry;

	/* If some page is dirty but not expired, set up the timer again */
	if (restart)
		mod_timer(&llbitmap->pending_timer,
			  jiffies + llbitmap->mddev->bitmap_info.daemon_sleep * HZ);
}

static int llbitmap_create(struct mddev *mddev)
{
	struct llbitmap *llbitmap;
	int ret;

	ret = llbitmap_check_support(mddev);
	if (ret)
		return ret;

	llbitmap = kzalloc_obj(*llbitmap);
	if (!llbitmap)
		return -ENOMEM;

	llbitmap->mddev = mddev;
	llbitmap->io_size = bdev_logical_block_size(mddev->gendisk->part0);
	llbitmap->blocks_per_page = PAGE_SIZE / llbitmap->io_size;

	timer_setup(&llbitmap->pending_timer, llbitmap_pending_timer_fn, 0);
	INIT_WORK(&llbitmap->daemon_work, md_llbitmap_daemon_fn);
	atomic_set(&llbitmap->behind_writes, 0);
	init_waitqueue_head(&llbitmap->behind_wait);

	mutex_lock(&mddev->bitmap_info.mutex);
	mddev->bitmap = llbitmap;
	ret = llbitmap_read_sb(llbitmap);
	mutex_unlock(&mddev->bitmap_info.mutex);
	if (ret) {
		kfree(llbitmap);
		mddev->bitmap = NULL;
	}

	return ret;
}

static int llbitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long chunks;

	if (chunksize == 0)
		chunksize = llbitmap->chunksize;

	/* If there is enough space, leave the chunksize unchanged. */
	chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
	while (chunks > mddev->bitmap_info.space << SECTOR_SHIFT) {
		chunksize = chunksize << 1;
		chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
	}

	llbitmap->chunkshift = ffz(~chunksize);
	llbitmap->chunksize = chunksize;
	llbitmap->chunks = chunks;

	return 0;
}

static int llbitmap_load(struct mddev *mddev)
{
	enum llbitmap_action action = BitmapActionReload;
	struct llbitmap *llbitmap = mddev->bitmap;

	if (test_and_clear_bit(BITMAP_STALE, &llbitmap->flags))
		action = BitmapActionStale;

	llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, action);
	return 0;
}

static void llbitmap_destroy(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	if (!llbitmap)
		return;

	mutex_lock(&mddev->bitmap_info.mutex);

	timer_delete_sync(&llbitmap->pending_timer);
	flush_workqueue(md_llbitmap_io_wq);
	flush_workqueue(md_llbitmap_unplug_wq);

	mddev->bitmap = NULL;
	llbitmap_free_pages(llbitmap);
	kfree(llbitmap);
	mutex_unlock(&mddev->bitmap_info.mutex);
}

static void llbitmap_start_write(struct mddev *mddev, sector_t offset,
				 unsigned long sectors)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long start = offset >> llbitmap->chunkshift;
	unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
	int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
	int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;

	while (page_start <= page_end) {
		llbitmap_raise_barrier(llbitmap, page_start);
		page_start++;
	}

	llbitmap_state_machine(llbitmap, start, end, BitmapActionStartwrite);
}

static void llbitmap_end_write(struct mddev *mddev, sector_t offset,
			       unsigned long sectors)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long start = offset >> llbitmap->chunkshift;
	unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
	int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
	int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;

	while (page_start <= page_end) {
		llbitmap_release_barrier(llbitmap, page_start);
		page_start++;
	}
}

static void llbitmap_start_discard(struct mddev *mddev, sector_t offset,
				   unsigned long sectors)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long start = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize);
	unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
	int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
	int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;

	while (page_start <= page_end) {
		llbitmap_raise_barrier(llbitmap, page_start);
		page_start++;
	}

	llbitmap_state_machine(llbitmap, start, end, BitmapActionDiscard);
}

static void llbitmap_end_discard(struct mddev *mddev, sector_t offset,
				 unsigned long sectors)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long start = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize);
	unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
	int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
	int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;

	while (page_start <= page_end) {
		llbitmap_release_barrier(llbitmap, page_start);
		page_start++;
	}
}

static void llbitmap_unplug_fn(struct work_struct *work)
{
	struct llbitmap_unplug_work *unplug_work =
		container_of(work, struct llbitmap_unplug_work, work);
	struct llbitmap *llbitmap = unplug_work->llbitmap;
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);

	for (i = 0; i < llbitmap->nr_pages; i++) {
		if (!test_bit(LLPageDirty, &llbitmap->pctl[i]->flags) ||
		    !test_and_clear_bit(LLPageDirty, &llbitmap->pctl[i]->flags))
			continue;

		llbitmap_write_page(llbitmap, i);
	}

	blk_finish_plug(&plug);
	md_super_wait(llbitmap->mddev);
	complete(unplug_work->done);
}

static bool llbitmap_dirty(struct llbitmap *llbitmap)
{
	int i;

	for (i = 0; i < llbitmap->nr_pages; i++)
		if (test_bit(LLPageDirty, &llbitmap->pctl[i]->flags))
			return true;

	return false;
}

static void llbitmap_unplug(struct mddev *mddev, bool sync)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct llbitmap *llbitmap = mddev->bitmap;
	struct llbitmap_unplug_work unplug_work = {
		.llbitmap = llbitmap,
		.done = &done,
	};

	if (!llbitmap_dirty(llbitmap))
		return;

	/*
	 * Issuing new bitmap IO under submit_bio() context will deadlock:
	 *  - the bio will wait for the bitmap bio to be done, before it can
	 *  be issued;
	 *  - the bitmap bio will be added to current->bio_list and wait for
	 *  this bio to be issued;
	 */
	INIT_WORK_ONSTACK(&unplug_work.work, llbitmap_unplug_fn);
	queue_work(md_llbitmap_unplug_wq, &unplug_work.work);
	wait_for_completion(&done);
	destroy_work_on_stack(&unplug_work.work);
}

/*
 * Force writing all bitmap pages to disk; called when stopping the array, or
 * every daemon_sleep seconds while sync_thread is running.
 */
static void __llbitmap_flush(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);
	for (i = 0; i < llbitmap->nr_pages; i++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];

		/* mark all blocks as dirty */
		set_bit(LLPageDirty, &pctl->flags);
		bitmap_fill(pctl->dirty, llbitmap->blocks_per_page);
		llbitmap_write_page(llbitmap, i);
	}
	blk_finish_plug(&plug);
	md_super_wait(llbitmap->mddev);
}

static void llbitmap_flush(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	int i;

	for (i = 0; i < llbitmap->nr_pages; i++)
		set_bit(LLPageFlush, &llbitmap->pctl[i]->flags);

	timer_delete_sync(&llbitmap->pending_timer);
	queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work);
	flush_work(&llbitmap->daemon_work);

	__llbitmap_flush(mddev);
}

/* This is used for the raid5 lazy initial recovery */
static bool llbitmap_blocks_synced(struct mddev *mddev, sector_t offset)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long p = offset >> llbitmap->chunkshift;
	enum llbitmap_state c = llbitmap_read(llbitmap, p);

	return c == BitClean || c == BitDirty;
}

static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long p = offset >> llbitmap->chunkshift;
	int blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
	enum llbitmap_state c = llbitmap_read(llbitmap, p);

	/* always skip unwritten blocks */
	if (c == BitUnwritten)
		return blocks;

	/* For a degraded array, don't skip */
	if (mddev->degraded)
		return 0;

	/* For resync, also skip clean/dirty blocks */
	if ((c == BitClean || c == BitDirty) &&
	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
		return blocks;

	return 0;
}

static bool llbitmap_start_sync(struct mddev *mddev, sector_t offset,
				sector_t *blocks, bool degraded)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long p = offset >> llbitmap->chunkshift;

	/*
	 * Handle one bit at a time; this is much simpler, and it doesn't
	 * matter if md_do_sync() loops more times.
	 */
	*blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
	return llbitmap_state_machine(llbitmap, p, p,
				      BitmapActionStartsync) == BitSyncing;
}

/* Something is wrong, sync_thread stopped at @offset */
static void llbitmap_end_sync(struct mddev *mddev, sector_t offset,
			      sector_t *blocks)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long p = offset >> llbitmap->chunkshift;

	*blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
	llbitmap_state_machine(llbitmap, p, llbitmap->chunks - 1,
			       BitmapActionAbortsync);
}

/* A full sync_thread has finished */
static void llbitmap_close_sync(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	int i;

	for (i = 0; i < llbitmap->nr_pages; i++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];

		/* let daemon_fn clear dirty bits immediately */
		WRITE_ONCE(pctl->expire, jiffies);
	}

	llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
			       BitmapActionEndsync);
}

/*
 * sync_thread has reached @sector; update the metadata every daemon_sleep
 * seconds, just in case sync_thread has to restart after a power failure.
 */
static void llbitmap_cond_end_sync(struct mddev *mddev, sector_t sector,
				   bool force)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	if (sector == 0) {
		llbitmap->last_end_sync = jiffies;
		return;
	}

	if (time_before(jiffies, llbitmap->last_end_sync +
				 HZ * mddev->bitmap_info.daemon_sleep))
		return;

	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));

	mddev->curr_resync_completed = sector;
	set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
	llbitmap_state_machine(llbitmap, 0, sector >> llbitmap->chunkshift,
			       BitmapActionEndsync);
	__llbitmap_flush(mddev);

	llbitmap->last_end_sync = jiffies;
	sysfs_notify_dirent_safe(mddev->sysfs_completed);
}

static bool llbitmap_enabled(void *data, bool flush)
{
	struct llbitmap *llbitmap = data;

	return llbitmap && !test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags);
}

static void llbitmap_dirty_bits(struct mddev *mddev, unsigned long s,
				unsigned long e)
{
	llbitmap_state_machine(mddev->bitmap, s, e, BitmapActionStartwrite);
}

static void llbitmap_write_sb(struct llbitmap *llbitmap)
{
	int nr_blocks = DIV_ROUND_UP(BITMAP_DATA_OFFSET, llbitmap->io_size);

	bitmap_fill(llbitmap->pctl[0]->dirty, nr_blocks);
	llbitmap_write_page(llbitmap, 0);
	md_super_wait(llbitmap->mddev);
}

static void llbitmap_update_sb(void *data)
{
	struct llbitmap *llbitmap = data;
	struct mddev *mddev = llbitmap->mddev;
	struct page *sb_page;
	bitmap_super_t *sb;

	if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags))
		return;

	sb_page = llbitmap_read_page(llbitmap, 0);
	if (IS_ERR(sb_page)) {
		pr_err("%s: %s: read super block failed\n", __func__,
		       mdname(mddev));
		set_bit(BITMAP_WRITE_ERROR, &llbitmap->flags);
		return;
	}

	if (mddev->events < llbitmap->events_cleared)
		llbitmap->events_cleared = mddev->events;

	sb = kmap_local_page(sb_page);
	sb->events = cpu_to_le64(mddev->events);
	sb->state = cpu_to_le32(llbitmap->flags);
	sb->chunksize = cpu_to_le32(llbitmap->chunksize);
	sb->sync_size = cpu_to_le64(mddev->resync_max_sectors);
	sb->events_cleared = cpu_to_le64(llbitmap->events_cleared);
	sb->sectors_reserved = cpu_to_le32(mddev->bitmap_info.space);
	sb->daemon_sleep = cpu_to_le32(mddev->bitmap_info.daemon_sleep);

	kunmap_local(sb);
	llbitmap_write_sb(llbitmap);
}

static int llbitmap_get_stats(void *data, struct md_bitmap_stats *stats)
{
	struct llbitmap *llbitmap = data;

	memset(stats, 0, sizeof(*stats));

	stats->missing_pages = 0;
	stats->pages = llbitmap->nr_pages;
	stats->file_pages = llbitmap->nr_pages;

	stats->behind_writes = atomic_read(&llbitmap->behind_writes);
	stats->behind_wait = wq_has_sleeper(&llbitmap->behind_wait);
	stats->events_cleared = llbitmap->events_cleared;

	return 0;
}

/* just flag all pages as needing to be written */
static void llbitmap_write_all(struct mddev *mddev)
{
	int i;
	struct llbitmap *llbitmap = mddev->bitmap;

	for (i = 0; i < llbitmap->nr_pages; i++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];

		set_bit(LLPageDirty, &pctl->flags);
		bitmap_fill(pctl->dirty, llbitmap->blocks_per_page);
	}
}

static void llbitmap_start_behind_write(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	atomic_inc(&llbitmap->behind_writes);
}

static void llbitmap_end_behind_write(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	if (atomic_dec_and_test(&llbitmap->behind_writes))
		wake_up(&llbitmap->behind_wait);
}

static void llbitmap_wait_behind_writes(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	if (!llbitmap)
		return;

	wait_event(llbitmap->behind_wait,
		   atomic_read(&llbitmap->behind_writes) == 0);
}

static ssize_t bits_show(struct mddev *mddev, char *page)
{
	struct llbitmap *llbitmap;
	int bits[BitStateCount] = {0};
	loff_t start = 0;

	mutex_lock(&mddev->bitmap_info.mutex);
	llbitmap = mddev->bitmap;
	if (!llbitmap || !llbitmap->pctl) {
		mutex_unlock(&mddev->bitmap_info.mutex);
		return sprintf(page, "no bitmap\n");
	}

	if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) {
		mutex_unlock(&mddev->bitmap_info.mutex);
		return sprintf(page, "bitmap io error\n");
	}

	while (start < llbitmap->chunks) {
		enum llbitmap_state c = llbitmap_read(llbitmap, start);

		if (c < 0 || c >= BitStateCount)
			pr_err("%s: invalid bit %llu state %d\n",
			       __func__, start, c);
		else
			bits[c]++;
		start++;
	}

	mutex_unlock(&mddev->bitmap_info.mutex);
	return sprintf(page, "unwritten %d\nclean %d\ndirty %d\nneed sync %d\nsyncing %d\n",
		       bits[BitUnwritten], bits[BitClean], bits[BitDirty],
		       bits[BitNeedSync], bits[BitSyncing]);
}

static struct md_sysfs_entry llbitmap_bits = __ATTR_RO(bits);

static ssize_t metadata_show(struct mddev *mddev, char *page)
{
	struct llbitmap *llbitmap;
	ssize_t ret;

	mutex_lock(&mddev->bitmap_info.mutex);
	llbitmap = mddev->bitmap;
	if (!llbitmap) {
		mutex_unlock(&mddev->bitmap_info.mutex);
		return sprintf(page, "no bitmap\n");
	}

	ret = sprintf(page, "chunksize %lu\nchunkshift %lu\nchunks %lu\noffset %llu\ndaemon_sleep %lu\n",
		      llbitmap->chunksize, llbitmap->chunkshift,
		      llbitmap->chunks, mddev->bitmap_info.offset,
		      llbitmap->mddev->bitmap_info.daemon_sleep);
	mutex_unlock(&mddev->bitmap_info.mutex);

	return ret;
}

static struct md_sysfs_entry llbitmap_metadata = __ATTR_RO(metadata);

static ssize_t
daemon_sleep_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%lu\n", mddev->bitmap_info.daemon_sleep);
}

static ssize_t
daemon_sleep_store(struct mddev *mddev, const char *buf, size_t len)
{
	unsigned long timeout;
	int rv = kstrtoul(buf, 10, &timeout);

	if (rv)
		return rv;

	mddev->bitmap_info.daemon_sleep = timeout;
	return len;
}

static struct md_sysfs_entry llbitmap_daemon_sleep = __ATTR_RW(daemon_sleep);

static ssize_t
barrier_idle_show(struct mddev *mddev, char *page)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	return sprintf(page, "%lu\n", llbitmap->barrier_idle);
}

static ssize_t
barrier_idle_store(struct mddev *mddev, const char *buf, size_t len)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long timeout;
	int rv = kstrtoul(buf, 10, &timeout);

	if (rv)
		return rv;

	llbitmap->barrier_idle = timeout;
	return len;
}

static struct md_sysfs_entry llbitmap_barrier_idle = __ATTR_RW(barrier_idle);

static struct attribute *md_llbitmap_attrs[] = {
	&llbitmap_bits.attr,
	&llbitmap_metadata.attr,
	&llbitmap_daemon_sleep.attr,
	&llbitmap_barrier_idle.attr,
	NULL
};

static struct attribute_group md_llbitmap_group = {
	.name = "llbitmap",
	.attrs = md_llbitmap_attrs,
};

static struct bitmap_operations llbitmap_ops = {
	.head = {
		.type	= MD_BITMAP,
		.id	= ID_LLBITMAP,
		.name	= "llbitmap",
	},

	.enabled		= llbitmap_enabled,
	.create			= llbitmap_create,
	.resize			= llbitmap_resize,
	.load			= llbitmap_load,
	.destroy		= llbitmap_destroy,

	.start_write		= llbitmap_start_write,
	.end_write		= llbitmap_end_write,
	.start_discard		= llbitmap_start_discard,
	.end_discard		= llbitmap_end_discard,
	.unplug			= llbitmap_unplug,
	.flush			= llbitmap_flush,

	.start_behind_write	= llbitmap_start_behind_write,
	.end_behind_write	= llbitmap_end_behind_write,
	.wait_behind_writes	= llbitmap_wait_behind_writes,

	.blocks_synced		= llbitmap_blocks_synced,
	.skip_sync_blocks	= llbitmap_skip_sync_blocks,
	.start_sync		= llbitmap_start_sync,
	.end_sync		= llbitmap_end_sync,
	.close_sync		= llbitmap_close_sync,
	.cond_end_sync		= llbitmap_cond_end_sync,

	.update_sb		= llbitmap_update_sb,
	.get_stats		= llbitmap_get_stats,
	.dirty_bits		= llbitmap_dirty_bits,
	.write_all		= llbitmap_write_all,

	.group			= &md_llbitmap_group,
};

int md_llbitmap_init(void)
{
	md_llbitmap_io_wq = alloc_workqueue("md_llbitmap_io",
					    WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
	if (!md_llbitmap_io_wq)
		return -ENOMEM;

	md_llbitmap_unplug_wq = alloc_workqueue("md_llbitmap_unplug",
						WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
	if (!md_llbitmap_unplug_wq) {
		destroy_workqueue(md_llbitmap_io_wq);
		md_llbitmap_io_wq = NULL;
		return -ENOMEM;
	}

	return register_md_submodule(&llbitmap_ops.head);
}

void md_llbitmap_exit(void)
{
	destroy_workqueue(md_llbitmap_io_wq);
	md_llbitmap_io_wq = NULL;
	destroy_workqueue(md_llbitmap_unplug_wq);
	md_llbitmap_unplug_wq = NULL;
	unregister_md_submodule(&llbitmap_ops.head);
}