// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/timer.h>
#include <linux/sched.h>
#include <linux/list.h>
#include <linux/file.h>
#include <linux/seq_file.h>
#include <trace/events/block.h>

#include "md.h"
#include "md-bitmap.h"

/*
 * #### Background
 *
 * Redundant data is used to enhance data fault tolerance, and the storage
 * method for redundant data varies depending on the RAID level. It's
 * important to maintain the consistency of redundant data.
 *
 * Bitmap is used to record which data blocks have been synchronized and which
 * ones need to be resynchronized or recovered. Each bit in the bitmap
 * represents a segment of data in the array. When a bit is set, it indicates
 * that the multiple redundant copies of that data segment may not be
 * consistent. Data synchronization can be performed based on the bitmap after
 * power failure or readding a disk. If there is no bitmap, a full disk
 * synchronization is required.
 *
 * #### Key Features
 *
 *  - The IO fastpath is lockless; if the user issues lots of write IO to the
 *  same bitmap bit in a short time, only the first write has the additional
 *  overhead of updating the bitmap bit, and the following writes have no
 *  additional overhead;
 *  - Support resyncing or recovering only written data, meaning that when
 *  creating a new array or replacing with a new disk, there is no need to do
 *  a full disk resync/recovery;
 *
 * #### Key Concepts
 *
 * ##### State Machine
 *
 * Each bit is one byte and can hold 6 different states, see llbitmap_state.
 * There are 8 different actions in total, see llbitmap_action, that can
 * change a bit's state:
 *
 * llbitmap state machine: transitions between states
 *
 * |           | Startwrite | Startsync | Endsync | Abortsync |
 * | --------- | ---------- | --------- | ------- | --------- |
 * | Unwritten | Dirty      | x         | x       | x         |
 * | Clean     | Dirty      | x         | x       | x         |
 * | Dirty     | x          | x         | x       | x         |
 * | NeedSync  | x          | Syncing   | x       | x         |
 * | Syncing   | x          | Syncing   | Dirty   | NeedSync  |
 *
 * |           | Reload   | Daemon | Discard   | Stale     |
 * | --------- | -------- | ------ | --------- | --------- |
 * | Unwritten | x        | x      | x         | x         |
 * | Clean     | x        | x      | Unwritten | NeedSync  |
 * | Dirty     | NeedSync | Clean  | Unwritten | NeedSync  |
 * | NeedSync  | x        | x      | Unwritten | x         |
 * | Syncing   | NeedSync | x      | Unwritten | NeedSync  |
 *
 * Typical scenarios:
 *
 * 1) Create a new array
 * All bits will be set to Unwritten by default; if --assume-clean is set,
 * all bits will be set to Clean instead.
 *
 * 2) Write data; raid1/raid10 have a full copy of the data, while raid456
 * doesn't and relies on xor data
 *
 * 2.1) write new data to raid1/raid10:
 * Unwritten --StartWrite--> Dirty
 *
 * 2.2) write new data to raid456:
 * Unwritten --StartWrite--> NeedSync
 *
 * Because the initial recovery for raid456 is skipped, the xor data is not
 * built yet; the bit must be set to NeedSync first, and after the lazy
 * initial recovery is finished, the bit will finally be set to Dirty
 * (see 5.1 and 5.4);
 *
 * 2.3) overwrite existing data
 * Clean --StartWrite--> Dirty
 *
 * 3) daemon, if the array is not degraded:
 * Dirty --Daemon--> Clean
 *
 * 4) discard
 * {Clean, Dirty, NeedSync, Syncing} --Discard--> Unwritten
 *
 * 5) resync and recovery
 *
 * 5.1) common process
 * NeedSync --Startsync--> Syncing --Endsync--> Dirty --Daemon--> Clean
 *
 * 5.2) resync after power failure
 * Dirty --Reload--> NeedSync
 *
 * 5.3) recovery while replacing with a new disk
 * By default, the old bitmap framework will recover all data; llbitmap
 * implements this with a new helper, see llbitmap_skip_sync_blocks:
 *
 * skip recovery for bits other than Dirty or Clean;
 *
 * 5.4) lazy initial recovery for raid5:
 * By default, the old bitmap framework will only allow new recovery when
 * there are spares (new disks); a new recovery flag MD_RECOVERY_LAZY_RECOVER
 * is added to perform raid456 lazy recovery for set bits (from 2.2).
 *
 * 6) special handling for degraded arrays:
 *
 * - Dirty bits will never be cleared, the daemon will just do nothing, so
 *   that if a disk is readded, Clean bits can be skipped during recovery;
 * - Dirty bits will convert to Syncing on start sync, to do data recovery
 *   for newly added disks;
 * - New writes will convert bits to NeedSync directly;
 *
 * ##### Bitmap IO
 *
 * ##### Chunksize
 *
 * The default bitmap size is 128k, including the 1k bitmap super block, and
 * the default size of the data segment that each bit covers (the chunksize)
 * is 64k; the chunksize is doubled each time if the total number of bits is
 * not less than 127k (see llbitmap_init).
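 *
 * For example (illustrative numbers): 128k of bitmap space leaves 127k bits
 * after the 1k super block. A 16T array at the default 64k chunksize would
 * need 256M bits, so the chunksize keeps doubling until the bit count fits:
 * a 256M chunksize yields 64k bits.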
 *
 * ##### READ
 *
 * While creating the bitmap, all pages will be allocated and read for
 * llbitmap; there won't be any reads afterwards.
 *
 * ##### WRITE
 *
 * WRITE IO is divided into logical_block_size units of the array, and the
 * dirty state of each block is tracked independently, for example:
 *
 * each page is 4k and contains 8 blocks; each block is 512 bytes and
 * contains 512 bits;
 *
 * | page0 | page1 | ... | page 31 |
 * |       |
 * |        \-----------------------\
 * |                                |
 * | block0 | block1 | ... | block 7|
 * |        |
 * |         \-----------------\
 * |                            |
 * | bit0 | bit1 | ... | bit511 |
 *
 * In the IO path, if one bit is changed to Dirty or NeedSync, the
 * corresponding subpage will be marked dirty, and such a block must be
 * written first before the IO is issued. This behaviour will affect IO
 * performance; to reduce the impact, if multiple bits are changed in the
 * same block in a short time, all bits in this block will be changed to
 * Dirty/NeedSync, so that there won't be any overhead until the daemon
 * clears the dirty bits.
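 *
 * A minimal sketch of the index math used by the code below (4k pages and
 * 512-byte blocks are illustrative):
 *
 *   pos    = chunk + BITMAP_DATA_OFFSET;  // skip the 1k super block
 *   idx    = pos >> PAGE_SHIFT;           // which bitmap page
 *   offset = offset_in_page(pos);         // state byte within the page
 *   block  = offset / io_size;            // 512-byte block to rewrite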
 *
 * ##### Dirty Bits Synchronization
 *
 * The IO fast path will set bits to Dirty, and those dirty bits will be
 * cleared by the daemon after IO is done. llbitmap_page_ctl is used to
 * synchronize between the IO path and the daemon;
 *
 * IO path:
 *  1) try to grab a reference; if this succeeds, set the expire time to 5s
 *  from now and return;
 *  2) if grabbing a reference fails, wait for the daemon to finish clearing
 *  dirty bits;
 *
 * Daemon (the daemon will be woken up every daemon_sleep seconds):
 * For each page:
 *  1) check if the page has expired; if not, skip this page; for an expired
 *  page:
 *  2) suspend the page and wait for inflight write IO to be done;
 *  3) change the dirty page to clean;
 *  4) resume the page;
 */

#define BITMAP_DATA_OFFSET 1024

/* 64k is the max IO size of sync IO for raid1/raid10 */
#define MIN_CHUNK_SIZE (64 * 2)

/* By default, the daemon will be woken up every 30s */
#define DEFAULT_DAEMON_SLEEP 30

/*
 * Dirtied bits that have not been accessed for more than 5s will be cleared
 * by the daemon.
 */
#define DEFAULT_BARRIER_IDLE 5

enum llbitmap_state {
	/* No valid data, init state after assembling the array */
	BitUnwritten = 0,
	/* data is consistent */
	BitClean,
	/* data will be consistent after IO is done, set directly for writes */
	BitDirty,
	/*
	 * data needs to be resynchronized:
	 * 1) set directly for writes if the array is degraded, to prevent a
	 * full disk synchronization after readding a disk;
	 * 2) reassemble the array after power failure, and dirty bits are
	 * found after reloading the bitmap;
	 * 3) set for the first write for raid5, to build initial xor data lazily
	 */
	BitNeedSync,
	/* data is synchronizing */
	BitSyncing,
	BitStateCount,
	BitNone = 0xff,
};

enum llbitmap_action {
	/* User writes new data, this is the only action from the IO fast path */
	BitmapActionStartwrite = 0,
	/* Start recovery */
	BitmapActionStartsync,
	/* Finish recovery */
	BitmapActionEndsync,
	/* Failed recovery */
	BitmapActionAbortsync,
	/* Reassemble the array */
	BitmapActionReload,
	/* Daemon thread is trying to clear dirty bits */
	BitmapActionDaemon,
	/* Data is deleted */
	BitmapActionDiscard,
	/*
	 * Bitmap is stale, mark all bits except BitUnwritten as
	 * BitNeedSync.
	 */
	BitmapActionStale,
	BitmapActionCount,
	/* Init state is BitUnwritten */
	BitmapActionInit,
};

enum llbitmap_page_state {
	LLPageFlush = 0,
	LLPageDirty,
};

struct llbitmap_page_ctl {
	char *state;
	struct page *page;
	unsigned long expire;
	unsigned long flags;
	wait_queue_head_t wait;
	struct percpu_ref active;
	/* Per-block dirty state; at most 64k page / 1 sector = 128 blocks */
	unsigned long dirty[];
};

struct llbitmap {
	struct mddev *mddev;
	struct llbitmap_page_ctl **pctl;

	unsigned int nr_pages;
	unsigned int io_size;
	unsigned int blocks_per_page;

	/* shift of one chunk */
	unsigned long chunkshift;
	/* size of one chunk in sectors */
	unsigned long chunksize;
	/* total number of chunks */
	unsigned long chunks;
	unsigned long last_end_sync;
	/*
	 * time in seconds after which dirty bits will be cleared if the page
	 * is not accessed.
	 */
	unsigned long barrier_idle;
	/* fires on the first BitDirty state */
	struct timer_list pending_timer;
	struct work_struct daemon_work;

	unsigned long flags;
	__u64	events_cleared;

	/* for slow disks */
	atomic_t behind_writes;
	wait_queue_head_t behind_wait;
};

struct llbitmap_unplug_work {
	struct work_struct work;
	struct llbitmap *llbitmap;
	struct completion *done;
};

static struct workqueue_struct *md_llbitmap_io_wq;
static struct workqueue_struct *md_llbitmap_unplug_wq;

static char state_machine[BitStateCount][BitmapActionCount] = {
	[BitUnwritten] = {
		[BitmapActionStartwrite]	= BitDirty,
		[BitmapActionStartsync]		= BitNone,
		[BitmapActionEndsync]		= BitNone,
		[BitmapActionAbortsync]		= BitNone,
		[BitmapActionReload]		= BitNone,
		[BitmapActionDaemon]		= BitNone,
		[BitmapActionDiscard]		= BitNone,
		[BitmapActionStale]		= BitNone,
	},
	[BitClean] = {
		[BitmapActionStartwrite]	= BitDirty,
		[BitmapActionStartsync]		= BitNone,
		[BitmapActionEndsync]		= BitNone,
		[BitmapActionAbortsync]		= BitNone,
		[BitmapActionReload]		= BitNone,
		[BitmapActionDaemon]		= BitNone,
		[BitmapActionDiscard]		= BitUnwritten,
		[BitmapActionStale]		= BitNeedSync,
	},
	[BitDirty] = {
		[BitmapActionStartwrite]	= BitNone,
		[BitmapActionStartsync]		= BitNone,
		[BitmapActionEndsync]		= BitNone,
		[BitmapActionAbortsync]		= BitNone,
		[BitmapActionReload]		= BitNeedSync,
		[BitmapActionDaemon]		= BitClean,
		[BitmapActionDiscard]		= BitUnwritten,
		[BitmapActionStale]		= BitNeedSync,
	},
	[BitNeedSync] = {
		[BitmapActionStartwrite]	= BitNone,
		[BitmapActionStartsync]		= BitSyncing,
		[BitmapActionEndsync]		= BitNone,
		[BitmapActionAbortsync]		= BitNone,
		[BitmapActionReload]		= BitNone,
		[BitmapActionDaemon]		= BitNone,
		[BitmapActionDiscard]		= BitUnwritten,
		[BitmapActionStale]		= BitNone,
	},
	[BitSyncing] = {
		[BitmapActionStartwrite]	= BitNone,
		[BitmapActionStartsync]		= BitSyncing,
		[BitmapActionEndsync]		= BitDirty,
		[BitmapActionAbortsync]		= BitNeedSync,
		[BitmapActionReload]		= BitNeedSync,
		[BitmapActionDaemon]		= BitNone,
		[BitmapActionDiscard]		= BitUnwritten,
		[BitmapActionStale]		= BitNeedSync,
	},
};

static void __llbitmap_flush(struct mddev *mddev);

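/*
 * Read the state byte for bit @pos; the first BITMAP_DATA_OFFSET bytes of
 * the bitmap hold the super block, so bit data starts after that offset.
 */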
static enum llbitmap_state llbitmap_read(struct llbitmap *llbitmap, loff_t pos)
{
	unsigned int idx;
	unsigned int offset;

	pos += BITMAP_DATA_OFFSET;
	idx = pos >> PAGE_SHIFT;
	offset = offset_in_page(pos);

	return llbitmap->pctl[idx]->state[offset];
}

/* set all the bits in the subpage as dirty */
static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap,
				       struct llbitmap_page_ctl *pctl,
				       unsigned int block)
{
	bool level_456 = raid_is_456(llbitmap->mddev);
	unsigned int io_size = llbitmap->io_size;
	int pos;

	for (pos = block * io_size; pos < (block + 1) * io_size; pos++) {
		switch (pctl->state[pos]) {
		case BitUnwritten:
			pctl->state[pos] = level_456 ? BitNeedSync : BitDirty;
			break;
		case BitClean:
			pctl->state[pos] = BitDirty;
			break;
		}
	}
}

static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx,
				    int offset)
{
	struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];
	unsigned int io_size = llbitmap->io_size;
	int block = offset / io_size;
	int pos;

	if (!test_bit(LLPageDirty, &pctl->flags))
		set_bit(LLPageDirty, &pctl->flags);

	/*
	 * For a degraded array, dirty bits will never be cleared, and we must
	 * resync all the dirty bits; hence skip infecting new dirty bits to
	 * prevent resyncing unnecessary data.
	 */
	if (llbitmap->mddev->degraded) {
		set_bit(block, pctl->dirty);
		return;
	}

	/*
	 * The subpage usually contains a total of 512 bits. If any single bit
	 * within the subpage is marked as dirty, the entire sector will be
	 * written. To avoid impacting write performance, when multiple bits
	 * within the same sector are modified within llbitmap->barrier_idle,
	 * all bits in the sector will be collectively marked as dirty at once.
	 */
	if (test_and_set_bit(block, pctl->dirty)) {
		llbitmap_infect_dirty_bits(llbitmap, pctl, block);
		return;
	}

	for (pos = block * io_size; pos < (block + 1) * io_size; pos++) {
		if (pos == offset)
			continue;
		if (pctl->state[pos] == BitDirty ||
		    pctl->state[pos] == BitNeedSync) {
			llbitmap_infect_dirty_bits(llbitmap, pctl, block);
			return;
		}
	}
}

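/*
 * Set the state byte for bit @pos; Dirty and NeedSync transitions also mark
 * the containing subpage dirty so it gets flushed before the data IO.
 */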
static void llbitmap_write(struct llbitmap *llbitmap, enum llbitmap_state state,
			   loff_t pos)
{
	unsigned int idx;
	unsigned int bit;

	pos += BITMAP_DATA_OFFSET;
	idx = pos >> PAGE_SHIFT;
	bit = offset_in_page(pos);

	llbitmap->pctl[idx]->state[bit] = state;
	if (state == BitDirty || state == BitNeedSync)
		llbitmap_set_page_dirty(llbitmap, idx, bit);
}

static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx)
{
	struct mddev *mddev = llbitmap->mddev;
	struct page *page = NULL;
	struct md_rdev *rdev;

	if (llbitmap->pctl && llbitmap->pctl[idx])
		page = llbitmap->pctl[idx]->page;
	if (page)
		return page;

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page)
		return ERR_PTR(-ENOMEM);

	rdev_for_each(rdev, mddev) {
		sector_t sector;

		if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
			continue;

		sector = mddev->bitmap_info.offset +
			 (idx << PAGE_SECTORS_SHIFT);

		if (sync_page_io(rdev, sector, PAGE_SIZE, page, REQ_OP_READ,
				 true))
			return page;

		md_error(mddev, rdev);
	}

	__free_page(page);
	return ERR_PTR(-EIO);
}

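/*
 * Write back every dirty io_size block of page @idx to all active rdevs;
 * blocks whose dirty bit is clear are skipped.
 */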
static void llbitmap_write_page(struct llbitmap *llbitmap, int idx)
{
	struct page *page = llbitmap->pctl[idx]->page;
	struct mddev *mddev = llbitmap->mddev;
	struct md_rdev *rdev;
	int block;

	for (block = 0; block < llbitmap->blocks_per_page; block++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];

		if (!test_and_clear_bit(block, pctl->dirty))
			continue;

		rdev_for_each(rdev, mddev) {
			sector_t sector;
			sector_t bit_sector = llbitmap->io_size >> SECTOR_SHIFT;

			if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
				continue;

			sector = mddev->bitmap_info.offset + rdev->sb_start +
				 (idx << PAGE_SECTORS_SHIFT) +
				 block * bit_sector;
			md_write_metadata(mddev, rdev, sector,
					  llbitmap->io_size, page,
					  block * llbitmap->io_size);
		}
	}
}

static void active_release(struct percpu_ref *ref)
{
	struct llbitmap_page_ctl *pctl =
		container_of(ref, struct llbitmap_page_ctl, active);

	wake_up(&pctl->wait);
}

static void llbitmap_free_pages(struct llbitmap *llbitmap)
{
	int i;

	if (!llbitmap->pctl)
		return;

	for (i = 0; i < llbitmap->nr_pages; i++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];

		if (!pctl || !pctl->page)
			break;

		__free_page(pctl->page);
		percpu_ref_exit(&pctl->active);
	}

	kfree(llbitmap->pctl[0]);
	kfree(llbitmap->pctl);
	llbitmap->pctl = NULL;
}

static int llbitmap_cache_pages(struct llbitmap *llbitmap)
{
	struct llbitmap_page_ctl *pctl;
	unsigned int nr_pages = DIV_ROUND_UP(llbitmap->chunks +
					     BITMAP_DATA_OFFSET, PAGE_SIZE);
	unsigned int size = struct_size(pctl, dirty, BITS_TO_LONGS(
						llbitmap->blocks_per_page));
	int i;

	llbitmap->pctl = kmalloc_array(nr_pages, sizeof(void *),
				       GFP_KERNEL | __GFP_ZERO);
	if (!llbitmap->pctl)
		return -ENOMEM;

	size = round_up(size, cache_line_size());
	pctl = kmalloc_array(nr_pages, size, GFP_KERNEL | __GFP_ZERO);
	if (!pctl) {
		kfree(llbitmap->pctl);
		return -ENOMEM;
	}

	llbitmap->nr_pages = nr_pages;

	for (i = 0; i < nr_pages; i++, pctl = (void *)pctl + size) {
		struct page *page = llbitmap_read_page(llbitmap, i);

		llbitmap->pctl[i] = pctl;

		if (IS_ERR(page)) {
			llbitmap_free_pages(llbitmap);
			return PTR_ERR(page);
		}

		if (percpu_ref_init(&pctl->active, active_release,
				    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
			__free_page(page);
			llbitmap_free_pages(llbitmap);
			return -ENOMEM;
		}

		pctl->page = page;
		pctl->state = page_address(page);
		init_waitqueue_head(&pctl->wait);
	}

	return 0;
}

static void llbitmap_init_state(struct llbitmap *llbitmap)
{
	enum llbitmap_state state = BitUnwritten;
	unsigned long i;

	if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags))
		state = BitClean;

	for (i = 0; i < llbitmap->chunks; i++)
		llbitmap_write(llbitmap, state, i);
}

/* The return value is only used from resync, where @start == @end. */
static enum llbitmap_state llbitmap_state_machine(struct llbitmap *llbitmap,
						  unsigned long start,
						  unsigned long end,
						  enum llbitmap_action action)
{
	struct mddev *mddev = llbitmap->mddev;
	enum llbitmap_state state = BitNone;
	bool level_456 = raid_is_456(llbitmap->mddev);
	bool need_resync = false;
	bool need_recovery = false;

	if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags))
		return BitNone;

	if (action == BitmapActionInit) {
		llbitmap_init_state(llbitmap);
		return BitNone;
	}

	while (start <= end) {
		enum llbitmap_state c = llbitmap_read(llbitmap, start);

		if (c < 0 || c >= BitStateCount) {
			pr_err("%s: invalid bit %lu state %d action %d, forcing resync\n",
			       __func__, start, c, action);
			state = BitNeedSync;
			goto write_bitmap;
		}

		if (c == BitNeedSync)
			need_resync = !mddev->degraded;

		state = state_machine[c][action];

write_bitmap:
		if (unlikely(mddev->degraded)) {
			/* For a degraded array, mark new data as need sync. */
			if (state == BitDirty &&
			    action == BitmapActionStartwrite)
				state = BitNeedSync;
			/*
			 * For a degraded array, resync dirty data as well;
			 * note that if the array is still degraded after
			 * resync is done, all new data will still be dirty
			 * until the array is clean.
			 */
			else if (c == BitDirty &&
				action == BitmapActionStartsync)
				state = BitSyncing;
		} else if (c == BitUnwritten && state == BitDirty &&
			   action == BitmapActionStartwrite && level_456) {
			/* Delay raid456 initial recovery to first write. */
			state = BitNeedSync;
		}

		if (state == BitNone) {
			start++;
			continue;
		}

		llbitmap_write(llbitmap, state, start);

		if (state == BitNeedSync)
			need_resync = !mddev->degraded;
		else if (state == BitDirty &&
			 !timer_pending(&llbitmap->pending_timer))
			mod_timer(&llbitmap->pending_timer,
				  jiffies + mddev->bitmap_info.daemon_sleep * HZ);

		start++;
	}

	if (need_resync && level_456)
		need_recovery = true;

	if (need_recovery) {
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		set_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	} else if (need_resync) {
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	}

	return state;
}

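/*
 * IO fastpath: pin the page so the daemon can't suspend it while writes are
 * inflight, and refresh the expire time; if the daemon is currently clearing
 * dirty bits (the ref is dying), wait for it to finish and retry.
 */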
static void llbitmap_raise_barrier(struct llbitmap *llbitmap, int page_idx)
{
	struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];

retry:
	if (likely(percpu_ref_tryget_live(&pctl->active))) {
		WRITE_ONCE(pctl->expire, jiffies + llbitmap->barrier_idle * HZ);
		return;
	}

	wait_event(pctl->wait, !percpu_ref_is_dying(&pctl->active));
	goto retry;
}

static void llbitmap_release_barrier(struct llbitmap *llbitmap, int page_idx)
{
	struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];

	percpu_ref_put(&pctl->active);
}

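/*
 * Daemon side: kill the active reference and wait for inflight writers to
 * drain; on timeout the reference is resurrected and the page is left as is.
 */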
static int llbitmap_suspend_timeout(struct llbitmap *llbitmap, int page_idx)
{
	struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];

	percpu_ref_kill(&pctl->active);

	if (!wait_event_timeout(pctl->wait, percpu_ref_is_zero(&pctl->active),
			llbitmap->mddev->bitmap_info.daemon_sleep * HZ)) {
		percpu_ref_resurrect(&pctl->active);
		return -ETIMEDOUT;
	}

	return 0;
}

static void llbitmap_resume(struct llbitmap *llbitmap, int page_idx)
{
	struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];

	pctl->expire = LONG_MAX;
	percpu_ref_resurrect(&pctl->active);
	wake_up(&pctl->wait);
}

static int llbitmap_check_support(struct mddev *mddev)
{
	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
		pr_notice("md/llbitmap: %s: array with journal cannot have bitmap\n",
			  mdname(mddev));
		return -EBUSY;
	}

	if (mddev->bitmap_info.space == 0) {
		if (mddev->bitmap_info.default_space == 0) {
			pr_notice("md/llbitmap: %s: no space for bitmap\n",
				  mdname(mddev));
			return -ENOSPC;
		}
	}

	if (!mddev->persistent) {
		pr_notice("md/llbitmap: %s: array must be persistent\n",
			  mdname(mddev));
		return -EOPNOTSUPP;
	}

	if (mddev->bitmap_info.file) {
		pr_notice("md/llbitmap: %s: doesn't support bitmap file\n",
			  mdname(mddev));
		return -EOPNOTSUPP;
	}

	if (mddev->bitmap_info.external) {
		pr_notice("md/llbitmap: %s: doesn't support external metadata\n",
			  mdname(mddev));
		return -EOPNOTSUPP;
	}

	if (mddev_is_dm(mddev)) {
		pr_notice("md/llbitmap: %s: doesn't support dm-raid\n",
			  mdname(mddev));
		return -EOPNOTSUPP;
	}

	return 0;
}

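/*
 * Pick the smallest chunksize (doubling from MIN_CHUNK_SIZE) whose bit count
 * fits in the reserved bitmap space, then set all bits to their initial
 * state and flush the result to disk.
 */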
static int llbitmap_init(struct llbitmap *llbitmap)
{
	struct mddev *mddev = llbitmap->mddev;
	sector_t blocks = mddev->resync_max_sectors;
	unsigned long chunksize = MIN_CHUNK_SIZE;
	unsigned long chunks = DIV_ROUND_UP(blocks, chunksize);
	unsigned long space = mddev->bitmap_info.space << SECTOR_SHIFT;
	int ret;

	while (chunks > space) {
		chunksize = chunksize << 1;
		chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
	}

	llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE;
	llbitmap->chunkshift = ffz(~chunksize);
	llbitmap->chunksize = chunksize;
	llbitmap->chunks = chunks;
	mddev->bitmap_info.daemon_sleep = DEFAULT_DAEMON_SLEEP;

	ret = llbitmap_cache_pages(llbitmap);
	if (ret)
		return ret;

	llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
			       BitmapActionInit);
	/* flush the initial llbitmap to disk */
	__llbitmap_flush(mddev);

	return 0;
}

static int llbitmap_read_sb(struct llbitmap *llbitmap)
{
	struct mddev *mddev = llbitmap->mddev;
	unsigned long daemon_sleep;
	unsigned long chunksize;
	unsigned long events;
	struct page *sb_page;
	bitmap_super_t *sb;
	int ret = -EINVAL;

	if (!mddev->bitmap_info.offset) {
		pr_err("md/llbitmap: %s: no super block found\n", mdname(mddev));
		return -EINVAL;
	}

	sb_page = llbitmap_read_page(llbitmap, 0);
	if (IS_ERR(sb_page)) {
		pr_err("md/llbitmap: %s: read super block failed\n",
		       mdname(mddev));
		return -EIO;
	}

	sb = kmap_local_page(sb_page);
	if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) {
		pr_err("md/llbitmap: %s: invalid super block magic number\n",
		       mdname(mddev));
		goto out_put_page;
	}

	if (sb->version != cpu_to_le32(BITMAP_MAJOR_LOCKLESS)) {
		pr_err("md/llbitmap: %s: invalid super block version\n",
		       mdname(mddev));
		goto out_put_page;
	}

	if (memcmp(sb->uuid, mddev->uuid, 16)) {
		pr_err("md/llbitmap: %s: bitmap superblock UUID mismatch\n",
		       mdname(mddev));
		goto out_put_page;
	}

	if (mddev->bitmap_info.space == 0) {
		int room = le32_to_cpu(sb->sectors_reserved);

		if (room)
			mddev->bitmap_info.space = room;
		else
			mddev->bitmap_info.space = mddev->bitmap_info.default_space;
	}
	llbitmap->flags = le32_to_cpu(sb->state);
	if (test_and_clear_bit(BITMAP_FIRST_USE, &llbitmap->flags)) {
		ret = llbitmap_init(llbitmap);
		goto out_put_page;
	}

	chunksize = le32_to_cpu(sb->chunksize);
	if (!is_power_of_2(chunksize)) {
		pr_err("md/llbitmap: %s: chunksize not a power of 2\n",
		       mdname(mddev));
		goto out_put_page;
	}

	if (chunksize < DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors,
					      mddev->bitmap_info.space << SECTOR_SHIFT)) {
		pr_err("md/llbitmap: %s: chunksize too small %lu < %llu / %lu\n",
		       mdname(mddev), chunksize, mddev->resync_max_sectors,
		       mddev->bitmap_info.space);
		goto out_put_page;
	}

	daemon_sleep = le32_to_cpu(sb->daemon_sleep);
	if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT / HZ) {
		pr_err("md/llbitmap: %s: daemon sleep period %lu out of range\n",
		       mdname(mddev), daemon_sleep);
		goto out_put_page;
	}

	events = le64_to_cpu(sb->events);
	if (events < mddev->events) {
		pr_warn("md/llbitmap: %s: bitmap file is out of date (%lu < %llu) -- forcing full recovery\n",
			mdname(mddev), events, mddev->events);
		set_bit(BITMAP_STALE, &llbitmap->flags);
	}

	sb->sync_size = cpu_to_le64(mddev->resync_max_sectors);
	mddev->bitmap_info.chunksize = chunksize;
	mddev->bitmap_info.daemon_sleep = daemon_sleep;

	llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE;
	llbitmap->chunksize = chunksize;
	llbitmap->chunks = DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors, chunksize);
	llbitmap->chunkshift = ffz(~chunksize);
	ret = llbitmap_cache_pages(llbitmap);

out_put_page:
	kunmap_local(sb);
	__free_page(sb_page);
	return ret;
}

static void llbitmap_pending_timer_fn(struct timer_list *pending_timer)
{
	struct llbitmap *llbitmap =
		container_of(pending_timer, struct llbitmap, pending_timer);

	if (work_busy(&llbitmap->daemon_work)) {
		pr_warn("md/llbitmap: %s daemon_work not finished in %lu seconds\n",
			mdname(llbitmap->mddev),
			llbitmap->mddev->bitmap_info.daemon_sleep);
		set_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags);
		return;
	}

	queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work);
}

static void md_llbitmap_daemon_fn(struct work_struct *work)
{
	struct llbitmap *llbitmap =
		container_of(work, struct llbitmap, daemon_work);
	unsigned long start;
	unsigned long end;
	bool restart;
	int idx;

	if (llbitmap->mddev->degraded)
		return;
retry:
	start = 0;
	end = min(llbitmap->chunks, PAGE_SIZE - BITMAP_DATA_OFFSET) - 1;
	restart = false;

	for (idx = 0; idx < llbitmap->nr_pages; idx++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];

		if (idx > 0) {
			start = end + 1;
			end = min(end + PAGE_SIZE, llbitmap->chunks - 1);
		}

		if (!test_bit(LLPageFlush, &pctl->flags) &&
		    time_before(jiffies, pctl->expire)) {
			restart = true;
			continue;
		}

		if (llbitmap_suspend_timeout(llbitmap, idx) < 0) {
			pr_warn("md/llbitmap: %s: %s waiting for page %d timed out\n",
				mdname(llbitmap->mddev), __func__, idx);
			continue;
		}

		llbitmap_state_machine(llbitmap, start, end, BitmapActionDaemon);
		llbitmap_resume(llbitmap, idx);
	}

	/*
	 * If the daemon took a long time to finish, retry so that no dirty
	 * bits are missed while clearing.
	 */
	if (test_and_clear_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags))
		goto retry;

	/* If some page is dirty but not expired, set up the timer again */
	if (restart)
		mod_timer(&llbitmap->pending_timer,
			  jiffies + llbitmap->mddev->bitmap_info.daemon_sleep * HZ);
}

static int llbitmap_create(struct mddev *mddev)
{
	struct llbitmap *llbitmap;
	int ret;

	ret = llbitmap_check_support(mddev);
	if (ret)
		return ret;

	llbitmap = kzalloc(sizeof(*llbitmap), GFP_KERNEL);
	if (!llbitmap)
		return -ENOMEM;

	llbitmap->mddev = mddev;
	llbitmap->io_size = bdev_logical_block_size(mddev->gendisk->part0);
	llbitmap->blocks_per_page = PAGE_SIZE / llbitmap->io_size;

	timer_setup(&llbitmap->pending_timer, llbitmap_pending_timer_fn, 0);
	INIT_WORK(&llbitmap->daemon_work, md_llbitmap_daemon_fn);
	atomic_set(&llbitmap->behind_writes, 0);
	init_waitqueue_head(&llbitmap->behind_wait);

	mutex_lock(&mddev->bitmap_info.mutex);
	mddev->bitmap = llbitmap;
	ret = llbitmap_read_sb(llbitmap);
	mutex_unlock(&mddev->bitmap_info.mutex);
	if (ret) {
		kfree(llbitmap);
		mddev->bitmap = NULL;
	}

	return ret;
}

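/*
 * Pick a chunksize for the new device size: keep the current one if the bit
 * count still fits in the reserved space, otherwise keep doubling it.
 */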
static int llbitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long chunks;

	if (chunksize == 0)
		chunksize = llbitmap->chunksize;

	/* If there is enough space, leave the chunksize unchanged. */
	chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
	while (chunks > mddev->bitmap_info.space << SECTOR_SHIFT) {
		chunksize = chunksize << 1;
		chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
	}

	llbitmap->chunkshift = ffz(~chunksize);
	llbitmap->chunksize = chunksize;
	llbitmap->chunks = chunks;

	return 0;
}

static int llbitmap_load(struct mddev *mddev)
{
	enum llbitmap_action action = BitmapActionReload;
	struct llbitmap *llbitmap = mddev->bitmap;

	if (test_and_clear_bit(BITMAP_STALE, &llbitmap->flags))
		action = BitmapActionStale;

	llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, action);
	return 0;
}

static void llbitmap_destroy(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	if (!llbitmap)
		return;

	mutex_lock(&mddev->bitmap_info.mutex);

	timer_delete_sync(&llbitmap->pending_timer);
	flush_workqueue(md_llbitmap_io_wq);
	flush_workqueue(md_llbitmap_unplug_wq);

	mddev->bitmap = NULL;
	llbitmap_free_pages(llbitmap);
	kfree(llbitmap);
	mutex_unlock(&mddev->bitmap_info.mutex);
}

static void llbitmap_start_write(struct mddev *mddev, sector_t offset,
				 unsigned long sectors)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long start = offset >> llbitmap->chunkshift;
	unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
	int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
	int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;

	llbitmap_state_machine(llbitmap, start, end, BitmapActionStartwrite);

	while (page_start <= page_end) {
		llbitmap_raise_barrier(llbitmap, page_start);
		page_start++;
	}
}

static void llbitmap_end_write(struct mddev *mddev, sector_t offset,
			       unsigned long sectors)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long start = offset >> llbitmap->chunkshift;
	unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
	int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
	int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;

	while (page_start <= page_end) {
		llbitmap_release_barrier(llbitmap, page_start);
		page_start++;
	}
}

static void llbitmap_start_discard(struct mddev *mddev, sector_t offset,
				   unsigned long sectors)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long start = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize);
	unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
	int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
	int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;

	llbitmap_state_machine(llbitmap, start, end, BitmapActionDiscard);

	while (page_start <= page_end) {
		llbitmap_raise_barrier(llbitmap, page_start);
		page_start++;
	}
}

static void llbitmap_end_discard(struct mddev *mddev, sector_t offset,
				 unsigned long sectors)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long start = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize);
	unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
	int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
	int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;

	while (page_start <= page_end) {
		llbitmap_release_barrier(llbitmap, page_start);
		page_start++;
	}
}

static void llbitmap_unplug_fn(struct work_struct *work)
{
	struct llbitmap_unplug_work *unplug_work =
		container_of(work, struct llbitmap_unplug_work, work);
	struct llbitmap *llbitmap = unplug_work->llbitmap;
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);

	for (i = 0; i < llbitmap->nr_pages; i++) {
		if (!test_bit(LLPageDirty, &llbitmap->pctl[i]->flags) ||
		    !test_and_clear_bit(LLPageDirty, &llbitmap->pctl[i]->flags))
			continue;

		llbitmap_write_page(llbitmap, i);
	}

	blk_finish_plug(&plug);
	md_super_wait(llbitmap->mddev);
	complete(unplug_work->done);
}

static bool llbitmap_dirty(struct llbitmap *llbitmap)
{
	int i;

	for (i = 0; i < llbitmap->nr_pages; i++)
		if (test_bit(LLPageDirty, &llbitmap->pctl[i]->flags))
			return true;

	return false;
}

static void llbitmap_unplug(struct mddev *mddev, bool sync)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct llbitmap *llbitmap = mddev->bitmap;
	struct llbitmap_unplug_work unplug_work = {
		.llbitmap = llbitmap,
		.done = &done,
	};

	if (!llbitmap_dirty(llbitmap))
		return;

	/*
	 * Issuing new bitmap IO from submit_bio() context will deadlock:
	 *  - the bio will wait for the bitmap bio to be done, before it can
	 *  be issued;
	 *  - the bitmap bio will be added to current->bio_list and wait for
	 *  this bio to be issued;
	 */
	INIT_WORK_ONSTACK(&unplug_work.work, llbitmap_unplug_fn);
	queue_work(md_llbitmap_unplug_wq, &unplug_work.work);
	wait_for_completion(&done);
	destroy_work_on_stack(&unplug_work.work);
}

/*
 * Force all bitmap pages to be written to disk; called when stopping the
 * array, or every daemon_sleep seconds while sync_thread is running.
 */
static void __llbitmap_flush(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);
	for (i = 0; i < llbitmap->nr_pages; i++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];

		/* mark all blocks as dirty */
		set_bit(LLPageDirty, &pctl->flags);
		bitmap_fill(pctl->dirty, llbitmap->blocks_per_page);
		llbitmap_write_page(llbitmap, i);
	}
	blk_finish_plug(&plug);
	md_super_wait(llbitmap->mddev);
}

static void llbitmap_flush(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	int i;

	for (i = 0; i < llbitmap->nr_pages; i++)
		set_bit(LLPageFlush, &llbitmap->pctl[i]->flags);

	timer_delete_sync(&llbitmap->pending_timer);
	queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work);
	flush_work(&llbitmap->daemon_work);

	__llbitmap_flush(mddev);
}

/* This is used for the raid5 lazy initial recovery */
static bool llbitmap_blocks_synced(struct mddev *mddev, sector_t offset)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long p = offset >> llbitmap->chunkshift;
	enum llbitmap_state c = llbitmap_read(llbitmap, p);

	return c == BitClean || c == BitDirty;
}

static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long p = offset >> llbitmap->chunkshift;
	int blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
	enum llbitmap_state c = llbitmap_read(llbitmap, p);

	/* always skip unwritten blocks */
	if (c == BitUnwritten)
		return blocks;

	/* For a degraded array, don't skip */
	if (mddev->degraded)
		return 0;

	/* For resync, also skip clean/dirty blocks */
	if ((c == BitClean || c == BitDirty) &&
	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
		return blocks;

	return 0;
}

static bool llbitmap_start_sync(struct mddev *mddev, sector_t offset,
				sector_t *blocks, bool degraded)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long p = offset >> llbitmap->chunkshift;

	/*
	 * Handle one bit at a time, this is much simpler. And it doesn't
	 * matter if md_do_sync() loops more times.
	 */
	*blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
	return llbitmap_state_machine(llbitmap, p, p,
				      BitmapActionStartsync) == BitSyncing;
}

/* Something is wrong, sync_thread stopped at @offset */
static void llbitmap_end_sync(struct mddev *mddev, sector_t offset,
			      sector_t *blocks)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long p = offset >> llbitmap->chunkshift;

	*blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
	llbitmap_state_machine(llbitmap, p, llbitmap->chunks - 1,
			       BitmapActionAbortsync);
}

/* A full sync_thread is finished */
static void llbitmap_close_sync(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	int i;

	for (i = 0; i < llbitmap->nr_pages; i++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];

		/* let daemon_fn clear dirty bits immediately */
		WRITE_ONCE(pctl->expire, jiffies);
	}

	llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
			       BitmapActionEndsync);
}

/*
 * sync_thread has reached @sector; update metadata every daemon_sleep
 * seconds, just in case sync_thread has to restart after a power failure.
 */
static void llbitmap_cond_end_sync(struct mddev *mddev, sector_t sector,
				   bool force)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	if (sector == 0) {
		llbitmap->last_end_sync = jiffies;
		return;
	}

	if (time_before(jiffies, llbitmap->last_end_sync +
				 HZ * mddev->bitmap_info.daemon_sleep))
		return;

	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));

	mddev->curr_resync_completed = sector;
	set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
	llbitmap_state_machine(llbitmap, 0, sector >> llbitmap->chunkshift,
			       BitmapActionEndsync);
	__llbitmap_flush(mddev);

	llbitmap->last_end_sync = jiffies;
	sysfs_notify_dirent_safe(mddev->sysfs_completed);
}

static bool llbitmap_enabled(void *data, bool flush)
{
	struct llbitmap *llbitmap = data;

	return llbitmap && !test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags);
}

static void llbitmap_dirty_bits(struct mddev *mddev, unsigned long s,
				unsigned long e)
{
	llbitmap_state_machine(mddev->bitmap, s, e, BitmapActionStartwrite);
}

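/*
 * Flush the super block, i.e. the first BITMAP_DATA_OFFSET bytes of page 0,
 * to disk.
 */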
static void llbitmap_write_sb(struct llbitmap *llbitmap)
{
	int nr_blocks = DIV_ROUND_UP(BITMAP_DATA_OFFSET, llbitmap->io_size);

	bitmap_fill(llbitmap->pctl[0]->dirty, nr_blocks);
	llbitmap_write_page(llbitmap, 0);
	md_super_wait(llbitmap->mddev);
}

static void llbitmap_update_sb(void *data)
{
	struct llbitmap *llbitmap = data;
	struct mddev *mddev = llbitmap->mddev;
	struct page *sb_page;
	bitmap_super_t *sb;

	if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags))
		return;

	sb_page = llbitmap_read_page(llbitmap, 0);
	if (IS_ERR(sb_page)) {
		pr_err("%s: %s: read super block failed\n", __func__,
		       mdname(mddev));
		set_bit(BITMAP_WRITE_ERROR, &llbitmap->flags);
		return;
	}

	if (mddev->events < llbitmap->events_cleared)
		llbitmap->events_cleared = mddev->events;

	sb = kmap_local_page(sb_page);
	sb->events = cpu_to_le64(mddev->events);
	sb->state = cpu_to_le32(llbitmap->flags);
	sb->chunksize = cpu_to_le32(llbitmap->chunksize);
	sb->sync_size = cpu_to_le64(mddev->resync_max_sectors);
	sb->events_cleared = cpu_to_le64(llbitmap->events_cleared);
	sb->sectors_reserved = cpu_to_le32(mddev->bitmap_info.space);
	sb->daemon_sleep = cpu_to_le32(mddev->bitmap_info.daemon_sleep);

	kunmap_local(sb);
	llbitmap_write_sb(llbitmap);
}

static int llbitmap_get_stats(void *data, struct md_bitmap_stats *stats)
{
	struct llbitmap *llbitmap = data;

	memset(stats, 0, sizeof(*stats));

	stats->missing_pages = 0;
	stats->pages = llbitmap->nr_pages;
	stats->file_pages = llbitmap->nr_pages;

	stats->behind_writes = atomic_read(&llbitmap->behind_writes);
	stats->behind_wait = wq_has_sleeper(&llbitmap->behind_wait);
	stats->events_cleared = llbitmap->events_cleared;

	return 0;
}

/* just flag all pages as needing to be written */
static void llbitmap_write_all(struct mddev *mddev)
{
	int i;
	struct llbitmap *llbitmap = mddev->bitmap;

	for (i = 0; i < llbitmap->nr_pages; i++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];

		set_bit(LLPageDirty, &pctl->flags);
		bitmap_fill(pctl->dirty, llbitmap->blocks_per_page);
	}
}

static void llbitmap_start_behind_write(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	atomic_inc(&llbitmap->behind_writes);
}

static void llbitmap_end_behind_write(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	if (atomic_dec_and_test(&llbitmap->behind_writes))
		wake_up(&llbitmap->behind_wait);
}

static void llbitmap_wait_behind_writes(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	if (!llbitmap)
		return;

	wait_event(llbitmap->behind_wait,
		   atomic_read(&llbitmap->behind_writes) == 0);
}

static ssize_t bits_show(struct mddev *mddev, char *page)
{
	struct llbitmap *llbitmap;
	int bits[BitStateCount] = {0};
	loff_t start = 0;

	mutex_lock(&mddev->bitmap_info.mutex);
	llbitmap = mddev->bitmap;
	if (!llbitmap || !llbitmap->pctl) {
		mutex_unlock(&mddev->bitmap_info.mutex);
		return sprintf(page, "no bitmap\n");
	}

	if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) {
		mutex_unlock(&mddev->bitmap_info.mutex);
		return sprintf(page, "bitmap io error\n");
	}

	while (start < llbitmap->chunks) {
		enum llbitmap_state c = llbitmap_read(llbitmap, start);

		if (c < 0 || c >= BitStateCount)
			pr_err("%s: invalid bit %llu state %d\n",
			       __func__, start, c);
		else
			bits[c]++;
		start++;
	}

	mutex_unlock(&mddev->bitmap_info.mutex);
	return sprintf(page, "unwritten %d\nclean %d\ndirty %d\nneed sync %d\nsyncing %d\n",
		       bits[BitUnwritten], bits[BitClean], bits[BitDirty],
		       bits[BitNeedSync], bits[BitSyncing]);
}

static struct md_sysfs_entry llbitmap_bits = __ATTR_RO(bits);

static ssize_t metadata_show(struct mddev *mddev, char *page)
{
	struct llbitmap *llbitmap;
	ssize_t ret;

	mutex_lock(&mddev->bitmap_info.mutex);
	llbitmap = mddev->bitmap;
	if (!llbitmap) {
		mutex_unlock(&mddev->bitmap_info.mutex);
		return sprintf(page, "no bitmap\n");
	}

	ret = sprintf(page, "chunksize %lu\nchunkshift %lu\nchunks %lu\noffset %llu\ndaemon_sleep %lu\n",
		      llbitmap->chunksize, llbitmap->chunkshift,
		      llbitmap->chunks, mddev->bitmap_info.offset,
		      llbitmap->mddev->bitmap_info.daemon_sleep);
	mutex_unlock(&mddev->bitmap_info.mutex);

	return ret;
}

static struct md_sysfs_entry llbitmap_metadata = __ATTR_RO(metadata);

static ssize_t
daemon_sleep_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%lu\n", mddev->bitmap_info.daemon_sleep);
}

static ssize_t
daemon_sleep_store(struct mddev *mddev, const char *buf, size_t len)
{
	unsigned long timeout;
	int rv = kstrtoul(buf, 10, &timeout);

	if (rv)
		return rv;

	mddev->bitmap_info.daemon_sleep = timeout;
	return len;
}

static struct md_sysfs_entry llbitmap_daemon_sleep = __ATTR_RW(daemon_sleep);

static ssize_t
barrier_idle_show(struct mddev *mddev, char *page)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	return sprintf(page, "%lu\n", llbitmap->barrier_idle);
}

static ssize_t
barrier_idle_store(struct mddev *mddev, const char *buf, size_t len)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long timeout;
	int rv = kstrtoul(buf, 10, &timeout);

	if (rv)
		return rv;

	llbitmap->barrier_idle = timeout;
	return len;
}

static struct md_sysfs_entry llbitmap_barrier_idle = __ATTR_RW(barrier_idle);

static struct attribute *md_llbitmap_attrs[] = {
	&llbitmap_bits.attr,
	&llbitmap_metadata.attr,
	&llbitmap_daemon_sleep.attr,
	&llbitmap_barrier_idle.attr,
	NULL
};

static struct attribute_group md_llbitmap_group = {
	.name = "llbitmap",
	.attrs = md_llbitmap_attrs,
};

static struct bitmap_operations llbitmap_ops = {
	.head = {
		.type	= MD_BITMAP,
		.id	= ID_LLBITMAP,
		.name	= "llbitmap",
	},

	.enabled		= llbitmap_enabled,
	.create			= llbitmap_create,
	.resize			= llbitmap_resize,
	.load			= llbitmap_load,
	.destroy		= llbitmap_destroy,

	.start_write		= llbitmap_start_write,
	.end_write		= llbitmap_end_write,
	.start_discard		= llbitmap_start_discard,
	.end_discard		= llbitmap_end_discard,
	.unplug			= llbitmap_unplug,
	.flush			= llbitmap_flush,

	.start_behind_write	= llbitmap_start_behind_write,
	.end_behind_write	= llbitmap_end_behind_write,
	.wait_behind_writes	= llbitmap_wait_behind_writes,

	.blocks_synced		= llbitmap_blocks_synced,
	.skip_sync_blocks	= llbitmap_skip_sync_blocks,
	.start_sync		= llbitmap_start_sync,
	.end_sync		= llbitmap_end_sync,
	.close_sync		= llbitmap_close_sync,
	.cond_end_sync		= llbitmap_cond_end_sync,

	.update_sb		= llbitmap_update_sb,
	.get_stats		= llbitmap_get_stats,
	.dirty_bits		= llbitmap_dirty_bits,
	.write_all		= llbitmap_write_all,

	.group			= &md_llbitmap_group,
};

int md_llbitmap_init(void)
{
	md_llbitmap_io_wq = alloc_workqueue("md_llbitmap_io",
					    WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
	if (!md_llbitmap_io_wq)
		return -ENOMEM;

	md_llbitmap_unplug_wq = alloc_workqueue("md_llbitmap_unplug",
						WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
	if (!md_llbitmap_unplug_wq) {
		destroy_workqueue(md_llbitmap_io_wq);
		md_llbitmap_io_wq = NULL;
		return -ENOMEM;
	}

	return register_md_submodule(&llbitmap_ops.head);
}

void md_llbitmap_exit(void)
{
	destroy_workqueue(md_llbitmap_io_wq);
	md_llbitmap_io_wq = NULL;
	destroy_workqueue(md_llbitmap_unplug_wq);
	md_llbitmap_unplug_wq = NULL;
	unregister_md_submodule(&llbitmap_ops.head);
}