xref: /linux/drivers/md/md-llbitmap.c (revision 07fdad3a93756b872da7b53647715c48d0f4a2d0)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 
3 #include <linux/blkdev.h>
4 #include <linux/module.h>
5 #include <linux/errno.h>
6 #include <linux/slab.h>
7 #include <linux/init.h>
8 #include <linux/timer.h>
9 #include <linux/sched.h>
10 #include <linux/list.h>
11 #include <linux/file.h>
12 #include <linux/seq_file.h>
13 #include <trace/events/block.h>
14 
15 #include "md.h"
16 #include "md-bitmap.h"
17 
18 /*
19  * #### Background
20  *
21  * Redundant data is used to enhance data fault tolerance, and the storage
22  * method for redundant data varies depending on the RAID level. It is
23  * important to maintain the consistency of redundant data.
24  *
25  * Bitmap is used to record which data blocks have been synchronized and which
26  * ones need to be resynchronized or recovered. Each bit in the bitmap
27  * represents a segment of data in the array. When a bit is set, it indicates
28  * that the multiple redundant copies of that data segment may not be
29  * consistent. Data synchronization can be performed based on the bitmap after
30  * a power failure or after re-adding a disk. If there is no bitmap, a full
31  * disk synchronization is required.
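 *
 * For example, with the fields defined in this file, the bit covering a given
 * data sector is found by a single shift (this mirrors llbitmap_start_write()
 * below):
 *
 *	unsigned long bit = offset >> llbitmap->chunkshift;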
32  *
33  * #### Key Features
34  *
35  *  - The IO fastpath is lockless: if the user issues lots of write IO to the
36  *  same bitmap bit in a short time, only the first write has the additional
37  *  overhead of updating the bitmap bit; the following writes have none;
38  *  - Only written data is resynced or recovered, meaning that when creating a
39  *  new array or replacing a disk with a new one, there is no need to do a
40  *  full disk resync/recovery;
41  *
42  * #### Key Concept
43  *
44  * ##### State Machine
45  *
46  * Each bit is one byte and contains 6 different states, see llbitmap_state.
47  * There are 8 different actions, see llbitmap_action, that can change state:
48  *
49  * llbitmap state machine: transitions between states
50  *
51  * |           | Startwrite | Startsync | Endsync | Abortsync|
52  * | --------- | ---------- | --------- | ------- | -------  |
53  * | Unwritten | Dirty      | x         | x       | x        |
54  * | Clean     | Dirty      | x         | x       | x        |
55  * | Dirty     | x          | x         | x       | x        |
56  * | NeedSync  | x          | Syncing   | x       | x        |
57  * | Syncing   | x          | Syncing   | Dirty   | NeedSync |
58  *
59  * |           | Reload   | Daemon | Discard   | Stale     |
60  * | --------- | -------- | ------ | --------- | --------- |
61  * | Unwritten | x        | x      | x         | x         |
62  * | Clean     | x        | x      | Unwritten | NeedSync  |
63  * | Dirty     | NeedSync | Clean  | Unwritten | NeedSync  |
64  * | NeedSync  | x        | x      | Unwritten | x         |
65  * | Syncing   | NeedSync | x      | Unwritten | NeedSync  |
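 *
 * In code, one transition is a single table lookup; this is roughly what
 * llbitmap_state_machine() below does per bit, ignoring the degraded-array
 * special cases (cur, action and bit name the current state, the action and
 * the bit index):
 *
 *	enum llbitmap_state next = state_machine[cur][action];
 *
 *	if (next != BitNone)
 *		llbitmap_write(llbitmap, next, bit);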
66  *
67  * Typical scenarios:
68  *
69  * 1) Create new array
70  * All bits will be set to Unwritten by default; if --assume-clean is set,
71  * all bits will be set to Clean instead.
72  *
73  * 2) write data; raid1/raid10 have a full copy of the data, while raid456
74  * doesn't and relies on xor data
75  *
76  * 2.1) write new data to raid1/raid10:
77  * Unwritten --StartWrite--> Dirty
78  *
79  * 2.2) write new data to raid456:
80  * Unwritten --StartWrite--> NeedSync
81  *
82  * Because the initial recovery for raid456 is skipped and the xor data is not
83  * built yet, the bit must be set to NeedSync first; after the lazy initial
84  * recovery is finished, the bit will finally be set to Dirty (see 5.1 and 5.4);
85  *
86  * 2.3) cover write (overwrite existing data)
87  * Clean --StartWrite--> Dirty
88  *
89  * 3) daemon, if the array is not degraded:
90  * Dirty --Daemon--> Clean
91  *
92  * 4) discard
93  * {Clean, Dirty, NeedSync, Syncing} --Discard--> Unwritten
94  *
95  * 5) resync and recover
96  *
97  * 5.1) common process
98  * NeedSync --Startsync--> Syncing --Endsync--> Dirty --Daemon--> Clean
99  *
100  * 5.2) resync after power failure
101  * Dirty --Reload--> NeedSync
102  *
103  * 5.3) recover while replacing a disk with a new one
104  * By default, the old bitmap framework will recover all data; llbitmap
105  * implements this with a new helper, see llbitmap_skip_sync_blocks:
106  *
107  * recovery is skipped for bits other than Dirty or Clean;
108  *
109  * 5.4) lazy initial recover for raid5:
110  * By default, the old bitmap framework will only allow a new recovery when
111  * there are spares (new disks); a new recovery flag MD_RECOVERY_LAZY_RECOVER
112  * is added to perform raid456 lazy recovery for the set bits (from 2.2).
113  *
114  * 6) special handling for degraded array:
115  *
116  * - Dirty bits will never be cleared, the daemon will just do nothing, so
117  *   that if a disk is re-added, Clean bits can be skipped during recovery;
118  * - Dirty bits will convert to Syncing on start sync, to do data recovery
119  *   for newly added disks;
120  * - New writes will convert bits to NeedSync directly;
121  *
122  * ##### Bitmap IO
123  *
124  * ##### Chunksize
125  *
126  * The default bitmap size is 128k, including the 1k bitmap super block, and
127  * the default size of the segment of data covered by each bit (the chunksize)
128  * is 64k. The chunksize is doubled each time the total number of bits would
129  * otherwise exceed 127k (see llbitmap_init).
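 *
 * As a rough worked example: a 1TiB array would need 16M bits at the minimum
 * 64k chunksize, which doesn't fit in roughly 127k one-byte bits, so the
 * chunksize keeps doubling until it reaches 16M, at which point only 64k
 * bits are needed.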
130  *
131  * ##### READ
132  *
133  * While creating the bitmap, all pages will be allocated and read for
134  * llbitmap; there will be no further reads afterwards.
135  *
136  * ##### WRITE
137  *
138  * WRITE IO is divided into blocks of the array's logical_block_size, and the
139  * dirty state of each block is tracked independently, for example:
140  *
141  * each page is 4k and contains 8 blocks; each block is 512 bytes and contains 512 bits;
142  *
143  * | page0 | page1 | ... | page 31 |
144  * |       |
145  * |        \-----------------------\
146  * |                                |
147  * | block0 | block1 | ... | block 7|
148  * |        |
149  * |         \-----------------\
150  * |                            |
151  * | bit0 | bit1 | ... | bit511 |
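 *
 * Locating the page, block and byte of one bit is plain arithmetic; this
 * mirrors llbitmap_read() and llbitmap_set_page_dirty() below:
 *
 *	pos += BITMAP_DATA_OFFSET;
 *	idx = pos >> PAGE_SHIFT;
 *	offset = offset_in_page(pos);
 *	block = offset / io_size;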
152  *
153  * In the IO path, if one bit is changed to Dirty or NeedSync, the corresponding
154  * subpage will be marked dirty, and that block must be written before the IO is
155  * issued. This behaviour will affect IO performance; to reduce the impact, if
156  * multiple bits are changed in the same block in a short time, all bits in this
157  * block will be changed to Dirty/NeedSync, so that there won't be any overhead
158  * until the daemon clears the dirty bits.
159  *
160  * ##### Dirty Bits synchronization
161  *
162  * The IO fast path will set bits to dirty, and those dirty bits will be
163  * cleared by the daemon after IO is done. llbitmap_page_ctl is used to
164  * synchronize between the IO path and the daemon;
165  *
166  * IO path:
167  *  1) try to grab a reference; on success, set the expire time 5s from now and return;
168  *  2) on failure, wait for the daemon to finish clearing dirty bits;
170  *
171  * Daemon (woken up every daemon_sleep seconds):
172  * For each page:
173  *  1) check if the page has expired, if not skip this page; for an expired page:
174  *  2) suspend the page and wait for inflight write IO to be done;
175  *  3) change the page's Dirty bits to Clean;
176  *  4) resume the page;
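 *
 * The synchronization itself is a per-page percpu_ref; a sketch of how the
 * helpers below pair up:
 *
 *	IO path					daemon
 *	llbitmap_raise_barrier()		llbitmap_suspend_timeout()
 *	  percpu_ref_tryget_live()		  percpu_ref_kill() + wait for zero
 *	llbitmap_release_barrier()		llbitmap_resume()
 *	  percpu_ref_put()			  percpu_ref_resurrect()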
177  */
178 
179 #define BITMAP_DATA_OFFSET 1024
180 
181 /* 64k is the max IO size of sync IO for raid1/raid10 */
182 #define MIN_CHUNK_SIZE (64 * 2)
183 
184 /* By default, daemon will be woken up every 30s */
185 #define DEFAULT_DAEMON_SLEEP 30
186 
187 /*
188  * Dirtied bits that have not been accessed for more than 5s will be cleared
189  * by daemon.
190  */
191 #define DEFAULT_BARRIER_IDLE 5
192 
193 enum llbitmap_state {
194 	/* No valid data, init state after assemble the array */
195 	BitUnwritten = 0,
196 	/* data is consistent */
197 	BitClean,
198 	/* data will be consistent after IO is done, set directly for writes */
199 	BitDirty,
200 	/*
201 	 * data need to be resynchronized:
202 	 * 1) set directly for writes if array is degraded, prevent full disk
203 	 * synchronization after readding a disk;
204 	 * 2) reassemble the array after power failure, and dirty bits are
205 	 * found after reloading the bitmap;
206 	 * 3) set for first write for raid5, to build initial xor data lazily
207 	 */
208 	BitNeedSync,
209 	/* data is synchronizing */
210 	BitSyncing,
211 	BitStateCount,
212 	BitNone = 0xff,
213 };
214 
215 enum llbitmap_action {
216 	/* User write new data, this is the only action from IO fast path */
217 	BitmapActionStartwrite = 0,
218 	/* Start recovery */
219 	BitmapActionStartsync,
220 	/* Finish recovery */
221 	BitmapActionEndsync,
222 	/* Failed recovery */
223 	BitmapActionAbortsync,
224 	/* Reassemble the array */
225 	BitmapActionReload,
226 	/* Daemon thread is trying to clear dirty bits */
227 	BitmapActionDaemon,
228 	/* Data is deleted */
229 	BitmapActionDiscard,
230 	/*
231 	 * Bitmap is stale, mark all bits other than BitUnwritten as
232 	 * BitNeedSync.
233 	 */
234 	BitmapActionStale,
235 	BitmapActionCount,
236 	/* Init state is BitUnwritten */
237 	BitmapActionInit,
238 };
239 
240 enum llbitmap_page_state {
241 	LLPageFlush = 0,
242 	LLPageDirty,
243 };
244 
245 struct llbitmap_page_ctl {
246 	char *state;
247 	struct page *page;
248 	unsigned long expire;
249 	unsigned long flags;
250 	wait_queue_head_t wait;
251 	struct percpu_ref active;
252 	/* Per-block dirty state; at most 64k page / 512-byte sector = 128 blocks */
253 	unsigned long dirty[];
254 };
255 
256 struct llbitmap {
257 	struct mddev *mddev;
258 	struct llbitmap_page_ctl **pctl;
259 
260 	unsigned int nr_pages;
261 	unsigned int io_size;
262 	unsigned int blocks_per_page;
263 
264 	/* shift of one chunk */
265 	unsigned long chunkshift;
266 	/* size of one chunk in sectors */
267 	unsigned long chunksize;
268 	/* total number of chunks */
269 	unsigned long chunks;
270 	unsigned long last_end_sync;
271 	/*
272 	 * time in seconds that dirty bits will be cleared if the page is not
273 	 * accessed.
274 	 */
275 	unsigned long barrier_idle;
276 	/* fires on first BitDirty state */
277 	struct timer_list pending_timer;
278 	struct work_struct daemon_work;
279 
280 	unsigned long flags;
281 	__u64	events_cleared;
282 
283 	/* for slow disks */
284 	atomic_t behind_writes;
285 	wait_queue_head_t behind_wait;
286 };
287 
288 struct llbitmap_unplug_work {
289 	struct work_struct work;
290 	struct llbitmap *llbitmap;
291 	struct completion *done;
292 };
293 
294 static struct workqueue_struct *md_llbitmap_io_wq;
295 static struct workqueue_struct *md_llbitmap_unplug_wq;
296 
297 static char state_machine[BitStateCount][BitmapActionCount] = {
298 	[BitUnwritten] = {
299 		[BitmapActionStartwrite]	= BitDirty,
300 		[BitmapActionStartsync]		= BitNone,
301 		[BitmapActionEndsync]		= BitNone,
302 		[BitmapActionAbortsync]		= BitNone,
303 		[BitmapActionReload]		= BitNone,
304 		[BitmapActionDaemon]		= BitNone,
305 		[BitmapActionDiscard]		= BitNone,
306 		[BitmapActionStale]		= BitNone,
307 	},
308 	[BitClean] = {
309 		[BitmapActionStartwrite]	= BitDirty,
310 		[BitmapActionStartsync]		= BitNone,
311 		[BitmapActionEndsync]		= BitNone,
312 		[BitmapActionAbortsync]		= BitNone,
313 		[BitmapActionReload]		= BitNone,
314 		[BitmapActionDaemon]		= BitNone,
315 		[BitmapActionDiscard]		= BitUnwritten,
316 		[BitmapActionStale]		= BitNeedSync,
317 	},
318 	[BitDirty] = {
319 		[BitmapActionStartwrite]	= BitNone,
320 		[BitmapActionStartsync]		= BitNone,
321 		[BitmapActionEndsync]		= BitNone,
322 		[BitmapActionAbortsync]		= BitNone,
323 		[BitmapActionReload]		= BitNeedSync,
324 		[BitmapActionDaemon]		= BitClean,
325 		[BitmapActionDiscard]		= BitUnwritten,
326 		[BitmapActionStale]		= BitNeedSync,
327 	},
328 	[BitNeedSync] = {
329 		[BitmapActionStartwrite]	= BitNone,
330 		[BitmapActionStartsync]		= BitSyncing,
331 		[BitmapActionEndsync]		= BitNone,
332 		[BitmapActionAbortsync]		= BitNone,
333 		[BitmapActionReload]		= BitNone,
334 		[BitmapActionDaemon]		= BitNone,
335 		[BitmapActionDiscard]		= BitUnwritten,
336 		[BitmapActionStale]		= BitNone,
337 	},
338 	[BitSyncing] = {
339 		[BitmapActionStartwrite]	= BitNone,
340 		[BitmapActionStartsync]		= BitSyncing,
341 		[BitmapActionEndsync]		= BitDirty,
342 		[BitmapActionAbortsync]		= BitNeedSync,
343 		[BitmapActionReload]		= BitNeedSync,
344 		[BitmapActionDaemon]		= BitNone,
345 		[BitmapActionDiscard]		= BitUnwritten,
346 		[BitmapActionStale]		= BitNeedSync,
347 	},
348 };
349 
350 static void __llbitmap_flush(struct mddev *mddev);
351 
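/* Return the state byte of one bit; @pos is the bit (chunk) index. */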
352 static enum llbitmap_state llbitmap_read(struct llbitmap *llbitmap, loff_t pos)
353 {
354 	unsigned int idx;
355 	unsigned int offset;
356 
357 	pos += BITMAP_DATA_OFFSET;
358 	idx = pos >> PAGE_SHIFT;
359 	offset = offset_in_page(pos);
360 
361 	return llbitmap->pctl[idx]->state[offset];
362 }
363 
364 /* set all the bits in the subpage as dirty */
365 static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap,
366 				       struct llbitmap_page_ctl *pctl,
367 				       unsigned int block)
368 {
369 	bool level_456 = raid_is_456(llbitmap->mddev);
370 	unsigned int io_size = llbitmap->io_size;
371 	int pos;
372 
373 	for (pos = block * io_size; pos < (block + 1) * io_size; pos++) {
374 		switch (pctl->state[pos]) {
375 		case BitUnwritten:
376 			pctl->state[pos] = level_456 ? BitNeedSync : BitDirty;
377 			break;
378 		case BitClean:
379 			pctl->state[pos] = BitDirty;
380 			break;
381 		}
382 	}
383 }
384 
385 static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx,
386 				    int offset)
387 {
388 	struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];
389 	unsigned int io_size = llbitmap->io_size;
390 	int block = offset / io_size;
391 	int pos;
392 
393 	if (!test_bit(LLPageDirty, &pctl->flags))
394 		set_bit(LLPageDirty, &pctl->flags);
395 
396 	/*
397 	 * For a degraded array, dirty bits will never be cleared, and we must
398 	 * resync all the dirty bits, hence skip infecting new dirty bits to
399 	 * avoid resyncing unnecessary data.
400 	 */
401 	if (llbitmap->mddev->degraded) {
402 		set_bit(block, pctl->dirty);
403 		return;
404 	}
405 
406 	/*
407 	 * The subpage usually contains a total of 512 bits. If any single bit
408 	 * within the subpage is marked as dirty, the entire sector will be
409 	 * written. To avoid impacting write performance, when multiple bits
410 	 * within the same sector are modified within llbitmap->barrier_idle,
411 	 * all bits in the sector will be collectively marked as dirty at once.
412 	 */
413 	if (test_and_set_bit(block, pctl->dirty)) {
414 		llbitmap_infect_dirty_bits(llbitmap, pctl, block);
415 		return;
416 	}
417 
418 	for (pos = block * io_size; pos < (block + 1) * io_size; pos++) {
419 		if (pos == offset)
420 			continue;
421 		if (pctl->state[pos] == BitDirty ||
422 		    pctl->state[pos] == BitNeedSync) {
423 			llbitmap_infect_dirty_bits(llbitmap, pctl, block);
424 			return;
425 		}
426 	}
427 }
428 
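/* Set the state byte of one bit, marking its block dirty for Dirty/NeedSync. */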
429 static void llbitmap_write(struct llbitmap *llbitmap, enum llbitmap_state state,
430 			   loff_t pos)
431 {
432 	unsigned int idx;
433 	unsigned int bit;
434 
435 	pos += BITMAP_DATA_OFFSET;
436 	idx = pos >> PAGE_SHIFT;
437 	bit = offset_in_page(pos);
438 
439 	llbitmap->pctl[idx]->state[bit] = state;
440 	if (state == BitDirty || state == BitNeedSync)
441 		llbitmap_set_page_dirty(llbitmap, idx, bit);
442 }
443 
444 static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx)
445 {
446 	struct mddev *mddev = llbitmap->mddev;
447 	struct page *page = NULL;
448 	struct md_rdev *rdev;
449 
450 	if (llbitmap->pctl && llbitmap->pctl[idx])
451 		page = llbitmap->pctl[idx]->page;
452 	if (page)
453 		return page;
454 
455 	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
456 	if (!page)
457 		return ERR_PTR(-ENOMEM);
458 
459 	rdev_for_each(rdev, mddev) {
460 		sector_t sector;
461 
462 		if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
463 			continue;
464 
465 		sector = mddev->bitmap_info.offset +
466 			 (idx << PAGE_SECTORS_SHIFT);
467 
468 		if (sync_page_io(rdev, sector, PAGE_SIZE, page, REQ_OP_READ,
469 				 true))
470 			return page;
471 
472 		md_error(mddev, rdev);
473 	}
474 
475 	__free_page(page);
476 	return ERR_PTR(-EIO);
477 }
478 
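/* Write all dirty blocks of page @idx to the bitmap area of each active rdev. */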
479 static void llbitmap_write_page(struct llbitmap *llbitmap, int idx)
480 {
481 	struct page *page = llbitmap->pctl[idx]->page;
482 	struct mddev *mddev = llbitmap->mddev;
483 	struct md_rdev *rdev;
484 	int block;
485 
486 	for (block = 0; block < llbitmap->blocks_per_page; block++) {
487 		struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];
488 
489 		if (!test_and_clear_bit(block, pctl->dirty))
490 			continue;
491 
492 		rdev_for_each(rdev, mddev) {
493 			sector_t sector;
494 			sector_t bit_sector = llbitmap->io_size >> SECTOR_SHIFT;
495 
496 			if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
497 				continue;
498 
499 			sector = mddev->bitmap_info.offset + rdev->sb_start +
500 				 (idx << PAGE_SECTORS_SHIFT) +
501 				 block * bit_sector;
502 			md_write_metadata(mddev, rdev, sector,
503 					  llbitmap->io_size, page,
504 					  block * llbitmap->io_size);
505 		}
506 	}
507 }
508 
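/* percpu_ref release callback: all inflight writers are done, wake up waiters. */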
509 static void active_release(struct percpu_ref *ref)
510 {
511 	struct llbitmap_page_ctl *pctl =
512 		container_of(ref, struct llbitmap_page_ctl, active);
513 
514 	wake_up(&pctl->wait);
515 }
516 
517 static void llbitmap_free_pages(struct llbitmap *llbitmap)
518 {
519 	int i;
520 
521 	if (!llbitmap->pctl)
522 		return;
523 
524 	for (i = 0; i < llbitmap->nr_pages; i++) {
525 		struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];
526 
527 		if (!pctl || !pctl->page)
528 			break;
529 
530 		__free_page(pctl->page);
531 		percpu_ref_exit(&pctl->active);
532 	}
533 
534 	kfree(llbitmap->pctl[0]);
535 	kfree(llbitmap->pctl);
536 	llbitmap->pctl = NULL;
537 }
538 
539 static int llbitmap_cache_pages(struct llbitmap *llbitmap)
540 {
541 	struct llbitmap_page_ctl *pctl;
542 	unsigned int nr_pages = DIV_ROUND_UP(llbitmap->chunks +
543 					     BITMAP_DATA_OFFSET, PAGE_SIZE);
544 	unsigned int size = struct_size(pctl, dirty, BITS_TO_LONGS(
545 						llbitmap->blocks_per_page));
546 	int i;
547 
548 	llbitmap->pctl = kmalloc_array(nr_pages, sizeof(void *),
549 				       GFP_KERNEL | __GFP_ZERO);
550 	if (!llbitmap->pctl)
551 		return -ENOMEM;
552 
553 	size = round_up(size, cache_line_size());
554 	pctl = kmalloc_array(nr_pages, size, GFP_KERNEL | __GFP_ZERO);
555 	if (!pctl) {
556 		kfree(llbitmap->pctl);
557 		return -ENOMEM;
558 	}
559 
560 	llbitmap->nr_pages = nr_pages;
561 
562 	for (i = 0; i < nr_pages; i++, pctl = (void *)pctl + size) {
563 		struct page *page = llbitmap_read_page(llbitmap, i);
564 
565 		llbitmap->pctl[i] = pctl;
566 
567 		if (IS_ERR(page)) {
568 			llbitmap_free_pages(llbitmap);
569 			return PTR_ERR(page);
570 		}
571 
572 		if (percpu_ref_init(&pctl->active, active_release,
573 				    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
574 			__free_page(page);
575 			llbitmap_free_pages(llbitmap);
576 			return -ENOMEM;
577 		}
578 
579 		pctl->page = page;
580 		pctl->state = page_address(page);
581 		init_waitqueue_head(&pctl->wait);
582 	}
583 
584 	return 0;
585 }
586 
587 static void llbitmap_init_state(struct llbitmap *llbitmap)
588 {
589 	enum llbitmap_state state = BitUnwritten;
590 	unsigned long i;
591 
592 	if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags))
593 		state = BitClean;
594 
595 	for (i = 0; i < llbitmap->chunks; i++)
596 		llbitmap_write(llbitmap, state, i);
597 }
598 
599 /* The return value is only used from resync, where @start == @end. */
600 static enum llbitmap_state llbitmap_state_machine(struct llbitmap *llbitmap,
601 						  unsigned long start,
602 						  unsigned long end,
603 						  enum llbitmap_action action)
604 {
605 	struct mddev *mddev = llbitmap->mddev;
606 	enum llbitmap_state state = BitNone;
607 	bool level_456 = raid_is_456(llbitmap->mddev);
608 	bool need_resync = false;
609 	bool need_recovery = false;
610 
611 	if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags))
612 		return BitNone;
613 
614 	if (action == BitmapActionInit) {
615 		llbitmap_init_state(llbitmap);
616 		return BitNone;
617 	}
618 
619 	while (start <= end) {
620 		enum llbitmap_state c = llbitmap_read(llbitmap, start);
621 
622 		if (c < 0 || c >= BitStateCount) {
623 			pr_err("%s: invalid bit %lu state %d action %d, forcing resync\n",
624 			       __func__, start, c, action);
625 			state = BitNeedSync;
626 			goto write_bitmap;
627 		}
628 
629 		if (c == BitNeedSync)
630 			need_resync = !mddev->degraded;
631 
632 		state = state_machine[c][action];
633 
634 write_bitmap:
635 		if (unlikely(mddev->degraded)) {
636 			/* For degraded array, mark new data as need sync. */
637 			if (state == BitDirty &&
638 			    action == BitmapActionStartwrite)
639 				state = BitNeedSync;
640 			/*
641 			 * For degraded array, resync dirty data as well, noted
642 			 * if array is still degraded after resync is done, all
643 			 * new data will still be dirty until array is clean.
644 			 */
645 			else if (c == BitDirty &&
646 				action == BitmapActionStartsync)
647 				state = BitSyncing;
648 		} else if (c == BitUnwritten && state == BitDirty &&
649 			   action == BitmapActionStartwrite && level_456) {
650 			/* Delay raid456 initial recovery to first write. */
651 			state = BitNeedSync;
652 		}
653 
654 		if (state == BitNone) {
655 			start++;
656 			continue;
657 		}
658 
659 		llbitmap_write(llbitmap, state, start);
660 
661 		if (state == BitNeedSync)
662 			need_resync = !mddev->degraded;
663 		else if (state == BitDirty &&
664 			 !timer_pending(&llbitmap->pending_timer))
665 			mod_timer(&llbitmap->pending_timer,
666 				  jiffies + mddev->bitmap_info.daemon_sleep * HZ);
667 
668 		start++;
669 	}
670 
671 	if (need_resync && level_456)
672 		need_recovery = true;
673 
674 	if (need_recovery) {
675 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
676 		set_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery);
677 		md_wakeup_thread(mddev->thread);
678 	} else if (need_resync) {
679 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
680 		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
681 		md_wakeup_thread(mddev->thread);
682 	}
683 
684 	return state;
685 }
686 
687 static void llbitmap_raise_barrier(struct llbitmap *llbitmap, int page_idx)
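/*
 * Pin the page against daemon suspension while write IO is inflight, and
 * refresh its expire time (see "Dirty Bits synchronization" above).
 */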
688 {
689 	struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];
690 
691 retry:
692 	if (likely(percpu_ref_tryget_live(&pctl->active))) {
693 		WRITE_ONCE(pctl->expire, jiffies + llbitmap->barrier_idle * HZ);
694 		return;
695 	}
696 
697 	wait_event(pctl->wait, !percpu_ref_is_dying(&pctl->active));
698 	goto retry;
699 }
700 
701 static void llbitmap_release_barrier(struct llbitmap *llbitmap, int page_idx)
702 {
703 	struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];
704 
705 	percpu_ref_put(&pctl->active);
706 }
707 
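/*
 * Kill the active reference and wait for inflight writers to drain; returns
 * -ETIMEDOUT if they don't finish within daemon_sleep seconds.
 */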
708 static int llbitmap_suspend_timeout(struct llbitmap *llbitmap, int page_idx)
709 {
710 	struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];
711 
712 	percpu_ref_kill(&pctl->active);
713 
714 	if (!wait_event_timeout(pctl->wait, percpu_ref_is_zero(&pctl->active),
715 			llbitmap->mddev->bitmap_info.daemon_sleep * HZ))
716 		return -ETIMEDOUT;
717 
718 	return 0;
719 }
720 
721 static void llbitmap_resume(struct llbitmap *llbitmap, int page_idx)
722 {
723 	struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];
724 
725 	pctl->expire = LONG_MAX;
726 	percpu_ref_resurrect(&pctl->active);
727 	wake_up(&pctl->wait);
728 }
729 
730 static int llbitmap_check_support(struct mddev *mddev)
731 {
732 	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
733 		pr_notice("md/llbitmap: %s: array with journal cannot have bitmap\n",
734 			  mdname(mddev));
735 		return -EBUSY;
736 	}
737 
738 	if (mddev->bitmap_info.space == 0) {
739 		if (mddev->bitmap_info.default_space == 0) {
740 			pr_notice("md/llbitmap: %s: no space for bitmap\n",
741 				  mdname(mddev));
742 			return -ENOSPC;
743 		}
744 	}
745 
746 	if (!mddev->persistent) {
747 		pr_notice("md/llbitmap: %s: array must be persistent\n",
748 			  mdname(mddev));
749 		return -EOPNOTSUPP;
750 	}
751 
752 	if (mddev->bitmap_info.file) {
753 		pr_notice("md/llbitmap: %s: doesn't support bitmap file\n",
754 			  mdname(mddev));
755 		return -EOPNOTSUPP;
756 	}
757 
758 	if (mddev->bitmap_info.external) {
759 		pr_notice("md/llbitmap: %s: doesn't support external metadata\n",
760 			  mdname(mddev));
761 		return -EOPNOTSUPP;
762 	}
763 
764 	if (mddev_is_dm(mddev)) {
765 		pr_notice("md/llbitmap: %s: doesn't support dm-raid\n",
766 			  mdname(mddev));
767 		return -EOPNOTSUPP;
768 	}
769 
770 	return 0;
771 }
772 
773 static int llbitmap_init(struct llbitmap *llbitmap)
774 {
775 	struct mddev *mddev = llbitmap->mddev;
776 	sector_t blocks = mddev->resync_max_sectors;
777 	unsigned long chunksize = MIN_CHUNK_SIZE;
778 	unsigned long chunks = DIV_ROUND_UP(blocks, chunksize);
779 	unsigned long space = mddev->bitmap_info.space << SECTOR_SHIFT;
780 	int ret;
781 
782 	while (chunks > space) {
783 		chunksize = chunksize << 1;
784 		chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
785 	}
786 
787 	llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE;
788 	llbitmap->chunkshift = ffz(~chunksize);
789 	llbitmap->chunksize = chunksize;
790 	llbitmap->chunks = chunks;
791 	mddev->bitmap_info.daemon_sleep = DEFAULT_DAEMON_SLEEP;
792 
793 	ret = llbitmap_cache_pages(llbitmap);
794 	if (ret)
795 		return ret;
796 
797 	llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
798 			       BitmapActionInit);
799 	/* flush initial llbitmap to disk */
800 	__llbitmap_flush(mddev);
801 
802 	return 0;
803 }
804 
805 static int llbitmap_read_sb(struct llbitmap *llbitmap)
806 {
807 	struct mddev *mddev = llbitmap->mddev;
808 	unsigned long daemon_sleep;
809 	unsigned long chunksize;
810 	unsigned long events;
811 	struct page *sb_page;
812 	bitmap_super_t *sb;
813 	int ret = -EINVAL;
814 
815 	if (!mddev->bitmap_info.offset) {
816 		pr_err("md/llbitmap: %s: no super block found\n", mdname(mddev));
817 		return -EINVAL;
818 	}
819 
820 	sb_page = llbitmap_read_page(llbitmap, 0);
821 	if (IS_ERR(sb_page)) {
822 		pr_err("md/llbitmap: %s: read super block failed\n",
823 		       mdname(mddev));
824 		return -EIO;
825 	}
826 
827 	sb = kmap_local_page(sb_page);
828 	if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) {
829 		pr_err("md/llbitmap: %s: invalid super block magic number\n",
830 		       mdname(mddev));
831 		goto out_put_page;
832 	}
833 
834 	if (sb->version != cpu_to_le32(BITMAP_MAJOR_LOCKLESS)) {
835 		pr_err("md/llbitmap: %s: invalid super block version\n",
836 		       mdname(mddev));
837 		goto out_put_page;
838 	}
839 
840 	if (memcmp(sb->uuid, mddev->uuid, 16)) {
841 		pr_err("md/llbitmap: %s: bitmap superblock UUID mismatch\n",
842 		       mdname(mddev));
843 		goto out_put_page;
844 	}
845 
846 	if (mddev->bitmap_info.space == 0) {
847 		int room = le32_to_cpu(sb->sectors_reserved);
848 
849 		if (room)
850 			mddev->bitmap_info.space = room;
851 		else
852 			mddev->bitmap_info.space = mddev->bitmap_info.default_space;
853 	}
854 	llbitmap->flags = le32_to_cpu(sb->state);
855 	if (test_and_clear_bit(BITMAP_FIRST_USE, &llbitmap->flags)) {
856 		ret = llbitmap_init(llbitmap);
857 		goto out_put_page;
858 	}
859 
860 	chunksize = le32_to_cpu(sb->chunksize);
861 	if (!is_power_of_2(chunksize)) {
862 		pr_err("md/llbitmap: %s: chunksize not a power of 2\n",
863 		       mdname(mddev));
864 		goto out_put_page;
865 	}
866 
867 	if (chunksize < DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors,
868 					      mddev->bitmap_info.space << SECTOR_SHIFT)) {
869 		pr_err("md/llbitmap: %s: chunksize too small %lu < %llu / %lu\n",
870 		       mdname(mddev), chunksize, mddev->resync_max_sectors,
871 		       mddev->bitmap_info.space);
872 		goto out_put_page;
873 	}
874 
875 	daemon_sleep = le32_to_cpu(sb->daemon_sleep);
876 	if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT / HZ) {
877 		pr_err("md/llbitmap: %s: daemon sleep %lu period out of range\n",
878 		       mdname(mddev), daemon_sleep);
879 		goto out_put_page;
880 	}
881 
882 	events = le64_to_cpu(sb->events);
883 	if (events < mddev->events) {
884 		pr_warn("md/llbitmap: %s: bitmap file is out of date (%lu < %llu) -- forcing full recovery\n",
885 			mdname(mddev), events, mddev->events);
886 		set_bit(BITMAP_STALE, &llbitmap->flags);
887 	}
888 
889 	sb->sync_size = cpu_to_le64(mddev->resync_max_sectors);
890 	mddev->bitmap_info.chunksize = chunksize;
891 	mddev->bitmap_info.daemon_sleep = daemon_sleep;
892 
893 	llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE;
894 	llbitmap->chunksize = chunksize;
895 	llbitmap->chunks = DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors, chunksize);
896 	llbitmap->chunkshift = ffz(~chunksize);
897 	ret = llbitmap_cache_pages(llbitmap);
898 
899 out_put_page:
900 	kunmap_local(sb);
901 	__free_page(sb_page);
902 	return ret;
903 }
904 
905 static void llbitmap_pending_timer_fn(struct timer_list *pending_timer)
906 {
907 	struct llbitmap *llbitmap =
908 		container_of(pending_timer, struct llbitmap, pending_timer);
909 
910 	if (work_busy(&llbitmap->daemon_work)) {
911 		pr_warn("md/llbitmap: %s daemon_work not finished in %lu seconds\n",
912 			mdname(llbitmap->mddev),
913 			llbitmap->mddev->bitmap_info.daemon_sleep);
914 		set_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags);
915 		return;
916 	}
917 
918 	queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work);
919 }
920 
921 static void md_llbitmap_daemon_fn(struct work_struct *work)
922 {
923 	struct llbitmap *llbitmap =
924 		container_of(work, struct llbitmap, daemon_work);
925 	unsigned long start;
926 	unsigned long end;
927 	bool restart;
928 	int idx;
929 
930 	if (llbitmap->mddev->degraded)
931 		return;
932 retry:
933 	start = 0;
934 	end = min(llbitmap->chunks, PAGE_SIZE - BITMAP_DATA_OFFSET) - 1;
935 	restart = false;
936 
937 	for (idx = 0; idx < llbitmap->nr_pages; idx++) {
938 		struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];
939 
940 		if (idx > 0) {
941 			start = end + 1;
942 			end = min(end + PAGE_SIZE, llbitmap->chunks - 1);
943 		}
944 
945 		if (!test_bit(LLPageFlush, &pctl->flags) &&
946 		    time_before(jiffies, pctl->expire)) {
947 			restart = true;
948 			continue;
949 		}
950 
951 		if (llbitmap_suspend_timeout(llbitmap, idx) < 0) {
952 			pr_warn("md/llbitmap: %s: %s waiting for page %d timeout\n",
953 				mdname(llbitmap->mddev), __func__, idx);
954 			continue;
955 		}
956 
957 		llbitmap_state_machine(llbitmap, start, end, BitmapActionDaemon);
958 		llbitmap_resume(llbitmap, idx);
959 	}
960 
961 	/*
962 	 * If the daemon took a long time to finish, retry to prevent missing
963 	 * clearing dirty bits.
964 	 */
965 	if (test_and_clear_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags))
966 		goto retry;
967 
968 	/* If some page is dirty but not expired, setup timer again */
969 	if (restart)
970 		mod_timer(&llbitmap->pending_timer,
971 			  jiffies + llbitmap->mddev->bitmap_info.daemon_sleep * HZ);
972 }
973 
974 static int llbitmap_create(struct mddev *mddev)
975 {
976 	struct llbitmap *llbitmap;
977 	int ret;
978 
979 	ret = llbitmap_check_support(mddev);
980 	if (ret)
981 		return ret;
982 
983 	llbitmap = kzalloc(sizeof(*llbitmap), GFP_KERNEL);
984 	if (!llbitmap)
985 		return -ENOMEM;
986 
987 	llbitmap->mddev = mddev;
988 	llbitmap->io_size = bdev_logical_block_size(mddev->gendisk->part0);
989 	llbitmap->blocks_per_page = PAGE_SIZE / llbitmap->io_size;
990 
991 	timer_setup(&llbitmap->pending_timer, llbitmap_pending_timer_fn, 0);
992 	INIT_WORK(&llbitmap->daemon_work, md_llbitmap_daemon_fn);
993 	atomic_set(&llbitmap->behind_writes, 0);
994 	init_waitqueue_head(&llbitmap->behind_wait);
995 
996 	mutex_lock(&mddev->bitmap_info.mutex);
997 	mddev->bitmap = llbitmap;
998 	ret = llbitmap_read_sb(llbitmap);
999 	mutex_unlock(&mddev->bitmap_info.mutex);
1000 	if (ret) {
1001 		kfree(llbitmap);
1002 		mddev->bitmap = NULL;
1003 	}
1004 
1005 	return ret;
1006 }
1007 
1008 static int llbitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize)
1009 {
1010 	struct llbitmap *llbitmap = mddev->bitmap;
1011 	unsigned long chunks;
1012 
1013 	if (chunksize == 0)
1014 		chunksize = llbitmap->chunksize;
1015 
1016 	/* If there is enough space, leave the chunksize unchanged. */
1017 	chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
1018 	while (chunks > mddev->bitmap_info.space << SECTOR_SHIFT) {
1019 		chunksize = chunksize << 1;
1020 		chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
1021 	}
1022 
1023 	llbitmap->chunkshift = ffz(~chunksize);
1024 	llbitmap->chunksize = chunksize;
1025 	llbitmap->chunks = chunks;
1026 
1027 	return 0;
1028 }
1029 
1030 static int llbitmap_load(struct mddev *mddev)
1031 {
1032 	enum llbitmap_action action = BitmapActionReload;
1033 	struct llbitmap *llbitmap = mddev->bitmap;
1034 
1035 	if (test_and_clear_bit(BITMAP_STALE, &llbitmap->flags))
1036 		action = BitmapActionStale;
1037 
1038 	llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, action);
1039 	return 0;
1040 }
1041 
1042 static void llbitmap_destroy(struct mddev *mddev)
1043 {
1044 	struct llbitmap *llbitmap = mddev->bitmap;
1045 
1046 	if (!llbitmap)
1047 		return;
1048 
1049 	mutex_lock(&mddev->bitmap_info.mutex);
1050 
1051 	timer_delete_sync(&llbitmap->pending_timer);
1052 	flush_workqueue(md_llbitmap_io_wq);
1053 	flush_workqueue(md_llbitmap_unplug_wq);
1054 
1055 	mddev->bitmap = NULL;
1056 	llbitmap_free_pages(llbitmap);
1057 	kfree(llbitmap);
1058 	mutex_unlock(&mddev->bitmap_info.mutex);
1059 }
1060 
1061 static void llbitmap_start_write(struct mddev *mddev, sector_t offset,
1062 				 unsigned long sectors)
1063 {
1064 	struct llbitmap *llbitmap = mddev->bitmap;
1065 	unsigned long start = offset >> llbitmap->chunkshift;
1066 	unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
1067 	int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
1068 	int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
1069 
1070 	llbitmap_state_machine(llbitmap, start, end, BitmapActionStartwrite);
1071 
1072 	while (page_start <= page_end) {
1073 		llbitmap_raise_barrier(llbitmap, page_start);
1074 		page_start++;
1075 	}
1076 }
1077 
1078 static void llbitmap_end_write(struct mddev *mddev, sector_t offset,
1079 			       unsigned long sectors)
1080 {
1081 	struct llbitmap *llbitmap = mddev->bitmap;
1082 	unsigned long start = offset >> llbitmap->chunkshift;
1083 	unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
1084 	int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
1085 	int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
1086 
1087 	while (page_start <= page_end) {
1088 		llbitmap_release_barrier(llbitmap, page_start);
1089 		page_start++;
1090 	}
1091 }
1092 
1093 static void llbitmap_start_discard(struct mddev *mddev, sector_t offset,
1094 				   unsigned long sectors)
1095 {
1096 	struct llbitmap *llbitmap = mddev->bitmap;
1097 	unsigned long start = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize);
1098 	unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
1099 	int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
1100 	int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
1101 
1102 	llbitmap_state_machine(llbitmap, start, end, BitmapActionDiscard);
1103 
1104 	while (page_start <= page_end) {
1105 		llbitmap_raise_barrier(llbitmap, page_start);
1106 		page_start++;
1107 	}
1108 }
1109 
1110 static void llbitmap_end_discard(struct mddev *mddev, sector_t offset,
1111 				 unsigned long sectors)
1112 {
1113 	struct llbitmap *llbitmap = mddev->bitmap;
1114 	unsigned long start = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize);
1115 	unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
1116 	int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
1117 	int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
1118 
1119 	while (page_start <= page_end) {
1120 		llbitmap_release_barrier(llbitmap, page_start);
1121 		page_start++;
1122 	}
1123 }
1124 
1125 static void llbitmap_unplug_fn(struct work_struct *work)
1126 {
1127 	struct llbitmap_unplug_work *unplug_work =
1128 		container_of(work, struct llbitmap_unplug_work, work);
1129 	struct llbitmap *llbitmap = unplug_work->llbitmap;
1130 	struct blk_plug plug;
1131 	int i;
1132 
1133 	blk_start_plug(&plug);
1134 
1135 	for (i = 0; i < llbitmap->nr_pages; i++) {
1136 		if (!test_bit(LLPageDirty, &llbitmap->pctl[i]->flags) ||
1137 		    !test_and_clear_bit(LLPageDirty, &llbitmap->pctl[i]->flags))
1138 			continue;
1139 
1140 		llbitmap_write_page(llbitmap, i);
1141 	}
1142 
1143 	blk_finish_plug(&plug);
1144 	md_super_wait(llbitmap->mddev);
1145 	complete(unplug_work->done);
1146 }
1147 
1148 static bool llbitmap_dirty(struct llbitmap *llbitmap)
1149 {
1150 	int i;
1151 
1152 	for (i = 0; i < llbitmap->nr_pages; i++)
1153 		if (test_bit(LLPageDirty, &llbitmap->pctl[i]->flags))
1154 			return true;
1155 
1156 	return false;
1157 }
1158 
1159 static void llbitmap_unplug(struct mddev *mddev, bool sync)
1160 {
1161 	DECLARE_COMPLETION_ONSTACK(done);
1162 	struct llbitmap *llbitmap = mddev->bitmap;
1163 	struct llbitmap_unplug_work unplug_work = {
1164 		.llbitmap = llbitmap,
1165 		.done = &done,
1166 	};
1167 
1168 	if (!llbitmap_dirty(llbitmap))
1169 		return;
1170 
1171 	/*
1172 	 * Issuing new bitmap IO under the submit_bio() context will deadlock:
1173 	 *  - the bio will wait for the bitmap bio to be done before it can be
1174 	 *  issued;
1175 	 *  - the bitmap bio will be added to current->bio_list and wait for this
1176 	 *  bio to be issued;
1177 	 */
1178 	INIT_WORK_ONSTACK(&unplug_work.work, llbitmap_unplug_fn);
1179 	queue_work(md_llbitmap_unplug_wq, &unplug_work.work);
1180 	wait_for_completion(&done);
1181 	destroy_work_on_stack(&unplug_work.work);
1182 }
1183 
1184 /*
1185  * Force writing all bitmap pages to disk; called when stopping the array, or
1186  * every daemon_sleep seconds when the sync_thread is running.
1187  */
1188 static void __llbitmap_flush(struct mddev *mddev)
1189 {
1190 	struct llbitmap *llbitmap = mddev->bitmap;
1191 	struct blk_plug plug;
1192 	int i;
1193 
1194 	blk_start_plug(&plug);
1195 	for (i = 0; i < llbitmap->nr_pages; i++) {
1196 		struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];
1197 
1198 		/* mark all blocks as dirty */
1199 		set_bit(LLPageDirty, &pctl->flags);
1200 		bitmap_fill(pctl->dirty, llbitmap->blocks_per_page);
1201 		llbitmap_write_page(llbitmap, i);
1202 	}
1203 	blk_finish_plug(&plug);
1204 	md_super_wait(llbitmap->mddev);
1205 }
1206 
1207 static void llbitmap_flush(struct mddev *mddev)
1208 {
1209 	struct llbitmap *llbitmap = mddev->bitmap;
1210 	int i;
1211 
1212 	for (i = 0; i < llbitmap->nr_pages; i++)
1213 		set_bit(LLPageFlush, &llbitmap->pctl[i]->flags);
1214 
1215 	timer_delete_sync(&llbitmap->pending_timer);
1216 	queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work);
1217 	flush_work(&llbitmap->daemon_work);
1218 
1219 	__llbitmap_flush(mddev);
1220 }
1221 
1222 /* This is used for raid5 lazy initial recovery */
1223 static bool llbitmap_blocks_synced(struct mddev *mddev, sector_t offset)
1224 {
1225 	struct llbitmap *llbitmap = mddev->bitmap;
1226 	unsigned long p = offset >> llbitmap->chunkshift;
1227 	enum llbitmap_state c = llbitmap_read(llbitmap, p);
1228 
1229 	return c == BitClean || c == BitDirty;
1230 }
1231 
1232 static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset)
1233 {
1234 	struct llbitmap *llbitmap = mddev->bitmap;
1235 	unsigned long p = offset >> llbitmap->chunkshift;
1236 	int blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
1237 	enum llbitmap_state c = llbitmap_read(llbitmap, p);
1238 
1239 	/* always skip unwritten blocks */
1240 	if (c == BitUnwritten)
1241 		return blocks;
1242 
1243 	/* For degraded array, don't skip */
1244 	if (mddev->degraded)
1245 		return 0;
1246 
1247 	/* For resync also skip clean/dirty blocks */
1248 	if ((c == BitClean || c == BitDirty) &&
1249 	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
1250 	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
1251 		return blocks;
1252 
1253 	return 0;
1254 }
1255 
1256 static bool llbitmap_start_sync(struct mddev *mddev, sector_t offset,
1257 				sector_t *blocks, bool degraded)
1258 {
1259 	struct llbitmap *llbitmap = mddev->bitmap;
1260 	unsigned long p = offset >> llbitmap->chunkshift;
1261 
1262 	/*
1263 	 * Handle one bit at a time; this is much simpler, and it doesn't matter
1264 	 * if md_do_sync() loops more times.
1265 	 */
1266 	*blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
1267 	return llbitmap_state_machine(llbitmap, p, p,
1268 				      BitmapActionStartsync) == BitSyncing;
1269 }
1270 
1271 /* Something went wrong, sync_thread stopped at @offset */
1272 static void llbitmap_end_sync(struct mddev *mddev, sector_t offset,
1273 			      sector_t *blocks)
1274 {
1275 	struct llbitmap *llbitmap = mddev->bitmap;
1276 	unsigned long p = offset >> llbitmap->chunkshift;
1277 
1278 	*blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
1279 	llbitmap_state_machine(llbitmap, p, llbitmap->chunks - 1,
1280 			       BitmapActionAbortsync);
1281 }
1282 
1283 /* A full sync_thread is finished */
1284 static void llbitmap_close_sync(struct mddev *mddev)
1285 {
1286 	struct llbitmap *llbitmap = mddev->bitmap;
1287 	int i;
1288 
1289 	for (i = 0; i < llbitmap->nr_pages; i++) {
1290 		struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];
1291 
1292 		/* let daemon_fn clear dirty bits immediately */
1293 		WRITE_ONCE(pctl->expire, jiffies);
1294 	}
1295 
1296 	llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
1297 			       BitmapActionEndsync);
1298 }
1299 
1300 /*
1301  * sync_thread has reached @sector; update metadata every daemon_sleep seconds,
1302  * just in case sync_thread has to restart after a power failure.
1303  */
1304 static void llbitmap_cond_end_sync(struct mddev *mddev, sector_t sector,
1305 				   bool force)
1306 {
1307 	struct llbitmap *llbitmap = mddev->bitmap;
1308 
1309 	if (sector == 0) {
1310 		llbitmap->last_end_sync = jiffies;
1311 		return;
1312 	}
1313 
1314 	if (time_before(jiffies, llbitmap->last_end_sync +
1315 				 HZ * mddev->bitmap_info.daemon_sleep))
1316 		return;
1317 
1318 	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
1319 
1320 	mddev->curr_resync_completed = sector;
1321 	set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
1322 	llbitmap_state_machine(llbitmap, 0, sector >> llbitmap->chunkshift,
1323 			       BitmapActionEndsync);
1324 	__llbitmap_flush(mddev);
1325 
1326 	llbitmap->last_end_sync = jiffies;
1327 	sysfs_notify_dirent_safe(mddev->sysfs_completed);
1328 }
1329 
1330 static bool llbitmap_enabled(void *data, bool flush)
1331 {
1332 	struct llbitmap *llbitmap = data;
1333 
1334 	return llbitmap && !test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags);
1335 }
1336 
1337 static void llbitmap_dirty_bits(struct mddev *mddev, unsigned long s,
1338 				unsigned long e)
1339 {
1340 	llbitmap_state_machine(mddev->bitmap, s, e, BitmapActionStartwrite);
1341 }
1342 
1343 static void llbitmap_write_sb(struct llbitmap *llbitmap)
1344 {
1345 	int nr_blocks = DIV_ROUND_UP(BITMAP_DATA_OFFSET, llbitmap->io_size);
1346 
1347 	bitmap_fill(llbitmap->pctl[0]->dirty, nr_blocks);
1348 	llbitmap_write_page(llbitmap, 0);
1349 	md_super_wait(llbitmap->mddev);
1350 }
1351 
1352 static void llbitmap_update_sb(void *data)
1353 {
1354 	struct llbitmap *llbitmap = data;
1355 	struct mddev *mddev = llbitmap->mddev;
1356 	struct page *sb_page;
1357 	bitmap_super_t *sb;
1358 
1359 	if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags))
1360 		return;
1361 
1362 	sb_page = llbitmap_read_page(llbitmap, 0);
1363 	if (IS_ERR(sb_page)) {
1364 		pr_err("%s: %s: read super block failed\n", __func__,
1365 		       mdname(mddev));
1366 		set_bit(BITMAP_WRITE_ERROR, &llbitmap->flags);
1367 		return;
1368 	}
1369 
1370 	if (mddev->events < llbitmap->events_cleared)
1371 		llbitmap->events_cleared = mddev->events;
1372 
1373 	sb = kmap_local_page(sb_page);
1374 	sb->events = cpu_to_le64(mddev->events);
1375 	sb->state = cpu_to_le32(llbitmap->flags);
1376 	sb->chunksize = cpu_to_le32(llbitmap->chunksize);
1377 	sb->sync_size = cpu_to_le64(mddev->resync_max_sectors);
1378 	sb->events_cleared = cpu_to_le64(llbitmap->events_cleared);
1379 	sb->sectors_reserved = cpu_to_le32(mddev->bitmap_info.space);
1380 	sb->daemon_sleep = cpu_to_le32(mddev->bitmap_info.daemon_sleep);
1381 
1382 	kunmap_local(sb);
1383 	llbitmap_write_sb(llbitmap);
1384 }
1385 
1386 static int llbitmap_get_stats(void *data, struct md_bitmap_stats *stats)
1387 {
1388 	struct llbitmap *llbitmap = data;
1389 
1390 	memset(stats, 0, sizeof(*stats));
1391 
1392 	stats->missing_pages = 0;
1393 	stats->pages = llbitmap->nr_pages;
1394 	stats->file_pages = llbitmap->nr_pages;
1395 
1396 	stats->behind_writes = atomic_read(&llbitmap->behind_writes);
1397 	stats->behind_wait = wq_has_sleeper(&llbitmap->behind_wait);
1398 	stats->events_cleared = llbitmap->events_cleared;
1399 
1400 	return 0;
1401 }
1402 
1403 /* just flag all pages as needing to be written */
1404 static void llbitmap_write_all(struct mddev *mddev)
1405 {
1406 	int i;
1407 	struct llbitmap *llbitmap = mddev->bitmap;
1408 
1409 	for (i = 0; i < llbitmap->nr_pages; i++) {
1410 		struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];
1411 
1412 		set_bit(LLPageDirty, &pctl->flags);
1413 		bitmap_fill(pctl->dirty, llbitmap->blocks_per_page);
1414 	}
1415 }
1416 
1417 static void llbitmap_start_behind_write(struct mddev *mddev)
1418 {
1419 	struct llbitmap *llbitmap = mddev->bitmap;
1420 
1421 	atomic_inc(&llbitmap->behind_writes);
1422 }
1423 
1424 static void llbitmap_end_behind_write(struct mddev *mddev)
1425 {
1426 	struct llbitmap *llbitmap = mddev->bitmap;
1427 
1428 	if (atomic_dec_and_test(&llbitmap->behind_writes))
1429 		wake_up(&llbitmap->behind_wait);
1430 }
1431 
1432 static void llbitmap_wait_behind_writes(struct mddev *mddev)
1433 {
1434 	struct llbitmap *llbitmap = mddev->bitmap;
1435 
1436 	if (!llbitmap)
1437 		return;
1438 
1439 	wait_event(llbitmap->behind_wait,
1440 		   atomic_read(&llbitmap->behind_writes) == 0);
1442 }
1443 
1444 static ssize_t bits_show(struct mddev *mddev, char *page)
1445 {
1446 	struct llbitmap *llbitmap;
1447 	int bits[BitStateCount] = {0};
1448 	loff_t start = 0;
1449 
1450 	mutex_lock(&mddev->bitmap_info.mutex);
1451 	llbitmap = mddev->bitmap;
1452 	if (!llbitmap || !llbitmap->pctl) {
1453 		mutex_unlock(&mddev->bitmap_info.mutex);
1454 		return sprintf(page, "no bitmap\n");
1455 	}
1456 
1457 	if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) {
1458 		mutex_unlock(&mddev->bitmap_info.mutex);
1459 		return sprintf(page, "bitmap io error\n");
1460 	}
1461 
1462 	while (start < llbitmap->chunks) {
1463 		enum llbitmap_state c = llbitmap_read(llbitmap, start);
1464 
1465 		if (c < 0 || c >= BitStateCount)
1466 			pr_err("%s: invalid bit %llu state %d\n",
1467 			       __func__, start, c);
1468 		else
1469 			bits[c]++;
1470 		start++;
1471 	}
1472 
1473 	mutex_unlock(&mddev->bitmap_info.mutex);
1474 	return sprintf(page, "unwritten %d\nclean %d\ndirty %d\nneed sync %d\nsyncing %d\n",
1475 		       bits[BitUnwritten], bits[BitClean], bits[BitDirty],
1476 		       bits[BitNeedSync], bits[BitSyncing]);
1477 }
1478 
1479 static struct md_sysfs_entry llbitmap_bits = __ATTR_RO(bits);
1480 
1481 static ssize_t metadata_show(struct mddev *mddev, char *page)
1482 {
1483 	struct llbitmap *llbitmap;
1484 	ssize_t ret;
1485 
1486 	mutex_lock(&mddev->bitmap_info.mutex);
1487 	llbitmap = mddev->bitmap;
1488 	if (!llbitmap) {
1489 		mutex_unlock(&mddev->bitmap_info.mutex);
1490 		return sprintf(page, "no bitmap\n");
1491 	}
1492 
1493 	ret = sprintf(page, "chunksize %lu\nchunkshift %lu\nchunks %lu\noffset %llu\ndaemon_sleep %lu\n",
1494 		       llbitmap->chunksize, llbitmap->chunkshift,
1495 		       llbitmap->chunks, mddev->bitmap_info.offset,
1496 		       llbitmap->mddev->bitmap_info.daemon_sleep);
1497 	mutex_unlock(&mddev->bitmap_info.mutex);
1498 
1499 	return ret;
1500 }
1501 
1502 static struct md_sysfs_entry llbitmap_metadata = __ATTR_RO(metadata);
1503 
1504 static ssize_t
1505 daemon_sleep_show(struct mddev *mddev, char *page)
1506 {
1507 	return sprintf(page, "%lu\n", mddev->bitmap_info.daemon_sleep);
1508 }
1509 
1510 static ssize_t
1511 daemon_sleep_store(struct mddev *mddev, const char *buf, size_t len)
1512 {
1513 	unsigned long timeout;
1514 	int rv = kstrtoul(buf, 10, &timeout);
1515 
1516 	if (rv)
1517 		return rv;
1518 
1519 	mddev->bitmap_info.daemon_sleep = timeout;
1520 	return len;
1521 }
1522 
1523 static struct md_sysfs_entry llbitmap_daemon_sleep = __ATTR_RW(daemon_sleep);
1524 
1525 static ssize_t
1526 barrier_idle_show(struct mddev *mddev, char *page)
1527 {
1528 	struct llbitmap *llbitmap = mddev->bitmap;
1529 
1530 	return sprintf(page, "%lu\n", llbitmap->barrier_idle);
1531 }
1532 
1533 static ssize_t
1534 barrier_idle_store(struct mddev *mddev, const char *buf, size_t len)
1535 {
1536 	struct llbitmap *llbitmap = mddev->bitmap;
1537 	unsigned long timeout;
1538 	int rv = kstrtoul(buf, 10, &timeout);
1539 
1540 	if (rv)
1541 		return rv;
1542 
1543 	llbitmap->barrier_idle = timeout;
1544 	return len;
1545 }
1546 
1547 static struct md_sysfs_entry llbitmap_barrier_idle = __ATTR_RW(barrier_idle);
1548 
1549 static struct attribute *md_llbitmap_attrs[] = {
1550 	&llbitmap_bits.attr,
1551 	&llbitmap_metadata.attr,
1552 	&llbitmap_daemon_sleep.attr,
1553 	&llbitmap_barrier_idle.attr,
1554 	NULL
1555 };
1556 
1557 static struct attribute_group md_llbitmap_group = {
1558 	.name = "llbitmap",
1559 	.attrs = md_llbitmap_attrs,
1560 };
1561 
1562 static struct bitmap_operations llbitmap_ops = {
1563 	.head = {
1564 		.type	= MD_BITMAP,
1565 		.id	= ID_LLBITMAP,
1566 		.name	= "llbitmap",
1567 	},
1568 
1569 	.enabled		= llbitmap_enabled,
1570 	.create			= llbitmap_create,
1571 	.resize			= llbitmap_resize,
1572 	.load			= llbitmap_load,
1573 	.destroy		= llbitmap_destroy,
1574 
1575 	.start_write		= llbitmap_start_write,
1576 	.end_write		= llbitmap_end_write,
1577 	.start_discard		= llbitmap_start_discard,
1578 	.end_discard		= llbitmap_end_discard,
1579 	.unplug			= llbitmap_unplug,
1580 	.flush			= llbitmap_flush,
1581 
1582 	.start_behind_write	= llbitmap_start_behind_write,
1583 	.end_behind_write	= llbitmap_end_behind_write,
1584 	.wait_behind_writes	= llbitmap_wait_behind_writes,
1585 
1586 	.blocks_synced		= llbitmap_blocks_synced,
1587 	.skip_sync_blocks	= llbitmap_skip_sync_blocks,
1588 	.start_sync		= llbitmap_start_sync,
1589 	.end_sync		= llbitmap_end_sync,
1590 	.close_sync		= llbitmap_close_sync,
1591 	.cond_end_sync		= llbitmap_cond_end_sync,
1592 
1593 	.update_sb		= llbitmap_update_sb,
1594 	.get_stats		= llbitmap_get_stats,
1595 	.dirty_bits		= llbitmap_dirty_bits,
1596 	.write_all		= llbitmap_write_all,
1597 
1598 	.group			= &md_llbitmap_group,
1599 };
1600 
1601 int md_llbitmap_init(void)
1602 {
1603 	md_llbitmap_io_wq = alloc_workqueue("md_llbitmap_io",
1604 					 WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
1605 	if (!md_llbitmap_io_wq)
1606 		return -ENOMEM;
1607 
1608 	md_llbitmap_unplug_wq = alloc_workqueue("md_llbitmap_unplug",
1609 					 WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
1610 	if (!md_llbitmap_unplug_wq) {
1611 		destroy_workqueue(md_llbitmap_io_wq);
1612 		md_llbitmap_io_wq = NULL;
1613 		return -ENOMEM;
1614 	}
1615 
1616 	return register_md_submodule(&llbitmap_ops.head);
1617 }
1618 
1619 void md_llbitmap_exit(void)
1620 {
1621 	destroy_workqueue(md_llbitmap_io_wq);
1622 	md_llbitmap_io_wq = NULL;
1623 	destroy_workqueue(md_llbitmap_unplug_wq);
1624 	md_llbitmap_unplug_wq = NULL;
1625 	unregister_md_submodule(&llbitmap_ops.head);
1626 }
1627