1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * bitmap.c two-level bitmap (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003
4  *
5  * bitmap_create  - sets up the bitmap structure
6  * bitmap_destroy - destroys the bitmap structure
7  *
8  * additions, Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.:
9  * - added disk storage for bitmap
10  * - changes to allow various bitmap chunk sizes
11  */
12 
13 /*
14  * Still to do:
15  *
16  * flush after percent set rather than just time based. (maybe both).
17  */
18 
19 #include <linux/blkdev.h>
20 #include <linux/module.h>
21 #include <linux/errno.h>
22 #include <linux/slab.h>
23 #include <linux/init.h>
24 #include <linux/timer.h>
25 #include <linux/sched.h>
26 #include <linux/list.h>
27 #include <linux/file.h>
28 #include <linux/mount.h>
29 #include <linux/buffer_head.h>
30 #include <linux/seq_file.h>
31 #include <trace/events/block.h>
32 
33 #include "md.h"
34 #include "md-bitmap.h"
35 #include "md-cluster.h"
36 
37 /*
38  * in-memory bitmap:
39  *
40  * Use 16 bit block counters to track pending writes to each "chunk".
41  * The 2 high-order bits are special-purpose: the first is a flag indicating
42  * whether a resync is needed; the second is a flag indicating whether a
43  * resync is active.
44  * This means that the counter is actually 14 bits:
45  *
46  * +--------+--------+------------------------------------------------+
47  * | resync | resync |               counter                          |
48  * | needed | active |                                                |
49  * |  (0-1) |  (0-1) |              (0-16383)                         |
50  * +--------+--------+------------------------------------------------+
51  *
52  * The "resync needed" bit is set when:
53  *    a '1' bit is read from storage at startup,
54  *    a write request fails on some drives, or
55  *    a resync is aborted on a chunk with 'resync active' set.
56  * It is cleared (and resync-active set) when a resync starts across all drives
57  * of the chunk.
58  *
59  *
60  * The "resync active" bit is set when:
61  *    a resync is started on all drives, and resync_needed is set.
62  *       resync_needed will be cleared (as long as resync_active wasn't already set).
63  * It is cleared when a resync completes.
64  *
65  * The counter counts pending write requests, plus the on-disk bit.
66  * When the counter is '1' and the resync bits are clear, the on-disk
67  * bit can be cleared as well, thus setting the counter to 0.
68  * When we set a bit or increment the counter (to start a write), if the
69  * field is 0, we first set the disk bit and set the counter to 1.
70  *
71  * If the counter is 0, the on-disk bit is clear and the stripe is clean.
72  * Anything that dirties the stripe pushes the counter to 2 (at least)
73  * and sets the on-disk bit (lazily).
74  * If a periodic sweep finds the counter at 2, it is decremented to 1.
75  * If the sweep finds the counter at 1, the on-disk bit is cleared and the
76  * counter goes to zero.
77  *
78  * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block
79  * counters as a fallback when "page" memory cannot be allocated:
80  *
81  * Normal case (page memory allocated):
82  *
83  *     page pointer (32-bit)
84  *
85  *     [ ] ------+
86  *               |
87  *               +-------> [   ][   ]..[   ] (4096 byte page == 2048 counters)
88  *                          c1   c2    c2048
89  *
90  * Hijacked case (page memory allocation failed):
91  *
92  *     hijacked page pointer (32-bit)
93  *
94  *     [		  ][		  ] (no page memory allocated)
95  *      counter #1 (16-bit) counter #2 (16-bit)
96  *
97  */
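
/*
 * Worked example (illustrative values, not from the original source): with
 * the layout above, a raw counter of 0x8003 has "resync needed" set,
 * "resync active" clear and a count of 3, while 0x4001 describes a chunk
 * mid-resync whose count only accounts for the on-disk bit.
 */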
98 
99 typedef __u16 bitmap_counter_t;
100 
101 #define PAGE_BITS (PAGE_SIZE << 3)
102 #define PAGE_BIT_SHIFT (PAGE_SHIFT + 3)
103 
104 #define COUNTER_BITS 16
105 #define COUNTER_BIT_SHIFT 4
106 #define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3)
107 
108 #define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1)))
109 #define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2)))
110 #define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1)
111 
112 #define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK)
113 #define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK)
114 #define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX)
115 
116 /* how many counters per page? */
117 #define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS)
118 /* same, except a shift value for more efficient bitops */
119 #define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT)
120 /* same, except a mask value for more efficient bitops */
121 #define PAGE_COUNTER_MASK  (PAGE_COUNTER_RATIO - 1)
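
/*
 * Illustrative arithmetic, assuming 4K pages: PAGE_BITS = 4096 * 8 = 32768,
 * so PAGE_COUNTER_RATIO = 32768 / 16 = 2048 counters per page,
 * PAGE_COUNTER_SHIFT = 15 - 4 = 11 and PAGE_COUNTER_MASK = 2047.
 */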
122 
123 #define BITMAP_BLOCK_SHIFT 9
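/* i.e. the "blocks" tracked by the bitmap are 1 << 9 = 512-byte sectors */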
124 
125 /*
126  * bitmap structures:
127  */
128 
129 /* the in-memory bitmap is represented by bitmap_pages */
130 struct bitmap_page {
131 	/*
132 	 * map points to the actual memory page
133 	 */
134 	char *map;
135 	/*
136 	 * in emergencies (when map cannot be alloced), hijack the map
137 	 * pointer and use it as two counters itself
138 	 */
139 	unsigned int hijacked:1;
140 	/*
141 	 * If any counter in this page is '1' or '2' - and so could be
142 	 * cleared - then that page is marked as 'pending'.
143 	 */
144 	unsigned int pending:1;
145 	/*
146 	 * count of dirty bits on the page
147 	 */
148 	unsigned int  count:30;
149 };
150 
151 /* the main bitmap structure - one per mddev */
152 struct bitmap {
153 
154 	struct bitmap_counts {
155 		spinlock_t lock;
156 		struct bitmap_page *bp;
157 		/* total number of pages in the bitmap */
158 		unsigned long pages;
159 		/* number of pages not yet allocated */
160 		unsigned long missing_pages;
161 		/* chunksize = 2^chunkshift (for bitops) */
162 		unsigned long chunkshift;
163 		/* total number of data chunks for the array */
164 		unsigned long chunks;
165 	} counts;
166 
167 	struct mddev *mddev; /* the md device that the bitmap is for */
168 
169 	__u64	events_cleared;
170 	int need_sync;
171 
172 	struct bitmap_storage {
173 		/* backing disk file */
174 		struct file *file;
175 		/* cached copy of the bitmap file superblock */
176 		struct page *sb_page;
177 		unsigned long sb_index;
178 		/* list of cache pages for the file */
179 		struct page **filemap;
180 		/* attributes associated with the filemap pages */
181 		unsigned long *filemap_attr;
182 		/* number of pages in the file */
183 		unsigned long file_pages;
184 		/* total bytes in the bitmap */
185 		unsigned long bytes;
186 	} storage;
187 
188 	unsigned long flags;
189 
190 	int allclean;
191 
192 	atomic_t behind_writes;
193 	/* highest actual value at runtime */
194 	unsigned long behind_writes_used;
195 
196 	/*
197 	 * the bitmap daemon - periodically wakes up and sweeps the bitmap
198 	 * file, cleaning up bits and flushing out pages to disk as necessary
199 	 */
200 	unsigned long daemon_lastrun; /* jiffies of last run */
201 	/*
202 	 * when we last called end_sync to update the bitmap with resync
203 	 * progress.
204 	 */
205 	unsigned long last_end_sync;
206 
207 	/* pending writes to the bitmap file */
208 	atomic_t pending_writes;
209 	wait_queue_head_t write_wait;
210 	wait_queue_head_t overflow_wait;
211 	wait_queue_head_t behind_wait;
212 
213 	struct kernfs_node *sysfs_can_clear;
214 	/* slot offset for clustered env */
215 	int cluster_slot;
216 };
217 
218 static struct workqueue_struct *md_bitmap_wq;
219 
220 static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks,
221 			   int chunksize, bool init);
222 
223 static inline char *bmname(struct bitmap *bitmap)
224 {
225 	return bitmap->mddev ? mdname(bitmap->mddev) : "mdX";
226 }
227 
228 static bool bitmap_enabled(void *data, bool flush)
229 {
230 	struct bitmap *bitmap = data;
231 
232 	if (!flush)
233 		return true;
234 
235 	/*
236 	 * If the caller wants to flush bitmap pages to the underlying disks,
237 	 * check whether there are cached pages in the filemap.
238 	 */
239 	return !test_bit(BITMAP_STALE, &bitmap->flags) &&
240 	       bitmap->storage.filemap != NULL;
241 }
242 
243 /*
244  * check a page and, if necessary, allocate it (or hijack it if the alloc fails)
245  *
246  * 1) check to see if this page is allocated, if it's not then try to alloc
247  * 2) if the alloc fails, set the page's hijacked flag so we'll use the
248  *    page pointer directly as a counter
249  *
250  * if we find our page, we increment the page's refcount so that it stays
251  * allocated while we're using it
252  */
253 static int md_bitmap_checkpage(struct bitmap_counts *bitmap,
254 			       unsigned long page, int create, int no_hijack)
255 __releases(bitmap->lock)
256 __acquires(bitmap->lock)
257 {
258 	unsigned char *mappage;
259 
260 	WARN_ON_ONCE(page >= bitmap->pages);
261 	if (bitmap->bp[page].hijacked) /* it's hijacked, don't try to alloc */
262 		return 0;
263 
264 	if (bitmap->bp[page].map) /* page is already allocated, just return */
265 		return 0;
266 
267 	if (!create)
268 		return -ENOENT;
269 
270 	/* this page has not been allocated yet */
271 
272 	spin_unlock_irq(&bitmap->lock);
273 	/* It is possible that this is being called inside a
274 	 * prepare_to_wait/finish_wait loop from raid5.c:make_request().
275 	 * In general it is not permitted to sleep in that context as it
276 	 * can cause the loop to spin freely.
277 	 * That doesn't apply here as we can only reach this point
278 	 * once with any loop.
279 	 * When this function completes, either bp[page].map or
280 	 * bp[page].hijacked will be set.  In either case, this function will
281 	 * abort before getting to this point again.  So there is
282 	 * no risk of a free-spin, and so it is safe to assert
283 	 * that sleeping here is allowed.
284 	 */
285 	sched_annotate_sleep();
286 	mappage = kzalloc(PAGE_SIZE, GFP_NOIO);
287 	spin_lock_irq(&bitmap->lock);
288 
289 	if (mappage == NULL) {
290 		pr_debug("md/bitmap: map page allocation failed, hijacking\n");
291 		/* We don't support hijack for cluster raid */
292 		if (no_hijack)
293 			return -ENOMEM;
294 		/* failed - set the hijacked flag so that we can use the
295 		 * pointer as a counter */
296 		if (!bitmap->bp[page].map)
297 			bitmap->bp[page].hijacked = 1;
298 	} else if (bitmap->bp[page].map ||
299 		   bitmap->bp[page].hijacked) {
300 		/* somebody beat us to getting the page */
301 		kfree(mappage);
302 	} else {
303 
304 		/* no page was in place and we have one, so install it */
305 
306 		bitmap->bp[page].map = mappage;
307 		bitmap->missing_pages--;
308 	}
309 	return 0;
310 }
311 
312 /* if page is completely empty, put it back on the free list, or dealloc it */
313 /* if page was hijacked, unmark the flag so it might get alloced next time */
314 /* Note: lock should be held when calling this */
315 static void md_bitmap_checkfree(struct bitmap_counts *bitmap, unsigned long page)
316 {
317 	char *ptr;
318 
319 	if (bitmap->bp[page].count) /* page is still busy */
320 		return;
321 
322 	/* page is no longer in use, it can be released */
323 
324 	if (bitmap->bp[page].hijacked) { /* page was hijacked, undo this now */
325 		bitmap->bp[page].hijacked = 0;
326 		bitmap->bp[page].map = NULL;
327 	} else {
328 		/* normal case, free the page */
329 		ptr = bitmap->bp[page].map;
330 		bitmap->bp[page].map = NULL;
331 		bitmap->missing_pages++;
332 		kfree(ptr);
333 	}
334 }
335 
336 /*
337  * bitmap file handling - read and write the bitmap file and its superblock
338  */
339 
340 /*
341  * basic page I/O operations
342  */
343 
344 /* IO operations when bitmap is stored near all superblocks */
345 
346 /* choose a good rdev and read the page from there */
347 static int read_sb_page(struct mddev *mddev, loff_t offset,
348 		struct page *page, unsigned long index, int size)
349 {
350 
351 	sector_t sector = mddev->bitmap_info.offset + offset +
352 		index * (PAGE_SIZE / SECTOR_SIZE);
353 	struct md_rdev *rdev;
354 
355 	rdev_for_each(rdev, mddev) {
356 		u32 iosize = roundup(size, bdev_logical_block_size(rdev->bdev));
357 
358 		if (!test_bit(In_sync, &rdev->flags) ||
359 		    test_bit(Faulty, &rdev->flags) ||
360 		    test_bit(Bitmap_sync, &rdev->flags))
361 			continue;
362 
363 		if (sync_page_io(rdev, sector, iosize, page, REQ_OP_READ, true))
364 			return 0;
365 	}
366 	return -EIO;
367 }
368 
369 static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mddev)
370 {
371 	/* Iterate the disks of an mddev, using rcu to protect access to the
372 	 * linked list, and raising the refcount of devices we return to ensure
373 	 * they don't disappear while in use.
374 	 * As devices are only added or removed when raid_disk is < 0 and
375 	 * nr_pending is 0 and In_sync is clear, the entries we return will
376 	 * still be in the same position on the list when we re-enter
377 	 * list_for_each_entry_continue_rcu.
378 	 *
379 	 * Note that if entered with 'rdev == NULL' to start at the
380 	 * beginning, we temporarily assign 'rdev' to an address which
381 	 * isn't really an rdev, but which can be used by
382 	 * list_for_each_entry_continue_rcu() to find the first entry.
383 	 */
384 	rcu_read_lock();
385 	if (rdev == NULL)
386 		/* start at the beginning */
387 		rdev = list_entry(&mddev->disks, struct md_rdev, same_set);
388 	else {
389 		/* release the previous rdev and start from there. */
390 		rdev_dec_pending(rdev, mddev);
391 	}
392 	list_for_each_entry_continue_rcu(rdev, &mddev->disks, same_set) {
393 		if (rdev->raid_disk >= 0 &&
394 		    !test_bit(Faulty, &rdev->flags)) {
395 			/* this is a usable device */
396 			atomic_inc(&rdev->nr_pending);
397 			rcu_read_unlock();
398 			return rdev;
399 		}
400 	}
401 	rcu_read_unlock();
402 	return NULL;
403 }
404 
405 static unsigned int optimal_io_size(struct block_device *bdev,
406 				    unsigned int last_page_size,
407 				    unsigned int io_size)
408 {
409 	if (bdev_io_opt(bdev) > bdev_logical_block_size(bdev))
410 		return roundup(last_page_size, bdev_io_opt(bdev));
411 	return io_size;
412 }
413 
414 static unsigned int bitmap_io_size(unsigned int io_size, unsigned int opt_size,
415 				   loff_t start, loff_t boundary)
416 {
417 	if (io_size != opt_size &&
418 	    start + opt_size / SECTOR_SIZE <= boundary)
419 		return opt_size;
420 	if (start + io_size / SECTOR_SIZE <= boundary)
421 		return io_size;
422 
423 	/* Overflows boundary */
424 	return 0;
425 }
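
/*
 * Example with hypothetical numbers: for io_size = 4096 and opt_size = 65536,
 * a start 100 sectors short of the boundary cannot take the optimal size
 * (65536 / 512 = 128 sectors would cross it), but the 8-sector io_size fits,
 * so 4096 is returned.
 */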
426 
427 static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
428 			   unsigned long pg_index, struct page *page)
429 {
430 	struct block_device *bdev;
431 	struct mddev *mddev = bitmap->mddev;
432 	struct bitmap_storage *store = &bitmap->storage;
433 	unsigned long num_pages = bitmap->storage.file_pages;
434 	unsigned int bitmap_limit = (num_pages - pg_index % num_pages) << PAGE_SHIFT;
435 	loff_t sboff, offset = mddev->bitmap_info.offset;
436 	sector_t ps = pg_index * PAGE_SIZE / SECTOR_SIZE;
437 	unsigned int size = PAGE_SIZE;
438 	unsigned int opt_size = PAGE_SIZE;
439 	sector_t doff;
440 
441 	bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
442 	/* we compare length (page numbers), not page offset. */
443 	if ((pg_index - store->sb_index) == num_pages - 1) {
444 		unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1);
445 
446 		if (last_page_size == 0)
447 			last_page_size = PAGE_SIZE;
448 		size = roundup(last_page_size, bdev_logical_block_size(bdev));
449 		opt_size = optimal_io_size(bdev, last_page_size, size);
450 	}
451 
452 	sboff = rdev->sb_start + offset;
453 	doff = rdev->data_offset;
454 
455 	/* Just make sure we aren't corrupting data or metadata */
456 	if (mddev->external) {
457 		/* Bitmap could be anywhere. */
458 		if (sboff + ps > doff &&
459 		    sboff < (doff + mddev->dev_sectors + PAGE_SIZE / SECTOR_SIZE))
460 			return -EINVAL;
461 	} else if (offset < 0) {
462 		/* DATA  BITMAP METADATA  */
463 		size = bitmap_io_size(size, opt_size, offset + ps, 0);
464 		if (size == 0)
465 			/* bitmap runs into metadata */
466 			return -EINVAL;
467 
468 		if (doff + mddev->dev_sectors > sboff)
469 			/* data runs into bitmap */
470 			return -EINVAL;
471 	} else if (rdev->sb_start < rdev->data_offset) {
472 		/* METADATA BITMAP DATA */
473 		size = bitmap_io_size(size, opt_size, sboff + ps, doff);
474 		if (size == 0)
475 			/* bitmap runs into data */
476 			return -EINVAL;
477 	}
478 
479 	md_write_metadata(mddev, rdev, sboff + ps, (int)min(size, bitmap_limit),
480 			  page, 0);
481 	return 0;
482 }
483 
484 static void write_sb_page(struct bitmap *bitmap, unsigned long pg_index,
485 			  struct page *page, bool wait)
486 {
487 	struct mddev *mddev = bitmap->mddev;
488 
489 	do {
490 		struct md_rdev *rdev = NULL;
491 
492 		while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
493 			if (__write_sb_page(rdev, bitmap, pg_index, page) < 0) {
494 				set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
495 				return;
496 			}
497 		}
498 	} while (wait && md_super_wait(mddev) < 0);
499 }
500 
501 static void md_bitmap_file_kick(struct bitmap *bitmap);
502 
503 #ifdef CONFIG_MD_BITMAP_FILE
504 static void write_file_page(struct bitmap *bitmap, struct page *page, int wait)
505 {
506 	struct buffer_head *bh = page_buffers(page);
507 
508 	while (bh && bh->b_blocknr) {
509 		atomic_inc(&bitmap->pending_writes);
510 		set_buffer_locked(bh);
511 		set_buffer_mapped(bh);
512 		submit_bh(REQ_OP_WRITE | REQ_SYNC, bh);
513 		bh = bh->b_this_page;
514 	}
515 
516 	if (wait)
517 		wait_event(bitmap->write_wait,
518 			   atomic_read(&bitmap->pending_writes) == 0);
519 }
520 
521 static void end_bitmap_write(struct buffer_head *bh, int uptodate)
522 {
523 	struct bitmap *bitmap = bh->b_private;
524 
525 	if (!uptodate)
526 		set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
527 	if (atomic_dec_and_test(&bitmap->pending_writes))
528 		wake_up(&bitmap->write_wait);
529 }
530 
531 static void free_buffers(struct page *page)
532 {
533 	struct buffer_head *bh;
534 
535 	if (!PagePrivate(page))
536 		return;
537 
538 	bh = page_buffers(page);
539 	while (bh) {
540 		struct buffer_head *next = bh->b_this_page;
541 		free_buffer_head(bh);
542 		bh = next;
543 	}
544 	detach_page_private(page);
545 	put_page(page);
546 }
547 
548 /* read a page from a file.
549  * We both read the page, and attach buffers to the page to record the
550  * address of each block (using bmap).  These addresses will be used
551  * to write the block later, completely bypassing the filesystem.
552  * This usage is similar to how swap files are handled, and allows us
553  * to write to a file with no concerns of memory allocation failing.
554  */
555 static int read_file_page(struct file *file, unsigned long index,
556 		struct bitmap *bitmap, unsigned long count, struct page *page)
557 {
558 	int ret = 0;
559 	struct inode *inode = file_inode(file);
560 	struct buffer_head *bh;
561 	sector_t block, blk_cur;
562 	unsigned long blocksize = i_blocksize(inode);
563 
564 	pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE,
565 		 (unsigned long long)index << PAGE_SHIFT);
566 
567 	bh = alloc_page_buffers(page, blocksize);
568 	if (!bh) {
569 		ret = -ENOMEM;
570 		goto out;
571 	}
572 	attach_page_private(page, bh);
573 	blk_cur = index << (PAGE_SHIFT - inode->i_blkbits);
574 	while (bh) {
575 		block = blk_cur;
576 
577 		if (count == 0)
578 			bh->b_blocknr = 0;
579 		else {
580 			ret = bmap(inode, &block);
581 			if (ret || !block) {
582 				ret = -EINVAL;
583 				bh->b_blocknr = 0;
584 				goto out;
585 			}
586 
587 			bh->b_blocknr = block;
588 			bh->b_bdev = inode->i_sb->s_bdev;
589 			if (count < blocksize)
590 				count = 0;
591 			else
592 				count -= blocksize;
593 
594 			bh->b_end_io = end_bitmap_write;
595 			bh->b_private = bitmap;
596 			atomic_inc(&bitmap->pending_writes);
597 			set_buffer_locked(bh);
598 			set_buffer_mapped(bh);
599 			submit_bh(REQ_OP_READ, bh);
600 		}
601 		blk_cur++;
602 		bh = bh->b_this_page;
603 	}
604 
605 	wait_event(bitmap->write_wait,
606 		   atomic_read(&bitmap->pending_writes)==0);
607 	if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
608 		ret = -EIO;
609 out:
610 	if (ret)
611 		pr_err("md: bitmap read error: (%dB @ %llu): %d\n",
612 		       (int)PAGE_SIZE,
613 		       (unsigned long long)index << PAGE_SHIFT,
614 		       ret);
615 	return ret;
616 }
617 #else /* CONFIG_MD_BITMAP_FILE */
618 static void write_file_page(struct bitmap *bitmap, struct page *page, int wait)
619 {
620 }
621 static int read_file_page(struct file *file, unsigned long index,
622 		struct bitmap *bitmap, unsigned long count, struct page *page)
623 {
624 	return -EIO;
625 }
626 static void free_buffers(struct page *page)
627 {
628 	put_page(page);
629 }
630 #endif /* CONFIG_MD_BITMAP_FILE */
631 
632 /*
633  * bitmap file superblock operations
634  */
635 
636 /*
637  * write out a page to a file
638  */
639 static void filemap_write_page(struct bitmap *bitmap, unsigned long pg_index,
640 			       bool wait)
641 {
642 	struct bitmap_storage *store = &bitmap->storage;
643 	struct page *page = store->filemap[pg_index];
644 
645 	if (mddev_is_clustered(bitmap->mddev)) {
646 		/* go to node bitmap area starting point */
647 		pg_index += store->sb_index;
648 	}
649 
650 	if (store->file)
651 		write_file_page(bitmap, page, wait);
652 	else
653 		write_sb_page(bitmap, pg_index, page, wait);
654 }
655 
656 /*
657  * md_bitmap_wait_writes() should be called before writing any bitmap
658  * blocks, to ensure previous writes, particularly from
659  * md_bitmap_daemon_work(), have completed.
660  */
661 static void md_bitmap_wait_writes(struct bitmap *bitmap)
662 {
663 	if (bitmap->storage.file)
664 		wait_event(bitmap->write_wait,
665 			   atomic_read(&bitmap->pending_writes)==0);
666 	else
667 		/* Note that we ignore the return value.  The writes
668 		 * might have failed, but that would just mean that
669 		 * some bits which should be cleared haven't been,
670 		 * which is safe.  The relevant bitmap blocks will
671 		 * probably get written again, but there is no great
672 		 * loss if they aren't.
673 		 */
674 		md_super_wait(bitmap->mddev);
675 }
676 
677 
678 /* update the event counter and sync the superblock to disk */
679 static void bitmap_update_sb(void *data)
680 {
681 	bitmap_super_t *sb;
682 	struct bitmap *bitmap = data;
683 
684 	if (!bitmap || !bitmap->mddev) /* no bitmap for this array */
685 		return;
686 	if (bitmap->mddev->bitmap_info.external)
687 		return;
688 	if (!bitmap->storage.sb_page) /* no superblock */
689 		return;
690 	sb = kmap_local_page(bitmap->storage.sb_page);
691 	sb->events = cpu_to_le64(bitmap->mddev->events);
692 	if (bitmap->mddev->events < bitmap->events_cleared)
693 		/* rocking back to read-only */
694 		bitmap->events_cleared = bitmap->mddev->events;
695 	sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
696 	/*
697 	 * clear BITMAP_WRITE_ERROR bit to protect against the case that
698 	 * a bitmap write error occurred but the later writes succeeded.
699 	 */
700 	sb->state = cpu_to_le32(bitmap->flags & ~BIT(BITMAP_WRITE_ERROR));
701 	/* Just in case these have been changed via sysfs: */
702 	sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ);
703 	sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind);
704 	/* This might have been changed by a reshape */
705 	sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
706 	sb->chunksize = cpu_to_le32(bitmap->mddev->bitmap_info.chunksize);
707 	sb->nodes = cpu_to_le32(bitmap->mddev->bitmap_info.nodes);
708 	sb->sectors_reserved = cpu_to_le32(bitmap->mddev->
709 					   bitmap_info.space);
710 	kunmap_local(sb);
711 
712 	if (bitmap->storage.file)
713 		write_file_page(bitmap, bitmap->storage.sb_page, 1);
714 	else
715 		write_sb_page(bitmap, bitmap->storage.sb_index,
716 			      bitmap->storage.sb_page, 1);
717 }
718 
719 static void bitmap_print_sb(struct bitmap *bitmap)
720 {
721 	bitmap_super_t *sb;
722 
723 	if (!bitmap || !bitmap->storage.sb_page)
724 		return;
725 	sb = kmap_local_page(bitmap->storage.sb_page);
726 	pr_debug("%s: bitmap file superblock:\n", bmname(bitmap));
727 	pr_debug("         magic: %08x\n", le32_to_cpu(sb->magic));
728 	pr_debug("       version: %u\n", le32_to_cpu(sb->version));
729 	pr_debug("          uuid: %08x.%08x.%08x.%08x\n",
730 		 le32_to_cpu(*(__le32 *)(sb->uuid+0)),
731 		 le32_to_cpu(*(__le32 *)(sb->uuid+4)),
732 		 le32_to_cpu(*(__le32 *)(sb->uuid+8)),
733 		 le32_to_cpu(*(__le32 *)(sb->uuid+12)));
734 	pr_debug("        events: %llu\n",
735 		 (unsigned long long) le64_to_cpu(sb->events));
736 	pr_debug("events cleared: %llu\n",
737 		 (unsigned long long) le64_to_cpu(sb->events_cleared));
738 	pr_debug("         state: %08x\n", le32_to_cpu(sb->state));
739 	pr_debug("     chunksize: %u B\n", le32_to_cpu(sb->chunksize));
740 	pr_debug("  daemon sleep: %us\n", le32_to_cpu(sb->daemon_sleep));
741 	pr_debug("     sync size: %llu KB\n",
742 		 (unsigned long long)le64_to_cpu(sb->sync_size)/2);
743 	pr_debug("max write behind: %u\n", le32_to_cpu(sb->write_behind));
744 	kunmap_local(sb);
745 }
746 
747 /*
748  * bitmap_new_disk_sb
749  * @bitmap
750  *
751  * This function is somewhat the reverse of bitmap_read_sb.  bitmap_read_sb
752  * reads and verifies the on-disk bitmap superblock and populates bitmap_info.
753  * This function verifies 'bitmap_info' and populates the on-disk bitmap
754  * structure, which is to be written to disk.
755  *
756  * Returns: 0 on success, -Exxx on error
757  */
758 static int md_bitmap_new_disk_sb(struct bitmap *bitmap)
759 {
760 	bitmap_super_t *sb;
761 	unsigned long chunksize, daemon_sleep, write_behind;
762 
763 	bitmap->storage.sb_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
764 	if (bitmap->storage.sb_page == NULL)
765 		return -ENOMEM;
766 	bitmap->storage.sb_index = 0;
767 
768 	sb = kmap_local_page(bitmap->storage.sb_page);
769 
770 	sb->magic = cpu_to_le32(BITMAP_MAGIC);
771 	sb->version = cpu_to_le32(BITMAP_MAJOR_HI);
772 
773 	chunksize = bitmap->mddev->bitmap_info.chunksize;
774 	BUG_ON(!chunksize);
775 	if (!is_power_of_2(chunksize)) {
776 		kunmap_local(sb);
777 		pr_warn("bitmap chunksize not a power of 2\n");
778 		return -EINVAL;
779 	}
780 	sb->chunksize = cpu_to_le32(chunksize);
781 
782 	daemon_sleep = bitmap->mddev->bitmap_info.daemon_sleep;
783 	if (!daemon_sleep || (daemon_sleep > MAX_SCHEDULE_TIMEOUT)) {
784 		pr_debug("Choosing daemon_sleep default (5 sec)\n");
785 		daemon_sleep = 5 * HZ;
786 	}
787 	sb->daemon_sleep = cpu_to_le32(daemon_sleep);
788 	bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
789 
790 	/*
791 	 * FIXME: write_behind for RAID1.  If not specified, what
792 	 * is a good choice?  We choose COUNTER_MAX / 2 arbitrarily.
793 	 */
794 	write_behind = bitmap->mddev->bitmap_info.max_write_behind;
795 	if (write_behind > COUNTER_MAX / 2)
796 		write_behind = COUNTER_MAX / 2;
797 	sb->write_behind = cpu_to_le32(write_behind);
798 	bitmap->mddev->bitmap_info.max_write_behind = write_behind;
799 
800 	/* keep the array size field of the bitmap superblock up to date */
801 	sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
802 
803 	memcpy(sb->uuid, bitmap->mddev->uuid, 16);
804 
805 	set_bit(BITMAP_STALE, &bitmap->flags);
806 	sb->state = cpu_to_le32(bitmap->flags);
807 	bitmap->events_cleared = bitmap->mddev->events;
808 	sb->events_cleared = cpu_to_le64(bitmap->mddev->events);
809 	bitmap->mddev->bitmap_info.nodes = 0;
810 
811 	kunmap_local(sb);
812 
813 	return 0;
814 }
815 
816 /* read the superblock from the bitmap file and initialize some bitmap fields */
817 static int md_bitmap_read_sb(struct bitmap *bitmap)
818 {
819 	char *reason = NULL;
820 	bitmap_super_t *sb;
821 	unsigned long chunksize, daemon_sleep, write_behind;
822 	unsigned long long events;
823 	int nodes = 0;
824 	unsigned long sectors_reserved = 0;
825 	int err = -EINVAL;
826 	struct page *sb_page;
827 	loff_t offset = 0;
828 
829 	if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) {
830 		chunksize = 128 * 1024 * 1024;
831 		daemon_sleep = 5 * HZ;
832 		write_behind = 0;
833 		set_bit(BITMAP_STALE, &bitmap->flags);
834 		err = 0;
835 		goto out_no_sb;
836 	}
837 	/* page 0 is the superblock, read it... */
838 	sb_page = alloc_page(GFP_KERNEL);
839 	if (!sb_page)
840 		return -ENOMEM;
841 	bitmap->storage.sb_page = sb_page;
842 
843 re_read:
844 	/* If cluster_slot is set, the cluster has been set up */
845 	if (bitmap->cluster_slot >= 0) {
846 		sector_t bm_blocks = bitmap->mddev->resync_max_sectors;
847 
848 		bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks,
849 			   (bitmap->mddev->bitmap_info.chunksize >> 9));
850 		/* bits to bytes */
851 		bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t);
852 		/* to 4k blocks */
853 		bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096);
854 		offset = bitmap->cluster_slot * (bm_blocks << 3);
855 		pr_debug("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__,
856 			bitmap->cluster_slot, offset);
857 	}
858 
859 	if (bitmap->storage.file) {
860 		loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host);
861 		int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize;
862 
863 		err = read_file_page(bitmap->storage.file, 0,
864 				bitmap, bytes, sb_page);
865 	} else {
866 		err = read_sb_page(bitmap->mddev, offset, sb_page, 0,
867 				   sizeof(bitmap_super_t));
868 	}
869 	if (err)
870 		return err;
871 
872 	err = -EINVAL;
873 	sb = kmap_local_page(sb_page);
874 
875 	chunksize = le32_to_cpu(sb->chunksize);
876 	daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
877 	write_behind = le32_to_cpu(sb->write_behind);
878 	sectors_reserved = le32_to_cpu(sb->sectors_reserved);
879 
880 	/* verify that the bitmap-specific fields are valid */
881 	if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
882 		reason = "bad magic";
883 	else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO ||
884 		 le32_to_cpu(sb->version) > BITMAP_MAJOR_CLUSTERED)
885 		reason = "unrecognized superblock version";
886 	else if (chunksize < 512)
887 		reason = "bitmap chunksize too small";
888 	else if (!is_power_of_2(chunksize))
889 		reason = "bitmap chunksize not a power of 2";
890 	else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT)
891 		reason = "daemon sleep period out of range";
892 	else if (write_behind > COUNTER_MAX)
893 		reason = "write-behind limit out of range (0 - 16383)";
894 	if (reason) {
895 		pr_warn("%s: invalid bitmap file superblock: %s\n",
896 			bmname(bitmap), reason);
897 		goto out;
898 	}
899 
900 	/*
901 	 * Set up nodes/cluster_name only if the bitmap version is
902 	 * cluster-compatible
903 	 */
904 	if (sb->version == cpu_to_le32(BITMAP_MAJOR_CLUSTERED)) {
905 		nodes = le32_to_cpu(sb->nodes);
906 		strscpy(bitmap->mddev->bitmap_info.cluster_name,
907 				sb->cluster_name, 64);
908 	}
909 
910 	/* keep the array size field of the bitmap superblock up to date */
911 	sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
912 
913 	if (bitmap->mddev->persistent) {
914 		/*
915 		 * We have a persistent array superblock, so compare the
916 		 * bitmap's UUID and event counter to the mddev's
917 		 */
918 		if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) {
919 			pr_warn("%s: bitmap superblock UUID mismatch\n",
920 				bmname(bitmap));
921 			goto out;
922 		}
923 		events = le64_to_cpu(sb->events);
924 		if (!nodes && (events < bitmap->mddev->events)) {
925 			pr_warn("%s: bitmap file is out of date (%llu < %llu) -- forcing full recovery\n",
926 				bmname(bitmap), events,
927 				(unsigned long long) bitmap->mddev->events);
928 			set_bit(BITMAP_STALE, &bitmap->flags);
929 		}
930 	}
931 
932 	/* assign fields using values from superblock */
933 	bitmap->flags |= le32_to_cpu(sb->state);
934 	if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
935 		set_bit(BITMAP_HOSTENDIAN, &bitmap->flags);
936 	bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
937 	err = 0;
938 
939 out:
940 	kunmap_local(sb);
941 	if (err == 0 && nodes && (bitmap->cluster_slot < 0)) {
942 		/* Assigning chunksize is required for "re_read" */
943 		bitmap->mddev->bitmap_info.chunksize = chunksize;
944 		err = md_setup_cluster(bitmap->mddev, nodes);
945 		if (err) {
946 			pr_warn("%s: Could not setup cluster service (%d)\n",
947 				bmname(bitmap), err);
948 			goto out_no_sb;
949 		}
950 		bitmap->cluster_slot = bitmap->mddev->cluster_ops->slot_number(bitmap->mddev);
951 		goto re_read;
952 	}
953 
954 out_no_sb:
955 	if (err == 0) {
956 		if (test_bit(BITMAP_STALE, &bitmap->flags))
957 			bitmap->events_cleared = bitmap->mddev->events;
958 		bitmap->mddev->bitmap_info.chunksize = chunksize;
959 		bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
960 		bitmap->mddev->bitmap_info.max_write_behind = write_behind;
961 		bitmap->mddev->bitmap_info.nodes = nodes;
962 		if (bitmap->mddev->bitmap_info.space == 0 ||
963 			bitmap->mddev->bitmap_info.space > sectors_reserved)
964 			bitmap->mddev->bitmap_info.space = sectors_reserved;
965 	} else {
966 		bitmap_print_sb(bitmap);
967 		if (bitmap->cluster_slot < 0)
968 			md_cluster_stop(bitmap->mddev);
969 	}
970 	return err;
971 }
972 
973 /*
974  * general bitmap file operations
975  */
976 
977 /*
978  * on-disk bitmap:
979  *
980  * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap
981  * file a page at a time. There's a superblock at the start of the file.
982  */
983 /* calculate the index of the page that contains this bit */
984 static inline unsigned long file_page_index(struct bitmap_storage *store,
985 					    unsigned long chunk)
986 {
987 	if (store->sb_page)
988 		chunk += sizeof(bitmap_super_t) << 3;
989 	return chunk >> PAGE_BIT_SHIFT;
990 }
991 
992 /* calculate the (bit) offset of this bit within a page */
993 static inline unsigned long file_page_offset(struct bitmap_storage *store,
994 					     unsigned long chunk)
995 {
996 	if (store->sb_page)
997 		chunk += sizeof(bitmap_super_t) << 3;
998 	return chunk & (PAGE_BITS - 1);
999 }
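
/*
 * Worked example (assuming 4K pages and a 256-byte bitmap_super_t): chunk
 * 5000 is offset by 256 * 8 = 2048 superblock bits, giving overall bit 7048,
 * i.e. page 7048 >> 15 = 0 with an in-page offset of 7048.
 */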
1000 
1001 /*
1002  * return a pointer to the page in the filemap that contains the given bit
1003  *
1004  */
1005 static inline struct page *filemap_get_page(struct bitmap_storage *store,
1006 					    unsigned long chunk)
1007 {
1008 	if (file_page_index(store, chunk) >= store->file_pages)
1009 		return NULL;
1010 	return store->filemap[file_page_index(store, chunk)];
1011 }
1012 
1013 static int md_bitmap_storage_alloc(struct bitmap_storage *store,
1014 				   unsigned long chunks, int with_super,
1015 				   int slot_number)
1016 {
1017 	int pnum, offset = 0;
1018 	unsigned long num_pages;
1019 	unsigned long bytes;
1020 
1021 	bytes = DIV_ROUND_UP(chunks, 8);
1022 	if (with_super)
1023 		bytes += sizeof(bitmap_super_t);
1024 
1025 	num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE);
1026 	offset = slot_number * num_pages;
1027 
1028 	store->filemap = kmalloc_array(num_pages, sizeof(struct page *),
1029 				       GFP_KERNEL);
1030 	if (!store->filemap)
1031 		return -ENOMEM;
1032 
1033 	if (with_super && !store->sb_page) {
1034 		store->sb_page = alloc_page(GFP_KERNEL|__GFP_ZERO);
1035 		if (store->sb_page == NULL)
1036 			return -ENOMEM;
1037 	}
1038 
1039 	pnum = 0;
1040 	if (store->sb_page) {
1041 		store->filemap[0] = store->sb_page;
1042 		pnum = 1;
1043 		store->sb_index = offset;
1044 	}
1045 
1046 	for ( ; pnum < num_pages; pnum++) {
1047 		store->filemap[pnum] = alloc_page(GFP_KERNEL|__GFP_ZERO);
1048 		if (!store->filemap[pnum]) {
1049 			store->file_pages = pnum;
1050 			return -ENOMEM;
1051 		}
1052 	}
1053 	store->file_pages = pnum;
1054 
1055 	/* We need 4 bits per page, rounded up to a multiple
1056 	 * of sizeof(unsigned long) */
1057 	store->filemap_attr = kzalloc(
1058 		roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)),
1059 		GFP_KERNEL);
1060 	if (!store->filemap_attr)
1061 		return -ENOMEM;
1062 
1063 	store->bytes = bytes;
1064 
1065 	return 0;
1066 }
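
/*
 * Sizing example (hypothetical): 100000 chunks need DIV_ROUND_UP(100000, 8)
 * = 12500 bitmap bytes, plus the superblock when with_super is set (256
 * bytes, assuming that size), i.e. 12756 bytes = 4 pages of 4096 bytes.
 */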
1067 
1068 static void md_bitmap_file_unmap(struct bitmap_storage *store)
1069 {
1070 	struct file *file = store->file;
1071 	struct page *sb_page = store->sb_page;
1072 	struct page **map = store->filemap;
1073 	int pages = store->file_pages;
1074 
1075 	while (pages--)
1076 		if (map[pages] != sb_page) /* 0 is sb_page, release it below */
1077 			free_buffers(map[pages]);
1078 	kfree(map);
1079 	kfree(store->filemap_attr);
1080 
1081 	if (sb_page)
1082 		free_buffers(sb_page);
1083 
1084 	if (file) {
1085 		struct inode *inode = file_inode(file);
1086 		invalidate_mapping_pages(inode->i_mapping, 0, -1);
1087 		fput(file);
1088 	}
1089 }
1090 
1091 /*
1092  * bitmap_file_kick - if an error occurs while manipulating the bitmap file
1093  * then it is no longer reliable, so we stop using it and we mark the file
1094  * as failed in the superblock
1095  */
1096 static void md_bitmap_file_kick(struct bitmap *bitmap)
1097 {
1098 	if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) {
1099 		bitmap_update_sb(bitmap);
1100 
1101 		if (bitmap->storage.file) {
1102 			pr_warn("%s: kicking failed bitmap file %pD4 from array!\n",
1103 				bmname(bitmap), bitmap->storage.file);
1104 
1105 		} else
1106 			pr_warn("%s: disabling internal bitmap due to errors\n",
1107 				bmname(bitmap));
1108 	}
1109 }
1110 
1111 enum bitmap_page_attr {
1112 	BITMAP_PAGE_DIRTY = 0,     /* there are set bits that need to be synced */
1113 	BITMAP_PAGE_PENDING = 1,   /* there are bits that are being cleaned.
1114 				    * i.e. counter is 1 or 2. */
1115 	BITMAP_PAGE_NEEDWRITE = 2, /* there are cleared bits that need to be synced */
1116 };
1117 
1118 static inline void set_page_attr(struct bitmap *bitmap, int pnum,
1119 				 enum bitmap_page_attr attr)
1120 {
1121 	set_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
1122 }
1123 
1124 static inline void clear_page_attr(struct bitmap *bitmap, int pnum,
1125 				   enum bitmap_page_attr attr)
1126 {
1127 	clear_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
1128 }
1129 
1130 static inline int test_page_attr(struct bitmap *bitmap, int pnum,
1131 				 enum bitmap_page_attr attr)
1132 {
1133 	return test_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
1134 }
1135 
1136 static inline int test_and_clear_page_attr(struct bitmap *bitmap, int pnum,
1137 					   enum bitmap_page_attr attr)
1138 {
1139 	return test_and_clear_bit((pnum<<2) + attr,
1140 				  bitmap->storage.filemap_attr);
1141 }
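
/*
 * Each filemap page owns a 4-bit slot in filemap_attr, so attribute 'attr'
 * of page 'pnum' lives at bit (pnum << 2) + attr; e.g. page 3's NEEDWRITE
 * flag is bit 14.
 */
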
1142 /*
1143  * bitmap_file_set_bit -- called before performing a write to the md device
1144  * to set (and eventually sync) a particular bit in the bitmap file
1145  *
1146  * we set the bit immediately, then we record the page number so that
1147  * when an unplug occurs, we can flush the dirty pages out to disk
1148  */
1149 static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
1150 {
1151 	unsigned long bit;
1152 	struct page *page;
1153 	void *kaddr;
1154 	unsigned long chunk = block >> bitmap->counts.chunkshift;
1155 	struct bitmap_storage *store = &bitmap->storage;
1156 	unsigned long index = file_page_index(store, chunk);
1157 	unsigned long node_offset = 0;
1158 
1159 	index += store->sb_index;
1160 	if (mddev_is_clustered(bitmap->mddev))
1161 		node_offset = bitmap->cluster_slot * store->file_pages;
1162 
1163 	page = filemap_get_page(&bitmap->storage, chunk);
1164 	if (!page)
1165 		return;
1166 	bit = file_page_offset(&bitmap->storage, chunk);
1167 
1168 	/* set the bit */
1169 	kaddr = kmap_local_page(page);
1170 	if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
1171 		set_bit(bit, kaddr);
1172 	else
1173 		set_bit_le(bit, kaddr);
1174 	kunmap_local(kaddr);
1175 	pr_debug("set file bit %lu page %lu\n", bit, index);
1176 	/* record page number so it gets flushed to disk when unplug occurs */
1177 	set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_DIRTY);
1178 }
1179 
1180 static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
1181 {
1182 	unsigned long bit;
1183 	struct page *page;
1184 	void *paddr;
1185 	unsigned long chunk = block >> bitmap->counts.chunkshift;
1186 	struct bitmap_storage *store = &bitmap->storage;
1187 	unsigned long index = file_page_index(store, chunk);
1188 	unsigned long node_offset = 0;
1189 
1190 	index += store->sb_index;
1191 	if (mddev_is_clustered(bitmap->mddev))
1192 		node_offset = bitmap->cluster_slot * store->file_pages;
1193 
1194 	page = filemap_get_page(&bitmap->storage, chunk);
1195 	if (!page)
1196 		return;
1197 	bit = file_page_offset(&bitmap->storage, chunk);
1198 	paddr = kmap_local_page(page);
1199 	if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
1200 		clear_bit(bit, paddr);
1201 	else
1202 		clear_bit_le(bit, paddr);
1203 	kunmap_local(paddr);
1204 	if (!test_page_attr(bitmap, index - node_offset, BITMAP_PAGE_NEEDWRITE)) {
1205 		set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_PENDING);
1206 		bitmap->allclean = 0;
1207 	}
1208 }
1209 
1210 static int md_bitmap_file_test_bit(struct bitmap *bitmap, sector_t block)
1211 {
1212 	unsigned long bit;
1213 	struct page *page;
1214 	void *paddr;
1215 	unsigned long chunk = block >> bitmap->counts.chunkshift;
1216 	int set = 0;
1217 
1218 	page = filemap_get_page(&bitmap->storage, chunk);
1219 	if (!page)
1220 		return -EINVAL;
1221 	bit = file_page_offset(&bitmap->storage, chunk);
1222 	paddr = kmap_local_page(page);
1223 	if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
1224 		set = test_bit(bit, paddr);
1225 	else
1226 		set = test_bit_le(bit, paddr);
1227 	kunmap_local(paddr);
1228 	return set;
1229 }
1230 
1231 /* this gets called when the md device is ready to unplug its underlying
1232  * (slave) device queues -- before we let any writes go down, we need to
1233  * sync the dirty pages of the bitmap file to disk */
1234 static void __bitmap_unplug(struct bitmap *bitmap)
1235 {
1236 	unsigned long i;
1237 	int dirty, need_write;
1238 	int writing = 0;
1239 
1240 	if (!bitmap_enabled(bitmap, true))
1241 		return;
1242 
1243 	/* look at each page to see if there are any set bits that need to be
1244 	 * flushed out to disk */
1245 	for (i = 0; i < bitmap->storage.file_pages; i++) {
1246 		dirty = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
1247 		need_write = test_and_clear_page_attr(bitmap, i,
1248 						      BITMAP_PAGE_NEEDWRITE);
1249 		if (dirty || need_write) {
1250 			if (!writing) {
1251 				md_bitmap_wait_writes(bitmap);
1252 				mddev_add_trace_msg(bitmap->mddev,
1253 					"md bitmap_unplug");
1254 			}
1255 			clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING);
1256 			filemap_write_page(bitmap, i, false);
1257 			writing = 1;
1258 		}
1259 	}
1260 	if (writing)
1261 		md_bitmap_wait_writes(bitmap);
1262 
1263 	if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
1264 		md_bitmap_file_kick(bitmap);
1265 }
1266 
1267 struct bitmap_unplug_work {
1268 	struct work_struct work;
1269 	struct bitmap *bitmap;
1270 	struct completion *done;
1271 };
1272 
1273 static void md_bitmap_unplug_fn(struct work_struct *work)
1274 {
1275 	struct bitmap_unplug_work *unplug_work =
1276 		container_of(work, struct bitmap_unplug_work, work);
1277 
1278 	__bitmap_unplug(unplug_work->bitmap);
1279 	complete(unplug_work->done);
1280 }
1281 
1282 static void bitmap_unplug_async(struct bitmap *bitmap)
1283 {
1284 	DECLARE_COMPLETION_ONSTACK(done);
1285 	struct bitmap_unplug_work unplug_work;
1286 
1287 	INIT_WORK_ONSTACK(&unplug_work.work, md_bitmap_unplug_fn);
1288 	unplug_work.bitmap = bitmap;
1289 	unplug_work.done = &done;
1290 
1291 	queue_work(md_bitmap_wq, &unplug_work.work);
1292 	wait_for_completion(&done);
1293 	destroy_work_on_stack(&unplug_work.work);
1294 }
1295 
1296 static void bitmap_unplug(struct mddev *mddev, bool sync)
1297 {
1298 	struct bitmap *bitmap = mddev->bitmap;
1299 
1300 	if (!bitmap)
1301 		return;
1302 
1303 	if (sync)
1304 		__bitmap_unplug(bitmap);
1305 	else
1306 		bitmap_unplug_async(bitmap);
1307 }
1308 
1309 static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed);
1310 
1311 /*
1312  * Initialize the in-memory bitmap from the on-disk bitmap and set up the memory
1313  * mapping of the bitmap file.
1314  *
1315  * Special case: If there's no bitmap file, or if the bitmap file had been
1316  * previously kicked from the array, we mark all the bits as 1's in order to
1317  * cause a full resync.
1318  *
1319  * We ignore all bits for sectors that end earlier than 'start'.
1320  * This is used when reading an out-of-date bitmap.
1321  */
1322 static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
1323 {
1324 	bool outofdate = test_bit(BITMAP_STALE, &bitmap->flags);
1325 	struct mddev *mddev = bitmap->mddev;
1326 	unsigned long chunks = bitmap->counts.chunks;
1327 	struct bitmap_storage *store = &bitmap->storage;
1328 	struct file *file = store->file;
1329 	unsigned long node_offset = 0;
1330 	unsigned long bit_cnt = 0;
1331 	unsigned long i;
1332 	int ret;
1333 
1334 	if (!file && !mddev->bitmap_info.offset) {
1335 		/* No permanent bitmap - fill with '1s'. */
1336 		store->filemap = NULL;
1337 		store->file_pages = 0;
1338 		for (i = 0; i < chunks ; i++) {
1339 			/* if the disk bit is set, set the memory bit */
1340 			int needed = ((sector_t)(i+1) << (bitmap->counts.chunkshift)
1341 				      >= start);
1342 			md_bitmap_set_memory_bits(bitmap,
1343 						  (sector_t)i << bitmap->counts.chunkshift,
1344 						  needed);
1345 		}
1346 		return 0;
1347 	}
1348 
1349 	if (file && i_size_read(file->f_mapping->host) < store->bytes) {
1350 		pr_warn("%s: bitmap file too short %lu < %lu\n",
1351 			bmname(bitmap),
1352 			(unsigned long) i_size_read(file->f_mapping->host),
1353 			store->bytes);
1354 		ret = -ENOSPC;
1355 		goto err;
1356 	}
1357 
1358 	if (mddev_is_clustered(mddev))
1359 		node_offset = bitmap->cluster_slot * (DIV_ROUND_UP(store->bytes, PAGE_SIZE));
1360 
1361 	for (i = 0; i < store->file_pages; i++) {
1362 		struct page *page = store->filemap[i];
1363 		int count;
1364 
1365 		/* the last page may contain fewer than PAGE_SIZE bytes */
1366 		if (i == store->file_pages - 1)
1367 			count = store->bytes - i * PAGE_SIZE;
1368 		else
1369 			count = PAGE_SIZE;
1370 
1371 		if (file)
1372 			ret = read_file_page(file, i, bitmap, count, page);
1373 		else
1374 			ret = read_sb_page(mddev, 0, page, i + node_offset,
1375 					   count);
1376 		if (ret)
1377 			goto err;
1378 	}
1379 
1380 	if (outofdate) {
1381 		pr_warn("%s: bitmap file is out of date, doing full recovery\n",
1382 			bmname(bitmap));
1383 
1384 		for (i = 0; i < store->file_pages; i++) {
1385 			struct page *page = store->filemap[i];
1386 			unsigned long offset = 0;
1387 			void *paddr;
1388 
1389 			if (i == 0 && !mddev->bitmap_info.external)
1390 				offset = sizeof(bitmap_super_t);
1391 
1392 			/*
1393 			 * If the bitmap is out of date, dirty the whole page
1394 			 * and write it out
1395 			 */
1396 			paddr = kmap_local_page(page);
1397 			memset(paddr + offset, 0xff, PAGE_SIZE - offset);
1398 			kunmap_local(paddr);
1399 
1400 			filemap_write_page(bitmap, i, true);
1401 			if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) {
1402 				ret = -EIO;
1403 				goto err;
1404 			}
1405 		}
1406 	}
1407 
1408 	for (i = 0; i < chunks; i++) {
1409 		struct page *page = filemap_get_page(&bitmap->storage, i);
1410 		unsigned long bit = file_page_offset(&bitmap->storage, i);
1411 		void *paddr;
1412 		bool was_set;
1413 
1414 		paddr = kmap_local_page(page);
1415 		if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
1416 			was_set = test_bit(bit, paddr);
1417 		else
1418 			was_set = test_bit_le(bit, paddr);
1419 		kunmap_local(paddr);
1420 
1421 		if (was_set) {
1422 			/* if the disk bit is set, set the memory bit */
1423 			int needed = ((sector_t)(i+1) << bitmap->counts.chunkshift
1424 				      >= start);
1425 			md_bitmap_set_memory_bits(bitmap,
1426 						  (sector_t)i << bitmap->counts.chunkshift,
1427 						  needed);
1428 			bit_cnt++;
1429 		}
1430 	}
1431 
1432 	pr_debug("%s: bitmap initialized from disk: read %lu pages, set %lu of %lu bits\n",
1433 		 bmname(bitmap), store->file_pages,
1434 		 bit_cnt, chunks);
1435 
1436 	return 0;
1437 
1438  err:
1439 	pr_warn("%s: bitmap initialisation failed: %d\n",
1440 		bmname(bitmap), ret);
1441 	return ret;
1442 }
1443 
1444 /* just flag bitmap pages as needing to be written. */
1445 static void bitmap_write_all(struct mddev *mddev)
1446 {
1447 	int i;
1448 	struct bitmap *bitmap = mddev->bitmap;
1449 
1450 	if (!bitmap || !bitmap->storage.filemap)
1451 		return;
1452 
1453 	/* Only one copy, so nothing needed */
1454 	if (bitmap->storage.file)
1455 		return;
1456 
1457 	for (i = 0; i < bitmap->storage.file_pages; i++)
1458 		set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE);
1459 	bitmap->allclean = 0;
1460 }
1461 
1462 static void md_bitmap_count_page(struct bitmap_counts *bitmap,
1463 				 sector_t offset, int inc)
1464 {
1465 	sector_t chunk = offset >> bitmap->chunkshift;
1466 	unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
1467 	bitmap->bp[page].count += inc;
1468 	md_bitmap_checkfree(bitmap, page);
1469 }
1470 
1471 static void md_bitmap_set_pending(struct bitmap_counts *bitmap, sector_t offset)
1472 {
1473 	sector_t chunk = offset >> bitmap->chunkshift;
1474 	unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
1475 	struct bitmap_page *bp = &bitmap->bp[page];
1476 
1477 	if (!bp->pending)
1478 		bp->pending = 1;
1479 }
1480 
1481 static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap,
1482 					       sector_t offset, sector_t *blocks,
1483 					       int create);
1484 
1485 static void mddev_set_timeout(struct mddev *mddev, unsigned long timeout,
1486 			      bool force)
1487 {
1488 	struct md_thread *thread;
1489 
1490 	rcu_read_lock();
1491 	thread = rcu_dereference(mddev->thread);
1492 
1493 	if (!thread)
1494 		goto out;
1495 
1496 	if (force || thread->timeout < MAX_SCHEDULE_TIMEOUT)
1497 		thread->timeout = timeout;
1498 
1499 out:
1500 	rcu_read_unlock();
1501 }
1502 
1503 /*
1504  * bitmap daemon -- periodically wakes up to clean bits and flush pages
1505  *			out to disk
1506  */
1507 static void bitmap_daemon_work(struct mddev *mddev)
1508 {
1509 	struct bitmap *bitmap;
1510 	unsigned long j;
1511 	unsigned long nextpage;
1512 	sector_t blocks;
1513 	struct bitmap_counts *counts;
1514 
1515 	/* Use a mutex to guard daemon_work against
1516 	 * bitmap_destroy.
1517 	 */
1518 	mutex_lock(&mddev->bitmap_info.mutex);
1519 	bitmap = mddev->bitmap;
1520 	if (bitmap == NULL) {
1521 		mutex_unlock(&mddev->bitmap_info.mutex);
1522 		return;
1523 	}
1524 	if (time_before(jiffies, bitmap->daemon_lastrun
1525 			+ mddev->bitmap_info.daemon_sleep))
1526 		goto done;
1527 
1528 	bitmap->daemon_lastrun = jiffies;
1529 	if (bitmap->allclean) {
1530 		mddev_set_timeout(mddev, MAX_SCHEDULE_TIMEOUT, true);
1531 		goto done;
1532 	}
1533 	bitmap->allclean = 1;
1534 
1535 	mddev_add_trace_msg(bitmap->mddev, "md bitmap_daemon_work");
1536 
1537 	/* Any file-page which is PENDING now needs to be written.
1538 	 * So set NEEDWRITE now, then after we make any last-minute changes
1539 	 * we will write it.
1540 	 */
1541 	for (j = 0; j < bitmap->storage.file_pages; j++)
1542 		if (test_and_clear_page_attr(bitmap, j,
1543 					     BITMAP_PAGE_PENDING))
1544 			set_page_attr(bitmap, j,
1545 				      BITMAP_PAGE_NEEDWRITE);
1546 
1547 	if (bitmap->need_sync &&
1548 	    mddev->bitmap_info.external == 0) {
1549 		/* Arrange for superblock update as well as
1550 		 * other changes */
1551 		bitmap_super_t *sb;
1552 		bitmap->need_sync = 0;
1553 		if (bitmap->storage.filemap) {
1554 			sb = kmap_local_page(bitmap->storage.sb_page);
1555 			sb->events_cleared =
1556 				cpu_to_le64(bitmap->events_cleared);
1557 			kunmap_local(sb);
1558 			set_page_attr(bitmap, 0,
1559 				      BITMAP_PAGE_NEEDWRITE);
1560 		}
1561 	}
1562 	/* Now look at the bitmap counters and if any are '2' or '1',
1563 	 * decrement and handle accordingly.
1564 	 */
1565 	counts = &bitmap->counts;
1566 	spin_lock_irq(&counts->lock);
1567 	nextpage = 0;
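	/*
	 * Note: bp[].pending lets whole pages of counters be skipped at once;
	 * when j reaches the first counter of a page whose pending flag is
	 * clear, OR-ing in PAGE_COUNTER_MASK advances j past all
	 * PAGE_COUNTER_RATIO counters of that page.
	 */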
1568 	for (j = 0; j < counts->chunks; j++) {
1569 		bitmap_counter_t *bmc;
1570 		sector_t  block = (sector_t)j << counts->chunkshift;
1571 
1572 		if (j == nextpage) {
1573 			nextpage += PAGE_COUNTER_RATIO;
1574 			if (!counts->bp[j >> PAGE_COUNTER_SHIFT].pending) {
1575 				j |= PAGE_COUNTER_MASK;
1576 				continue;
1577 			}
1578 			counts->bp[j >> PAGE_COUNTER_SHIFT].pending = 0;
1579 		}
1580 
1581 		bmc = md_bitmap_get_counter(counts, block, &blocks, 0);
1582 		if (!bmc) {
1583 			j |= PAGE_COUNTER_MASK;
1584 			continue;
1585 		}
1586 		if (*bmc == 1 && !bitmap->need_sync) {
1587 			/* We can clear the bit */
1588 			*bmc = 0;
1589 			md_bitmap_count_page(counts, block, -1);
1590 			md_bitmap_file_clear_bit(bitmap, block);
1591 		} else if (*bmc && *bmc <= 2) {
1592 			*bmc = 1;
1593 			md_bitmap_set_pending(counts, block);
1594 			bitmap->allclean = 0;
1595 		}
1596 	}
1597 	spin_unlock_irq(&counts->lock);
1598 
1599 	md_bitmap_wait_writes(bitmap);
1600 	/* Now start writeout on any page in NEEDWRITE that isn't DIRTY.
1601 	 * DIRTY pages need to be written by bitmap_unplug so it can wait
1602 	 * for them.
1603 	 * If we find any DIRTY page we stop there and let bitmap_unplug
1604 	 * handle all the rest.  This is important in the case where
1605 	 * the first block holds the superblock and it has been updated.
1606 	 * We mustn't write any other blocks before the superblock.
1607 	 */
1608 	for (j = 0;
1609 	     j < bitmap->storage.file_pages
1610 		     && !test_bit(BITMAP_STALE, &bitmap->flags);
1611 	     j++) {
1612 		if (test_page_attr(bitmap, j,
1613 				   BITMAP_PAGE_DIRTY))
1614 			/* bitmap_unplug will handle the rest */
1615 			break;
1616 		if (bitmap->storage.filemap &&
1617 		    test_and_clear_page_attr(bitmap, j,
1618 					     BITMAP_PAGE_NEEDWRITE))
1619 			filemap_write_page(bitmap, j, false);
1620 	}
1621 
1622  done:
1623 	if (bitmap->allclean == 0)
1624 		mddev_set_timeout(mddev, mddev->bitmap_info.daemon_sleep, true);
1625 	mutex_unlock(&mddev->bitmap_info.mutex);
1626 }
1627 
1628 static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap,
1629 					       sector_t offset, sector_t *blocks,
1630 					       int create)
1631 __releases(bitmap->lock)
1632 __acquires(bitmap->lock)
1633 {
1634 	/* If 'create', we might release the lock and reclaim it.
1635 	 * The lock must have been taken with interrupts enabled.
1636 	 * If !create, we don't release the lock.
1637 	 */
1638 	sector_t chunk = offset >> bitmap->chunkshift;
1639 	unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
1640 	unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT;
1641 	sector_t csize = ((sector_t)1) << bitmap->chunkshift;
1642 	int err;
1643 
1644 	if (page >= bitmap->pages) {
1645 		/*
1646 		 * This can happen if bitmap_start_sync goes beyond
1647 		 * end-of-device while looking for a whole page, or the
1648 		 * user wrote a huge number to sysfs bitmap_set_bits.
1649 		 */
1650 		*blocks = csize - (offset & (csize - 1));
1651 		return NULL;
1652 	}
1653 	err = md_bitmap_checkpage(bitmap, page, create, 0);
1654 
1655 	if (bitmap->bp[page].hijacked ||
1656 	    bitmap->bp[page].map == NULL)
1657 		csize = ((sector_t)1) << (bitmap->chunkshift +
1658 					  PAGE_COUNTER_SHIFT);
1659 
1660 	*blocks = csize - (offset & (csize - 1));
1661 
1662 	if (err < 0)
1663 		return NULL;
1664 
1665 	/* now locked ... */
1666 
1667 	if (bitmap->bp[page].hijacked) { /* hijacked pointer */
1668 		/* should we use the first or second counter field
1669 		 * of the hijacked pointer? */
1670 		int hi = (pageoff > PAGE_COUNTER_MASK);
1671 		return  &((bitmap_counter_t *)
1672 			  &bitmap->bp[page].map)[hi];
1673 	} else /* page is allocated */
1674 		return (bitmap_counter_t *)
1675 			&(bitmap->bp[page].map[pageoff]);
1676 }
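
/*
 * Note on the hijacked case above: the two 16-bit halves of the stashed
 * pointer stand in for the whole page, so chunks whose pageoff falls in the
 * lower half of the page share counter 0 and the rest share counter 1, with
 * csize widened to match.
 */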
1677 
1678 static void bitmap_start_write(struct mddev *mddev, sector_t offset,
1679 			       unsigned long sectors)
1680 {
1681 	struct bitmap *bitmap = mddev->bitmap;
1682 
1683 	if (!bitmap)
1684 		return;
1685 
1686 	while (sectors) {
1687 		sector_t blocks;
1688 		bitmap_counter_t *bmc;
1689 
1690 		spin_lock_irq(&bitmap->counts.lock);
1691 		bmc = md_bitmap_get_counter(&bitmap->counts, offset, &blocks, 1);
1692 		if (!bmc) {
1693 			spin_unlock_irq(&bitmap->counts.lock);
1694 			return;
1695 		}
1696 
1697 		if (unlikely(COUNTER(*bmc) == COUNTER_MAX)) {
1698 			DEFINE_WAIT(__wait);
1699 			/* note that it is safe to do the prepare_to_wait
1700 			 * after the test as long as we do it before dropping
1701 			 * the spinlock.
1702 			 */
1703 			prepare_to_wait(&bitmap->overflow_wait, &__wait,
1704 					TASK_UNINTERRUPTIBLE);
1705 			spin_unlock_irq(&bitmap->counts.lock);
1706 			schedule();
1707 			finish_wait(&bitmap->overflow_wait, &__wait);
1708 			continue;
1709 		}
1710 
1711 		switch (*bmc) {
1712 		case 0:
1713 			md_bitmap_file_set_bit(bitmap, offset);
1714 			md_bitmap_count_page(&bitmap->counts, offset, 1);
1715 			fallthrough;
1716 		case 1:
1717 			*bmc = 2;
1718 		}
1719 
1720 		(*bmc)++;
1721 
1722 		spin_unlock_irq(&bitmap->counts.lock);
1723 
1724 		offset += blocks;
1725 		if (sectors > blocks)
1726 			sectors -= blocks;
1727 		else
1728 			sectors = 0;
1729 	}
1730 }
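
/*
 * The *bmc handling above is easier to follow with the counter layout made
 * explicit.  A compile-and-run sketch mirroring the NEEDED_MASK /
 * RESYNC_MASK / COUNTER_MAX definitions elsewhere in this file (the
 * literal values below are restated here as assumptions):
 *
 *	#include <assert.h>
 *	#include <stdint.h>
 *
 *	#define NEEDED_MASK	0x8000u		// top bit: resync needed
 *	#define RESYNC_MASK	0x4000u		// next bit: resync active
 *	#define COUNTER_MAX	0x3fffu		// low 14 bits: write count
 *	#define COUNTER(x)	((x) & COUNTER_MAX)
 *
 *	int main(void)
 *	{
 *		uint16_t bmc = 0;
 *
 *		// first writer: 0 (or 1) is bumped to 2, then incremented
 *		if (bmc == 0 || bmc == 1)
 *			bmc = 2;
 *		bmc++;
 *		assert(COUNTER(bmc) == 3);
 *
 *		// the flag bits ride above the count and survive inc/dec
 *		bmc |= NEEDED_MASK;
 *		bmc--;
 *		assert((bmc & NEEDED_MASK) && COUNTER(bmc) == 2);
 *		return 0;
 *	}
 */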
1731 
1732 static void bitmap_end_write(struct mddev *mddev, sector_t offset,
1733 			     unsigned long sectors)
1734 {
1735 	struct bitmap *bitmap = mddev->bitmap;
1736 
1737 	if (!bitmap)
1738 		return;
1739 
1740 	while (sectors) {
1741 		sector_t blocks;
1742 		unsigned long flags;
1743 		bitmap_counter_t *bmc;
1744 
1745 		spin_lock_irqsave(&bitmap->counts.lock, flags);
1746 		bmc = md_bitmap_get_counter(&bitmap->counts, offset, &blocks, 0);
1747 		if (!bmc) {
1748 			spin_unlock_irqrestore(&bitmap->counts.lock, flags);
1749 			return;
1750 		}
1751 
1752 		if (!bitmap->mddev->degraded) {
1753 			if (bitmap->events_cleared < bitmap->mddev->events) {
1754 				bitmap->events_cleared = bitmap->mddev->events;
1755 				bitmap->need_sync = 1;
1756 				sysfs_notify_dirent_safe(
1757 						bitmap->sysfs_can_clear);
1758 			}
1759 		} else if (!NEEDED(*bmc)) {
1760 			*bmc |= NEEDED_MASK;
1761 		}
1762 
1763 		if (COUNTER(*bmc) == COUNTER_MAX)
1764 			wake_up(&bitmap->overflow_wait);
1765 
1766 		(*bmc)--;
1767 		if (*bmc <= 2) {
1768 			md_bitmap_set_pending(&bitmap->counts, offset);
1769 			bitmap->allclean = 0;
1770 		}
1771 		spin_unlock_irqrestore(&bitmap->counts.lock, flags);
1772 		offset += blocks;
1773 		if (sectors > blocks)
1774 			sectors -= blocks;
1775 		else
1776 			sectors = 0;
1777 	}
1778 }
1779 
1780 static bool __bitmap_start_sync(struct bitmap *bitmap, sector_t offset,
1781 				sector_t *blocks, bool degraded)
1782 {
1783 	bitmap_counter_t *bmc;
1784 	bool rv = false;
1785 
1786 	spin_lock_irq(&bitmap->counts.lock);
1787 	bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
1788 	if (bmc) {
1789 		/* locked */
1790 		if (RESYNC(*bmc)) {
1791 			rv = true;
1792 		} else if (NEEDED(*bmc)) {
1793 			rv = true;
1794 			if (!degraded) { /* don't set/clear bits if degraded */
1795 				*bmc |= RESYNC_MASK;
1796 				*bmc &= ~NEEDED_MASK;
1797 			}
1798 		}
1799 	}
1800 	spin_unlock_irq(&bitmap->counts.lock);
1801 
1802 	return rv;
1803 }
1804 
1805 static bool bitmap_start_sync(struct mddev *mddev, sector_t offset,
1806 			      sector_t *blocks, bool degraded)
1807 {
1808 	/* bitmap_start_sync must always report on multiples of whole
1809 	 * pages, otherwise resync (which is very PAGE_SIZE based) will
1810 	 * get confused.
1811 	 * So call __bitmap_start_sync repeatedly (if needed) until
1812 	 * at least PAGE_SIZE>>9 blocks are covered.
1813 	 * Return the 'or' of the results.
1814 	 */
1815 	bool rv = false;
1816 	sector_t blocks1;
1817 
1818 	*blocks = 0;
1819 	while (*blocks < (PAGE_SIZE>>9)) {
1820 		rv |= __bitmap_start_sync(mddev->bitmap, offset,
1821 					  &blocks1, degraded);
1822 		offset += blocks1;
1823 		*blocks += blocks1;
1824 	}
1825 
1826 	return rv;
1827 }
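
/*
 * A runnable model of the coverage loop above, with a hypothetical stub
 * standing in for __bitmap_start_sync() (the 2-sector extent size and the
 * dirty pattern are invented for the example):
 *
 *	#include <stdbool.h>
 *	#include <stdio.h>
 *
 *	static bool start_sync_stub(unsigned long long offset,
 *				    unsigned long long *blocks)
 *	{
 *		*blocks = 2;			// report 2-sector extents
 *		return (offset / 2) % 2 == 0;	// every other extent "needed"
 *	}
 *
 *	int main(void)
 *	{
 *		unsigned long long offset = 0, blocks = 0, b1;
 *		bool rv = false;
 *
 *		while (blocks < 8) {		// PAGE_SIZE >> 9 on 4K pages
 *			rv |= start_sync_stub(offset, &b1);
 *			offset += b1;
 *			blocks += b1;
 *		}
 *		// covers 8 sectors (a whole page's worth), rv == true
 *		printf("covered %llu sectors, rv=%d\n", blocks, rv);
 *		return 0;
 *	}
 */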
1828 
1829 static void __bitmap_end_sync(struct bitmap *bitmap, sector_t offset,
1830 			      sector_t *blocks, bool aborted)
1831 {
1832 	bitmap_counter_t *bmc;
1833 	unsigned long flags;
1834 
1835 	spin_lock_irqsave(&bitmap->counts.lock, flags);
1836 	bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
1837 	if (bmc == NULL)
1838 		goto unlock;
1839 	/* locked */
1840 	if (RESYNC(*bmc)) {
1841 		*bmc &= ~RESYNC_MASK;
1842 
1843 		if (!NEEDED(*bmc) && aborted)
1844 			*bmc |= NEEDED_MASK;
1845 		else {
1846 			if (*bmc <= 2) {
1847 				md_bitmap_set_pending(&bitmap->counts, offset);
1848 				bitmap->allclean = 0;
1849 			}
1850 		}
1851 	}
1852  unlock:
1853 	spin_unlock_irqrestore(&bitmap->counts.lock, flags);
1854 }
1855 
1856 static void bitmap_end_sync(struct mddev *mddev, sector_t offset,
1857 			    sector_t *blocks)
1858 {
1859 	__bitmap_end_sync(mddev->bitmap, offset, blocks, true);
1860 }
1861 
1862 static void bitmap_close_sync(struct mddev *mddev)
1863 {
1864 	/* Sync has finished, and any bitmap chunks that weren't synced
1865 	 * properly have been aborted.  It remains for us to clear the
1866 	 * RESYNC bit wherever it is still set.
1867 	 */
1868 	sector_t sector = 0;
1869 	sector_t blocks;
1870 	struct bitmap *bitmap = mddev->bitmap;
1871 
1872 	if (!bitmap)
1873 		return;
1874 
1875 	while (sector < bitmap->mddev->resync_max_sectors) {
1876 		__bitmap_end_sync(bitmap, sector, &blocks, false);
1877 		sector += blocks;
1878 	}
1879 }
1880 
1881 static void bitmap_cond_end_sync(struct mddev *mddev, sector_t sector,
1882 				 bool force)
1883 {
1884 	sector_t s = 0;
1885 	sector_t blocks;
1886 	struct bitmap *bitmap = mddev->bitmap;
1887 
1888 	if (!bitmap)
1889 		return;
1890 	if (sector == 0) {
1891 		bitmap->last_end_sync = jiffies;
1892 		return;
1893 	}
1894 	if (!force && time_before(jiffies, (bitmap->last_end_sync
1895 				  + bitmap->mddev->bitmap_info.daemon_sleep)))
1896 		return;
1897 	wait_event(bitmap->mddev->recovery_wait,
1898 		   atomic_read(&bitmap->mddev->recovery_active) == 0);
1899 
1900 	bitmap->mddev->curr_resync_completed = sector;
1901 	set_bit(MD_SB_CHANGE_CLEAN, &bitmap->mddev->sb_flags);
1902 	sector &= ~((1ULL << bitmap->counts.chunkshift) - 1);
1903 	s = 0;
1904 	while (s < sector && s < bitmap->mddev->resync_max_sectors) {
1905 		__bitmap_end_sync(bitmap, s, &blocks, false);
1906 		s += blocks;
1907 	}
1908 	bitmap->last_end_sync = jiffies;
1909 	sysfs_notify_dirent_safe(bitmap->mddev->sysfs_completed);
1910 }
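
/*
 * The masking step above rounds 'sector' down to a chunk boundary, so only
 * fully-completed chunks have their RESYNC state cleared.  A one-liner,
 * assuming chunkshift == 7 (128-sector chunks):
 *
 *	unsigned long long sector = 1000;
 *	sector &= ~((1ULL << 7) - 1);	// -> 896, the preceding chunk boundary
 */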
1911 
1912 static void bitmap_sync_with_cluster(struct mddev *mddev,
1913 				     sector_t old_lo, sector_t old_hi,
1914 				     sector_t new_lo, sector_t new_hi)
1915 {
1916 	struct bitmap *bitmap = mddev->bitmap;
1917 	sector_t sector, blocks = 0;
1918 
1919 	for (sector = old_lo; sector < new_lo; ) {
1920 		__bitmap_end_sync(bitmap, sector, &blocks, false);
1921 		sector += blocks;
1922 	}
1923 	WARN((blocks > new_lo) && old_lo, "alignment is not correct for lo\n");
1924 
1925 	for (sector = old_hi; sector < new_hi; ) {
1926 		bitmap_start_sync(mddev, sector, &blocks, false);
1927 		sector += blocks;
1928 	}
1929 	WARN((blocks > new_hi) && old_hi, "alignment is not correct for hi\n");
1930 }
1931 
1932 static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed)
1933 {
1934 	/* For each chunk covered by any of these sectors, set the
1935 	 * counter to 2 and possibly set resync_needed.  They should all
1936 	 * be 0 at this point
1937 	 */
1938 
1939 	sector_t secs;
1940 	bitmap_counter_t *bmc;
1941 	spin_lock_irq(&bitmap->counts.lock);
1942 	bmc = md_bitmap_get_counter(&bitmap->counts, offset, &secs, 1);
1943 	if (!bmc) {
1944 		spin_unlock_irq(&bitmap->counts.lock);
1945 		return;
1946 	}
1947 	if (!*bmc) {
1948 		*bmc = 2;
1949 		md_bitmap_count_page(&bitmap->counts, offset, 1);
1950 		md_bitmap_set_pending(&bitmap->counts, offset);
1951 		bitmap->allclean = 0;
1952 	}
1953 	if (needed)
1954 		*bmc |= NEEDED_MASK;
1955 	spin_unlock_irq(&bitmap->counts.lock);
1956 }
1957 
1958 /* dirty the memory and file bits for bitmap chunks "s" to "e" */
1959 static void bitmap_dirty_bits(struct mddev *mddev, unsigned long s,
1960 			      unsigned long e)
1961 {
1962 	unsigned long chunk;
1963 	struct bitmap *bitmap = mddev->bitmap;
1964 
1965 	if (!bitmap)
1966 		return;
1967 
1968 	for (chunk = s; chunk <= e; chunk++) {
1969 		sector_t sec = (sector_t)chunk << bitmap->counts.chunkshift;
1970 
1971 		md_bitmap_set_memory_bits(bitmap, sec, 1);
1972 		md_bitmap_file_set_bit(bitmap, sec);
1973 		if (sec < bitmap->mddev->resync_offset)
1974 			/* We are asserting that the array is dirty,
1975 			 * so move the resync_offset address back so
1976 			 * that it is obvious that it is dirty
1977 			 */
1978 			bitmap->mddev->resync_offset = sec;
1979 	}
1980 }
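
/*
 * Chunk numbers translate to sectors via the shift above.  For example,
 * assuming chunkshift == 7 (128-sector chunks):
 *
 *	sector_t sec = (sector_t)5 << 7;	// chunk 5 starts at sector 640
 *
 * so dirtying chunks 5..6 marks sectors 640..895 as needing resync.
 */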
1981 
1982 static void bitmap_flush(struct mddev *mddev)
1983 {
1984 	struct bitmap *bitmap = mddev->bitmap;
1985 	long sleep;
1986 
1987 	if (!bitmap) /* there was no bitmap */
1988 		return;
1989 
1990 	/* run the daemon_work three times to ensure that everything
1991 	 * that can be flushed is flushed
1992 	 */
1993 	sleep = mddev->bitmap_info.daemon_sleep * 2;
1994 	bitmap->daemon_lastrun -= sleep;
1995 	bitmap_daemon_work(mddev);
1996 	bitmap->daemon_lastrun -= sleep;
1997 	bitmap_daemon_work(mddev);
1998 	bitmap->daemon_lastrun -= sleep;
1999 	bitmap_daemon_work(mddev);
2000 	if (mddev->bitmap_info.external)
2001 		md_super_wait(mddev);
2002 	bitmap_update_sb(bitmap);
2003 }
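
/*
 * Rewinding daemon_lastrun forces a full pass because bitmap_daemon_work()
 * bails out early while jiffies is still before daemon_lastrun +
 * daemon_sleep (see the gate at the top of that function).  With made-up
 * jiffies values:
 *
 *	unsigned long jiffies = 10000, lastrun = 9900, sleep = 200;
 *
 *	// 10000 < 9900 + 200: the daemon would go back to sleep
 *	lastrun -= 2 * sleep;			// now 9500
 *	// 10000 >= 9500 + 200: the next call does a full pass
 */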
2004 
2005 static void md_bitmap_free(void *data)
2006 {
2007 	unsigned long k, pages;
2008 	struct bitmap_page *bp;
2009 	struct bitmap *bitmap = data;
2010 
2011 	if (!bitmap) /* there was no bitmap */
2012 		return;
2013 
2014 	if (bitmap->sysfs_can_clear)
2015 		sysfs_put(bitmap->sysfs_can_clear);
2016 
2017 	if (mddev_is_clustered(bitmap->mddev) && bitmap->mddev->cluster_info &&
2018 		bitmap->cluster_slot == bitmap->mddev->cluster_ops->slot_number(bitmap->mddev))
2019 		md_cluster_stop(bitmap->mddev);
2020 
2021 	/* Shouldn't be needed - but just in case.... */
2022 	wait_event(bitmap->write_wait,
2023 		   atomic_read(&bitmap->pending_writes) == 0);
2024 
2025 	/* release the bitmap file  */
2026 	md_bitmap_file_unmap(&bitmap->storage);
2027 
2028 	bp = bitmap->counts.bp;
2029 	pages = bitmap->counts.pages;
2030 
2031 	/* free all allocated memory */
2032 
2033 	if (bp) /* deallocate the page memory */
2034 		for (k = 0; k < pages; k++)
2035 			if (bp[k].map && !bp[k].hijacked)
2036 				kfree(bp[k].map);
2037 	kfree(bp);
2038 	kfree(bitmap);
2039 }
2040 
2041 static void bitmap_start_behind_write(struct mddev *mddev)
2042 {
2043 	struct bitmap *bitmap = mddev->bitmap;
2044 	int bw;
2045 
2046 	atomic_inc(&bitmap->behind_writes);
2047 	bw = atomic_read(&bitmap->behind_writes);
2048 	if (bw > bitmap->behind_writes_used)
2049 		bitmap->behind_writes_used = bw;
2050 
2051 	pr_debug("inc write-behind count %d/%lu\n",
2052 		 bw, bitmap->mddev->bitmap_info.max_write_behind);
2053 }
2054 
2055 static void bitmap_end_behind_write(struct mddev *mddev)
2056 {
2057 	struct bitmap *bitmap = mddev->bitmap;
2058 
2059 	if (atomic_dec_and_test(&bitmap->behind_writes))
2060 		wake_up(&bitmap->behind_wait);
2061 	pr_debug("dec write-behind count %d/%lu\n",
2062 		 atomic_read(&bitmap->behind_writes),
2063 		 bitmap->mddev->bitmap_info.max_write_behind);
2064 }
2065 
2066 static void bitmap_wait_behind_writes(struct mddev *mddev)
2067 {
2068 	struct bitmap *bitmap = mddev->bitmap;
2069 
2070 	/* wait for behind writes to complete */
2071 	if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
2072 		pr_debug("md:%s: behind writes in progress - waiting to stop.\n",
2073 			 mdname(mddev));
2074 		/* need to kick something here to make sure I/O goes? */
2075 		wait_event(bitmap->behind_wait,
2076 			   atomic_read(&bitmap->behind_writes) == 0);
2077 	}
2078 }
2079 
2080 static void bitmap_destroy(struct mddev *mddev)
2081 {
2082 	struct bitmap *bitmap = mddev->bitmap;
2083 
2084 	if (!bitmap) /* there was no bitmap */
2085 		return;
2086 
2087 	bitmap_wait_behind_writes(mddev);
2088 	if (!mddev->serialize_policy)
2089 		mddev_destroy_serial_pool(mddev, NULL);
2090 
2091 	mutex_lock(&mddev->bitmap_info.mutex);
2092 	spin_lock(&mddev->lock);
2093 	mddev->bitmap = NULL; /* disconnect from the md device */
2094 	spin_unlock(&mddev->lock);
2095 	mutex_unlock(&mddev->bitmap_info.mutex);
2096 	mddev_set_timeout(mddev, MAX_SCHEDULE_TIMEOUT, true);
2097 
2098 	md_bitmap_free(bitmap);
2099 }
2100 
2101 /*
2102  * initialize the bitmap structure
2103  * if this returns an error, bitmap_destroy must be called to clean up
2104  * once mddev->bitmap is set
2105  */
2106 static struct bitmap *__bitmap_create(struct mddev *mddev, int slot)
2107 {
2108 	struct bitmap *bitmap;
2109 	sector_t blocks = mddev->resync_max_sectors;
2110 	struct file *file = mddev->bitmap_info.file;
2111 	int err;
2112 	struct kernfs_node *bm = NULL;
2113 
2114 	BUILD_BUG_ON(sizeof(bitmap_super_t) != 256);
2115 
2116 	BUG_ON(file && mddev->bitmap_info.offset);
2117 
2118 	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
2119 		pr_notice("md/raid:%s: array with journal cannot have bitmap\n",
2120 			  mdname(mddev));
2121 		return ERR_PTR(-EBUSY);
2122 	}
2123 
2124 	bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
2125 	if (!bitmap)
2126 		return ERR_PTR(-ENOMEM);
2127 
2128 	spin_lock_init(&bitmap->counts.lock);
2129 	atomic_set(&bitmap->pending_writes, 0);
2130 	init_waitqueue_head(&bitmap->write_wait);
2131 	init_waitqueue_head(&bitmap->overflow_wait);
2132 	init_waitqueue_head(&bitmap->behind_wait);
2133 
2134 	bitmap->mddev = mddev;
2135 	bitmap->cluster_slot = slot;
2136 
2137 	if (mddev->kobj.sd)
2138 		bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap");
2139 	if (bm) {
2140 		bitmap->sysfs_can_clear = sysfs_get_dirent(bm, "can_clear");
2141 		sysfs_put(bm);
2142 	} else
2143 		bitmap->sysfs_can_clear = NULL;
2144 
2145 	bitmap->storage.file = file;
2146 	if (file) {
2147 		get_file(file);
2148 		/* As future accesses to this file will use bmap,
2149 		 * and bypass the page cache, we must sync the file
2150 		 * first.
2151 		 */
2152 		vfs_fsync(file, 1);
2153 	}
2154 	/* read superblock from bitmap file (this sets mddev->bitmap_info.chunksize) */
2155 	if (!mddev->bitmap_info.external) {
2156 		/*
2157 		 * If 'MD_ARRAY_FIRST_USE' is set, then device-mapper is
2158 		 * instructing us to create a new on-disk bitmap instance.
2159 		 */
2160 		if (test_and_clear_bit(MD_ARRAY_FIRST_USE, &mddev->flags))
2161 			err = md_bitmap_new_disk_sb(bitmap);
2162 		else
2163 			err = md_bitmap_read_sb(bitmap);
2164 	} else {
2165 		err = 0;
2166 		if (mddev->bitmap_info.chunksize == 0 ||
2167 		    mddev->bitmap_info.daemon_sleep == 0)
2168 			/* chunksize and time_base need to be
2169 			 * set first. */
2170 			err = -EINVAL;
2171 	}
2172 	if (err)
2173 		goto error;
2174 
2175 	bitmap->daemon_lastrun = jiffies;
2176 	err = __bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize,
2177 			      true);
2178 	if (err)
2179 		goto error;
2180 
2181 	pr_debug("created bitmap (%lu pages) for device %s\n",
2182 		 bitmap->counts.pages, bmname(bitmap));
2183 
2184 	err = test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0;
2185 	if (err)
2186 		goto error;
2187 
2188 	return bitmap;
2189  error:
2190 	md_bitmap_free(bitmap);
2191 	return ERR_PTR(err);
2192 }
2193 
2194 static int bitmap_create(struct mddev *mddev)
2195 {
2196 	struct bitmap *bitmap = __bitmap_create(mddev, -1);
2197 
2198 	if (IS_ERR(bitmap))
2199 		return PTR_ERR(bitmap);
2200 
2201 	mddev->bitmap = bitmap;
2202 	return 0;
2203 }
2204 
2205 static int bitmap_load(struct mddev *mddev)
2206 {
2207 	int err = 0;
2208 	sector_t start = 0;
2209 	sector_t sector = 0;
2210 	struct bitmap *bitmap = mddev->bitmap;
2211 	struct md_rdev *rdev;
2212 
2213 	if (!bitmap)
2214 		goto out;
2215 
2216 	rdev_for_each(rdev, mddev)
2217 		mddev_create_serial_pool(mddev, rdev);
2218 
2219 	if (mddev_is_clustered(mddev))
2220 		mddev->cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes);
2221 
2222 	/* Clear out old bitmap info first:  Either there is none, or we
2223 	 * are resuming after someone else has possibly changed things,
2224 	 * so we should forget old cached info.
2225 	 * All chunks should be clean, but some might need_sync.
2226 	 */
2227 	while (sector < mddev->resync_max_sectors) {
2228 		sector_t blocks;
2229 		bitmap_start_sync(mddev, sector, &blocks, false);
2230 		sector += blocks;
2231 	}
2232 	bitmap_close_sync(mddev);
2233 
2234 	if (mddev->degraded == 0
2235 	    || bitmap->events_cleared == mddev->events)
2236 		/* no need to keep dirty bits to optimise a
2237 		 * re-add of a missing device */
2238 		start = mddev->resync_offset;
2239 
2240 	mutex_lock(&mddev->bitmap_info.mutex);
2241 	err = md_bitmap_init_from_disk(bitmap, start);
2242 	mutex_unlock(&mddev->bitmap_info.mutex);
2243 
2244 	if (err)
2245 		goto out;
2246 	clear_bit(BITMAP_STALE, &bitmap->flags);
2247 
2248 	/* Kick recovery in case any bits were set */
2249 	set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery);
2250 
2251 	mddev_set_timeout(mddev, mddev->bitmap_info.daemon_sleep, true);
2252 	md_wakeup_thread(mddev->thread);
2253 
2254 	bitmap_update_sb(bitmap);
2255 
2256 	if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
2257 		err = -EIO;
2258 out:
2259 	return err;
2260 }
2261 
2262 /* caller needs to free the returned bitmap with md_bitmap_free() */
2263 static void *bitmap_get_from_slot(struct mddev *mddev, int slot)
2264 {
2265 	int rv = 0;
2266 	struct bitmap *bitmap;
2267 
2268 	bitmap = __bitmap_create(mddev, slot);
2269 	if (IS_ERR(bitmap)) {
2270 		rv = PTR_ERR(bitmap);
2271 		return ERR_PTR(rv);
2272 	}
2273 
2274 	rv = md_bitmap_init_from_disk(bitmap, 0);
2275 	if (rv) {
2276 		md_bitmap_free(bitmap);
2277 		return ERR_PTR(rv);
2278 	}
2279 
2280 	return bitmap;
2281 }
2282 
2283 /* Loads the bitmap associated with slot and copies the resync information
2284  * to our bitmap
2285  */
2286 static int bitmap_copy_from_slot(struct mddev *mddev, int slot, sector_t *low,
2287 				 sector_t *high, bool clear_bits)
2288 {
2289 	int rv = 0, i, j;
2290 	sector_t block, lo = 0, hi = 0;
2291 	struct bitmap_counts *counts;
2292 	struct bitmap *bitmap;
2293 
2294 	bitmap = bitmap_get_from_slot(mddev, slot);
2295 	if (IS_ERR(bitmap)) {
2296 		pr_err("%s can't get bitmap from slot %d\n", __func__, slot);
2297 		return -1;
2298 	}
2299 
2300 	counts = &bitmap->counts;
2301 	for (j = 0; j < counts->chunks; j++) {
2302 		block = (sector_t)j << counts->chunkshift;
2303 		if (md_bitmap_file_test_bit(bitmap, block)) {
2304 			if (!lo)
2305 				lo = block;
2306 			hi = block;
2307 			md_bitmap_file_clear_bit(bitmap, block);
2308 			md_bitmap_set_memory_bits(mddev->bitmap, block, 1);
2309 			md_bitmap_file_set_bit(mddev->bitmap, block);
2310 		}
2311 	}
2312 
2313 	if (clear_bits) {
2314 		bitmap_update_sb(bitmap);
2315 		/* BITMAP_PAGE_PENDING is set, but bitmap_unplug needs
2316 		 * BITMAP_PAGE_DIRTY or _NEEDWRITE to write ... */
2317 		for (i = 0; i < bitmap->storage.file_pages; i++)
2318 			if (test_page_attr(bitmap, i, BITMAP_PAGE_PENDING))
2319 				set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE);
2320 		__bitmap_unplug(bitmap);
2321 	}
2322 	__bitmap_unplug(mddev->bitmap);
2323 	*low = lo;
2324 	*high = hi;
2325 	md_bitmap_free(bitmap);
2326 
2327 	return rv;
2328 }
2329 
2330 static void bitmap_set_pages(void *data, unsigned long pages)
2331 {
2332 	struct bitmap *bitmap = data;
2333 
2334 	bitmap->counts.pages = pages;
2335 }
2336 
2337 static int bitmap_get_stats(void *data, struct md_bitmap_stats *stats)
2338 {
2339 	struct bitmap_storage *storage;
2340 	struct bitmap_counts *counts;
2341 	struct bitmap *bitmap = data;
2342 	bitmap_super_t *sb;
2343 
2344 	if (!bitmap)
2345 		return -ENOENT;
2346 	if (!bitmap->storage.sb_page)
2347 		return -EINVAL;
2348 	sb = kmap_local_page(bitmap->storage.sb_page);
2349 	stats->sync_size = le64_to_cpu(sb->sync_size);
2350 	kunmap_local(sb);
2351 
2352 	counts = &bitmap->counts;
2353 	stats->missing_pages = counts->missing_pages;
2354 	stats->pages = counts->pages;
2355 
2356 	storage = &bitmap->storage;
2357 	stats->file_pages = storage->file_pages;
2358 	stats->file = storage->file;
2359 
2360 	stats->behind_writes = atomic_read(&bitmap->behind_writes);
2361 	stats->behind_wait = wq_has_sleeper(&bitmap->behind_wait);
2362 	stats->events_cleared = bitmap->events_cleared;
2363 	return 0;
2364 }
2365 
2366 static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks,
2367 			   int chunksize, bool init)
2368 {
2369 	/* If chunksize is 0, choose an appropriate chunk size.
2370 	 * Then possibly allocate new storage space.
2371 	 * Then quiesce, copy bits, replace bitmap, and re-start
2372 	 *
2373 	 * This function is called both to set up the initial bitmap
2374 	 * and to resize the bitmap while the array is active.
2375 	 * If this happens as a result of the array being resized,
2376 	 * chunksize will be zero, and we need to choose a suitable
2377 	 * chunksize, otherwise we use what we are given.
2378 	 */
2379 	struct bitmap_storage store;
2380 	struct bitmap_counts old_counts;
2381 	unsigned long chunks;
2382 	sector_t block;
2383 	sector_t old_blocks, new_blocks;
2384 	int chunkshift;
2385 	int ret = 0;
2386 	long pages;
2387 	struct bitmap_page *new_bp;
2388 
2389 	if (bitmap->storage.file && !init) {
2390 		pr_info("md: cannot resize file-based bitmap\n");
2391 		return -EINVAL;
2392 	}
2393 
2394 	if (chunksize == 0) {
2395 		/* If there is enough space, leave the chunk size unchanged,
2396 		 * else increase it by a factor of two until there is enough space.
2397 		 */
2398 		long bytes;
2399 		long space = bitmap->mddev->bitmap_info.space;
2400 
2401 		if (space == 0) {
2402 			/* We don't know how much space there is, so limit
2403 			 * it to the current size, in sectors.
2404 			 */
2405 			bytes = DIV_ROUND_UP(bitmap->counts.chunks, 8);
2406 			if (!bitmap->mddev->bitmap_info.external)
2407 				bytes += sizeof(bitmap_super_t);
2408 			space = DIV_ROUND_UP(bytes, 512);
2409 			bitmap->mddev->bitmap_info.space = space;
2410 		}
2411 		chunkshift = bitmap->counts.chunkshift;
2412 		chunkshift--;
2413 		do {
2414 			/* 'chunkshift' is the shift from block size to chunk size */
2415 			chunkshift++;
2416 			chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift);
2417 			bytes = DIV_ROUND_UP(chunks, 8);
2418 			if (!bitmap->mddev->bitmap_info.external)
2419 				bytes += sizeof(bitmap_super_t);
2420 		} while (bytes > (space << 9) && (chunkshift + BITMAP_BLOCK_SHIFT) <
2421 			(BITS_PER_BYTE * sizeof(((bitmap_super_t *)0)->chunksize) - 1));
2422 	} else
2423 		chunkshift = ffz(~chunksize) - BITMAP_BLOCK_SHIFT;
2424 
2425 	chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift);
2426 	memset(&store, 0, sizeof(store));
2427 	if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file)
2428 		ret = md_bitmap_storage_alloc(&store, chunks,
2429 					      !bitmap->mddev->bitmap_info.external,
2430 					      mddev_is_clustered(bitmap->mddev)
2431 					      ? bitmap->cluster_slot : 0);
2432 	if (ret) {
2433 		md_bitmap_file_unmap(&store);
2434 		goto err;
2435 	}
2436 
2437 	pages = DIV_ROUND_UP(chunks, PAGE_COUNTER_RATIO);
2438 
2439 	new_bp = kcalloc(pages, sizeof(*new_bp), GFP_KERNEL);
2440 	ret = -ENOMEM;
2441 	if (!new_bp) {
2442 		md_bitmap_file_unmap(&store);
2443 		goto err;
2444 	}
2445 
2446 	if (!init)
2447 		bitmap->mddev->pers->quiesce(bitmap->mddev, 1);
2448 
2449 	store.file = bitmap->storage.file;
2450 	bitmap->storage.file = NULL;
2451 
2452 	if (store.sb_page && bitmap->storage.sb_page)
2453 		memcpy(page_address(store.sb_page),
2454 		       page_address(bitmap->storage.sb_page),
2455 		       sizeof(bitmap_super_t));
2456 	spin_lock_irq(&bitmap->counts.lock);
2457 	md_bitmap_file_unmap(&bitmap->storage);
2458 	bitmap->storage = store;
2459 
2460 	old_counts = bitmap->counts;
2461 	bitmap->counts.bp = new_bp;
2462 	bitmap->counts.pages = pages;
2463 	bitmap->counts.missing_pages = pages;
2464 	bitmap->counts.chunkshift = chunkshift;
2465 	bitmap->counts.chunks = chunks;
2466 	bitmap->mddev->bitmap_info.chunksize = 1UL << (chunkshift +
2467 						     BITMAP_BLOCK_SHIFT);
2468 
2469 	blocks = min(old_counts.chunks << old_counts.chunkshift,
2470 		     chunks << chunkshift);
2471 
2472 	/* For clustered raid, the in-memory bitmap must be pre-allocated */
2473 	if (mddev_is_clustered(bitmap->mddev)) {
2474 		unsigned long page;
2475 		for (page = 0; page < pages; page++) {
2476 			ret = md_bitmap_checkpage(&bitmap->counts, page, 1, 1);
2477 			if (ret) {
2478 				unsigned long k;
2479 
2480 				/* deallocate the page memory */
2481 				for (k = 0; k < page; k++) {
2482 					kfree(new_bp[k].map);
2483 				}
2484 				kfree(new_bp);
2485 
2486 				/* restore some fields from old_counts */
2487 				bitmap->counts.bp = old_counts.bp;
2488 				bitmap->counts.pages = old_counts.pages;
2489 				bitmap->counts.missing_pages = old_counts.pages;
2490 				bitmap->counts.chunkshift = old_counts.chunkshift;
2491 				bitmap->counts.chunks = old_counts.chunks;
2492 				bitmap->mddev->bitmap_info.chunksize =
2493 					1UL << (old_counts.chunkshift + BITMAP_BLOCK_SHIFT);
2494 				blocks = old_counts.chunks << old_counts.chunkshift;
2495 				pr_warn("Could not pre-allocate in-memory bitmap for cluster raid\n");
2496 				break;
2497 			} else
2498 				bitmap->counts.bp[page].count += 1;
2499 		}
2500 	}
2501 
2502 	for (block = 0; block < blocks; ) {
2503 		bitmap_counter_t *bmc_old, *bmc_new;
2504 		int set;
2505 
2506 		bmc_old = md_bitmap_get_counter(&old_counts, block, &old_blocks, 0);
2507 		set = bmc_old && NEEDED(*bmc_old);
2508 
2509 		if (set) {
2510 			bmc_new = md_bitmap_get_counter(&bitmap->counts, block, &new_blocks, 1);
2511 			if (bmc_new) {
2512 				if (*bmc_new == 0) {
2513 					/* need to set on-disk bits too. */
2514 					sector_t end = block + new_blocks;
2515 					sector_t start = block >> chunkshift;
2516 
2517 					start <<= chunkshift;
2518 					while (start < end) {
2519 						md_bitmap_file_set_bit(bitmap, block);
2520 						start += 1 << chunkshift;
2521 					}
2522 					*bmc_new = 2;
2523 					md_bitmap_count_page(&bitmap->counts, block, 1);
2524 					md_bitmap_set_pending(&bitmap->counts, block);
2525 				}
2526 				*bmc_new |= NEEDED_MASK;
2527 			}
2528 			if (new_blocks < old_blocks)
2529 				old_blocks = new_blocks;
2530 		}
2531 		block += old_blocks;
2532 	}
2533 
2534 	if (bitmap->counts.bp != old_counts.bp) {
2535 		unsigned long k;
2536 		for (k = 0; k < old_counts.pages; k++)
2537 			if (!old_counts.bp[k].hijacked)
2538 				kfree(old_counts.bp[k].map);
2539 		kfree(old_counts.bp);
2540 	}
2541 
2542 	if (!init) {
2543 		int i;
2544 		while (block < (chunks << chunkshift)) {
2545 			bitmap_counter_t *bmc;
2546 			bmc = md_bitmap_get_counter(&bitmap->counts, block, &new_blocks, 1);
2547 			if (bmc) {
2548 				/* new space.  It needs to be resynced, so
2549 				 * we set NEEDED_MASK.
2550 				 */
2551 				if (*bmc == 0) {
2552 					*bmc = NEEDED_MASK | 2;
2553 					md_bitmap_count_page(&bitmap->counts, block, 1);
2554 					md_bitmap_set_pending(&bitmap->counts, block);
2555 				}
2556 			}
2557 			block += new_blocks;
2558 		}
2559 		for (i = 0; i < bitmap->storage.file_pages; i++)
2560 			set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
2561 	}
2562 	spin_unlock_irq(&bitmap->counts.lock);
2563 
2564 	if (!init) {
2565 		__bitmap_unplug(bitmap);
2566 		bitmap->mddev->pers->quiesce(bitmap->mddev, 0);
2567 	}
2568 	ret = 0;
2569 err:
2570 	return ret;
2571 }
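
/*
 * On the chunkshift computation above: for a power-of-two chunksize,
 * ffz(~chunksize) is the index of its single set bit, i.e. log2(chunksize),
 * and subtracting BITMAP_BLOCK_SHIFT (9, for 512-byte blocks) yields the
 * shift from blocks to chunks.  A userspace sketch using the equivalent
 * GCC builtin:
 *
 *	unsigned long chunksize = 65536;		// 64KiB
 *	int chunkshift = __builtin_ctzl(chunksize) - 9;	// 16 - 9 == 7
 *	// so one chunk spans 1 << 7 == 128 blocks of 512 bytes
 */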
2572 
2573 static int bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize)
2574 {
2575 	struct bitmap *bitmap = mddev->bitmap;
2576 
2577 	if (!bitmap)
2578 		return 0;
2579 
2580 	return __bitmap_resize(bitmap, blocks, chunksize, false);
2581 }
2582 
2583 static ssize_t
2584 location_show(struct mddev *mddev, char *page)
2585 {
2586 	ssize_t len;
2587 	if (mddev->bitmap_info.file)
2588 		len = sprintf(page, "file");
2589 	else if (mddev->bitmap_info.offset)
2590 		len = sprintf(page, "%+lld", (long long)mddev->bitmap_info.offset);
2591 	else
2592 		len = sprintf(page, "none");
2593 	len += sprintf(page+len, "\n");
2594 	return len;
2595 }
2596 
2597 static ssize_t
2598 location_store(struct mddev *mddev, const char *buf, size_t len)
2599 {
2600 	int rv;
2601 
2602 	rv = mddev_suspend_and_lock(mddev);
2603 	if (rv)
2604 		return rv;
2605 
2606 	if (mddev->pers) {
2607 		if (mddev->recovery || mddev->sync_thread) {
2608 			rv = -EBUSY;
2609 			goto out;
2610 		}
2611 	}
2612 
2613 	if (mddev->bitmap || mddev->bitmap_info.file ||
2614 	    mddev->bitmap_info.offset) {
2615 		/* bitmap already configured.  Only option is to clear it */
2616 		if (strncmp(buf, "none", 4) != 0) {
2617 			rv = -EBUSY;
2618 			goto out;
2619 		}
2620 
2621 		bitmap_destroy(mddev);
2622 		mddev->bitmap_info.offset = 0;
2623 		if (mddev->bitmap_info.file) {
2624 			struct file *f = mddev->bitmap_info.file;
2625 			mddev->bitmap_info.file = NULL;
2626 			fput(f);
2627 		}
2628 	} else {
2629 		/* No bitmap, OK to set a location */
2630 		long long offset;
2631 
2632 		if (strncmp(buf, "none", 4) == 0)
2633 			/* nothing to be done */;
2634 		else if (strncmp(buf, "file:", 5) == 0) {
2635 			/* Not supported yet */
2636 			rv = -EINVAL;
2637 			goto out;
2638 		} else {
2639 			if (buf[0] == '+')
2640 				rv = kstrtoll(buf+1, 10, &offset);
2641 			else
2642 				rv = kstrtoll(buf, 10, &offset);
2643 			if (rv)
2644 				goto out;
2645 			if (offset == 0) {
2646 				rv = -EINVAL;
2647 				goto out;
2648 			}
2649 			if (mddev->bitmap_info.external == 0 &&
2650 			    mddev->major_version == 0 &&
2651 			    offset != mddev->bitmap_info.default_offset) {
2652 				rv = -EINVAL;
2653 				goto out;
2654 			}
2655 
2656 			mddev->bitmap_info.offset = offset;
2657 			rv = bitmap_create(mddev);
2658 			if (rv)
2659 				goto out;
2660 
2661 			rv = bitmap_load(mddev);
2662 			if (rv) {
2663 				mddev->bitmap_info.offset = 0;
2664 				bitmap_destroy(mddev);
2665 				goto out;
2666 			}
2667 		}
2668 	}
2669 	if (!mddev->external) {
2670 		/* Ensure new bitmap info is stored in
2671 		 * metadata promptly.
2672 		 */
2673 		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2674 		md_wakeup_thread(mddev->thread);
2675 	}
2676 	rv = 0;
2677 out:
2678 	mddev_unlock_and_resume(mddev);
2679 	if (rv)
2680 		return rv;
2681 	return len;
2682 }
2683 
2684 static struct md_sysfs_entry bitmap_location =
2685 __ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store);
2686 
2687 /* 'bitmap/space' is the space available at 'location' for the
2688  * bitmap.  This allows the kernel to know when it is safe to
2689  * resize the bitmap to match a resized array.
2690  */
2691 static ssize_t
2692 space_show(struct mddev *mddev, char *page)
2693 {
2694 	return sprintf(page, "%lu\n", mddev->bitmap_info.space);
2695 }
2696 
2697 static ssize_t
2698 space_store(struct mddev *mddev, const char *buf, size_t len)
2699 {
2700 	struct bitmap *bitmap;
2701 	unsigned long sectors;
2702 	int rv;
2703 
2704 	rv = kstrtoul(buf, 10, &sectors);
2705 	if (rv)
2706 		return rv;
2707 
2708 	if (sectors == 0)
2709 		return -EINVAL;
2710 
2711 	bitmap = mddev->bitmap;
2712 	if (bitmap && sectors < (bitmap->storage.bytes + 511) >> 9)
2713 		return -EFBIG; /* Bitmap is too big for this small space */
2714 
2715 	/* could make sure it isn't too big, but that isn't really
2716 	 * needed - user-space should be careful.
2717 	 */
2718 	mddev->bitmap_info.space = sectors;
2719 	return len;
2720 }
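
/*
 * The -EFBIG check above rounds the bitmap's byte footprint up to whole
 * 512-byte sectors.  For example, with storage.bytes == 1300:
 *
 *	(1300 + 511) >> 9 == 3	// sectors needed; writing "2" gets -EFBIG
 */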
2721 
2722 static struct md_sysfs_entry bitmap_space =
2723 __ATTR(space, S_IRUGO|S_IWUSR, space_show, space_store);
2724 
2725 static ssize_t
2726 timeout_show(struct mddev *mddev, char *page)
2727 {
2728 	ssize_t len;
2729 	unsigned long secs = mddev->bitmap_info.daemon_sleep / HZ;
2730 	unsigned long jifs = mddev->bitmap_info.daemon_sleep % HZ;
2731 
2732 	len = sprintf(page, "%lu", secs);
2733 	if (jifs)
2734 		len += sprintf(page+len, ".%03u", jiffies_to_msecs(jifs));
2735 	len += sprintf(page+len, "\n");
2736 	return len;
2737 }
2738 
2739 static ssize_t
2740 timeout_store(struct mddev *mddev, const char *buf, size_t len)
2741 {
2742 	/* timeout can be set at any time */
2743 	unsigned long timeout;
2744 	int rv = strict_strtoul_scaled(buf, &timeout, 4);
2745 	if (rv)
2746 		return rv;
2747 
2748 	/* just to make sure we don't overflow... */
2749 	if (timeout >= LONG_MAX / HZ)
2750 		return -EINVAL;
2751 
2752 	timeout = timeout * HZ / 10000;
2753 
2754 	if (timeout >= MAX_SCHEDULE_TIMEOUT)
2755 		timeout = MAX_SCHEDULE_TIMEOUT-1;
2756 	if (timeout < 1)
2757 		timeout = 1;
2758 
2759 	mddev->bitmap_info.daemon_sleep = timeout;
2760 	mddev_set_timeout(mddev, timeout, false);
2761 	md_wakeup_thread(mddev->thread);
2762 
2763 	return len;
2764 }
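
/*
 * A worked example of the conversion above: writing "5.5" parses (with
 * scale 4) to timeout == 55000, and
 *
 *	55000 * HZ / 10000	// 5500 jiffies at HZ == 1000, 1375 at HZ == 250
 *
 * i.e. 5.5 seconds' worth of jiffies either way.
 */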
2765 
2766 static struct md_sysfs_entry bitmap_timeout =
2767 __ATTR(time_base, S_IRUGO|S_IWUSR, timeout_show, timeout_store);
2768 
2769 static ssize_t
2770 backlog_show(struct mddev *mddev, char *page)
2771 {
2772 	return sprintf(page, "%lu\n", mddev->bitmap_info.max_write_behind);
2773 }
2774 
2775 static ssize_t
2776 backlog_store(struct mddev *mddev, const char *buf, size_t len)
2777 {
2778 	unsigned long backlog;
2779 	unsigned long old_mwb = mddev->bitmap_info.max_write_behind;
2780 	struct md_rdev *rdev;
2781 	bool has_write_mostly = false;
2782 	int rv = kstrtoul(buf, 10, &backlog);
2783 	if (rv)
2784 		return rv;
2785 	if (backlog > COUNTER_MAX)
2786 		return -EINVAL;
2787 
2788 	rv = mddev_suspend_and_lock(mddev);
2789 	if (rv)
2790 		return rv;
2791 
2792 	/*
2793 	 * Without a write-mostly device, it doesn't make sense to set
2794 	 * a backlog for max_write_behind.
2795 	 */
2796 	rdev_for_each(rdev, mddev) {
2797 		if (test_bit(WriteMostly, &rdev->flags)) {
2798 			has_write_mostly = true;
2799 			break;
2800 		}
2801 	}
2802 	if (!has_write_mostly) {
2803 		pr_warn_ratelimited("%s: can't set backlog, no write mostly device available\n",
2804 				    mdname(mddev));
2805 		mddev_unlock_and_resume(mddev);
2806 		return -EINVAL;
2807 	}
2808 
2809 	mddev->bitmap_info.max_write_behind = backlog;
2810 	if (!backlog && mddev->serial_info_pool) {
2811 		/* serial_info_pool is not needed if backlog is zero */
2812 		if (!mddev->serialize_policy)
2813 			mddev_destroy_serial_pool(mddev, NULL);
2814 	} else if (backlog && !mddev->serial_info_pool) {
2815 		/* serial_info_pool is needed since backlog is not zero */
2816 		rdev_for_each(rdev, mddev)
2817 			mddev_create_serial_pool(mddev, rdev);
2818 	}
2819 	if (old_mwb != backlog)
2820 		bitmap_update_sb(mddev->bitmap);
2821 
2822 	mddev_unlock_and_resume(mddev);
2823 	return len;
2824 }
2825 
2826 static struct md_sysfs_entry bitmap_backlog =
2827 __ATTR(backlog, S_IRUGO|S_IWUSR, backlog_show, backlog_store);
2828 
2829 static ssize_t
2830 chunksize_show(struct mddev *mddev, char *page)
2831 {
2832 	return sprintf(page, "%lu\n", mddev->bitmap_info.chunksize);
2833 }
2834 
2835 static ssize_t
2836 chunksize_store(struct mddev *mddev, const char *buf, size_t len)
2837 {
2838 	/* Can only be changed when no bitmap is active */
2839 	int rv;
2840 	unsigned long csize;
2841 	if (mddev->bitmap)
2842 		return -EBUSY;
2843 	rv = kstrtoul(buf, 10, &csize);
2844 	if (rv)
2845 		return rv;
2846 	if (csize < 512 ||
2847 	    !is_power_of_2(csize))
2848 		return -EINVAL;
2849 	if (BITS_PER_LONG > 32 && csize >= (1ULL << (BITS_PER_BYTE *
2850 		sizeof(((bitmap_super_t *)0)->chunksize))))
2851 		return -EOVERFLOW;
2852 	mddev->bitmap_info.chunksize = csize;
2853 	return len;
2854 }
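
/*
 * A quick feel for the validation above, assuming the superblock's
 * chunksize field is 32 bits wide:
 *
 *	csize = 4096		-> accepted (>= 512 and a power of two)
 *	csize = 4608		-> -EINVAL (not a power of two)
 *	csize = 1UL << 32	-> -EOVERFLOW on 64-bit (exceeds the field)
 */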
2855 
2856 static struct md_sysfs_entry bitmap_chunksize =
2857 __ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store);
2858 
2859 static ssize_t metadata_show(struct mddev *mddev, char *page)
2860 {
2861 	if (mddev_is_clustered(mddev))
2862 		return sprintf(page, "clustered\n");
2863 	return sprintf(page, "%s\n", (mddev->bitmap_info.external
2864 				      ? "external" : "internal"));
2865 }
2866 
2867 static ssize_t metadata_store(struct mddev *mddev, const char *buf, size_t len)
2868 {
2869 	if (mddev->bitmap ||
2870 	    mddev->bitmap_info.file ||
2871 	    mddev->bitmap_info.offset)
2872 		return -EBUSY;
2873 	if (strncmp(buf, "external", 8) == 0)
2874 		mddev->bitmap_info.external = 1;
2875 	else if ((strncmp(buf, "internal", 8) == 0) ||
2876 			(strncmp(buf, "clustered", 9) == 0))
2877 		mddev->bitmap_info.external = 0;
2878 	else
2879 		return -EINVAL;
2880 	return len;
2881 }
2882 
2883 static struct md_sysfs_entry bitmap_metadata =
2884 __ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
2885 
2886 static ssize_t can_clear_show(struct mddev *mddev, char *page)
2887 {
2888 	int len;
2889 	struct bitmap *bitmap;
2890 
2891 	spin_lock(&mddev->lock);
2892 	bitmap = mddev->bitmap;
2893 	if (bitmap)
2894 		len = sprintf(page, "%s\n", (bitmap->need_sync ? "false" :
2895 								 "true"));
2896 	else
2897 		len = sprintf(page, "\n");
2898 	spin_unlock(&mddev->lock);
2899 	return len;
2900 }
2901 
2902 static ssize_t can_clear_store(struct mddev *mddev, const char *buf, size_t len)
2903 {
2904 	struct bitmap *bitmap = mddev->bitmap;
2905 
2906 	if (!bitmap)
2907 		return -ENOENT;
2908 
2909 	if (strncmp(buf, "false", 5) == 0) {
2910 		bitmap->need_sync = 1;
2911 		return len;
2912 	}
2913 
2914 	if (strncmp(buf, "true", 4) == 0) {
2915 		if (mddev->degraded)
2916 			return -EBUSY;
2917 		bitmap->need_sync = 0;
2918 		return len;
2919 	}
2920 
2921 	return -EINVAL;
2922 }
2923 
2924 static struct md_sysfs_entry bitmap_can_clear =
2925 __ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store);
2926 
2927 static ssize_t
2928 behind_writes_used_show(struct mddev *mddev, char *page)
2929 {
2930 	ssize_t ret;
2931 	struct bitmap *bitmap;
2932 
2933 	spin_lock(&mddev->lock);
2934 	bitmap = mddev->bitmap;
2935 	if (!bitmap)
2936 		ret = sprintf(page, "0\n");
2937 	else
2938 		ret = sprintf(page, "%lu\n", bitmap->behind_writes_used);
2939 	spin_unlock(&mddev->lock);
2940 
2941 	return ret;
2942 }
2943 
2944 static ssize_t
2945 behind_writes_used_reset(struct mddev *mddev, const char *buf, size_t len)
2946 {
2947 	struct bitmap *bitmap = mddev->bitmap;
2948 
2949 	if (bitmap)
2950 		bitmap->behind_writes_used = 0;
2951 	return len;
2952 }
2953 
2954 static struct md_sysfs_entry max_backlog_used =
2955 __ATTR(max_backlog_used, S_IRUGO | S_IWUSR,
2956        behind_writes_used_show, behind_writes_used_reset);
2957 
2958 static struct attribute *md_bitmap_attrs[] = {
2959 	&bitmap_location.attr,
2960 	&bitmap_space.attr,
2961 	&bitmap_timeout.attr,
2962 	&bitmap_backlog.attr,
2963 	&bitmap_chunksize.attr,
2964 	&bitmap_metadata.attr,
2965 	&bitmap_can_clear.attr,
2966 	&max_backlog_used.attr,
2967 	NULL
2968 };
2969 
2970 static struct attribute_group md_bitmap_group = {
2971 	.name = "bitmap",
2972 	.attrs = md_bitmap_attrs,
2973 };
2974 
2975 static struct bitmap_operations bitmap_ops = {
2976 	.head = {
2977 		.type	= MD_BITMAP,
2978 		.id	= ID_BITMAP,
2979 		.name	= "bitmap",
2980 	},
2981 
2982 	.enabled		= bitmap_enabled,
2983 	.create			= bitmap_create,
2984 	.resize			= bitmap_resize,
2985 	.load			= bitmap_load,
2986 	.destroy		= bitmap_destroy,
2987 	.flush			= bitmap_flush,
2988 	.write_all		= bitmap_write_all,
2989 	.dirty_bits		= bitmap_dirty_bits,
2990 	.unplug			= bitmap_unplug,
2991 	.daemon_work		= bitmap_daemon_work,
2992 
2993 	.start_behind_write	= bitmap_start_behind_write,
2994 	.end_behind_write	= bitmap_end_behind_write,
2995 	.wait_behind_writes	= bitmap_wait_behind_writes,
2996 
2997 	.start_write		= bitmap_start_write,
2998 	.end_write		= bitmap_end_write,
2999 	.start_discard		= bitmap_start_write,
3000 	.end_discard		= bitmap_end_write,
3001 
3002 	.start_sync		= bitmap_start_sync,
3003 	.end_sync		= bitmap_end_sync,
3004 	.cond_end_sync		= bitmap_cond_end_sync,
3005 	.close_sync		= bitmap_close_sync,
3006 
3007 	.update_sb		= bitmap_update_sb,
3008 	.get_stats		= bitmap_get_stats,
3009 
3010 	.sync_with_cluster	= bitmap_sync_with_cluster,
3011 	.get_from_slot		= bitmap_get_from_slot,
3012 	.copy_from_slot		= bitmap_copy_from_slot,
3013 	.set_pages		= bitmap_set_pages,
3014 	.free			= md_bitmap_free,
3015 
3016 	.group			= &md_bitmap_group,
3017 };
3018 
3019 int md_bitmap_init(void)
3020 {
3021 	md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND,
3022 				       0);
3023 	if (!md_bitmap_wq)
3024 		return -ENOMEM;
3025 
3026 	return register_md_submodule(&bitmap_ops.head);
3027 }
3028 
3029 void md_bitmap_exit(void)
3030 {
3031 	destroy_workqueue(md_bitmap_wq);
3032 	unregister_md_submodule(&bitmap_ops.head);
3033 }
3034