xref: /linux/drivers/md/md-bitmap.c (revision ef1c400fafe27f06ba1e8050fc2f35662fe8e106)
// SPDX-License-Identifier: GPL-2.0-only
/*
 * bitmap.c two-level bitmap (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003
 *
 * bitmap_create  - sets up the bitmap structure
 * bitmap_destroy - destroys the bitmap structure
 *
 * additions, Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.:
 * - added disk storage for bitmap
 * - changes to allow various bitmap chunk sizes
 */

/*
 * Still to do:
 *
 * flush after percent set rather than just time based. (maybe both).
 */

#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/timer.h>
#include <linux/sched.h>
#include <linux/list.h>
#include <linux/file.h>
#include <linux/mount.h>
#include <linux/buffer_head.h>
#include <linux/seq_file.h>
#include <trace/events/block.h>
#include "md.h"
#include "md-bitmap.h"

static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks,
			   int chunksize, bool init);

static inline char *bmname(struct bitmap *bitmap)
{
	return bitmap->mddev ? mdname(bitmap->mddev) : "mdX";
}

/*
 * check a page and, if necessary, allocate it (or hijack it if the alloc fails)
 *
 * 1) check to see if this page is allocated, if it's not then try to alloc
 * 2) if the alloc fails, set the page's hijacked flag so we'll use the
 *    page pointer directly as a counter
 *
 * if we find our page, we increment the page's refcount so that it stays
 * allocated while we're using it
 */
static int md_bitmap_checkpage(struct bitmap_counts *bitmap,
			       unsigned long page, int create, int no_hijack)
__releases(bitmap->lock)
__acquires(bitmap->lock)
{
	unsigned char *mappage;

	WARN_ON_ONCE(page >= bitmap->pages);
	if (bitmap->bp[page].hijacked) /* it's hijacked, don't try to alloc */
		return 0;

	if (bitmap->bp[page].map) /* page is already allocated, just return */
		return 0;

	if (!create)
		return -ENOENT;

	/* this page has not been allocated yet */

	spin_unlock_irq(&bitmap->lock);
	/* It is possible that this is being called inside a
	 * prepare_to_wait/finish_wait loop from raid5.c:make_request().
	 * In general it is not permitted to sleep in that context as it
	 * can cause the loop to spin freely.
	 * That doesn't apply here as we can only reach this point
	 * once within any such loop.
	 * When this function completes, either bp[page].map or
	 * bp[page].hijacked will be set.  In either case, this function
	 * will abort before getting to this point again.  So there is
	 * no risk of a free-spin, and so it is safe to assert
	 * that sleeping here is allowed.
	 */
	sched_annotate_sleep();
	mappage = kzalloc(PAGE_SIZE, GFP_NOIO);
	spin_lock_irq(&bitmap->lock);

	if (mappage == NULL) {
		pr_debug("md/bitmap: map page allocation failed, hijacking\n");
		/* We don't support hijack for cluster raid */
		if (no_hijack)
			return -ENOMEM;
		/* failed - set the hijacked flag so that we can use the
		 * pointer as a counter */
		if (!bitmap->bp[page].map)
			bitmap->bp[page].hijacked = 1;
	} else if (bitmap->bp[page].map ||
		   bitmap->bp[page].hijacked) {
		/* somebody beat us to getting the page */
		kfree(mappage);
	} else {

		/* no page was in place and we have one, so install it */

		bitmap->bp[page].map = mappage;
		bitmap->missing_pages--;
	}
	return 0;
}
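
/*
 * Illustrative sketch, not part of the driver: when a counter page is
 * "hijacked", the page-pointer slot in struct bitmap_page is itself
 * reused as raw storage for two bitmap_counter_t values, each covering
 * half of the page's range at much coarser granularity.  This
 * hypothetical helper just restates the cast that
 * md_bitmap_get_counter() performs further below.
 */
static inline bitmap_counter_t *
example_hijacked_counter(struct bitmap_page *bp, int hi)
{
	/* treat the pointer field as an array of two counters and pick
	 * the first (hi == 0) or second (hi == 1) half */
	return &((bitmap_counter_t *)&bp->map)[hi];
}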

/* if the page is completely empty, free it */
/* if the page was hijacked, clear the flag so it might get allocated next time */
/* Note: the counts lock should be held when calling this */
static void md_bitmap_checkfree(struct bitmap_counts *bitmap, unsigned long page)
{
	char *ptr;

	if (bitmap->bp[page].count) /* page is still busy */
		return;

	/* page is no longer in use, it can be released */

	if (bitmap->bp[page].hijacked) { /* page was hijacked, undo this now */
		bitmap->bp[page].hijacked = 0;
		bitmap->bp[page].map = NULL;
	} else {
		/* normal case, free the page */
		ptr = bitmap->bp[page].map;
		bitmap->bp[page].map = NULL;
		bitmap->missing_pages++;
		kfree(ptr);
	}
}

/*
 * bitmap file handling - read and write the bitmap file and its superblock
 */

/*
 * basic page I/O operations
 */

/* IO operations when bitmap is stored near all superblocks */

/* choose a good rdev and read the page from there */
static int read_sb_page(struct mddev *mddev, loff_t offset,
		struct page *page, unsigned long index, int size)
{

	sector_t sector = mddev->bitmap_info.offset + offset +
		index * (PAGE_SIZE / SECTOR_SIZE);
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev) {
		u32 iosize = roundup(size, bdev_logical_block_size(rdev->bdev));

		if (!test_bit(In_sync, &rdev->flags) ||
		    test_bit(Faulty, &rdev->flags) ||
		    test_bit(Bitmap_sync, &rdev->flags))
			continue;

		if (sync_page_io(rdev, sector, iosize, page, REQ_OP_READ, true))
			return 0;
	}
	return -EIO;
}

static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mddev)
{
	/* Iterate the disks of an mddev, using rcu to protect access to the
	 * linked list, and raising the refcount of devices we return to ensure
	 * they don't disappear while in use.
	 * As devices are only added or removed when raid_disk is < 0 and
	 * nr_pending is 0 and In_sync is clear, the entries we return will
	 * still be in the same position on the list when we re-enter
	 * list_for_each_entry_continue_rcu.
	 *
	 * Note that if entered with 'rdev == NULL' to start at the
	 * beginning, we temporarily assign 'rdev' to an address which
	 * isn't really an rdev, but which can be used by
	 * list_for_each_entry_continue_rcu() to find the first entry.
	 */
	rcu_read_lock();
	if (rdev == NULL)
		/* start at the beginning */
		rdev = list_entry(&mddev->disks, struct md_rdev, same_set);
	else {
		/* release the previous rdev and start from there. */
		rdev_dec_pending(rdev, mddev);
	}
	list_for_each_entry_continue_rcu(rdev, &mddev->disks, same_set) {
		if (rdev->raid_disk >= 0 &&
		    !test_bit(Faulty, &rdev->flags)) {
			/* this is a usable device */
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			return rdev;
		}
	}
	rcu_read_unlock();
	return NULL;
}

static unsigned int optimal_io_size(struct block_device *bdev,
				    unsigned int last_page_size,
				    unsigned int io_size)
{
	if (bdev_io_opt(bdev) > bdev_logical_block_size(bdev))
		return roundup(last_page_size, bdev_io_opt(bdev));
	return io_size;
}

static unsigned int bitmap_io_size(unsigned int io_size, unsigned int opt_size,
				   loff_t start, loff_t boundary)
{
	if (io_size != opt_size &&
	    start + opt_size / SECTOR_SIZE <= boundary)
		return opt_size;
	if (start + io_size / SECTOR_SIZE <= boundary)
		return io_size;

	/* Overflows boundary */
	return 0;
}
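
/*
 * Worked example, illustrative only (hypothetical numbers): with a
 * 4 KiB write (io_size == 4096), a 64 KiB optimal I/O size
 * (opt_size == 65536), a start of sector 8 and a boundary at sector
 * 136, the optimal size still fits (8 + 65536/512 == 136 <= 136), so
 * 65536 is chosen.  Were the boundary at sector 16 only the 4 KiB
 * size would fit (8 + 4096/512 == 16), and past that 0 signals that
 * even the small I/O would cross the boundary.
 */
static inline unsigned int example_bitmap_io_size(void)
{
	return bitmap_io_size(4096, 65536, 8, 136);	/* returns 65536 */
}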

static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
			   unsigned long pg_index, struct page *page)
{
	struct block_device *bdev;
	struct mddev *mddev = bitmap->mddev;
	struct bitmap_storage *store = &bitmap->storage;
	unsigned int bitmap_limit = (bitmap->storage.file_pages - pg_index) <<
		PAGE_SHIFT;
	loff_t sboff, offset = mddev->bitmap_info.offset;
	sector_t ps = pg_index * PAGE_SIZE / SECTOR_SIZE;
	unsigned int size = PAGE_SIZE;
	unsigned int opt_size = PAGE_SIZE;
	sector_t doff;

	bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
	/* we compare length (page numbers), not page offset. */
	if ((pg_index - store->sb_index) == store->file_pages - 1) {
		unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1);

		if (last_page_size == 0)
			last_page_size = PAGE_SIZE;
		size = roundup(last_page_size, bdev_logical_block_size(bdev));
		opt_size = optimal_io_size(bdev, last_page_size, size);
	}

	sboff = rdev->sb_start + offset;
	doff = rdev->data_offset;

	/* Just make sure we aren't corrupting data or metadata */
	if (mddev->external) {
		/* Bitmap could be anywhere. */
		if (sboff + ps > doff &&
		    sboff < (doff + mddev->dev_sectors + PAGE_SIZE / SECTOR_SIZE))
			return -EINVAL;
	} else if (offset < 0) {
		/* DATA BITMAP METADATA */
		size = bitmap_io_size(size, opt_size, offset + ps, 0);
		if (size == 0)
			/* bitmap runs into metadata */
			return -EINVAL;

		if (doff + mddev->dev_sectors > sboff)
			/* data runs into bitmap */
			return -EINVAL;
	} else if (rdev->sb_start < rdev->data_offset) {
		/* METADATA BITMAP DATA */
		size = bitmap_io_size(size, opt_size, sboff + ps, doff);
		if (size == 0)
			/* bitmap runs into data */
			return -EINVAL;
	}

	md_super_write(mddev, rdev, sboff + ps, (int)min(size, bitmap_limit), page);
	return 0;
}

static void write_sb_page(struct bitmap *bitmap, unsigned long pg_index,
			  struct page *page, bool wait)
{
	struct mddev *mddev = bitmap->mddev;

	do {
		struct md_rdev *rdev = NULL;

		while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
			if (__write_sb_page(rdev, bitmap, pg_index, page) < 0) {
				set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
				return;
			}
		}
	} while (wait && md_super_wait(mddev) < 0);
}

static void md_bitmap_file_kick(struct bitmap *bitmap);

#ifdef CONFIG_MD_BITMAP_FILE
static void write_file_page(struct bitmap *bitmap, struct page *page, int wait)
{
	struct buffer_head *bh = page_buffers(page);

	while (bh && bh->b_blocknr) {
		atomic_inc(&bitmap->pending_writes);
		set_buffer_locked(bh);
		set_buffer_mapped(bh);
		submit_bh(REQ_OP_WRITE | REQ_SYNC, bh);
		bh = bh->b_this_page;
	}

	if (wait)
		wait_event(bitmap->write_wait,
			   atomic_read(&bitmap->pending_writes) == 0);
}

static void end_bitmap_write(struct buffer_head *bh, int uptodate)
{
	struct bitmap *bitmap = bh->b_private;

	if (!uptodate)
		set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
	if (atomic_dec_and_test(&bitmap->pending_writes))
		wake_up(&bitmap->write_wait);
}

static void free_buffers(struct page *page)
{
	struct buffer_head *bh;

	if (!PagePrivate(page))
		return;

	bh = page_buffers(page);
	while (bh) {
		struct buffer_head *next = bh->b_this_page;
		free_buffer_head(bh);
		bh = next;
	}
	detach_page_private(page);
	put_page(page);
}

/* read a page from a file.
 * We both read the page, and attach buffers to the page to record the
 * address of each block (using bmap).  These addresses will be used
 * to write the block later, completely bypassing the filesystem.
 * This usage is similar to how swap files are handled, and allows us
 * to write to a file with no concerns of memory allocation failing.
 */
static int read_file_page(struct file *file, unsigned long index,
		struct bitmap *bitmap, unsigned long count, struct page *page)
{
	int ret = 0;
	struct inode *inode = file_inode(file);
	struct buffer_head *bh;
	sector_t block, blk_cur;
	unsigned long blocksize = i_blocksize(inode);

	pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE,
		 (unsigned long long)index << PAGE_SHIFT);

	bh = alloc_page_buffers(page, blocksize, false);
	if (!bh) {
		ret = -ENOMEM;
		goto out;
	}
	attach_page_private(page, bh);
	blk_cur = index << (PAGE_SHIFT - inode->i_blkbits);
	while (bh) {
		block = blk_cur;

		if (count == 0)
			bh->b_blocknr = 0;
		else {
			ret = bmap(inode, &block);
			if (ret || !block) {
				ret = -EINVAL;
				bh->b_blocknr = 0;
				goto out;
			}

			bh->b_blocknr = block;
			bh->b_bdev = inode->i_sb->s_bdev;
			if (count < blocksize)
				count = 0;
			else
				count -= blocksize;

			bh->b_end_io = end_bitmap_write;
			bh->b_private = bitmap;
			atomic_inc(&bitmap->pending_writes);
			set_buffer_locked(bh);
			set_buffer_mapped(bh);
			submit_bh(REQ_OP_READ, bh);
		}
		blk_cur++;
		bh = bh->b_this_page;
	}

	wait_event(bitmap->write_wait,
		   atomic_read(&bitmap->pending_writes) == 0);
	if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
		ret = -EIO;
out:
	if (ret)
		pr_err("md: bitmap read error: (%dB @ %llu): %d\n",
		       (int)PAGE_SIZE,
		       (unsigned long long)index << PAGE_SHIFT,
		       ret);
	return ret;
}
#else /* CONFIG_MD_BITMAP_FILE */
static void write_file_page(struct bitmap *bitmap, struct page *page, int wait)
{
}
static int read_file_page(struct file *file, unsigned long index,
		struct bitmap *bitmap, unsigned long count, struct page *page)
{
	return -EIO;
}
static void free_buffers(struct page *page)
{
	put_page(page);
}
#endif /* CONFIG_MD_BITMAP_FILE */
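
/*
 * Illustrative sketch, not part of the driver: read_file_page() above
 * relies on bmap() to translate a file-relative block number into an
 * on-device block number, which is what lets later writes bypass the
 * filesystem entirely (the same trick swap files use).  This
 * hypothetical helper shows the calling convention: bmap() rewrites
 * the block number in place, and a resulting block of 0 means a hole.
 */
static inline sector_t example_file_block_to_dev_block(struct inode *inode,
						       sector_t file_block)
{
	sector_t block = file_block;

	if (bmap(inode, &block))
		return 0;	/* filesystem could not map the block */
	return block;		/* 0 here would mean an unmapped hole */
}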

/*
 * bitmap file superblock operations
 */

/*
 * write out a page to a file
 */
static void filemap_write_page(struct bitmap *bitmap, unsigned long pg_index,
			       bool wait)
{
	struct bitmap_storage *store = &bitmap->storage;
	struct page *page = store->filemap[pg_index];

	if (mddev_is_clustered(bitmap->mddev)) {
		/* go to node bitmap area starting point */
		pg_index += store->sb_index;
	}

	if (store->file)
		write_file_page(bitmap, page, wait);
	else
		write_sb_page(bitmap, pg_index, page, wait);
}

/*
 * md_bitmap_wait_writes() should be called before writing any bitmap
 * blocks, to ensure previous writes, particularly from
 * bitmap_daemon_work(), have completed.
 */
static void md_bitmap_wait_writes(struct bitmap *bitmap)
{
	if (bitmap->storage.file)
		wait_event(bitmap->write_wait,
			   atomic_read(&bitmap->pending_writes) == 0);
	else
		/* Note that we ignore the return value.  The writes
		 * might have failed, but that would just mean that
		 * some bits which should be cleared haven't been,
		 * which is safe.  The relevant bitmap blocks will
		 * probably get written again, but there is no great
		 * loss if they aren't.
		 */
		md_super_wait(bitmap->mddev);
}


/* update the event counter and sync the superblock to disk */
static void bitmap_update_sb(struct bitmap *bitmap)
{
	bitmap_super_t *sb;

	if (!bitmap || !bitmap->mddev) /* no bitmap for this array */
		return;
	if (bitmap->mddev->bitmap_info.external)
		return;
	if (!bitmap->storage.sb_page) /* no superblock */
		return;
	sb = kmap_atomic(bitmap->storage.sb_page);
	sb->events = cpu_to_le64(bitmap->mddev->events);
	if (bitmap->mddev->events < bitmap->events_cleared)
		/* rocking back to read-only */
		bitmap->events_cleared = bitmap->mddev->events;
	sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
	/*
	 * clear BITMAP_WRITE_ERROR bit to protect against the case that
	 * a bitmap write error occurred but the later writes succeeded.
	 */
	sb->state = cpu_to_le32(bitmap->flags & ~BIT(BITMAP_WRITE_ERROR));
	/* Just in case these have been changed via sysfs: */
	sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ);
	sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind);
	/* This might have been changed by a reshape */
	sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
	sb->chunksize = cpu_to_le32(bitmap->mddev->bitmap_info.chunksize);
	sb->nodes = cpu_to_le32(bitmap->mddev->bitmap_info.nodes);
	sb->sectors_reserved = cpu_to_le32(bitmap->mddev->
					   bitmap_info.space);
	kunmap_atomic(sb);

	if (bitmap->storage.file)
		write_file_page(bitmap, bitmap->storage.sb_page, 1);
	else
		write_sb_page(bitmap, bitmap->storage.sb_index,
			      bitmap->storage.sb_page, 1);
}

static void bitmap_print_sb(struct bitmap *bitmap)
{
	bitmap_super_t *sb;

	if (!bitmap || !bitmap->storage.sb_page)
		return;
	sb = kmap_atomic(bitmap->storage.sb_page);
	pr_debug("%s: bitmap file superblock:\n", bmname(bitmap));
	pr_debug("         magic: %08x\n", le32_to_cpu(sb->magic));
	pr_debug("       version: %u\n", le32_to_cpu(sb->version));
	pr_debug("          uuid: %08x.%08x.%08x.%08x\n",
		 le32_to_cpu(*(__le32 *)(sb->uuid+0)),
		 le32_to_cpu(*(__le32 *)(sb->uuid+4)),
		 le32_to_cpu(*(__le32 *)(sb->uuid+8)),
		 le32_to_cpu(*(__le32 *)(sb->uuid+12)));
	pr_debug("        events: %llu\n",
		 (unsigned long long) le64_to_cpu(sb->events));
	pr_debug("events cleared: %llu\n",
		 (unsigned long long) le64_to_cpu(sb->events_cleared));
	pr_debug("         state: %08x\n", le32_to_cpu(sb->state));
	pr_debug("     chunksize: %u B\n", le32_to_cpu(sb->chunksize));
	pr_debug("  daemon sleep: %us\n", le32_to_cpu(sb->daemon_sleep));
	pr_debug("     sync size: %llu KB\n",
		 (unsigned long long)le64_to_cpu(sb->sync_size)/2);
	pr_debug("max write behind: %u\n", le32_to_cpu(sb->write_behind));
	kunmap_atomic(sb);
}

/*
 * md_bitmap_new_disk_sb
 * @bitmap: the bitmap whose on-disk superblock is to be initialised
 *
 * This function is somewhat the reverse of md_bitmap_read_sb.  md_bitmap_read_sb
 * reads and verifies the on-disk bitmap superblock and populates bitmap_info.
 * This function verifies 'bitmap_info' and populates the on-disk bitmap
 * structure, which is to be written to disk.
 *
 * Returns: 0 on success, -Exxx on error
 */
static int md_bitmap_new_disk_sb(struct bitmap *bitmap)
{
	bitmap_super_t *sb;
	unsigned long chunksize, daemon_sleep, write_behind;

	bitmap->storage.sb_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (bitmap->storage.sb_page == NULL)
		return -ENOMEM;
	bitmap->storage.sb_index = 0;

	sb = kmap_atomic(bitmap->storage.sb_page);

	sb->magic = cpu_to_le32(BITMAP_MAGIC);
	sb->version = cpu_to_le32(BITMAP_MAJOR_HI);

	chunksize = bitmap->mddev->bitmap_info.chunksize;
	BUG_ON(!chunksize);
	if (!is_power_of_2(chunksize)) {
		kunmap_atomic(sb);
		pr_warn("bitmap chunksize not a power of 2\n");
		return -EINVAL;
	}
	sb->chunksize = cpu_to_le32(chunksize);

	daemon_sleep = bitmap->mddev->bitmap_info.daemon_sleep;
	if (!daemon_sleep || (daemon_sleep > MAX_SCHEDULE_TIMEOUT)) {
		pr_debug("Choosing daemon_sleep default (5 sec)\n");
		daemon_sleep = 5 * HZ;
	}
	sb->daemon_sleep = cpu_to_le32(daemon_sleep);
	bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;

	/*
	 * FIXME: write_behind for RAID1.  If not specified, what
	 * is a good choice?  We choose COUNTER_MAX / 2 arbitrarily.
	 */
	write_behind = bitmap->mddev->bitmap_info.max_write_behind;
	if (write_behind > COUNTER_MAX)
		write_behind = COUNTER_MAX / 2;
	sb->write_behind = cpu_to_le32(write_behind);
	bitmap->mddev->bitmap_info.max_write_behind = write_behind;

	/* keep the array size field of the bitmap superblock up to date */
	sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);

	memcpy(sb->uuid, bitmap->mddev->uuid, 16);

	set_bit(BITMAP_STALE, &bitmap->flags);
	sb->state = cpu_to_le32(bitmap->flags);
	bitmap->events_cleared = bitmap->mddev->events;
	sb->events_cleared = cpu_to_le64(bitmap->mddev->events);
	bitmap->mddev->bitmap_info.nodes = 0;

	kunmap_atomic(sb);

	return 0;
}

/* read the superblock from the bitmap file and initialize some bitmap fields */
static int md_bitmap_read_sb(struct bitmap *bitmap)
{
	char *reason = NULL;
	bitmap_super_t *sb;
	unsigned long chunksize, daemon_sleep, write_behind;
	unsigned long long events;
	int nodes = 0;
	unsigned long sectors_reserved = 0;
	int err = -EINVAL;
	struct page *sb_page;
	loff_t offset = 0;

	if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) {
		chunksize = 128 * 1024 * 1024;
		daemon_sleep = 5 * HZ;
		write_behind = 0;
		set_bit(BITMAP_STALE, &bitmap->flags);
		err = 0;
		goto out_no_sb;
	}
	/* page 0 is the superblock, read it... */
	sb_page = alloc_page(GFP_KERNEL);
	if (!sb_page)
		return -ENOMEM;
	bitmap->storage.sb_page = sb_page;

re_read:
	/* If cluster_slot is set, the cluster is setup */
	if (bitmap->cluster_slot >= 0) {
		sector_t bm_blocks = bitmap->mddev->resync_max_sectors;

		bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks,
			   (bitmap->mddev->bitmap_info.chunksize >> 9));
		/* bits to bytes */
		bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t);
		/* to 4k blocks */
		bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096);
		offset = bitmap->cluster_slot * (bm_blocks << 3);
		pr_debug("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__,
			bitmap->cluster_slot, offset);
	}

	if (bitmap->storage.file) {
		loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host);
		int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize;

		err = read_file_page(bitmap->storage.file, 0,
				bitmap, bytes, sb_page);
	} else {
		err = read_sb_page(bitmap->mddev, offset, sb_page, 0,
				   sizeof(bitmap_super_t));
	}
	if (err)
		return err;

	err = -EINVAL;
	sb = kmap_atomic(sb_page);

	chunksize = le32_to_cpu(sb->chunksize);
	daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
	write_behind = le32_to_cpu(sb->write_behind);
	sectors_reserved = le32_to_cpu(sb->sectors_reserved);

	/* verify that the bitmap-specific fields are valid */
	if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
		reason = "bad magic";
	else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO ||
		 le32_to_cpu(sb->version) > BITMAP_MAJOR_CLUSTERED)
		reason = "unrecognized superblock version";
	else if (chunksize < 512)
		reason = "bitmap chunksize too small";
	else if (!is_power_of_2(chunksize))
		reason = "bitmap chunksize not a power of 2";
	else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT)
		reason = "daemon sleep period out of range";
	else if (write_behind > COUNTER_MAX)
		reason = "write-behind limit out of range (0 - 16383)";
	if (reason) {
		pr_warn("%s: invalid bitmap file superblock: %s\n",
			bmname(bitmap), reason);
		goto out;
	}

	/*
	 * Setup nodes/clustername only if bitmap version is
	 * cluster-compatible
	 */
	if (sb->version == cpu_to_le32(BITMAP_MAJOR_CLUSTERED)) {
		nodes = le32_to_cpu(sb->nodes);
		strscpy(bitmap->mddev->bitmap_info.cluster_name,
				sb->cluster_name, 64);
	}

	/* keep the array size field of the bitmap superblock up to date */
	sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);

	if (bitmap->mddev->persistent) {
		/*
		 * We have a persistent array superblock, so compare the
		 * bitmap's UUID and event counter to the mddev's
		 */
		if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) {
			pr_warn("%s: bitmap superblock UUID mismatch\n",
				bmname(bitmap));
			goto out;
		}
		events = le64_to_cpu(sb->events);
		if (!nodes && (events < bitmap->mddev->events)) {
			pr_warn("%s: bitmap file is out of date (%llu < %llu) -- forcing full recovery\n",
				bmname(bitmap), events,
				(unsigned long long) bitmap->mddev->events);
			set_bit(BITMAP_STALE, &bitmap->flags);
		}
	}

	/* assign fields using values from superblock */
	bitmap->flags |= le32_to_cpu(sb->state);
	if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
		set_bit(BITMAP_HOSTENDIAN, &bitmap->flags);
	bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
	err = 0;

out:
	kunmap_atomic(sb);
	if (err == 0 && nodes && (bitmap->cluster_slot < 0)) {
		/* Assigning chunksize is required for "re_read" */
		bitmap->mddev->bitmap_info.chunksize = chunksize;
		err = md_setup_cluster(bitmap->mddev, nodes);
		if (err) {
			pr_warn("%s: Could not setup cluster service (%d)\n",
				bmname(bitmap), err);
			goto out_no_sb;
		}
		bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev);
		goto re_read;
	}

out_no_sb:
	if (err == 0) {
		if (test_bit(BITMAP_STALE, &bitmap->flags))
			bitmap->events_cleared = bitmap->mddev->events;
		bitmap->mddev->bitmap_info.chunksize = chunksize;
		bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
		bitmap->mddev->bitmap_info.max_write_behind = write_behind;
		bitmap->mddev->bitmap_info.nodes = nodes;
		if (bitmap->mddev->bitmap_info.space == 0 ||
			bitmap->mddev->bitmap_info.space > sectors_reserved)
			bitmap->mddev->bitmap_info.space = sectors_reserved;
	} else {
		bitmap_print_sb(bitmap);
		if (bitmap->cluster_slot < 0)
			md_cluster_stop(bitmap->mddev);
	}
	return err;
}

/*
 * general bitmap file operations
 */

/*
 * on-disk bitmap:
 *
 * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap
 * file a page at a time. There's a superblock at the start of the file.
 */
/* calculate the index of the page that contains this bit */
static inline unsigned long file_page_index(struct bitmap_storage *store,
					    unsigned long chunk)
{
	if (store->sb_page)
		chunk += sizeof(bitmap_super_t) << 3;
	return chunk >> PAGE_BIT_SHIFT;
}

/* calculate the (bit) offset of this bit within a page */
static inline unsigned long file_page_offset(struct bitmap_storage *store,
					     unsigned long chunk)
{
	if (store->sb_page)
		chunk += sizeof(bitmap_super_t) << 3;
	return chunk & (PAGE_BITS - 1);
}

/*
 * return a pointer to the page in the filemap that contains the given bit
 */
static inline struct page *filemap_get_page(struct bitmap_storage *store,
					    unsigned long chunk)
{
	if (file_page_index(store, chunk) >= store->file_pages)
		return NULL;
	return store->filemap[file_page_index(store, chunk)];
}
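
/*
 * Worked example, illustrative only (assuming 4 KiB pages, so
 * PAGE_BITS == 32768 and PAGE_BIT_SHIFT == 15, and assuming the
 * 256-byte superblock page is present, i.e. 2048 bits of offset):
 * chunk 31000 is shifted to bit 31000 + 2048 == 33048, which lands in
 * page 1 (33048 >> 15) at bit offset 280 (33048 & 32767).
 */
static inline unsigned long example_file_page_math(struct bitmap_storage *store)
{
	/* with the assumptions above this returns 1 */
	return file_page_index(store, 31000);
}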

static int md_bitmap_storage_alloc(struct bitmap_storage *store,
				   unsigned long chunks, int with_super,
				   int slot_number)
{
	int pnum, offset = 0;
	unsigned long num_pages;
	unsigned long bytes;

	bytes = DIV_ROUND_UP(chunks, 8);
	if (with_super)
		bytes += sizeof(bitmap_super_t);

	num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE);
	offset = slot_number * num_pages;

	store->filemap = kmalloc_array(num_pages, sizeof(struct page *),
				       GFP_KERNEL);
	if (!store->filemap)
		return -ENOMEM;

	if (with_super && !store->sb_page) {
		store->sb_page = alloc_page(GFP_KERNEL|__GFP_ZERO);
		if (store->sb_page == NULL)
			return -ENOMEM;
	}

	pnum = 0;
	if (store->sb_page) {
		store->filemap[0] = store->sb_page;
		pnum = 1;
		store->sb_index = offset;
	}

	for ( ; pnum < num_pages; pnum++) {
		store->filemap[pnum] = alloc_page(GFP_KERNEL|__GFP_ZERO);
		if (!store->filemap[pnum]) {
			store->file_pages = pnum;
			return -ENOMEM;
		}
	}
	store->file_pages = pnum;

	/* We need 4 bits per page, rounded up to a multiple
	 * of sizeof(unsigned long) */
	store->filemap_attr = kzalloc(
		roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)),
		GFP_KERNEL);
	if (!store->filemap_attr)
		return -ENOMEM;

	store->bytes = bytes;

	return 0;
}
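
/*
 * Worked example, illustrative only (assuming 4 KiB pages): a
 * hypothetical caller asking for 100000 chunks with a superblock
 * needs bytes = 100000/8 + 256 = 12756, which rounds up to
 * num_pages = 4; the attribute array needs 4 bits per page (2 bytes),
 * rounded up to sizeof(unsigned long).
 */
static inline int example_storage_alloc(struct bitmap_storage *store)
{
	return md_bitmap_storage_alloc(store, 100000, 1, 0);
}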

static void md_bitmap_file_unmap(struct bitmap_storage *store)
{
	struct file *file = store->file;
	struct page *sb_page = store->sb_page;
	struct page **map = store->filemap;
	int pages = store->file_pages;

	while (pages--)
		if (map[pages] != sb_page) /* 0 is sb_page, release it below */
			free_buffers(map[pages]);
	kfree(map);
	kfree(store->filemap_attr);

	if (sb_page)
		free_buffers(sb_page);

	if (file) {
		struct inode *inode = file_inode(file);
		invalidate_mapping_pages(inode->i_mapping, 0, -1);
		fput(file);
	}
}

/*
 * md_bitmap_file_kick - if an error occurs while manipulating the bitmap file
 * then it is no longer reliable, so we stop using it and mark the file
 * as failed in the superblock
 */
static void md_bitmap_file_kick(struct bitmap *bitmap)
{
	if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) {
		bitmap_update_sb(bitmap);

		if (bitmap->storage.file) {
			pr_warn("%s: kicking failed bitmap file %pD4 from array!\n",
				bmname(bitmap), bitmap->storage.file);

		} else
			pr_warn("%s: disabling internal bitmap due to errors\n",
				bmname(bitmap));
	}
}

enum bitmap_page_attr {
	BITMAP_PAGE_DIRTY = 0,     /* there are set bits that need to be synced */
	BITMAP_PAGE_PENDING = 1,   /* there are bits that are being cleaned.
				    * i.e. counter is 1 or 2. */
	BITMAP_PAGE_NEEDWRITE = 2, /* there are cleared bits that need to be synced */
};

static inline void set_page_attr(struct bitmap *bitmap, int pnum,
				 enum bitmap_page_attr attr)
{
	set_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
}

static inline void clear_page_attr(struct bitmap *bitmap, int pnum,
				   enum bitmap_page_attr attr)
{
	clear_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
}

static inline int test_page_attr(struct bitmap *bitmap, int pnum,
				 enum bitmap_page_attr attr)
{
	return test_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
}

static inline int test_and_clear_page_attr(struct bitmap *bitmap, int pnum,
					   enum bitmap_page_attr attr)
{
	return test_and_clear_bit((pnum<<2) + attr,
				  bitmap->storage.filemap_attr);
}
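
/*
 * Illustrative sketch, not part of the driver: each filemap page owns
 * a 4-bit slot in filemap_attr, so attribute 'attr' of page 'pnum'
 * lives at bit (pnum << 2) + attr.  For example, BITMAP_PAGE_NEEDWRITE
 * of page 3 is bit 14.  This hypothetical helper just restates the
 * packing used by the accessors above.
 */
static inline unsigned long example_page_attr_bit(int pnum,
						  enum bitmap_page_attr attr)
{
	return (pnum << 2) + attr;	/* example_page_attr_bit(3, 2) == 14 */
}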
/*
 * md_bitmap_file_set_bit -- called before performing a write to the md device
 * to set (and eventually sync) a particular bit in the bitmap file
 *
 * we set the bit immediately, then we record the page number so that
 * when an unplug occurs, we can flush the dirty pages out to disk
 */
static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
{
	unsigned long bit;
	struct page *page;
	void *kaddr;
	unsigned long chunk = block >> bitmap->counts.chunkshift;
	struct bitmap_storage *store = &bitmap->storage;
	unsigned long index = file_page_index(store, chunk);
	unsigned long node_offset = 0;

	index += store->sb_index;
	if (mddev_is_clustered(bitmap->mddev))
		node_offset = bitmap->cluster_slot * store->file_pages;

	page = filemap_get_page(&bitmap->storage, chunk);
	if (!page)
		return;
	bit = file_page_offset(&bitmap->storage, chunk);

	/* set the bit */
	kaddr = kmap_atomic(page);
	if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
		set_bit(bit, kaddr);
	else
		set_bit_le(bit, kaddr);
	kunmap_atomic(kaddr);
	pr_debug("set file bit %lu page %lu\n", bit, index);
	/* record page number so it gets flushed to disk when unplug occurs */
	set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_DIRTY);
}

static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
{
	unsigned long bit;
	struct page *page;
	void *paddr;
	unsigned long chunk = block >> bitmap->counts.chunkshift;
	struct bitmap_storage *store = &bitmap->storage;
	unsigned long index = file_page_index(store, chunk);
	unsigned long node_offset = 0;

	index += store->sb_index;
	if (mddev_is_clustered(bitmap->mddev))
		node_offset = bitmap->cluster_slot * store->file_pages;

	page = filemap_get_page(&bitmap->storage, chunk);
	if (!page)
		return;
	bit = file_page_offset(&bitmap->storage, chunk);
	paddr = kmap_atomic(page);
	if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
		clear_bit(bit, paddr);
	else
		clear_bit_le(bit, paddr);
	kunmap_atomic(paddr);
	if (!test_page_attr(bitmap, index - node_offset, BITMAP_PAGE_NEEDWRITE)) {
		set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_PENDING);
		bitmap->allclean = 0;
	}
}

static int md_bitmap_file_test_bit(struct bitmap *bitmap, sector_t block)
{
	unsigned long bit;
	struct page *page;
	void *paddr;
	unsigned long chunk = block >> bitmap->counts.chunkshift;
	int set = 0;

	page = filemap_get_page(&bitmap->storage, chunk);
	if (!page)
		return -EINVAL;
	bit = file_page_offset(&bitmap->storage, chunk);
	paddr = kmap_atomic(page);
	if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
		set = test_bit(bit, paddr);
	else
		set = test_bit_le(bit, paddr);
	kunmap_atomic(paddr);
	return set;
}

/* this gets called when the md device is ready to unplug its underlying
 * (slave) device queues -- before we let any writes go down, we need to
 * sync the dirty pages of the bitmap file to disk */
static void __bitmap_unplug(struct bitmap *bitmap)
{
	unsigned long i;
	int dirty, need_write;
	int writing = 0;

	if (!md_bitmap_enabled(bitmap))
		return;

	/* look at each page to see if there are any set bits that need to be
	 * flushed out to disk */
	for (i = 0; i < bitmap->storage.file_pages; i++) {
		dirty = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
		need_write = test_and_clear_page_attr(bitmap, i,
						      BITMAP_PAGE_NEEDWRITE);
		if (dirty || need_write) {
			if (!writing) {
				md_bitmap_wait_writes(bitmap);
				mddev_add_trace_msg(bitmap->mddev,
					"md bitmap_unplug");
			}
			clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING);
			filemap_write_page(bitmap, i, false);
			writing = 1;
		}
	}
	if (writing)
		md_bitmap_wait_writes(bitmap);

	if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
		md_bitmap_file_kick(bitmap);
}

struct bitmap_unplug_work {
	struct work_struct work;
	struct bitmap *bitmap;
	struct completion *done;
};

static void md_bitmap_unplug_fn(struct work_struct *work)
{
	struct bitmap_unplug_work *unplug_work =
		container_of(work, struct bitmap_unplug_work, work);

	__bitmap_unplug(unplug_work->bitmap);
	complete(unplug_work->done);
}

static void bitmap_unplug_async(struct bitmap *bitmap)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct bitmap_unplug_work unplug_work;

	INIT_WORK_ONSTACK(&unplug_work.work, md_bitmap_unplug_fn);
	unplug_work.bitmap = bitmap;
	unplug_work.done = &done;

	queue_work(md_bitmap_wq, &unplug_work.work);
	wait_for_completion(&done);
}

static void bitmap_unplug(struct mddev *mddev, bool sync)
{
	struct bitmap *bitmap = mddev->bitmap;

	if (!bitmap)
		return;

	if (sync)
		__bitmap_unplug(bitmap);
	else
		bitmap_unplug_async(bitmap);
}

static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed);

/*
 * Initialize the in-memory bitmap from the on-disk bitmap and set up the memory
 * mapping of the bitmap file.
 *
 * Special case: If there's no bitmap file, or if the bitmap file had been
 * previously kicked from the array, we mark all the bits as 1's in order to
 * cause a full resync.
 *
 * We ignore all bits for sectors that end earlier than 'start'.
 * This is used when reading an out-of-date bitmap.
 */
static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
{
	bool outofdate = test_bit(BITMAP_STALE, &bitmap->flags);
	struct mddev *mddev = bitmap->mddev;
	unsigned long chunks = bitmap->counts.chunks;
	struct bitmap_storage *store = &bitmap->storage;
	struct file *file = store->file;
	unsigned long node_offset = 0;
	unsigned long bit_cnt = 0;
	unsigned long i;
	int ret;

	if (!file && !mddev->bitmap_info.offset) {
		/* No permanent bitmap - fill with '1s'. */
		store->filemap = NULL;
		store->file_pages = 0;
		for (i = 0; i < chunks ; i++) {
			/* if the disk bit is set, set the memory bit */
			int needed = ((sector_t)(i+1) << (bitmap->counts.chunkshift)
				      >= start);
			md_bitmap_set_memory_bits(bitmap,
						  (sector_t)i << bitmap->counts.chunkshift,
						  needed);
		}
		return 0;
	}

	if (file && i_size_read(file->f_mapping->host) < store->bytes) {
		pr_warn("%s: bitmap file too short %lu < %lu\n",
			bmname(bitmap),
			(unsigned long) i_size_read(file->f_mapping->host),
			store->bytes);
		ret = -ENOSPC;
		goto err;
	}

	if (mddev_is_clustered(mddev))
		node_offset = bitmap->cluster_slot * (DIV_ROUND_UP(store->bytes, PAGE_SIZE));

	for (i = 0; i < store->file_pages; i++) {
		struct page *page = store->filemap[i];
		int count;

		/* unmap the old page, we're done with it */
		if (i == store->file_pages - 1)
			count = store->bytes - i * PAGE_SIZE;
		else
			count = PAGE_SIZE;

		if (file)
			ret = read_file_page(file, i, bitmap, count, page);
		else
			ret = read_sb_page(mddev, 0, page, i + node_offset,
					   count);
		if (ret)
			goto err;
	}

	if (outofdate) {
		pr_warn("%s: bitmap file is out of date, doing full recovery\n",
			bmname(bitmap));

		for (i = 0; i < store->file_pages; i++) {
			struct page *page = store->filemap[i];
			unsigned long offset = 0;
			void *paddr;

			if (i == 0 && !mddev->bitmap_info.external)
				offset = sizeof(bitmap_super_t);

			/*
			 * If the bitmap is out of date, dirty the whole page
			 * and write it out
			 */
			paddr = kmap_atomic(page);
			memset(paddr + offset, 0xff, PAGE_SIZE - offset);
			kunmap_atomic(paddr);

			filemap_write_page(bitmap, i, true);
			if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) {
				ret = -EIO;
				goto err;
			}
		}
	}

	for (i = 0; i < chunks; i++) {
		struct page *page = filemap_get_page(&bitmap->storage, i);
		unsigned long bit = file_page_offset(&bitmap->storage, i);
		void *paddr;
		bool was_set;

		paddr = kmap_atomic(page);
		if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
			was_set = test_bit(bit, paddr);
		else
			was_set = test_bit_le(bit, paddr);
		kunmap_atomic(paddr);

		if (was_set) {
			/* if the disk bit is set, set the memory bit */
			int needed = ((sector_t)(i+1) << bitmap->counts.chunkshift
				      >= start);
			md_bitmap_set_memory_bits(bitmap,
						  (sector_t)i << bitmap->counts.chunkshift,
						  needed);
			bit_cnt++;
		}
	}

	pr_debug("%s: bitmap initialized from disk: read %lu pages, set %lu of %lu bits\n",
		 bmname(bitmap), store->file_pages,
		 bit_cnt, chunks);

	return 0;

 err:
	pr_warn("%s: bitmap initialisation failed: %d\n",
		bmname(bitmap), ret);
	return ret;
}

/* just flag bitmap pages as needing to be written. */
static void bitmap_write_all(struct mddev *mddev)
{
	int i;
	struct bitmap *bitmap = mddev->bitmap;

	if (!bitmap || !bitmap->storage.filemap)
		return;

	/* Only one copy, so nothing needed */
	if (bitmap->storage.file)
		return;

	for (i = 0; i < bitmap->storage.file_pages; i++)
		set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE);
	bitmap->allclean = 0;
}

static void md_bitmap_count_page(struct bitmap_counts *bitmap,
				 sector_t offset, int inc)
{
	sector_t chunk = offset >> bitmap->chunkshift;
	unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
	bitmap->bp[page].count += inc;
	md_bitmap_checkfree(bitmap, page);
}

static void md_bitmap_set_pending(struct bitmap_counts *bitmap, sector_t offset)
{
	sector_t chunk = offset >> bitmap->chunkshift;
	unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
	struct bitmap_page *bp = &bitmap->bp[page];

	if (!bp->pending)
		bp->pending = 1;
}

static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap,
					       sector_t offset, sector_t *blocks,
					       int create);

static void mddev_set_timeout(struct mddev *mddev, unsigned long timeout,
			      bool force)
{
	struct md_thread *thread;

	rcu_read_lock();
	thread = rcu_dereference(mddev->thread);

	if (!thread)
		goto out;

	if (force || thread->timeout < MAX_SCHEDULE_TIMEOUT)
		thread->timeout = timeout;

out:
	rcu_read_unlock();
}

/*
 * bitmap daemon -- periodically wakes up to clean bits and flush pages
 *			out to disk
 */
static void bitmap_daemon_work(struct mddev *mddev)
{
	struct bitmap *bitmap;
	unsigned long j;
	unsigned long nextpage;
	sector_t blocks;
	struct bitmap_counts *counts;

	/* Use a mutex to guard daemon_work against
	 * bitmap_destroy.
	 */
	mutex_lock(&mddev->bitmap_info.mutex);
	bitmap = mddev->bitmap;
	if (bitmap == NULL) {
		mutex_unlock(&mddev->bitmap_info.mutex);
		return;
	}
	if (time_before(jiffies, bitmap->daemon_lastrun
			+ mddev->bitmap_info.daemon_sleep))
		goto done;

	bitmap->daemon_lastrun = jiffies;
	if (bitmap->allclean) {
		mddev_set_timeout(mddev, MAX_SCHEDULE_TIMEOUT, true);
		goto done;
	}
	bitmap->allclean = 1;

	mddev_add_trace_msg(bitmap->mddev, "md bitmap_daemon_work");

	/* Any file-page which is PENDING now needs to be written.
	 * So set NEEDWRITE now, then after we make any last-minute changes
	 * we will write it.
	 */
	for (j = 0; j < bitmap->storage.file_pages; j++)
		if (test_and_clear_page_attr(bitmap, j,
					     BITMAP_PAGE_PENDING))
			set_page_attr(bitmap, j,
				      BITMAP_PAGE_NEEDWRITE);

	if (bitmap->need_sync &&
	    mddev->bitmap_info.external == 0) {
		/* Arrange for superblock update as well as
		 * other changes */
		bitmap_super_t *sb;
		bitmap->need_sync = 0;
		if (bitmap->storage.filemap) {
			sb = kmap_atomic(bitmap->storage.sb_page);
			sb->events_cleared =
				cpu_to_le64(bitmap->events_cleared);
			kunmap_atomic(sb);
			set_page_attr(bitmap, 0,
				      BITMAP_PAGE_NEEDWRITE);
		}
	}
	/* Now look at the bitmap counters and if any are '2' or '1',
	 * decrement and handle accordingly.
	 */
	counts = &bitmap->counts;
	spin_lock_irq(&counts->lock);
	nextpage = 0;
	for (j = 0; j < counts->chunks; j++) {
		bitmap_counter_t *bmc;
		sector_t  block = (sector_t)j << counts->chunkshift;

		if (j == nextpage) {
			nextpage += PAGE_COUNTER_RATIO;
			if (!counts->bp[j >> PAGE_COUNTER_SHIFT].pending) {
				j |= PAGE_COUNTER_MASK;
				continue;
			}
			counts->bp[j >> PAGE_COUNTER_SHIFT].pending = 0;
		}

		bmc = md_bitmap_get_counter(counts, block, &blocks, 0);
		if (!bmc) {
			j |= PAGE_COUNTER_MASK;
			continue;
		}
		if (*bmc == 1 && !bitmap->need_sync) {
			/* We can clear the bit */
			*bmc = 0;
			md_bitmap_count_page(counts, block, -1);
			md_bitmap_file_clear_bit(bitmap, block);
		} else if (*bmc && *bmc <= 2) {
			*bmc = 1;
			md_bitmap_set_pending(counts, block);
			bitmap->allclean = 0;
		}
	}
	spin_unlock_irq(&counts->lock);

	md_bitmap_wait_writes(bitmap);
	/* Now start writeout on any page in NEEDWRITE that isn't DIRTY.
	 * DIRTY pages need to be written by bitmap_unplug so it can wait
	 * for them.
	 * If we find any DIRTY page we stop there and let bitmap_unplug
	 * handle all the rest.  This is important in the case where
	 * the first blocking page holds the superblock and it has been updated.
	 * We mustn't write any other blocks before the superblock.
	 */
	for (j = 0;
	     j < bitmap->storage.file_pages
		     && !test_bit(BITMAP_STALE, &bitmap->flags);
	     j++) {
		if (test_page_attr(bitmap, j,
				   BITMAP_PAGE_DIRTY))
			/* bitmap_unplug will handle the rest */
			break;
		if (bitmap->storage.filemap &&
		    test_and_clear_page_attr(bitmap, j,
					     BITMAP_PAGE_NEEDWRITE))
			filemap_write_page(bitmap, j, false);
	}

 done:
	if (bitmap->allclean == 0)
		mddev_set_timeout(mddev, mddev->bitmap_info.daemon_sleep, true);
	mutex_unlock(&mddev->bitmap_info.mutex);
}

static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap,
					       sector_t offset, sector_t *blocks,
					       int create)
__releases(bitmap->lock)
__acquires(bitmap->lock)
{
	/* If 'create', we might release the lock and re-acquire it.
	 * The lock must have been taken with interrupts enabled.
	 * If !create, we don't release the lock.
	 */
	sector_t chunk = offset >> bitmap->chunkshift;
	unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
	unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT;
	sector_t csize = ((sector_t)1) << bitmap->chunkshift;
	int err;

	if (page >= bitmap->pages) {
		/*
		 * This can happen if bitmap_start_sync goes beyond
		 * End-of-device while looking for a whole page or
		 * the user set a huge number via sysfs bitmap_set_bits.
		 */
		*blocks = csize - (offset & (csize - 1));
		return NULL;
	}
	err = md_bitmap_checkpage(bitmap, page, create, 0);

	if (bitmap->bp[page].hijacked ||
	    bitmap->bp[page].map == NULL)
		csize = ((sector_t)1) << (bitmap->chunkshift +
					  PAGE_COUNTER_SHIFT);

	*blocks = csize - (offset & (csize - 1));

	if (err < 0)
		return NULL;

	/* now locked ... */

	if (bitmap->bp[page].hijacked) { /* hijacked pointer */
		/* should we use the first or second counter field
		 * of the hijacked pointer? */
		int hi = (pageoff > PAGE_COUNTER_MASK);
		return  &((bitmap_counter_t *)
			  &bitmap->bp[page].map)[hi];
	} else /* page is allocated */
		return (bitmap_counter_t *)
			&(bitmap->bp[page].map[pageoff]);
}
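
/*
 * Illustrative sketch, not part of the driver: each bitmap_counter_t
 * packs three fields, the NEEDED flag (top bit), the RESYNC flag
 * (next bit) and a 14-bit count of in-flight writes.  A value of
 * 0x8002, for instance, means "needs resync, resync not active, two
 * writes pending".  This hypothetical helper decodes such a value
 * with the driver's own macros.
 */
static inline void example_decode_counter(void)
{
	bitmap_counter_t bmc = NEEDED_MASK | 2;		/* 0x8002 */

	WARN_ON(!NEEDED(bmc) || RESYNC(bmc) || COUNTER(bmc) != 2);
}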

static int bitmap_startwrite(struct mddev *mddev, sector_t offset,
			     unsigned long sectors, bool behind)
{
	struct bitmap *bitmap = mddev->bitmap;

	if (!bitmap)
		return 0;

	if (behind) {
		int bw;
		atomic_inc(&bitmap->behind_writes);
		bw = atomic_read(&bitmap->behind_writes);
		if (bw > bitmap->behind_writes_used)
			bitmap->behind_writes_used = bw;

		pr_debug("inc write-behind count %d/%lu\n",
			 bw, bitmap->mddev->bitmap_info.max_write_behind);
	}

	while (sectors) {
		sector_t blocks;
		bitmap_counter_t *bmc;

		spin_lock_irq(&bitmap->counts.lock);
		bmc = md_bitmap_get_counter(&bitmap->counts, offset, &blocks, 1);
		if (!bmc) {
			spin_unlock_irq(&bitmap->counts.lock);
			return 0;
		}

		if (unlikely(COUNTER(*bmc) == COUNTER_MAX)) {
			DEFINE_WAIT(__wait);
			/* note that it is safe to do the prepare_to_wait
			 * after the test as long as we do it before dropping
			 * the spinlock.
			 */
			prepare_to_wait(&bitmap->overflow_wait, &__wait,
					TASK_UNINTERRUPTIBLE);
			spin_unlock_irq(&bitmap->counts.lock);
			schedule();
			finish_wait(&bitmap->overflow_wait, &__wait);
			continue;
		}

		switch (*bmc) {
		case 0:
			md_bitmap_file_set_bit(bitmap, offset);
			md_bitmap_count_page(&bitmap->counts, offset, 1);
			fallthrough;
		case 1:
			*bmc = 2;
		}

		(*bmc)++;

		spin_unlock_irq(&bitmap->counts.lock);

		offset += blocks;
		if (sectors > blocks)
			sectors -= blocks;
		else
			sectors = 0;
	}
	return 0;
}

static void bitmap_endwrite(struct mddev *mddev, sector_t offset,
			    unsigned long sectors, bool success, bool behind)
{
	struct bitmap *bitmap = mddev->bitmap;

	if (!bitmap)
		return;

	if (behind) {
		if (atomic_dec_and_test(&bitmap->behind_writes))
			wake_up(&bitmap->behind_wait);
		pr_debug("dec write-behind count %d/%lu\n",
			 atomic_read(&bitmap->behind_writes),
			 bitmap->mddev->bitmap_info.max_write_behind);
	}

	while (sectors) {
		sector_t blocks;
		unsigned long flags;
		bitmap_counter_t *bmc;

		spin_lock_irqsave(&bitmap->counts.lock, flags);
		bmc = md_bitmap_get_counter(&bitmap->counts, offset, &blocks, 0);
		if (!bmc) {
			spin_unlock_irqrestore(&bitmap->counts.lock, flags);
			return;
		}

		if (success && !bitmap->mddev->degraded &&
		    bitmap->events_cleared < bitmap->mddev->events) {
			bitmap->events_cleared = bitmap->mddev->events;
			bitmap->need_sync = 1;
			sysfs_notify_dirent_safe(bitmap->sysfs_can_clear);
		}

		if (!success && !NEEDED(*bmc))
			*bmc |= NEEDED_MASK;

		if (COUNTER(*bmc) == COUNTER_MAX)
			wake_up(&bitmap->overflow_wait);

		(*bmc)--;
		if (*bmc <= 2) {
			md_bitmap_set_pending(&bitmap->counts, offset);
			bitmap->allclean = 0;
		}
		spin_unlock_irqrestore(&bitmap->counts.lock, flags);
		offset += blocks;
		if (sectors > blocks)
			sectors -= blocks;
		else
			sectors = 0;
	}
}

static bool __bitmap_start_sync(struct bitmap *bitmap, sector_t offset,
				sector_t *blocks, bool degraded)
{
	bitmap_counter_t *bmc;
	bool rv;

	if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */
		*blocks = 1024;
		return true; /* always resync if no bitmap */
	}
	spin_lock_irq(&bitmap->counts.lock);

	rv = false;
	bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
	if (bmc) {
		/* locked */
		if (RESYNC(*bmc)) {
			rv = true;
		} else if (NEEDED(*bmc)) {
			rv = true;
			if (!degraded) { /* don't set/clear bits if degraded */
				*bmc |= RESYNC_MASK;
				*bmc &= ~NEEDED_MASK;
			}
		}
	}
	spin_unlock_irq(&bitmap->counts.lock);

	return rv;
}

static bool bitmap_start_sync(struct mddev *mddev, sector_t offset,
			      sector_t *blocks, bool degraded)
{
	/* bitmap_start_sync must always report on multiples of whole
	 * pages, otherwise resync (which is very PAGE_SIZE based) will
	 * get confused.
	 * So call __bitmap_start_sync repeatedly (if needed) until at
	 * least PAGE_SIZE>>9 blocks are covered.
	 * Return the 'or' of the results.
	 */
1637 	bool rv = false;
1638 	sector_t blocks1;
1639 
1640 	*blocks = 0;
1641 	while (*blocks < (PAGE_SIZE>>9)) {
1642 		rv |= __bitmap_start_sync(mddev->bitmap, offset,
1643 					  &blocks1, degraded);
1644 		offset += blocks1;
1645 		*blocks += blocks1;
1646 	}
1647 
1648 	return rv;
1649 }
1650 
1651 static void __bitmap_end_sync(struct bitmap *bitmap, sector_t offset,
1652 			      sector_t *blocks, bool aborted)
1653 {
1654 	bitmap_counter_t *bmc;
1655 	unsigned long flags;
1656 
1657 	if (bitmap == NULL) {
1658 		*blocks = 1024;
1659 		return;
1660 	}
1661 	spin_lock_irqsave(&bitmap->counts.lock, flags);
1662 	bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
1663 	if (bmc == NULL)
1664 		goto unlock;
1665 	/* locked */
1666 	if (RESYNC(*bmc)) {
1667 		*bmc &= ~RESYNC_MASK;
1668 
1669 		if (!NEEDED(*bmc) && aborted)
1670 			*bmc |= NEEDED_MASK;
1671 		else {
1672 			if (*bmc <= 2) {
1673 				md_bitmap_set_pending(&bitmap->counts, offset);
1674 				bitmap->allclean = 0;
1675 			}
1676 		}
1677 	}
1678  unlock:
1679 	spin_unlock_irqrestore(&bitmap->counts.lock, flags);
1680 }
1681 
1682 static void bitmap_end_sync(struct mddev *mddev, sector_t offset,
1683 			    sector_t *blocks)
1684 {
1685 	__bitmap_end_sync(mddev->bitmap, offset, blocks, true);
1686 }
1687 
1688 static void bitmap_close_sync(struct mddev *mddev)
1689 {
1690 	/* Sync has finished, and any bitmap chunks that weren't synced
1691 	 * properly have been aborted.  It remains to us to clear the
1692 	 * RESYNC bit wherever it is still on
1693 	 */
1694 	sector_t sector = 0;
1695 	sector_t blocks;
1696 	struct bitmap *bitmap = mddev->bitmap;
1697 
1698 	if (!bitmap)
1699 		return;
1700 
1701 	while (sector < bitmap->mddev->resync_max_sectors) {
1702 		__bitmap_end_sync(bitmap, sector, &blocks, false);
1703 		sector += blocks;
1704 	}
1705 }
1706 
1707 static void bitmap_cond_end_sync(struct mddev *mddev, sector_t sector,
1708 				 bool force)
1709 {
1710 	sector_t s = 0;
1711 	sector_t blocks;
1712 	struct bitmap *bitmap = mddev->bitmap;
1713 
1714 	if (!bitmap)
1715 		return;
1716 	if (sector == 0) {
1717 		bitmap->last_end_sync = jiffies;
1718 		return;
1719 	}
1720 	if (!force && time_before(jiffies, (bitmap->last_end_sync
1721 				  + bitmap->mddev->bitmap_info.daemon_sleep)))
1722 		return;
1723 	wait_event(bitmap->mddev->recovery_wait,
1724 		   atomic_read(&bitmap->mddev->recovery_active) == 0);
1725 
1726 	bitmap->mddev->curr_resync_completed = sector;
1727 	set_bit(MD_SB_CHANGE_CLEAN, &bitmap->mddev->sb_flags);
1728 	sector &= ~((1ULL << bitmap->counts.chunkshift) - 1);
1729 	s = 0;
1730 	while (s < sector && s < bitmap->mddev->resync_max_sectors) {
1731 		__bitmap_end_sync(bitmap, s, &blocks, false);
1732 		s += blocks;
1733 	}
1734 	bitmap->last_end_sync = jiffies;
1735 	sysfs_notify_dirent_safe(bitmap->mddev->sysfs_completed);
1736 }
1737 
1738 static void bitmap_sync_with_cluster(struct mddev *mddev,
1739 				     sector_t old_lo, sector_t old_hi,
1740 				     sector_t new_lo, sector_t new_hi)
1741 {
1742 	struct bitmap *bitmap = mddev->bitmap;
1743 	sector_t sector, blocks = 0;
1744 
1745 	for (sector = old_lo; sector < new_lo; ) {
1746 		__bitmap_end_sync(bitmap, sector, &blocks, false);
1747 		sector += blocks;
1748 	}
1749 	WARN((blocks > new_lo) && old_lo, "alignment is not correct for lo\n");
1750 
1751 	for (sector = old_hi; sector < new_hi; ) {
1752 		bitmap_start_sync(mddev, sector, &blocks, false);
1753 		sector += blocks;
1754 	}
1755 	WARN((blocks > new_hi) && old_hi, "alignment is not correct for hi\n");
1756 }
1757 
1758 static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed)
1759 {
1760 	/* For each chunk covered by any of these sectors, set the
1761 	 * counter to 2 and possibly set resync_needed.  They should all
1762 	 * be 0 at this point
1763 	 */
1764 
1765 	sector_t secs;
1766 	bitmap_counter_t *bmc;
1767 	spin_lock_irq(&bitmap->counts.lock);
1768 	bmc = md_bitmap_get_counter(&bitmap->counts, offset, &secs, 1);
1769 	if (!bmc) {
1770 		spin_unlock_irq(&bitmap->counts.lock);
1771 		return;
1772 	}
1773 	if (!*bmc) {
1774 		*bmc = 2;
1775 		md_bitmap_count_page(&bitmap->counts, offset, 1);
1776 		md_bitmap_set_pending(&bitmap->counts, offset);
1777 		bitmap->allclean = 0;
1778 	}
1779 	if (needed)
1780 		*bmc |= NEEDED_MASK;
1781 	spin_unlock_irq(&bitmap->counts.lock);
1782 }
1783 
1784 /* dirty the memory and file bits for bitmap chunks "s" to "e" */
1785 static void bitmap_dirty_bits(struct mddev *mddev, unsigned long s,
1786 			      unsigned long e)
1787 {
1788 	unsigned long chunk;
1789 	struct bitmap *bitmap = mddev->bitmap;
1790 
1791 	if (!bitmap)
1792 		return;
1793 
1794 	for (chunk = s; chunk <= e; chunk++) {
1795 		sector_t sec = (sector_t)chunk << bitmap->counts.chunkshift;
1796 
1797 		md_bitmap_set_memory_bits(bitmap, sec, 1);
1798 		md_bitmap_file_set_bit(bitmap, sec);
1799 		if (sec < bitmap->mddev->recovery_cp)
1800 			/* We are asserting that the array is dirty,
1801 			 * so move the recovery_cp address back so
1802 			 * that it is obvious that it is dirty
1803 			 */
1804 			bitmap->mddev->recovery_cp = sec;
1805 	}
1806 }
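/*
 * Flush out any pending bitmap updates, e.g. before stopping the array.
 * daemon_lastrun is rewound before each call so that bitmap_daemon_work()
 * believes enough time has passed to make another lazy-clear pass.
 */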
1807 
1808 static void bitmap_flush(struct mddev *mddev)
1809 {
1810 	struct bitmap *bitmap = mddev->bitmap;
1811 	long sleep;
1812 
1813 	if (!bitmap) /* there was no bitmap */
1814 		return;
1815 
	/* run the daemon_work three times to ensure that everything
	 * which can be flushed has been flushed
	 */
1819 	sleep = mddev->bitmap_info.daemon_sleep * 2;
1820 	bitmap->daemon_lastrun -= sleep;
1821 	bitmap_daemon_work(mddev);
1822 	bitmap->daemon_lastrun -= sleep;
1823 	bitmap_daemon_work(mddev);
1824 	bitmap->daemon_lastrun -= sleep;
1825 	bitmap_daemon_work(mddev);
1826 	if (mddev->bitmap_info.external)
1827 		md_super_wait(mddev);
1828 	bitmap_update_sb(bitmap);
1829 }
1830 
1831 /*
1832  * free memory that was allocated
1833  */
1834 void md_bitmap_free(struct bitmap *bitmap)
1835 {
1836 	unsigned long k, pages;
1837 	struct bitmap_page *bp;
1838 
1839 	if (!bitmap) /* there was no bitmap */
1840 		return;
1841 
1842 	if (bitmap->sysfs_can_clear)
1843 		sysfs_put(bitmap->sysfs_can_clear);
1844 
1845 	if (mddev_is_clustered(bitmap->mddev) && bitmap->mddev->cluster_info &&
1846 		bitmap->cluster_slot == md_cluster_ops->slot_number(bitmap->mddev))
1847 		md_cluster_stop(bitmap->mddev);
1848 
1849 	/* Shouldn't be needed - but just in case.... */
1850 	wait_event(bitmap->write_wait,
1851 		   atomic_read(&bitmap->pending_writes) == 0);
1852 
1853 	/* release the bitmap file  */
1854 	md_bitmap_file_unmap(&bitmap->storage);
1855 
1856 	bp = bitmap->counts.bp;
1857 	pages = bitmap->counts.pages;
1858 
1859 	/* free all allocated memory */
1860 
1861 	if (bp) /* deallocate the page memory */
1862 		for (k = 0; k < pages; k++)
1863 			if (bp[k].map && !bp[k].hijacked)
1864 				kfree(bp[k].map);
1865 	kfree(bp);
1866 	kfree(bitmap);
1867 }
1868 EXPORT_SYMBOL(md_bitmap_free);
1869 
1870 void md_bitmap_wait_behind_writes(struct mddev *mddev)
1871 {
1872 	struct bitmap *bitmap = mddev->bitmap;
1873 
1874 	/* wait for behind writes to complete */
1875 	if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
1876 		pr_debug("md:%s: behind writes in progress - waiting to stop.\n",
1877 			 mdname(mddev));
1878 		/* need to kick something here to make sure I/O goes? */
1879 		wait_event(bitmap->behind_wait,
1880 			   atomic_read(&bitmap->behind_writes) == 0);
1881 	}
1882 }
1883 EXPORT_SYMBOL_GPL(md_bitmap_wait_behind_writes);
1884 
1885 static void bitmap_destroy(struct mddev *mddev)
1886 {
1887 	struct bitmap *bitmap = mddev->bitmap;
1888 
1889 	if (!bitmap) /* there was no bitmap */
1890 		return;
1891 
1892 	md_bitmap_wait_behind_writes(mddev);
1893 	if (!mddev->serialize_policy)
1894 		mddev_destroy_serial_pool(mddev, NULL);
1895 
1896 	mutex_lock(&mddev->bitmap_info.mutex);
1897 	spin_lock(&mddev->lock);
1898 	mddev->bitmap = NULL; /* disconnect from the md device */
1899 	spin_unlock(&mddev->lock);
1900 	mutex_unlock(&mddev->bitmap_info.mutex);
1901 	mddev_set_timeout(mddev, MAX_SCHEDULE_TIMEOUT, true);
1902 
1903 	md_bitmap_free(bitmap);
1904 }
1905 
/*
 * Initialize the bitmap structure.
 * If this returns an error, bitmap_destroy must be called to clean up
 * once mddev->bitmap has been set.
 */
1911 static struct bitmap *__bitmap_create(struct mddev *mddev, int slot)
1912 {
1913 	struct bitmap *bitmap;
1914 	sector_t blocks = mddev->resync_max_sectors;
1915 	struct file *file = mddev->bitmap_info.file;
1916 	int err;
1917 	struct kernfs_node *bm = NULL;
1918 
1919 	BUILD_BUG_ON(sizeof(bitmap_super_t) != 256);
1920 
1921 	BUG_ON(file && mddev->bitmap_info.offset);
1922 
1923 	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
1924 		pr_notice("md/raid:%s: array with journal cannot have bitmap\n",
1925 			  mdname(mddev));
1926 		return ERR_PTR(-EBUSY);
1927 	}
1928 
1929 	bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
1930 	if (!bitmap)
1931 		return ERR_PTR(-ENOMEM);
1932 
1933 	spin_lock_init(&bitmap->counts.lock);
1934 	atomic_set(&bitmap->pending_writes, 0);
1935 	init_waitqueue_head(&bitmap->write_wait);
1936 	init_waitqueue_head(&bitmap->overflow_wait);
1937 	init_waitqueue_head(&bitmap->behind_wait);
1938 
1939 	bitmap->mddev = mddev;
1940 	bitmap->cluster_slot = slot;
1941 
1942 	if (mddev->kobj.sd)
1943 		bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap");
1944 	if (bm) {
1945 		bitmap->sysfs_can_clear = sysfs_get_dirent(bm, "can_clear");
1946 		sysfs_put(bm);
	} else {
		bitmap->sysfs_can_clear = NULL;
	}
1949 
1950 	bitmap->storage.file = file;
1951 	if (file) {
1952 		get_file(file);
1953 		/* As future accesses to this file will use bmap,
1954 		 * and bypass the page cache, we must sync the file
1955 		 * first.
1956 		 */
1957 		vfs_fsync(file, 1);
1958 	}
1959 	/* read superblock from bitmap file (this sets mddev->bitmap_info.chunksize) */
1960 	if (!mddev->bitmap_info.external) {
1961 		/*
1962 		 * If 'MD_ARRAY_FIRST_USE' is set, then device-mapper is
1963 		 * instructing us to create a new on-disk bitmap instance.
1964 		 */
1965 		if (test_and_clear_bit(MD_ARRAY_FIRST_USE, &mddev->flags))
1966 			err = md_bitmap_new_disk_sb(bitmap);
1967 		else
1968 			err = md_bitmap_read_sb(bitmap);
1969 	} else {
1970 		err = 0;
1971 		if (mddev->bitmap_info.chunksize == 0 ||
1972 		    mddev->bitmap_info.daemon_sleep == 0)
1973 			/* chunksize and time_base need to be
1974 			 * set first. */
1975 			err = -EINVAL;
1976 	}
1977 	if (err)
1978 		goto error;
1979 
1980 	bitmap->daemon_lastrun = jiffies;
1981 	err = __bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize,
1982 			      true);
1983 	if (err)
1984 		goto error;
1985 
1986 	pr_debug("created bitmap (%lu pages) for device %s\n",
1987 		 bitmap->counts.pages, bmname(bitmap));
1988 
1989 	err = test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0;
1990 	if (err)
1991 		goto error;
1992 
1993 	return bitmap;
1994  error:
1995 	md_bitmap_free(bitmap);
1996 	return ERR_PTR(err);
1997 }
1998 
1999 static int bitmap_create(struct mddev *mddev, int slot)
2000 {
2001 	struct bitmap *bitmap = __bitmap_create(mddev, slot);
2002 
2003 	if (IS_ERR(bitmap))
2004 		return PTR_ERR(bitmap);
2005 
2006 	mddev->bitmap = bitmap;
2007 	return 0;
2008 }
2009 
2010 static int bitmap_load(struct mddev *mddev)
2011 {
2012 	int err = 0;
2013 	sector_t start = 0;
2014 	sector_t sector = 0;
2015 	struct bitmap *bitmap = mddev->bitmap;
2016 	struct md_rdev *rdev;
2017 
2018 	if (!bitmap)
2019 		goto out;
2020 
2021 	rdev_for_each(rdev, mddev)
2022 		mddev_create_serial_pool(mddev, rdev);
2023 
2024 	if (mddev_is_clustered(mddev))
2025 		md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes);
2026 
2027 	/* Clear out old bitmap info first:  Either there is none, or we
2028 	 * are resuming after someone else has possibly changed things,
2029 	 * so we should forget old cached info.
2030 	 * All chunks should be clean, but some might need_sync.
2031 	 */
2032 	while (sector < mddev->resync_max_sectors) {
2033 		sector_t blocks;
2034 		bitmap_start_sync(mddev, sector, &blocks, false);
2035 		sector += blocks;
2036 	}
2037 	bitmap_close_sync(mddev);
2038 
2039 	if (mddev->degraded == 0
2040 	    || bitmap->events_cleared == mddev->events)
2041 		/* no need to keep dirty bits to optimise a
2042 		 * re-add of a missing device */
2043 		start = mddev->recovery_cp;
2044 
2045 	mutex_lock(&mddev->bitmap_info.mutex);
2046 	err = md_bitmap_init_from_disk(bitmap, start);
2047 	mutex_unlock(&mddev->bitmap_info.mutex);
2048 
2049 	if (err)
2050 		goto out;
2051 	clear_bit(BITMAP_STALE, &bitmap->flags);
2052 
2053 	/* Kick recovery in case any bits were set */
2054 	set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery);
2055 
2056 	mddev_set_timeout(mddev, mddev->bitmap_info.daemon_sleep, true);
2057 	md_wakeup_thread(mddev->thread);
2058 
2059 	bitmap_update_sb(bitmap);
2060 
2061 	if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
2062 		err = -EIO;
2063 out:
2064 	return err;
2065 }
2066 
/* the caller must free the returned bitmap with md_bitmap_free() */
2068 static struct bitmap *bitmap_get_from_slot(struct mddev *mddev, int slot)
2069 {
2070 	int rv = 0;
2071 	struct bitmap *bitmap;
2072 
2073 	bitmap = __bitmap_create(mddev, slot);
2074 	if (IS_ERR(bitmap)) {
2075 		rv = PTR_ERR(bitmap);
2076 		return ERR_PTR(rv);
2077 	}
2078 
2079 	rv = md_bitmap_init_from_disk(bitmap, 0);
2080 	if (rv) {
2081 		md_bitmap_free(bitmap);
2082 		return ERR_PTR(rv);
2083 	}
2084 
2085 	return bitmap;
2086 }
2087 
2088 /* Loads the bitmap associated with slot and copies the resync information
2089  * to our bitmap
2090  */
2091 static int bitmap_copy_from_slot(struct mddev *mddev, int slot, sector_t *low,
2092 				 sector_t *high, bool clear_bits)
2093 {
2094 	int rv = 0, i, j;
2095 	sector_t block, lo = 0, hi = 0;
2096 	struct bitmap_counts *counts;
2097 	struct bitmap *bitmap;
2098 
2099 	bitmap = bitmap_get_from_slot(mddev, slot);
2100 	if (IS_ERR(bitmap)) {
2101 		pr_err("%s can't get bitmap from slot %d\n", __func__, slot);
2102 		return -1;
2103 	}
2104 
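	/*
	 * Scan every chunk of the slot's bitmap; each bit found set there
	 * is cleared in that bitmap and transferred into our own in-memory
	 * and on-disk bitmap, while [lo, hi] grows to cover the dirty
	 * range.
	 */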
2105 	counts = &bitmap->counts;
2106 	for (j = 0; j < counts->chunks; j++) {
2107 		block = (sector_t)j << counts->chunkshift;
2108 		if (md_bitmap_file_test_bit(bitmap, block)) {
2109 			if (!lo)
2110 				lo = block;
2111 			hi = block;
2112 			md_bitmap_file_clear_bit(bitmap, block);
2113 			md_bitmap_set_memory_bits(mddev->bitmap, block, 1);
2114 			md_bitmap_file_set_bit(mddev->bitmap, block);
2115 		}
2116 	}
2117 
2118 	if (clear_bits) {
2119 		bitmap_update_sb(bitmap);
2120 		/* BITMAP_PAGE_PENDING is set, but bitmap_unplug needs
2121 		 * BITMAP_PAGE_DIRTY or _NEEDWRITE to write ... */
2122 		for (i = 0; i < bitmap->storage.file_pages; i++)
2123 			if (test_page_attr(bitmap, i, BITMAP_PAGE_PENDING))
2124 				set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE);
2125 		__bitmap_unplug(bitmap);
2126 	}
2127 	__bitmap_unplug(mddev->bitmap);
2128 	*low = lo;
2129 	*high = hi;
2130 	md_bitmap_free(bitmap);
2131 
2132 	return rv;
2133 }
2134 
2135 static void bitmap_set_pages(struct bitmap *bitmap, unsigned long pages)
2136 {
2137 	bitmap->counts.pages = pages;
2138 }
2139 
2140 static int bitmap_get_stats(struct bitmap *bitmap, struct md_bitmap_stats *stats)
2141 {
2142 	struct bitmap_storage *storage;
2143 	struct bitmap_counts *counts;
2144 	bitmap_super_t *sb;
2145 
	if (!bitmap)
		return -ENOENT;
	if (!bitmap->storage.sb_page) /* no superblock */
		return -EINVAL;

	sb = kmap_local_page(bitmap->storage.sb_page);
	stats->sync_size = le64_to_cpu(sb->sync_size);
	kunmap_local(sb);
2152 
2153 	counts = &bitmap->counts;
2154 	stats->missing_pages = counts->missing_pages;
2155 	stats->pages = counts->pages;
2156 
2157 	storage = &bitmap->storage;
2158 	stats->file_pages = storage->file_pages;
2159 	stats->file = storage->file;
2160 
2161 	stats->behind_writes = atomic_read(&bitmap->behind_writes);
2162 	stats->behind_wait = wq_has_sleeper(&bitmap->behind_wait);
2163 	stats->events_cleared = bitmap->events_cleared;
2164 	return 0;
2165 }
2166 
2167 static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks,
2168 			   int chunksize, bool init)
2169 {
	/* If chunksize is 0, choose an appropriate chunk size.
	 * Then possibly allocate new storage space.
	 * Then quiesce, copy bits, replace bitmap, and re-start.
	 *
	 * This function is called both to set up the initial bitmap
	 * and to resize the bitmap while the array is active.
	 * If the resize is a result of the array being resized,
	 * chunksize will be zero and we must choose a suitable
	 * chunk size; otherwise we use what we are given.
	 */
2180 	struct bitmap_storage store;
2181 	struct bitmap_counts old_counts;
2182 	unsigned long chunks;
2183 	sector_t block;
2184 	sector_t old_blocks, new_blocks;
2185 	int chunkshift;
2186 	int ret = 0;
2187 	long pages;
2188 	struct bitmap_page *new_bp;
2189 
2190 	if (bitmap->storage.file && !init) {
2191 		pr_info("md: cannot resize file-based bitmap\n");
2192 		return -EINVAL;
2193 	}
2194 
2195 	if (chunksize == 0) {
		/* If there is enough space, leave the chunk size unchanged;
		 * else increase it by a factor of two until there is enough
		 * space.
		 */
2199 		long bytes;
2200 		long space = bitmap->mddev->bitmap_info.space;
2201 
2202 		if (space == 0) {
			/* We don't know how much space there is, so limit
			 * it to the current size, in sectors.
			 */
2206 			bytes = DIV_ROUND_UP(bitmap->counts.chunks, 8);
2207 			if (!bitmap->mddev->bitmap_info.external)
2208 				bytes += sizeof(bitmap_super_t);
2209 			space = DIV_ROUND_UP(bytes, 512);
2210 			bitmap->mddev->bitmap_info.space = space;
2211 		}
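		/*
		 * Illustrative numbers (not from the code): with an
		 * external bitmap, blocks = 2^31 (a 1TiB array in 512-byte
		 * sectors) and space = 4096 sectors (2^21 bytes), the loop
		 * below settles on chunkshift = 7 (assuming the previous
		 * chunkshift was no larger): 2^31 >> 7 = 2^24 chunks need
		 * 2^24 / 8 bytes = 2MiB of bitmap, which just fits, giving
		 * 64KiB chunks (1 << (7 + BITMAP_BLOCK_SHIFT) bytes).
		 */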
2212 		chunkshift = bitmap->counts.chunkshift;
2213 		chunkshift--;
2214 		do {
			/* 'chunkshift' is the shift from block size to chunk size */
2216 			chunkshift++;
2217 			chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift);
2218 			bytes = DIV_ROUND_UP(chunks, 8);
2219 			if (!bitmap->mddev->bitmap_info.external)
2220 				bytes += sizeof(bitmap_super_t);
2221 		} while (bytes > (space << 9) && (chunkshift + BITMAP_BLOCK_SHIFT) <
2222 			(BITS_PER_BYTE * sizeof(((bitmap_super_t *)0)->chunksize) - 1));
	} else {
		/* ffz(~chunksize) is ilog2() of the power-of-two chunksize
		 * in bytes; subtracting BITMAP_BLOCK_SHIFT converts that to
		 * a shift from 512-byte blocks to chunks.
		 */
		chunkshift = ffz(~chunksize) - BITMAP_BLOCK_SHIFT;
	}
2225 
2226 	chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift);
2227 	memset(&store, 0, sizeof(store));
2228 	if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file)
2229 		ret = md_bitmap_storage_alloc(&store, chunks,
2230 					      !bitmap->mddev->bitmap_info.external,
2231 					      mddev_is_clustered(bitmap->mddev)
2232 					      ? bitmap->cluster_slot : 0);
2233 	if (ret) {
2234 		md_bitmap_file_unmap(&store);
2235 		goto err;
2236 	}
2237 
2238 	pages = DIV_ROUND_UP(chunks, PAGE_COUNTER_RATIO);
2239 
2240 	new_bp = kcalloc(pages, sizeof(*new_bp), GFP_KERNEL);
2241 	ret = -ENOMEM;
2242 	if (!new_bp) {
2243 		md_bitmap_file_unmap(&store);
2244 		goto err;
2245 	}
2246 
2247 	if (!init)
2248 		bitmap->mddev->pers->quiesce(bitmap->mddev, 1);
2249 
2250 	store.file = bitmap->storage.file;
2251 	bitmap->storage.file = NULL;
2252 
2253 	if (store.sb_page && bitmap->storage.sb_page)
2254 		memcpy(page_address(store.sb_page),
2255 		       page_address(bitmap->storage.sb_page),
2256 		       sizeof(bitmap_super_t));
2257 	spin_lock_irq(&bitmap->counts.lock);
2258 	md_bitmap_file_unmap(&bitmap->storage);
2259 	bitmap->storage = store;
2260 
2261 	old_counts = bitmap->counts;
2262 	bitmap->counts.bp = new_bp;
2263 	bitmap->counts.pages = pages;
2264 	bitmap->counts.missing_pages = pages;
2265 	bitmap->counts.chunkshift = chunkshift;
2266 	bitmap->counts.chunks = chunks;
2267 	bitmap->mddev->bitmap_info.chunksize = 1UL << (chunkshift +
2268 						     BITMAP_BLOCK_SHIFT);
2269 
2270 	blocks = min(old_counts.chunks << old_counts.chunkshift,
2271 		     chunks << chunkshift);
2272 
	/* For clustered raid, the in-memory bitmap pages must be pre-allocated */
2274 	if (mddev_is_clustered(bitmap->mddev)) {
2275 		unsigned long page;
2276 		for (page = 0; page < pages; page++) {
2277 			ret = md_bitmap_checkpage(&bitmap->counts, page, 1, 1);
2278 			if (ret) {
2279 				unsigned long k;
2280 
				/* deallocate the page memory */
				for (k = 0; k < page; k++)
					kfree(new_bp[k].map);
2285 				kfree(new_bp);
2286 
2287 				/* restore some fields from old_counts */
2288 				bitmap->counts.bp = old_counts.bp;
2289 				bitmap->counts.pages = old_counts.pages;
2290 				bitmap->counts.missing_pages = old_counts.pages;
2291 				bitmap->counts.chunkshift = old_counts.chunkshift;
2292 				bitmap->counts.chunks = old_counts.chunks;
2293 				bitmap->mddev->bitmap_info.chunksize =
2294 					1UL << (old_counts.chunkshift + BITMAP_BLOCK_SHIFT);
2295 				blocks = old_counts.chunks << old_counts.chunkshift;
2296 				pr_warn("Could not pre-allocate in-memory bitmap for cluster raid\n");
2297 				break;
			} else {
				bitmap->counts.bp[page].count += 1;
			}
2300 		}
2301 	}
2302 
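	/*
	 * Copy NEEDED state from the old counters into the new geometry.
	 * Old and new chunk sizes may differ, so advance by the smaller of
	 * the two extents returned by md_bitmap_get_counter() to make sure
	 * no range is skipped.
	 */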
2303 	for (block = 0; block < blocks; ) {
2304 		bitmap_counter_t *bmc_old, *bmc_new;
2305 		int set;
2306 
2307 		bmc_old = md_bitmap_get_counter(&old_counts, block, &old_blocks, 0);
2308 		set = bmc_old && NEEDED(*bmc_old);
2309 
2310 		if (set) {
2311 			bmc_new = md_bitmap_get_counter(&bitmap->counts, block, &new_blocks, 1);
2312 			if (bmc_new) {
2313 				if (*bmc_new == 0) {
2314 					/* need to set on-disk bits too. */
2315 					sector_t end = block + new_blocks;
2316 					sector_t start = block >> chunkshift;
2317 
2318 					start <<= chunkshift;
2319 					while (start < end) {
2320 						md_bitmap_file_set_bit(bitmap, block);
2321 						start += 1 << chunkshift;
2322 					}
2323 					*bmc_new = 2;
2324 					md_bitmap_count_page(&bitmap->counts, block, 1);
2325 					md_bitmap_set_pending(&bitmap->counts, block);
2326 				}
2327 				*bmc_new |= NEEDED_MASK;
2328 			}
2329 			if (new_blocks < old_blocks)
2330 				old_blocks = new_blocks;
2331 		}
2332 		block += old_blocks;
2333 	}
2334 
2335 	if (bitmap->counts.bp != old_counts.bp) {
2336 		unsigned long k;
2337 		for (k = 0; k < old_counts.pages; k++)
2338 			if (!old_counts.bp[k].hijacked)
2339 				kfree(old_counts.bp[k].map);
2340 		kfree(old_counts.bp);
2341 	}
2342 
2343 	if (!init) {
2344 		int i;
2345 		while (block < (chunks << chunkshift)) {
2346 			bitmap_counter_t *bmc;
2347 			bmc = md_bitmap_get_counter(&bitmap->counts, block, &new_blocks, 1);
2348 			if (bmc) {
2349 				/* new space.  It needs to be resynced, so
2350 				 * we set NEEDED_MASK.
2351 				 */
2352 				if (*bmc == 0) {
2353 					*bmc = NEEDED_MASK | 2;
2354 					md_bitmap_count_page(&bitmap->counts, block, 1);
2355 					md_bitmap_set_pending(&bitmap->counts, block);
2356 				}
2357 			}
2358 			block += new_blocks;
2359 		}
2360 		for (i = 0; i < bitmap->storage.file_pages; i++)
2361 			set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
2362 	}
2363 	spin_unlock_irq(&bitmap->counts.lock);
2364 
2365 	if (!init) {
2366 		__bitmap_unplug(bitmap);
2367 		bitmap->mddev->pers->quiesce(bitmap->mddev, 0);
2368 	}
2369 	ret = 0;
2370 err:
2371 	return ret;
2372 }
2373 
2374 static int bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize,
2375 			 bool init)
2376 {
2377 	struct bitmap *bitmap = mddev->bitmap;
2378 
2379 	if (!bitmap)
2380 		return 0;
2381 
2382 	return __bitmap_resize(bitmap, blocks, chunksize, init);
2383 }
2384 
2385 static ssize_t
2386 location_show(struct mddev *mddev, char *page)
2387 {
2388 	ssize_t len;
2389 	if (mddev->bitmap_info.file)
2390 		len = sprintf(page, "file");
2391 	else if (mddev->bitmap_info.offset)
2392 		len = sprintf(page, "%+lld", (long long)mddev->bitmap_info.offset);
2393 	else
2394 		len = sprintf(page, "none");
2395 	len += sprintf(page+len, "\n");
2396 	return len;
2397 }
2398 
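/*
 * Accepted input: "none" to clear any existing bitmap, "file:..."
 * (rejected, not supported through sysfs), or a non-zero decimal offset
 * in sectors (optionally prefixed with '+') at which to create an
 * internal bitmap.
 */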
2399 static ssize_t
2400 location_store(struct mddev *mddev, const char *buf, size_t len)
2401 {
2402 	int rv;
2403 
2404 	rv = mddev_suspend_and_lock(mddev);
2405 	if (rv)
2406 		return rv;
2407 
2408 	if (mddev->pers) {
2409 		if (mddev->recovery || mddev->sync_thread) {
2410 			rv = -EBUSY;
2411 			goto out;
2412 		}
2413 	}
2414 
2415 	if (mddev->bitmap || mddev->bitmap_info.file ||
2416 	    mddev->bitmap_info.offset) {
2417 		/* bitmap already configured.  Only option is to clear it */
2418 		if (strncmp(buf, "none", 4) != 0) {
2419 			rv = -EBUSY;
2420 			goto out;
2421 		}
2422 
2423 		bitmap_destroy(mddev);
2424 		mddev->bitmap_info.offset = 0;
2425 		if (mddev->bitmap_info.file) {
2426 			struct file *f = mddev->bitmap_info.file;
2427 			mddev->bitmap_info.file = NULL;
2428 			fput(f);
2429 		}
2430 	} else {
2431 		/* No bitmap, OK to set a location */
2432 		long long offset;
2433 
2434 		if (strncmp(buf, "none", 4) == 0)
2435 			/* nothing to be done */;
2436 		else if (strncmp(buf, "file:", 5) == 0) {
2437 			/* Not supported yet */
2438 			rv = -EINVAL;
2439 			goto out;
2440 		} else {
2441 			if (buf[0] == '+')
2442 				rv = kstrtoll(buf+1, 10, &offset);
2443 			else
2444 				rv = kstrtoll(buf, 10, &offset);
2445 			if (rv)
2446 				goto out;
2447 			if (offset == 0) {
2448 				rv = -EINVAL;
2449 				goto out;
2450 			}
2451 			if (mddev->bitmap_info.external == 0 &&
2452 			    mddev->major_version == 0 &&
2453 			    offset != mddev->bitmap_info.default_offset) {
2454 				rv = -EINVAL;
2455 				goto out;
2456 			}
2457 
2458 			mddev->bitmap_info.offset = offset;
2459 			rv = bitmap_create(mddev, -1);
2460 			if (rv)
2461 				goto out;
2462 
2463 			rv = bitmap_load(mddev);
2464 			if (rv) {
2465 				mddev->bitmap_info.offset = 0;
2466 				bitmap_destroy(mddev);
2467 				goto out;
2468 			}
2469 		}
2470 	}
2471 	if (!mddev->external) {
2472 		/* Ensure new bitmap info is stored in
2473 		 * metadata promptly.
2474 		 */
2475 		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2476 		md_wakeup_thread(mddev->thread);
2477 	}
2478 	rv = 0;
2479 out:
2480 	mddev_unlock_and_resume(mddev);
2481 	if (rv)
2482 		return rv;
2483 	return len;
2484 }
2485 
2486 static struct md_sysfs_entry bitmap_location =
2487 __ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store);
2488 
2489 /* 'bitmap/space' is the space available at 'location' for the
2490  * bitmap.  This allows the kernel to know when it is safe to
2491  * resize the bitmap to match a resized array.
2492  */
2493 static ssize_t
2494 space_show(struct mddev *mddev, char *page)
2495 {
2496 	return sprintf(page, "%lu\n", mddev->bitmap_info.space);
2497 }
2498 
2499 static ssize_t
2500 space_store(struct mddev *mddev, const char *buf, size_t len)
2501 {
2502 	unsigned long sectors;
2503 	int rv;
2504 
2505 	rv = kstrtoul(buf, 10, &sectors);
2506 	if (rv)
2507 		return rv;
2508 
2509 	if (sectors == 0)
2510 		return -EINVAL;
2511 
2512 	if (mddev->bitmap &&
2513 	    sectors < (mddev->bitmap->storage.bytes + 511) >> 9)
2514 		return -EFBIG; /* Bitmap is too big for this small space */
2515 
2516 	/* could make sure it isn't too big, but that isn't really
2517 	 * needed - user-space should be careful.
2518 	 */
2519 	mddev->bitmap_info.space = sectors;
2520 	return len;
2521 }
2522 
2523 static struct md_sysfs_entry bitmap_space =
2524 __ATTR(space, S_IRUGO|S_IWUSR, space_show, space_store);
2525 
2526 static ssize_t
2527 timeout_show(struct mddev *mddev, char *page)
2528 {
2529 	ssize_t len;
2530 	unsigned long secs = mddev->bitmap_info.daemon_sleep / HZ;
2531 	unsigned long jifs = mddev->bitmap_info.daemon_sleep % HZ;
2532 
2533 	len = sprintf(page, "%lu", secs);
2534 	if (jifs)
2535 		len += sprintf(page+len, ".%03u", jiffies_to_msecs(jifs));
2536 	len += sprintf(page+len, "\n");
2537 	return len;
2538 }
2539 
2540 static ssize_t
2541 timeout_store(struct mddev *mddev, const char *buf, size_t len)
2542 {
2543 	/* timeout can be set at any time */
2544 	unsigned long timeout;
2545 	int rv = strict_strtoul_scaled(buf, &timeout, 4);
2546 	if (rv)
2547 		return rv;
2548 
2549 	/* just to make sure we don't overflow... */
2550 	if (timeout >= LONG_MAX / HZ)
2551 		return -EINVAL;
2552 
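	/*
	 * strict_strtoul_scaled() parsed the value with four decimal
	 * places, i.e. in units of 100us.  Illustrative numbers (not from
	 * the code): "5.5" parses as 55000, and with HZ = 1000 the line
	 * below yields 55000 * 1000 / 10000 = 5500 jiffies, i.e. 5.5s.
	 */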
2553 	timeout = timeout * HZ / 10000;
2554 
2555 	if (timeout >= MAX_SCHEDULE_TIMEOUT)
2556 		timeout = MAX_SCHEDULE_TIMEOUT-1;
2557 	if (timeout < 1)
2558 		timeout = 1;
2559 
2560 	mddev->bitmap_info.daemon_sleep = timeout;
2561 	mddev_set_timeout(mddev, timeout, false);
2562 	md_wakeup_thread(mddev->thread);
2563 
2564 	return len;
2565 }
2566 
2567 static struct md_sysfs_entry bitmap_timeout =
2568 __ATTR(time_base, S_IRUGO|S_IWUSR, timeout_show, timeout_store);
2569 
2570 static ssize_t
2571 backlog_show(struct mddev *mddev, char *page)
2572 {
2573 	return sprintf(page, "%lu\n", mddev->bitmap_info.max_write_behind);
2574 }
2575 
2576 static ssize_t
2577 backlog_store(struct mddev *mddev, const char *buf, size_t len)
2578 {
2579 	unsigned long backlog;
2580 	unsigned long old_mwb = mddev->bitmap_info.max_write_behind;
2581 	struct md_rdev *rdev;
2582 	bool has_write_mostly = false;
2583 	int rv = kstrtoul(buf, 10, &backlog);
2584 	if (rv)
2585 		return rv;
2586 	if (backlog > COUNTER_MAX)
2587 		return -EINVAL;
2588 
2589 	rv = mddev_suspend_and_lock(mddev);
2590 	if (rv)
2591 		return rv;
2592 
	/*
	 * Without a write-mostly device, it doesn't make sense to set
	 * a backlog for max_write_behind.
	 */
2597 	rdev_for_each(rdev, mddev) {
2598 		if (test_bit(WriteMostly, &rdev->flags)) {
2599 			has_write_mostly = true;
2600 			break;
2601 		}
2602 	}
	if (!has_write_mostly) {
		pr_warn_ratelimited("%s: can't set backlog, no write mostly device available\n",
				    mdname(mddev));
		/* pair with mddev_suspend_and_lock() above */
		mddev_unlock_and_resume(mddev);
		return -EINVAL;
	}
2609 
2610 	mddev->bitmap_info.max_write_behind = backlog;
2611 	if (!backlog && mddev->serial_info_pool) {
2612 		/* serial_info_pool is not needed if backlog is zero */
2613 		if (!mddev->serialize_policy)
2614 			mddev_destroy_serial_pool(mddev, NULL);
2615 	} else if (backlog && !mddev->serial_info_pool) {
2616 		/* serial_info_pool is needed since backlog is not zero */
2617 		rdev_for_each(rdev, mddev)
2618 			mddev_create_serial_pool(mddev, rdev);
2619 	}
2620 	if (old_mwb != backlog)
2621 		bitmap_update_sb(mddev->bitmap);
2622 
2623 	mddev_unlock_and_resume(mddev);
2624 	return len;
2625 }
2626 
2627 static struct md_sysfs_entry bitmap_backlog =
2628 __ATTR(backlog, S_IRUGO|S_IWUSR, backlog_show, backlog_store);
2629 
2630 static ssize_t
2631 chunksize_show(struct mddev *mddev, char *page)
2632 {
2633 	return sprintf(page, "%lu\n", mddev->bitmap_info.chunksize);
2634 }
2635 
2636 static ssize_t
2637 chunksize_store(struct mddev *mddev, const char *buf, size_t len)
2638 {
2639 	/* Can only be changed when no bitmap is active */
2640 	int rv;
2641 	unsigned long csize;
2642 	if (mddev->bitmap)
2643 		return -EBUSY;
2644 	rv = kstrtoul(buf, 10, &csize);
2645 	if (rv)
2646 		return rv;
2647 	if (csize < 512 ||
2648 	    !is_power_of_2(csize))
2649 		return -EINVAL;
2650 	if (BITS_PER_LONG > 32 && csize >= (1ULL << (BITS_PER_BYTE *
2651 		sizeof(((bitmap_super_t *)0)->chunksize))))
2652 		return -EOVERFLOW;
2653 	mddev->bitmap_info.chunksize = csize;
2654 	return len;
2655 }
2656 
2657 static struct md_sysfs_entry bitmap_chunksize =
2658 __ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store);
2659 
2660 static ssize_t metadata_show(struct mddev *mddev, char *page)
2661 {
2662 	if (mddev_is_clustered(mddev))
2663 		return sprintf(page, "clustered\n");
2664 	return sprintf(page, "%s\n", (mddev->bitmap_info.external
2665 				      ? "external" : "internal"));
2666 }
2667 
2668 static ssize_t metadata_store(struct mddev *mddev, const char *buf, size_t len)
2669 {
2670 	if (mddev->bitmap ||
2671 	    mddev->bitmap_info.file ||
2672 	    mddev->bitmap_info.offset)
2673 		return -EBUSY;
2674 	if (strncmp(buf, "external", 8) == 0)
2675 		mddev->bitmap_info.external = 1;
2676 	else if ((strncmp(buf, "internal", 8) == 0) ||
2677 			(strncmp(buf, "clustered", 9) == 0))
2678 		mddev->bitmap_info.external = 0;
2679 	else
2680 		return -EINVAL;
2681 	return len;
2682 }
2683 
2684 static struct md_sysfs_entry bitmap_metadata =
2685 __ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
2686 
2687 static ssize_t can_clear_show(struct mddev *mddev, char *page)
2688 {
2689 	int len;
2690 	spin_lock(&mddev->lock);
2691 	if (mddev->bitmap)
2692 		len = sprintf(page, "%s\n", (mddev->bitmap->need_sync ?
2693 					     "false" : "true"));
2694 	else
2695 		len = sprintf(page, "\n");
2696 	spin_unlock(&mddev->lock);
2697 	return len;
2698 }
2699 
2700 static ssize_t can_clear_store(struct mddev *mddev, const char *buf, size_t len)
2701 {
2702 	if (mddev->bitmap == NULL)
2703 		return -ENOENT;
2704 	if (strncmp(buf, "false", 5) == 0)
2705 		mddev->bitmap->need_sync = 1;
2706 	else if (strncmp(buf, "true", 4) == 0) {
2707 		if (mddev->degraded)
2708 			return -EBUSY;
2709 		mddev->bitmap->need_sync = 0;
2710 	} else
2711 		return -EINVAL;
2712 	return len;
2713 }
2714 
2715 static struct md_sysfs_entry bitmap_can_clear =
2716 __ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store);
2717 
2718 static ssize_t
2719 behind_writes_used_show(struct mddev *mddev, char *page)
2720 {
2721 	ssize_t ret;
2722 	spin_lock(&mddev->lock);
2723 	if (mddev->bitmap == NULL)
2724 		ret = sprintf(page, "0\n");
2725 	else
2726 		ret = sprintf(page, "%lu\n",
2727 			      mddev->bitmap->behind_writes_used);
2728 	spin_unlock(&mddev->lock);
2729 	return ret;
2730 }
2731 
2732 static ssize_t
2733 behind_writes_used_reset(struct mddev *mddev, const char *buf, size_t len)
2734 {
2735 	if (mddev->bitmap)
2736 		mddev->bitmap->behind_writes_used = 0;
2737 	return len;
2738 }
2739 
2740 static struct md_sysfs_entry max_backlog_used =
2741 __ATTR(max_backlog_used, S_IRUGO | S_IWUSR,
2742        behind_writes_used_show, behind_writes_used_reset);
2743 
2744 static struct attribute *md_bitmap_attrs[] = {
2745 	&bitmap_location.attr,
2746 	&bitmap_space.attr,
2747 	&bitmap_timeout.attr,
2748 	&bitmap_backlog.attr,
2749 	&bitmap_chunksize.attr,
2750 	&bitmap_metadata.attr,
2751 	&bitmap_can_clear.attr,
2752 	&max_backlog_used.attr,
2753 	NULL
2754 };
2755 const struct attribute_group md_bitmap_group = {
2756 	.name = "bitmap",
2757 	.attrs = md_bitmap_attrs,
2758 };
2759 
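/* Default bitmap_operations, wired up via mddev_set_bitmap_ops(). */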
2760 static struct bitmap_operations bitmap_ops = {
2761 	.create			= bitmap_create,
2762 	.resize			= bitmap_resize,
2763 	.load			= bitmap_load,
2764 	.destroy		= bitmap_destroy,
2765 	.flush			= bitmap_flush,
2766 	.write_all		= bitmap_write_all,
2767 	.dirty_bits		= bitmap_dirty_bits,
2768 	.unplug			= bitmap_unplug,
2769 	.daemon_work		= bitmap_daemon_work,
2770 
2771 	.startwrite		= bitmap_startwrite,
2772 	.endwrite		= bitmap_endwrite,
2773 	.start_sync		= bitmap_start_sync,
2774 	.end_sync		= bitmap_end_sync,
2775 	.cond_end_sync		= bitmap_cond_end_sync,
2776 	.close_sync		= bitmap_close_sync,
2777 
2778 	.update_sb		= bitmap_update_sb,
2779 	.get_stats		= bitmap_get_stats,
2780 
2781 	.sync_with_cluster	= bitmap_sync_with_cluster,
2782 	.get_from_slot		= bitmap_get_from_slot,
2783 	.copy_from_slot		= bitmap_copy_from_slot,
2784 	.set_pages		= bitmap_set_pages,
2785 };
2786 
2787 void mddev_set_bitmap_ops(struct mddev *mddev)
2788 {
2789 	mddev->bitmap_ops = &bitmap_ops;
2790 }
2791