// SPDX-License-Identifier: GPL-2.0-only
/*
 * bitmap.c two-level bitmap (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003
 *
 * bitmap_create  - sets up the bitmap structure
 * bitmap_destroy - destroys the bitmap structure
 *
 * additions, Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.:
 * - added disk storage for bitmap
 * - changes to allow various bitmap chunk sizes
 */

/*
 * Still to do:
 *
 * flush after percent set rather than just time based. (maybe both).
 */

#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/timer.h>
#include <linux/sched.h>
#include <linux/list.h>
#include <linux/file.h>
#include <linux/mount.h>
#include <linux/buffer_head.h>
#include <linux/seq_file.h>
#include <trace/events/block.h>

#include "md.h"
#include "md-bitmap.h"
#include "md-cluster.h"

#define BITMAP_MAJOR_LO 3
/* version 4 insists the bitmap is in little-endian order
 * with version 3, it is host-endian which is non-portable
 * Version 5 is currently set only for clustered devices
 */
#define BITMAP_MAJOR_HI 4
#define BITMAP_MAJOR_CLUSTERED 5
#define BITMAP_MAJOR_HOSTENDIAN 3

/*
 * in-memory bitmap:
 *
 * Use 16 bit block counters to track pending writes to each "chunk".
 * The 2 high order bits are special-purpose, the first is a flag indicating
 * whether a resync is needed.  The second is a flag indicating whether a
 * resync is active.
 * This means that the counter is actually 14 bits:
 *
 * +--------+--------+------------------------------------------------+
 * | resync | resync |               counter                          |
 * | needed | active |                                                |
 * |  (0-1) | (0-1)  |              (0-16383)                         |
 * +--------+--------+------------------------------------------------+
 *
 * The "resync needed" bit is set when:
 *    a '1' bit is read from storage at startup.
 *    a write request fails on some drives
 *    a resync is aborted on a chunk with 'resync active' set
 * It is cleared (and resync-active set) when a resync starts across all drives
 * of the chunk.
 *
 *
 * The "resync active" bit is set when:
 *    a resync is started on all drives, and resync_needed is set.
 *    resync_needed will be cleared (as long as resync_active wasn't already set).
 * It is cleared when a resync completes.
 *
 * The counter counts pending write requests, plus the on-disk bit.
 * When the counter is '1' and the resync bits are clear, the on-disk
 * bit can be cleared as well, thus setting the counter to 0.
 * When we set a bit, or increment the counter (to start a write), if the
 * field is 0, we first set the disk bit and set the counter to 1.
 *
 * If the counter is 0, the on-disk bit is clear and the stripe is clean.
 * Anything that dirties the stripe pushes the counter to 2 (at least)
 * and sets the on-disk bit (lazily).
 * If a periodic sweep finds the counter at 2, it is decremented to 1.
 * If the sweep finds the counter at 1, the on-disk bit is cleared and the
 * counter goes to zero.
 *
 * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block
 * counters as a fallback when "page" memory cannot be allocated:
 *
 * Normal case (page memory allocated):
 *
 *     page pointer (32-bit)
 *
 *     [ ] ------+
 *               |
 *               +-------> [   ][   ]..[   ] (4096 byte page == 2048 counters)
 *                          c1   c2    c2048
 *
 * Hijacked case (page memory allocation failed):
 *
 *     hijacked page pointer (32-bit)
 *
 *     [          ][          ]   (no page memory allocated)
 *      counter #1  counter #2
 *       (16-bit)    (16-bit)
 *
 */

#define PAGE_BITS (PAGE_SIZE << 3)
#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3)

#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK)
#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK)
#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX)

/* how many counters per page? */
#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS)
/* same, except a shift value for more efficient bitops */
#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT)
/* same, except a mask value for more efficient bitops */
#define PAGE_COUNTER_MASK  (PAGE_COUNTER_RATIO - 1)
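
/*
 * Worked example for illustration, assuming 4 KiB pages (PAGE_SHIFT == 12)
 * and the 16-bit bitmap_counter_t from md-bitmap.h (COUNTER_BITS == 16,
 * COUNTER_BIT_SHIFT == 4):
 *
 *	PAGE_BITS          = 4096 << 3  = 32768
 *	PAGE_COUNTER_RATIO = 32768 / 16 = 2048 counters per page
 *	PAGE_COUNTER_SHIFT = 15 - 4     = 11
 *	PAGE_COUNTER_MASK  = 2048 - 1   = 0x7ff
 *
 * and for a counter value of 0xc003:
 *
 *	NEEDED(0xc003)  == 0x8000	(resync needed)
 *	RESYNC(0xc003)  == 0x4000	(resync active)
 *	COUNTER(0xc003) == 3		(pending writes plus the on-disk bit)
 */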

#define BITMAP_BLOCK_SHIFT 9

/*
 * bitmap structures:
 */

/* the in-memory bitmap is represented by bitmap_pages */
struct bitmap_page {
	/*
	 * map points to the actual memory page
	 */
	char *map;
	/*
	 * in emergencies (when map cannot be alloced), hijack the map
	 * pointer and use it as two counters itself
	 */
	unsigned int hijacked:1;
	/*
	 * If any counter in this page is '1' or '2' - and so could be
	 * cleared then that page is marked as 'pending'
	 */
	unsigned int pending:1;
	/*
	 * count of dirty bits on the page
	 */
	unsigned int count:30;
};

/* the main bitmap structure - one per mddev */
struct bitmap {

	struct bitmap_counts {
		spinlock_t lock;
		struct bitmap_page *bp;
		/* total number of pages in the bitmap */
		unsigned long pages;
		/* number of pages not yet allocated */
		unsigned long missing_pages;
		/* chunksize = 2^chunkshift (for bitops) */
		unsigned long chunkshift;
		/* total number of data chunks for the array */
		unsigned long chunks;
	} counts;

	struct mddev *mddev; /* the md device that the bitmap is for */

	__u64 events_cleared;
	int need_sync;

	struct bitmap_storage {
		/* backing disk file */
		struct file *file;
		/* cached copy of the bitmap file superblock */
		struct page *sb_page;
		unsigned long sb_index;
		/* list of cache pages for the file */
		struct page **filemap;
		/* attributes associated with filemap pages */
		unsigned long *filemap_attr;
		/* number of pages in the file */
		unsigned long file_pages;
		/* total bytes in the bitmap */
		unsigned long bytes;
	} storage;

	unsigned long flags;

	int allclean;

	atomic_t behind_writes;
	/* highest actual value at runtime */
	unsigned long behind_writes_used;

	/*
	 * the bitmap daemon - periodically wakes up and sweeps the bitmap
	 * file, cleaning up bits and flushing out pages to disk as necessary
	 */
	unsigned long daemon_lastrun; /* jiffies of last run */
	/*
	 * when we last called end_sync to update bitmap with resync
	 * progress.
	 */
	unsigned long last_end_sync;

	/* pending writes to the bitmap file */
	atomic_t pending_writes;
	wait_queue_head_t write_wait;
	wait_queue_head_t overflow_wait;
	wait_queue_head_t behind_wait;

	struct kernfs_node *sysfs_can_clear;
	/* slot offset for clustered env */
	int cluster_slot;
};

static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks,
			   int chunksize, bool init);

static inline char *bmname(struct bitmap *bitmap)
{
	return bitmap->mddev ? mdname(bitmap->mddev) : "mdX";
}

static bool __bitmap_enabled(struct bitmap *bitmap)
{
	return bitmap->storage.filemap &&
	       !test_bit(BITMAP_STALE, &bitmap->flags);
}

static bool bitmap_enabled(struct mddev *mddev)
{
	struct bitmap *bitmap = mddev->bitmap;

	if (!bitmap)
		return false;

	return __bitmap_enabled(bitmap);
}

/*
 * check a page and, if necessary, allocate it (or hijack it if the alloc fails)
 *
 * 1) check to see if this page is allocated, if it's not then try to alloc
 * 2) if the alloc fails, set the page's hijacked flag so we'll use the
 *    page pointer directly as a counter
 *
 * if we find our page, we increment the page's refcount so that it stays
 * allocated while we're using it
 */
static int md_bitmap_checkpage(struct bitmap_counts *bitmap,
			       unsigned long page, int create, int no_hijack)
__releases(bitmap->lock)
__acquires(bitmap->lock)
{
	unsigned char *mappage;

	WARN_ON_ONCE(page >= bitmap->pages);
	if (bitmap->bp[page].hijacked) /* it's hijacked, don't try to alloc */
		return 0;

	if (bitmap->bp[page].map) /* page is already allocated, just return */
		return 0;

	if (!create)
		return -ENOENT;

	/* this page has not been allocated yet */

	spin_unlock_irq(&bitmap->lock);
	/* It is possible that this is being called inside a
	 * prepare_to_wait/finish_wait loop from raid5.c:make_request().
	 * In general it is not permitted to sleep in that context as it
	 * can cause the loop to spin freely.
	 * That doesn't apply here as we can only reach this point
	 * once with any loop.
	 * When this function completes, either bp[page].map or
	 * bp[page].hijacked will be set.  In either case, this function will
	 * abort before getting to this point again.  So there is
	 * no risk of a free-spin, and so it is safe to assert
	 * that sleeping here is allowed.
	 */
	sched_annotate_sleep();
	mappage = kzalloc(PAGE_SIZE, GFP_NOIO);
	spin_lock_irq(&bitmap->lock);

	if (mappage == NULL) {
		pr_debug("md/bitmap: map page allocation failed, hijacking\n");
		/* We don't support hijack for cluster raid */
		if (no_hijack)
			return -ENOMEM;
		/* failed - set the hijacked flag so that we can use the
		 * pointer as a counter */
		if (!bitmap->bp[page].map)
			bitmap->bp[page].hijacked = 1;
	} else if (bitmap->bp[page].map ||
		   bitmap->bp[page].hijacked) {
		/* somebody beat us to getting the page */
		kfree(mappage);
	} else {

		/* no page was in place and we have one, so install it */

		bitmap->bp[page].map = mappage;
		bitmap->missing_pages--;
	}
	return 0;
}

/* if page is completely empty, put it back on the free list, or dealloc it */
/* if page was hijacked, unmark the flag so it might get alloced next time */
/* Note: lock should be held when calling this */
static void md_bitmap_checkfree(struct bitmap_counts *bitmap, unsigned long page)
{
	char *ptr;

	if (bitmap->bp[page].count) /* page is still busy */
		return;

	/* page is no longer in use, it can be released */

	if (bitmap->bp[page].hijacked) { /* page was hijacked, undo this now */
		bitmap->bp[page].hijacked = 0;
		bitmap->bp[page].map = NULL;
	} else {
		/* normal case, free the page */
		ptr = bitmap->bp[page].map;
		bitmap->bp[page].map = NULL;
		bitmap->missing_pages++;
		kfree(ptr);
	}
}

/*
 * bitmap file handling - read and write the bitmap file and its superblock
 */

/*
 * basic page I/O operations
 */

/* IO operations when bitmap is stored near all superblocks */

/* choose a good rdev and read the page from there */
static int read_sb_page(struct mddev *mddev, loff_t offset,
			struct page *page, unsigned long index, int size)
{
	sector_t sector = mddev->bitmap_info.offset + offset +
		index * (PAGE_SIZE / SECTOR_SIZE);
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev) {
		u32 iosize = roundup(size, bdev_logical_block_size(rdev->bdev));

		if (!test_bit(In_sync, &rdev->flags) ||
		    test_bit(Faulty, &rdev->flags) ||
		    test_bit(Bitmap_sync, &rdev->flags))
			continue;

		if (sync_page_io(rdev, sector, iosize, page, REQ_OP_READ, true))
			return 0;
	}
	return -EIO;
}
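
/*
 * For example (illustrative, assuming 4 KiB pages and 512-byte sectors):
 * bitmap page 'index' starts at sector bitmap_info.offset + offset +
 * index * 8, so page 2 is read from 16 sectors past the configured
 * bitmap offset.
 */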

static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mddev)
{
	/* Iterate the disks of an mddev, using rcu to protect access to the
	 * linked list, and raising the refcount of devices we return to ensure
	 * they don't disappear while in use.
	 * As devices are only added or removed when raid_disk is < 0 and
	 * nr_pending is 0 and In_sync is clear, the entries we return will
	 * still be in the same position on the list when we re-enter
	 * list_for_each_entry_continue_rcu.
	 *
	 * Note that if entered with 'rdev == NULL' to start at the
	 * beginning, we temporarily assign 'rdev' to an address which
	 * isn't really an rdev, but which can be used by
	 * list_for_each_entry_continue_rcu() to find the first entry.
	 */
	rcu_read_lock();
	if (rdev == NULL)
		/* start at the beginning */
		rdev = list_entry(&mddev->disks, struct md_rdev, same_set);
	else {
		/* release the previous rdev and start from there. */
		rdev_dec_pending(rdev, mddev);
	}
	list_for_each_entry_continue_rcu(rdev, &mddev->disks, same_set) {
		if (rdev->raid_disk >= 0 &&
		    !test_bit(Faulty, &rdev->flags)) {
			/* this is a usable device */
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			return rdev;
		}
	}
	rcu_read_unlock();
	return NULL;
}

static unsigned int optimal_io_size(struct block_device *bdev,
				    unsigned int last_page_size,
				    unsigned int io_size)
{
	if (bdev_io_opt(bdev) > bdev_logical_block_size(bdev))
		return roundup(last_page_size, bdev_io_opt(bdev));
	return io_size;
}

static unsigned int bitmap_io_size(unsigned int io_size, unsigned int opt_size,
				   loff_t start, loff_t boundary)
{
	if (io_size != opt_size &&
	    start + opt_size / SECTOR_SIZE <= boundary)
		return opt_size;
	if (start + io_size / SECTOR_SIZE <= boundary)
		return io_size;

	/* Overflows boundary */
	return 0;
}
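
/*
 * Example (illustrative): with io_size == 4096 and opt_size == 65536,
 * a boundary 200 sectors past 'start' fits the 128-sector optimal size,
 * so 65536 is returned.  Were the boundary only 100 sectors away, the
 * 8-sector 4 KiB size would be used instead, and 0 would be returned if
 * even that crossed the boundary.
 */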

static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
			   unsigned long pg_index, struct page *page)
{
	struct block_device *bdev;
	struct mddev *mddev = bitmap->mddev;
	struct bitmap_storage *store = &bitmap->storage;
	unsigned long num_pages = bitmap->storage.file_pages;
	unsigned int bitmap_limit = (num_pages - pg_index % num_pages) << PAGE_SHIFT;
	loff_t sboff, offset = mddev->bitmap_info.offset;
	sector_t ps = pg_index * PAGE_SIZE / SECTOR_SIZE;
	unsigned int size = PAGE_SIZE;
	unsigned int opt_size = PAGE_SIZE;
	sector_t doff;

	bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
	/* we compare length (page numbers), not page offset. */
	if ((pg_index - store->sb_index) == num_pages - 1) {
		unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1);

		if (last_page_size == 0)
			last_page_size = PAGE_SIZE;
		size = roundup(last_page_size, bdev_logical_block_size(bdev));
		opt_size = optimal_io_size(bdev, last_page_size, size);
	}

	sboff = rdev->sb_start + offset;
	doff = rdev->data_offset;

	/* Just make sure we aren't corrupting data or metadata */
	if (mddev->external) {
		/* Bitmap could be anywhere. */
		if (sboff + ps > doff &&
		    sboff < (doff + mddev->dev_sectors + PAGE_SIZE / SECTOR_SIZE))
			return -EINVAL;
	} else if (offset < 0) {
		/* DATA  BITMAP METADATA */
		size = bitmap_io_size(size, opt_size, offset + ps, 0);
		if (size == 0)
			/* bitmap runs into metadata */
			return -EINVAL;

		if (doff + mddev->dev_sectors > sboff)
			/* data runs into bitmap */
			return -EINVAL;
	} else if (rdev->sb_start < rdev->data_offset) {
		/* METADATA BITMAP DATA */
		size = bitmap_io_size(size, opt_size, sboff + ps, doff);
		if (size == 0)
			/* bitmap runs into data */
			return -EINVAL;
	}

	md_super_write(mddev, rdev, sboff + ps, (int)min(size, bitmap_limit), page);
	return 0;
}

static void write_sb_page(struct bitmap *bitmap, unsigned long pg_index,
			  struct page *page, bool wait)
{
	struct mddev *mddev = bitmap->mddev;

	do {
		struct md_rdev *rdev = NULL;

		while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
			if (__write_sb_page(rdev, bitmap, pg_index, page) < 0) {
				set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
				return;
			}
		}
	} while (wait && md_super_wait(mddev) < 0);
}

static void md_bitmap_file_kick(struct bitmap *bitmap);

#ifdef CONFIG_MD_BITMAP_FILE
static void write_file_page(struct bitmap *bitmap, struct page *page, int wait)
{
	struct buffer_head *bh = page_buffers(page);

	while (bh && bh->b_blocknr) {
		atomic_inc(&bitmap->pending_writes);
		set_buffer_locked(bh);
		set_buffer_mapped(bh);
		submit_bh(REQ_OP_WRITE | REQ_SYNC, bh);
		bh = bh->b_this_page;
	}

	if (wait)
		wait_event(bitmap->write_wait,
			   atomic_read(&bitmap->pending_writes) == 0);
}

static void end_bitmap_write(struct buffer_head *bh, int uptodate)
{
	struct bitmap *bitmap = bh->b_private;

	if (!uptodate)
		set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
	if (atomic_dec_and_test(&bitmap->pending_writes))
		wake_up(&bitmap->write_wait);
}

static void free_buffers(struct page *page)
{
	struct buffer_head *bh;

	if (!PagePrivate(page))
		return;

	bh = page_buffers(page);
	while (bh) {
		struct buffer_head *next = bh->b_this_page;
		free_buffer_head(bh);
		bh = next;
	}
	detach_page_private(page);
	put_page(page);
}

/* read a page from a file.
 * We both read the page, and attach buffers to the page to record the
 * address of each block (using bmap).  These addresses will be used
 * to write the block later, completely bypassing the filesystem.
 * This usage is similar to how swap files are handled, and allows us
 * to write to a file with no concerns of memory allocation failing.
 */
static int read_file_page(struct file *file, unsigned long index,
			  struct bitmap *bitmap, unsigned long count, struct page *page)
{
	int ret = 0;
	struct inode *inode = file_inode(file);
	struct buffer_head *bh;
	sector_t block, blk_cur;
	unsigned long blocksize = i_blocksize(inode);

	pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE,
		 (unsigned long long)index << PAGE_SHIFT);

	bh = alloc_page_buffers(page, blocksize);
	if (!bh) {
		ret = -ENOMEM;
		goto out;
	}
	attach_page_private(page, bh);
	blk_cur = index << (PAGE_SHIFT - inode->i_blkbits);
	while (bh) {
		block = blk_cur;

		if (count == 0)
			bh->b_blocknr = 0;
		else {
			ret = bmap(inode, &block);
			if (ret || !block) {
				ret = -EINVAL;
				bh->b_blocknr = 0;
				goto out;
			}

			bh->b_blocknr = block;
			bh->b_bdev = inode->i_sb->s_bdev;
			if (count < blocksize)
				count = 0;
			else
				count -= blocksize;

			bh->b_end_io = end_bitmap_write;
			bh->b_private = bitmap;
			atomic_inc(&bitmap->pending_writes);
			set_buffer_locked(bh);
			set_buffer_mapped(bh);
			submit_bh(REQ_OP_READ, bh);
		}
		blk_cur++;
		bh = bh->b_this_page;
	}

	wait_event(bitmap->write_wait,
		   atomic_read(&bitmap->pending_writes) == 0);
	if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
		ret = -EIO;
out:
	if (ret)
		pr_err("md: bitmap read error: (%dB @ %llu): %d\n",
		       (int)PAGE_SIZE,
		       (unsigned long long)index << PAGE_SHIFT,
		       ret);
	return ret;
}
#else /* CONFIG_MD_BITMAP_FILE */
static void write_file_page(struct bitmap *bitmap, struct page *page, int wait)
{
}
static int read_file_page(struct file *file, unsigned long index,
			  struct bitmap *bitmap, unsigned long count, struct page *page)
{
	return -EIO;
}
static void free_buffers(struct page *page)
{
	put_page(page);
}
#endif /* CONFIG_MD_BITMAP_FILE */

/*
 * bitmap file superblock operations
 */

/*
 * write out a page to a file
 */
static void filemap_write_page(struct bitmap *bitmap, unsigned long pg_index,
			       bool wait)
{
	struct bitmap_storage *store = &bitmap->storage;
	struct page *page = store->filemap[pg_index];

	if (mddev_is_clustered(bitmap->mddev)) {
		/* go to node bitmap area starting point */
		pg_index += store->sb_index;
	}

	if (store->file)
		write_file_page(bitmap, page, wait);
	else
		write_sb_page(bitmap, pg_index, page, wait);
}

/*
 * md_bitmap_wait_writes() should be called before writing any bitmap
 * blocks, to ensure previous writes, particularly from
 * md_bitmap_daemon_work(), have completed.
 */
static void md_bitmap_wait_writes(struct bitmap *bitmap)
{
	if (bitmap->storage.file)
		wait_event(bitmap->write_wait,
			   atomic_read(&bitmap->pending_writes) == 0);
	else
		/* Note that we ignore the return value.  The writes
		 * might have failed, but that would just mean that
		 * some bits which should be cleared haven't been,
		 * which is safe.  The relevant bitmap blocks will
		 * probably get written again, but there is no great
		 * loss if they aren't.
		 */
		md_super_wait(bitmap->mddev);
}


/* update the event counter and sync the superblock to disk */
static void bitmap_update_sb(void *data)
{
	bitmap_super_t *sb;
	struct bitmap *bitmap = data;

	if (!bitmap || !bitmap->mddev) /* no bitmap for this array */
		return;
	if (bitmap->mddev->bitmap_info.external)
		return;
	if (!bitmap->storage.sb_page) /* no superblock */
		return;
	sb = kmap_local_page(bitmap->storage.sb_page);
	sb->events = cpu_to_le64(bitmap->mddev->events);
	if (bitmap->mddev->events < bitmap->events_cleared)
		/* rocking back to read-only */
		bitmap->events_cleared = bitmap->mddev->events;
	sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
	/*
	 * clear BITMAP_WRITE_ERROR bit to protect against the case that
	 * a bitmap write error occurred but the later writes succeeded.
	 */
	sb->state = cpu_to_le32(bitmap->flags & ~BIT(BITMAP_WRITE_ERROR));
	/* Just in case these have been changed via sysfs: */
	sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ);
	sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind);
	/* This might have been changed by a reshape */
	sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
	sb->chunksize = cpu_to_le32(bitmap->mddev->bitmap_info.chunksize);
	sb->nodes = cpu_to_le32(bitmap->mddev->bitmap_info.nodes);
	sb->sectors_reserved = cpu_to_le32(bitmap->mddev->
					   bitmap_info.space);
	kunmap_local(sb);

	if (bitmap->storage.file)
		write_file_page(bitmap, bitmap->storage.sb_page, 1);
	else
		write_sb_page(bitmap, bitmap->storage.sb_index,
			      bitmap->storage.sb_page, 1);
}

static void bitmap_print_sb(struct bitmap *bitmap)
{
	bitmap_super_t *sb;

	if (!bitmap || !bitmap->storage.sb_page)
		return;
	sb = kmap_local_page(bitmap->storage.sb_page);
	pr_debug("%s: bitmap file superblock:\n", bmname(bitmap));
	pr_debug("         magic: %08x\n", le32_to_cpu(sb->magic));
	pr_debug("       version: %u\n", le32_to_cpu(sb->version));
	pr_debug("          uuid: %08x.%08x.%08x.%08x\n",
		 le32_to_cpu(*(__le32 *)(sb->uuid+0)),
		 le32_to_cpu(*(__le32 *)(sb->uuid+4)),
		 le32_to_cpu(*(__le32 *)(sb->uuid+8)),
		 le32_to_cpu(*(__le32 *)(sb->uuid+12)));
	pr_debug("        events: %llu\n",
		 (unsigned long long) le64_to_cpu(sb->events));
	pr_debug("events cleared: %llu\n",
		 (unsigned long long) le64_to_cpu(sb->events_cleared));
	pr_debug("         state: %08x\n", le32_to_cpu(sb->state));
	pr_debug("     chunksize: %u B\n", le32_to_cpu(sb->chunksize));
	pr_debug("  daemon sleep: %us\n", le32_to_cpu(sb->daemon_sleep));
	pr_debug("     sync size: %llu KB\n",
		 (unsigned long long)le64_to_cpu(sb->sync_size)/2);
	pr_debug("max write behind: %u\n", le32_to_cpu(sb->write_behind));
	kunmap_local(sb);
}

/*
 * bitmap_new_disk_sb
 * @bitmap
 *
 * This function is somewhat the reverse of bitmap_read_sb.  bitmap_read_sb
 * reads and verifies the on-disk bitmap superblock and populates bitmap_info.
 * This function verifies 'bitmap_info' and populates the on-disk bitmap
 * structure, which is to be written to disk.
 *
 * Returns: 0 on success, -Exxx on error
 */
static int md_bitmap_new_disk_sb(struct bitmap *bitmap)
{
	bitmap_super_t *sb;
	unsigned long chunksize, daemon_sleep, write_behind;

	bitmap->storage.sb_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (bitmap->storage.sb_page == NULL)
		return -ENOMEM;
	bitmap->storage.sb_index = 0;

	sb = kmap_local_page(bitmap->storage.sb_page);

	sb->magic = cpu_to_le32(BITMAP_MAGIC);
	sb->version = cpu_to_le32(BITMAP_MAJOR_HI);

	chunksize = bitmap->mddev->bitmap_info.chunksize;
	BUG_ON(!chunksize);
	if (!is_power_of_2(chunksize)) {
		kunmap_local(sb);
		pr_warn("bitmap chunksize not a power of 2\n");
		return -EINVAL;
	}
	sb->chunksize = cpu_to_le32(chunksize);

	daemon_sleep = bitmap->mddev->bitmap_info.daemon_sleep;
	if (!daemon_sleep || (daemon_sleep > MAX_SCHEDULE_TIMEOUT)) {
		pr_debug("Choosing daemon_sleep default (5 sec)\n");
		daemon_sleep = 5 * HZ;
	}
	sb->daemon_sleep = cpu_to_le32(daemon_sleep);
	bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;

	/*
	 * FIXME: write_behind for RAID1.  If not specified, what
	 * is a good choice?  We choose COUNTER_MAX / 2 arbitrarily.
	 */
	write_behind = bitmap->mddev->bitmap_info.max_write_behind;
	if (write_behind > COUNTER_MAX)
		write_behind = COUNTER_MAX / 2;
	sb->write_behind = cpu_to_le32(write_behind);
	bitmap->mddev->bitmap_info.max_write_behind = write_behind;

	/* keep the array size field of the bitmap superblock up to date */
	sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);

	memcpy(sb->uuid, bitmap->mddev->uuid, 16);

	set_bit(BITMAP_STALE, &bitmap->flags);
	sb->state = cpu_to_le32(bitmap->flags);
	bitmap->events_cleared = bitmap->mddev->events;
	sb->events_cleared = cpu_to_le64(bitmap->mddev->events);
	bitmap->mddev->bitmap_info.nodes = 0;

	kunmap_local(sb);

	return 0;
}

/* read the superblock from the bitmap file and initialize some bitmap fields */
static int md_bitmap_read_sb(struct bitmap *bitmap)
{
	char *reason = NULL;
	bitmap_super_t *sb;
	unsigned long chunksize, daemon_sleep, write_behind;
	unsigned long long events;
	int nodes = 0;
	unsigned long sectors_reserved = 0;
	int err = -EINVAL;
	struct page *sb_page;
	loff_t offset = 0;

	if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) {
		chunksize = 128 * 1024 * 1024;
		daemon_sleep = 5 * HZ;
		write_behind = 0;
		set_bit(BITMAP_STALE, &bitmap->flags);
		err = 0;
		goto out_no_sb;
	}
	/* page 0 is the superblock, read it... */
	sb_page = alloc_page(GFP_KERNEL);
	if (!sb_page)
		return -ENOMEM;
	bitmap->storage.sb_page = sb_page;

re_read:
	/* If cluster_slot is set, the cluster is setup */
	if (bitmap->cluster_slot >= 0) {
		sector_t bm_blocks = bitmap->mddev->resync_max_sectors;

		bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks,
			(bitmap->mddev->bitmap_info.chunksize >> 9));
		/* bits to bytes */
		bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t);
		/* to 4k blocks */
		bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096);
		offset = bitmap->cluster_slot * (bm_blocks << 3);
		pr_debug("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__,
			bitmap->cluster_slot, offset);
	}

	if (bitmap->storage.file) {
		loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host);
		int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize;

		err = read_file_page(bitmap->storage.file, 0,
				bitmap, bytes, sb_page);
	} else {
		err = read_sb_page(bitmap->mddev, offset, sb_page, 0,
				   sizeof(bitmap_super_t));
	}
	if (err)
		return err;

	err = -EINVAL;
	sb = kmap_local_page(sb_page);

	chunksize = le32_to_cpu(sb->chunksize);
	daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
	write_behind = le32_to_cpu(sb->write_behind);
	sectors_reserved = le32_to_cpu(sb->sectors_reserved);

	/* verify that the bitmap-specific fields are valid */
	if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
		reason = "bad magic";
	else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO ||
		 le32_to_cpu(sb->version) > BITMAP_MAJOR_CLUSTERED)
		reason = "unrecognized superblock version";
	else if (chunksize < 512)
		reason = "bitmap chunksize too small";
	else if (!is_power_of_2(chunksize))
		reason = "bitmap chunksize not a power of 2";
	else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT)
		reason = "daemon sleep period out of range";
	else if (write_behind > COUNTER_MAX)
		reason = "write-behind limit out of range (0 - 16383)";
	if (reason) {
		pr_warn("%s: invalid bitmap file superblock: %s\n",
			bmname(bitmap), reason);
		goto out;
	}

	/*
	 * Setup nodes/clustername only if bitmap version is
	 * cluster-compatible
	 */
	if (sb->version == cpu_to_le32(BITMAP_MAJOR_CLUSTERED)) {
		nodes = le32_to_cpu(sb->nodes);
		strscpy(bitmap->mddev->bitmap_info.cluster_name,
				sb->cluster_name, 64);
	}

	/* keep the array size field of the bitmap superblock up to date */
	sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);

	if (bitmap->mddev->persistent) {
		/*
		 * We have a persistent array superblock, so compare the
		 * bitmap's UUID and event counter to the mddev's
		 */
		if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) {
			pr_warn("%s: bitmap superblock UUID mismatch\n",
				bmname(bitmap));
			goto out;
		}
		events = le64_to_cpu(sb->events);
		if (!nodes && (events < bitmap->mddev->events)) {
			pr_warn("%s: bitmap file is out of date (%llu < %llu) -- forcing full recovery\n",
				bmname(bitmap), events,
				(unsigned long long) bitmap->mddev->events);
			set_bit(BITMAP_STALE, &bitmap->flags);
		}
	}

	/* assign fields using values from superblock */
	bitmap->flags |= le32_to_cpu(sb->state);
	if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
		set_bit(BITMAP_HOSTENDIAN, &bitmap->flags);
	bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
	err = 0;

out:
	kunmap_local(sb);
	if (err == 0 && nodes && (bitmap->cluster_slot < 0)) {
		/* Assigning chunksize is required for "re_read" */
		bitmap->mddev->bitmap_info.chunksize = chunksize;
		err = md_setup_cluster(bitmap->mddev, nodes);
		if (err) {
			pr_warn("%s: Could not setup cluster service (%d)\n",
				bmname(bitmap), err);
			goto out_no_sb;
		}
		bitmap->cluster_slot = bitmap->mddev->cluster_ops->slot_number(bitmap->mddev);
		goto re_read;
	}

out_no_sb:
	if (err == 0) {
		if (test_bit(BITMAP_STALE, &bitmap->flags))
			bitmap->events_cleared = bitmap->mddev->events;
		bitmap->mddev->bitmap_info.chunksize = chunksize;
		bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
		bitmap->mddev->bitmap_info.max_write_behind = write_behind;
		bitmap->mddev->bitmap_info.nodes = nodes;
		if (bitmap->mddev->bitmap_info.space == 0 ||
		    bitmap->mddev->bitmap_info.space > sectors_reserved)
			bitmap->mddev->bitmap_info.space = sectors_reserved;
	} else {
		bitmap_print_sb(bitmap);
		if (bitmap->cluster_slot < 0)
			md_cluster_stop(bitmap->mddev);
	}
	return err;
}

/*
 * general bitmap file operations
 */

/*
 * on-disk bitmap:
 *
 * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap
 * file a page at a time. There's a superblock at the start of the file.
 */
/* calculate the index of the page that contains this bit */
static inline unsigned long file_page_index(struct bitmap_storage *store,
					    unsigned long chunk)
{
	if (store->sb_page)
		chunk += sizeof(bitmap_super_t) << 3;
	return chunk >> PAGE_BIT_SHIFT;
}

/* calculate the (bit) offset of this bit within a page */
static inline unsigned long file_page_offset(struct bitmap_storage *store,
					     unsigned long chunk)
{
	if (store->sb_page)
		chunk += sizeof(bitmap_super_t) << 3;
	return chunk & (PAGE_BITS - 1);
}
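
/*
 * Worked example (illustrative, assuming 4 KiB pages and the 256-byte
 * bitmap_super_t): with an in-file superblock, chunk 'c' occupies bit
 * c + (256 << 3) == c + 2048.  Chunk 0 therefore sits in page 0 at bit
 * offset 2048, and chunk 30720 is the first to spill into page 1
 * (30720 + 2048 == PAGE_BITS).
 */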

/*
 * return a pointer to the page in the filemap that contains the given bit
 *
 */
static inline struct page *filemap_get_page(struct bitmap_storage *store,
					    unsigned long chunk)
{
	if (file_page_index(store, chunk) >= store->file_pages)
		return NULL;
	return store->filemap[file_page_index(store, chunk)];
}

static int md_bitmap_storage_alloc(struct bitmap_storage *store,
				   unsigned long chunks, int with_super,
				   int slot_number)
{
	int pnum, offset = 0;
	unsigned long num_pages;
	unsigned long bytes;

	bytes = DIV_ROUND_UP(chunks, 8);
	if (with_super)
		bytes += sizeof(bitmap_super_t);

	num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE);
	offset = slot_number * num_pages;

	store->filemap = kmalloc_array(num_pages, sizeof(struct page *),
				       GFP_KERNEL);
	if (!store->filemap)
		return -ENOMEM;

	if (with_super && !store->sb_page) {
		store->sb_page = alloc_page(GFP_KERNEL|__GFP_ZERO);
		if (store->sb_page == NULL)
			return -ENOMEM;
	}

	pnum = 0;
	if (store->sb_page) {
		store->filemap[0] = store->sb_page;
		pnum = 1;
		store->sb_index = offset;
	}

	for ( ; pnum < num_pages; pnum++) {
		store->filemap[pnum] = alloc_page(GFP_KERNEL|__GFP_ZERO);
		if (!store->filemap[pnum]) {
			store->file_pages = pnum;
			return -ENOMEM;
		}
	}
	store->file_pages = pnum;

	/* We need 4 bits per page, rounded up to a multiple
	 * of sizeof(unsigned long) */
	store->filemap_attr = kzalloc(
		roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)),
		GFP_KERNEL);
	if (!store->filemap_attr)
		return -ENOMEM;

	store->bytes = bytes;

	return 0;
}
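
/*
 * Sizing example (illustrative): for 100000 chunks with an in-file
 * superblock, bytes = DIV_ROUND_UP(100000, 8) + 256 = 12756, so
 * num_pages = DIV_ROUND_UP(12756, 4096) = 4 pages, the first of which
 * doubles as sb_page.
 */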

static void md_bitmap_file_unmap(struct bitmap_storage *store)
{
	struct file *file = store->file;
	struct page *sb_page = store->sb_page;
	struct page **map = store->filemap;
	int pages = store->file_pages;

	while (pages--)
		if (map[pages] != sb_page) /* 0 is sb_page, release it below */
			free_buffers(map[pages]);
	kfree(map);
	kfree(store->filemap_attr);

	if (sb_page)
		free_buffers(sb_page);

	if (file) {
		struct inode *inode = file_inode(file);
		invalidate_mapping_pages(inode->i_mapping, 0, -1);
		fput(file);
	}
}

/*
 * bitmap_file_kick - if an error occurs while manipulating the bitmap file
 * then it is no longer reliable, so we stop using it and we mark the file
 * as failed in the superblock
 */
static void md_bitmap_file_kick(struct bitmap *bitmap)
{
	if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) {
		bitmap_update_sb(bitmap);

		if (bitmap->storage.file) {
			pr_warn("%s: kicking failed bitmap file %pD4 from array!\n",
				bmname(bitmap), bitmap->storage.file);

		} else
			pr_warn("%s: disabling internal bitmap due to errors\n",
				bmname(bitmap));
	}
}

enum bitmap_page_attr {
	BITMAP_PAGE_DIRTY = 0,     /* there are set bits that need to be synced */
	BITMAP_PAGE_PENDING = 1,   /* there are bits that are being cleaned.
				    * i.e. counter is 1 or 2. */
	BITMAP_PAGE_NEEDWRITE = 2, /* there are cleared bits that need to be synced */
};

static inline void set_page_attr(struct bitmap *bitmap, int pnum,
				 enum bitmap_page_attr attr)
{
	set_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
}

static inline void clear_page_attr(struct bitmap *bitmap, int pnum,
				   enum bitmap_page_attr attr)
{
	clear_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
}

static inline int test_page_attr(struct bitmap *bitmap, int pnum,
				 enum bitmap_page_attr attr)
{
	return test_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
}

static inline int test_and_clear_page_attr(struct bitmap *bitmap, int pnum,
					   enum bitmap_page_attr attr)
{
	return test_and_clear_bit((pnum<<2) + attr,
				  bitmap->storage.filemap_attr);
}
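
/*
 * The attributes for all pages are packed into filemap_attr four bits per
 * page, so e.g. testing BITMAP_PAGE_NEEDWRITE on page 3 probes bit
 * (3 << 2) + 2 == 14; the fourth bit of each group is unused.
 */
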
/*
 * bitmap_file_set_bit -- called before performing a write to the md device
 * to set (and eventually sync) a particular bit in the bitmap file
 *
 * we set the bit immediately, then we record the page number so that
 * when an unplug occurs, we can flush the dirty pages out to disk
 */
static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
{
	unsigned long bit;
	struct page *page;
	void *kaddr;
	unsigned long chunk = block >> bitmap->counts.chunkshift;
	struct bitmap_storage *store = &bitmap->storage;
	unsigned long index = file_page_index(store, chunk);
	unsigned long node_offset = 0;

	index += store->sb_index;
	if (mddev_is_clustered(bitmap->mddev))
		node_offset = bitmap->cluster_slot * store->file_pages;

	page = filemap_get_page(&bitmap->storage, chunk);
	if (!page)
		return;
	bit = file_page_offset(&bitmap->storage, chunk);

	/* set the bit */
	kaddr = kmap_local_page(page);
	if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
		set_bit(bit, kaddr);
	else
		set_bit_le(bit, kaddr);
	kunmap_local(kaddr);
	pr_debug("set file bit %lu page %lu\n", bit, index);
	/* record page number so it gets flushed to disk when unplug occurs */
	set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_DIRTY);
}

static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
{
	unsigned long bit;
	struct page *page;
	void *paddr;
	unsigned long chunk = block >> bitmap->counts.chunkshift;
	struct bitmap_storage *store = &bitmap->storage;
	unsigned long index = file_page_index(store, chunk);
	unsigned long node_offset = 0;

	index += store->sb_index;
	if (mddev_is_clustered(bitmap->mddev))
		node_offset = bitmap->cluster_slot * store->file_pages;

	page = filemap_get_page(&bitmap->storage, chunk);
	if (!page)
		return;
	bit = file_page_offset(&bitmap->storage, chunk);
	paddr = kmap_local_page(page);
	if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
		clear_bit(bit, paddr);
	else
		clear_bit_le(bit, paddr);
	kunmap_local(paddr);
	if (!test_page_attr(bitmap, index - node_offset, BITMAP_PAGE_NEEDWRITE)) {
		set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_PENDING);
		bitmap->allclean = 0;
	}
}

static int md_bitmap_file_test_bit(struct bitmap *bitmap, sector_t block)
{
	unsigned long bit;
	struct page *page;
	void *paddr;
	unsigned long chunk = block >> bitmap->counts.chunkshift;
	int set = 0;

	page = filemap_get_page(&bitmap->storage, chunk);
	if (!page)
		return -EINVAL;
	bit = file_page_offset(&bitmap->storage, chunk);
	paddr = kmap_local_page(page);
	if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
		set = test_bit(bit, paddr);
	else
		set = test_bit_le(bit, paddr);
	kunmap_local(paddr);
	return set;
}

/* this gets called when the md device is ready to unplug its underlying
 * (slave) device queues -- before we let any writes go down, we need to
 * sync the dirty pages of the bitmap file to disk */
static void __bitmap_unplug(struct bitmap *bitmap)
{
	unsigned long i;
	int dirty, need_write;
	int writing = 0;

	if (!__bitmap_enabled(bitmap))
		return;

	/* look at each page to see if there are any set bits that need to be
	 * flushed out to disk */
	for (i = 0; i < bitmap->storage.file_pages; i++) {
		dirty = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
		need_write = test_and_clear_page_attr(bitmap, i,
						      BITMAP_PAGE_NEEDWRITE);
		if (dirty || need_write) {
			if (!writing) {
				md_bitmap_wait_writes(bitmap);
				mddev_add_trace_msg(bitmap->mddev,
						    "md bitmap_unplug");
			}
			clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING);
			filemap_write_page(bitmap, i, false);
			writing = 1;
		}
	}
	if (writing)
		md_bitmap_wait_writes(bitmap);

	if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
		md_bitmap_file_kick(bitmap);
}

struct bitmap_unplug_work {
	struct work_struct work;
	struct bitmap *bitmap;
	struct completion *done;
};

static void md_bitmap_unplug_fn(struct work_struct *work)
{
	struct bitmap_unplug_work *unplug_work =
		container_of(work, struct bitmap_unplug_work, work);

	__bitmap_unplug(unplug_work->bitmap);
	complete(unplug_work->done);
}

static void bitmap_unplug_async(struct bitmap *bitmap)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct bitmap_unplug_work unplug_work;

	INIT_WORK_ONSTACK(&unplug_work.work, md_bitmap_unplug_fn);
	unplug_work.bitmap = bitmap;
	unplug_work.done = &done;

	queue_work(md_bitmap_wq, &unplug_work.work);
	wait_for_completion(&done);
	destroy_work_on_stack(&unplug_work.work);
}

static void bitmap_unplug(struct mddev *mddev, bool sync)
{
	struct bitmap *bitmap = mddev->bitmap;

	if (!bitmap)
		return;

	if (sync)
		__bitmap_unplug(bitmap);
	else
		bitmap_unplug_async(bitmap);
}

static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed);

/*
 * Initialize the in-memory bitmap from the on-disk bitmap and set up the memory
 * mapping of the bitmap file.
 *
 * Special case: If there's no bitmap file, or if the bitmap file had been
 * previously kicked from the array, we mark all the bits as 1's in order to
 * cause a full resync.
 *
 * We ignore all bits for sectors that end earlier than 'start'.
 * This is used when reading an out-of-date bitmap.
 */
static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
{
	bool outofdate = test_bit(BITMAP_STALE, &bitmap->flags);
	struct mddev *mddev = bitmap->mddev;
	unsigned long chunks = bitmap->counts.chunks;
	struct bitmap_storage *store = &bitmap->storage;
	struct file *file = store->file;
	unsigned long node_offset = 0;
	unsigned long bit_cnt = 0;
	unsigned long i;
	int ret;

	if (!file && !mddev->bitmap_info.offset) {
		/* No permanent bitmap - fill with '1s'. */
		store->filemap = NULL;
		store->file_pages = 0;
		for (i = 0; i < chunks ; i++) {
			/* if the disk bit is set, set the memory bit */
			int needed = ((sector_t)(i+1) << (bitmap->counts.chunkshift)
				      >= start);
			md_bitmap_set_memory_bits(bitmap,
						  (sector_t)i << bitmap->counts.chunkshift,
						  needed);
		}
		return 0;
	}

	if (file && i_size_read(file->f_mapping->host) < store->bytes) {
		pr_warn("%s: bitmap file too short %lu < %lu\n",
			bmname(bitmap),
			(unsigned long) i_size_read(file->f_mapping->host),
			store->bytes);
		ret = -ENOSPC;
		goto err;
	}

	if (mddev_is_clustered(mddev))
		node_offset = bitmap->cluster_slot * (DIV_ROUND_UP(store->bytes, PAGE_SIZE));

	for (i = 0; i < store->file_pages; i++) {
		struct page *page = store->filemap[i];
		int count;

		/* unmap the old page, we're done with it */
		if (i == store->file_pages - 1)
			count = store->bytes - i * PAGE_SIZE;
		else
			count = PAGE_SIZE;

		if (file)
			ret = read_file_page(file, i, bitmap, count, page);
		else
			ret = read_sb_page(mddev, 0, page, i + node_offset,
					   count);
		if (ret)
			goto err;
	}

	if (outofdate) {
		pr_warn("%s: bitmap file is out of date, doing full recovery\n",
			bmname(bitmap));

		for (i = 0; i < store->file_pages; i++) {
			struct page *page = store->filemap[i];
			unsigned long offset = 0;
			void *paddr;

			if (i == 0 && !mddev->bitmap_info.external)
				offset = sizeof(bitmap_super_t);

			/*
			 * If the bitmap is out of date, dirty the whole page
			 * and write it out
			 */
			paddr = kmap_local_page(page);
			memset(paddr + offset, 0xff, PAGE_SIZE - offset);
			kunmap_local(paddr);

			filemap_write_page(bitmap, i, true);
			if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) {
				ret = -EIO;
				goto err;
			}
		}
	}

	for (i = 0; i < chunks; i++) {
		struct page *page = filemap_get_page(&bitmap->storage, i);
		unsigned long bit = file_page_offset(&bitmap->storage, i);
		void *paddr;
		bool was_set;

		paddr = kmap_local_page(page);
		if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
			was_set = test_bit(bit, paddr);
		else
			was_set = test_bit_le(bit, paddr);
		kunmap_local(paddr);

		if (was_set) {
			/* if the disk bit is set, set the memory bit */
			int needed = ((sector_t)(i+1) << bitmap->counts.chunkshift
				      >= start);
			md_bitmap_set_memory_bits(bitmap,
						  (sector_t)i << bitmap->counts.chunkshift,
						  needed);
			bit_cnt++;
		}
	}

	pr_debug("%s: bitmap initialized from disk: read %lu pages, set %lu of %lu bits\n",
		 bmname(bitmap), store->file_pages,
		 bit_cnt, chunks);

	return 0;

err:
	pr_warn("%s: bitmap initialisation failed: %d\n",
		bmname(bitmap), ret);
	return ret;
}

/* just flag bitmap pages as needing to be written. */
static void bitmap_write_all(struct mddev *mddev)
{
	int i;
	struct bitmap *bitmap = mddev->bitmap;

	if (!bitmap || !bitmap->storage.filemap)
		return;

	/* Only one copy, so nothing needed */
	if (bitmap->storage.file)
		return;

	for (i = 0; i < bitmap->storage.file_pages; i++)
		set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE);
	bitmap->allclean = 0;
}

static void md_bitmap_count_page(struct bitmap_counts *bitmap,
				 sector_t offset, int inc)
{
	sector_t chunk = offset >> bitmap->chunkshift;
	unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
	bitmap->bp[page].count += inc;
	md_bitmap_checkfree(bitmap, page);
}

static void md_bitmap_set_pending(struct bitmap_counts *bitmap, sector_t offset)
{
	sector_t chunk = offset >> bitmap->chunkshift;
	unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
	struct bitmap_page *bp = &bitmap->bp[page];

	if (!bp->pending)
		bp->pending = 1;
}

static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap,
					       sector_t offset, sector_t *blocks,
					       int create);

static void mddev_set_timeout(struct mddev *mddev, unsigned long timeout,
			      bool force)
{
	struct md_thread *thread;

	rcu_read_lock();
	thread = rcu_dereference(mddev->thread);

	if (!thread)
		goto out;

	if (force || thread->timeout < MAX_SCHEDULE_TIMEOUT)
		thread->timeout = timeout;

out:
	rcu_read_unlock();
}

/*
 * bitmap daemon -- periodically wakes up to clean bits and flush pages
 * out to disk
 */
static void bitmap_daemon_work(struct mddev *mddev)
{
	struct bitmap *bitmap;
	unsigned long j;
	unsigned long nextpage;
	sector_t blocks;
	struct bitmap_counts *counts;

	/* Use a mutex to guard daemon_work against
	 * bitmap_destroy.
	 */
	mutex_lock(&mddev->bitmap_info.mutex);
	bitmap = mddev->bitmap;
	if (bitmap == NULL) {
		mutex_unlock(&mddev->bitmap_info.mutex);
		return;
	}
	if (time_before(jiffies, bitmap->daemon_lastrun
			+ mddev->bitmap_info.daemon_sleep))
		goto done;

	bitmap->daemon_lastrun = jiffies;
	if (bitmap->allclean) {
		mddev_set_timeout(mddev, MAX_SCHEDULE_TIMEOUT, true);
		goto done;
	}
	bitmap->allclean = 1;

	mddev_add_trace_msg(bitmap->mddev, "md bitmap_daemon_work");

	/* Any file-page which is PENDING now needs to be written.
	 * So set NEEDWRITE now, then after we make any last-minute changes
	 * we will write it.
	 */
	for (j = 0; j < bitmap->storage.file_pages; j++)
		if (test_and_clear_page_attr(bitmap, j,
					     BITMAP_PAGE_PENDING))
			set_page_attr(bitmap, j,
				      BITMAP_PAGE_NEEDWRITE);

	if (bitmap->need_sync &&
	    mddev->bitmap_info.external == 0) {
		/* Arrange for superblock update as well as
		 * other changes */
		bitmap_super_t *sb;
		bitmap->need_sync = 0;
		if (bitmap->storage.filemap) {
			sb = kmap_local_page(bitmap->storage.sb_page);
			sb->events_cleared =
				cpu_to_le64(bitmap->events_cleared);
			kunmap_local(sb);
			set_page_attr(bitmap, 0,
				      BITMAP_PAGE_NEEDWRITE);
		}
	}
	/* Now look at the bitmap counters and if any are '2' or '1',
	 * decrement and handle accordingly.
	 */
	counts = &bitmap->counts;
	spin_lock_irq(&counts->lock);
	nextpage = 0;
	for (j = 0; j < counts->chunks; j++) {
		bitmap_counter_t *bmc;
		sector_t block = (sector_t)j << counts->chunkshift;

		if (j == nextpage) {
			nextpage += PAGE_COUNTER_RATIO;
			if (!counts->bp[j >> PAGE_COUNTER_SHIFT].pending) {
				j |= PAGE_COUNTER_MASK;
				continue;
			}
			counts->bp[j >> PAGE_COUNTER_SHIFT].pending = 0;
		}

		bmc = md_bitmap_get_counter(counts, block, &blocks, 0);
		if (!bmc) {
			j |= PAGE_COUNTER_MASK;
			continue;
		}
		if (*bmc == 1 && !bitmap->need_sync) {
			/* We can clear the bit */
			*bmc = 0;
			md_bitmap_count_page(counts, block, -1);
			md_bitmap_file_clear_bit(bitmap, block);
		} else if (*bmc && *bmc <= 2) {
			*bmc = 1;
			md_bitmap_set_pending(counts, block);
			bitmap->allclean = 0;
		}
	}
	spin_unlock_irq(&counts->lock);

	md_bitmap_wait_writes(bitmap);
	/* Now start writeout on any page in NEEDWRITE that isn't DIRTY.
	 * DIRTY pages need to be written by bitmap_unplug so it can wait
	 * for them.
	 * If we find any DIRTY page we stop there and let bitmap_unplug
	 * handle all the rest.  This is important in the case where
	 * the first block holds the superblock and it has been updated.
	 * We mustn't write any other blocks before the superblock.
	 */
	for (j = 0;
	     j < bitmap->storage.file_pages
		     && !test_bit(BITMAP_STALE, &bitmap->flags);
	     j++) {
		if (test_page_attr(bitmap, j,
				   BITMAP_PAGE_DIRTY))
			/* bitmap_unplug will handle the rest */
			break;
		if (bitmap->storage.filemap &&
		    test_and_clear_page_attr(bitmap, j,
					     BITMAP_PAGE_NEEDWRITE))
			filemap_write_page(bitmap, j, false);
	}

done:
	if (bitmap->allclean == 0)
		mddev_set_timeout(mddev, mddev->bitmap_info.daemon_sleep, true);
	mutex_unlock(&mddev->bitmap_info.mutex);
}

static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap,
					       sector_t offset, sector_t *blocks,
					       int create)
__releases(bitmap->lock)
__acquires(bitmap->lock)
{
	/* If 'create', we might release the lock and reclaim it.
	 * The lock must have been taken with interrupts enabled.
	 * If !create, we don't release the lock.
	 */
	sector_t chunk = offset >> bitmap->chunkshift;
	unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
	unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT;
	sector_t csize = ((sector_t)1) << bitmap->chunkshift;
	int err;

	if (page >= bitmap->pages) {
		/*
		 * This can happen if bitmap_start_sync goes beyond
		 * End-of-device while looking for a whole page, or if
		 * the user wrote a huge number to sysfs bitmap_set_bits.
		 */
		*blocks = csize - (offset & (csize - 1));
		return NULL;
	}
	err = md_bitmap_checkpage(bitmap, page, create, 0);

	if (bitmap->bp[page].hijacked ||
	    bitmap->bp[page].map == NULL)
		csize = ((sector_t)1) << (bitmap->chunkshift +
					  PAGE_COUNTER_SHIFT);

	*blocks = csize - (offset & (csize - 1));

	if (err < 0)
		return NULL;

	/* now locked ... */

	if (bitmap->bp[page].hijacked) { /* hijacked pointer */
		/* should we use the first or second counter field
		 * of the hijacked pointer? */
		int hi = (pageoff > PAGE_COUNTER_MASK);
		return &((bitmap_counter_t *)
			 &bitmap->bp[page].map)[hi];
	} else /* page is allocated */
		return (bitmap_counter_t *)
			&(bitmap->bp[page].map[pageoff]);
}
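
/*
 * Lookup example (illustrative, assuming chunkshift == 11, i.e. 1 MiB
 * chunks of 2048 sectors): offset 5000000 falls in chunk 2441, which
 * lives in counter page 2441 >> PAGE_COUNTER_SHIFT == 1, at byte offset
 * (2441 & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT == 393 * 2 == 786.
 */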

static int bitmap_startwrite(struct mddev *mddev, sector_t offset,
			     unsigned long sectors)
{
	struct bitmap *bitmap = mddev->bitmap;

	if (!bitmap)
		return 0;

	while (sectors) {
		sector_t blocks;
		bitmap_counter_t *bmc;

		spin_lock_irq(&bitmap->counts.lock);
		bmc = md_bitmap_get_counter(&bitmap->counts, offset, &blocks, 1);
		if (!bmc) {
			spin_unlock_irq(&bitmap->counts.lock);
			return 0;
		}

		if (unlikely(COUNTER(*bmc) == COUNTER_MAX)) {
			DEFINE_WAIT(__wait);
			/* note that it is safe to do the prepare_to_wait
			 * after the test as long as we do it before dropping
			 * the spinlock.
			 */
			prepare_to_wait(&bitmap->overflow_wait, &__wait,
					TASK_UNINTERRUPTIBLE);
			spin_unlock_irq(&bitmap->counts.lock);
			schedule();
			finish_wait(&bitmap->overflow_wait, &__wait);
			continue;
		}

		switch (*bmc) {
		case 0:
			md_bitmap_file_set_bit(bitmap, offset);
			md_bitmap_count_page(&bitmap->counts, offset, 1);
			fallthrough;
		case 1:
			*bmc = 2;
		}

		(*bmc)++;

		spin_unlock_irq(&bitmap->counts.lock);

		offset += blocks;
		if (sectors > blocks)
			sectors -= blocks;
		else
			sectors = 0;
	}
	return 0;
}
1729
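/*
 * Sketch of how a RAID personality brackets a request (illustrative only;
 * callers reach these helpers through mddev->bitmap_ops rather than
 * directly):
 *
 *	mddev->bitmap_ops->startwrite(mddev, bio->bi_iter.bi_sector,
 *				      bio_sectors(bio));
 *	... submit the write to the member devices ...
 *	mddev->bitmap_ops->endwrite(mddev, bio->bi_iter.bi_sector,
 *				    bio_sectors(bio));
 */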
1730 static void bitmap_endwrite(struct mddev *mddev, sector_t offset,
1731 unsigned long sectors)
1732 {
1733 struct bitmap *bitmap = mddev->bitmap;
1734
1735 if (!bitmap)
1736 return;
1737
1738 while (sectors) {
1739 sector_t blocks;
1740 unsigned long flags;
1741 bitmap_counter_t *bmc;
1742
1743 spin_lock_irqsave(&bitmap->counts.lock, flags);
1744 bmc = md_bitmap_get_counter(&bitmap->counts, offset, &blocks, 0);
1745 if (!bmc) {
1746 spin_unlock_irqrestore(&bitmap->counts.lock, flags);
1747 return;
1748 }
1749
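		/*
		 * On a healthy array, bring events_cleared up to date (and
		 * notify 'can_clear') before any bit may be cleared; on a
		 * degraded array, re-mark the chunk NEEDED instead so it is
		 * resynced once the array recovers.
		 */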
1750 if (!bitmap->mddev->degraded) {
1751 if (bitmap->events_cleared < bitmap->mddev->events) {
1752 bitmap->events_cleared = bitmap->mddev->events;
1753 bitmap->need_sync = 1;
1754 sysfs_notify_dirent_safe(
1755 bitmap->sysfs_can_clear);
1756 }
1757 } else if (!NEEDED(*bmc)) {
1758 *bmc |= NEEDED_MASK;
1759 }
1760
1761 if (COUNTER(*bmc) == COUNTER_MAX)
1762 wake_up(&bitmap->overflow_wait);
1763
1764 (*bmc)--;
1765 if (*bmc <= 2) {
1766 md_bitmap_set_pending(&bitmap->counts, offset);
1767 bitmap->allclean = 0;
1768 }
1769 spin_unlock_irqrestore(&bitmap->counts.lock, flags);
1770 offset += blocks;
1771 if (sectors > blocks)
1772 sectors -= blocks;
1773 else
1774 sectors = 0;
1775 }
1776 }
1777
1778 static bool __bitmap_start_sync(struct bitmap *bitmap, sector_t offset,
1779 sector_t *blocks, bool degraded)
1780 {
1781 bitmap_counter_t *bmc;
1782 bool rv;
1783
1784 if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */
1785 *blocks = 1024;
1786 return true; /* always resync if no bitmap */
1787 }
1788 spin_lock_irq(&bitmap->counts.lock);
1789
1790 rv = false;
1791 bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
1792 if (bmc) {
1793 /* locked */
1794 if (RESYNC(*bmc)) {
1795 rv = true;
1796 } else if (NEEDED(*bmc)) {
1797 rv = true;
1798 if (!degraded) { /* don't set/clear bits if degraded */
1799 *bmc |= RESYNC_MASK;
1800 *bmc &= ~NEEDED_MASK;
1801 }
1802 }
1803 }
1804 spin_unlock_irq(&bitmap->counts.lock);
1805
1806 return rv;
1807 }
1808
1809 static bool bitmap_start_sync(struct mddev *mddev, sector_t offset,
1810 sector_t *blocks, bool degraded)
1811 {
1812 /* bitmap_start_sync must always report on multiples of whole
1813 * pages, otherwise resync (which is very PAGE_SIZE based) will
1814 * get confused.
1815 * So call __bitmap_start_sync repeatedly (if needed) until
1816 * at least PAGE_SIZE>>9 blocks are covered.
1817 * Return the 'or' of the result.
1818 */
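	/* with 4 KiB pages, PAGE_SIZE >> 9 == 8 sectors (illustrative) */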
1819 bool rv = false;
1820 sector_t blocks1;
1821
1822 *blocks = 0;
1823 while (*blocks < (PAGE_SIZE>>9)) {
1824 rv |= __bitmap_start_sync(mddev->bitmap, offset,
1825 &blocks1, degraded);
1826 offset += blocks1;
1827 *blocks += blocks1;
1828 }
1829
1830 return rv;
1831 }
1832
1833 static void __bitmap_end_sync(struct bitmap *bitmap, sector_t offset,
1834 sector_t *blocks, bool aborted)
1835 {
1836 bitmap_counter_t *bmc;
1837 unsigned long flags;
1838
1839 if (bitmap == NULL) {
1840 *blocks = 1024;
1841 return;
1842 }
1843 spin_lock_irqsave(&bitmap->counts.lock, flags);
1844 bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
1845 if (bmc == NULL)
1846 goto unlock;
1847 /* locked */
1848 if (RESYNC(*bmc)) {
1849 *bmc &= ~RESYNC_MASK;
1850
1851 if (!NEEDED(*bmc) && aborted)
1852 *bmc |= NEEDED_MASK;
1853 else {
1854 if (*bmc <= 2) {
1855 md_bitmap_set_pending(&bitmap->counts, offset);
1856 bitmap->allclean = 0;
1857 }
1858 }
1859 }
1860 unlock:
1861 spin_unlock_irqrestore(&bitmap->counts.lock, flags);
1862 }
1863
1864 static void bitmap_end_sync(struct mddev *mddev, sector_t offset,
1865 sector_t *blocks)
1866 {
1867 __bitmap_end_sync(mddev->bitmap, offset, blocks, true);
1868 }
1869
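/*
 * Typical flow (illustrative): a personality calls bitmap_start_sync() per
 * region to learn whether it must be resynced, bitmap_end_sync() when a
 * resync is being abandoned (note the 'aborted' argument above), and
 * bitmap_close_sync() once a successful pass has completed.
 */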
1870 static void bitmap_close_sync(struct mddev *mddev)
1871 {
1872 /* Sync has finished, and any bitmap chunks that weren't synced
1873 * properly have been aborted. It remains to us to clear the
1874 * RESYNC bit wherever it is still on
1875 */
1876 sector_t sector = 0;
1877 sector_t blocks;
1878 struct bitmap *bitmap = mddev->bitmap;
1879
1880 if (!bitmap)
1881 return;
1882
1883 while (sector < bitmap->mddev->resync_max_sectors) {
1884 __bitmap_end_sync(bitmap, sector, &blocks, false);
1885 sector += blocks;
1886 }
1887 }
1888
1889 static void bitmap_cond_end_sync(struct mddev *mddev, sector_t sector,
1890 bool force)
1891 {
1892 sector_t s = 0;
1893 sector_t blocks;
1894 struct bitmap *bitmap = mddev->bitmap;
1895
1896 if (!bitmap)
1897 return;
1898 if (sector == 0) {
1899 bitmap->last_end_sync = jiffies;
1900 return;
1901 }
1902 if (!force && time_before(jiffies, (bitmap->last_end_sync
1903 + bitmap->mddev->bitmap_info.daemon_sleep)))
1904 return;
1905 wait_event(bitmap->mddev->recovery_wait,
1906 atomic_read(&bitmap->mddev->recovery_active) == 0);
1907
1908 bitmap->mddev->curr_resync_completed = sector;
1909 set_bit(MD_SB_CHANGE_CLEAN, &bitmap->mddev->sb_flags);
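	/* round down to a chunk boundary so only whole chunks are closed */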
1910 sector &= ~((1ULL << bitmap->counts.chunkshift) - 1);
1911 s = 0;
1912 while (s < sector && s < bitmap->mddev->resync_max_sectors) {
1913 __bitmap_end_sync(bitmap, s, &blocks, false);
1914 s += blocks;
1915 }
1916 bitmap->last_end_sync = jiffies;
1917 sysfs_notify_dirent_safe(bitmap->mddev->sysfs_completed);
1918 }
1919
1920 static void bitmap_sync_with_cluster(struct mddev *mddev,
1921 sector_t old_lo, sector_t old_hi,
1922 sector_t new_lo, sector_t new_hi)
1923 {
1924 struct bitmap *bitmap = mddev->bitmap;
1925 sector_t sector, blocks = 0;
1926
1927 for (sector = old_lo; sector < new_lo; ) {
1928 __bitmap_end_sync(bitmap, sector, &blocks, false);
1929 sector += blocks;
1930 }
1931 WARN((blocks > new_lo) && old_lo, "alignment is not correct for lo\n");
1932
1933 for (sector = old_hi; sector < new_hi; ) {
1934 bitmap_start_sync(mddev, sector, &blocks, false);
1935 sector += blocks;
1936 }
1937 WARN((blocks > new_hi) && old_hi, "alignment is not correct for hi\n");
1938 }
1939
1940 static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed)
1941 {
1942 /* For each chunk covered by any of these sectors, set the
1943 * counter to 2 and possibly set resync_needed. They should all
1944 * be 0 at this point
1945 */
1946
1947 sector_t secs;
1948 bitmap_counter_t *bmc;
1949 spin_lock_irq(&bitmap->counts.lock);
1950 bmc = md_bitmap_get_counter(&bitmap->counts, offset, &secs, 1);
1951 if (!bmc) {
1952 spin_unlock_irq(&bitmap->counts.lock);
1953 return;
1954 }
1955 if (!*bmc) {
1956 *bmc = 2;
1957 md_bitmap_count_page(&bitmap->counts, offset, 1);
1958 md_bitmap_set_pending(&bitmap->counts, offset);
1959 bitmap->allclean = 0;
1960 }
1961 if (needed)
1962 *bmc |= NEEDED_MASK;
1963 spin_unlock_irq(&bitmap->counts.lock);
1964 }
1965
1966 /* dirty the memory and file bits for bitmap chunks "s" to "e" */
1967 static void bitmap_dirty_bits(struct mddev *mddev, unsigned long s,
1968 unsigned long e)
1969 {
1970 unsigned long chunk;
1971 struct bitmap *bitmap = mddev->bitmap;
1972
1973 if (!bitmap)
1974 return;
1975
1976 for (chunk = s; chunk <= e; chunk++) {
1977 sector_t sec = (sector_t)chunk << bitmap->counts.chunkshift;
1978
1979 md_bitmap_set_memory_bits(bitmap, sec, 1);
1980 md_bitmap_file_set_bit(bitmap, sec);
1981 if (sec < bitmap->mddev->recovery_cp)
1982 /* We are asserting that the array is dirty,
1983 * so move the recovery_cp address back so
1984 * that it is obvious that it is dirty
1985 */
1986 bitmap->mddev->recovery_cp = sec;
1987 }
1988 }
1989
1990 static void bitmap_flush(struct mddev *mddev)
1991 {
1992 struct bitmap *bitmap = mddev->bitmap;
1993 long sleep;
1994
1995 if (!bitmap) /* there was no bitmap */
1996 return;
1997
1998 /* run the daemon_work three times to ensure everything that
1999 * can be flushed is flushed
2000 */
2001 sleep = mddev->bitmap_info.daemon_sleep * 2;
2002 bitmap->daemon_lastrun -= sleep;
2003 bitmap_daemon_work(mddev);
2004 bitmap->daemon_lastrun -= sleep;
2005 bitmap_daemon_work(mddev);
2006 bitmap->daemon_lastrun -= sleep;
2007 bitmap_daemon_work(mddev);
2008 if (mddev->bitmap_info.external)
2009 md_super_wait(mddev);
2010 bitmap_update_sb(bitmap);
2011 }
2012
2013 static void md_bitmap_free(void *data)
2014 {
2015 unsigned long k, pages;
2016 struct bitmap_page *bp;
2017 struct bitmap *bitmap = data;
2018
2019 if (!bitmap) /* there was no bitmap */
2020 return;
2021
2022 if (bitmap->sysfs_can_clear)
2023 sysfs_put(bitmap->sysfs_can_clear);
2024
2025 if (mddev_is_clustered(bitmap->mddev) && bitmap->mddev->cluster_info &&
2026 bitmap->cluster_slot == bitmap->mddev->cluster_ops->slot_number(bitmap->mddev))
2027 md_cluster_stop(bitmap->mddev);
2028
2029 /* Shouldn't be needed - but just in case.... */
2030 wait_event(bitmap->write_wait,
2031 atomic_read(&bitmap->pending_writes) == 0);
2032
2033 /* release the bitmap file */
2034 md_bitmap_file_unmap(&bitmap->storage);
2035
2036 bp = bitmap->counts.bp;
2037 pages = bitmap->counts.pages;
2038
2039 /* free all allocated memory */
2040
2041 if (bp) /* deallocate the page memory */
2042 for (k = 0; k < pages; k++)
2043 if (bp[k].map && !bp[k].hijacked)
2044 kfree(bp[k].map);
2045 kfree(bp);
2046 kfree(bitmap);
2047 }
2048
2049 static void bitmap_start_behind_write(struct mddev *mddev)
2050 {
2051 struct bitmap *bitmap = mddev->bitmap;
2052 int bw;
2053
2054 if (!bitmap)
2055 return;
2056
2057 atomic_inc(&bitmap->behind_writes);
2058 bw = atomic_read(&bitmap->behind_writes);
2059 if (bw > bitmap->behind_writes_used)
2060 bitmap->behind_writes_used = bw;
2061
2062 pr_debug("inc write-behind count %d/%lu\n",
2063 bw, bitmap->mddev->bitmap_info.max_write_behind);
2064 }
2065
2066 static void bitmap_end_behind_write(struct mddev *mddev)
2067 {
2068 struct bitmap *bitmap = mddev->bitmap;
2069
2070 if (!bitmap)
2071 return;
2072
2073 if (atomic_dec_and_test(&bitmap->behind_writes))
2074 wake_up(&bitmap->behind_wait);
2075 pr_debug("dec write-behind count %d/%lu\n",
2076 atomic_read(&bitmap->behind_writes),
2077 bitmap->mddev->bitmap_info.max_write_behind);
2078 }
2079
2080 static void bitmap_wait_behind_writes(struct mddev *mddev)
2081 {
2082 struct bitmap *bitmap = mddev->bitmap;
2083
2084 /* wait for behind writes to complete */
2085 if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
2086 pr_debug("md:%s: behind writes in progress - waiting to stop.\n",
2087 mdname(mddev));
2088 /* need to kick something here to make sure I/O goes? */
2089 wait_event(bitmap->behind_wait,
2090 atomic_read(&bitmap->behind_writes) == 0);
2091 }
2092 }
2093
2094 static void bitmap_destroy(struct mddev *mddev)
2095 {
2096 struct bitmap *bitmap = mddev->bitmap;
2097
2098 if (!bitmap) /* there was no bitmap */
2099 return;
2100
2101 bitmap_wait_behind_writes(mddev);
2102 if (!mddev->serialize_policy)
2103 mddev_destroy_serial_pool(mddev, NULL);
2104
2105 mutex_lock(&mddev->bitmap_info.mutex);
2106 spin_lock(&mddev->lock);
2107 mddev->bitmap = NULL; /* disconnect from the md device */
2108 spin_unlock(&mddev->lock);
2109 mutex_unlock(&mddev->bitmap_info.mutex);
2110 mddev_set_timeout(mddev, MAX_SCHEDULE_TIMEOUT, true);
2111
2112 md_bitmap_free(bitmap);
2113 }
2114
2115 /*
2116 * initialize the bitmap structure
2117 * if this returns an error, bitmap_destroy must be called to do clean up
2118 * once mddev->bitmap is set
2119 */
2120 static struct bitmap *__bitmap_create(struct mddev *mddev, int slot)
2121 {
2122 struct bitmap *bitmap;
2123 sector_t blocks = mddev->resync_max_sectors;
2124 struct file *file = mddev->bitmap_info.file;
2125 int err;
2126 struct kernfs_node *bm = NULL;
2127
2128 BUILD_BUG_ON(sizeof(bitmap_super_t) != 256);
2129
2130 BUG_ON(file && mddev->bitmap_info.offset);
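	/* a bitmap lives either in a file or at an offset on each rdev, never both */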
2131
2132 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
2133 pr_notice("md/raid:%s: array with journal cannot have bitmap\n",
2134 mdname(mddev));
2135 return ERR_PTR(-EBUSY);
2136 }
2137
2138 bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
2139 if (!bitmap)
2140 return ERR_PTR(-ENOMEM);
2141
2142 spin_lock_init(&bitmap->counts.lock);
2143 atomic_set(&bitmap->pending_writes, 0);
2144 init_waitqueue_head(&bitmap->write_wait);
2145 init_waitqueue_head(&bitmap->overflow_wait);
2146 init_waitqueue_head(&bitmap->behind_wait);
2147
2148 bitmap->mddev = mddev;
2149 bitmap->cluster_slot = slot;
2150
2151 if (mddev->kobj.sd)
2152 bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap");
2153 if (bm) {
2154 bitmap->sysfs_can_clear = sysfs_get_dirent(bm, "can_clear");
2155 sysfs_put(bm);
2156 } else
2157 bitmap->sysfs_can_clear = NULL;
2158
2159 bitmap->storage.file = file;
2160 if (file) {
2161 get_file(file);
2162 /* As future accesses to this file will use bmap,
2163 * and bypass the page cache, we must sync the file
2164 * first.
2165 */
2166 vfs_fsync(file, 1);
2167 }
2168 /* read superblock from bitmap file (this sets mddev->bitmap_info.chunksize) */
2169 if (!mddev->bitmap_info.external) {
2170 /*
2171 * If 'MD_ARRAY_FIRST_USE' is set, then device-mapper is
2172 * instructing us to create a new on-disk bitmap instance.
2173 */
2174 if (test_and_clear_bit(MD_ARRAY_FIRST_USE, &mddev->flags))
2175 err = md_bitmap_new_disk_sb(bitmap);
2176 else
2177 err = md_bitmap_read_sb(bitmap);
2178 } else {
2179 err = 0;
2180 if (mddev->bitmap_info.chunksize == 0 ||
2181 mddev->bitmap_info.daemon_sleep == 0)
2182 /* chunksize and time_base need to be
2183 * set first. */
2184 err = -EINVAL;
2185 }
2186 if (err)
2187 goto error;
2188
2189 bitmap->daemon_lastrun = jiffies;
2190 err = __bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize,
2191 true);
2192 if (err)
2193 goto error;
2194
2195 pr_debug("created bitmap (%lu pages) for device %s\n",
2196 bitmap->counts.pages, bmname(bitmap));
2197
2198 err = test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0;
2199 if (err)
2200 goto error;
2201
2202 return bitmap;
2203 error:
2204 md_bitmap_free(bitmap);
2205 return ERR_PTR(err);
2206 }
2207
2208 static int bitmap_create(struct mddev *mddev, int slot)
2209 {
2210 struct bitmap *bitmap = __bitmap_create(mddev, slot);
2211
2212 if (IS_ERR(bitmap))
2213 return PTR_ERR(bitmap);
2214
2215 mddev->bitmap = bitmap;
2216 return 0;
2217 }
2218
2219 static int bitmap_load(struct mddev *mddev)
2220 {
2221 int err = 0;
2222 sector_t start = 0;
2223 sector_t sector = 0;
2224 struct bitmap *bitmap = mddev->bitmap;
2225 struct md_rdev *rdev;
2226
2227 if (!bitmap)
2228 goto out;
2229
2230 rdev_for_each(rdev, mddev)
2231 mddev_create_serial_pool(mddev, rdev);
2232
2233 if (mddev_is_clustered(mddev))
2234 mddev->cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes);
2235
2236 /* Clear out old bitmap info first: Either there is none, or we
2237 * are resuming after someone else has possibly changed things,
2238 * so we should forget old cached info.
2239 * All chunks should be clean, but some might need_sync.
2240 */
2241 while (sector < mddev->resync_max_sectors) {
2242 sector_t blocks;
2243 bitmap_start_sync(mddev, sector, &blocks, false);
2244 sector += blocks;
2245 }
2246 bitmap_close_sync(mddev);
2247
2248 if (mddev->degraded == 0
2249 || bitmap->events_cleared == mddev->events)
2250 /* no need to keep dirty bits to optimise a
2251 * re-add of a missing device */
2252 start = mddev->recovery_cp;
2253
2254 mutex_lock(&mddev->bitmap_info.mutex);
2255 err = md_bitmap_init_from_disk(bitmap, start);
2256 mutex_unlock(&mddev->bitmap_info.mutex);
2257
2258 if (err)
2259 goto out;
2260 clear_bit(BITMAP_STALE, &bitmap->flags);
2261
2262 /* Kick recovery in case any bits were set */
2263 set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery);
2264
2265 mddev_set_timeout(mddev, mddev->bitmap_info.daemon_sleep, true);
2266 md_wakeup_thread(mddev->thread);
2267
2268 bitmap_update_sb(bitmap);
2269
2270 if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
2271 err = -EIO;
2272 out:
2273 return err;
2274 }
2275
2276 /* caller needs to free the returned bitmap with md_bitmap_free() */
2277 static void *bitmap_get_from_slot(struct mddev *mddev, int slot)
2278 {
2279 int rv = 0;
2280 struct bitmap *bitmap;
2281
2282 bitmap = __bitmap_create(mddev, slot);
2283 if (IS_ERR(bitmap)) {
2284 rv = PTR_ERR(bitmap);
2285 return ERR_PTR(rv);
2286 }
2287
2288 rv = md_bitmap_init_from_disk(bitmap, 0);
2289 if (rv) {
2290 md_bitmap_free(bitmap);
2291 return ERR_PTR(rv);
2292 }
2293
2294 return bitmap;
2295 }
2296
2297 /* Loads the bitmap associated with slot and copies the resync information
2298 * to our bitmap
2299 */
2300 static int bitmap_copy_from_slot(struct mddev *mddev, int slot, sector_t *low,
2301 sector_t *high, bool clear_bits)
2302 {
2303 int rv = 0, i, j;
2304 sector_t block, lo = 0, hi = 0;
2305 struct bitmap_counts *counts;
2306 struct bitmap *bitmap;
2307
2308 bitmap = bitmap_get_from_slot(mddev, slot);
2309 if (IS_ERR(bitmap)) {
2310 pr_err("%s can't get bitmap from slot %d\n", __func__, slot);
2311 return -1;
2312 }
2313
2314 counts = &bitmap->counts;
2315 for (j = 0; j < counts->chunks; j++) {
2316 block = (sector_t)j << counts->chunkshift;
2317 if (md_bitmap_file_test_bit(bitmap, block)) {
2318 if (!lo)
2319 lo = block;
2320 hi = block;
2321 md_bitmap_file_clear_bit(bitmap, block);
2322 md_bitmap_set_memory_bits(mddev->bitmap, block, 1);
2323 md_bitmap_file_set_bit(mddev->bitmap, block);
2324 }
2325 }
2326
2327 if (clear_bits) {
2328 bitmap_update_sb(bitmap);
2329 /* BITMAP_PAGE_PENDING is set, but bitmap_unplug needs
2330 * BITMAP_PAGE_DIRTY or _NEEDWRITE to write ... */
2331 for (i = 0; i < bitmap->storage.file_pages; i++)
2332 if (test_page_attr(bitmap, i, BITMAP_PAGE_PENDING))
2333 set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE);
2334 __bitmap_unplug(bitmap);
2335 }
2336 __bitmap_unplug(mddev->bitmap);
2337 *low = lo;
2338 *high = hi;
2339 md_bitmap_free(bitmap);
2340
2341 return rv;
2342 }
2343
2344 static void bitmap_set_pages(void *data, unsigned long pages)
2345 {
2346 struct bitmap *bitmap = data;
2347
2348 bitmap->counts.pages = pages;
2349 }
2350
2351 static int bitmap_get_stats(void *data, struct md_bitmap_stats *stats)
2352 {
2353 struct bitmap_storage *storage;
2354 struct bitmap_counts *counts;
2355 struct bitmap *bitmap = data;
2356 bitmap_super_t *sb;
2357
2358 if (!bitmap)
2359 return -ENOENT;
2360 if (bitmap->mddev->bitmap_info.external)
2361 return -ENOENT;
2362 if (!bitmap->storage.sb_page) /* no superblock */
2363 return -EINVAL;
2364 sb = kmap_local_page(bitmap->storage.sb_page);
2365 stats->sync_size = le64_to_cpu(sb->sync_size);
2366 kunmap_local(sb);
2367
2368 counts = &bitmap->counts;
2369 stats->missing_pages = counts->missing_pages;
2370 stats->pages = counts->pages;
2371
2372 storage = &bitmap->storage;
2373 stats->file_pages = storage->file_pages;
2374 stats->file = storage->file;
2375
2376 stats->behind_writes = atomic_read(&bitmap->behind_writes);
2377 stats->behind_wait = wq_has_sleeper(&bitmap->behind_wait);
2378 stats->events_cleared = bitmap->events_cleared;
2379 return 0;
2380 }
2381
2382 static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks,
2383 int chunksize, bool init)
2384 {
2385 /* If chunk_size is 0, choose an appropriate chunk size.
2386 * Then possibly allocate new storage space.
2387 * Then quiesce, copy bits, replace bitmap, and re-start
2388 *
2389 * This function is called both to set up the initial bitmap
2390 * and to resize the bitmap while the array is active.
2391 * If this happens as a result of the array being resized,
2392 * chunksize will be zero, and we need to choose a suitable
2393 * chunksize, otherwise we use what we are given.
2394 */
2395 struct bitmap_storage store;
2396 struct bitmap_counts old_counts;
2397 unsigned long chunks;
2398 sector_t block;
2399 sector_t old_blocks, new_blocks;
2400 int chunkshift;
2401 int ret = 0;
2402 long pages;
2403 struct bitmap_page *new_bp;
2404
2405 if (bitmap->storage.file && !init) {
2406 pr_info("md: cannot resize file-based bitmap\n");
2407 return -EINVAL;
2408 }
2409
2410 if (chunksize == 0) {
2411 /* If there is enough space, leave the chunk size unchanged,
2412 * else increase by factor of two until there is enough space.
2413 */
2414 long bytes;
2415 long space = bitmap->mddev->bitmap_info.space;
2416
2417 if (space == 0) {
2418 /* We don't know how much space there is, so limit
2419 * to current size - in sectors.
2420 */
2421 bytes = DIV_ROUND_UP(bitmap->counts.chunks, 8);
2422 if (!bitmap->mddev->bitmap_info.external)
2423 bytes += sizeof(bitmap_super_t);
2424 space = DIV_ROUND_UP(bytes, 512);
2425 bitmap->mddev->bitmap_info.space = space;
2426 }
2427 chunkshift = bitmap->counts.chunkshift;
2428 chunkshift--;
2429 do {
2430 /* 'chunkshift' is shift from block size to chunk size */
2431 chunkshift++;
2432 chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift);
2433 bytes = DIV_ROUND_UP(chunks, 8);
2434 if (!bitmap->mddev->bitmap_info.external)
2435 bytes += sizeof(bitmap_super_t);
2436 } while (bytes > (space << 9) && (chunkshift + BITMAP_BLOCK_SHIFT) <
2437 (BITS_PER_BYTE * sizeof(((bitmap_super_t *)0)->chunksize) - 1));
2438 } else
2439 chunkshift = ffz(~chunksize) - BITMAP_BLOCK_SHIFT;
2440
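	/*
	 * Illustrative numbers: a 1 TiB array (blocks == 2^31 sectors) with
	 * 64 MiB chunks (chunkshift == 17) yields chunks == 2^14, i.e. 2 KiB
	 * of bitmap before any superblock is added.  For an explicit
	 * power-of-two 'chunksize', ffz(~chunksize) above is simply its log2.
	 */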
2441 chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift);
2442 memset(&store, 0, sizeof(store));
2443 if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file)
2444 ret = md_bitmap_storage_alloc(&store, chunks,
2445 !bitmap->mddev->bitmap_info.external,
2446 mddev_is_clustered(bitmap->mddev)
2447 ? bitmap->cluster_slot : 0);
2448 if (ret) {
2449 md_bitmap_file_unmap(&store);
2450 goto err;
2451 }
2452
2453 pages = DIV_ROUND_UP(chunks, PAGE_COUNTER_RATIO);
2454
2455 new_bp = kcalloc(pages, sizeof(*new_bp), GFP_KERNEL);
2456 ret = -ENOMEM;
2457 if (!new_bp) {
2458 md_bitmap_file_unmap(&store);
2459 goto err;
2460 }
2461
2462 if (!init)
2463 bitmap->mddev->pers->quiesce(bitmap->mddev, 1);
2464
2465 store.file = bitmap->storage.file;
2466 bitmap->storage.file = NULL;
2467
2468 if (store.sb_page && bitmap->storage.sb_page)
2469 memcpy(page_address(store.sb_page),
2470 page_address(bitmap->storage.sb_page),
2471 sizeof(bitmap_super_t));
2472 spin_lock_irq(&bitmap->counts.lock);
2473 md_bitmap_file_unmap(&bitmap->storage);
2474 bitmap->storage = store;
2475
2476 old_counts = bitmap->counts;
2477 bitmap->counts.bp = new_bp;
2478 bitmap->counts.pages = pages;
2479 bitmap->counts.missing_pages = pages;
2480 bitmap->counts.chunkshift = chunkshift;
2481 bitmap->counts.chunks = chunks;
2482 bitmap->mddev->bitmap_info.chunksize = 1UL << (chunkshift +
2483 BITMAP_BLOCK_SHIFT);
2484
2485 blocks = min(old_counts.chunks << old_counts.chunkshift,
2486 chunks << chunkshift);
2487
2488 /* For cluster raid, need to pre-allocate bitmap */
2489 if (mddev_is_clustered(bitmap->mddev)) {
2490 unsigned long page;
2491 for (page = 0; page < pages; page++) {
2492 ret = md_bitmap_checkpage(&bitmap->counts, page, 1, 1);
2493 if (ret) {
2494 unsigned long k;
2495
2496 /* deallocate the page memory */
2497 for (k = 0; k < page; k++) {
2498 kfree(new_bp[k].map);
2499 }
2500 kfree(new_bp);
2501
2502 /* restore some fields from old_counts */
2503 bitmap->counts.bp = old_counts.bp;
2504 bitmap->counts.pages = old_counts.pages;
2505 bitmap->counts.missing_pages = old_counts.pages;
2506 bitmap->counts.chunkshift = old_counts.chunkshift;
2507 bitmap->counts.chunks = old_counts.chunks;
2508 bitmap->mddev->bitmap_info.chunksize =
2509 1UL << (old_counts.chunkshift + BITMAP_BLOCK_SHIFT);
2510 blocks = old_counts.chunks << old_counts.chunkshift;
2511 pr_warn("Could not pre-allocate in-memory bitmap for cluster raid\n");
2512 break;
2513 } else
2514 bitmap->counts.bp[page].count += 1;
2515 }
2516 }
2517
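	/*
	 * Walk the old counter space and propagate every chunk that still
	 * needs syncing into the (possibly differently sized) new layout,
	 * setting on-disk bits for any freshly dirtied new chunks.
	 */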
2518 for (block = 0; block < blocks; ) {
2519 bitmap_counter_t *bmc_old, *bmc_new;
2520 int set;
2521
2522 bmc_old = md_bitmap_get_counter(&old_counts, block, &old_blocks, 0);
2523 set = bmc_old && NEEDED(*bmc_old);
2524
2525 if (set) {
2526 bmc_new = md_bitmap_get_counter(&bitmap->counts, block, &new_blocks, 1);
2527 if (bmc_new) {
2528 if (*bmc_new == 0) {
2529 /* need to set on-disk bits too. */
2530 sector_t end = block + new_blocks;
2531 sector_t start = block >> chunkshift;
2532
2533 start <<= chunkshift;
2534 while (start < end) {
2535 md_bitmap_file_set_bit(bitmap, block);
2536 start += 1 << chunkshift;
2537 }
2538 *bmc_new = 2;
2539 md_bitmap_count_page(&bitmap->counts, block, 1);
2540 md_bitmap_set_pending(&bitmap->counts, block);
2541 }
2542 *bmc_new |= NEEDED_MASK;
2543 }
2544 if (new_blocks < old_blocks)
2545 old_blocks = new_blocks;
2546 }
2547 block += old_blocks;
2548 }
2549
2550 if (bitmap->counts.bp != old_counts.bp) {
2551 unsigned long k;
2552 for (k = 0; k < old_counts.pages; k++)
2553 if (!old_counts.bp[k].hijacked)
2554 kfree(old_counts.bp[k].map);
2555 kfree(old_counts.bp);
2556 }
2557
2558 if (!init) {
2559 int i;
2560 while (block < (chunks << chunkshift)) {
2561 bitmap_counter_t *bmc;
2562 bmc = md_bitmap_get_counter(&bitmap->counts, block, &new_blocks, 1);
2563 if (bmc) {
2564 /* new space. It needs to be resynced, so
2565 * we set NEEDED_MASK.
2566 */
2567 if (*bmc == 0) {
2568 *bmc = NEEDED_MASK | 2;
2569 md_bitmap_count_page(&bitmap->counts, block, 1);
2570 md_bitmap_set_pending(&bitmap->counts, block);
2571 }
2572 }
2573 block += new_blocks;
2574 }
2575 for (i = 0; i < bitmap->storage.file_pages; i++)
2576 set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
2577 }
2578 spin_unlock_irq(&bitmap->counts.lock);
2579
2580 if (!init) {
2581 __bitmap_unplug(bitmap);
2582 bitmap->mddev->pers->quiesce(bitmap->mddev, 0);
2583 }
2584 ret = 0;
2585 err:
2586 return ret;
2587 }
2588
2589 static int bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize,
2590 bool init)
2591 {
2592 struct bitmap *bitmap = mddev->bitmap;
2593
2594 if (!bitmap)
2595 return 0;
2596
2597 return __bitmap_resize(bitmap, blocks, chunksize, init);
2598 }
2599
2600 static ssize_t
2601 location_show(struct mddev *mddev, char *page)
2602 {
2603 ssize_t len;
2604 if (mddev->bitmap_info.file)
2605 len = sprintf(page, "file");
2606 else if (mddev->bitmap_info.offset)
2607 len = sprintf(page, "%+lld", (long long)mddev->bitmap_info.offset);
2608 else
2609 len = sprintf(page, "none");
2610 len += sprintf(page+len, "\n");
2611 return len;
2612 }
2613
2614 static ssize_t
2615 location_store(struct mddev *mddev, const char *buf, size_t len)
2616 {
2617 int rv;
2618
2619 rv = mddev_suspend_and_lock(mddev);
2620 if (rv)
2621 return rv;
2622
2623 if (mddev->pers) {
2624 if (mddev->recovery || mddev->sync_thread) {
2625 rv = -EBUSY;
2626 goto out;
2627 }
2628 }
2629
2630 if (mddev->bitmap || mddev->bitmap_info.file ||
2631 mddev->bitmap_info.offset) {
2632 /* bitmap already configured. Only option is to clear it */
2633 if (strncmp(buf, "none", 4) != 0) {
2634 rv = -EBUSY;
2635 goto out;
2636 }
2637
2638 bitmap_destroy(mddev);
2639 mddev->bitmap_info.offset = 0;
2640 if (mddev->bitmap_info.file) {
2641 struct file *f = mddev->bitmap_info.file;
2642 mddev->bitmap_info.file = NULL;
2643 fput(f);
2644 }
2645 } else {
2646 /* No bitmap, OK to set a location */
2647 long long offset;
2648
2649 if (strncmp(buf, "none", 4) == 0)
2650 /* nothing to be done */;
2651 else if (strncmp(buf, "file:", 5) == 0) {
2652 /* Not supported yet */
2653 rv = -EINVAL;
2654 goto out;
2655 } else {
2656 if (buf[0] == '+')
2657 rv = kstrtoll(buf+1, 10, &offset);
2658 else
2659 rv = kstrtoll(buf, 10, &offset);
2660 if (rv)
2661 goto out;
2662 if (offset == 0) {
2663 rv = -EINVAL;
2664 goto out;
2665 }
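			/* v0.90 metadata only supports the fixed default bitmap offset */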
2666 if (mddev->bitmap_info.external == 0 &&
2667 mddev->major_version == 0 &&
2668 offset != mddev->bitmap_info.default_offset) {
2669 rv = -EINVAL;
2670 goto out;
2671 }
2672
2673 mddev->bitmap_info.offset = offset;
2674 rv = bitmap_create(mddev, -1);
2675 if (rv)
2676 goto out;
2677
2678 rv = bitmap_load(mddev);
2679 if (rv) {
2680 mddev->bitmap_info.offset = 0;
2681 bitmap_destroy(mddev);
2682 goto out;
2683 }
2684 }
2685 }
2686 if (!mddev->external) {
2687 /* Ensure new bitmap info is stored in
2688 * metadata promptly.
2689 */
2690 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2691 md_wakeup_thread(mddev->thread);
2692 }
2693 rv = 0;
2694 out:
2695 mddev_unlock_and_resume(mddev);
2696 if (rv)
2697 return rv;
2698 return len;
2699 }
2700
2701 static struct md_sysfs_entry bitmap_location =
2702 __ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store);
2703
2704 /* 'bitmap/space' is the space available at 'location' for the
2705 * bitmap. This allows the kernel to know when it is safe to
2706 * resize the bitmap to match a resized array.
2707 */
2708 static ssize_t
2709 space_show(struct mddev *mddev, char *page)
2710 {
2711 return sprintf(page, "%lu\n", mddev->bitmap_info.space);
2712 }
2713
2714 static ssize_t
2715 space_store(struct mddev *mddev, const char *buf, size_t len)
2716 {
2717 struct bitmap *bitmap;
2718 unsigned long sectors;
2719 int rv;
2720
2721 rv = kstrtoul(buf, 10, &sectors);
2722 if (rv)
2723 return rv;
2724
2725 if (sectors == 0)
2726 return -EINVAL;
2727
2728 bitmap = mddev->bitmap;
2729 if (bitmap && sectors < (bitmap->storage.bytes + 511) >> 9)
2730 return -EFBIG; /* Bitmap is too big for this small space */
2731
2732 /* could make sure it isn't too big, but that isn't really
2733 * needed - user-space should be careful.
2734 */
2735 mddev->bitmap_info.space = sectors;
2736 return len;
2737 }
2738
2739 static struct md_sysfs_entry bitmap_space =
2740 __ATTR(space, S_IRUGO|S_IWUSR, space_show, space_store);
2741
2742 static ssize_t
2743 timeout_show(struct mddev *mddev, char *page)
2744 {
2745 ssize_t len;
2746 unsigned long secs = mddev->bitmap_info.daemon_sleep / HZ;
2747 unsigned long jifs = mddev->bitmap_info.daemon_sleep % HZ;
2748
2749 len = sprintf(page, "%lu", secs);
2750 if (jifs)
2751 len += sprintf(page+len, ".%03u", jiffies_to_msecs(jifs));
2752 len += sprintf(page+len, "\n");
2753 return len;
2754 }
2755
2756 static ssize_t
2757 timeout_store(struct mddev *mddev, const char *buf, size_t len)
2758 {
2759 /* timeout can be set at any time */
2760 unsigned long timeout;
2761 int rv = strict_strtoul_scaled(buf, &timeout, 4);
2762 if (rv)
2763 return rv;
2764
2765 /* just to make sure we don't overflow... */
2766 if (timeout >= LONG_MAX / HZ)
2767 return -EINVAL;
2768
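	/*
	 * The value was parsed with four decimal places, so 'timeout' is in
	 * units of 100us here; e.g. an input of "5.5" parses to 55000, which
	 * the conversion below turns into 5.5 * HZ jiffies (illustrative).
	 */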
2769 timeout = timeout * HZ / 10000;
2770
2771 if (timeout >= MAX_SCHEDULE_TIMEOUT)
2772 timeout = MAX_SCHEDULE_TIMEOUT-1;
2773 if (timeout < 1)
2774 timeout = 1;
2775
2776 mddev->bitmap_info.daemon_sleep = timeout;
2777 mddev_set_timeout(mddev, timeout, false);
2778 md_wakeup_thread(mddev->thread);
2779
2780 return len;
2781 }
2782
2783 static struct md_sysfs_entry bitmap_timeout =
2784 __ATTR(time_base, S_IRUGO|S_IWUSR, timeout_show, timeout_store);
2785
2786 static ssize_t
2787 backlog_show(struct mddev *mddev, char *page)
2788 {
2789 return sprintf(page, "%lu\n", mddev->bitmap_info.max_write_behind);
2790 }
2791
2792 static ssize_t
2793 backlog_store(struct mddev *mddev, const char *buf, size_t len)
2794 {
2795 unsigned long backlog;
2796 unsigned long old_mwb = mddev->bitmap_info.max_write_behind;
2797 struct md_rdev *rdev;
2798 bool has_write_mostly = false;
2799 int rv = kstrtoul(buf, 10, &backlog);
2800 if (rv)
2801 return rv;
2802 if (backlog > COUNTER_MAX)
2803 return -EINVAL;
2804
2805 rv = mddev_suspend_and_lock(mddev);
2806 if (rv)
2807 return rv;
2808
2809 /*
2810 * Without a write-mostly device, it doesn't make sense to set
2811 * a backlog for max_write_behind.
2812 */
2813 rdev_for_each(rdev, mddev) {
2814 if (test_bit(WriteMostly, &rdev->flags)) {
2815 has_write_mostly = true;
2816 break;
2817 }
2818 }
2819 if (!has_write_mostly) {
2820 pr_warn_ratelimited("%s: can't set backlog, no write mostly device available\n",
2821 mdname(mddev));
2822 mddev_unlock_and_resume(mddev);
2823 return -EINVAL;
2824 }
2825
2826 mddev->bitmap_info.max_write_behind = backlog;
2827 if (!backlog && mddev->serial_info_pool) {
2828 /* serial_info_pool is not needed if backlog is zero */
2829 if (!mddev->serialize_policy)
2830 mddev_destroy_serial_pool(mddev, NULL);
2831 } else if (backlog && !mddev->serial_info_pool) {
2832 /* serial_info_pool is needed since backlog is not zero */
2833 rdev_for_each(rdev, mddev)
2834 mddev_create_serial_pool(mddev, rdev);
2835 }
2836 if (old_mwb != backlog)
2837 bitmap_update_sb(mddev->bitmap);
2838
2839 mddev_unlock_and_resume(mddev);
2840 return len;
2841 }
2842
2843 static struct md_sysfs_entry bitmap_backlog =
2844 __ATTR(backlog, S_IRUGO|S_IWUSR, backlog_show, backlog_store);
2845
2846 static ssize_t
2847 chunksize_show(struct mddev *mddev, char *page)
2848 {
2849 return sprintf(page, "%lu\n", mddev->bitmap_info.chunksize);
2850 }
2851
2852 static ssize_t
2853 chunksize_store(struct mddev *mddev, const char *buf, size_t len)
2854 {
2855 /* Can only be changed when no bitmap is active */
2856 int rv;
2857 unsigned long csize;
2858 if (mddev->bitmap)
2859 return -EBUSY;
2860 rv = kstrtoul(buf, 10, &csize);
2861 if (rv)
2862 return rv;
2863 if (csize < 512 ||
2864 !is_power_of_2(csize))
2865 return -EINVAL;
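	/* the superblock stores chunksize in a 32-bit field, hence the cap */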
2866 if (BITS_PER_LONG > 32 && csize >= (1ULL << (BITS_PER_BYTE *
2867 sizeof(((bitmap_super_t *)0)->chunksize))))
2868 return -EOVERFLOW;
2869 mddev->bitmap_info.chunksize = csize;
2870 return len;
2871 }
2872
2873 static struct md_sysfs_entry bitmap_chunksize =
2874 __ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store);
2875
2876 static ssize_t metadata_show(struct mddev *mddev, char *page)
2877 {
2878 if (mddev_is_clustered(mddev))
2879 return sprintf(page, "clustered\n");
2880 return sprintf(page, "%s\n", (mddev->bitmap_info.external
2881 ? "external" : "internal"));
2882 }
2883
2884 static ssize_t metadata_store(struct mddev *mddev, const char *buf, size_t len)
2885 {
2886 if (mddev->bitmap ||
2887 mddev->bitmap_info.file ||
2888 mddev->bitmap_info.offset)
2889 return -EBUSY;
2890 if (strncmp(buf, "external", 8) == 0)
2891 mddev->bitmap_info.external = 1;
2892 else if ((strncmp(buf, "internal", 8) == 0) ||
2893 (strncmp(buf, "clustered", 9) == 0))
2894 mddev->bitmap_info.external = 0;
2895 else
2896 return -EINVAL;
2897 return len;
2898 }
2899
2900 static struct md_sysfs_entry bitmap_metadata =
2901 __ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
2902
2903 static ssize_t can_clear_show(struct mddev *mddev, char *page)
2904 {
2905 int len;
2906 struct bitmap *bitmap;
2907
2908 spin_lock(&mddev->lock);
2909 bitmap = mddev->bitmap;
2910 if (bitmap)
2911 len = sprintf(page, "%s\n", (bitmap->need_sync ? "false" :
2912 "true"));
2913 else
2914 len = sprintf(page, "\n");
2915 spin_unlock(&mddev->lock);
2916 return len;
2917 }
2918
2919 static ssize_t can_clear_store(struct mddev *mddev, const char *buf, size_t len)
2920 {
2921 struct bitmap *bitmap = mddev->bitmap;
2922
2923 if (!bitmap)
2924 return -ENOENT;
2925
2926 if (strncmp(buf, "false", 5) == 0) {
2927 bitmap->need_sync = 1;
2928 return len;
2929 }
2930
2931 if (strncmp(buf, "true", 4) == 0) {
2932 if (mddev->degraded)
2933 return -EBUSY;
2934 bitmap->need_sync = 0;
2935 return len;
2936 }
2937
2938 return -EINVAL;
2939 }
2940
2941 static struct md_sysfs_entry bitmap_can_clear =
2942 __ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store);
2943
2944 static ssize_t
2945 behind_writes_used_show(struct mddev *mddev, char *page)
2946 {
2947 ssize_t ret;
2948 struct bitmap *bitmap;
2949
2950 spin_lock(&mddev->lock);
2951 bitmap = mddev->bitmap;
2952 if (!bitmap)
2953 ret = sprintf(page, "0\n");
2954 else
2955 ret = sprintf(page, "%lu\n", bitmap->behind_writes_used);
2956 spin_unlock(&mddev->lock);
2957
2958 return ret;
2959 }
2960
2961 static ssize_t
2962 behind_writes_used_reset(struct mddev *mddev, const char *buf, size_t len)
2963 {
2964 struct bitmap *bitmap = mddev->bitmap;
2965
2966 if (bitmap)
2967 bitmap->behind_writes_used = 0;
2968 return len;
2969 }
2970
2971 static struct md_sysfs_entry max_backlog_used =
2972 __ATTR(max_backlog_used, S_IRUGO | S_IWUSR,
2973 behind_writes_used_show, behind_writes_used_reset);
2974
2975 static struct attribute *md_bitmap_attrs[] = {
2976 &bitmap_location.attr,
2977 &bitmap_space.attr,
2978 &bitmap_timeout.attr,
2979 &bitmap_backlog.attr,
2980 &bitmap_chunksize.attr,
2981 &bitmap_metadata.attr,
2982 &bitmap_can_clear.attr,
2983 &max_backlog_used.attr,
2984 NULL
2985 };
2986 const struct attribute_group md_bitmap_group = {
2987 .name = "bitmap",
2988 .attrs = md_bitmap_attrs,
2989 };
2990
2991 static struct bitmap_operations bitmap_ops = {
2992 .enabled = bitmap_enabled,
2993 .create = bitmap_create,
2994 .resize = bitmap_resize,
2995 .load = bitmap_load,
2996 .destroy = bitmap_destroy,
2997 .flush = bitmap_flush,
2998 .write_all = bitmap_write_all,
2999 .dirty_bits = bitmap_dirty_bits,
3000 .unplug = bitmap_unplug,
3001 .daemon_work = bitmap_daemon_work,
3002
3003 .start_behind_write = bitmap_start_behind_write,
3004 .end_behind_write = bitmap_end_behind_write,
3005 .wait_behind_writes = bitmap_wait_behind_writes,
3006
3007 .startwrite = bitmap_startwrite,
3008 .endwrite = bitmap_endwrite,
3009 .start_sync = bitmap_start_sync,
3010 .end_sync = bitmap_end_sync,
3011 .cond_end_sync = bitmap_cond_end_sync,
3012 .close_sync = bitmap_close_sync,
3013
3014 .update_sb = bitmap_update_sb,
3015 .get_stats = bitmap_get_stats,
3016
3017 .sync_with_cluster = bitmap_sync_with_cluster,
3018 .get_from_slot = bitmap_get_from_slot,
3019 .copy_from_slot = bitmap_copy_from_slot,
3020 .set_pages = bitmap_set_pages,
3021 .free = md_bitmap_free,
3022 };
3023
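/*
 * md core installs these ops on each mddev; personalities then reach the
 * bitmap only through the table, e.g. (illustrative)
 * mddev->bitmap_ops->startwrite(mddev, sector, nr_sectors), rather than
 * calling the static helpers above directly.
 */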
3024 void mddev_set_bitmap_ops(struct mddev *mddev)
3025 {
3026 mddev->bitmap_ops = &bitmap_ops;
3027 }
3028