1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 md.c : Multiple Devices driver for Linux
4 Copyright (C) 1998, 1999, 2000 Ingo Molnar
5
6 completely rewritten, based on the MD driver code from Marc Zyngier
7
8 Changes:
9
10 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
11 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
12 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
13 - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
14 - kmod support by: Cyrus Durgin
15 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
16 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
17
18 - lots of fixes and improvements to the RAID1/RAID5 and generic
19 RAID code (such as request based resynchronization):
20
21 Neil Brown <neilb@cse.unsw.edu.au>.
22
23 - persistent bitmap code
24 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
25
26
27 Errors, Warnings, etc.
28 Please use:
29 pr_crit() for error conditions that risk data loss
30 pr_err() for error conditions that are unexpected, like an IO error
31 or internal inconsistency
     pr_warn() for error conditions that could have been predicted, like
                adding a device to an array when it has incompatible metadata
     pr_info() for interesting, very rare events, like an array starting
                or stopping, or resync starting or stopping
36 pr_debug() for everything else.
37
38 */
39
40 #include <linux/sched/mm.h>
41 #include <linux/sched/signal.h>
42 #include <linux/kthread.h>
43 #include <linux/blkdev.h>
44 #include <linux/blk-integrity.h>
45 #include <linux/badblocks.h>
46 #include <linux/sysctl.h>
47 #include <linux/seq_file.h>
48 #include <linux/fs.h>
49 #include <linux/poll.h>
50 #include <linux/ctype.h>
51 #include <linux/string.h>
52 #include <linux/hdreg.h>
53 #include <linux/proc_fs.h>
54 #include <linux/random.h>
55 #include <linux/major.h>
56 #include <linux/module.h>
57 #include <linux/reboot.h>
58 #include <linux/file.h>
59 #include <linux/compat.h>
60 #include <linux/delay.h>
61 #include <linux/raid/md_p.h>
62 #include <linux/raid/md_u.h>
63 #include <linux/raid/detect.h>
64 #include <linux/slab.h>
65 #include <linux/percpu-refcount.h>
66 #include <linux/part_stat.h>
67
68 #include "md.h"
69 #include "md-bitmap.h"
70 #include "md-cluster.h"
71
/*
 * Printable name of each sync action, indexed by the ACTION_* enumerator.
 * The table is sized by NR_SYNC_ACTIONS.
 */
static const char *action_name[NR_SYNC_ACTIONS] = {
	[ACTION_RESYNC]		= "resync",
	[ACTION_RECOVER]	= "recover",
	[ACTION_CHECK]		= "check",
	[ACTION_REPAIR]		= "repair",
	[ACTION_RESHAPE]	= "reshape",
	[ACTION_FROZEN]		= "frozen",
	[ACTION_IDLE]		= "idle",
};
81
/*
 * Registry of md submodules. In this file it is searched by bitmap id
 * (mddev_set_bitmap_ops()) and walked to find a personality (get_pers()).
 */
static DEFINE_XARRAY(md_submodule);

/* Declared here so it can be referenced before its initialised definition. */
static const struct kobj_type md_ktype;

/* NOTE(review): waiters/wakers of resync_wait are outside this section. */
static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
/* NOTE(review): users of md_wq are outside this section - confirm purpose. */
static struct workqueue_struct *md_wq;
88
89 /*
90 * This workqueue is used for sync_work to register new sync_thread, and for
91 * del_work to remove rdev, and for event_work that is only set by dm-raid.
92 *
 * Note that sync_work will grab reconfig_mutex, hence never flush this
 * workqueue with reconfig_mutex held.
95 */
static struct workqueue_struct *md_misc_wq;

/*
 * Forward declarations of static helpers. Of these, export_rdev() is called
 * from mddev_unlock() before its definition appears.
 */
static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);
static void export_rdev(struct md_rdev *rdev, struct mddev *mddev);
static void md_wakeup_thread_directly(struct md_thread __rcu **thread);
103
/*
 * Default number of read corrections we'll attempt on an rdev
 * before ejecting it from the array. We divide the read error
 * count by 2 for every hour elapsed between read errors.
 */
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
/*
 * Default safemode delay: 200 msec, expressed in jiffies. The trailing +1
 * keeps the result non-zero when (200 * HZ) / 1000 truncates.
 */
#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
/*
 * Current RAID-1,4,5,6,10 parallel reconstruction 'guaranteed speed limit'
 * is sysctl_speed_limit_min, 1000 KB/sec by default, so the extra system load
 * does not show up that much. Increase it if you want to have more guaranteed
 * speed. Note that the RAID driver will use the maximum bandwidth
 * sysctl_speed_limit_max, 200 MB/sec by default, if the IO subsystem is idle.
 *
 * Background sync IO speed control:
 *
 * - below speed min:
 *   no limit;
 * - above speed min and below speed max:
 *   a) if mddev is idle, then no limit;
 *   b) if mddev is busy handling normal IO, then limit inflight sync IO
 *      to sync_io_depth;
 * - above speed max:
 *   sync IO can't be issued;
 *
 * Following configurations can be changed via /proc/sys/dev/raid/ for system
 * or /sys/block/mdX/md/ for one array.
 */
/* System-wide guaranteed minimum sync speed (KB/sec per the note above). */
static int sysctl_speed_limit_min = 1000;
/* System-wide maximum sync speed (200000 KB/sec, i.e. 200 MB/sec). */
static int sysctl_speed_limit_max = 200000;
/* System-wide default for the inflight sync IO limit described above. */
static int sysctl_sync_io_depth = 32;
136
speed_min(struct mddev * mddev)137 static int speed_min(struct mddev *mddev)
138 {
139 return mddev->sync_speed_min ?
140 mddev->sync_speed_min : sysctl_speed_limit_min;
141 }
142
speed_max(struct mddev * mddev)143 static int speed_max(struct mddev *mddev)
144 {
145 return mddev->sync_speed_max ?
146 mddev->sync_speed_max : sysctl_speed_limit_max;
147 }
148
sync_io_depth(struct mddev * mddev)149 static int sync_io_depth(struct mddev *mddev)
150 {
151 return mddev->sync_io_depth ?
152 mddev->sync_io_depth : sysctl_sync_io_depth;
153 }
154
rdev_uninit_serial(struct md_rdev * rdev)155 static void rdev_uninit_serial(struct md_rdev *rdev)
156 {
157 if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
158 return;
159
160 kvfree(rdev->serial);
161 rdev->serial = NULL;
162 }
163
rdevs_uninit_serial(struct mddev * mddev)164 static void rdevs_uninit_serial(struct mddev *mddev)
165 {
166 struct md_rdev *rdev;
167
168 rdev_for_each(rdev, mddev)
169 rdev_uninit_serial(rdev);
170 }
171
rdev_init_serial(struct md_rdev * rdev)172 static int rdev_init_serial(struct md_rdev *rdev)
173 {
174 /* serial_nums equals with BARRIER_BUCKETS_NR */
175 int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t))));
176 struct serial_in_rdev *serial = NULL;
177
178 if (test_bit(CollisionCheck, &rdev->flags))
179 return 0;
180
181 serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums,
182 GFP_KERNEL);
183 if (!serial)
184 return -ENOMEM;
185
186 for (i = 0; i < serial_nums; i++) {
187 struct serial_in_rdev *serial_tmp = &serial[i];
188
189 spin_lock_init(&serial_tmp->serial_lock);
190 serial_tmp->serial_rb = RB_ROOT_CACHED;
191 init_waitqueue_head(&serial_tmp->serial_io_wait);
192 }
193
194 rdev->serial = serial;
195 set_bit(CollisionCheck, &rdev->flags);
196
197 return 0;
198 }
199
rdevs_init_serial(struct mddev * mddev)200 static int rdevs_init_serial(struct mddev *mddev)
201 {
202 struct md_rdev *rdev;
203 int ret = 0;
204
205 rdev_for_each(rdev, mddev) {
206 ret = rdev_init_serial(rdev);
207 if (ret)
208 break;
209 }
210
211 /* Free all resources if pool is not existed */
212 if (ret && !mddev->serial_info_pool)
213 rdevs_uninit_serial(mddev);
214
215 return ret;
216 }
217
218 /*
219 * rdev needs to enable serial stuffs if it meets the conditions:
220 * 1. it is multi-queue device flaged with writemostly.
221 * 2. the write-behind mode is enabled.
222 */
rdev_need_serial(struct md_rdev * rdev)223 static int rdev_need_serial(struct md_rdev *rdev)
224 {
225 return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 &&
226 rdev->bdev->bd_disk->queue->nr_hw_queues != 1 &&
227 test_bit(WriteMostly, &rdev->flags));
228 }
229
230 /*
231 * Init resource for rdev(s), then create serial_info_pool if:
232 * 1. rdev is the first device which return true from rdev_enable_serial.
233 * 2. rdev is NULL, means we want to enable serialization for all rdevs.
234 */
mddev_create_serial_pool(struct mddev * mddev,struct md_rdev * rdev)235 void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
236 {
237 int ret = 0;
238
239 if (rdev && !rdev_need_serial(rdev) &&
240 !test_bit(CollisionCheck, &rdev->flags))
241 return;
242
243 if (!rdev)
244 ret = rdevs_init_serial(mddev);
245 else
246 ret = rdev_init_serial(rdev);
247 if (ret)
248 return;
249
250 if (mddev->serial_info_pool == NULL) {
251 /*
252 * already in memalloc noio context by
253 * mddev_suspend()
254 */
255 mddev->serial_info_pool =
256 mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
257 sizeof(struct serial_info));
258 if (!mddev->serial_info_pool) {
259 rdevs_uninit_serial(mddev);
260 pr_err("can't alloc memory pool for serialization\n");
261 }
262 }
263 }
264
265 /*
266 * Free resource from rdev(s), and destroy serial_info_pool under conditions:
267 * 1. rdev is the last device flaged with CollisionCheck.
268 * 2. when bitmap is destroyed while policy is not enabled.
269 * 3. for disable policy, the pool is destroyed only when no rdev needs it.
270 */
mddev_destroy_serial_pool(struct mddev * mddev,struct md_rdev * rdev)271 void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
272 {
273 if (rdev && !test_bit(CollisionCheck, &rdev->flags))
274 return;
275
276 if (mddev->serial_info_pool) {
277 struct md_rdev *temp;
278 int num = 0; /* used to track if other rdevs need the pool */
279
280 rdev_for_each(temp, mddev) {
281 if (!rdev) {
282 if (!test_bit(MD_SERIALIZE_POLICY,
283 &mddev->flags) ||
284 !rdev_need_serial(temp))
285 rdev_uninit_serial(temp);
286 else
287 num++;
288 } else if (temp != rdev &&
289 test_bit(CollisionCheck, &temp->flags))
290 num++;
291 }
292
293 if (rdev)
294 rdev_uninit_serial(rdev);
295
296 if (num)
297 pr_info("The mempool could be used by other devices\n");
298 else {
299 mempool_destroy(mddev->serial_info_pool);
300 mddev->serial_info_pool = NULL;
301 }
302 }
303 }
304
/* NOTE(review): presumably the handle from registering raid_table - confirm. */
static struct ctl_table_header *raid_table_header;

/*
 * sysctl entries exposing the system-wide sync tunables. Each entry is a
 * plain int, mode 0644, handled by proc_dointvec (so no range checking is
 * applied here).
 */
static const struct ctl_table raid_table[] = {
	{
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "sync_io_depth",
		.data		= &sysctl_sync_io_depth,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
};
330
/* NOTE(review): users of start_readonly are outside this section. */
static int start_readonly;

/*
 * The original mechanism for creating an md device is to create
 * a device node in /dev and to open it. This causes races with device-close.
 * The preferred method is to write to the "new_array" module parameter.
 * This can avoid races.
 * Setting create_on_open to false disables the original mechanism
 * so all the races disappear.
 */
static bool create_on_open = true;
/*
 * When false, mddev_unlock() itself calls kobject_del()/del_gendisk() for an
 * array that has MD_DELETED set.
 */
static bool legacy_async_del_gendisk = true;
/* NOTE(review): users of check_new_feature are outside this section. */
static bool check_new_feature = true;
344
/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 * start array, stop array, error, add device, remove device,
 * start build, activate spare
 */
/* Woken by md_new_event() each time the event count is bumped. */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
/* The system-wide event count itself; incremented by md_new_event(). */
static atomic_t md_event_count;
md_new_event(void)357 void md_new_event(void)
358 {
359 atomic_inc(&md_event_count);
360 wake_up(&md_event_waiters);
361 }
362 EXPORT_SYMBOL_GPL(md_new_event);
363
/*
 * List of all existing md arrays, so that they can be iterated over.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);
370
is_md_suspended(struct mddev * mddev)371 static bool is_md_suspended(struct mddev *mddev)
372 {
373 return percpu_ref_is_dying(&mddev->active_io);
374 }
375 /* Rather than calling directly into the personality make_request function,
376 * IO requests come here first so that we can check if the device is
377 * being suspended pending a reconfiguration.
378 * We hold a refcount over the call to ->make_request. By the time that
379 * call has finished, the bio has been linked into some internal structure
380 * and so is visible to ->quiesce(), so we don't need the refcount any more.
381 */
is_suspended(struct mddev * mddev,struct bio * bio)382 static bool is_suspended(struct mddev *mddev, struct bio *bio)
383 {
384 if (is_md_suspended(mddev))
385 return true;
386 if (bio_data_dir(bio) != WRITE)
387 return false;
388 if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi))
389 return false;
390 if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi))
391 return false;
392 if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo))
393 return false;
394 return true;
395 }
396
md_handle_request(struct mddev * mddev,struct bio * bio)397 bool md_handle_request(struct mddev *mddev, struct bio *bio)
398 {
399 check_suspended:
400 if (is_suspended(mddev, bio)) {
401 DEFINE_WAIT(__wait);
402 /* Bail out if REQ_NOWAIT is set for the bio */
403 if (bio->bi_opf & REQ_NOWAIT) {
404 bio_wouldblock_error(bio);
405 return true;
406 }
407 for (;;) {
408 prepare_to_wait(&mddev->sb_wait, &__wait,
409 TASK_UNINTERRUPTIBLE);
410 if (!is_suspended(mddev, bio))
411 break;
412 schedule();
413 }
414 finish_wait(&mddev->sb_wait, &__wait);
415 }
416 if (!percpu_ref_tryget_live(&mddev->active_io))
417 goto check_suspended;
418
419 if (!mddev->pers->make_request(mddev, bio)) {
420 percpu_ref_put(&mddev->active_io);
421 if (!mddev->gendisk && mddev->pers->prepare_suspend)
422 return false;
423 goto check_suspended;
424 }
425
426 percpu_ref_put(&mddev->active_io);
427 return true;
428 }
429 EXPORT_SYMBOL(md_handle_request);
430
md_submit_bio(struct bio * bio)431 static void md_submit_bio(struct bio *bio)
432 {
433 const int rw = bio_data_dir(bio);
434 struct mddev *mddev = bio->bi_bdev->bd_disk->private_data;
435
436 if (mddev == NULL || mddev->pers == NULL) {
437 bio_io_error(bio);
438 return;
439 }
440
441 if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
442 bio_io_error(bio);
443 return;
444 }
445
446 bio = bio_split_to_limits(bio);
447 if (!bio)
448 return;
449
450 if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) {
451 if (bio_sectors(bio) != 0)
452 bio->bi_status = BLK_STS_IOERR;
453 bio_endio(bio);
454 return;
455 }
456
457 /* bio could be mergeable after passing to underlayer */
458 bio->bi_opf &= ~REQ_NOMERGE;
459
460 md_handle_request(mddev, bio);
461 }
462
463 /*
464 * Make sure no new requests are submitted to the device, and any requests that
465 * have been submitted are completely handled.
466 */
mddev_suspend(struct mddev * mddev,bool interruptible)467 int mddev_suspend(struct mddev *mddev, bool interruptible)
468 {
469 int err = 0;
470
471 /*
472 * hold reconfig_mutex to wait for normal io will deadlock, because
473 * other context can't update super_block, and normal io can rely on
474 * updating super_block.
475 */
476 lockdep_assert_not_held(&mddev->reconfig_mutex);
477
478 if (interruptible)
479 err = mutex_lock_interruptible(&mddev->suspend_mutex);
480 else
481 mutex_lock(&mddev->suspend_mutex);
482 if (err)
483 return err;
484
485 if (mddev->suspended) {
486 WRITE_ONCE(mddev->suspended, mddev->suspended + 1);
487 mutex_unlock(&mddev->suspend_mutex);
488 return 0;
489 }
490
491 percpu_ref_kill(&mddev->active_io);
492 if (interruptible)
493 err = wait_event_interruptible(mddev->sb_wait,
494 percpu_ref_is_zero(&mddev->active_io));
495 else
496 wait_event(mddev->sb_wait,
497 percpu_ref_is_zero(&mddev->active_io));
498 if (err) {
499 percpu_ref_resurrect(&mddev->active_io);
500 mutex_unlock(&mddev->suspend_mutex);
501 return err;
502 }
503
504 /*
505 * For raid456, io might be waiting for reshape to make progress,
506 * allow new reshape to start while waiting for io to be done to
507 * prevent deadlock.
508 */
509 WRITE_ONCE(mddev->suspended, mddev->suspended + 1);
510
511 /* restrict memory reclaim I/O during raid array is suspend */
512 mddev->noio_flag = memalloc_noio_save();
513
514 mutex_unlock(&mddev->suspend_mutex);
515 return 0;
516 }
517 EXPORT_SYMBOL_GPL(mddev_suspend);
518
/*
 * Drop one suspend nesting level of @mddev. When the last level is dropped,
 * the memalloc noio scope is left, active_io is resurrected, sb_wait waiters
 * and the md threads are woken, and MD_RECOVERY_NEEDED is set if
 * @recovery_needed.
 *
 * Must not be called with reconfig_mutex held.
 */
static void __mddev_resume(struct mddev *mddev, bool recovery_needed)
{
	lockdep_assert_not_held(&mddev->reconfig_mutex);

	mutex_lock(&mddev->suspend_mutex);
	WRITE_ONCE(mddev->suspended, mddev->suspended - 1);
	if (mddev->suspended)
		goto unlock;	/* still suspended by an outer caller */

	/* entered the memalloc scope from mddev_suspend() */
	memalloc_noio_restore(mddev->noio_flag);

	percpu_ref_resurrect(&mddev->active_io);
	wake_up(&mddev->sb_wait);

	if (recovery_needed)
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */

unlock:
	mutex_unlock(&mddev->suspend_mutex);
}
543
mddev_resume(struct mddev * mddev)544 void mddev_resume(struct mddev *mddev)
545 {
546 return __mddev_resume(mddev, true);
547 }
548 EXPORT_SYMBOL_GPL(mddev_resume);
549
550 /* sync bdev before setting device to readonly or stopping raid*/
mddev_set_closing_and_sync_blockdev(struct mddev * mddev,int opener_num)551 static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_num)
552 {
553 mutex_lock(&mddev->open_mutex);
554 if (mddev->pers && atomic_read(&mddev->openers) > opener_num) {
555 mutex_unlock(&mddev->open_mutex);
556 return -EBUSY;
557 }
558 if (test_and_set_bit(MD_CLOSING, &mddev->flags)) {
559 mutex_unlock(&mddev->open_mutex);
560 return -EBUSY;
561 }
562 mutex_unlock(&mddev->open_mutex);
563
564 sync_blockdev(mddev->gendisk->part0);
565 return 0;
566 }
567
568 /*
569 * The only difference from bio_chain_endio() is that the current
570 * bi_status of bio does not affect the bi_status of parent.
571 */
md_end_flush(struct bio * bio)572 static void md_end_flush(struct bio *bio)
573 {
574 struct bio *parent = bio->bi_private;
575
576 /*
577 * If any flush io error before the power failure,
578 * disk data may be lost.
579 */
580 if (bio->bi_status)
581 pr_err("md: %pg flush io error %d\n", bio->bi_bdev,
582 blk_status_to_errno(bio->bi_status));
583
584 bio_put(bio);
585 bio_endio(parent);
586 }
587
md_flush_request(struct mddev * mddev,struct bio * bio)588 bool md_flush_request(struct mddev *mddev, struct bio *bio)
589 {
590 struct md_rdev *rdev;
591 struct bio *new;
592
593 /*
594 * md_flush_reqeust() should be called under md_handle_request() and
595 * 'active_io' is already grabbed. Hence it's safe to get rdev directly
596 * without rcu protection.
597 */
598 WARN_ON(percpu_ref_is_zero(&mddev->active_io));
599
600 rdev_for_each(rdev, mddev) {
601 if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
602 continue;
603
604 new = bio_alloc_bioset(rdev->bdev, 0,
605 REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO,
606 &mddev->bio_set);
607 new->bi_private = bio;
608 new->bi_end_io = md_end_flush;
609 bio_inc_remaining(bio);
610 submit_bio(new);
611 }
612
613 if (bio_sectors(bio) == 0) {
614 bio_endio(bio);
615 return true;
616 }
617
618 bio->bi_opf &= ~REQ_PREFLUSH;
619 return false;
620 }
621 EXPORT_SYMBOL(md_flush_request);
622
mddev_get(struct mddev * mddev)623 static inline struct mddev *mddev_get(struct mddev *mddev)
624 {
625 lockdep_assert_held(&all_mddevs_lock);
626
627 if (test_bit(MD_DELETED, &mddev->flags))
628 return NULL;
629 atomic_inc(&mddev->active);
630 return mddev;
631 }
632
/* Work function of mddev->del_work (installed by mddev_init()). */
static void mddev_delayed_delete(struct work_struct *ws);
634
__mddev_put(struct mddev * mddev)635 static void __mddev_put(struct mddev *mddev)
636 {
637 if (mddev->raid_disks || !list_empty(&mddev->disks) ||
638 mddev->ctime || mddev->hold_active)
639 return;
640
641 /*
642 * If array is freed by stopping array, MD_DELETED is set by
643 * do_md_stop(), MD_DELETED is still set here in case mddev is freed
644 * directly by closing a mddev that is created by create_on_open.
645 */
646 set_bit(MD_DELETED, &mddev->flags);
647 /*
648 * Call queue_work inside the spinlock so that flush_workqueue() after
649 * mddev_find will succeed in waiting for the work to be done.
650 */
651 queue_work(md_misc_wq, &mddev->del_work);
652 }
653
mddev_put_locked(struct mddev * mddev)654 static void mddev_put_locked(struct mddev *mddev)
655 {
656 if (atomic_dec_and_test(&mddev->active))
657 __mddev_put(mddev);
658 }
659
mddev_put(struct mddev * mddev)660 void mddev_put(struct mddev *mddev)
661 {
662 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
663 return;
664
665 __mddev_put(mddev);
666 spin_unlock(&all_mddevs_lock);
667 }
668
/* Callback of mddev->safemode_timer (installed by mddev_init()). */
static void md_safemode_timeout(struct timer_list *t);
/* Work function of mddev->sync_work (installed by mddev_init()). */
static void md_start_sync(struct work_struct *ws);
671
active_io_release(struct percpu_ref * ref)672 static void active_io_release(struct percpu_ref *ref)
673 {
674 struct mddev *mddev = container_of(ref, struct mddev, active_io);
675
676 wake_up(&mddev->sb_wait);
677 }
678
/* Empty release callback, used for mddev->writes_pending. */
static void no_op(struct percpu_ref *r)
{
}
680
mddev_set_bitmap_ops(struct mddev * mddev)681 static bool mddev_set_bitmap_ops(struct mddev *mddev)
682 {
683 struct bitmap_operations *old = mddev->bitmap_ops;
684 struct md_submodule_head *head;
685
686 if (mddev->bitmap_id == ID_BITMAP_NONE ||
687 (old && old->head.id == mddev->bitmap_id))
688 return true;
689
690 xa_lock(&md_submodule);
691 head = xa_load(&md_submodule, mddev->bitmap_id);
692
693 if (!head) {
694 pr_warn("md: can't find bitmap id %d\n", mddev->bitmap_id);
695 goto err;
696 }
697
698 if (head->type != MD_BITMAP) {
699 pr_warn("md: invalid bitmap id %d\n", mddev->bitmap_id);
700 goto err;
701 }
702
703 mddev->bitmap_ops = (void *)head;
704 xa_unlock(&md_submodule);
705
706 if (!mddev_is_dm(mddev) && mddev->bitmap_ops->group) {
707 if (sysfs_create_group(&mddev->kobj, mddev->bitmap_ops->group))
708 pr_warn("md: cannot register extra bitmap attributes for %s\n",
709 mdname(mddev));
710 else
711 /*
712 * Inform user with KOBJ_CHANGE about new bitmap
713 * attributes.
714 */
715 kobject_uevent(&mddev->kobj, KOBJ_CHANGE);
716 }
717 return true;
718
719 err:
720 xa_unlock(&md_submodule);
721 return false;
722 }
723
mddev_clear_bitmap_ops(struct mddev * mddev)724 static void mddev_clear_bitmap_ops(struct mddev *mddev)
725 {
726 if (!mddev_is_dm(mddev) && mddev->bitmap_ops &&
727 mddev->bitmap_ops->group)
728 sysfs_remove_group(&mddev->kobj, mddev->bitmap_ops->group);
729
730 mddev->bitmap_ops = NULL;
731 }
732
mddev_init(struct mddev * mddev)733 int mddev_init(struct mddev *mddev)
734 {
735 int err = 0;
736
737 if (!IS_ENABLED(CONFIG_MD_BITMAP))
738 mddev->bitmap_id = ID_BITMAP_NONE;
739 else
740 mddev->bitmap_id = ID_BITMAP;
741
742 if (percpu_ref_init(&mddev->active_io, active_io_release,
743 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
744 return -ENOMEM;
745
746 if (percpu_ref_init(&mddev->writes_pending, no_op,
747 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
748 err = -ENOMEM;
749 goto exit_acitve_io;
750 }
751
752 err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
753 if (err)
754 goto exit_writes_pending;
755
756 err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
757 if (err)
758 goto exit_bio_set;
759
760 err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE,
761 offsetof(struct md_io_clone, bio_clone), 0);
762 if (err)
763 goto exit_sync_set;
764
765 /* We want to start with the refcount at zero */
766 percpu_ref_put(&mddev->writes_pending);
767
768 mutex_init(&mddev->open_mutex);
769 mutex_init(&mddev->reconfig_mutex);
770 mutex_init(&mddev->suspend_mutex);
771 mutex_init(&mddev->bitmap_info.mutex);
772 INIT_LIST_HEAD(&mddev->disks);
773 INIT_LIST_HEAD(&mddev->all_mddevs);
774 INIT_LIST_HEAD(&mddev->deleting);
775 timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
776 atomic_set(&mddev->active, 1);
777 atomic_set(&mddev->openers, 0);
778 atomic_set(&mddev->sync_seq, 0);
779 spin_lock_init(&mddev->lock);
780 init_waitqueue_head(&mddev->sb_wait);
781 init_waitqueue_head(&mddev->recovery_wait);
782 mddev->reshape_position = MaxSector;
783 mddev->reshape_backwards = 0;
784 mddev->last_sync_action = ACTION_IDLE;
785 mddev->resync_min = 0;
786 mddev->resync_max = MaxSector;
787 mddev->level = LEVEL_NONE;
788
789 INIT_WORK(&mddev->sync_work, md_start_sync);
790 INIT_WORK(&mddev->del_work, mddev_delayed_delete);
791
792 return 0;
793
794 exit_sync_set:
795 bioset_exit(&mddev->sync_set);
796 exit_bio_set:
797 bioset_exit(&mddev->bio_set);
798 exit_writes_pending:
799 percpu_ref_exit(&mddev->writes_pending);
800 exit_acitve_io:
801 percpu_ref_exit(&mddev->active_io);
802 return err;
803 }
804 EXPORT_SYMBOL_GPL(mddev_init);
805
mddev_destroy(struct mddev * mddev)806 void mddev_destroy(struct mddev *mddev)
807 {
808 bioset_exit(&mddev->bio_set);
809 bioset_exit(&mddev->sync_set);
810 bioset_exit(&mddev->io_clone_set);
811 percpu_ref_exit(&mddev->active_io);
812 percpu_ref_exit(&mddev->writes_pending);
813 }
814 EXPORT_SYMBOL_GPL(mddev_destroy);
815
/*
 * Search all_mddevs for the array whose unit equals @unit.
 * Returns it, or NULL when there is none. No reference is taken.
 */
static struct mddev *mddev_find_locked(dev_t unit)
{
	struct mddev *found = NULL;
	struct mddev *cur;

	list_for_each_entry(cur, &all_mddevs, all_mddevs) {
		if (cur->unit == unit) {
			found = cur;
			break;
		}
	}

	return found;
}
826
827 /* find an unused unit number */
mddev_alloc_unit(void)828 static dev_t mddev_alloc_unit(void)
829 {
830 static int next_minor = 512;
831 int start = next_minor;
832 bool is_free = 0;
833 dev_t dev = 0;
834
835 while (!is_free) {
836 dev = MKDEV(MD_MAJOR, next_minor);
837 next_minor++;
838 if (next_minor > MINORMASK)
839 next_minor = 0;
840 if (next_minor == start)
841 return 0; /* Oh dear, all in use. */
842 is_free = !mddev_find_locked(dev);
843 }
844
845 return dev;
846 }
847
/*
 * Allocate and initialise a new mddev and add it to all_mddevs.
 *
 * With a non-zero @unit the array gets that unit (its low MdpMinorShift
 * bits are cleared first when the major is not MD_MAJOR) and is held active
 * UNTIL_IOCTL. With @unit == 0 a free unit is picked and the array is held
 * active UNTIL_STOP.
 *
 * Returns the new mddev or an ERR_PTR(): -ENOMEM on allocation failure, the
 * mddev_init() error, -EEXIST when @unit is already in use, or -ENODEV when
 * no free unit is left.
 */
static struct mddev *mddev_alloc(dev_t unit)
{
	struct mddev *mddev;
	int err;

	if (unit && MAJOR(unit) != MD_MAJOR)
		unit &= ~((1 << MdpMinorShift) - 1);

	mddev = kzalloc_obj(*mddev);
	if (!mddev)
		return ERR_PTR(-ENOMEM);

	err = mddev_init(mddev);
	if (err)
		goto free_mddev;

	spin_lock(&all_mddevs_lock);
	if (!unit) {
		mddev->unit = mddev_alloc_unit();
		if (!mddev->unit) {
			err = -ENODEV;
			goto destroy_mddev;
		}
		mddev->md_minor = MINOR(mddev->unit);
		mddev->hold_active = UNTIL_STOP;
	} else {
		if (mddev_find_locked(unit)) {
			err = -EEXIST;
			goto destroy_mddev;
		}
		mddev->unit = unit;
		if (MAJOR(unit) == MD_MAJOR)
			mddev->md_minor = MINOR(unit);
		else
			mddev->md_minor = MINOR(unit) >> MdpMinorShift;
		mddev->hold_active = UNTIL_IOCTL;
	}

	list_add(&mddev->all_mddevs, &all_mddevs);
	spin_unlock(&all_mddevs_lock);
	return mddev;

destroy_mddev:
	spin_unlock(&all_mddevs_lock);
	mddev_destroy(mddev);
free_mddev:
	kfree(mddev);
	return ERR_PTR(err);
}
895
mddev_free(struct mddev * mddev)896 static void mddev_free(struct mddev *mddev)
897 {
898 spin_lock(&all_mddevs_lock);
899 list_del(&mddev->all_mddevs);
900 spin_unlock(&all_mddevs_lock);
901
902 mddev_destroy(mddev);
903 kfree(mddev);
904 }
905
/* Declared early because mddev_unlock() compares against and removes it. */
static const struct attribute_group md_redundancy_group;
907
mddev_unlock(struct mddev * mddev)908 void mddev_unlock(struct mddev *mddev)
909 {
910 struct md_rdev *rdev;
911 struct md_rdev *tmp;
912 LIST_HEAD(delete);
913
914 if (!list_empty(&mddev->deleting))
915 list_splice_init(&mddev->deleting, &delete);
916
917 if (mddev->to_remove) {
918 /* These cannot be removed under reconfig_mutex as
919 * an access to the files will try to take reconfig_mutex
920 * while holding the file unremovable, which leads to
921 * a deadlock.
922 * So hold set sysfs_active while the remove in happeing,
923 * and anything else which might set ->to_remove or my
924 * otherwise change the sysfs namespace will fail with
925 * -EBUSY if sysfs_active is still set.
926 * We set sysfs_active under reconfig_mutex and elsewhere
927 * test it under the same mutex to ensure its correct value
928 * is seen.
929 */
930 const struct attribute_group *to_remove = mddev->to_remove;
931 mddev->to_remove = NULL;
932 mddev->sysfs_active = 1;
933 mutex_unlock(&mddev->reconfig_mutex);
934
935 if (mddev->kobj.sd) {
936 if (to_remove != &md_redundancy_group)
937 sysfs_remove_group(&mddev->kobj, to_remove);
938 if (mddev->pers == NULL ||
939 mddev->pers->sync_request == NULL) {
940 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
941 if (mddev->sysfs_action)
942 sysfs_put(mddev->sysfs_action);
943 if (mddev->sysfs_completed)
944 sysfs_put(mddev->sysfs_completed);
945 if (mddev->sysfs_degraded)
946 sysfs_put(mddev->sysfs_degraded);
947 mddev->sysfs_action = NULL;
948 mddev->sysfs_completed = NULL;
949 mddev->sysfs_degraded = NULL;
950 }
951 }
952 mddev->sysfs_active = 0;
953 } else
954 mutex_unlock(&mddev->reconfig_mutex);
955
956 md_wakeup_thread(mddev->thread);
957 wake_up(&mddev->sb_wait);
958
959 list_for_each_entry_safe(rdev, tmp, &delete, same_set) {
960 list_del_init(&rdev->same_set);
961 kobject_del(&rdev->kobj);
962 export_rdev(rdev, mddev);
963 }
964
965 if (!legacy_async_del_gendisk) {
966 /*
967 * Call del_gendisk after release reconfig_mutex to avoid
968 * deadlock (e.g. call del_gendisk under the lock and an
969 * access to sysfs files waits the lock)
970 * And MD_DELETED is only used for md raid which is set in
971 * do_md_stop. dm raid only uses md_stop to stop. So dm raid
972 * doesn't need to check MD_DELETED when getting reconfig lock
973 */
974 if (test_bit(MD_DELETED, &mddev->flags) &&
975 !test_and_set_bit(MD_DO_DELETE, &mddev->flags)) {
976 kobject_del(&mddev->kobj);
977 del_gendisk(mddev->gendisk);
978 }
979 }
980 }
981 EXPORT_SYMBOL_GPL(mddev_unlock);
982
md_find_rdev_nr_rcu(struct mddev * mddev,int nr)983 struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
984 {
985 struct md_rdev *rdev;
986
987 rdev_for_each_rcu(rdev, mddev)
988 if (rdev->desc_nr == nr)
989 return rdev;
990
991 return NULL;
992 }
993 EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);
994
/*
 * Find the rdev of @mddev that sits on block device number @dev.
 * Returns it, or NULL when there is none.
 */
static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *found = NULL;
	struct md_rdev *cur;

	rdev_for_each(cur, mddev) {
		if (cur->bdev->bd_dev == dev) {
			found = cur;
			break;
		}
	}

	return found;
}
1005
/*
 * Same lookup as find_rdev(), but iterating with rdev_for_each_rcu().
 * Returns the matching rdev, or NULL when there is none.
 */
struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *found = NULL;
	struct md_rdev *cur;

	rdev_for_each_rcu(cur, mddev) {
		if (cur->bdev->bd_dev == dev) {
			found = cur;
			break;
		}
	}

	return found;
}
EXPORT_SYMBOL_GPL(md_find_rdev_rcu);
1017
get_pers(int level,char * clevel)1018 static struct md_personality *get_pers(int level, char *clevel)
1019 {
1020 struct md_personality *ret = NULL;
1021 struct md_submodule_head *head;
1022 unsigned long i;
1023
1024 xa_lock(&md_submodule);
1025 xa_for_each(&md_submodule, i, head) {
1026 if (head->type != MD_PERSONALITY)
1027 continue;
1028 if ((level != LEVEL_NONE && head->id == level) ||
1029 !strcmp(head->name, clevel)) {
1030 if (try_module_get(head->owner))
1031 ret = (void *)head;
1032 break;
1033 }
1034 }
1035 xa_unlock(&md_submodule);
1036
1037 if (!ret) {
1038 if (level != LEVEL_NONE)
1039 pr_warn("md: personality for level %d is not loaded!\n",
1040 level);
1041 else
1042 pr_warn("md: personality for level %s is not loaded!\n",
1043 clevel);
1044 }
1045
1046 return ret;
1047 }
1048
put_pers(struct md_personality * pers)1049 static void put_pers(struct md_personality *pers)
1050 {
1051 module_put(pers->head.owner);
1052 }
1053
1054 /* return the offset of the super block in 512byte sectors */
calc_dev_sboffset(struct md_rdev * rdev)1055 static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
1056 {
1057 return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev));
1058 }
1059
alloc_disk_sb(struct md_rdev * rdev)1060 static int alloc_disk_sb(struct md_rdev *rdev)
1061 {
1062 rdev->sb_page = alloc_page(GFP_KERNEL);
1063 if (!rdev->sb_page)
1064 return -ENOMEM;
1065 return 0;
1066 }
1067
md_rdev_clear(struct md_rdev * rdev)1068 void md_rdev_clear(struct md_rdev *rdev)
1069 {
1070 if (rdev->sb_page) {
1071 put_page(rdev->sb_page);
1072 rdev->sb_loaded = 0;
1073 rdev->sb_page = NULL;
1074 rdev->sb_start = 0;
1075 rdev->sectors = 0;
1076 }
1077 if (rdev->bb_page) {
1078 put_page(rdev->bb_page);
1079 rdev->bb_page = NULL;
1080 }
1081 badblocks_exit(&rdev->badblocks);
1082 }
1083 EXPORT_SYMBOL_GPL(md_rdev_clear);
1084
super_written(struct bio * bio)1085 static void super_written(struct bio *bio)
1086 {
1087 struct md_rdev *rdev = bio->bi_private;
1088 struct mddev *mddev = rdev->mddev;
1089
1090 if (bio->bi_status) {
1091 pr_err("md: %s gets error=%d\n", __func__,
1092 blk_status_to_errno(bio->bi_status));
1093 md_error(mddev, rdev);
1094 if (!test_bit(Faulty, &rdev->flags)
1095 && (bio->bi_opf & MD_FAILFAST)) {
1096 set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
1097 set_bit(LastDev, &rdev->flags);
1098 }
1099 } else
1100 clear_bit(LastDev, &rdev->flags);
1101
1102 bio_put(bio);
1103
1104 rdev_dec_pending(rdev, mddev);
1105
1106 if (atomic_dec_and_test(&mddev->pending_writes))
1107 wake_up(&mddev->sb_wait);
1108 }
1109
1110 /**
1111 * md_write_metadata - write metadata to underlying disk, including
1112 * array superblock, badblocks, bitmap superblock and bitmap bits.
1113 * @mddev: the array to write
1114 * @rdev: the underlying disk to write
1115 * @sector: the offset to @rdev
1116 * @size: the length of the metadata
1117 * @page: the metadata
1118 * @offset: the offset to @page
1119 *
1120 * Write @size bytes of @page start from @offset, to @sector of @rdev, Increment
1121 * mddev->pending_writes before returning, and decrement it on completion,
1122 * waking up sb_wait. Caller must call md_super_wait() after issuing io to all
1123 * rdev. If an error occurred, md_error() will be called, and the @rdev will be
1124 * kicked out from @mddev.
1125 */
md_write_metadata(struct mddev * mddev,struct md_rdev * rdev,sector_t sector,int size,struct page * page,unsigned int offset)1126 void md_write_metadata(struct mddev *mddev, struct md_rdev *rdev,
1127 sector_t sector, int size, struct page *page,
1128 unsigned int offset)
1129 {
1130 struct bio *bio;
1131
1132 if (!page)
1133 return;
1134
1135 if (test_bit(Faulty, &rdev->flags))
1136 return;
1137
1138 bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev,
1139 1,
1140 REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_META
1141 | REQ_PREFLUSH | REQ_FUA,
1142 GFP_NOIO, &mddev->sync_set);
1143
1144 atomic_inc(&rdev->nr_pending);
1145
1146 bio->bi_iter.bi_sector = sector;
1147 __bio_add_page(bio, page, size, offset);
1148 bio->bi_private = rdev;
1149 bio->bi_end_io = super_written;
1150
1151 if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
1152 test_bit(FailFast, &rdev->flags) &&
1153 !test_bit(LastDev, &rdev->flags))
1154 bio->bi_opf |= MD_FAILFAST;
1155
1156 atomic_inc(&mddev->pending_writes);
1157 submit_bio(bio);
1158 }
1159
md_super_wait(struct mddev * mddev)1160 int md_super_wait(struct mddev *mddev)
1161 {
1162 /* wait for all superblock writes that were scheduled to complete */
1163 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
1164 if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags))
1165 return -EAGAIN;
1166 return 0;
1167 }
1168
/*
 * Synchronously transfer @size bytes between @page and @rdev.
 *
 * @sector is relative to a base that depends on the kind of I/O:
 *  - metadata (@metadata_op): rdev->sb_start, on meta_bdev when present;
 *  - data while a reshape is recorded and @sector lies on the side
 *    selected by reshape_backwards: rdev->new_data_offset;
 *  - data otherwise: rdev->data_offset.
 *
 * Returns 1 if the I/O completed without error, 0 otherwise.
 */
int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
		 struct page *page, blk_opf_t opf, bool metadata_op)
{
	struct block_device *target = rdev->bdev;
	struct bio_vec bvec;
	struct bio bio;
	sector_t base;

	if (metadata_op) {
		if (rdev->meta_bdev)
			target = rdev->meta_bdev;
		base = rdev->sb_start;
	} else if (rdev->mddev->reshape_position != MaxSector &&
		   (rdev->mddev->reshape_backwards ==
		    (sector >= rdev->mddev->reshape_position))) {
		base = rdev->new_data_offset;
	} else {
		base = rdev->data_offset;
	}

	bio_init(&bio, target, &bvec, 1, opf);
	bio.bi_iter.bi_sector = sector + base;
	__bio_add_page(&bio, page, size, 0);

	submit_bio_wait(&bio);

	return bio.bi_status ? 0 : 1;
}
EXPORT_SYMBOL_GPL(sync_page_io);
1195
read_disk_sb(struct md_rdev * rdev,int size)1196 static int read_disk_sb(struct md_rdev *rdev, int size)
1197 {
1198 if (rdev->sb_loaded)
1199 return 0;
1200
1201 if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true))
1202 goto fail;
1203 rdev->sb_loaded = 1;
1204 return 0;
1205
1206 fail:
1207 pr_err("md: disabled device %pg, could not read superblock.\n",
1208 rdev->bdev);
1209 return -EINVAL;
1210 }
1211
/*
 * Compare the four 32-bit set_uuid words of two 0.90 superblocks.
 * Returns 1 if all of them match, 0 otherwise.
 */
static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	if (sb1->set_uuid0 != sb2->set_uuid0)
		return 0;
	if (sb1->set_uuid1 != sb2->set_uuid1)
		return 0;
	if (sb1->set_uuid2 != sb2->set_uuid2)
		return 0;
	if (sb1->set_uuid3 != sb2->set_uuid3)
		return 0;

	return 1;
}
1219
/*
 * Compare the generic constant section of two 0.90 superblocks.
 *
 * Both superblocks are copied so that nr_disks, which legitimately
 * differs between devices, can be zeroed before the first
 * MD_SB_GENERIC_CONSTANT_WORDS 32-bit words are compared.
 *
 * Returns 1 if they match, 0 if they differ or if a scratch copy could
 * not be allocated.
 */
static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	mdp_super_t *copy1, *copy2;
	int equal = 0;

	copy1 = kmalloc_obj(*copy1);
	copy2 = kmalloc_obj(*copy2);

	if (copy1 && copy2) {
		*copy1 = *sb1;
		*copy2 = *sb2;

		/* nr_disks is not constant, so exclude it from the compare */
		copy1->nr_disks = 0;
		copy2->nr_disks = 0;

		equal = !memcmp(copy1, copy2,
				MD_SB_GENERIC_CONSTANT_WORDS * 4);
	}

	kfree(copy1);
	kfree(copy2);
	return equal;
}
1248
/*
 * Fold a 32-bit checksum down to 16 bits by adding the upper half into
 * the lower half.  Two passes are needed because the first addition can
 * itself carry into bit 16.
 */
static u32 md_csum_fold(u32 csum)
{
	int pass;

	for (pass = 0; pass < 2; pass++)
		csum = (csum & 0xffff) + (csum >> 16);

	return csum;
}
1254
/*
 * Compute the checksum of a 0.90 superblock.
 *
 * The sb_csum field is zeroed while all MD_SB_BYTES/4 32-bit words are
 * summed into a 64-bit accumulator, whose high half is then added back
 * into the low half.  sb_csum is restored before returning (folded to
 * 16 bits on Alpha, see below).
 *
 * Returns the computed checksum; callers fold it before comparing.
 */
static unsigned int calc_sb_csum(mdp_super_t *sb)
{
	unsigned int saved_csum = sb->sb_csum;
	unsigned int csum;
	u32 *word = (u32 *)sb;
	u64 total = 0;
	int left;

	sb->sb_csum = 0;

	for (left = MD_SB_BYTES / 4; left > 0; left--)
		total += *word++;
	csum = (total & 0xffffffff) + (total >> 32);

#ifdef CONFIG_ALPHA
	/*
	 * Historically csum_partial was used here, which was wrong for
	 * several reasons, one being that results differed between
	 * architectures.  Matching the old return value exactly is not
	 * critical (we always csum_fold before testing, which removes any
	 * difference), but since csum_partial always returned a 16bit
	 * value on alphas, fold to stay as close as possible to the
	 * previous behaviour.
	 */
	sb->sb_csum = md_csum_fold(saved_csum);
#else
	sb->sb_csum = saved_csum;
#endif
	return csum;
}
1284
1285 /*
1286 * Handle superblock details.
1287 * We want to be able to handle multiple superblock formats
1288 * so we have a common interface to them all, and an array of
1289 * different handlers.
1290 * We rely on user-space to write the initial superblock, and support
1291 * reading and updating of superblocks.
1292 * Interface methods are:
1293 * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
1294 * loads and validates a superblock on dev.
1295 * if refdev != NULL, compare superblocks on both devices
1296 * Return:
1297 * 0 - dev has a superblock that is compatible with refdev
1298 * 1 - dev has a superblock that is compatible and newer than refdev
1299 * so dev should be used as the refdev in future
1300 * -EINVAL superblock incompatible or invalid
1301 * -othererror e.g. -EIO
1302 *
 *   int validate_super(struct mddev *mddev, struct md_rdev *freshest,
 *                      struct md_rdev *dev)
 *      Verify that dev is acceptable into mddev.
 *       The first time, mddev->raid_disks will be 0, and data from
 *       dev should be merged in.  Subsequent calls check that dev
 *       is new enough.  Return 0 or -EINVAL.
 *       freshest is handed to the handler as well; the 0.90 handler
 *       does not use it.
1308 *
 *   void sync_super(struct mddev *mddev, struct md_rdev *dev)
 *      Update the superblock for rdev with data in mddev.
 *      This does not write to disk.
1312 *
1313 */
1314
/*
 * Operations table for one on-disk superblock format.
 */
struct super_type {
	char		    *name;
	struct module	    *owner;
	/* read and validate the superblock on rdev, comparing with refdev if set */
	int		    (*load_super)(struct md_rdev *rdev,
					  struct md_rdev *refdev,
					  int minor_version);
	/* check that rdev may join mddev; merges array data on the first call */
	int		    (*validate_super)(struct mddev *mddev,
					      struct md_rdev *freshest,
					      struct md_rdev *rdev);
	/* refresh rdev's in-memory superblock from mddev; no disk I/O */
	void		    (*sync_super)(struct mddev *mddev,
					  struct md_rdev *rdev);
	/* resize the component; returns the new size in sectors, 0 if refused */
	unsigned long long  (*rdev_size_change)(struct md_rdev *rdev,
						sector_t num_sectors);
	/* non-zero if the format can record new_offset as the data offset */
	int		    (*allow_new_offset)(struct md_rdev *rdev,
						unsigned long long new_offset);
};
1331
1332 /*
1333 * Check that the given mddev has no bitmap.
1334 *
1335 * This function is called from the run method of all personalities that do not
1336 * support bitmaps. It prints an error message and returns non-zero if mddev
1337 * has a bitmap. Otherwise, it returns 0.
1338 *
1339 */
md_check_no_bitmap(struct mddev * mddev)1340 int md_check_no_bitmap(struct mddev *mddev)
1341 {
1342 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
1343 return 0;
1344 pr_warn("%s: bitmaps are not supported for %s\n",
1345 mdname(mddev), mddev->pers->head.name);
1346 return 1;
1347 }
1348 EXPORT_SYMBOL(md_check_no_bitmap);
1349
1350 /*
1351 * load_super for 0.90.0
1352 */
super_90_load(struct md_rdev * rdev,struct md_rdev * refdev,int minor_version)1353 static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1354 {
1355 mdp_super_t *sb;
1356 int ret;
1357 bool spare_disk = true;
1358
1359 /*
1360 * Calculate the position of the superblock (512byte sectors),
1361 * it's at the end of the disk.
1362 *
1363 * It also happens to be a multiple of 4Kb.
1364 */
1365 rdev->sb_start = calc_dev_sboffset(rdev);
1366
1367 ret = read_disk_sb(rdev, MD_SB_BYTES);
1368 if (ret)
1369 return ret;
1370
1371 ret = -EINVAL;
1372
1373 sb = page_address(rdev->sb_page);
1374
1375 if (sb->md_magic != MD_SB_MAGIC) {
1376 pr_warn("md: invalid raid superblock magic on %pg\n",
1377 rdev->bdev);
1378 goto abort;
1379 }
1380
1381 if (sb->major_version != 0 ||
1382 sb->minor_version < 90 ||
1383 sb->minor_version > 91) {
1384 pr_warn("Bad version number %d.%d on %pg\n",
1385 sb->major_version, sb->minor_version, rdev->bdev);
1386 goto abort;
1387 }
1388
1389 if (sb->raid_disks <= 0)
1390 goto abort;
1391
1392 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
1393 pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev);
1394 goto abort;
1395 }
1396
1397 rdev->preferred_minor = sb->md_minor;
1398 rdev->data_offset = 0;
1399 rdev->new_data_offset = 0;
1400 rdev->sb_size = MD_SB_BYTES;
1401 rdev->badblocks.shift = -1;
1402
1403 rdev->desc_nr = sb->this_disk.number;
1404
1405 /* not spare disk */
1406 if (rdev->desc_nr >= 0 && rdev->desc_nr < MD_SB_DISKS &&
1407 sb->disks[rdev->desc_nr].state & ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1408 spare_disk = false;
1409
1410 if (!refdev) {
1411 if (!spare_disk)
1412 ret = 1;
1413 else
1414 ret = 0;
1415 } else {
1416 __u64 ev1, ev2;
1417 mdp_super_t *refsb = page_address(refdev->sb_page);
1418 if (!md_uuid_equal(refsb, sb)) {
1419 pr_warn("md: %pg has different UUID to %pg\n",
1420 rdev->bdev, refdev->bdev);
1421 goto abort;
1422 }
1423 if (!md_sb_equal(refsb, sb)) {
1424 pr_warn("md: %pg has same UUID but different superblock to %pg\n",
1425 rdev->bdev, refdev->bdev);
1426 goto abort;
1427 }
1428 ev1 = md_event(sb);
1429 ev2 = md_event(refsb);
1430
1431 if (!spare_disk && ev1 > ev2)
1432 ret = 1;
1433 else
1434 ret = 0;
1435 }
1436 rdev->sectors = rdev->sb_start;
1437 /* Limit to 4TB as metadata cannot record more than that.
1438 * (not needed for Linear and RAID0 as metadata doesn't
1439 * record this size)
1440 */
1441 if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1)
1442 rdev->sectors = (sector_t)(2ULL << 32) - 2;
1443
1444 if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
1445 /* "this cannot possibly happen" ... */
1446 ret = -EINVAL;
1447
1448 abort:
1449 return ret;
1450 }
1451
md_bitmap_events_cleared(struct mddev * mddev)1452 static u64 md_bitmap_events_cleared(struct mddev *mddev)
1453 {
1454 struct md_bitmap_stats stats;
1455 int err;
1456
1457 if (!md_bitmap_enabled(mddev, false))
1458 return 0;
1459
1460 err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
1461 if (err)
1462 return 0;
1463
1464 return stats.events_cleared;
1465 }
1466
/*
 * validate_super for 0.90.0
 * note: we are not using "freshest" for 0.9 superblock
 *
 * Decides whether @rdev may be part of @mddev and derives the rdev's
 * role and flags from its on-disk descriptor.
 *
 * Which checks apply depends on the array state:
 *  - mddev->raid_disks == 0: first device seen, the array parameters
 *    are taken from this superblock;
 *  - mddev->pers == NULL: array is being assembled, the event count of
 *    non-spare devices is checked;
 *  - array running with a bitmap: an older device may be accepted;
 *  - array running without a bitmap: an older device is left as a
 *    plain hot-added device.
 *
 * Returns 0 if the device is accepted, -EINVAL if its event count is
 * too old during assembly.
 */
static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
{
	mdp_disk_t *desc;
	mdp_super_t *sb = page_address(rdev->sb_page);
	__u64 ev1 = md_event(sb);

	/* start from "no role, no flags"; they are set again below if earned */
	rdev->raid_disk = -1;
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(Bitmap_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);

	if (mddev->raid_disks == 0) {
		/* first device: populate the array description from its superblock */
		mddev->major_version = 0;
		mddev->minor_version = sb->minor_version;
		mddev->patch_version = sb->patch_version;
		mddev->external = 0;
		/* on-disk chunk size is in bytes, mddev keeps 512-byte sectors */
		mddev->chunk_sectors = sb->chunk_size >> 9;
		mddev->ctime = sb->ctime;
		mddev->utime = sb->utime;
		mddev->level = sb->level;
		mddev->clevel[0] = 0;
		mddev->layout = sb->layout;
		mddev->raid_disks = sb->raid_disks;
		/* sb->size is doubled to obtain sectors */
		mddev->dev_sectors = ((sector_t)sb->size) * 2;
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.space = 0;
		/* bitmap can use 60 K after the 4K superblocks */
		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
		mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
		mddev->reshape_backwards = 0;

		if (mddev->minor_version >= 91) {
			/* minor version 91 carries reshape state */
			mddev->reshape_position = sb->reshape_position;
			mddev->delta_disks = sb->delta_disks;
			mddev->new_level = sb->new_level;
			mddev->new_layout = sb->new_layout;
			mddev->new_chunk_sectors = sb->new_chunk >> 9;
			if (mddev->delta_disks < 0)
				mddev->reshape_backwards = 1;
		} else {
			/* no reshape recorded: "new" geometry equals current geometry */
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}
		if (mddev->level == 0)
			mddev->layout = -1;

		if (sb->state & (1<<MD_SB_CLEAN))
			mddev->resync_offset = MaxSector;
		else {
			/*
			 * recovery_cp is only used when the checkpoint event
			 * count equals the current event count; otherwise
			 * resync starts from 0.
			 */
			if (sb->events_hi == sb->cp_events_hi &&
				sb->events_lo == sb->cp_events_lo) {
				mddev->resync_offset = sb->recovery_cp;
			} else
				mddev->resync_offset = 0;
		}

		/* the 16-byte uuid is stored on disk as four 32-bit words */
		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);

		mddev->max_disks = MD_SB_DISKS;

		/* an internal bitmap lives at the default location, unless a file is used */
		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
		    mddev->bitmap_info.file == NULL) {
			mddev->bitmap_info.offset =
				mddev->bitmap_info.default_offset;
			mddev->bitmap_info.space =
				mddev->bitmap_info.default_space;
		}

	} else if (mddev->pers == NULL) {
		/* Insist on good event counter while assembling, except
		 * for spares (which don't need an event count) */
		/* the increment tolerates a device that is one event behind */
		++ev1;
		/*
		 * NOTE(review): rdev->desc_nr is used as an index here and
		 * below without the range check that super_90_load applies;
		 * confirm it is guaranteed to be within sb->disks.
		 */
		if (sb->disks[rdev->desc_nr].state & (
			    (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
			if (ev1 < mddev->events)
				return -EINVAL;
	} else if (mddev->bitmap) {
		/* if adding to array with a bitmap, then we can accept an
		 * older device ... but not too old.
		 */
		/* too old for the bitmap: accept, but leave raid_disk at -1 */
		if (ev1 < md_bitmap_events_cleared(mddev))
			return 0;
		if (ev1 < mddev->events)
			set_bit(Bitmap_sync, &rdev->flags);
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}

	/* translate this device's on-disk descriptor into rdev state */
	desc = sb->disks + rdev->desc_nr;

	if (desc->state & (1<<MD_DISK_FAULTY))
		set_bit(Faulty, &rdev->flags);
	else if (desc->state & (1<<MD_DISK_SYNC)) {
		set_bit(In_sync, &rdev->flags);
		rdev->raid_disk = desc->raid_disk;
		rdev->saved_raid_disk = desc->raid_disk;
	} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
		/* active but not in sync implies recovery up to
		 * reshape position.  We don't know exactly where
		 * that is, so set to zero for now
		 */
		if (mddev->minor_version >= 91) {
			rdev->recovery_offset = 0;
			rdev->raid_disk = desc->raid_disk;
		}
	}
	if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
		set_bit(WriteMostly, &rdev->flags);
	if (desc->state & (1<<MD_DISK_FAILFAST))
		set_bit(FailFast, &rdev->flags);
	return 0;
}
1593
/*
 * sync_super for 0.90.0
 *
 * Rebuilds the in-memory 0.90 superblock of @rdev (rdev->sb_page) from
 * the current state of @mddev and of all its member devices.  Nothing
 * is written to disk here.
 *
 * Side effects beyond the superblock page: mddev->minor_version is
 * updated, and desc_nr is reassigned for every member device.
 */
static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
{
	mdp_super_t *sb;
	struct md_rdev *rdev2;
	/* inactive devices get descriptor slots starting after the raid disks */
	int next_spare = mddev->raid_disks;

	/* make rdev->sb match mddev data..
	 *
	 * 1/ zero out disks
	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
	 * 3/ any empty disks < next_spare become removed
	 *
	 * disks[0] gets initialised to REMOVED because
	 * we cannot be sure from other fields if it has
	 * been initialised or not.
	 */
	int i;
	int active=0, working=0,failed=0,spare=0,nr_disks=0;

	rdev->sb_size = MD_SB_BYTES;

	sb = page_address(rdev->sb_page);

	memset(sb, 0, sizeof(*sb));

	sb->md_magic = MD_SB_MAGIC;
	sb->major_version = mddev->major_version;
	sb->patch_version = mddev->patch_version;
	sb->gvalid_words  = 0; /* ignored */
	/* the 16-byte uuid is stored as four 32-bit words */
	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
	memcpy(&sb->set_uuid3, mddev->uuid+12,4);

	/* timestamps are clamped to the range [0, U32_MAX] before storing */
	sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
	sb->level = mddev->level;
	/* dev_sectors is halved for the on-disk size field */
	sb->size = mddev->dev_sectors / 2;
	sb->raid_disks = mddev->raid_disks;
	sb->md_minor = mddev->md_minor;
	sb->not_persistent = 0;
	sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
	sb->state = 0;
	/* the 64-bit event count is split into two 32-bit halves */
	sb->events_hi = (mddev->events>>32);
	sb->events_lo = (u32)mddev->events;

	/* minor version 91 is used only while a reshape is recorded */
	if (mddev->reshape_position == MaxSector)
		sb->minor_version = 90;
	else {
		sb->minor_version = 91;
		sb->reshape_position = mddev->reshape_position;
		sb->new_level = mddev->new_level;
		sb->delta_disks = mddev->delta_disks;
		sb->new_layout = mddev->new_layout;
		sb->new_chunk = mddev->new_chunk_sectors << 9;
	}
	mddev->minor_version = sb->minor_version;
	if (mddev->in_sync)
	{
		/* record a resync checkpoint tagged with the current event count */
		sb->recovery_cp = mddev->resync_offset;
		sb->cp_events_hi = (mddev->events>>32);
		sb->cp_events_lo = (u32)mddev->events;
		if (mddev->resync_offset == MaxSector)
			sb->state = (1<< MD_SB_CLEAN);
	} else
		sb->recovery_cp = 0;

	sb->layout = mddev->layout;
	sb->chunk_size = mddev->chunk_sectors << 9;

	/* only a bitmap that is not file-backed is flagged in the superblock */
	if (mddev->bitmap && mddev->bitmap_info.file == NULL)
		sb->state |= (1<<MD_SB_BITMAP_PRESENT);

	sb->disks[0].state = (1<<MD_DISK_REMOVED);
	/* fill in one descriptor per member device */
	rdev_for_each(rdev2, mddev) {
		mdp_disk_t *d;
		int desc_nr;
		int is_active = test_bit(In_sync, &rdev2->flags);

		if (rdev2->raid_disk >= 0 &&
		    sb->minor_version >= 91)
			/* we have nowhere to store the recovery_offset,
			 * but if it is not below the reshape_position,
			 * we can piggy-back on that.
			 */
			is_active = 1;
		if (rdev2->raid_disk < 0 ||
		    test_bit(Faulty, &rdev2->flags))
			is_active = 0;
		/* active devices use their raid slot, all others the next spare slot */
		if (is_active)
			desc_nr = rdev2->raid_disk;
		else
			desc_nr = next_spare++;
		rdev2->desc_nr = desc_nr;
		d = &sb->disks[rdev2->desc_nr];
		nr_disks++;
		d->number = rdev2->desc_nr;
		d->major = MAJOR(rdev2->bdev->bd_dev);
		d->minor = MINOR(rdev2->bdev->bd_dev);
		if (is_active)
			d->raid_disk = rdev2->raid_disk;
		else
			d->raid_disk = rdev2->desc_nr; /* compatibility */
		if (test_bit(Faulty, &rdev2->flags))
			d->state = (1<<MD_DISK_FAULTY);
		else if (is_active) {
			d->state = (1<<MD_DISK_ACTIVE);
			if (test_bit(In_sync, &rdev2->flags))
				d->state |= (1<<MD_DISK_SYNC);
			active++;
			working++;
		} else {
			/* neither faulty nor active: counted as a spare */
			d->state = 0;
			spare++;
			working++;
		}
		if (test_bit(WriteMostly, &rdev2->flags))
			d->state |= (1<<MD_DISK_WRITEMOSTLY);
		if (test_bit(FailFast, &rdev2->flags))
			d->state |= (1<<MD_DISK_FAILFAST);
	}
	/* now set the "removed" and "faulty" bits on any missing devices */
	for (i=0 ; i < mddev->raid_disks ; i++) {
		mdp_disk_t *d = &sb->disks[i];
		/* a slot still all-zero after the loop above was not filled in */
		if (d->state == 0 && d->number == 0) {
			d->number = i;
			d->raid_disk = i;
			d->state = (1<<MD_DISK_REMOVED);
			d->state |= (1<<MD_DISK_FAULTY);
			failed++;
		}
	}
	sb->nr_disks = nr_disks;
	sb->active_disks = active;
	sb->working_disks = working;
	sb->failed_disks = failed;
	sb->spare_disks = spare;

	/* this_disk is a copy of this device's own descriptor */
	sb->this_disk = sb->disks[rdev->desc_nr];
	/* the checksum is computed last, over the finished superblock */
	sb->sb_csum = calc_sb_csum(sb);
}
1737
1738 /*
1739 * rdev_size_change for 0.90.0
1740 */
1741 static unsigned long long
super_90_rdev_size_change(struct md_rdev * rdev,sector_t num_sectors)1742 super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1743 {
1744 if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1745 return 0; /* component must fit device */
1746 if (rdev->mddev->bitmap_info.offset)
1747 return 0; /* can't move bitmap */
1748 rdev->sb_start = calc_dev_sboffset(rdev);
1749 if (!num_sectors || num_sectors > rdev->sb_start)
1750 num_sectors = rdev->sb_start;
1751 /* Limit to 4TB as metadata cannot record more than that.
1752 * 4TB == 2^32 KB, or 2*2^32 sectors.
1753 */
1754 if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
1755 num_sectors = (sector_t)(2ULL << 32) - 2;
1756 do {
1757 md_write_metadata(rdev->mddev, rdev, rdev->sb_start,
1758 rdev->sb_size, rdev->sb_page, 0);
1759 } while (md_super_wait(rdev->mddev) < 0);
1760 return num_sectors;
1761 }
1762
/*
 * allow_new_offset for 0.90.0
 *
 * v0.90 cannot express a non-zero data offset, so only a @new_offset
 * of 0 is permitted.  Returns 1 if allowed, 0 otherwise.
 */
static int
super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
{
	return new_offset ? 0 : 1;
}
1769
1770 /*
1771 * version 1 superblock
1772 */
1773
calc_sb_1_csum(struct mdp_superblock_1 * sb)1774 static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
1775 {
1776 __le32 disk_csum;
1777 u32 csum;
1778 unsigned long long newcsum;
1779 int size = 256 + le32_to_cpu(sb->max_dev)*2;
1780 __le32 *isuper = (__le32*)sb;
1781
1782 disk_csum = sb->sb_csum;
1783 sb->sb_csum = 0;
1784 newcsum = 0;
1785 for (; size >= 4; size -= 4)
1786 newcsum += le32_to_cpu(*isuper++);
1787
1788 if (size == 2)
1789 newcsum += le16_to_cpu(*(__le16*) isuper);
1790
1791 csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1792 sb->sb_csum = disk_csum;
1793 return cpu_to_le32(csum);
1794 }
1795
/*
 * load_super for version-1 superblocks.
 *
 * Locates the superblock according to @minor_version, reads and
 * validates it, and fills in the per-device fields of @rdev (offsets,
 * superblock size, descriptor number, bad block list, PPL location,
 * usable size).  When @refdev is given, the superblock must describe
 * the same array as @refdev's.
 *
 * Returns 1 if @rdev is not a spare and either no @refdev was given or
 * its event count is higher than @refdev's, 0 if it is acceptable but
 * not newer, and a negative errno (-EINVAL, -ENOMEM, -EIO or the error
 * from reading the superblock) otherwise.
 */
static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
	struct mdp_superblock_1 *sb;
	int ret;
	sector_t sb_start;
	sector_t sectors;
	int bmask;
	bool spare_disk = true;

	/*
	 * Calculate the position of the superblock in 512byte sectors.
	 * It is always aligned to a 4K boundary and
	 * depending on minor_version, it can be:
	 * 0: At least 8K, but less than 12K, from end of device
	 * 1: At start of device
	 * 2: 4K from start of device.
	 */
	switch(minor_version) {
	case 0:
		sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2;
		sb_start &= ~(sector_t)(4*2-1);
		break;
	case 1:
		sb_start = 0;
		break;
	case 2:
		sb_start = 8;
		break;
	default:
		return -EINVAL;
	}
	rdev->sb_start = sb_start;

	/* superblock is rarely larger than 1K, but it can be larger,
	 * and it is safe to read 4k, so we do that
	 */
	ret = read_disk_sb(rdev, 4096);
	if (ret) return ret;

	sb = page_address(rdev->sb_page);

	/*
	 * Basic identity checks: magic, major version, a device table
	 * that fits in the 4K that was read, a recorded offset matching
	 * where the superblock was found, and no unknown feature bits.
	 */
	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
	    sb->major_version != cpu_to_le32(1) ||
	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
	    le64_to_cpu(sb->super_offset) != rdev->sb_start ||
	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
		return -EINVAL;

	if (calc_sb_1_csum(sb) != sb->sb_csum) {
		pr_warn("md: invalid superblock checksum on %pg\n",
			rdev->bdev);
		return -EINVAL;
	}
	if (le64_to_cpu(sb->data_size) < 10) {
		pr_warn("md: data_size too small on %pg\n",
			rdev->bdev);
		return -EINVAL;
	}
	/*
	 * pad3[0] == 0 together with pad3 comparing equal to itself
	 * shifted by one element means every element of pad3 is zero.
	 */
	if (sb->pad0 ||
	    sb->pad3[0] ||
	    memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) {
		pr_warn("Some padding is non-zero on %pg, might be a new feature\n",
			rdev->bdev);
		if (check_new_feature)
			return -EINVAL;
		pr_warn("check_new_feature is disabled, data corruption possible\n");
	}

	rdev->preferred_minor = 0xffff;
	rdev->data_offset = le64_to_cpu(sb->data_offset);
	rdev->new_data_offset = rdev->data_offset;
	/* new_offset is a signed delta, applied only during a reshape that moves data */
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
		rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));

	/* superblock size, rounded up to a multiple of the logical block size */
	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
	bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
	if (rdev->sb_size & bmask)
		rdev->sb_size = (rdev->sb_size | bmask) + 1;

	/* for minor versions 1 and 2 the data must start after the superblock */
	if (minor_version
	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;
	if (minor_version
	    && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;

	rdev->desc_nr = le32_to_cpu(sb->dev_number);

	if (!rdev->bb_page) {
		rdev->bb_page = alloc_page(GFP_KERNEL);
		if (!rdev->bb_page)
			return -ENOMEM;
	}
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
	    rdev->badblocks.count == 0) {
		/* need to load the bad block list.
		 * Currently we limit it to one page.
		 */
		s32 offset;
		sector_t bb_sector;
		__le64 *bbp;
		int i;
		/* NOTE: shadows the function-level 'sectors'; here it is the log length */
		int sectors = le16_to_cpu(sb->bblog_size);
		if (sectors > (PAGE_SIZE / 512))
			return -EINVAL;
		offset = le32_to_cpu(sb->bblog_offset);
		if (offset == 0)
			return -EINVAL;
		/* metadata_op is true, so sync_page_io adds rdev->sb_start to this */
		bb_sector = (long long)offset;
		if (!sync_page_io(rdev, bb_sector, sectors << 9,
				  rdev->bb_page, REQ_OP_READ, true))
			return -EIO;
		bbp = (__le64 *)page_address(rdev->bb_page);
		rdev->badblocks.shift = sb->bblog_shift;
		/* each 512-byte sector of the log holds 64 eight-byte entries */
		for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
			/* entry layout: low 10 bits are the length, the rest the start */
			u64 bb = le64_to_cpu(*bbp);
			int count = bb & (0x3ff);
			u64 sector = bb >> 10;
			sector <<= sb->bblog_shift;
			count <<= sb->bblog_shift;
			/* an all-ones entry terminates the list */
			if (bb + 1 == 0)
				break;
			if (!badblocks_set(&rdev->badblocks, sector, count, 1))
				return -EINVAL;
		}
	} else if (sb->bblog_offset != 0)
		rdev->badblocks.shift = 0;

	if ((le32_to_cpu(sb->feature_map) &
	    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) {
		/* the PPL offset is signed and relative to the superblock */
		rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
		rdev->ppl.size = le16_to_cpu(sb->ppl.size);
		rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
	}

	/* the RAID0 layout feature bit is rejected on any level other than 0 */
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) &&
	    sb->level != 0)
		return -EINVAL;

	/* not spare disk */
	if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
	    (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
	     le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
		spare_disk = false;

	if (!refdev) {
		/* no reference yet: a non-spare device becomes the reference */
		if (!spare_disk)
			ret = 1;
		else
			ret = 0;
	} else {
		__u64 ev1, ev2;
		struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);

		/* same array means same uuid, level, layout and chunk size */
		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
		    sb->level != refsb->level ||
		    sb->layout != refsb->layout ||
		    sb->chunksize != refsb->chunksize) {
			pr_warn("md: %pg has strangely different superblock to %pg\n",
				rdev->bdev,
				refdev->bdev);
			return -EINVAL;
		}
		ev1 = le64_to_cpu(sb->events);
		ev2 = le64_to_cpu(refsb->events);

		if (!spare_disk && ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	/*
	 * Space available for data: from data_offset to the end of the
	 * device for minor versions 1 and 2, everything before the
	 * superblock for minor version 0.
	 */
	if (minor_version)
		sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset;
	else
		sectors = rdev->sb_start;
	/* the recorded data_size must fit in that space */
	if (sectors < le64_to_cpu(sb->data_size))
		return -EINVAL;
	rdev->sectors = le64_to_cpu(sb->data_size);
	return ret;
}
1978
super_1_validate(struct mddev * mddev,struct md_rdev * freshest,struct md_rdev * rdev)1979 static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
1980 {
1981 struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1982 __u64 ev1 = le64_to_cpu(sb->events);
1983 int role;
1984
1985 rdev->raid_disk = -1;
1986 clear_bit(Faulty, &rdev->flags);
1987 clear_bit(In_sync, &rdev->flags);
1988 clear_bit(Bitmap_sync, &rdev->flags);
1989 clear_bit(WriteMostly, &rdev->flags);
1990
1991 if (mddev->raid_disks == 0) {
1992 mddev->major_version = 1;
1993 mddev->patch_version = 0;
1994 mddev->external = 0;
1995 mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1996 mddev->ctime = le64_to_cpu(sb->ctime);
1997 mddev->utime = le64_to_cpu(sb->utime);
1998 mddev->level = le32_to_cpu(sb->level);
1999 mddev->clevel[0] = 0;
2000 mddev->layout = le32_to_cpu(sb->layout);
2001 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
2002 mddev->dev_sectors = le64_to_cpu(sb->size);
2003 mddev->events = ev1;
2004 mddev->bitmap_info.offset = 0;
2005 mddev->bitmap_info.space = 0;
2006 /* Default location for bitmap is 1K after superblock
2007 * using 3K - total of 4K
2008 */
2009 mddev->bitmap_info.default_offset = 1024 >> 9;
2010 mddev->bitmap_info.default_space = (4096-1024) >> 9;
2011 mddev->reshape_backwards = 0;
2012
2013 mddev->resync_offset = le64_to_cpu(sb->resync_offset);
2014 memcpy(mddev->uuid, sb->set_uuid, 16);
2015
2016 mddev->max_disks = (4096-256)/2;
2017
2018 if (!mddev->logical_block_size)
2019 mddev->logical_block_size = le32_to_cpu(sb->logical_block_size);
2020
2021 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
2022 mddev->bitmap_info.file == NULL) {
2023 mddev->bitmap_info.offset =
2024 (__s32)le32_to_cpu(sb->bitmap_offset);
2025 /* Metadata doesn't record how much space is available.
2026 * For 1.0, we assume we can use up to the superblock
2027 * if before, else to 4K beyond superblock.
2028 * For others, assume no change is possible.
2029 */
2030 if (mddev->minor_version > 0)
2031 mddev->bitmap_info.space = 0;
2032 else if (mddev->bitmap_info.offset > 0)
2033 mddev->bitmap_info.space =
2034 8 - mddev->bitmap_info.offset;
2035 else
2036 mddev->bitmap_info.space =
2037 -mddev->bitmap_info.offset;
2038 }
2039
2040 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
2041 mddev->reshape_position = le64_to_cpu(sb->reshape_position);
2042 mddev->delta_disks = le32_to_cpu(sb->delta_disks);
2043 mddev->new_level = le32_to_cpu(sb->new_level);
2044 mddev->new_layout = le32_to_cpu(sb->new_layout);
2045 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
2046 if (mddev->delta_disks < 0 ||
2047 (mddev->delta_disks == 0 &&
2048 (le32_to_cpu(sb->feature_map)
2049 & MD_FEATURE_RESHAPE_BACKWARDS)))
2050 mddev->reshape_backwards = 1;
2051 } else {
2052 mddev->reshape_position = MaxSector;
2053 mddev->delta_disks = 0;
2054 mddev->new_level = mddev->level;
2055 mddev->new_layout = mddev->layout;
2056 mddev->new_chunk_sectors = mddev->chunk_sectors;
2057 }
2058
2059 if (mddev->level == 0 &&
2060 !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT))
2061 mddev->layout = -1;
2062
2063 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
2064 set_bit(MD_HAS_JOURNAL, &mddev->flags);
2065
2066 if (le32_to_cpu(sb->feature_map) &
2067 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) {
2068 if (le32_to_cpu(sb->feature_map) &
2069 (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
2070 return -EINVAL;
2071 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) &&
2072 (le32_to_cpu(sb->feature_map) &
2073 MD_FEATURE_MULTIPLE_PPLS))
2074 return -EINVAL;
2075 set_bit(MD_HAS_PPL, &mddev->flags);
2076 }
2077 } else if (mddev->pers == NULL) {
		/* Insist on a good event counter while assembling, except for
2079 * spares (which don't need an event count).
2080 * Similar to mdadm, we allow event counter difference of 1
2081 * from the freshest device.
2082 */
2083 if (rdev->desc_nr >= 0 &&
2084 rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
2085 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
2086 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
2087 if (ev1 + 1 < mddev->events)
2088 return -EINVAL;
2089 } else if (mddev->bitmap) {
2090 /* If adding to array with a bitmap, then we can accept an
2091 * older device, but not too old.
2092 */
2093 if (ev1 < md_bitmap_events_cleared(mddev))
2094 return 0;
2095 if (ev1 < mddev->events)
2096 set_bit(Bitmap_sync, &rdev->flags);
2097 } else {
2098 if (ev1 < mddev->events)
2099 /* just a hot-add of a new device, leave raid_disk at -1 */
2100 return 0;
2101 }
2102
2103 if (rdev->desc_nr < 0 ||
2104 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
2105 role = MD_DISK_ROLE_SPARE;
2106 rdev->desc_nr = -1;
2107 } else if (mddev->pers == NULL && freshest && ev1 < mddev->events) {
2108 /*
2109 * If we are assembling, and our event counter is smaller than the
2110 * highest event counter, we cannot trust our superblock about the role.
2111 * It could happen that our rdev was marked as Faulty, and all other
2112 * superblocks were updated with +1 event counter.
2113 * Then, before the next superblock update, which typically happens when
2114 * remove_and_add_spares() removes the device from the array, there was
2115 * a crash or reboot.
2116 * If we allow current rdev without consulting the freshest superblock,
2117 * we could cause data corruption.
2118 * Note that in this case our event counter is smaller by 1 than the
2119 * highest, otherwise, this rdev would not be allowed into array;
2120 * both kernel and mdadm allow event counter difference of 1.
2121 */
2122 struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page);
2123 u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev);
2124
2125 if (rdev->desc_nr >= freshest_max_dev) {
2126 /* this is unexpected, better not proceed */
2127 pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n",
2128 mdname(mddev), rdev->bdev, rdev->desc_nr,
2129 freshest->bdev, freshest_max_dev);
2130 return -EUCLEAN;
2131 }
2132
2133 role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]);
2134 pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n",
2135 mdname(mddev), rdev->bdev, role, role, freshest->bdev);
2136 } else {
2137 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
2138 }
2139 switch (role) {
2140 case MD_DISK_ROLE_SPARE: /* spare */
2141 break;
2142 case MD_DISK_ROLE_FAULTY: /* faulty */
2143 set_bit(Faulty, &rdev->flags);
2144 break;
2145 case MD_DISK_ROLE_JOURNAL: /* journal device */
2146 if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
2147 /* journal device without journal feature */
2148 pr_warn("md: journal device provided without journal feature, ignoring the device\n");
2149 return -EINVAL;
2150 }
2151 set_bit(Journal, &rdev->flags);
2152 rdev->journal_tail = le64_to_cpu(sb->journal_tail);
2153 rdev->raid_disk = 0;
2154 break;
2155 default:
2156 rdev->saved_raid_disk = role;
2157 if ((le32_to_cpu(sb->feature_map) &
2158 MD_FEATURE_RECOVERY_OFFSET)) {
2159 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
2160 if (!(le32_to_cpu(sb->feature_map) &
2161 MD_FEATURE_RECOVERY_BITMAP))
2162 rdev->saved_raid_disk = -1;
2163 } else {
2164 /*
2165 * If the array is FROZEN, then the device can't
2166 * be in_sync with rest of array.
2167 */
2168 if (!test_bit(MD_RECOVERY_FROZEN,
2169 &mddev->recovery))
2170 set_bit(In_sync, &rdev->flags);
2171 }
2172 rdev->raid_disk = role;
2173 break;
2174 }
2175 if (sb->devflags & WriteMostly1)
2176 set_bit(WriteMostly, &rdev->flags);
2177 if (sb->devflags & FailFast1)
2178 set_bit(FailFast, &rdev->flags);
2179 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
2180 set_bit(Replacement, &rdev->flags);
2181
2182 return 0;
2183 }
2184
/*
 * super_1_sync() - regenerate the in-memory v1.x superblock of @rdev.
 * @mddev: array whose current state is copied into the superblock
 * @rdev:  member device whose sb_page is rewritten
 *
 * Rebuilds the superblock image held in rdev->sb_page from @mddev and
 * @rdev: feature_map, event counter, geometry, per-device flags, the
 * optional bitmap/recovery/reshape/bad-block/journal/PPL fields and the
 * dev_roles[] table, then recomputes sb_csum.  Nothing is submitted to the
 * device here.  Side effects outside the page: rdev->sb_size may grow,
 * rdev->badblocks {changed,sector,size} may be updated, and md_error() is
 * called when bad blocks exist but the superblock has no bblog_offset.
 */
static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb;
	struct md_rdev *rdev2;
	int max_dev, i;
	/* make rdev->sb match mddev and rdev data. */

	sb = page_address(rdev->sb_page);

	/* Start from a clean slate; feature bits are added back below. */
	sb->feature_map = 0;
	sb->pad0 = 0;
	sb->recovery_offset = cpu_to_le64(0);
	memset(sb->pad3, 0, sizeof(sb->pad3));

	sb->utime = cpu_to_le64((__u64)mddev->utime);
	sb->events = cpu_to_le64(mddev->events);
	/*
	 * resync_offset: the real checkpoint when the array is in_sync,
	 * MaxSector when MD_JOURNAL_CLEAN is set, otherwise 0.
	 */
	if (mddev->in_sync)
		sb->resync_offset = cpu_to_le64(mddev->resync_offset);
	else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
		sb->resync_offset = cpu_to_le64(MaxSector);
	else
		sb->resync_offset = cpu_to_le64(0);

	sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));

	/* Array geometry. */
	sb->raid_disks = cpu_to_le32(mddev->raid_disks);
	sb->size = cpu_to_le64(mddev->dev_sectors);
	sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
	sb->level = cpu_to_le32(mddev->level);
	sb->layout = cpu_to_le32(mddev->layout);
	sb->logical_block_size = cpu_to_le32(mddev->logical_block_size);
	/* Mirror the per-device FailFast/WriteMostly flags into devflags. */
	if (test_bit(FailFast, &rdev->flags))
		sb->devflags |= FailFast1;
	else
		sb->devflags &= ~FailFast1;

	if (test_bit(WriteMostly, &rdev->flags))
		sb->devflags |= WriteMostly1;
	else
		sb->devflags &= ~WriteMostly1;
	sb->data_offset = cpu_to_le64(rdev->data_offset);
	sb->data_size = cpu_to_le64(rdev->sectors);

	/*
	 * Internal (non-file) bitmap.  Plain assignment of feature_map is
	 * fine here: it was zeroed above and nothing has set a bit yet.
	 */
	if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
	}

	/* Active, non-journal member that is not In_sync: record recovery progress. */
	if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
	    !test_bit(In_sync, &rdev->flags)) {
		sb->feature_map |=
			cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
		sb->recovery_offset =
			cpu_to_le64(rdev->recovery_offset);
		if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
			sb->feature_map |=
				cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
	}
	/* Note: recovery_offset and journal_tail share space */
	if (test_bit(Journal, &rdev->flags))
		sb->journal_tail = cpu_to_le64(rdev->journal_tail);
	if (test_bit(Replacement, &rdev->flags))
		sb->feature_map |=
			cpu_to_le32(MD_FEATURE_REPLACEMENT);

	/* A reshape is in progress whenever reshape_position is not MaxSector. */
	if (mddev->reshape_position != MaxSector) {
		sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
		sb->reshape_position = cpu_to_le64(mddev->reshape_position);
		sb->new_layout = cpu_to_le32(mddev->new_layout);
		sb->delta_disks = cpu_to_le32(mddev->delta_disks);
		sb->new_level = cpu_to_le32(mddev->new_level);
		sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
		/* Direction is only flagged explicitly when delta_disks is 0. */
		if (mddev->delta_disks == 0 &&
		    mddev->reshape_backwards)
			sb->feature_map
				|= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
		/* new_offset is stored as the difference from data_offset. */
		if (rdev->new_data_offset != rdev->data_offset) {
			sb->feature_map
				|= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
			sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
							     - rdev->data_offset));
		}
	}

	if (mddev_is_clustered(mddev))
		sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);

	if (rdev->badblocks.count == 0)
		/* Nothing to do for bad blocks*/ ;
	else if (sb->bblog_offset == 0)
		/* Cannot record bad blocks on this device */
		md_error(mddev, rdev);
	else {
		struct badblocks *bb = &rdev->badblocks;
		__le64 *bbp = (__le64 *)page_address(rdev->bb_page);
		u64 *p = bb->page;
		sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
		if (bb->changed) {
			unsigned seq;

			/*
			 * Copy the bad-block table into bb_page under the
			 * bb->lock seqlock; start over if it changed meanwhile.
			 */
retry:
			seq = read_seqbegin(&bb->lock);

			/* Unused slots are left as all-ones. */
			memset(bbp, 0xff, PAGE_SIZE);

			for (i = 0 ; i < bb->count ; i++) {
				u64 internal_bb = p[i];
				/* Stored form: offset in the bits above the low 10, length below. */
				u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
						| BB_LEN(internal_bb));
				bbp[i] = cpu_to_le64(store_bb);
			}
			bb->changed = 0;
			if (read_seqretry(&bb->lock, seq))
				goto retry;

			/* Remember where and how much of bb_page is to be written. */
			bb->sector = (rdev->sb_start +
				      (int)le32_to_cpu(sb->bblog_offset));
			bb->size = le16_to_cpu(sb->bblog_size);
		}
	}

	/* max_dev must cover the highest desc_nr currently in the array. */
	max_dev = 0;
	rdev_for_each(rdev2, mddev)
		if (rdev2->desc_nr+1 > max_dev)
			max_dev = rdev2->desc_nr+1;

	if (max_dev > le32_to_cpu(sb->max_dev)) {
		int bmask;
		sb->max_dev = cpu_to_le32(max_dev);
		/* 256 bytes plus 2 bytes per dev_roles[] entry ... */
		rdev->sb_size = max_dev * 2 + 256;
		/* ... rounded up to the device's logical block size. */
		bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
		if (rdev->sb_size & bmask)
			rdev->sb_size = (rdev->sb_size | bmask) + 1;
	} else
		max_dev = le32_to_cpu(sb->max_dev);

	/* Default every slot to "spare"; real roles are filled in below. */
	for (i=0; i<max_dev;i++)
		sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);

	if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
		sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);

	if (test_bit(MD_HAS_PPL, &mddev->flags)) {
		/* Exactly one of the two PPL feature bits is set. */
		if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags))
			sb->feature_map |=
			    cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS);
		else
			sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
		sb->ppl.offset = cpu_to_le16(rdev->ppl.offset);
		sb->ppl.size = cpu_to_le16(rdev->ppl.size);
	}

	/*
	 * Record the role of every device, indexed by its desc_nr.
	 * Precedence: Faulty, then In_sync, then Journal, then any other
	 * device with a raid_disk slot, else spare.
	 */
	rdev_for_each(rdev2, mddev) {
		i = rdev2->desc_nr;
		if (test_bit(Faulty, &rdev2->flags))
			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
		else if (test_bit(In_sync, &rdev2->flags))
			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
		else if (test_bit(Journal, &rdev2->flags))
			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
		else if (rdev2->raid_disk >= 0)
			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
		else
			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
	}

	/* Checksum last, once every other field is final. */
	sb->sb_csum = calc_sb_1_csum(sb);
}
2353
/*
 * Choose how many sectors to reserve for the bitmap on a device that is
 * @dev_size sectors long.
 *
 * Devices shorter than 64K reserve nothing.  Otherwise 128K is reserved
 * once at least 200G remains after setting 64K aside, 64K once more than
 * 8G remains after setting 4K aside, and 4K in every other case.  Every
 * KiB figure is doubled to express it in sectors.
 */
static sector_t super_1_choose_bm_space(sector_t dev_size)
{
	/* Too small to subtract from safely: no bitmap space at all. */
	if (dev_size < 64*2)
		return 0;

	if (dev_size - 64*2 >= 200*1024*1024*2)
		return 128*2;

	if (dev_size - 4*2 > 8*1024*1024*2)
		return 64*2;

	return 4*2;
}
2371
2372 static unsigned long long
super_1_rdev_size_change(struct md_rdev * rdev,sector_t num_sectors)2373 super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
2374 {
2375 struct mdp_superblock_1 *sb;
2376 sector_t max_sectors;
2377 if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
2378 return 0; /* component must fit device */
2379 if (rdev->data_offset != rdev->new_data_offset)
2380 return 0; /* too confusing */
2381 if (rdev->sb_start < rdev->data_offset) {
2382 /* minor versions 1 and 2; superblock before data */
2383 max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset;
2384 if (!num_sectors || num_sectors > max_sectors)
2385 num_sectors = max_sectors;
2386 } else if (rdev->mddev->bitmap_info.offset) {
2387 /* minor version 0 with bitmap we can't move */
2388 return 0;
2389 } else {
2390 /* minor version 0; superblock after data */
2391 sector_t sb_start, bm_space;
2392 sector_t dev_size = bdev_nr_sectors(rdev->bdev);
2393
2394 /* 8K is for superblock */
2395 sb_start = dev_size - 8*2;
2396 sb_start &= ~(sector_t)(4*2 - 1);
2397
2398 bm_space = super_1_choose_bm_space(dev_size);
2399
2400 /* Space that can be used to store date needs to decrease
2401 * superblock bitmap space and bad block space(4K)
2402 */
2403 max_sectors = sb_start - bm_space - 4*2;
2404
2405 if (!num_sectors || num_sectors > max_sectors)
2406 num_sectors = max_sectors;
2407 rdev->sb_start = sb_start;
2408 }
2409 sb = page_address(rdev->sb_page);
2410 sb->data_size = cpu_to_le64(num_sectors);
2411 sb->super_offset = cpu_to_le64(rdev->sb_start);
2412 sb->sb_csum = calc_sb_1_csum(sb);
2413 do {
2414 md_write_metadata(rdev->mddev, rdev, rdev->sb_start,
2415 rdev->sb_size, rdev->sb_page, 0);
2416 } while (md_super_wait(rdev->mddev) < 0);
2417 return num_sectors;
2418
2419 }
2420
2421 static int
super_1_allow_new_offset(struct md_rdev * rdev,unsigned long long new_offset)2422 super_1_allow_new_offset(struct md_rdev *rdev,
2423 unsigned long long new_offset)
2424 {
2425 struct mddev *mddev = rdev->mddev;
2426
2427 /* All necessary checks on new >= old have been done */
2428 if (new_offset >= rdev->data_offset)
2429 return 1;
2430
2431 /* with 1.0 metadata, there is no metadata to tread on
2432 * so we can always move back */
2433 if (mddev->minor_version == 0)
2434 return 1;
2435
2436 /* otherwise we must be sure not to step on
2437 * any metadata, so stay:
2438 * 36K beyond start of superblock
2439 * beyond end of badblocks
2440 * beyond write-intent bitmap
2441 */
2442 if (rdev->sb_start + (32+4)*2 > new_offset)
2443 return 0;
2444
2445 if (md_bitmap_registered(mddev) && !mddev->bitmap_info.file) {
2446 struct md_bitmap_stats stats;
2447 int err;
2448
2449 err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
2450 if (!err && rdev->sb_start + mddev->bitmap_info.offset +
2451 stats.file_pages * (PAGE_SIZE >> 9) > new_offset)
2452 return 0;
2453 }
2454
2455 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
2456 return 0;
2457
2458 return 1;
2459 }
2460
/*
 * Superblock format handlers, indexed by mddev->major_version
 * (see sync_super() and add_bound_rdev()): slot 0 is the 0.90 format,
 * slot 1 the version-1 format.
 */
static struct super_type super_types[] = {
	[0] = {
		.name	= "0.90.0",
		.owner	= THIS_MODULE,
		.load_super	    = super_90_load,
		.validate_super	    = super_90_validate,
		.sync_super	    = super_90_sync,
		.rdev_size_change   = super_90_rdev_size_change,
		.allow_new_offset   = super_90_allow_new_offset,
	},
	[1] = {
		.name	= "md-1",
		.owner	= THIS_MODULE,
		.load_super	    = super_1_load,
		.validate_super	    = super_1_validate,
		.sync_super	    = super_1_sync,
		.rdev_size_change   = super_1_rdev_size_change,
		.allow_new_offset   = super_1_allow_new_offset,
	},
};
2481
sync_super(struct mddev * mddev,struct md_rdev * rdev)2482 static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
2483 {
2484 if (mddev->sync_super) {
2485 mddev->sync_super(mddev, rdev);
2486 return;
2487 }
2488
2489 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));
2490
2491 super_types[mddev->major_version].sync_super(mddev, rdev);
2492 }
2493
match_mddev_units(struct mddev * mddev1,struct mddev * mddev2)2494 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
2495 {
2496 struct md_rdev *rdev, *rdev2;
2497
2498 rcu_read_lock();
2499 rdev_for_each_rcu(rdev, mddev1) {
2500 if (test_bit(Faulty, &rdev->flags) ||
2501 test_bit(Journal, &rdev->flags) ||
2502 rdev->raid_disk == -1)
2503 continue;
2504 rdev_for_each_rcu(rdev2, mddev2) {
2505 if (test_bit(Faulty, &rdev2->flags) ||
2506 test_bit(Journal, &rdev2->flags) ||
2507 rdev2->raid_disk == -1)
2508 continue;
2509 if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) {
2510 rcu_read_unlock();
2511 return 1;
2512 }
2513 }
2514 }
2515 rcu_read_unlock();
2516 return 0;
2517 }
2518
/*
 * File-local list head.  Nothing in this part of the file adds to or walks
 * it; presumably it collects devices awaiting assembly -- NOTE(review):
 * confirm against its users.
 */
static LIST_HEAD(pending_raid_disks);
2520
2521 /*
2522 * Try to register data integrity profile for an mddev
2523 *
2524 * This is called when an array is started and after a disk has been kicked
2525 * from the array. It only succeeds if all working and active component devices
2526 * are integrity capable with matching profiles.
2527 */
md_integrity_register(struct mddev * mddev)2528 int md_integrity_register(struct mddev *mddev)
2529 {
2530 if (list_empty(&mddev->disks))
2531 return 0; /* nothing to do */
2532 if (mddev_is_dm(mddev) || !blk_get_integrity(mddev->gendisk))
2533 return 0; /* shouldn't register */
2534
2535 pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
2536 return 0;
2537 }
2538 EXPORT_SYMBOL(md_integrity_register);
2539
rdev_read_only(struct md_rdev * rdev)2540 static bool rdev_read_only(struct md_rdev *rdev)
2541 {
2542 return bdev_read_only(rdev->bdev) ||
2543 (rdev->meta_bdev && bdev_read_only(rdev->meta_bdev));
2544 }
2545
bind_rdev_to_array(struct md_rdev * rdev,struct mddev * mddev)2546 static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
2547 {
2548 char b[BDEVNAME_SIZE];
2549 int err;
2550
2551 /* prevent duplicates */
2552 if (find_rdev(mddev, rdev->bdev->bd_dev))
2553 return -EEXIST;
2554
2555 if (rdev_read_only(rdev) && mddev->pers)
2556 return -EROFS;
2557
2558 /* make sure rdev->sectors exceeds mddev->dev_sectors */
2559 if (!test_bit(Journal, &rdev->flags) &&
2560 rdev->sectors &&
2561 (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) {
2562 if (mddev->pers) {
2563 /* Cannot change size, so fail
2564 * If mddev->level <= 0, then we don't care
2565 * about aligning sizes (e.g. linear)
2566 */
2567 if (mddev->level > 0)
2568 return -ENOSPC;
2569 } else
2570 mddev->dev_sectors = rdev->sectors;
2571 }
2572
2573 /* Verify rdev->desc_nr is unique.
2574 * If it is -1, assign a free number, else
2575 * check number is not in use
2576 */
2577 rcu_read_lock();
2578 if (rdev->desc_nr < 0) {
2579 int choice = 0;
2580 if (mddev->pers)
2581 choice = mddev->raid_disks;
2582 while (md_find_rdev_nr_rcu(mddev, choice))
2583 choice++;
2584 rdev->desc_nr = choice;
2585 } else {
2586 if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
2587 rcu_read_unlock();
2588 return -EBUSY;
2589 }
2590 }
2591 rcu_read_unlock();
2592 if (!test_bit(Journal, &rdev->flags) &&
2593 mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
2594 pr_warn("md: %s: array is limited to %d devices\n",
2595 mdname(mddev), mddev->max_disks);
2596 return -EBUSY;
2597 }
2598 snprintf(b, sizeof(b), "%pg", rdev->bdev);
2599 strreplace(b, '/', '!');
2600
2601 rdev->mddev = mddev;
2602 pr_debug("md: bind<%s>\n", b);
2603
2604 if (mddev->raid_disks)
2605 mddev_create_serial_pool(mddev, rdev);
2606
2607 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
2608 goto fail;
2609
2610 /* failure here is OK */
2611 err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block");
2612 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
2613 rdev->sysfs_unack_badblocks =
2614 sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks");
2615 rdev->sysfs_badblocks =
2616 sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks");
2617
2618 list_add_rcu(&rdev->same_set, &mddev->disks);
2619 bd_link_disk_holder(rdev->bdev, mddev->gendisk);
2620
2621 return 0;
2622
2623 fail:
2624 pr_warn("md: failed to register dev-%s for %s\n",
2625 b, mdname(mddev));
2626 mddev_destroy_serial_pool(mddev, rdev);
2627 return err;
2628 }
2629
/* Forward declaration; export_rdev() below calls it in non-modular builds. */
void md_autodetect_dev(dev_t dev);

/* just for claiming the bdev */
static struct md_rdev claim_rdev;
2634
/*
 * Release a member device: clear its state with md_rdev_clear(), hand an
 * auto-detected device back to md_autodetect_dev() (built-in kernels
 * only), drop the block-device file reference and finally drop the
 * reference on rdev->kobj.
 *
 * @mddev is not used by this function.
 */
static void export_rdev(struct md_rdev *rdev, struct mddev *mddev)
{
	pr_debug("md: export_rdev(%pg)\n", rdev->bdev);
	md_rdev_clear(rdev);
#ifndef MODULE
	/* Must happen before rdev->bdev is cleared below. */
	if (test_bit(AutoDetected, &rdev->flags))
		md_autodetect_dev(rdev->bdev->bd_dev);
#endif
	fput(rdev->bdev_file);
	rdev->bdev = NULL;
	/* Last use of rdev in this function. */
	kobject_put(&rdev->kobj);
}
2647
/*
 * Detach @rdev from its array.
 *
 * Unlinks the holder relationship, removes the device from mddev->disks
 * (RCU-safe), destroys its serial pool, clears rdev->mddev, removes the
 * "block" sysfs link, drops the cached sysfs dirents and resets the
 * bad-block count.  After an RCU grace period the device is queued on
 * mddev->deleting.
 */
static void md_kick_rdev_from_array(struct md_rdev *rdev)
{
	/* Saved now because rdev->mddev is cleared further down. */
	struct mddev *mddev = rdev->mddev;

	bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
	list_del_rcu(&rdev->same_set);
	pr_debug("md: unbind<%pg>\n", rdev->bdev);
	mddev_destroy_serial_pool(rdev->mddev, rdev);
	WRITE_ONCE(rdev->mddev, NULL);
	sysfs_remove_link(&rdev->kobj, "block");
	sysfs_put(rdev->sysfs_state);
	sysfs_put(rdev->sysfs_unack_badblocks);
	sysfs_put(rdev->sysfs_badblocks);
	/* Do not leave stale dirent pointers behind. */
	rdev->sysfs_state = NULL;
	rdev->sysfs_unack_badblocks = NULL;
	rdev->sysfs_badblocks = NULL;
	rdev->badblocks.count = 0;

	/* Wait for RCU readers of the disks list before reusing same_set. */
	synchronize_rcu();

	/*
	 * kobject_del() will wait for all in progress writers to be done, where
	 * reconfig_mutex is held, hence it can't be called under
	 * reconfig_mutex and it's delayed to mddev_unlock().
	 */
	list_add(&rdev->same_set, &mddev->deleting);
}
2675
export_array(struct mddev * mddev)2676 static void export_array(struct mddev *mddev)
2677 {
2678 struct md_rdev *rdev;
2679
2680 while (!list_empty(&mddev->disks)) {
2681 rdev = list_first_entry(&mddev->disks, struct md_rdev,
2682 same_set);
2683 md_kick_rdev_from_array(rdev);
2684 }
2685 mddev->raid_disks = 0;
2686 mddev->major_version = 0;
2687 }
2688
/*
 * Try to mark the array in_sync.
 *
 * Must be called with mddev->lock held (asserted via lockdep); the lock is
 * dropped and re-taken internally while writes_pending is switched to
 * atomic mode.  in_sync is only set when it is still clear after the lock
 * is re-taken and writes_pending is zero.  A safemode value of 1 is reset
 * to 0 on every call.
 *
 * Returns the resulting value of mddev->in_sync.
 */
static bool set_in_sync(struct mddev *mddev)
{
	lockdep_assert_held(&mddev->lock);
	if (!mddev->in_sync) {
		/* Count concurrent checkers so only the last one switches back. */
		mddev->sync_checkers++;
		/* The _sync switch is done with the spinlock released. */
		spin_unlock(&mddev->lock);
		percpu_ref_switch_to_atomic_sync(&mddev->writes_pending);
		spin_lock(&mddev->lock);
		/* Re-check in_sync: the lock was dropped above. */
		if (!mddev->in_sync &&
		    percpu_ref_is_zero(&mddev->writes_pending)) {
			mddev->in_sync = 1;
			/*
			 * Ensure ->in_sync is visible before we clear
			 * ->sync_checkers.
			 */
			smp_mb();
			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
			sysfs_notify_dirent_safe(mddev->sysfs_state);
		}
		if (--mddev->sync_checkers == 0)
			percpu_ref_switch_to_percpu(&mddev->writes_pending);
	}
	if (mddev->safemode == 1)
		mddev->safemode = 0;
	return mddev->in_sync;
}
2715
sync_sbs(struct mddev * mddev,int nospares)2716 static void sync_sbs(struct mddev *mddev, int nospares)
2717 {
2718 /* Update each superblock (in-memory image), but
2719 * if we are allowed to, skip spares which already
2720 * have the right event counter, or have one earlier
2721 * (which would mean they aren't being marked as dirty
2722 * with the rest of the array)
2723 */
2724 struct md_rdev *rdev;
2725 rdev_for_each(rdev, mddev) {
2726 if (rdev->sb_events == mddev->events ||
2727 (nospares &&
2728 rdev->raid_disk < 0 &&
2729 rdev->sb_events+1 == mddev->events)) {
2730 /* Don't update this superblock */
2731 rdev->sb_loaded = 2;
2732 } else {
2733 sync_super(mddev, rdev);
2734 rdev->sb_loaded = 1;
2735 }
2736 }
2737 }
2738
does_sb_need_changing(struct mddev * mddev)2739 static bool does_sb_need_changing(struct mddev *mddev)
2740 {
2741 struct md_rdev *rdev = NULL, *iter;
2742 struct mdp_superblock_1 *sb;
2743 int role;
2744
2745 /* Find a good rdev */
2746 rdev_for_each(iter, mddev)
2747 if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) {
2748 rdev = iter;
2749 break;
2750 }
2751
2752 /* No good device found. */
2753 if (!rdev)
2754 return false;
2755
2756 sb = page_address(rdev->sb_page);
2757 /* Check if a device has become faulty or a spare become active */
2758 rdev_for_each(rdev, mddev) {
2759 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
2760 /* Device activated? */
2761 if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 &&
2762 !test_bit(Faulty, &rdev->flags))
2763 return true;
2764 /* Device turned faulty? */
2765 if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX))
2766 return true;
2767 }
2768
2769 /* Check if any mddev parameters have changed */
2770 if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
2771 (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
2772 (mddev->layout != le32_to_cpu(sb->layout)) ||
2773 (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
2774 (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
2775 return true;
2776
2777 return false;
2778 }
2779
/*
 * md_update_sb() - bring the superblocks of all member devices up to date.
 * @mddev:        array to update
 * @force_change: non-zero to treat this as a device-set change, which
 *                disables the "skip spares" optimisation
 *
 * Does nothing (beyond optionally setting MD_SB_CHANGE_DEVS and logging)
 * when the array is not read-write.  For non-persistent metadata the
 * change flags are cleared and waiters are woken without any write.
 * Otherwise the event counter is stepped (forwards, or back by one for a
 * pure clean<->dirty transition), the in-memory superblocks are
 * regenerated with sync_sbs() and written with md_write_metadata().  The
 * whole sequence restarts from "repeat" when in_sync changed while writing
 * or when bit_clear_unless() reports failure (presumably because
 * MD_SB_CHANGE_DEVS/CLEAN was set again -- confirm against that helper).
 */
void md_update_sb(struct mddev *mddev, int force_change)
{
	struct md_rdev *rdev;
	int sync_req;
	int nospares = 0;
	int any_badblocks_changed = 0;
	/* Result of cluster metadata_update_start(); -1 when not clustered. */
	int ret = -1;

	if (!md_is_rdwr(mddev)) {
		/* Remember the request so it is not lost. */
		if (force_change)
			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		pr_err("%s: can't update sb for read-only array %s\n", __func__, mdname(mddev));
		return;
	}

repeat:
	if (mddev_is_clustered(mddev)) {
		if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
			force_change = 1;
		if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
			nospares = 1;
		ret = mddev->cluster_ops->metadata_update_start(mddev);
		/* Has someone else has updated the sb */
		if (!does_sb_need_changing(mddev)) {
			/* Only cancel an update that was actually started. */
			if (ret == 0)
				mddev->cluster_ops->metadata_update_cancel(mddev);
			bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
							 BIT(MD_SB_CHANGE_DEVS) |
							 BIT(MD_SB_CHANGE_CLEAN));
			return;
		}
	}

	/*
	 * First make sure individual recovery_offsets are correct
	 * curr_resync_completed can only be used during recovery.
	 * During reshape/resync it might use array-addresses rather
	 * that device addresses.
	 */
	rdev_for_each(rdev, mddev) {
		if (rdev->raid_disk >= 0 &&
		    mddev->delta_disks >= 0 &&
		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
		    test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
		    !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
		    !test_bit(Journal, &rdev->flags) &&
		    !test_bit(In_sync, &rdev->flags) &&
		    mddev->curr_resync_completed > rdev->recovery_offset)
				rdev->recovery_offset = mddev->curr_resync_completed;

	}
	if (!mddev->persistent) {
		/* No superblock to write: just clear the flags and wake waiters. */
		clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
		clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		if (!mddev->external) {
			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
			rdev_for_each(rdev, mddev) {
				/*
				 * A changed bad-block list is acknowledged and
				 * the device is reported through md_error().
				 */
				if (rdev->badblocks.changed) {
					rdev->badblocks.changed = 0;
					ack_all_badblocks(&rdev->badblocks);
					md_error(mddev, rdev);
				}
				clear_bit(Blocked, &rdev->flags);
				clear_bit(BlockedBadBlocks, &rdev->flags);
				wake_up(&rdev->blocked_wait);
			}
		}
		wake_up(&mddev->sb_wait);
		return;
	}

	spin_lock(&mddev->lock);

	mddev->utime = ktime_get_real_seconds();

	if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
		force_change = 1;
	if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
		/* just a clean<-> dirty transition, possibly leave spares alone,
		 * though if events isn't the right even/odd, we will have to do
		 * spares after all
		 */
		nospares = 1;
	if (force_change)
		nospares = 0;
	if (mddev->degraded)
		/* If the array is degraded, then skipping spares is both
		 * dangerous and fairly pointless.
		 * Dangerous because a device that was removed from the array
		 * might have a event_count that still looks up-to-date,
		 * so it can be re-added without a resync.
		 * Pointless because if there are any spares to skip,
		 * then a recovery will happen and soon that array won't
		 * be degraded any more and the spare can go back to sleep then.
		 */
		nospares = 0;

	/* Snapshot in_sync; compared again after the write to detect a change. */
	sync_req = mddev->in_sync;

	/* If this is just a dirty<->clean transition, and the array is clean
	 * and 'events' is odd, we can roll back to the previous clean state */
	if (nospares
	    && (mddev->in_sync && mddev->resync_offset == MaxSector)
	    && mddev->can_decrease_events
	    && mddev->events != 1) {
		mddev->events--;
		mddev->can_decrease_events = 0;
	} else {
		/* otherwise we have to go forward and ... */
		mddev->events ++;
		mddev->can_decrease_events = nospares;
	}

	/*
	 * This 64-bit counter should never wrap.
	 * Either we are in around ~1 trillion A.C., assuming
	 * 1 reboot per second, or we have a bug...
	 */
	WARN_ON(mddev->events == 0);

	rdev_for_each(rdev, mddev) {
		if (rdev->badblocks.changed)
			any_badblocks_changed++;
		/* Paired with the test_and_clear_bit(FaultRecorded) at the end. */
		if (test_bit(Faulty, &rdev->flags))
			set_bit(FaultRecorded, &rdev->flags);
	}

	/* Regenerate the in-memory images while still holding the lock. */
	sync_sbs(mddev, nospares);
	spin_unlock(&mddev->lock);

	pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
		 mdname(mddev), mddev->in_sync);

	mddev_add_trace_msg(mddev, "md md_update_sb");
rewrite:
	if (md_bitmap_enabled(mddev, false))
		mddev->bitmap_ops->update_sb(mddev->bitmap);
	rdev_for_each(rdev, mddev) {
		/* sync_sbs() set sb_loaded to 2 on devices it chose to skip. */
		if (rdev->sb_loaded != 1)
			continue; /* no noise on spare devices */

		if (!test_bit(Faulty, &rdev->flags)) {
			md_write_metadata(mddev, rdev, rdev->sb_start,
					  rdev->sb_size, rdev->sb_page, 0);
			pr_debug("md: (write) %pg's sb offset: %llu\n",
				 rdev->bdev,
				 (unsigned long long)rdev->sb_start);
			rdev->sb_events = mddev->events;
			/* A non-zero size means the bad-block page needs writing too. */
			if (rdev->badblocks.size) {
				md_write_metadata(mddev, rdev,
						  rdev->badblocks.sector,
						  rdev->badblocks.size << 9,
						  rdev->bb_page, 0);
				rdev->badblocks.size = 0;
			}

		} else
			pr_debug("md: %pg (skipping faulty)\n",
				 rdev->bdev);
	}
	if (md_super_wait(mddev) < 0)
		goto rewrite;
	/* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */

	/* Only finish a cluster update that was successfully started. */
	if (mddev_is_clustered(mddev) && ret == 0)
		mddev->cluster_ops->metadata_update_finish(mddev);

	if (mddev->in_sync != sync_req ||
	    !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
			       BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)))
		/* have to write it out again */
		goto repeat;
	wake_up(&mddev->sb_wait);
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		sysfs_notify_dirent_safe(mddev->sysfs_completed);

	/* The update is on disk: unblock devices and wake their waiters. */
	rdev_for_each(rdev, mddev) {
		if (test_and_clear_bit(FaultRecorded, &rdev->flags))
			clear_bit(Blocked, &rdev->flags);

		if (any_badblocks_changed)
			ack_all_badblocks(&rdev->badblocks);
		clear_bit(BlockedBadBlocks, &rdev->flags);
		wake_up(&rdev->blocked_wait);
	}
}
EXPORT_SYMBOL(md_update_sb);
2967
add_bound_rdev(struct md_rdev * rdev)2968 static int add_bound_rdev(struct md_rdev *rdev)
2969 {
2970 struct mddev *mddev = rdev->mddev;
2971 int err = 0;
2972 bool add_journal = test_bit(Journal, &rdev->flags);
2973
2974 if (!mddev->pers->hot_remove_disk || add_journal) {
2975 /* If there is hot_add_disk but no hot_remove_disk
2976 * then added disks for geometry changes,
2977 * and should be added immediately.
2978 */
2979 super_types[mddev->major_version].
2980 validate_super(mddev, NULL/*freshest*/, rdev);
2981 err = mddev->pers->hot_add_disk(mddev, rdev);
2982 if (err) {
2983 md_kick_rdev_from_array(rdev);
2984 return err;
2985 }
2986 }
2987 sysfs_notify_dirent_safe(rdev->sysfs_state);
2988
2989 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2990 if (mddev->degraded)
2991 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
2992 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2993 md_new_event();
2994 return 0;
2995 }
2996
/*
 * Words written to sysfs files may, or may not, be newline terminated.
 * We want to accept either form; cmd_match() does that comparison.
 *
 * Returns 1 when @cmd equals @str, optionally followed by exactly one
 * trailing '\n', and 0 otherwise.
 */
static int cmd_match(const char *cmd, const char *str)
{
	size_t i = 0;

	/* Walk the common prefix of both strings. */
	while (cmd[i] != '\0' && str[i] != '\0' && cmd[i] == str[i])
		i++;

	/* @str has to be matched in full. */
	if (str[i] != '\0')
		return 0;

	/* @cmd may end in a single newline, and nothing after it. */
	if (cmd[i] == '\n')
		i++;

	return cmd[i] == '\0';
}
3016
/*
 * One per-rdev sysfs attribute.  rdev_attr_show()/rdev_attr_store() recover
 * this structure from the embedded 'attr' via container_of() and dispatch
 * to the callbacks below.
 */
struct rdev_sysfs_entry {
	/* generic sysfs attribute; must stay embedded for container_of() */
	struct attribute attr;
	/* format the attribute into the supplied page; may be NULL */
	ssize_t (*show)(struct md_rdev *, char *);
	/* parse a write of the given length; may be NULL */
	ssize_t (*store)(struct md_rdev *, const char *, size_t);
};
3022
3023 static ssize_t
state_show(struct md_rdev * rdev,char * page)3024 state_show(struct md_rdev *rdev, char *page)
3025 {
3026 char *sep = ",";
3027 size_t len = 0;
3028 unsigned long flags = READ_ONCE(rdev->flags);
3029
3030 if (test_bit(Faulty, &flags) ||
3031 (!test_bit(ExternalBbl, &flags) &&
3032 rdev->badblocks.unacked_exist))
3033 len += sprintf(page+len, "faulty%s", sep);
3034 if (test_bit(In_sync, &flags))
3035 len += sprintf(page+len, "in_sync%s", sep);
3036 if (test_bit(Journal, &flags))
3037 len += sprintf(page+len, "journal%s", sep);
3038 if (test_bit(WriteMostly, &flags))
3039 len += sprintf(page+len, "write_mostly%s", sep);
3040 if (test_bit(Blocked, &flags) ||
3041 (rdev->badblocks.unacked_exist
3042 && !test_bit(Faulty, &flags)))
3043 len += sprintf(page+len, "blocked%s", sep);
3044 if (!test_bit(Faulty, &flags) &&
3045 !test_bit(Journal, &flags) &&
3046 !test_bit(In_sync, &flags))
3047 len += sprintf(page+len, "spare%s", sep);
3048 if (test_bit(WriteErrorSeen, &flags))
3049 len += sprintf(page+len, "write_error%s", sep);
3050 if (test_bit(WantReplacement, &flags))
3051 len += sprintf(page+len, "want_replacement%s", sep);
3052 if (test_bit(Replacement, &flags))
3053 len += sprintf(page+len, "replacement%s", sep);
3054 if (test_bit(ExternalBbl, &flags))
3055 len += sprintf(page+len, "external_bbl%s", sep);
3056 if (test_bit(FailFast, &flags))
3057 len += sprintf(page+len, "failfast%s", sep);
3058
3059 if (len)
3060 len -= strlen(sep);
3061
3062 return len+sprintf(page+len, "\n");
3063 }
3064
/*
 * Handle a write to the per-device 'state' attribute.
 *
 * @buf holds one command keyword, optionally newline terminated (see
 * cmd_match()).  Returns @len on success or a negative errno; 'err' starts
 * out as -EINVAL, so an unrecognised keyword, or a keyword whose extra
 * precondition in the if-chain is not met, yields -EINVAL.
 *
 * On success the 'state' attribute is signalled via
 * sysfs_notify_dirent_safe().  Commands that set need_update_sb also
 * trigger md_update_sb(mddev, 1).
 */
static ssize_t
state_store(struct md_rdev *rdev, const char *buf, size_t len)
{
	/* can write
	 *  faulty  - simulates an error
	 *  remove  - disconnects the device
	 *  writemostly - sets write_mostly
	 *  -writemostly - clears write_mostly
	 *  blocked - sets the Blocked flags
	 *  -blocked - clears the Blocked and possibly simulates an error
	 *  insync - sets Insync providing device isn't active
	 *  -insync - clear Insync for a device with a slot assigned,
	 *            so that it gets rebuilt based on bitmap
	 *  write_error - sets WriteErrorSeen
	 *  -write_error - clears WriteErrorSeen
	 *  {,-}failfast - set/clear FailFast
	 *  {,-}want_replacement - set/clear WantReplacement
	 *  {,-}replacement - set/clear Replacement (array not started only)
	 *  re-add - re-add a faulty device that has a saved slot
	 *  {,-}external_bbl - set/clear ExternalBbl (external metadata only)
	 */

	struct mddev *mddev = rdev->mddev;
	int err = -EINVAL;
	bool need_update_sb = false;

	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
		md_error(rdev->mddev, rdev);

		/* report -EBUSY if the array is flagged MD_BROKEN after md_error() */
		if (test_bit(MD_BROKEN, &rdev->mddev->flags))
			err = -EBUSY;
		else
			err = 0;
	} else if (cmd_match(buf, "remove")) {
		if (rdev->mddev->pers) {
			clear_bit(Blocked, &rdev->flags);
			remove_and_add_spares(rdev->mddev, rdev);
		}
		/* still holding a slot: it could not be detached */
		if (rdev->raid_disk >= 0)
			err = -EBUSY;
		else {
			err = 0;
			if (mddev_is_clustered(mddev))
				err = mddev->cluster_ops->remove_disk(mddev, rdev);

			if (err == 0) {
				md_kick_rdev_from_array(rdev);
				if (mddev->pers)
					set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
				md_new_event();
			}
		}
	} else if (cmd_match(buf, "writemostly")) {
		set_bit(WriteMostly, &rdev->flags);
		mddev_create_serial_pool(rdev->mddev, rdev);
		need_update_sb = true;
		err = 0;
	} else if (cmd_match(buf, "-writemostly")) {
		mddev_destroy_serial_pool(rdev->mddev, rdev);
		clear_bit(WriteMostly, &rdev->flags);
		need_update_sb = true;
		err = 0;
	} else if (cmd_match(buf, "blocked")) {
		set_bit(Blocked, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "-blocked")) {
		if (!test_bit(Faulty, &rdev->flags) &&
		    !test_bit(ExternalBbl, &rdev->flags) &&
		    rdev->badblocks.unacked_exist) {
			/* metadata handler doesn't understand badblocks,
			 * so we need to fail the device
			 */
			md_error(rdev->mddev, rdev);
		}
		clear_bit(Blocked, &rdev->flags);
		clear_bit(BlockedBadBlocks, &rdev->flags);
		wake_up(&rdev->blocked_wait);
		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);

		err = 0;
	} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
		set_bit(In_sync, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "failfast")) {
		set_bit(FailFast, &rdev->flags);
		need_update_sb = true;
		err = 0;
	} else if (cmd_match(buf, "-failfast")) {
		clear_bit(FailFast, &rdev->flags);
		need_update_sb = true;
		err = 0;
	} else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
		   !test_bit(Journal, &rdev->flags)) {
		/* only honoured while the array is not running; else -EINVAL */
		if (rdev->mddev->pers == NULL) {
			clear_bit(In_sync, &rdev->flags);
			rdev->saved_raid_disk = rdev->raid_disk;
			rdev->raid_disk = -1;
			err = 0;
		}
	} else if (cmd_match(buf, "write_error")) {
		set_bit(WriteErrorSeen, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "-write_error")) {
		clear_bit(WriteErrorSeen, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "want_replacement")) {
		/* Any non-spare device that is not a replacement can
		 * become want_replacement at any time, but we then need to
		 * check if recovery is needed.
		 */
		if (rdev->raid_disk >= 0 &&
		    !test_bit(Journal, &rdev->flags) &&
		    !test_bit(Replacement, &rdev->flags))
			set_bit(WantReplacement, &rdev->flags);
		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
		err = 0;
	} else if (cmd_match(buf, "-want_replacement")) {
		/* Clearing 'want_replacement' is always allowed.
		 * Once replacements starts it is too late though.
		 */
		err = 0;
		clear_bit(WantReplacement, &rdev->flags);
	} else if (cmd_match(buf, "replacement")) {
		/* Can only set a device as a replacement when array has not
		 * yet been started.  Once running, replacement is automatic
		 * from spares, or by assigning 'slot'.
		 */
		if (rdev->mddev->pers)
			err = -EBUSY;
		else {
			set_bit(Replacement, &rdev->flags);
			err = 0;
		}
	} else if (cmd_match(buf, "-replacement")) {
		/* Similarly, can only clear Replacement before start */
		if (rdev->mddev->pers)
			err = -EBUSY;
		else {
			clear_bit(Replacement, &rdev->flags);
			err = 0;
		}
	} else if (cmd_match(buf, "re-add")) {
		if (!rdev->mddev->pers)
			err = -EINVAL;
		else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) &&
				rdev->saved_raid_disk >= 0) {
			/* clear_bit is performed _after_ all the devices
			 * have their local Faulty bit cleared. If any writes
			 * happen in the meantime in the local node, they
			 * will land in the local bitmap, which will be synced
			 * by this node eventually
			 */
			if (!mddev_is_clustered(rdev->mddev) ||
			    (err = mddev->cluster_ops->gather_bitmaps(rdev)) == 0) {
				clear_bit(Faulty, &rdev->flags);
				err = add_bound_rdev(rdev);
			}
		} else
			err = -EBUSY;
	} else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) {
		set_bit(ExternalBbl, &rdev->flags);
		rdev->badblocks.shift = 0;
		err = 0;
	} else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) {
		clear_bit(ExternalBbl, &rdev->flags);
		err = 0;
	}
	/* uses the mddev cached on entry rather than rdev->mddev */
	if (need_update_sb)
		md_update_sb(mddev, 1);
	if (!err)
		sysfs_notify_dirent_safe(rdev->sysfs_state);
	return err ? err : len;
}
static struct rdev_sysfs_entry rdev_state =
__ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store);
3236
3237 static ssize_t
errors_show(struct md_rdev * rdev,char * page)3238 errors_show(struct md_rdev *rdev, char *page)
3239 {
3240 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
3241 }
3242
3243 static ssize_t
errors_store(struct md_rdev * rdev,const char * buf,size_t len)3244 errors_store(struct md_rdev *rdev, const char *buf, size_t len)
3245 {
3246 unsigned int n;
3247 int rv;
3248
3249 rv = kstrtouint(buf, 10, &n);
3250 if (rv < 0)
3251 return rv;
3252 atomic_set(&rdev->corrected_errors, n);
3253 return len;
3254 }
3255 static struct rdev_sysfs_entry rdev_errors =
3256 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
3257
3258 static ssize_t
slot_show(struct md_rdev * rdev,char * page)3259 slot_show(struct md_rdev *rdev, char *page)
3260 {
3261 if (test_bit(Journal, &rdev->flags))
3262 return sprintf(page, "journal\n");
3263 else if (rdev->raid_disk < 0)
3264 return sprintf(page, "none\n");
3265 else
3266 return sprintf(page, "%d\n", rdev->raid_disk);
3267 }
3268
3269 static ssize_t
slot_store(struct md_rdev * rdev,const char * buf,size_t len)3270 slot_store(struct md_rdev *rdev, const char *buf, size_t len)
3271 {
3272 int slot;
3273 int err;
3274
3275 if (test_bit(Journal, &rdev->flags))
3276 return -EBUSY;
3277 if (strncmp(buf, "none", 4)==0)
3278 slot = -1;
3279 else {
3280 err = kstrtouint(buf, 10, (unsigned int *)&slot);
3281 if (err < 0)
3282 return err;
3283 if (slot < 0)
3284 /* overflow */
3285 return -ENOSPC;
3286 }
3287 if (rdev->mddev->pers && slot == -1) {
3288 /* Setting 'slot' on an active array requires also
3289 * updating the 'rd%d' link, and communicating
3290 * with the personality with ->hot_*_disk.
3291 * For now we only support removing
3292 * failed/spare devices. This normally happens automatically,
3293 * but not when the metadata is externally managed.
3294 */
3295 if (rdev->raid_disk == -1)
3296 return -EEXIST;
3297 /* personality does all needed checks */
3298 if (rdev->mddev->pers->hot_remove_disk == NULL)
3299 return -EINVAL;
3300 clear_bit(Blocked, &rdev->flags);
3301 remove_and_add_spares(rdev->mddev, rdev);
3302 if (rdev->raid_disk >= 0)
3303 return -EBUSY;
3304 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
3305 } else if (rdev->mddev->pers) {
3306 /* Activating a spare .. or possibly reactivating
3307 * if we ever get bitmaps working here.
3308 */
3309 int err;
3310
3311 if (rdev->raid_disk != -1)
3312 return -EBUSY;
3313
3314 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
3315 return -EBUSY;
3316
3317 if (rdev->mddev->pers->hot_add_disk == NULL)
3318 return -EINVAL;
3319
3320 if (slot >= rdev->mddev->raid_disks &&
3321 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
3322 return -ENOSPC;
3323
3324 rdev->raid_disk = slot;
3325 if (test_bit(In_sync, &rdev->flags))
3326 rdev->saved_raid_disk = slot;
3327 else
3328 rdev->saved_raid_disk = -1;
3329 clear_bit(In_sync, &rdev->flags);
3330 clear_bit(Bitmap_sync, &rdev->flags);
3331 err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev);
3332 if (err) {
3333 rdev->raid_disk = -1;
3334 return err;
3335 } else
3336 sysfs_notify_dirent_safe(rdev->sysfs_state);
3337 /* failure here is OK */;
3338 sysfs_link_rdev(rdev->mddev, rdev);
3339 /* don't wakeup anyone, leave that to userspace. */
3340 } else {
3341 if (slot >= rdev->mddev->raid_disks &&
3342 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
3343 return -ENOSPC;
3344 rdev->raid_disk = slot;
3345 /* assume it is working */
3346 clear_bit(Faulty, &rdev->flags);
3347 clear_bit(WriteMostly, &rdev->flags);
3348 set_bit(In_sync, &rdev->flags);
3349 sysfs_notify_dirent_safe(rdev->sysfs_state);
3350 }
3351 return len;
3352 }
3353
3354 static struct rdev_sysfs_entry rdev_slot =
3355 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
3356
3357 static ssize_t
offset_show(struct md_rdev * rdev,char * page)3358 offset_show(struct md_rdev *rdev, char *page)
3359 {
3360 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
3361 }
3362
3363 static ssize_t
offset_store(struct md_rdev * rdev,const char * buf,size_t len)3364 offset_store(struct md_rdev *rdev, const char *buf, size_t len)
3365 {
3366 unsigned long long offset;
3367 if (kstrtoull(buf, 10, &offset) < 0)
3368 return -EINVAL;
3369 if (rdev->mddev->pers && rdev->raid_disk >= 0)
3370 return -EBUSY;
3371 if (rdev->sectors && rdev->mddev->external)
3372 /* Must set offset before size, so overlap checks
3373 * can be sane */
3374 return -EBUSY;
3375 rdev->data_offset = offset;
3376 rdev->new_data_offset = offset;
3377 return len;
3378 }
3379
3380 static struct rdev_sysfs_entry rdev_offset =
3381 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
3382
new_offset_show(struct md_rdev * rdev,char * page)3383 static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
3384 {
3385 return sprintf(page, "%llu\n",
3386 (unsigned long long)rdev->new_data_offset);
3387 }
3388
new_offset_store(struct md_rdev * rdev,const char * buf,size_t len)3389 static ssize_t new_offset_store(struct md_rdev *rdev,
3390 const char *buf, size_t len)
3391 {
3392 unsigned long long new_offset;
3393 struct mddev *mddev = rdev->mddev;
3394
3395 if (kstrtoull(buf, 10, &new_offset) < 0)
3396 return -EINVAL;
3397
3398 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3399 return -EBUSY;
3400 if (new_offset == rdev->data_offset)
3401 /* reset is always permitted */
3402 ;
3403 else if (new_offset > rdev->data_offset) {
3404 /* must not push array size beyond rdev_sectors */
3405 if (new_offset - rdev->data_offset
3406 + mddev->dev_sectors > rdev->sectors)
3407 return -E2BIG;
3408 }
3409 /* Metadata worries about other space details. */
3410
3411 /* decreasing the offset is inconsistent with a backwards
3412 * reshape.
3413 */
3414 if (new_offset < rdev->data_offset &&
3415 mddev->reshape_backwards)
3416 return -EINVAL;
3417 /* Increasing offset is inconsistent with forwards
3418 * reshape. reshape_direction should be set to
3419 * 'backwards' first.
3420 */
3421 if (new_offset > rdev->data_offset &&
3422 !mddev->reshape_backwards)
3423 return -EINVAL;
3424
3425 if (mddev->pers && mddev->persistent &&
3426 !super_types[mddev->major_version]
3427 .allow_new_offset(rdev, new_offset))
3428 return -E2BIG;
3429 rdev->new_data_offset = new_offset;
3430 if (new_offset > rdev->data_offset)
3431 mddev->reshape_backwards = 1;
3432 else if (new_offset < rdev->data_offset)
3433 mddev->reshape_backwards = 0;
3434
3435 return len;
3436 }
3437 static struct rdev_sysfs_entry rdev_new_offset =
3438 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
3439
3440 static ssize_t
rdev_size_show(struct md_rdev * rdev,char * page)3441 rdev_size_show(struct md_rdev *rdev, char *page)
3442 {
3443 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
3444 }
3445
md_rdevs_overlap(struct md_rdev * a,struct md_rdev * b)3446 static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b)
3447 {
3448 /* check if two start/length pairs overlap */
3449 if (a->data_offset + a->sectors <= b->data_offset)
3450 return false;
3451 if (b->data_offset + b->sectors <= a->data_offset)
3452 return false;
3453 return true;
3454 }
3455
md_rdev_overlaps(struct md_rdev * rdev)3456 static bool md_rdev_overlaps(struct md_rdev *rdev)
3457 {
3458 struct mddev *mddev;
3459 struct md_rdev *rdev2;
3460
3461 spin_lock(&all_mddevs_lock);
3462 list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
3463 if (test_bit(MD_DELETED, &mddev->flags))
3464 continue;
3465 rdev_for_each(rdev2, mddev) {
3466 if (rdev != rdev2 && rdev->bdev == rdev2->bdev &&
3467 md_rdevs_overlap(rdev, rdev2)) {
3468 spin_unlock(&all_mddevs_lock);
3469 return true;
3470 }
3471 }
3472 }
3473 spin_unlock(&all_mddevs_lock);
3474 return false;
3475 }
3476
/*
 * Parse a decimal block count from @buf and store twice that value in
 * *@sectors.  Returns 0 on success, or -EINVAL if the text does not parse
 * or the doubled value does not fit.
 */
static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
{
	unsigned long long blocks;
	unsigned long long doubled;
	sector_t converted;

	if (kstrtoull(buf, 10, &blocks) < 0)
		return -EINVAL;

	/* top bit set: doubling would overflow */
	if (blocks >> (8 * sizeof(blocks) - 1))
		return -EINVAL; /* sector conversion overflow */

	doubled = blocks << 1;
	converted = doubled;
	if (converted != doubled)
		return -EINVAL; /* unsigned long long to sector_t overflow */

	*sectors = converted;
	return 0;
}
3495
3496 static ssize_t
rdev_size_store(struct md_rdev * rdev,const char * buf,size_t len)3497 rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3498 {
3499 struct mddev *my_mddev = rdev->mddev;
3500 sector_t oldsectors = rdev->sectors;
3501 sector_t sectors;
3502
3503 if (test_bit(Journal, &rdev->flags))
3504 return -EBUSY;
3505 if (strict_blocks_to_sectors(buf, §ors) < 0)
3506 return -EINVAL;
3507 if (rdev->data_offset != rdev->new_data_offset)
3508 return -EINVAL; /* too confusing */
3509 if (my_mddev->pers && rdev->raid_disk >= 0) {
3510 if (my_mddev->persistent) {
3511 sectors = super_types[my_mddev->major_version].
3512 rdev_size_change(rdev, sectors);
3513 if (!sectors)
3514 return -EBUSY;
3515 } else if (!sectors)
3516 sectors = bdev_nr_sectors(rdev->bdev) -
3517 rdev->data_offset;
3518 if (!my_mddev->pers->resize)
3519 /* Cannot change size for RAID0 or Linear etc */
3520 return -EINVAL;
3521 }
3522 if (sectors < my_mddev->dev_sectors)
3523 return -EINVAL; /* component must fit device */
3524
3525 rdev->sectors = sectors;
3526
3527 /*
3528 * Check that all other rdevs with the same bdev do not overlap. This
3529 * check does not provide a hard guarantee, it just helps avoid
3530 * dangerous mistakes.
3531 */
3532 if (sectors > oldsectors && my_mddev->external &&
3533 md_rdev_overlaps(rdev)) {
3534 /*
3535 * Someone else could have slipped in a size change here, but
3536 * doing so is just silly. We put oldsectors back because we
3537 * know it is safe, and trust userspace not to race with itself.
3538 */
3539 rdev->sectors = oldsectors;
3540 return -EBUSY;
3541 }
3542 return len;
3543 }
3544
3545 static struct rdev_sysfs_entry rdev_size =
3546 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
3547
recovery_start_show(struct md_rdev * rdev,char * page)3548 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page)
3549 {
3550 unsigned long long recovery_start = rdev->recovery_offset;
3551
3552 if (test_bit(In_sync, &rdev->flags) ||
3553 recovery_start == MaxSector)
3554 return sprintf(page, "none\n");
3555
3556 return sprintf(page, "%llu\n", recovery_start);
3557 }
3558
recovery_start_store(struct md_rdev * rdev,const char * buf,size_t len)3559 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len)
3560 {
3561 unsigned long long recovery_start;
3562
3563 if (cmd_match(buf, "none"))
3564 recovery_start = MaxSector;
3565 else if (kstrtoull(buf, 10, &recovery_start))
3566 return -EINVAL;
3567
3568 if (rdev->mddev->pers &&
3569 rdev->raid_disk >= 0)
3570 return -EBUSY;
3571
3572 rdev->recovery_offset = recovery_start;
3573 if (recovery_start == MaxSector)
3574 set_bit(In_sync, &rdev->flags);
3575 else
3576 clear_bit(In_sync, &rdev->flags);
3577 return len;
3578 }
3579
3580 static struct rdev_sysfs_entry rdev_recovery_start =
3581 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
3582
/* sysfs access to bad-blocks list.
 * We present two files.
 * 'bad_blocks' lists sector numbers and lengths of ranges that
 *    are recorded as bad.  The list is truncated to fit within
 *    the one-page limit of sysfs.
 *    Writing "sector length" to this file adds an acknowledged
 *    bad-block range to the list.
 * 'unacknowledged_bad_blocks' lists bad blocks that have not yet
 *    been acknowledged.  Writing to this file adds bad blocks
 *    without acknowledging them.  This is largely for testing.
 */
bb_show(struct md_rdev * rdev,char * page)3594 static ssize_t bb_show(struct md_rdev *rdev, char *page)
3595 {
3596 return badblocks_show(&rdev->badblocks, page, 0);
3597 }
bb_store(struct md_rdev * rdev,const char * page,size_t len)3598 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len)
3599 {
3600 int rv = badblocks_store(&rdev->badblocks, page, len, 0);
3601 /* Maybe that ack was all we needed */
3602 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
3603 wake_up(&rdev->blocked_wait);
3604 return rv;
3605 }
3606 static struct rdev_sysfs_entry rdev_bad_blocks =
3607 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
3608
ubb_show(struct md_rdev * rdev,char * page)3609 static ssize_t ubb_show(struct md_rdev *rdev, char *page)
3610 {
3611 return badblocks_show(&rdev->badblocks, page, 1);
3612 }
ubb_store(struct md_rdev * rdev,const char * page,size_t len)3613 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
3614 {
3615 return badblocks_store(&rdev->badblocks, page, len, 1);
3616 }
3617 static struct rdev_sysfs_entry rdev_unack_bad_blocks =
3618 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
3619
3620 static ssize_t
ppl_sector_show(struct md_rdev * rdev,char * page)3621 ppl_sector_show(struct md_rdev *rdev, char *page)
3622 {
3623 return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector);
3624 }
3625
3626 static ssize_t
ppl_sector_store(struct md_rdev * rdev,const char * buf,size_t len)3627 ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len)
3628 {
3629 unsigned long long sector;
3630
3631 if (kstrtoull(buf, 10, §or) < 0)
3632 return -EINVAL;
3633 if (sector != (sector_t)sector)
3634 return -EINVAL;
3635
3636 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3637 rdev->raid_disk >= 0)
3638 return -EBUSY;
3639
3640 if (rdev->mddev->persistent) {
3641 if (rdev->mddev->major_version == 0)
3642 return -EINVAL;
3643 if ((sector > rdev->sb_start &&
3644 sector - rdev->sb_start > S16_MAX) ||
3645 (sector < rdev->sb_start &&
3646 rdev->sb_start - sector > -S16_MIN))
3647 return -EINVAL;
3648 rdev->ppl.offset = sector - rdev->sb_start;
3649 } else if (!rdev->mddev->external) {
3650 return -EBUSY;
3651 }
3652 rdev->ppl.sector = sector;
3653 return len;
3654 }
3655
3656 static struct rdev_sysfs_entry rdev_ppl_sector =
3657 __ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store);
3658
3659 static ssize_t
ppl_size_show(struct md_rdev * rdev,char * page)3660 ppl_size_show(struct md_rdev *rdev, char *page)
3661 {
3662 return sprintf(page, "%u\n", rdev->ppl.size);
3663 }
3664
3665 static ssize_t
ppl_size_store(struct md_rdev * rdev,const char * buf,size_t len)3666 ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3667 {
3668 unsigned int size;
3669
3670 if (kstrtouint(buf, 10, &size) < 0)
3671 return -EINVAL;
3672
3673 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3674 rdev->raid_disk >= 0)
3675 return -EBUSY;
3676
3677 if (rdev->mddev->persistent) {
3678 if (rdev->mddev->major_version == 0)
3679 return -EINVAL;
3680 if (size > U16_MAX)
3681 return -EINVAL;
3682 } else if (!rdev->mddev->external) {
3683 return -EBUSY;
3684 }
3685 rdev->ppl.size = size;
3686 return len;
3687 }
3688
3689 static struct rdev_sysfs_entry rdev_ppl_size =
3690 __ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store);
3691
/*
 * NULL-terminated list of the per-rdev sysfs attributes defined above.
 * ATTRIBUTE_GROUPS() wraps it for use as rdev_ktype's default_groups
 * (referenced there as rdev_default_groups).
 */
static struct attribute *rdev_default_attrs[] = {
	&rdev_state.attr,
	&rdev_errors.attr,
	&rdev_slot.attr,
	&rdev_offset.attr,
	&rdev_new_offset.attr,
	&rdev_size.attr,
	&rdev_recovery_start.attr,
	&rdev_bad_blocks.attr,
	&rdev_unack_bad_blocks.attr,
	&rdev_ppl_sector.attr,
	&rdev_ppl_size.attr,
	NULL,
};
ATTRIBUTE_GROUPS(rdev_default);
3707 static ssize_t
rdev_attr_show(struct kobject * kobj,struct attribute * attr,char * page)3708 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3709 {
3710 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3711 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3712
3713 if (!entry->show)
3714 return -EIO;
3715 if (!rdev->mddev)
3716 return -ENODEV;
3717 return entry->show(rdev, page);
3718 }
3719
3720 static ssize_t
rdev_attr_store(struct kobject * kobj,struct attribute * attr,const char * page,size_t length)3721 rdev_attr_store(struct kobject *kobj, struct attribute *attr,
3722 const char *page, size_t length)
3723 {
3724 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3725 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3726 struct kernfs_node *kn = NULL;
3727 bool suspend = false;
3728 ssize_t rv;
3729 struct mddev *mddev = READ_ONCE(rdev->mddev);
3730
3731 if (!entry->store)
3732 return -EIO;
3733 if (!capable(CAP_SYS_ADMIN))
3734 return -EACCES;
3735 if (!mddev)
3736 return -ENODEV;
3737
3738 if (entry->store == state_store) {
3739 if (cmd_match(page, "remove"))
3740 kn = sysfs_break_active_protection(kobj, attr);
3741 if (cmd_match(page, "remove") || cmd_match(page, "re-add") ||
3742 cmd_match(page, "writemostly") ||
3743 cmd_match(page, "-writemostly"))
3744 suspend = true;
3745 }
3746
3747 rv = suspend ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev);
3748 if (!rv) {
3749 if (rdev->mddev == NULL)
3750 rv = -ENODEV;
3751 else
3752 rv = entry->store(rdev, page, length);
3753 suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev);
3754 }
3755
3756 if (kn)
3757 sysfs_unbreak_active_protection(kn);
3758
3759 return rv;
3760 }
3761
rdev_free(struct kobject * ko)3762 static void rdev_free(struct kobject *ko)
3763 {
3764 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj);
3765 kfree(rdev);
3766 }
/* Route sysfs reads and writes on rdev kobjects through the dispatchers above. */
static const struct sysfs_ops rdev_sysfs_ops = {
	.show		= rdev_attr_show,
	.store		= rdev_attr_store,
};
/*
 * kobject type for rdevs: rdev_free() releases the md_rdev, and the
 * default attribute groups come from rdev_default_attrs.
 */
static const struct kobj_type rdev_ktype = {
	.release	= rdev_free,
	.sysfs_ops	= &rdev_sysfs_ops,
	.default_groups	= rdev_default_groups,
};
3776
md_rdev_init(struct md_rdev * rdev)3777 int md_rdev_init(struct md_rdev *rdev)
3778 {
3779 rdev->desc_nr = -1;
3780 rdev->saved_raid_disk = -1;
3781 rdev->raid_disk = -1;
3782 rdev->flags = 0;
3783 rdev->data_offset = 0;
3784 rdev->new_data_offset = 0;
3785 rdev->sb_events = 0;
3786 rdev->last_read_error = 0;
3787 rdev->sb_loaded = 0;
3788 rdev->bb_page = NULL;
3789 atomic_set(&rdev->nr_pending, 0);
3790 atomic_set(&rdev->read_errors, 0);
3791 atomic_set(&rdev->corrected_errors, 0);
3792
3793 INIT_LIST_HEAD(&rdev->same_set);
3794 init_waitqueue_head(&rdev->blocked_wait);
3795
3796 /* Add space to store bad block list.
3797 * This reserves the space even on arrays where it cannot
3798 * be used - I wonder if that matters
3799 */
3800 return badblocks_init(&rdev->badblocks, 0);
3801 }
3802 EXPORT_SYMBOL_GPL(md_rdev_init);
3803
/*
 * Import a device: allocate and initialise an rdev for @newdev and open
 * the block device.  If 'super_format' >= 0, also load and sanity check
 * the superblock of that format (minor version 'super_minor').
 *
 * The import is refused, and an ERR_PTR() returned, if:
 *
 *   - memory or superblock page allocation fails
 *   - the device cannot be opened
 *   - the device is nonexistent (zero size)
 *   - the device has no valid superblock (only when super_format >= 0)
 *
 * On success the new rdev is returned with its kobject initialised.
 */
static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
{
	struct md_rdev *rdev;
	sector_t size;
	int err;

	rdev = kzalloc_obj(*rdev);
	if (!rdev)
		return ERR_PTR(-ENOMEM);

	err = md_rdev_init(rdev);
	if (err)
		goto out_free_rdev;
	err = alloc_disk_sb(rdev);
	if (err)
		goto out_clear_rdev;

	/* super_format == -2 opens with claim_rdev as holder instead of the rdev */
	rdev->bdev_file = bdev_file_open_by_dev(newdev,
			BLK_OPEN_READ | BLK_OPEN_WRITE,
			super_format == -2 ? &claim_rdev : rdev, NULL);
	if (IS_ERR(rdev->bdev_file)) {
		pr_warn("md: could not open device unknown-block(%u,%u).\n",
			MAJOR(newdev), MINOR(newdev));
		err = PTR_ERR(rdev->bdev_file);
		goto out_clear_rdev;
	}
	rdev->bdev = file_bdev(rdev->bdev_file);

	kobject_init(&rdev->kobj, &rdev_ktype);

	/* device size in units of (1 << BLOCK_SIZE_BITS) bytes */
	size = bdev_nr_bytes(rdev->bdev) >> BLOCK_SIZE_BITS;
	if (!size) {
		pr_warn("md: %pg has zero or unknown size, marking faulty!\n",
			rdev->bdev);
		err = -EINVAL;
		goto out_blkdev_put;
	}

	if (super_format >= 0) {
		err = super_types[super_format].
			load_super(rdev, NULL, super_minor);
		/* -EINVAL gets its own message; any other negative is a read failure */
		if (err == -EINVAL) {
			pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n",
				rdev->bdev,
				super_format, super_minor);
			goto out_blkdev_put;
		}
		if (err < 0) {
			pr_warn("md: could not read %pg's sb, not importing!\n",
				rdev->bdev);
			goto out_blkdev_put;
		}
	}

	return rdev;

	/* unwind in reverse order of acquisition */
out_blkdev_put:
	fput(rdev->bdev_file);
out_clear_rdev:
	md_rdev_clear(rdev);
out_free_rdev:
	kfree(rdev);
	return ERR_PTR(err);
}
3878
/*
 * Check a full RAID array for plausibility.
 *
 * Loads the superblock of every rdev, tracking the 'freshest' one, then
 * validates the array against that freshest superblock and every other
 * rdev against it.  Devices that fail are kicked from the array.
 * Returns 0 on success or -EINVAL if no usable device was found.
 */

static int analyze_sbs(struct mddev *mddev)
{
	struct md_rdev *rdev, *freshest, *tmp;

	freshest = NULL;
	/*
	 * load_super() result: 1 makes this rdev the new 'freshest', 0
	 * leaves it unchanged, anything else gets the device kicked.
	 */
	rdev_for_each_safe(rdev, tmp, mddev)
		switch (super_types[mddev->major_version].
			load_super(rdev, freshest, mddev->minor_version)) {
		case 1:
			freshest = rdev;
			break;
		case 0:
			break;
		default:
			pr_warn("md: fatal superblock inconsistency in %pg -- removing from array\n",
				rdev->bdev);
			md_kick_rdev_from_array(rdev);
		}

	/* Cannot find a valid fresh disk */
	if (!freshest) {
		pr_warn("md: cannot find a valid disk\n");
		return -EINVAL;
	}

	/* validate the freshest device first, with no reference device */
	super_types[mddev->major_version].
		validate_super(mddev, NULL/*freshest*/, freshest);

	rdev_for_each_safe(rdev, tmp, mddev) {
		/* enforce the device limit, when max_disks is non-zero */
		if (mddev->max_disks &&
		    rdev->desc_nr >= mddev->max_disks) {
			pr_warn("md: %s: %pg: only %d devices permitted\n",
				mdname(mddev), rdev->bdev,
				mddev->max_disks);
			md_kick_rdev_from_array(rdev);
			continue;
		}
		if (rdev != freshest) {
			if (super_types[mddev->major_version].
			    validate_super(mddev, freshest, rdev)) {
				pr_warn("md: kicking non-fresh %pg from array!\n",
					rdev->bdev);
				md_kick_rdev_from_array(rdev);
				continue;
			}
		}
		/*
		 * A non-journal device whose slot lies beyond the array
		 * (raid_disks, plus the disks a shrinking reshape is
		 * removing) loses its slot and its In_sync status.
		 */
		if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks)) &&
		    !test_bit(Journal, &rdev->flags)) {
			rdev->raid_disk = -1;
			clear_bit(In_sync, &rdev->flags);
		}
	}

	return 0;
}
3938
/* Read a fixed-point number.
 * Numbers in sysfs attributes should be in "standard" units where
 * possible, so time should be in seconds.
 * However we internally use a much smaller unit such as
 * milliseconds or jiffies.
 * This function takes a decimal number with a possible fractional
 * component, and produces an integer which is the result of
 * multiplying that number by 10^'scale',
 * all without any floating-point arithmetic.
 *
 * Fractional digits beyond 'scale' are ignored.  One trailing newline is
 * accepted; any other trailing character yields -EINVAL.  Returns 0 and
 * stores the value in *res on success.
 */
int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
{
	unsigned long acc = 0;
	long frac_digits = -1;	/* stays negative until a '.' is seen */

	for (; isdigit(*cp) || (*cp == '.' && frac_digits < 0); cp++) {
		if (*cp == '.') {
			frac_digits = 0;
			continue;
		}
		/* digit past the requested precision: skip it */
		if (frac_digits >= scale)
			continue;

		acc = acc * 10 + (unsigned int)(*cp - '0');
		if (frac_digits >= 0)
			frac_digits++;
	}

	if (*cp == '\n')
		cp++;
	if (*cp != '\0')
		return -EINVAL;

	if (frac_digits < 0)
		frac_digits = 0;
	/* pad with zeros for the fractional digits that were not supplied */
	*res = acc * int_pow(10, scale - frac_digits);
	return 0;
}
3974
3975 static ssize_t
safe_delay_show(struct mddev * mddev,char * page)3976 safe_delay_show(struct mddev *mddev, char *page)
3977 {
3978 unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ;
3979
3980 return sprintf(page, "%u.%03u\n", msec/1000, msec%1000);
3981 }
3982 static ssize_t
safe_delay_store(struct mddev * mddev,const char * cbuf,size_t len)3983 safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
3984 {
3985 unsigned long msec;
3986
3987 if (mddev_is_clustered(mddev)) {
3988 pr_warn("md: Safemode is disabled for clustered mode\n");
3989 return -EINVAL;
3990 }
3991
3992 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ)
3993 return -EINVAL;
3994 if (msec == 0)
3995 mddev->safemode_delay = 0;
3996 else {
3997 unsigned long old_delay = mddev->safemode_delay;
3998 unsigned long new_delay = (msec*HZ)/1000;
3999
4000 if (new_delay == 0)
4001 new_delay = 1;
4002 mddev->safemode_delay = new_delay;
4003 if (new_delay < old_delay || old_delay == 0)
4004 mod_timer(&mddev->safemode_timer, jiffies+1);
4005 }
4006 return len;
4007 }
4008 static struct md_sysfs_entry md_safe_delay =
4009 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
4010
4011 static ssize_t
level_show(struct mddev * mddev,char * page)4012 level_show(struct mddev *mddev, char *page)
4013 {
4014 struct md_personality *p;
4015 int ret;
4016 spin_lock(&mddev->lock);
4017 p = mddev->pers;
4018 if (p)
4019 ret = sprintf(page, "%s\n", p->head.name);
4020 else if (mddev->clevel[0])
4021 ret = sprintf(page, "%s\n", mddev->clevel);
4022 else if (mddev->level != LEVEL_NONE)
4023 ret = sprintf(page, "%d\n", mddev->level);
4024 else
4025 ret = 0;
4026 spin_unlock(&mddev->lock);
4027 return ret;
4028 }
4029
/*
 * Store handler for the "level" attribute.
 *
 * With no personality attached (mddev->pers == NULL) the written string
 * is simply recorded in mddev->clevel (one trailing newline stripped) and
 * mddev->level is reset to LEVEL_NONE.
 *
 * With a personality attached this attempts an online personality
 * change: the requested personality is looked up (after asking for its
 * module to be loaded), its ->takeover() method is asked to accept the
 * array, and on success the old personality is swapped for the new one,
 * the redundancy sysfs group and the per-rdev links are adjusted, and the
 * new personality's ->run() is called.
 *
 * Returns @len on success or a negative errno:
 *   -EINVAL  empty or over-long input, unknown personality, or a
 *            personality lacking ->quiesce / ->takeover
 *   -EROFS   the array is not read-write
 *   -EBUSY   recovery is running, a reshape is pending, or sysfs_active
 *   other    whatever mddev_suspend_and_lock() or ->takeover() returned
 */
static ssize_t
level_store(struct mddev *mddev, const char *buf, size_t len)
{
	char clevel[16];
	ssize_t rv;
	size_t slen = len;
	struct md_personality *pers, *oldpers;
	long level;
	void *priv, *oldpriv;
	struct md_rdev *rdev;

	/* the name must fit in clevel[] including its NUL terminator */
	if (slen == 0 || slen >= sizeof(clevel))
		return -EINVAL;

	rv = mddev_suspend_and_lock(mddev);
	if (rv)
		return rv;

	if (mddev->pers == NULL) {
		/* array not running: just remember the requested level name */
		memcpy(mddev->clevel, buf, slen);
		if (mddev->clevel[slen-1] == '\n')
			slen--;
		mddev->clevel[slen] = 0;
		mddev->level = LEVEL_NONE;
		rv = len;
		goto out_unlock;
	}
	rv = -EROFS;
	if (!md_is_rdwr(mddev))
		goto out_unlock;

	/* request to change the personality.  Need to ensure:
	 *  - array is not engaged in resync/recovery/reshape
	 *  - old personality can be suspended
	 *  - new personality will accept the array (via ->takeover).
	 */

	rv = -EBUSY;
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
	    mddev->reshape_position != MaxSector ||
	    mddev->sysfs_active)
		goto out_unlock;

	rv = -EINVAL;
	if (!mddev->pers->quiesce) {
		pr_warn("md: %s: %s does not support online personality change\n",
			mdname(mddev), mddev->pers->head.name);
		goto out_unlock;
	}

	/* Now find the new personality */
	memcpy(clevel, buf, slen);
	if (clevel[slen-1] == '\n')
		slen--;
	clevel[slen] = 0;
	/* a non-numeric name leaves level as LEVEL_NONE; lookup then uses clevel */
	if (kstrtol(clevel, 10, &level))
		level = LEVEL_NONE;

	/* try both module alias spellings before looking the personality up */
	if (request_module("md-%s", clevel) != 0)
		request_module("md-level-%s", clevel);
	pers = get_pers(level, clevel);
	if (!pers) {
		rv = -EINVAL;
		goto out_unlock;
	}

	if (pers == mddev->pers) {
		/* Nothing to do! */
		put_pers(pers);
		rv = len;
		goto out_unlock;
	}
	if (!pers->takeover) {
		put_pers(pers);
		pr_warn("md: %s: %s does not support personality takeover\n",
			mdname(mddev), clevel);
		rv = -EINVAL;
		goto out_unlock;
	}

	/* start from the current slot assignment; ->takeover may change it */
	rdev_for_each(rdev, mddev)
		rdev->new_raid_disk = rdev->raid_disk;

	/* ->takeover must set new_* and/or delta_disks
	 * if it succeeds, and may set them when it fails.
	 */
	priv = pers->takeover(mddev);
	if (IS_ERR(priv)) {
		/* undo whatever ->takeover may have set before failing */
		mddev->new_level = mddev->level;
		mddev->new_layout = mddev->layout;
		mddev->new_chunk_sectors = mddev->chunk_sectors;
		mddev->raid_disks -= mddev->delta_disks;
		mddev->delta_disks = 0;
		mddev->reshape_backwards = 0;
		put_pers(pers);
		pr_warn("md: %s: %s would not accept array\n",
			mdname(mddev), clevel);
		rv = PTR_ERR(priv);
		goto out_unlock;
	}

	/* Looks like we have a winner */
	mddev_detach(mddev);

	/* swap personality and geometry under mddev->lock */
	spin_lock(&mddev->lock);
	oldpers = mddev->pers;
	oldpriv = mddev->private;
	mddev->pers = pers;
	mddev->private = priv;
	strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel));
	mddev->level = mddev->new_level;
	mddev->layout = mddev->new_layout;
	mddev->chunk_sectors = mddev->new_chunk_sectors;
	mddev->delta_disks = 0;
	mddev->reshape_backwards = 0;
	mddev->degraded = 0;
	spin_unlock(&mddev->lock);

	if (oldpers->sync_request == NULL &&
	    mddev->external) {
		/* We are converting from a no-redundancy array
		 * to a redundancy array and metadata is managed
		 * externally so we need to be sure that writes
		 * won't block due to a need to transition
		 *      clean->dirty
		 * until external management is started.
		 */
		mddev->in_sync = 0;
		mddev->safemode_delay = 0;
		mddev->safemode = 0;
	}

	/* release the old personality's private data */
	oldpers->free(mddev, oldpriv);

	if (oldpers->sync_request == NULL &&
	    pers->sync_request != NULL) {
		/* need to add the md_redundancy_group */
		if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
			pr_warn("md: cannot register extra attributes for %s\n",
				mdname(mddev));
		mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
		mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
		mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
	}
	if (oldpers->sync_request != NULL &&
	    pers->sync_request == NULL) {
		/* need to remove the md_redundancy_group */
		if (mddev->to_remove == NULL)
			mddev->to_remove = &md_redundancy_group;
	}

	put_pers(oldpers);

	/*
	 * First pass: drop the sysfs link of every device whose slot
	 * changes; slots beyond the new raid_disks become "no slot" (-1).
	 */
	rdev_for_each(rdev, mddev) {
		if (rdev->raid_disk < 0)
			continue;
		if (rdev->new_raid_disk >= mddev->raid_disks)
			rdev->new_raid_disk = -1;
		if (rdev->new_raid_disk == rdev->raid_disk)
			continue;
		sysfs_unlink_rdev(mddev, rdev);
	}
	/*
	 * Second pass: commit the new slot numbers and re-create the links
	 * for the devices that still have a slot.
	 */
	rdev_for_each(rdev, mddev) {
		if (rdev->raid_disk < 0)
			continue;
		if (rdev->new_raid_disk == rdev->raid_disk)
			continue;
		rdev->raid_disk = rdev->new_raid_disk;
		if (rdev->raid_disk < 0)
			clear_bit(In_sync, &rdev->flags);
		else {
			if (sysfs_link_rdev(mddev, rdev))
				pr_warn("md: cannot register rd%d for %s after level change\n",
					rdev->raid_disk, mdname(mddev));
		}
	}

	if (pers->sync_request == NULL) {
		/* this is now an array without redundancy, so
		 * it must always be in_sync
		 */
		mddev->in_sync = 1;
		timer_delete_sync(&mddev->safemode_timer);
	}
	/* NOTE(review): the return value of ->run() is not checked here */
	pers->run(mddev);
	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
	/* no md thread to do it for us: write the superblock directly */
	if (!mddev->thread)
		md_update_sb(mddev, 1);
	sysfs_notify_dirent_safe(mddev->sysfs_level);
	md_new_event();
	rv = len;
out_unlock:
	mddev_unlock_and_resume(mddev);
	return rv;
}

static struct md_sysfs_entry md_level =
__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
4228
4229 static ssize_t
new_level_show(struct mddev * mddev,char * page)4230 new_level_show(struct mddev *mddev, char *page)
4231 {
4232 return sprintf(page, "%d\n", mddev->new_level);
4233 }
4234
4235 static ssize_t
new_level_store(struct mddev * mddev,const char * buf,size_t len)4236 new_level_store(struct mddev *mddev, const char *buf, size_t len)
4237 {
4238 unsigned int n;
4239 int err;
4240
4241 err = kstrtouint(buf, 10, &n);
4242 if (err < 0)
4243 return err;
4244 err = mddev_lock(mddev);
4245 if (err)
4246 return err;
4247
4248 mddev->new_level = n;
4249 md_update_sb(mddev, 1);
4250
4251 mddev_unlock(mddev);
4252 return len;
4253 }
4254 static struct md_sysfs_entry md_new_level =
4255 __ATTR(new_level, 0664, new_level_show, new_level_store);
4256
4257 static ssize_t
bitmap_type_show(struct mddev * mddev,char * page)4258 bitmap_type_show(struct mddev *mddev, char *page)
4259 {
4260 struct md_submodule_head *head;
4261 unsigned long i;
4262 ssize_t len = 0;
4263
4264 if (mddev->bitmap_id == ID_BITMAP_NONE)
4265 len += sprintf(page + len, "[none] ");
4266 else
4267 len += sprintf(page + len, "none ");
4268
4269 xa_lock(&md_submodule);
4270 xa_for_each(&md_submodule, i, head) {
4271 if (head->type != MD_BITMAP)
4272 continue;
4273
4274 if (mddev->bitmap_id == head->id)
4275 len += sprintf(page + len, "[%s] ", head->name);
4276 else
4277 len += sprintf(page + len, "%s ", head->name);
4278 }
4279 xa_unlock(&md_submodule);
4280
4281 len += sprintf(page + len, "\n");
4282 return len;
4283 }
4284
4285 static ssize_t
bitmap_type_store(struct mddev * mddev,const char * buf,size_t len)4286 bitmap_type_store(struct mddev *mddev, const char *buf, size_t len)
4287 {
4288 struct md_submodule_head *head;
4289 enum md_submodule_id id;
4290 unsigned long i;
4291 int err = 0;
4292
4293 xa_lock(&md_submodule);
4294
4295 if (mddev->bitmap_ops) {
4296 err = -EBUSY;
4297 goto out;
4298 }
4299
4300 if (cmd_match(buf, "none")) {
4301 mddev->bitmap_id = ID_BITMAP_NONE;
4302 goto out;
4303 }
4304
4305 xa_for_each(&md_submodule, i, head) {
4306 if (head->type == MD_BITMAP && cmd_match(buf, head->name)) {
4307 mddev->bitmap_id = head->id;
4308 goto out;
4309 }
4310 }
4311
4312 err = kstrtoint(buf, 10, &id);
4313 if (err)
4314 goto out;
4315
4316 if (id == ID_BITMAP_NONE) {
4317 mddev->bitmap_id = id;
4318 goto out;
4319 }
4320
4321 head = xa_load(&md_submodule, id);
4322 if (head && head->type == MD_BITMAP) {
4323 mddev->bitmap_id = id;
4324 goto out;
4325 }
4326
4327 err = -ENOENT;
4328
4329 out:
4330 xa_unlock(&md_submodule);
4331 return err ? err : len;
4332 }
4333
4334 static struct md_sysfs_entry md_bitmap_type =
4335 __ATTR(bitmap_type, 0664, bitmap_type_show, bitmap_type_store);
4336
4337 static ssize_t
layout_show(struct mddev * mddev,char * page)4338 layout_show(struct mddev *mddev, char *page)
4339 {
4340 /* just a number, not meaningful for all levels */
4341 if (mddev->reshape_position != MaxSector &&
4342 mddev->layout != mddev->new_layout)
4343 return sprintf(page, "%d (%d)\n",
4344 mddev->new_layout, mddev->layout);
4345 return sprintf(page, "%d\n", mddev->layout);
4346 }
4347
4348 static ssize_t
layout_store(struct mddev * mddev,const char * buf,size_t len)4349 layout_store(struct mddev *mddev, const char *buf, size_t len)
4350 {
4351 unsigned int n;
4352 int err;
4353
4354 err = kstrtouint(buf, 10, &n);
4355 if (err < 0)
4356 return err;
4357 err = mddev_lock(mddev);
4358 if (err)
4359 return err;
4360
4361 if (mddev->pers) {
4362 if (mddev->pers->check_reshape == NULL)
4363 err = -EBUSY;
4364 else if (!md_is_rdwr(mddev))
4365 err = -EROFS;
4366 else {
4367 mddev->new_layout = n;
4368 err = mddev->pers->check_reshape(mddev);
4369 if (err)
4370 mddev->new_layout = mddev->layout;
4371 }
4372 } else {
4373 mddev->new_layout = n;
4374 if (mddev->reshape_position == MaxSector)
4375 mddev->layout = n;
4376 }
4377 mddev_unlock(mddev);
4378 return err ?: len;
4379 }
4380 static struct md_sysfs_entry md_layout =
4381 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
4382
4383 static ssize_t
raid_disks_show(struct mddev * mddev,char * page)4384 raid_disks_show(struct mddev *mddev, char *page)
4385 {
4386 if (mddev->raid_disks == 0)
4387 return 0;
4388 if (mddev->reshape_position != MaxSector &&
4389 mddev->delta_disks != 0)
4390 return sprintf(page, "%d (%d)\n", mddev->raid_disks,
4391 mddev->raid_disks - mddev->delta_disks);
4392 return sprintf(page, "%d\n", mddev->raid_disks);
4393 }
4394
4395 static int update_raid_disks(struct mddev *mddev, int raid_disks);
4396
4397 static ssize_t
raid_disks_store(struct mddev * mddev,const char * buf,size_t len)4398 raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
4399 {
4400 unsigned int n;
4401 int err;
4402
4403 err = kstrtouint(buf, 10, &n);
4404 if (err < 0)
4405 return err;
4406
4407 err = mddev_suspend_and_lock(mddev);
4408 if (err)
4409 return err;
4410 if (mddev->pers)
4411 err = update_raid_disks(mddev, n);
4412 else if (mddev->reshape_position != MaxSector) {
4413 struct md_rdev *rdev;
4414 int olddisks = mddev->raid_disks - mddev->delta_disks;
4415
4416 err = -EINVAL;
4417 rdev_for_each(rdev, mddev) {
4418 if (olddisks < n &&
4419 rdev->data_offset < rdev->new_data_offset)
4420 goto out_unlock;
4421 if (olddisks > n &&
4422 rdev->data_offset > rdev->new_data_offset)
4423 goto out_unlock;
4424 }
4425 err = 0;
4426 mddev->delta_disks = n - olddisks;
4427 mddev->raid_disks = n;
4428 mddev->reshape_backwards = (mddev->delta_disks < 0);
4429 } else
4430 mddev->raid_disks = n;
4431 out_unlock:
4432 mddev_unlock_and_resume(mddev);
4433 return err ? err : len;
4434 }
4435 static struct md_sysfs_entry md_raid_disks =
4436 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
4437
4438 static ssize_t
uuid_show(struct mddev * mddev,char * page)4439 uuid_show(struct mddev *mddev, char *page)
4440 {
4441 return sprintf(page, "%pU\n", mddev->uuid);
4442 }
4443 static struct md_sysfs_entry md_uuid =
4444 __ATTR(uuid, S_IRUGO, uuid_show, NULL);
4445
4446 static ssize_t
chunk_size_show(struct mddev * mddev,char * page)4447 chunk_size_show(struct mddev *mddev, char *page)
4448 {
4449 if (mddev->reshape_position != MaxSector &&
4450 mddev->chunk_sectors != mddev->new_chunk_sectors)
4451 return sprintf(page, "%d (%d)\n",
4452 mddev->new_chunk_sectors << 9,
4453 mddev->chunk_sectors << 9);
4454 return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
4455 }
4456
4457 static ssize_t
chunk_size_store(struct mddev * mddev,const char * buf,size_t len)4458 chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
4459 {
4460 unsigned long n;
4461 int err;
4462
4463 err = kstrtoul(buf, 10, &n);
4464 if (err < 0)
4465 return err;
4466
4467 err = mddev_lock(mddev);
4468 if (err)
4469 return err;
4470 if (mddev->pers) {
4471 if (mddev->pers->check_reshape == NULL)
4472 err = -EBUSY;
4473 else if (!md_is_rdwr(mddev))
4474 err = -EROFS;
4475 else {
4476 mddev->new_chunk_sectors = n >> 9;
4477 err = mddev->pers->check_reshape(mddev);
4478 if (err)
4479 mddev->new_chunk_sectors = mddev->chunk_sectors;
4480 }
4481 } else {
4482 mddev->new_chunk_sectors = n >> 9;
4483 if (mddev->reshape_position == MaxSector)
4484 mddev->chunk_sectors = n >> 9;
4485 }
4486 mddev_unlock(mddev);
4487 return err ?: len;
4488 }
4489 static struct md_sysfs_entry md_chunk_size =
4490 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
4491
4492 static ssize_t
resync_start_show(struct mddev * mddev,char * page)4493 resync_start_show(struct mddev *mddev, char *page)
4494 {
4495 if (mddev->resync_offset == MaxSector)
4496 return sprintf(page, "none\n");
4497 return sprintf(page, "%llu\n", (unsigned long long)mddev->resync_offset);
4498 }
4499
4500 static ssize_t
resync_start_store(struct mddev * mddev,const char * buf,size_t len)4501 resync_start_store(struct mddev *mddev, const char *buf, size_t len)
4502 {
4503 unsigned long long n;
4504 int err;
4505
4506 if (cmd_match(buf, "none"))
4507 n = MaxSector;
4508 else {
4509 err = kstrtoull(buf, 10, &n);
4510 if (err < 0)
4511 return err;
4512 if (n != (sector_t)n)
4513 return -EINVAL;
4514 }
4515
4516 err = mddev_lock(mddev);
4517 if (err)
4518 return err;
4519 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
4520 err = -EBUSY;
4521
4522 if (!err) {
4523 mddev->resync_offset = n;
4524 if (mddev->pers)
4525 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
4526 }
4527 mddev_unlock(mddev);
4528 return err ?: len;
4529 }
4530 static struct md_sysfs_entry md_resync_start =
4531 __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR,
4532 resync_start_show, resync_start_store);
4533
4534 /*
4535 * The array state can be:
4536 *
4537 * clear
4538 * No devices, no size, no level
4539 * Equivalent to STOP_ARRAY ioctl
4540 * inactive
4541 * May have some settings, but array is not active
4542 * all IO results in error
4543 * When written, doesn't tear down array, but just stops it
4544 * suspended (not supported yet)
4545 * All IO requests will block. The array can be reconfigured.
4546 * Writing this, if accepted, will block until array is quiescent
4547 * readonly
4548 * no resync can happen. no superblocks get written.
4549 * write requests fail
4550 * read-auto
4551 * like readonly, but behaves like 'clean' on a write request.
4552 *
4553 * clean - no pending writes, but otherwise active.
4554 * When written to inactive array, starts without resync
4555 * If a write request arrives then
4556 * if metadata is known, mark 'dirty' and switch to 'active'.
4557 * if not known, block and switch to write-pending
4558 * If written to an active array that has pending writes, then fails.
4559 * active
4560 * fully active: IO and resync can be happening.
4561 * When written to inactive array, starts with resync
4562 *
4563 * write-pending
4564 * clean, but writes are blocked waiting for 'active' to be written.
4565 *
4566 * active-idle
4567 * like active, but no writes have been seen for a while (100msec).
4568 *
4569 * broken
4570 * Array is failed. It's useful because mounted-arrays aren't stopped
4571 * when array is failed, so this state will at least alert the user that
4572 * something is wrong.
4573 */
/*
 * bad_word is one past the last real state; it is also the index of the
 * NULL terminator in array_states[].
 */
enum array_state {
	clear,
	inactive,
	suspended,
	readonly,
	read_auto,
	clean,
	active,
	write_pending,
	active_idle,
	broken,
	bad_word
};

/* sysfs spelling of each state, indexed by enum array_state */
static char *array_states[] = {
	[clear]		= "clear",
	[inactive]	= "inactive",
	[suspended]	= "suspended",
	[readonly]	= "readonly",
	[read_auto]	= "read-auto",
	[clean]		= "clean",
	[active]	= "active",
	[write_pending]	= "write-pending",
	[active_idle]	= "active-idle",
	[broken]	= "broken",
	[bad_word]	= NULL
};
4579
/*
 * Return the index of the first entry of the NULL-terminated @list that
 * cmd_match() accepts for @word, or the index of the terminator if none
 * matches.
 */
static int match_word(const char *word, char **list)
{
	int idx = 0;

	while (list[idx] && !cmd_match(word, list[idx]))
		idx++;

	return idx;
}
4588
4589 static ssize_t
array_state_show(struct mddev * mddev,char * page)4590 array_state_show(struct mddev *mddev, char *page)
4591 {
4592 enum array_state st = inactive;
4593
4594 if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) {
4595 switch(mddev->ro) {
4596 case MD_RDONLY:
4597 st = readonly;
4598 break;
4599 case MD_AUTO_READ:
4600 st = read_auto;
4601 break;
4602 case MD_RDWR:
4603 spin_lock(&mddev->lock);
4604 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
4605 st = write_pending;
4606 else if (mddev->in_sync)
4607 st = clean;
4608 else if (mddev->safemode)
4609 st = active_idle;
4610 else
4611 st = active;
4612 spin_unlock(&mddev->lock);
4613 }
4614
4615 if (test_bit(MD_BROKEN, &mddev->flags) && st == clean)
4616 st = broken;
4617 } else {
4618 if (list_empty(&mddev->disks) &&
4619 mddev->raid_disks == 0 &&
4620 mddev->dev_sectors == 0)
4621 st = clear;
4622 else
4623 st = inactive;
4624 }
4625 return sprintf(page, "%s\n", array_states[st]);
4626 }
4627
static int do_md_stop(struct mddev *mddev, int ro);
static int md_set_readonly(struct mddev *mddev);
static int restart_array(struct mddev *mddev);

/*
 * Store handler for the "array_state" attribute.
 *
 * The written word is matched against array_states[].  The states
 * "suspended", "write-pending", "active-idle" and "broken", as well as
 * any unknown word, are rejected with -EINVAL.
 *
 * The function then works in three stages:
 *  1. Before any lock is taken: for clear/readonly/inactive/read-auto on
 *     a running read-write array, mddev_set_closing_and_sync_blockdev()
 *     is called and its error, if any, is returned.
 *  2. Toggling between "clean" and "active" on a running array that is
 *     not MD_RDONLY is handled under mddev->lock only.
 *  3. Everything else is done with the mddev locked via mddev_lock().
 *
 * Returns @len on success or a negative errno.
 */
static ssize_t
array_state_store(struct mddev *mddev, const char *buf, size_t len)
{
	int err = 0;
	enum array_state st = match_word(buf, array_states);

	/* No lock dependent actions */
	switch (st) {
	case suspended:		/* not supported yet */
	case write_pending:	/* cannot be set */
	case active_idle:	/* cannot be set */
	case broken:		/* cannot be set */
	case bad_word:
		return -EINVAL;
	case clear:
	case readonly:
	case inactive:
	case read_auto:
		/* nothing to sync unless the array is running read-write */
		if (!mddev->pers || !md_is_rdwr(mddev))
			break;
		/* write sysfs will not open mddev and opener should be 0 */
		err = mddev_set_closing_and_sync_blockdev(mddev, 0);
		if (err)
			return err;
		break;
	default:
		break;
	}

	if (mddev->pers && (st == active || st == clean) &&
	    mddev->ro != MD_RDONLY) {
		/* don't take reconfig_mutex when toggling between
		 * clean and active
		 */
		spin_lock(&mddev->lock);
		if (st == active) {
			restart_array(mddev);
			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
			md_wakeup_thread(mddev->thread);
			wake_up(&mddev->sb_wait);
		} else /* st == clean */ {
			restart_array(mddev);
			if (!set_in_sync(mddev))
				err = -EBUSY;
		}
		if (!err)
			sysfs_notify_dirent_safe(mddev->sysfs_state);
		spin_unlock(&mddev->lock);
		return err ?: len;
	}
	err = mddev_lock(mddev);
	if (err)
		return err;

	switch (st) {
	case inactive:
		/* stop an active array, return 0 otherwise */
		if (mddev->pers)
			err = do_md_stop(mddev, 2);
		break;
	case clear:
		err = do_md_stop(mddev, 0);
		break;
	case readonly:
		if (mddev->pers)
			err = md_set_readonly(mddev);
		else {
			/* not running yet: start the array read-only */
			mddev->ro = MD_RDONLY;
			set_disk_ro(mddev->gendisk, 1);
			err = do_md_run(mddev);
		}
		break;
	case read_auto:
		if (mddev->pers) {
			if (md_is_rdwr(mddev))
				err = md_set_readonly(mddev);
			else if (mddev->ro == MD_RDONLY)
				err = restart_array(mddev);
			if (err == 0) {
				mddev->ro = MD_AUTO_READ;
				set_disk_ro(mddev->gendisk, 0);
			}
		} else {
			mddev->ro = MD_AUTO_READ;
			err = do_md_run(mddev);
		}
		break;
	case clean:
		if (mddev->pers) {
			err = restart_array(mddev);
			if (err)
				break;
			spin_lock(&mddev->lock);
			if (!set_in_sync(mddev))
				err = -EBUSY;
			spin_unlock(&mddev->lock);
		} else
			err = -EINVAL;
		break;
	case active:
		if (mddev->pers) {
			err = restart_array(mddev);
			if (err)
				break;
			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
			wake_up(&mddev->sb_wait);
			err = 0;
		} else {
			/* not running yet: start the array read-write */
			mddev->ro = MD_RDWR;
			set_disk_ro(mddev->gendisk, 0);
			err = do_md_run(mddev);
		}
		break;
	default:
		err = -EINVAL;
		break;
	}

	if (!err) {
		if (mddev->hold_active == UNTIL_IOCTL)
			mddev->hold_active = 0;
		sysfs_notify_dirent_safe(mddev->sysfs_state);
	}
	mddev_unlock(mddev);

	/*
	 * Drop MD_CLOSING again for the states handled here; for "clear"
	 * it is only dropped when the request failed.
	 */
	if (st == readonly || st == read_auto || st == inactive ||
	    (err && st == clear))
		clear_bit(MD_CLOSING, &mddev->flags);

	return err ?: len;
}
static struct md_sysfs_entry md_array_state =
__ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
4765
4766 static ssize_t
max_corrected_read_errors_show(struct mddev * mddev,char * page)4767 max_corrected_read_errors_show(struct mddev *mddev, char *page) {
4768 return sprintf(page, "%d\n",
4769 atomic_read(&mddev->max_corr_read_errors));
4770 }
4771
4772 static ssize_t
max_corrected_read_errors_store(struct mddev * mddev,const char * buf,size_t len)4773 max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
4774 {
4775 unsigned int n;
4776 int rv;
4777
4778 rv = kstrtouint(buf, 10, &n);
4779 if (rv < 0)
4780 return rv;
4781 if (n > INT_MAX)
4782 return -EINVAL;
4783 atomic_set(&mddev->max_corr_read_errors, n);
4784 return len;
4785 }
4786
4787 static struct md_sysfs_entry max_corr_read_errors =
4788 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
4789 max_corrected_read_errors_store);
4790
4791 static ssize_t
null_show(struct mddev * mddev,char * page)4792 null_show(struct mddev *mddev, char *page)
4793 {
4794 return -EINVAL;
4795 }
4796
4797 static ssize_t
new_dev_store(struct mddev * mddev,const char * buf,size_t len)4798 new_dev_store(struct mddev *mddev, const char *buf, size_t len)
4799 {
4800 /* buf must be %d:%d\n? giving major and minor numbers */
4801 /* The new device is added to the array.
4802 * If the array has a persistent superblock, we read the
4803 * superblock to initialise info and check validity.
4804 * Otherwise, only checking done is that in bind_rdev_to_array,
4805 * which mainly checks size.
4806 */
4807 char *e;
4808 int major = simple_strtoul(buf, &e, 10);
4809 int minor;
4810 dev_t dev;
4811 struct md_rdev *rdev;
4812 int err;
4813
4814 if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
4815 return -EINVAL;
4816 minor = simple_strtoul(e+1, &e, 10);
4817 if (*e && *e != '\n')
4818 return -EINVAL;
4819 dev = MKDEV(major, minor);
4820 if (major != MAJOR(dev) ||
4821 minor != MINOR(dev))
4822 return -EOVERFLOW;
4823
4824 err = mddev_suspend_and_lock(mddev);
4825 if (err)
4826 return err;
4827 if (mddev->persistent) {
4828 rdev = md_import_device(dev, mddev->major_version,
4829 mddev->minor_version);
4830 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
4831 struct md_rdev *rdev0
4832 = list_entry(mddev->disks.next,
4833 struct md_rdev, same_set);
4834 err = super_types[mddev->major_version]
4835 .load_super(rdev, rdev0, mddev->minor_version);
4836 if (err < 0)
4837 goto out;
4838 }
4839 } else if (mddev->external)
4840 rdev = md_import_device(dev, -2, -1);
4841 else
4842 rdev = md_import_device(dev, -1, -1);
4843
4844 if (IS_ERR(rdev)) {
4845 mddev_unlock_and_resume(mddev);
4846 return PTR_ERR(rdev);
4847 }
4848 err = bind_rdev_to_array(rdev, mddev);
4849 out:
4850 if (err)
4851 export_rdev(rdev, mddev);
4852 mddev_unlock_and_resume(mddev);
4853 if (!err)
4854 md_new_event();
4855 return err ? err : len;
4856 }
4857
4858 static struct md_sysfs_entry md_new_device =
4859 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
4860
4861 static ssize_t
bitmap_store(struct mddev * mddev,const char * buf,size_t len)4862 bitmap_store(struct mddev *mddev, const char *buf, size_t len)
4863 {
4864 char *end;
4865 unsigned long chunk, end_chunk;
4866 int err;
4867
4868 if (!md_bitmap_enabled(mddev, false))
4869 return len;
4870
4871 err = mddev_lock(mddev);
4872 if (err)
4873 return err;
4874 if (!mddev->bitmap)
4875 goto out;
4876 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
4877 while (*buf) {
4878 chunk = end_chunk = simple_strtoul(buf, &end, 0);
4879 if (buf == end)
4880 break;
4881
4882 if (*end == '-') { /* range */
4883 buf = end + 1;
4884 end_chunk = simple_strtoul(buf, &end, 0);
4885 if (buf == end)
4886 break;
4887 }
4888
4889 if (*end && !isspace(*end))
4890 break;
4891
4892 mddev->bitmap_ops->dirty_bits(mddev, chunk, end_chunk);
4893 buf = skip_spaces(end);
4894 }
4895 mddev->bitmap_ops->unplug(mddev, true); /* flush the bits to disk */
4896 out:
4897 mddev_unlock(mddev);
4898 return len;
4899 }
4900
4901 static struct md_sysfs_entry md_bitmap =
4902 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
4903
4904 static ssize_t
size_show(struct mddev * mddev,char * page)4905 size_show(struct mddev *mddev, char *page)
4906 {
4907 return sprintf(page, "%llu\n",
4908 (unsigned long long)mddev->dev_sectors / 2);
4909 }
4910
4911 static int update_size(struct mddev *mddev, sector_t num_sectors);
4912
4913 static ssize_t
size_store(struct mddev * mddev,const char * buf,size_t len)4914 size_store(struct mddev *mddev, const char *buf, size_t len)
4915 {
4916 /* If array is inactive, we can reduce the component size, but
4917 * not increase it (except from 0).
4918 * If array is active, we can try an on-line resize
4919 */
4920 sector_t sectors;
4921 int err = strict_blocks_to_sectors(buf, §ors);
4922
4923 if (err < 0)
4924 return err;
4925 err = mddev_lock(mddev);
4926 if (err)
4927 return err;
4928 if (mddev->pers) {
4929 err = update_size(mddev, sectors);
4930 if (err == 0)
4931 md_update_sb(mddev, 1);
4932 } else {
4933 if (mddev->dev_sectors == 0 ||
4934 mddev->dev_sectors > sectors)
4935 mddev->dev_sectors = sectors;
4936 else
4937 err = -ENOSPC;
4938 }
4939 mddev_unlock(mddev);
4940 return err ? err : len;
4941 }
4942
4943 static struct md_sysfs_entry md_size =
4944 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
4945
4946 /* Metadata version.
4947 * This is one of
4948 * 'none' for arrays with no metadata (good luck...)
4949 * 'external' for arrays with externally managed metadata,
4950 * or N.M for internally known formats
4951 */
4952 static ssize_t
metadata_show(struct mddev * mddev,char * page)4953 metadata_show(struct mddev *mddev, char *page)
4954 {
4955 if (mddev->persistent)
4956 return sprintf(page, "%d.%d\n",
4957 mddev->major_version, mddev->minor_version);
4958 else if (mddev->external)
4959 return sprintf(page, "external:%s\n", mddev->metadata_type);
4960 else
4961 return sprintf(page, "none\n");
4962 }
4963
4964 static ssize_t
metadata_store(struct mddev * mddev,const char * buf,size_t len)4965 metadata_store(struct mddev *mddev, const char *buf, size_t len)
4966 {
4967 int major, minor;
4968 char *e;
4969 int err;
4970 /* Changing the details of 'external' metadata is
4971 * always permitted. Otherwise there must be
4972 * no devices attached to the array.
4973 */
4974
4975 err = mddev_lock(mddev);
4976 if (err)
4977 return err;
4978 err = -EBUSY;
4979 if (mddev->external && strncmp(buf, "external:", 9) == 0)
4980 ;
4981 else if (!list_empty(&mddev->disks))
4982 goto out_unlock;
4983
4984 err = 0;
4985 if (cmd_match(buf, "none")) {
4986 mddev->persistent = 0;
4987 mddev->external = 0;
4988 mddev->major_version = 0;
4989 mddev->minor_version = 90;
4990 goto out_unlock;
4991 }
4992 if (strncmp(buf, "external:", 9) == 0) {
4993 size_t namelen = len-9;
4994 if (namelen >= sizeof(mddev->metadata_type))
4995 namelen = sizeof(mddev->metadata_type)-1;
4996 memcpy(mddev->metadata_type, buf+9, namelen);
4997 mddev->metadata_type[namelen] = 0;
4998 if (namelen && mddev->metadata_type[namelen-1] == '\n')
4999 mddev->metadata_type[--namelen] = 0;
5000 mddev->persistent = 0;
5001 mddev->external = 1;
5002 mddev->major_version = 0;
5003 mddev->minor_version = 90;
5004 goto out_unlock;
5005 }
5006 major = simple_strtoul(buf, &e, 10);
5007 err = -EINVAL;
5008 if (e==buf || *e != '.')
5009 goto out_unlock;
5010 buf = e+1;
5011 minor = simple_strtoul(buf, &e, 10);
5012 if (e==buf || (*e && *e != '\n') )
5013 goto out_unlock;
5014 err = -ENOENT;
5015 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
5016 goto out_unlock;
5017 mddev->major_version = major;
5018 mddev->minor_version = minor;
5019 mddev->persistent = 1;
5020 mddev->external = 0;
5021 err = 0;
5022 out_unlock:
5023 mddev_unlock(mddev);
5024 return err ?: len;
5025 }
5026
5027 static struct md_sysfs_entry md_metadata =
5028 __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
5029
rdev_needs_recovery(struct md_rdev * rdev,sector_t sectors)5030 static bool rdev_needs_recovery(struct md_rdev *rdev, sector_t sectors)
5031 {
5032 return rdev->raid_disk >= 0 &&
5033 !test_bit(Journal, &rdev->flags) &&
5034 !test_bit(Faulty, &rdev->flags) &&
5035 !test_bit(In_sync, &rdev->flags) &&
5036 rdev->recovery_offset < sectors;
5037 }
5038
md_get_active_sync_action(struct mddev * mddev)5039 static enum sync_action md_get_active_sync_action(struct mddev *mddev)
5040 {
5041 struct md_rdev *rdev;
5042 bool is_recover = false;
5043
5044 if (mddev->resync_offset < MaxSector)
5045 return ACTION_RESYNC;
5046
5047 if (mddev->reshape_position != MaxSector)
5048 return ACTION_RESHAPE;
5049
5050 rcu_read_lock();
5051 rdev_for_each_rcu(rdev, mddev) {
5052 if (rdev_needs_recovery(rdev, MaxSector)) {
5053 is_recover = true;
5054 break;
5055 }
5056 }
5057 rcu_read_unlock();
5058
5059 return is_recover ? ACTION_RECOVER : ACTION_IDLE;
5060 }
5061
md_sync_action(struct mddev * mddev)5062 enum sync_action md_sync_action(struct mddev *mddev)
5063 {
5064 unsigned long recovery = mddev->recovery;
5065 enum sync_action active_action;
5066
5067 /*
5068 * frozen has the highest priority, means running sync_thread will be
5069 * stopped immediately, and no new sync_thread can start.
5070 */
5071 if (test_bit(MD_RECOVERY_FROZEN, &recovery))
5072 return ACTION_FROZEN;
5073
5074 /*
5075 * read-only array can't register sync_thread, and it can only
5076 * add/remove spares.
5077 */
5078 if (!md_is_rdwr(mddev))
5079 return ACTION_IDLE;
5080
5081 /*
5082 * idle means no sync_thread is running, and no new sync_thread is
5083 * requested.
5084 */
5085 if (!test_bit(MD_RECOVERY_RUNNING, &recovery) &&
5086 !test_bit(MD_RECOVERY_NEEDED, &recovery))
5087 return ACTION_IDLE;
5088
5089 /*
5090 * Check if any sync operation (resync/recover/reshape) is
5091 * currently active. This ensures that only one sync operation
5092 * can run at a time. Returns the type of active operation, or
5093 * ACTION_IDLE if none are active.
5094 */
5095 active_action = md_get_active_sync_action(mddev);
5096 if (active_action != ACTION_IDLE)
5097 return active_action;
5098
5099 if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
5100 return ACTION_RESHAPE;
5101
5102 if (test_bit(MD_RECOVERY_RECOVER, &recovery))
5103 return ACTION_RECOVER;
5104
5105 if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
5106 /*
5107 * MD_RECOVERY_CHECK must be paired with
5108 * MD_RECOVERY_REQUESTED.
5109 */
5110 if (test_bit(MD_RECOVERY_CHECK, &recovery))
5111 return ACTION_CHECK;
5112 if (test_bit(MD_RECOVERY_REQUESTED, &recovery))
5113 return ACTION_REPAIR;
5114 return ACTION_RESYNC;
5115 }
5116
5117 /*
5118 * MD_RECOVERY_NEEDED or MD_RECOVERY_RUNNING is set, however, no
5119 * sync_action is specified.
5120 */
5121 return ACTION_IDLE;
5122 }
5123
md_sync_action_by_name(const char * page)5124 enum sync_action md_sync_action_by_name(const char *page)
5125 {
5126 enum sync_action action;
5127
5128 for (action = 0; action < NR_SYNC_ACTIONS; ++action) {
5129 if (cmd_match(page, action_name[action]))
5130 return action;
5131 }
5132
5133 return NR_SYNC_ACTIONS;
5134 }
5135
md_sync_action_name(enum sync_action action)5136 const char *md_sync_action_name(enum sync_action action)
5137 {
5138 return action_name[action];
5139 }
5140
5141 static ssize_t
action_show(struct mddev * mddev,char * page)5142 action_show(struct mddev *mddev, char *page)
5143 {
5144 enum sync_action action = md_sync_action(mddev);
5145
5146 return sprintf(page, "%s\n", md_sync_action_name(action));
5147 }
5148
5149 /**
5150 * stop_sync_thread() - wait for sync_thread to stop if it's running.
5151 * @mddev: the array.
5152 * @locked: if set, reconfig_mutex will still be held after this function
5153 * return; if not set, reconfig_mutex will be released after this
5154 * function return.
5155 */
stop_sync_thread(struct mddev * mddev,bool locked)5156 static void stop_sync_thread(struct mddev *mddev, bool locked)
5157 {
5158 int sync_seq = atomic_read(&mddev->sync_seq);
5159
5160 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
5161 if (!locked)
5162 mddev_unlock(mddev);
5163 return;
5164 }
5165
5166 mddev_unlock(mddev);
5167
5168 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5169 /*
5170 * Thread might be blocked waiting for metadata update which will now
5171 * never happen
5172 */
5173 md_wakeup_thread_directly(&mddev->sync_thread);
5174 if (work_pending(&mddev->sync_work))
5175 flush_work(&mddev->sync_work);
5176
5177 wait_event(resync_wait,
5178 !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
5179 (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery) &&
5180 sync_seq != atomic_read(&mddev->sync_seq)));
5181
5182 if (locked)
5183 mddev_lock_nointr(mddev);
5184 }
5185
md_idle_sync_thread(struct mddev * mddev)5186 void md_idle_sync_thread(struct mddev *mddev)
5187 {
5188 lockdep_assert_held(&mddev->reconfig_mutex);
5189
5190 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5191 stop_sync_thread(mddev, true);
5192 }
5193 EXPORT_SYMBOL_GPL(md_idle_sync_thread);
5194
md_frozen_sync_thread(struct mddev * mddev)5195 void md_frozen_sync_thread(struct mddev *mddev)
5196 {
5197 lockdep_assert_held(&mddev->reconfig_mutex);
5198
5199 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5200 stop_sync_thread(mddev, true);
5201 }
5202 EXPORT_SYMBOL_GPL(md_frozen_sync_thread);
5203
md_unfrozen_sync_thread(struct mddev * mddev)5204 void md_unfrozen_sync_thread(struct mddev *mddev)
5205 {
5206 lockdep_assert_held(&mddev->reconfig_mutex);
5207
5208 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5209 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5210 md_wakeup_thread(mddev->thread);
5211 sysfs_notify_dirent_safe(mddev->sysfs_action);
5212 }
5213 EXPORT_SYMBOL_GPL(md_unfrozen_sync_thread);
5214
mddev_start_reshape(struct mddev * mddev)5215 static int mddev_start_reshape(struct mddev *mddev)
5216 {
5217 int ret;
5218
5219 if (mddev->pers->start_reshape == NULL)
5220 return -EINVAL;
5221
5222 if (mddev->reshape_position == MaxSector ||
5223 mddev->pers->check_reshape == NULL ||
5224 mddev->pers->check_reshape(mddev)) {
5225 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5226 ret = mddev->pers->start_reshape(mddev);
5227 if (ret)
5228 return ret;
5229 } else {
5230 /*
5231 * If reshape is still in progress, and md_check_recovery() can
5232 * continue to reshape, don't restart reshape because data can
5233 * be corrupted for raid456.
5234 */
5235 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5236 }
5237
5238 sysfs_notify_dirent_safe(mddev->sysfs_degraded);
5239 return 0;
5240 }
5241
5242 static ssize_t
action_store(struct mddev * mddev,const char * page,size_t len)5243 action_store(struct mddev *mddev, const char *page, size_t len)
5244 {
5245 int ret;
5246 enum sync_action action;
5247
5248 if (!mddev->pers || !mddev->pers->sync_request)
5249 return -EINVAL;
5250
5251 retry:
5252 if (work_busy(&mddev->sync_work))
5253 flush_work(&mddev->sync_work);
5254
5255 ret = mddev_lock(mddev);
5256 if (ret)
5257 return ret;
5258
5259 if (work_busy(&mddev->sync_work)) {
5260 mddev_unlock(mddev);
5261 goto retry;
5262 }
5263
5264 action = md_sync_action_by_name(page);
5265
5266 /* TODO: mdadm rely on "idle" to start sync_thread. */
5267 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
5268 switch (action) {
5269 case ACTION_FROZEN:
5270 md_frozen_sync_thread(mddev);
5271 ret = len;
5272 goto out;
5273 case ACTION_IDLE:
5274 md_idle_sync_thread(mddev);
5275 break;
5276 case ACTION_RESHAPE:
5277 case ACTION_RECOVER:
5278 case ACTION_CHECK:
5279 case ACTION_REPAIR:
5280 case ACTION_RESYNC:
5281 ret = -EBUSY;
5282 goto out;
5283 default:
5284 ret = -EINVAL;
5285 goto out;
5286 }
5287 } else {
5288 switch (action) {
5289 case ACTION_FROZEN:
5290 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5291 ret = len;
5292 goto out;
5293 case ACTION_RESHAPE:
5294 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5295 ret = mddev_start_reshape(mddev);
5296 if (ret)
5297 goto out;
5298 break;
5299 case ACTION_RECOVER:
5300 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5301 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5302 break;
5303 case ACTION_CHECK:
5304 set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
5305 fallthrough;
5306 case ACTION_REPAIR:
5307 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
5308 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5309 fallthrough;
5310 case ACTION_RESYNC:
5311 case ACTION_IDLE:
5312 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5313 break;
5314 default:
5315 ret = -EINVAL;
5316 goto out;
5317 }
5318 }
5319
5320 if (mddev->ro == MD_AUTO_READ) {
5321 /* A write to sync_action is enough to justify
5322 * canceling read-auto mode
5323 */
5324 mddev->ro = MD_RDWR;
5325 md_wakeup_thread(mddev->sync_thread);
5326 }
5327
5328 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5329 md_wakeup_thread(mddev->thread);
5330 sysfs_notify_dirent_safe(mddev->sysfs_action);
5331 ret = len;
5332
5333 out:
5334 mddev_unlock(mddev);
5335 return ret;
5336 }
5337
5338 static struct md_sysfs_entry md_scan_mode =
5339 __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
5340
5341 static ssize_t
last_sync_action_show(struct mddev * mddev,char * page)5342 last_sync_action_show(struct mddev *mddev, char *page)
5343 {
5344 return sprintf(page, "%s\n",
5345 md_sync_action_name(mddev->last_sync_action));
5346 }
5347
5348 static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action);
5349
5350 static ssize_t
mismatch_cnt_show(struct mddev * mddev,char * page)5351 mismatch_cnt_show(struct mddev *mddev, char *page)
5352 {
5353 return sprintf(page, "%llu\n",
5354 (unsigned long long)
5355 atomic64_read(&mddev->resync_mismatches));
5356 }
5357
5358 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
5359
5360 static ssize_t
sync_min_show(struct mddev * mddev,char * page)5361 sync_min_show(struct mddev *mddev, char *page)
5362 {
5363 return sprintf(page, "%d (%s)\n", speed_min(mddev),
5364 mddev->sync_speed_min ? "local" : "system");
5365 }
5366
5367 static ssize_t
sync_min_store(struct mddev * mddev,const char * buf,size_t len)5368 sync_min_store(struct mddev *mddev, const char *buf, size_t len)
5369 {
5370 unsigned int min;
5371 int rv;
5372
5373 if (strncmp(buf, "system", 6) == 0) {
5374 min = 0;
5375 } else {
5376 rv = kstrtouint(buf, 10, &min);
5377 if (rv < 0)
5378 return rv;
5379 if (min == 0)
5380 return -EINVAL;
5381 }
5382 mddev->sync_speed_min = min;
5383 return len;
5384 }
5385
5386 static struct md_sysfs_entry md_sync_min =
5387 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
5388
5389 static ssize_t
sync_max_show(struct mddev * mddev,char * page)5390 sync_max_show(struct mddev *mddev, char *page)
5391 {
5392 return sprintf(page, "%d (%s)\n", speed_max(mddev),
5393 mddev->sync_speed_max ? "local" : "system");
5394 }
5395
5396 static ssize_t
sync_max_store(struct mddev * mddev,const char * buf,size_t len)5397 sync_max_store(struct mddev *mddev, const char *buf, size_t len)
5398 {
5399 unsigned int max;
5400 int rv;
5401
5402 if (strncmp(buf, "system", 6) == 0) {
5403 max = 0;
5404 } else {
5405 rv = kstrtouint(buf, 10, &max);
5406 if (rv < 0)
5407 return rv;
5408 if (max == 0)
5409 return -EINVAL;
5410 }
5411 mddev->sync_speed_max = max;
5412 return len;
5413 }
5414
5415 static struct md_sysfs_entry md_sync_max =
5416 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
5417
5418 static ssize_t
sync_io_depth_show(struct mddev * mddev,char * page)5419 sync_io_depth_show(struct mddev *mddev, char *page)
5420 {
5421 return sprintf(page, "%d (%s)\n", sync_io_depth(mddev),
5422 mddev->sync_io_depth ? "local" : "system");
5423 }
5424
5425 static ssize_t
sync_io_depth_store(struct mddev * mddev,const char * buf,size_t len)5426 sync_io_depth_store(struct mddev *mddev, const char *buf, size_t len)
5427 {
5428 unsigned int max;
5429 int rv;
5430
5431 if (strncmp(buf, "system", 6) == 0) {
5432 max = 0;
5433 } else {
5434 rv = kstrtouint(buf, 10, &max);
5435 if (rv < 0)
5436 return rv;
5437 if (max == 0)
5438 return -EINVAL;
5439 }
5440 mddev->sync_io_depth = max;
5441 return len;
5442 }
5443
5444 static struct md_sysfs_entry md_sync_io_depth =
5445 __ATTR_RW(sync_io_depth);
5446
5447 static ssize_t
degraded_show(struct mddev * mddev,char * page)5448 degraded_show(struct mddev *mddev, char *page)
5449 {
5450 return sprintf(page, "%d\n", mddev->degraded);
5451 }
5452 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
5453
5454 static ssize_t
sync_force_parallel_show(struct mddev * mddev,char * page)5455 sync_force_parallel_show(struct mddev *mddev, char *page)
5456 {
5457 return sprintf(page, "%d\n", mddev->parallel_resync);
5458 }
5459
5460 static ssize_t
sync_force_parallel_store(struct mddev * mddev,const char * buf,size_t len)5461 sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
5462 {
5463 long n;
5464
5465 if (kstrtol(buf, 10, &n))
5466 return -EINVAL;
5467
5468 if (n != 0 && n != 1)
5469 return -EINVAL;
5470
5471 mddev->parallel_resync = n;
5472
5473 if (mddev->sync_thread)
5474 wake_up(&resync_wait);
5475
5476 return len;
5477 }
5478
5479 /* force parallel resync, even with shared block devices */
5480 static struct md_sysfs_entry md_sync_force_parallel =
5481 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
5482 sync_force_parallel_show, sync_force_parallel_store);
5483
5484 static ssize_t
sync_speed_show(struct mddev * mddev,char * page)5485 sync_speed_show(struct mddev *mddev, char *page)
5486 {
5487 unsigned long resync, dt, db;
5488 if (mddev->curr_resync == MD_RESYNC_NONE)
5489 return sprintf(page, "none\n");
5490 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
5491 dt = (jiffies - mddev->resync_mark) / HZ;
5492 if (!dt) dt++;
5493 db = resync - mddev->resync_mark_cnt;
5494 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
5495 }
5496
5497 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
5498
5499 static ssize_t
sync_completed_show(struct mddev * mddev,char * page)5500 sync_completed_show(struct mddev *mddev, char *page)
5501 {
5502 unsigned long long max_sectors, resync;
5503
5504 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5505 return sprintf(page, "none\n");
5506
5507 if (mddev->curr_resync == MD_RESYNC_YIELDED ||
5508 mddev->curr_resync == MD_RESYNC_DELAYED)
5509 return sprintf(page, "delayed\n");
5510
5511 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
5512 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
5513 max_sectors = mddev->resync_max_sectors;
5514 else
5515 max_sectors = mddev->dev_sectors;
5516
5517 resync = mddev->curr_resync_completed;
5518 return sprintf(page, "%llu / %llu\n", resync, max_sectors);
5519 }
5520
5521 static struct md_sysfs_entry md_sync_completed =
5522 __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL);
5523
5524 static ssize_t
min_sync_show(struct mddev * mddev,char * page)5525 min_sync_show(struct mddev *mddev, char *page)
5526 {
5527 return sprintf(page, "%llu\n",
5528 (unsigned long long)mddev->resync_min);
5529 }
5530 static ssize_t
min_sync_store(struct mddev * mddev,const char * buf,size_t len)5531 min_sync_store(struct mddev *mddev, const char *buf, size_t len)
5532 {
5533 unsigned long long min;
5534 int err;
5535
5536 if (kstrtoull(buf, 10, &min))
5537 return -EINVAL;
5538
5539 spin_lock(&mddev->lock);
5540 err = -EINVAL;
5541 if (min > mddev->resync_max)
5542 goto out_unlock;
5543
5544 err = -EBUSY;
5545 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5546 goto out_unlock;
5547
5548 /* Round down to multiple of 4K for safety */
5549 mddev->resync_min = round_down(min, 8);
5550 err = 0;
5551
5552 out_unlock:
5553 spin_unlock(&mddev->lock);
5554 return err ?: len;
5555 }
5556
5557 static struct md_sysfs_entry md_min_sync =
5558 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
5559
5560 static ssize_t
max_sync_show(struct mddev * mddev,char * page)5561 max_sync_show(struct mddev *mddev, char *page)
5562 {
5563 if (mddev->resync_max == MaxSector)
5564 return sprintf(page, "max\n");
5565 else
5566 return sprintf(page, "%llu\n",
5567 (unsigned long long)mddev->resync_max);
5568 }
5569 static ssize_t
max_sync_store(struct mddev * mddev,const char * buf,size_t len)5570 max_sync_store(struct mddev *mddev, const char *buf, size_t len)
5571 {
5572 int err;
5573 spin_lock(&mddev->lock);
5574 if (strncmp(buf, "max", 3) == 0)
5575 mddev->resync_max = MaxSector;
5576 else {
5577 unsigned long long max;
5578 int chunk;
5579
5580 err = -EINVAL;
5581 if (kstrtoull(buf, 10, &max))
5582 goto out_unlock;
5583 if (max < mddev->resync_min)
5584 goto out_unlock;
5585
5586 err = -EBUSY;
5587 if (max < mddev->resync_max && md_is_rdwr(mddev) &&
5588 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5589 goto out_unlock;
5590
5591 /* Must be a multiple of chunk_size */
5592 chunk = mddev->chunk_sectors;
5593 if (chunk) {
5594 sector_t temp = max;
5595
5596 err = -EINVAL;
5597 if (sector_div(temp, chunk))
5598 goto out_unlock;
5599 }
5600 mddev->resync_max = max;
5601 }
5602 wake_up(&mddev->recovery_wait);
5603 err = 0;
5604 out_unlock:
5605 spin_unlock(&mddev->lock);
5606 return err ?: len;
5607 }
5608
5609 static struct md_sysfs_entry md_max_sync =
5610 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
5611
5612 static ssize_t
suspend_lo_show(struct mddev * mddev,char * page)5613 suspend_lo_show(struct mddev *mddev, char *page)
5614 {
5615 return sprintf(page, "%llu\n",
5616 (unsigned long long)READ_ONCE(mddev->suspend_lo));
5617 }
5618
5619 static ssize_t
suspend_lo_store(struct mddev * mddev,const char * buf,size_t len)5620 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
5621 {
5622 unsigned long long new;
5623 int err;
5624
5625 err = kstrtoull(buf, 10, &new);
5626 if (err < 0)
5627 return err;
5628 if (new != (sector_t)new)
5629 return -EINVAL;
5630
5631 err = mddev_suspend(mddev, true);
5632 if (err)
5633 return err;
5634
5635 WRITE_ONCE(mddev->suspend_lo, new);
5636 mddev_resume(mddev);
5637
5638 return len;
5639 }
5640 static struct md_sysfs_entry md_suspend_lo =
5641 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
5642
5643 static ssize_t
suspend_hi_show(struct mddev * mddev,char * page)5644 suspend_hi_show(struct mddev *mddev, char *page)
5645 {
5646 return sprintf(page, "%llu\n",
5647 (unsigned long long)READ_ONCE(mddev->suspend_hi));
5648 }
5649
5650 static ssize_t
suspend_hi_store(struct mddev * mddev,const char * buf,size_t len)5651 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
5652 {
5653 unsigned long long new;
5654 int err;
5655
5656 err = kstrtoull(buf, 10, &new);
5657 if (err < 0)
5658 return err;
5659 if (new != (sector_t)new)
5660 return -EINVAL;
5661
5662 err = mddev_suspend(mddev, true);
5663 if (err)
5664 return err;
5665
5666 WRITE_ONCE(mddev->suspend_hi, new);
5667 mddev_resume(mddev);
5668
5669 return len;
5670 }
5671 static struct md_sysfs_entry md_suspend_hi =
5672 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
5673
5674 static ssize_t
reshape_position_show(struct mddev * mddev,char * page)5675 reshape_position_show(struct mddev *mddev, char *page)
5676 {
5677 if (mddev->reshape_position != MaxSector)
5678 return sprintf(page, "%llu\n",
5679 (unsigned long long)mddev->reshape_position);
5680 strcpy(page, "none\n");
5681 return 5;
5682 }
5683
5684 static ssize_t
reshape_position_store(struct mddev * mddev,const char * buf,size_t len)5685 reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
5686 {
5687 struct md_rdev *rdev;
5688 unsigned long long new;
5689 int err;
5690
5691 err = kstrtoull(buf, 10, &new);
5692 if (err < 0)
5693 return err;
5694 if (new != (sector_t)new)
5695 return -EINVAL;
5696 err = mddev_lock(mddev);
5697 if (err)
5698 return err;
5699 err = -EBUSY;
5700 if (mddev->pers)
5701 goto unlock;
5702 mddev->reshape_position = new;
5703 mddev->delta_disks = 0;
5704 mddev->reshape_backwards = 0;
5705 mddev->new_level = mddev->level;
5706 mddev->new_layout = mddev->layout;
5707 mddev->new_chunk_sectors = mddev->chunk_sectors;
5708 rdev_for_each(rdev, mddev)
5709 rdev->new_data_offset = rdev->data_offset;
5710 err = 0;
5711 unlock:
5712 mddev_unlock(mddev);
5713 return err ?: len;
5714 }
5715
5716 static struct md_sysfs_entry md_reshape_position =
5717 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
5718 reshape_position_store);
5719
5720 static ssize_t
reshape_direction_show(struct mddev * mddev,char * page)5721 reshape_direction_show(struct mddev *mddev, char *page)
5722 {
5723 return sprintf(page, "%s\n",
5724 mddev->reshape_backwards ? "backwards" : "forwards");
5725 }
5726
5727 static ssize_t
reshape_direction_store(struct mddev * mddev,const char * buf,size_t len)5728 reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
5729 {
5730 int backwards = 0;
5731 int err;
5732
5733 if (cmd_match(buf, "forwards"))
5734 backwards = 0;
5735 else if (cmd_match(buf, "backwards"))
5736 backwards = 1;
5737 else
5738 return -EINVAL;
5739 if (mddev->reshape_backwards == backwards)
5740 return len;
5741
5742 err = mddev_lock(mddev);
5743 if (err)
5744 return err;
5745 /* check if we are allowed to change */
5746 if (mddev->delta_disks)
5747 err = -EBUSY;
5748 else if (mddev->persistent &&
5749 mddev->major_version == 0)
5750 err = -EINVAL;
5751 else
5752 mddev->reshape_backwards = backwards;
5753 mddev_unlock(mddev);
5754 return err ?: len;
5755 }
5756
5757 static struct md_sysfs_entry md_reshape_direction =
5758 __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
5759 reshape_direction_store);
5760
5761 static ssize_t
array_size_show(struct mddev * mddev,char * page)5762 array_size_show(struct mddev *mddev, char *page)
5763 {
5764 if (mddev->external_size)
5765 return sprintf(page, "%llu\n",
5766 (unsigned long long)mddev->array_sectors/2);
5767 else
5768 return sprintf(page, "default\n");
5769 }
5770
5771 static ssize_t
array_size_store(struct mddev * mddev,const char * buf,size_t len)5772 array_size_store(struct mddev *mddev, const char *buf, size_t len)
5773 {
5774 sector_t sectors;
5775 int err;
5776
5777 err = mddev_lock(mddev);
5778 if (err)
5779 return err;
5780
5781 /* cluster raid doesn't support change array_sectors */
5782 if (mddev_is_clustered(mddev)) {
5783 mddev_unlock(mddev);
5784 return -EINVAL;
5785 }
5786
5787 if (strncmp(buf, "default", 7) == 0) {
5788 if (mddev->pers)
5789 sectors = mddev->pers->size(mddev, 0, 0);
5790 else
5791 sectors = mddev->array_sectors;
5792
5793 mddev->external_size = 0;
5794 } else {
5795 if (strict_blocks_to_sectors(buf, §ors) < 0)
5796 err = -EINVAL;
5797 else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
5798 err = -E2BIG;
5799 else
5800 mddev->external_size = 1;
5801 }
5802
5803 if (!err) {
5804 mddev->array_sectors = sectors;
5805 if (mddev->pers)
5806 set_capacity_and_notify(mddev->gendisk,
5807 mddev->array_sectors);
5808 }
5809 mddev_unlock(mddev);
5810 return err ?: len;
5811 }
5812
5813 static struct md_sysfs_entry md_array_size =
5814 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
5815 array_size_store);
5816
5817 static ssize_t
consistency_policy_show(struct mddev * mddev,char * page)5818 consistency_policy_show(struct mddev *mddev, char *page)
5819 {
5820 int ret;
5821
5822 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
5823 ret = sprintf(page, "journal\n");
5824 } else if (test_bit(MD_HAS_PPL, &mddev->flags)) {
5825 ret = sprintf(page, "ppl\n");
5826 } else if (mddev->bitmap) {
5827 ret = sprintf(page, "bitmap\n");
5828 } else if (mddev->pers) {
5829 if (mddev->pers->sync_request)
5830 ret = sprintf(page, "resync\n");
5831 else
5832 ret = sprintf(page, "none\n");
5833 } else {
5834 ret = sprintf(page, "unknown\n");
5835 }
5836
5837 return ret;
5838 }
5839
5840 static ssize_t
consistency_policy_store(struct mddev * mddev,const char * buf,size_t len)5841 consistency_policy_store(struct mddev *mddev, const char *buf, size_t len)
5842 {
5843 int err = 0;
5844
5845 if (mddev->pers) {
5846 if (mddev->pers->change_consistency_policy)
5847 err = mddev->pers->change_consistency_policy(mddev, buf);
5848 else
5849 err = -EBUSY;
5850 } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) {
5851 set_bit(MD_HAS_PPL, &mddev->flags);
5852 } else {
5853 err = -EINVAL;
5854 }
5855
5856 return err ? err : len;
5857 }
5858
5859 static struct md_sysfs_entry md_consistency_policy =
5860 __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
5861 consistency_policy_store);
5862
fail_last_dev_show(struct mddev * mddev,char * page)5863 static ssize_t fail_last_dev_show(struct mddev *mddev, char *page)
5864 {
5865 return sprintf(page, "%d\n", test_bit(MD_FAILLAST_DEV, &mddev->flags));
5866 }
5867
5868 /*
5869 * Setting MD_FAILLAST_DEV to allow last device to be forcibly removed
5870 * from RAID1/RAID10.
5871 */
5872 static ssize_t
fail_last_dev_store(struct mddev * mddev,const char * buf,size_t len)5873 fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len)
5874 {
5875 int ret;
5876 bool value;
5877
5878 ret = kstrtobool(buf, &value);
5879 if (ret)
5880 return ret;
5881
5882 if (value)
5883 set_bit(MD_FAILLAST_DEV, &mddev->flags);
5884 else
5885 clear_bit(MD_FAILLAST_DEV, &mddev->flags);
5886
5887 return len;
5888 }
5889 static struct md_sysfs_entry md_fail_last_dev =
5890 __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show,
5891 fail_last_dev_store);
5892
serialize_policy_show(struct mddev * mddev,char * page)5893 static ssize_t serialize_policy_show(struct mddev *mddev, char *page)
5894 {
5895 if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1))
5896 return sprintf(page, "n/a\n");
5897 else
5898 return sprintf(page, "%d\n",
5899 test_bit(MD_SERIALIZE_POLICY, &mddev->flags));
5900 }
5901
5902 /*
5903 * Setting MD_SERIALIZE_POLICY enforce write IO is not reordered
5904 * for raid1.
5905 */
5906 static ssize_t
serialize_policy_store(struct mddev * mddev,const char * buf,size_t len)5907 serialize_policy_store(struct mddev *mddev, const char *buf, size_t len)
5908 {
5909 int err;
5910 bool value;
5911
5912 err = kstrtobool(buf, &value);
5913 if (err)
5914 return err;
5915
5916 if (value == test_bit(MD_SERIALIZE_POLICY, &mddev->flags))
5917 return len;
5918
5919 err = mddev_suspend_and_lock(mddev);
5920 if (err)
5921 return err;
5922 if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) {
5923 pr_err("md: serialize_policy is only effective for raid1\n");
5924 err = -EINVAL;
5925 goto unlock;
5926 }
5927
5928 if (value) {
5929 mddev_create_serial_pool(mddev, NULL);
5930 set_bit(MD_SERIALIZE_POLICY, &mddev->flags);
5931 } else {
5932 mddev_destroy_serial_pool(mddev, NULL);
5933 clear_bit(MD_SERIALIZE_POLICY, &mddev->flags);
5934 }
5935 unlock:
5936 mddev_unlock_and_resume(mddev);
5937 return err ?: len;
5938 }
5939
5940 static struct md_sysfs_entry md_serialize_policy =
5941 __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show,
5942 serialize_policy_store);
5943
mddev_set_logical_block_size(struct mddev * mddev,unsigned int lbs)5944 static int mddev_set_logical_block_size(struct mddev *mddev,
5945 unsigned int lbs)
5946 {
5947 int err = 0;
5948 struct queue_limits lim;
5949
5950 if (queue_logical_block_size(mddev->gendisk->queue) >= lbs) {
5951 pr_err("%s: Cannot set LBS smaller than mddev LBS %u\n",
5952 mdname(mddev), lbs);
5953 return -EINVAL;
5954 }
5955
5956 lim = queue_limits_start_update(mddev->gendisk->queue);
5957 lim.logical_block_size = lbs;
5958 pr_info("%s: logical_block_size is changed, data may be lost\n",
5959 mdname(mddev));
5960 err = queue_limits_commit_update(mddev->gendisk->queue, &lim);
5961 if (err)
5962 return err;
5963
5964 mddev->logical_block_size = lbs;
5965 /* New lbs will be written to superblock after array is running */
5966 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
5967 return 0;
5968 }
5969
5970 static ssize_t
lbs_show(struct mddev * mddev,char * page)5971 lbs_show(struct mddev *mddev, char *page)
5972 {
5973 return sprintf(page, "%u\n", mddev->logical_block_size);
5974 }
5975
5976 static ssize_t
lbs_store(struct mddev * mddev,const char * buf,size_t len)5977 lbs_store(struct mddev *mddev, const char *buf, size_t len)
5978 {
5979 unsigned int lbs;
5980 int err = -EBUSY;
5981
5982 /* Only 1.x meta supports configurable LBS */
5983 if (mddev->major_version == 0)
5984 return -EINVAL;
5985
5986 err = kstrtouint(buf, 10, &lbs);
5987 if (err < 0)
5988 return -EINVAL;
5989
5990 if (mddev->pers) {
5991 unsigned int curr_lbs;
5992
5993 if (mddev->logical_block_size)
5994 return -EBUSY;
5995 /*
5996 * To fix forward compatibility issues, LBS is not
5997 * configured for arrays from old kernels (<=6.18) by default.
5998 * If the user confirms no rollback to old kernels,
5999 * enable LBS by writing current LBS — to prevent data
6000 * loss from LBS changes.
6001 */
6002 curr_lbs = queue_logical_block_size(mddev->gendisk->queue);
6003 if (lbs != curr_lbs)
6004 return -EINVAL;
6005
6006 mddev->logical_block_size = curr_lbs;
6007 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6008 pr_info("%s: logical block size configured successfully, array will not be assembled in old kernels (<= 6.18)\n",
6009 mdname(mddev));
6010 return len;
6011 }
6012
6013 err = mddev_lock(mddev);
6014 if (err)
6015 goto unlock;
6016
6017 err = mddev_set_logical_block_size(mddev, lbs);
6018
6019 unlock:
6020 mddev_unlock(mddev);
6021 return err ?: len;
6022 }
6023
6024 static struct md_sysfs_entry md_logical_block_size =
6025 __ATTR(logical_block_size, 0644, lbs_show, lbs_store);
6026
/*
 * NULL-terminated list of the attributes that make up md_default_group.
 */
static struct attribute *md_default_attrs[] = {
	&md_level.attr,
	&md_new_level.attr,
	&md_bitmap_type.attr,
	&md_layout.attr,
	&md_raid_disks.attr,
	&md_uuid.attr,
	&md_chunk_size.attr,
	&md_size.attr,
	&md_resync_start.attr,
	&md_metadata.attr,
	&md_new_device.attr,
	&md_safe_delay.attr,
	&md_array_state.attr,
	&md_reshape_position.attr,
	&md_reshape_direction.attr,
	&md_array_size.attr,
	&max_corr_read_errors.attr,
	&md_consistency_policy.attr,
	&md_fail_last_dev.attr,
	&md_serialize_policy.attr,
	&md_logical_block_size.attr,
	NULL,
};
6051
/* Unnamed attribute group wrapping md_default_attrs. */
static const struct attribute_group md_default_group = {
	.attrs = md_default_attrs,
};
6055
/*
 * NULL-terminated list of the sync/resync related attributes that make up
 * md_redundancy_group.
 */
static struct attribute *md_redundancy_attrs[] = {
	&md_scan_mode.attr,
	&md_last_scan_mode.attr,
	&md_mismatches.attr,
	&md_sync_min.attr,
	&md_sync_max.attr,
	&md_sync_io_depth.attr,
	&md_sync_speed.attr,
	&md_sync_force_parallel.attr,
	&md_sync_completed.attr,
	&md_min_sync.attr,
	&md_max_sync.attr,
	&md_suspend_lo.attr,
	&md_suspend_hi.attr,
	&md_bitmap.attr,
	&md_degraded.attr,
	NULL,
};
/*
 * Attribute group wrapping md_redundancy_attrs. The group has no name
 * (.name is NULL). It is not listed in md_attr_groups, so it is
 * presumably registered separately elsewhere - confirm against callers.
 */
static const struct attribute_group md_redundancy_group = {
	.name = NULL,
	.attrs = md_redundancy_attrs,
};
6078
/*
 * NULL-terminated list of attribute groups used as md_ktype's default
 * groups; it contains only md_default_group.
 */
static const struct attribute_group *md_attr_groups[] = {
	&md_default_group,
	NULL,
};
6083
6084 static ssize_t
md_attr_show(struct kobject * kobj,struct attribute * attr,char * page)6085 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
6086 {
6087 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
6088 struct mddev *mddev = container_of(kobj, struct mddev, kobj);
6089 ssize_t rv;
6090
6091 if (!entry->show)
6092 return -EIO;
6093 spin_lock(&all_mddevs_lock);
6094 if (!mddev_get(mddev)) {
6095 spin_unlock(&all_mddevs_lock);
6096 return -EBUSY;
6097 }
6098 spin_unlock(&all_mddevs_lock);
6099
6100 rv = entry->show(mddev, page);
6101 mddev_put(mddev);
6102 return rv;
6103 }
6104
6105 static ssize_t
md_attr_store(struct kobject * kobj,struct attribute * attr,const char * page,size_t length)6106 md_attr_store(struct kobject *kobj, struct attribute *attr,
6107 const char *page, size_t length)
6108 {
6109 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
6110 struct mddev *mddev = container_of(kobj, struct mddev, kobj);
6111 ssize_t rv;
6112 struct kernfs_node *kn = NULL;
6113
6114 if (!entry->store)
6115 return -EIO;
6116 if (!capable(CAP_SYS_ADMIN))
6117 return -EACCES;
6118
6119 if (entry->store == array_state_store && cmd_match(page, "clear"))
6120 kn = sysfs_break_active_protection(kobj, attr);
6121
6122 spin_lock(&all_mddevs_lock);
6123 if (!mddev_get(mddev)) {
6124 spin_unlock(&all_mddevs_lock);
6125 if (kn)
6126 sysfs_unbreak_active_protection(kn);
6127 return -EBUSY;
6128 }
6129 spin_unlock(&all_mddevs_lock);
6130 rv = entry->store(mddev, page, length);
6131 mddev_put(mddev);
6132
6133 if (kn)
6134 sysfs_unbreak_active_protection(kn);
6135
6136 return rv;
6137 }
6138
md_kobj_release(struct kobject * ko)6139 static void md_kobj_release(struct kobject *ko)
6140 {
6141 struct mddev *mddev = container_of(ko, struct mddev, kobj);
6142
6143 if (legacy_async_del_gendisk) {
6144 if (mddev->sysfs_state)
6145 sysfs_put(mddev->sysfs_state);
6146 if (mddev->sysfs_level)
6147 sysfs_put(mddev->sysfs_level);
6148 del_gendisk(mddev->gendisk);
6149 }
6150 put_disk(mddev->gendisk);
6151 }
6152
/* sysfs operations for md kobjects: dispatch to md_attr_show/md_attr_store. */
static const struct sysfs_ops md_sysfs_ops = {
	.show = md_attr_show,
	.store = md_attr_store,
};
/*
 * kobject type for an mddev: released through md_kobj_release(), accessed
 * through md_sysfs_ops, and populated with md_attr_groups by default.
 */
static const struct kobj_type md_ktype = {
	.release = md_kobj_release,
	.sysfs_ops = &md_sysfs_ops,
	.default_groups = md_attr_groups,
};
6162
/*
 * Non-static global, initialised to 0.
 * NOTE(review): presumably the major number used for partitionable md
 * devices, assigned elsewhere - confirm against where it is set.
 */
int mdp_major = 0;
6164
6165 /* stack the limit for all rdevs into lim */
mddev_stack_rdev_limits(struct mddev * mddev,struct queue_limits * lim,unsigned int flags)6166 int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim,
6167 unsigned int flags)
6168 {
6169 struct md_rdev *rdev;
6170
6171 rdev_for_each(rdev, mddev) {
6172 queue_limits_stack_bdev(lim, rdev->bdev, rdev->data_offset,
6173 mddev->gendisk->disk_name);
6174 if ((flags & MDDEV_STACK_INTEGRITY) &&
6175 !queue_limits_stack_integrity_bdev(lim, rdev->bdev))
6176 return -EINVAL;
6177 }
6178
6179 /*
6180 * Before RAID adding folio support, the logical_block_size
6181 * should be smaller than the page size.
6182 */
6183 if (lim->logical_block_size > PAGE_SIZE) {
6184 pr_err("%s: logical_block_size must not larger than PAGE_SIZE\n",
6185 mdname(mddev));
6186 return -EINVAL;
6187 }
6188
6189 /* Only 1.x meta needs to set logical block size */
6190 if (mddev->major_version == 0)
6191 return 0;
6192
6193 /*
6194 * Fix forward compatibility issue. Only set LBS by default for
6195 * new arrays, mddev->events == 0 indicates the array was just
6196 * created. When assembling an array, read LBS from the superblock
6197 * instead — LBS is 0 in superblocks created by old kernels.
6198 */
6199 if (!mddev->events) {
6200 pr_info("%s: array will not be assembled in old kernels that lack configurable LBS support (<= 6.18)\n",
6201 mdname(mddev));
6202 mddev->logical_block_size = lim->logical_block_size;
6203 }
6204
6205 if (!mddev->logical_block_size)
6206 pr_warn("%s: echo current LBS to md/logical_block_size to prevent data loss issues from LBS changes.\n"
6207 "\tNote: After setting, array will not be assembled in old kernels (<= 6.18)\n",
6208 mdname(mddev));
6209
6210 return 0;
6211 }
6212 EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits);
6213
6214 /* apply the extra stacking limits from a new rdev into mddev */
mddev_stack_new_rdev(struct mddev * mddev,struct md_rdev * rdev)6215 int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev)
6216 {
6217 struct queue_limits lim;
6218
6219 if (mddev_is_dm(mddev))
6220 return 0;
6221
6222 if (queue_logical_block_size(rdev->bdev->bd_disk->queue) >
6223 queue_logical_block_size(mddev->gendisk->queue)) {
6224 pr_err("%s: incompatible logical_block_size, can not add\n",
6225 mdname(mddev));
6226 return -EINVAL;
6227 }
6228
6229 lim = queue_limits_start_update(mddev->gendisk->queue);
6230 queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset,
6231 mddev->gendisk->disk_name);
6232
6233 if (!queue_limits_stack_integrity_bdev(&lim, rdev->bdev)) {
6234 pr_err("%s: incompatible integrity profile for %pg\n",
6235 mdname(mddev), rdev->bdev);
6236 queue_limits_cancel_update(mddev->gendisk->queue);
6237 return -ENXIO;
6238 }
6239
6240 return queue_limits_commit_update(mddev->gendisk->queue, &lim);
6241 }
6242 EXPORT_SYMBOL_GPL(mddev_stack_new_rdev);
6243
6244 /* update the optimal I/O size after a reshape */
mddev_update_io_opt(struct mddev * mddev,unsigned int nr_stripes)6245 void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes)
6246 {
6247 struct queue_limits lim;
6248
6249 if (mddev_is_dm(mddev))
6250 return;
6251
6252 /* don't bother updating io_opt if we can't suspend the array */
6253 if (mddev_suspend(mddev, false) < 0)
6254 return;
6255 lim = queue_limits_start_update(mddev->gendisk->queue);
6256 lim.io_opt = lim.io_min * nr_stripes;
6257 queue_limits_commit_update(mddev->gendisk->queue, &lim);
6258 mddev_resume(mddev);
6259 }
6260 EXPORT_SYMBOL_GPL(mddev_update_io_opt);
6261
mddev_delayed_delete(struct work_struct * ws)6262 static void mddev_delayed_delete(struct work_struct *ws)
6263 {
6264 struct mddev *mddev = container_of(ws, struct mddev, del_work);
6265
6266 kobject_put(&mddev->kobj);
6267 }
6268
md_init_stacking_limits(struct queue_limits * lim)6269 void md_init_stacking_limits(struct queue_limits *lim)
6270 {
6271 blk_set_stacking_limits(lim);
6272 lim->features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA |
6273 BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT;
6274 }
6275 EXPORT_SYMBOL_GPL(md_init_stacking_limits);
6276
/*
 * md_alloc() - allocate an mddev together with its gendisk and "md" kobject.
 * @dev:  device number to use, or 0 to pick an arbitrary minor
 * @name: disk name to use, or NULL to derive one from the unit number
 *
 * Creation is serialised by a function-local mutex.  On success the new
 * mddev is returned; callers in this file release it with mddev_put().
 * On failure an ERR_PTR() is returned: the error from mddev_alloc(),
 * blk_alloc_disk(), add_disk() or kobject_add(), or -EEXIST when @name is
 * already used by another array.
 */
struct mddev *md_alloc(dev_t dev, char *name)
{
	/*
	 * If dev is zero, name is the name of a device to allocate with
	 * an arbitrary minor number.  It will be "md_???"
	 * If dev is non-zero it must be a device number with a MAJOR of
	 * MD_MAJOR or mdp_major.  In this case, if "name" is NULL, then
	 * the device is being created by opening a node in /dev.
	 * If "name" is not NULL, the device is being created by
	 * writing to /sys/module/md_mod/parameters/new_array.
	 */
	static DEFINE_MUTEX(disks_mutex);
	struct mddev *mddev;
	struct gendisk *disk;
	int partitioned;
	int shift;
	int unit;
	int error;

	/*
	 * Wait for any previous instance of this device to be completely
	 * removed (mddev_delayed_delete).
	 */
	flush_workqueue(md_misc_wq);

	mutex_lock(&disks_mutex);
	mddev = mddev_alloc(dev);
	if (IS_ERR(mddev)) {
		error = PTR_ERR(mddev);
		goto out_unlock;
	}

	/* any major other than MD_MAJOR means a partitionable array */
	partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
	shift = partitioned ? MdpMinorShift : 0;
	unit = MINOR(mddev->unit) >> shift;

	if (name && !dev) {
		/* Need to ensure that 'name' is not a duplicate. */
		struct mddev *mddev2;

		spin_lock(&all_mddevs_lock);

		list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
			if (mddev2->gendisk &&
			    strcmp(mddev2->gendisk->disk_name, name) == 0) {
				spin_unlock(&all_mddevs_lock);
				error = -EEXIST;
				goto out_free_mddev;
			}
		spin_unlock(&all_mddevs_lock);
	}
	if (name && dev)
		/*
		 * Creating /dev/mdNNN via "newarray", so adjust hold_active.
		 */
		mddev->hold_active = UNTIL_STOP;

	disk = blk_alloc_disk(NULL, NUMA_NO_NODE);
	if (IS_ERR(disk)) {
		error = PTR_ERR(disk);
		goto out_free_mddev;
	}

	disk->major = MAJOR(mddev->unit);
	disk->first_minor = unit << shift;
	disk->minors = 1 << shift;
	/*
	 * Bounded copies: disk_name is a fixed-size array and this function
	 * is callable from outside this file, so never rely on the caller
	 * having validated the length of @name.
	 */
	if (name)
		strscpy(disk->disk_name, name, sizeof(disk->disk_name));
	else if (partitioned)
		snprintf(disk->disk_name, sizeof(disk->disk_name),
			 "md_d%d", unit);
	else
		snprintf(disk->disk_name, sizeof(disk->disk_name),
			 "md%d", unit);
	disk->fops = &md_fops;
	disk->private_data = mddev;

	disk->events |= DISK_EVENT_MEDIA_CHANGE;
	mddev->gendisk = disk;
	error = add_disk(disk);
	if (error)
		goto out_put_disk;

	kobject_init(&mddev->kobj, &md_ktype);
	error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md");
	if (error) {
		/*
		 * The disk is already live at this point.  Clear the hold flag
		 * and let mddev_put take care of the deletion, as it isn't any
		 * different from a normal close on last release now.
		 */
		mddev->hold_active = 0;
		mutex_unlock(&disks_mutex);
		mddev_put(mddev);
		return ERR_PTR(error);
	}

	kobject_uevent(&mddev->kobj, KOBJ_ADD);
	/* cache the dirents that are notified on state/level changes */
	mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
	mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level");
	mutex_unlock(&disks_mutex);
	return mddev;

out_put_disk:
	put_disk(disk);
out_free_mddev:
	mddev_free(mddev);
out_unlock:
	mutex_unlock(&disks_mutex);
	return ERR_PTR(error);
}
6386
/*
 * Create an array via md_alloc() and immediately drop the returned
 * reference.  Warns whenever legacy_async_del_gendisk mode is active,
 * regardless of whether the allocation succeeded.
 *
 * Returns 0 on success or the negative errno carried by md_alloc().
 */
static int md_alloc_and_put(dev_t dev, char *name)
{
	struct mddev *new_md = md_alloc(dev, name);
	int ret = 0;

	if (legacy_async_del_gendisk)
		pr_warn("md: async del_gendisk mode will be removed in future, please upgrade to mdadm-4.5+\n");

	if (IS_ERR(new_md))
		ret = PTR_ERR(new_md);
	else
		mddev_put(new_md);

	return ret;
}
6399
/*
 * Probe callback: create the array for @dev on demand when create_on_open
 * is set.  MD_MAJOR minors of 512 and above are never created this way.
 * Any error from the allocation is ignored.
 */
static void md_probe(dev_t dev)
{
	const bool out_of_range = MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512;

	if (create_on_open && !out_of_range)
		md_alloc_and_put(dev, NULL);
}
6407
add_named_array(const char * val,const struct kernel_param * kp)6408 static int add_named_array(const char *val, const struct kernel_param *kp)
6409 {
6410 /*
6411 * val must be "md_*" or "mdNNN".
6412 * For "md_*" we allocate an array with a large free minor number, and
6413 * set the name to val. val must not already be an active name.
6414 * For "mdNNN" we allocate an array with the minor number NNN
6415 * which must not already be in use.
6416 */
6417 int len = strlen(val);
6418 char buf[DISK_NAME_LEN];
6419 unsigned long devnum;
6420
6421 while (len && val[len-1] == '\n')
6422 len--;
6423 if (len >= DISK_NAME_LEN)
6424 return -E2BIG;
6425 strscpy(buf, val, len+1);
6426 if (strncmp(buf, "md_", 3) == 0)
6427 return md_alloc_and_put(0, buf);
6428 if (strncmp(buf, "md", 2) == 0 &&
6429 isdigit(buf[2]) &&
6430 kstrtoul(buf+2, 10, &devnum) == 0 &&
6431 devnum <= MINORMASK)
6432 return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL);
6433
6434 return -EINVAL;
6435 }
6436
md_safemode_timeout(struct timer_list * t)6437 static void md_safemode_timeout(struct timer_list *t)
6438 {
6439 struct mddev *mddev = timer_container_of(mddev, t, safemode_timer);
6440
6441 mddev->safemode = 1;
6442 if (mddev->external)
6443 sysfs_notify_dirent_safe(mddev->sysfs_state);
6444
6445 md_wakeup_thread(mddev->thread);
6446 }
6447
/* Copied into mddev->ok_start_degraded each time md_run() starts an array. */
static int start_dirty_degraded;
6449
md_bitmap_create(struct mddev * mddev)6450 static int md_bitmap_create(struct mddev *mddev)
6451 {
6452 if (mddev->bitmap_id == ID_BITMAP_NONE)
6453 return -EINVAL;
6454
6455 if (!mddev_set_bitmap_ops(mddev))
6456 return -ENOENT;
6457
6458 return mddev->bitmap_ops->create(mddev);
6459 }
6460
md_bitmap_destroy(struct mddev * mddev)6461 static void md_bitmap_destroy(struct mddev *mddev)
6462 {
6463 if (!md_bitmap_registered(mddev))
6464 return;
6465
6466 mddev->bitmap_ops->destroy(mddev);
6467 mddev_clear_bitmap_ops(mddev);
6468 }
6469
md_run(struct mddev * mddev)6470 int md_run(struct mddev *mddev)
6471 {
6472 int err;
6473 struct md_rdev *rdev;
6474 struct md_personality *pers;
6475 bool nowait = true;
6476
6477 if (list_empty(&mddev->disks))
6478 /* cannot run an array with no devices.. */
6479 return -EINVAL;
6480
6481 if (mddev->pers)
6482 return -EBUSY;
6483 /* Cannot run until previous stop completes properly */
6484 if (mddev->sysfs_active)
6485 return -EBUSY;
6486
6487 /*
6488 * Analyze all RAID superblock(s)
6489 */
6490 if (!mddev->raid_disks) {
6491 if (!mddev->persistent)
6492 return -EINVAL;
6493 err = analyze_sbs(mddev);
6494 if (err)
6495 return -EINVAL;
6496 }
6497
6498 if (mddev->level != LEVEL_NONE)
6499 request_module("md-level-%d", mddev->level);
6500 else if (mddev->clevel[0])
6501 request_module("md-%s", mddev->clevel);
6502
6503 /*
6504 * Drop all container device buffers, from now on
6505 * the only valid external interface is through the md
6506 * device.
6507 */
6508 clear_bit(MD_HAS_SUPERBLOCK, &mddev->flags);
6509 rdev_for_each(rdev, mddev) {
6510 if (test_bit(Faulty, &rdev->flags))
6511 continue;
6512 sync_blockdev(rdev->bdev);
6513 invalidate_bdev(rdev->bdev);
6514 if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) {
6515 mddev->ro = MD_RDONLY;
6516 if (!mddev_is_dm(mddev))
6517 set_disk_ro(mddev->gendisk, 1);
6518 }
6519
6520 if (rdev->sb_page)
6521 set_bit(MD_HAS_SUPERBLOCK, &mddev->flags);
6522
6523 /* perform some consistency tests on the device.
6524 * We don't want the data to overlap the metadata,
6525 * Internal Bitmap issues have been handled elsewhere.
6526 */
6527 if (rdev->meta_bdev) {
6528 /* Nothing to check */;
6529 } else if (rdev->data_offset < rdev->sb_start) {
6530 if (mddev->dev_sectors &&
6531 rdev->data_offset + mddev->dev_sectors
6532 > rdev->sb_start) {
6533 pr_warn("md: %s: data overlaps metadata\n",
6534 mdname(mddev));
6535 return -EINVAL;
6536 }
6537 } else {
6538 if (rdev->sb_start + rdev->sb_size/512
6539 > rdev->data_offset) {
6540 pr_warn("md: %s: metadata overlaps data\n",
6541 mdname(mddev));
6542 return -EINVAL;
6543 }
6544 }
6545 sysfs_notify_dirent_safe(rdev->sysfs_state);
6546 nowait = nowait && bdev_nowait(rdev->bdev);
6547 }
6548
6549 pers = get_pers(mddev->level, mddev->clevel);
6550 if (!pers)
6551 return -EINVAL;
6552 if (mddev->level != pers->head.id) {
6553 mddev->level = pers->head.id;
6554 mddev->new_level = pers->head.id;
6555 }
6556 strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel));
6557
6558 if (mddev->reshape_position != MaxSector &&
6559 pers->start_reshape == NULL) {
6560 /* This personality cannot handle reshaping... */
6561 put_pers(pers);
6562 return -EINVAL;
6563 }
6564
6565 if (pers->sync_request) {
6566 /* Warn if this is a potentially silly
6567 * configuration.
6568 */
6569 struct md_rdev *rdev2;
6570 int warned = 0;
6571
6572 rdev_for_each(rdev, mddev)
6573 rdev_for_each(rdev2, mddev) {
6574 if (rdev < rdev2 &&
6575 rdev->bdev->bd_disk ==
6576 rdev2->bdev->bd_disk) {
6577 pr_warn("%s: WARNING: %pg appears to be on the same physical disk as %pg.\n",
6578 mdname(mddev),
6579 rdev->bdev,
6580 rdev2->bdev);
6581 warned = 1;
6582 }
6583 }
6584
6585 if (warned)
6586 pr_warn("True protection against single-disk failure might be compromised.\n");
6587 }
6588
6589 /* dm-raid expect sync_thread to be frozen until resume */
6590 if (mddev->gendisk)
6591 mddev->recovery = 0;
6592
6593 /* may be over-ridden by personality */
6594 mddev->resync_max_sectors = mddev->dev_sectors;
6595
6596 mddev->ok_start_degraded = start_dirty_degraded;
6597
6598 if (start_readonly && md_is_rdwr(mddev))
6599 mddev->ro = MD_AUTO_READ; /* read-only, but switch on first write */
6600
6601 err = pers->run(mddev);
6602 if (err)
6603 pr_warn("md: pers->run() failed ...\n");
6604 else if (pers->size(mddev, 0, 0) < mddev->array_sectors) {
6605 WARN_ONCE(!mddev->external_size,
6606 "%s: default size too small, but 'external_size' not in effect?\n",
6607 __func__);
6608 pr_warn("md: invalid array_size %llu > default size %llu\n",
6609 (unsigned long long)mddev->array_sectors / 2,
6610 (unsigned long long)pers->size(mddev, 0, 0) / 2);
6611 err = -EINVAL;
6612 }
6613 if (err == 0 && pers->sync_request &&
6614 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
6615 err = md_bitmap_create(mddev);
6616 if (err)
6617 pr_warn("%s: failed to create bitmap (%d)\n",
6618 mdname(mddev), err);
6619 }
6620 if (err)
6621 goto bitmap_abort;
6622
6623 if (mddev->bitmap_info.max_write_behind > 0) {
6624 bool create_pool = false;
6625
6626 rdev_for_each(rdev, mddev) {
6627 if (test_bit(WriteMostly, &rdev->flags) &&
6628 rdev_init_serial(rdev))
6629 create_pool = true;
6630 }
6631 if (create_pool && mddev->serial_info_pool == NULL) {
6632 mddev->serial_info_pool =
6633 mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
6634 sizeof(struct serial_info));
6635 if (!mddev->serial_info_pool) {
6636 err = -ENOMEM;
6637 goto bitmap_abort;
6638 }
6639 }
6640 }
6641
6642 if (pers->sync_request) {
6643 if (mddev->kobj.sd &&
6644 sysfs_create_group(&mddev->kobj, &md_redundancy_group))
6645 pr_warn("md: cannot register extra attributes for %s\n",
6646 mdname(mddev));
6647 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
6648 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
6649 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
6650 } else if (mddev->ro == MD_AUTO_READ)
6651 mddev->ro = MD_RDWR;
6652
6653 atomic_set(&mddev->max_corr_read_errors,
6654 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
6655 mddev->safemode = 0;
6656 if (mddev_is_clustered(mddev))
6657 mddev->safemode_delay = 0;
6658 else
6659 mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
6660 mddev->in_sync = 1;
6661 smp_wmb();
6662 spin_lock(&mddev->lock);
6663 mddev->pers = pers;
6664 spin_unlock(&mddev->lock);
6665 rdev_for_each(rdev, mddev)
6666 if (rdev->raid_disk >= 0)
6667 sysfs_link_rdev(mddev, rdev); /* failure here is OK */
6668
6669 if (mddev->degraded && md_is_rdwr(mddev))
6670 /* This ensures that recovering status is reported immediately
6671 * via sysfs - until a lack of spares is confirmed.
6672 */
6673 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6674 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6675
6676 if (mddev->sb_flags)
6677 md_update_sb(mddev, 0);
6678
6679 md_new_event();
6680 return 0;
6681
6682 bitmap_abort:
6683 mddev_detach(mddev);
6684 if (mddev->private)
6685 pers->free(mddev, mddev->private);
6686 mddev->private = NULL;
6687 put_pers(pers);
6688 md_bitmap_destroy(mddev);
6689 return err;
6690 }
6691 EXPORT_SYMBOL_GPL(md_run);
6692
do_md_run(struct mddev * mddev)6693 int do_md_run(struct mddev *mddev)
6694 {
6695 int err;
6696
6697 set_bit(MD_NOT_READY, &mddev->flags);
6698 err = md_run(mddev);
6699 if (err)
6700 goto out;
6701
6702 if (md_bitmap_registered(mddev)) {
6703 err = mddev->bitmap_ops->load(mddev);
6704 if (err) {
6705 md_bitmap_destroy(mddev);
6706 goto out;
6707 }
6708 }
6709
6710 if (mddev_is_clustered(mddev))
6711 md_allow_write(mddev);
6712
6713 /* run start up tasks that require md_thread */
6714 md_start(mddev);
6715
6716 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
6717
6718 set_capacity_and_notify(mddev->gendisk, mddev->array_sectors);
6719 clear_bit(MD_NOT_READY, &mddev->flags);
6720 mddev->changed = 1;
6721 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
6722 sysfs_notify_dirent_safe(mddev->sysfs_state);
6723 sysfs_notify_dirent_safe(mddev->sysfs_action);
6724 sysfs_notify_dirent_safe(mddev->sysfs_degraded);
6725 out:
6726 clear_bit(MD_NOT_READY, &mddev->flags);
6727 return err;
6728 }
6729
md_start(struct mddev * mddev)6730 int md_start(struct mddev *mddev)
6731 {
6732 int ret = 0;
6733
6734 if (mddev->pers->start) {
6735 set_bit(MD_RECOVERY_WAIT, &mddev->recovery);
6736 ret = mddev->pers->start(mddev);
6737 clear_bit(MD_RECOVERY_WAIT, &mddev->recovery);
6738 md_wakeup_thread(mddev->sync_thread);
6739 }
6740 return ret;
6741 }
6742 EXPORT_SYMBOL_GPL(md_start);
6743
restart_array(struct mddev * mddev)6744 static int restart_array(struct mddev *mddev)
6745 {
6746 struct gendisk *disk = mddev->gendisk;
6747 struct md_rdev *rdev;
6748 bool has_journal = false;
6749 bool has_readonly = false;
6750
6751 /* Complain if it has no devices */
6752 if (list_empty(&mddev->disks))
6753 return -ENXIO;
6754 if (!mddev->pers)
6755 return -EINVAL;
6756 if (md_is_rdwr(mddev))
6757 return -EBUSY;
6758
6759 rcu_read_lock();
6760 rdev_for_each_rcu(rdev, mddev) {
6761 if (test_bit(Journal, &rdev->flags) &&
6762 !test_bit(Faulty, &rdev->flags))
6763 has_journal = true;
6764 if (rdev_read_only(rdev))
6765 has_readonly = true;
6766 }
6767 rcu_read_unlock();
6768 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal)
6769 /* Don't restart rw with journal missing/faulty */
6770 return -EINVAL;
6771 if (has_readonly)
6772 return -EROFS;
6773
6774 mddev->safemode = 0;
6775 mddev->ro = MD_RDWR;
6776 set_disk_ro(disk, 0);
6777 pr_debug("md: %s switched to read-write mode.\n", mdname(mddev));
6778 /* Kick recovery or resync if necessary */
6779 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6780 md_wakeup_thread(mddev->sync_thread);
6781 sysfs_notify_dirent_safe(mddev->sysfs_state);
6782 return 0;
6783 }
6784
md_clean(struct mddev * mddev)6785 static void md_clean(struct mddev *mddev)
6786 {
6787 mddev->array_sectors = 0;
6788 mddev->external_size = 0;
6789 mddev->dev_sectors = 0;
6790 mddev->raid_disks = 0;
6791 mddev->resync_offset = 0;
6792 mddev->resync_min = 0;
6793 mddev->resync_max = MaxSector;
6794 mddev->reshape_position = MaxSector;
6795 /* we still need mddev->external in export_rdev, do not clear it yet */
6796 mddev->persistent = 0;
6797 mddev->level = LEVEL_NONE;
6798 mddev->clevel[0] = 0;
6799
6800 /*
6801 * For legacy_async_del_gendisk mode, it can stop the array in the
6802 * middle of assembling it, then it still can access the array. So
6803 * it needs to clear MD_CLOSING. If not legacy_async_del_gendisk,
6804 * it can't open the array again after stopping it. So it doesn't
6805 * clear MD_CLOSING.
6806 */
6807 if (legacy_async_del_gendisk && mddev->hold_active) {
6808 clear_bit(MD_CLOSING, &mddev->flags);
6809 } else {
6810 /* if UNTIL_STOP is set, it's cleared here */
6811 mddev->hold_active = 0;
6812 /* Don't clear MD_CLOSING, or mddev can be opened again. */
6813 mddev->flags &= BIT_ULL_MASK(MD_CLOSING);
6814 }
6815 mddev->sb_flags = 0;
6816 mddev->ro = MD_RDWR;
6817 mddev->metadata_type[0] = 0;
6818 mddev->chunk_sectors = 0;
6819 mddev->ctime = mddev->utime = 0;
6820 mddev->layout = 0;
6821 mddev->logical_block_size = 0;
6822 mddev->max_disks = 0;
6823 mddev->events = 0;
6824 mddev->can_decrease_events = 0;
6825 mddev->delta_disks = 0;
6826 mddev->reshape_backwards = 0;
6827 mddev->new_level = LEVEL_NONE;
6828 mddev->new_layout = 0;
6829 mddev->new_chunk_sectors = 0;
6830 mddev->curr_resync = MD_RESYNC_NONE;
6831 atomic64_set(&mddev->resync_mismatches, 0);
6832 mddev->suspend_lo = mddev->suspend_hi = 0;
6833 mddev->sync_speed_min = mddev->sync_speed_max = 0;
6834 mddev->recovery = 0;
6835 mddev->in_sync = 0;
6836 mddev->changed = 0;
6837 mddev->degraded = 0;
6838 mddev->safemode = 0;
6839 mddev->private = NULL;
6840 mddev->cluster_info = NULL;
6841 mddev->bitmap_info.offset = 0;
6842 mddev->bitmap_info.default_offset = 0;
6843 mddev->bitmap_info.default_space = 0;
6844 mddev->bitmap_info.chunksize = 0;
6845 mddev->bitmap_info.daemon_sleep = 0;
6846 mddev->bitmap_info.max_write_behind = 0;
6847 mddev->bitmap_info.nodes = 0;
6848 }
6849
/*
 * Stop write activity on @mddev: cancel the safemode timer, quiesce the
 * personality once, flush the bitmap, write out a clean superblock if
 * needed and release the serialisation pool.
 */
static void __md_stop_writes(struct mddev *mddev)
{
	timer_delete_sync(&mddev->safemode_timer);

	/* skipped only for a dm-managed array that is not read-write */
	if (md_is_rdwr(mddev) || !mddev_is_dm(mddev)) {
		/* quiesce and immediately release again */
		if (mddev->pers && mddev->pers->quiesce) {
			mddev->pers->quiesce(mddev, 1);
			mddev->pers->quiesce(mddev, 0);
		}

		if (md_bitmap_enabled(mddev, true))
			mddev->bitmap_ops->flush(mddev);
	}

	if (md_is_rdwr(mddev) &&
	    ((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
	     mddev->sb_flags)) {
		/* mark array as shutdown cleanly */
		if (!mddev_is_clustered(mddev))
			mddev->in_sync = 1;
		md_update_sb(mddev, 1);
	}
	/* disable policy to guarantee rdevs free resources for serialization */
	clear_bit(MD_SERIALIZE_POLICY, &mddev->flags);
	mddev_destroy_serial_pool(mddev, NULL);
}
6876
/*
 * Exported wrapper around __md_stop_writes(): takes the mddev lock, sets
 * MD_RECOVERY_FROZEN, stops the sync thread, stops writes and drops the
 * lock again.  MD_RECOVERY_FROZEN is left set on return.
 */
void md_stop_writes(struct mddev *mddev)
{
	mddev_lock_nointr(mddev);
	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
	stop_sync_thread(mddev, true);
	__md_stop_writes(mddev);
	mddev_unlock(mddev);
}
EXPORT_SYMBOL_GPL(md_stop_writes);
6886
/*
 * Detach run-time helpers from @mddev: wait for write-behind bitmap I/O,
 * quiesce the personality once (unless the array is suspended), unregister
 * the array's thread and, for non-dm arrays, sync the request queue.
 */
static void mddev_detach(struct mddev *mddev)
{
	if (md_bitmap_enabled(mddev, false))
		mddev->bitmap_ops->wait_behind_writes(mddev);
	/* quiesce and immediately release again */
	if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) {
		mddev->pers->quiesce(mddev, 1);
		mddev->pers->quiesce(mddev, 0);
	}
	md_unregister_thread(mddev, &mddev->thread);

	/* the unplug fn references 'conf' */
	if (!mddev_is_dm(mddev))
		blk_sync_queue(mddev->gendisk->queue);
}
6901
__md_stop(struct mddev * mddev)6902 static void __md_stop(struct mddev *mddev)
6903 {
6904 struct md_personality *pers = mddev->pers;
6905
6906 md_bitmap_destroy(mddev);
6907 mddev_detach(mddev);
6908 spin_lock(&mddev->lock);
6909 mddev->pers = NULL;
6910 spin_unlock(&mddev->lock);
6911 if (mddev->private)
6912 pers->free(mddev, mddev->private);
6913 mddev->private = NULL;
6914 put_pers(pers);
6915 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6916 }
6917
/*
 * Stop the array and free any attached data structures.
 * The caller must hold mddev->reconfig_mutex (asserted via lockdep).
 */
void md_stop(struct mddev *mddev)
{
	lockdep_assert_held(&mddev->reconfig_mutex);

	/* stop the array and free any attached data structures.
	 * This is called from dm-raid
	 */
	__md_stop_writes(mddev);
	__md_stop(mddev);
}

EXPORT_SYMBOL_GPL(md_stop);
6930
6931 /* ensure 'mddev->pers' exist before calling md_set_readonly() */
md_set_readonly(struct mddev * mddev)6932 static int md_set_readonly(struct mddev *mddev)
6933 {
6934 int err = 0;
6935 int did_freeze = 0;
6936
6937 if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
6938 return -EBUSY;
6939
6940 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
6941 did_freeze = 1;
6942 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6943 }
6944
6945 stop_sync_thread(mddev, false);
6946 wait_event(mddev->sb_wait,
6947 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
6948 mddev_lock_nointr(mddev);
6949
6950 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
6951 pr_warn("md: %s still in use.\n",mdname(mddev));
6952 err = -EBUSY;
6953 goto out;
6954 }
6955
6956 __md_stop_writes(mddev);
6957
6958 if (mddev->ro == MD_RDONLY) {
6959 err = -ENXIO;
6960 goto out;
6961 }
6962
6963 mddev->ro = MD_RDONLY;
6964 set_disk_ro(mddev->gendisk, 1);
6965
6966 out:
6967 if (!err || did_freeze) {
6968 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6969 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6970 sysfs_notify_dirent_safe(mddev->sysfs_state);
6971 }
6972
6973 return err;
6974 }
6975
6976 /* mode:
6977 * 0 - completely stop and dis-assemble array
6978 * 2 - stop but do not disassemble array
6979 */
do_md_stop(struct mddev * mddev,int mode)6980 static int do_md_stop(struct mddev *mddev, int mode)
6981 {
6982 struct gendisk *disk = mddev->gendisk;
6983 struct md_rdev *rdev;
6984 int did_freeze = 0;
6985
6986 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
6987 did_freeze = 1;
6988 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6989 }
6990
6991 stop_sync_thread(mddev, true);
6992
6993 if (mddev->sysfs_active ||
6994 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
6995 pr_warn("md: %s still in use.\n",mdname(mddev));
6996 if (did_freeze) {
6997 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6998 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6999 }
7000 return -EBUSY;
7001 }
7002 if (mddev->pers) {
7003 if (!md_is_rdwr(mddev))
7004 set_disk_ro(disk, 0);
7005
7006 if (mode == 2 && mddev->pers->sync_request &&
7007 mddev->to_remove == NULL)
7008 mddev->to_remove = &md_redundancy_group;
7009
7010 __md_stop_writes(mddev);
7011 __md_stop(mddev);
7012
7013 /* tell userspace to handle 'inactive' */
7014 sysfs_notify_dirent_safe(mddev->sysfs_state);
7015
7016 rdev_for_each(rdev, mddev)
7017 if (rdev->raid_disk >= 0)
7018 sysfs_unlink_rdev(mddev, rdev);
7019
7020 set_capacity_and_notify(disk, 0);
7021 mddev->changed = 1;
7022
7023 if (!md_is_rdwr(mddev))
7024 mddev->ro = MD_RDWR;
7025 }
7026 /*
7027 * Free resources if final stop
7028 */
7029 if (mode == 0) {
7030 pr_info("md: %s stopped.\n", mdname(mddev));
7031
7032 if (mddev->bitmap_info.file) {
7033 struct file *f = mddev->bitmap_info.file;
7034 spin_lock(&mddev->lock);
7035 mddev->bitmap_info.file = NULL;
7036 spin_unlock(&mddev->lock);
7037 fput(f);
7038 }
7039 mddev->bitmap_info.offset = 0;
7040
7041 export_array(mddev);
7042 md_clean(mddev);
7043 if (!legacy_async_del_gendisk)
7044 set_bit(MD_DELETED, &mddev->flags);
7045 }
7046 md_new_event();
7047 sysfs_notify_dirent_safe(mddev->sysfs_state);
7048 return 0;
7049 }
7050
7051 #ifndef MODULE
autorun_array(struct mddev * mddev)7052 static void autorun_array(struct mddev *mddev)
7053 {
7054 struct md_rdev *rdev;
7055 int err;
7056
7057 if (list_empty(&mddev->disks))
7058 return;
7059
7060 pr_info("md: running: ");
7061
7062 rdev_for_each(rdev, mddev) {
7063 pr_cont("<%pg>", rdev->bdev);
7064 }
7065 pr_cont("\n");
7066
7067 err = do_md_run(mddev);
7068 if (err) {
7069 pr_warn("md: do_md_run() returned %d\n", err);
7070 do_md_stop(mddev, 0);
7071 }
7072 }
7073
7074 /*
7075 * lets try to run arrays based on all disks that have arrived
7076 * until now. (those are in pending_raid_disks)
7077 *
7078 * the method: pick the first pending disk, collect all disks with
7079 * the same UUID, remove all from the pending list and put them into
7080 * the 'same_array' list. Then order this list based on superblock
7081 * update time (freshest comes first), kick out 'old' disks and
7082 * compare superblocks. If everything's fine then run it.
7083 *
7084 * If "unit" is allocated, then bump its reference count
7085 */
autorun_devices(int part)7086 static void autorun_devices(int part)
7087 {
7088 struct md_rdev *rdev0, *rdev, *tmp;
7089 struct mddev *mddev;
7090
7091 pr_info("md: autorun ...\n");
7092 while (!list_empty(&pending_raid_disks)) {
7093 int unit;
7094 dev_t dev;
7095 LIST_HEAD(candidates);
7096 rdev0 = list_entry(pending_raid_disks.next,
7097 struct md_rdev, same_set);
7098
7099 pr_debug("md: considering %pg ...\n", rdev0->bdev);
7100 INIT_LIST_HEAD(&candidates);
7101 rdev_for_each_list(rdev, tmp, &pending_raid_disks)
7102 if (super_90_load(rdev, rdev0, 0) >= 0) {
7103 pr_debug("md: adding %pg ...\n",
7104 rdev->bdev);
7105 list_move(&rdev->same_set, &candidates);
7106 }
7107 /*
7108 * now we have a set of devices, with all of them having
7109 * mostly sane superblocks. It's time to allocate the
7110 * mddev.
7111 */
7112 if (part) {
7113 dev = MKDEV(mdp_major,
7114 rdev0->preferred_minor << MdpMinorShift);
7115 unit = MINOR(dev) >> MdpMinorShift;
7116 } else {
7117 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
7118 unit = MINOR(dev);
7119 }
7120 if (rdev0->preferred_minor != unit) {
7121 pr_warn("md: unit number in %pg is bad: %d\n",
7122 rdev0->bdev, rdev0->preferred_minor);
7123 break;
7124 }
7125
7126 mddev = md_alloc(dev, NULL);
7127 if (IS_ERR(mddev))
7128 break;
7129
7130 if (mddev_suspend_and_lock(mddev))
7131 pr_warn("md: %s locked, cannot run\n", mdname(mddev));
7132 else if (mddev->raid_disks || mddev->major_version
7133 || !list_empty(&mddev->disks)) {
7134 pr_warn("md: %s already running, cannot run %pg\n",
7135 mdname(mddev), rdev0->bdev);
7136 mddev_unlock_and_resume(mddev);
7137 } else {
7138 pr_debug("md: created %s\n", mdname(mddev));
7139 mddev->persistent = 1;
7140 rdev_for_each_list(rdev, tmp, &candidates) {
7141 list_del_init(&rdev->same_set);
7142 if (bind_rdev_to_array(rdev, mddev))
7143 export_rdev(rdev, mddev);
7144 }
7145 autorun_array(mddev);
7146 mddev_unlock_and_resume(mddev);
7147 }
7148 /* on success, candidates will be empty, on error
7149 * it won't...
7150 */
7151 rdev_for_each_list(rdev, tmp, &candidates) {
7152 list_del_init(&rdev->same_set);
7153 export_rdev(rdev, mddev);
7154 }
7155 mddev_put(mddev);
7156 }
7157 pr_info("md: ... autorun DONE.\n");
7158 }
7159 #endif /* !MODULE */
7160
/*
 * Copy the driver version (major, minor, patchlevel) to the user buffer
 * @arg.  Returns 0 on success or -EFAULT if the copy fails.
 */
static int get_version(void __user *arg)
{
	mdu_version_t ver = {
		.major		= MD_MAJOR_VERSION,
		.minor		= MD_MINOR_VERSION,
		.patchlevel	= MD_PATCHLEVEL_VERSION,
	};

	return copy_to_user(arg, &ver, sizeof(ver)) ? -EFAULT : 0;
}
7174
/*
 * Fill an mdu_array_info_t describing @mddev and copy it to the user buffer
 * @arg.  Member devices are counted under RCU.  Returns 0 on success or
 * -EFAULT if the copy to user space fails.
 */
static int get_array_info(struct mddev *mddev, void __user *arg)
{
	int nr = 0, working = 0, insync = 0, failed = 0, spare = 0;
	mdu_array_info_t info;
	struct md_rdev *rdev;

	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev) {
		nr++;
		if (test_bit(Faulty, &rdev->flags)) {
			failed++;
			continue;
		}

		working++;
		if (test_bit(In_sync, &rdev->flags))
			insync++;
		else if (!test_bit(Journal, &rdev->flags))
			spare++;
		/* TODO: add journal count to md_u.h */
	}
	rcu_read_unlock();

	/* versions and timestamps (clamped to what fits in 32 bits) */
	info.major_version = mddev->major_version;
	info.minor_version = mddev->minor_version;
	info.patch_version = MD_PATCHLEVEL_VERSION;
	info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
	info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);

	/* geometry */
	info.level = mddev->level;
	info.size = mddev->dev_sectors / 2;
	if (info.size != mddev->dev_sectors / 2) /* overflow */
		info.size = -1;
	info.nr_disks = nr;
	info.raid_disks = mddev->raid_disks;
	info.md_minor = mddev->md_minor;
	info.not_persistent= !mddev->persistent;
	info.layout = mddev->layout;
	info.chunk_size = mddev->chunk_sectors << 9;

	/* state bits */
	info.state = mddev->in_sync ? (1<<MD_SB_CLEAN) : 0;
	if (mddev->bitmap && mddev->bitmap_info.offset)
		info.state |= (1<<MD_SB_BITMAP_PRESENT);
	if (mddev_is_clustered(mddev))
		info.state |= (1<<MD_SB_CLUSTERED);

	/* device counters gathered above */
	info.active_disks = insync;
	info.working_disks = working;
	info.failed_disks = failed;
	info.spare_disks = spare;

	if (copy_to_user(arg, &info, sizeof(info)))
		return -EFAULT;

	return 0;
}
7234
/*
 * Copy an mdu_bitmap_file_t holding the path of the array's bitmap file
 * to @arg.
 *
 * The structure is zero-allocated, so when mddev->bitmap_info.file is not
 * set the caller receives an all-zero structure.  The path lookup is
 * done while holding mddev->lock.
 *
 * Returns 0 on success, -ENOMEM if the buffer cannot be allocated, the
 * error returned by file_path(), or -EFAULT if the copy to user space
 * fails.
 */
static int get_bitmap_file(struct mddev *mddev, void __user * arg)
{
	mdu_bitmap_file_t *file;
	int err = 0;

	/* too big for stack allocation */
	file = kzalloc_obj(*file, GFP_NOIO);
	if (!file)
		return -ENOMEM;

	spin_lock(&mddev->lock);
	/* bitmap enabled */
	if (mddev->bitmap_info.file) {
		char *path = file_path(mddev->bitmap_info.file,
				       file->pathname,
				       sizeof(file->pathname));

		if (IS_ERR(path))
			err = PTR_ERR(path);
		else
			/*
			 * The returned pointer lies inside pathname[];
			 * shift the string down to the start of the buffer.
			 */
			memmove(file->pathname, path,
				sizeof(file->pathname) - (path - file->pathname));
	}
	spin_unlock(&mddev->lock);

	if (!err && copy_to_user(arg, file, sizeof(*file)))
		err = -EFAULT;

	kfree(file);
	return err;
}
7266
/*
 * Look up the member device whose number is given in the user-supplied
 * mdu_disk_info_t (info.number) and fill in the remaining fields.
 *
 * When no such device exists, major/minor are reported as 0, raid_disk
 * as -1 and the state as MD_DISK_REMOVED.  The lookup and flag tests
 * run under rcu_read_lock().
 *
 * Returns 0 on success, -EFAULT if either user copy fails.
 */
static int get_disk_info(struct mddev *mddev, void __user * arg)
{
	struct md_rdev *rdev;
	mdu_disk_info_t info;

	if (copy_from_user(&info, arg, sizeof(info)))
		return -EFAULT;

	rcu_read_lock();
	rdev = md_find_rdev_nr_rcu(mddev, info.number);
	if (!rdev) {
		info.major = info.minor = 0;
		info.raid_disk = -1;
		info.state = (1<<MD_DISK_REMOVED);
	} else {
		info.major = MAJOR(rdev->bdev->bd_dev);
		info.minor = MINOR(rdev->bdev->bd_dev);
		info.raid_disk = rdev->raid_disk;

		/* Faulty takes precedence over In_sync */
		info.state = 0;
		if (test_bit(Faulty, &rdev->flags)) {
			info.state |= (1<<MD_DISK_FAULTY);
		} else if (test_bit(In_sync, &rdev->flags)) {
			info.state |= (1<<MD_DISK_ACTIVE);
			info.state |= (1<<MD_DISK_SYNC);
		}

		/* the remaining flags are reported independently */
		if (test_bit(Journal, &rdev->flags))
			info.state |= (1<<MD_DISK_JOURNAL);
		if (test_bit(WriteMostly, &rdev->flags))
			info.state |= (1<<MD_DISK_WRITEMOSTLY);
		if (test_bit(FailFast, &rdev->flags))
			info.state |= (1<<MD_DISK_FAILFAST);
	}
	rcu_read_unlock();

	return copy_to_user(arg, &info, sizeof(info)) ? -EFAULT : 0;
}
7306
/*
 * md_add_new_disk() - add the device described by @info to @mddev.
 *
 * @mddev: array to add the device to.
 * @info:  device description; major/minor select the block device,
 *         state/raid_disk/number are interpreted as described below.
 *
 * Three cases are handled, in this order:
 *
 *  1. mddev->raid_disks == 0: the device is imported using the array's
 *     major/minor superblock version.  If the array already has devices,
 *     the new device's superblock is loaded against the first device in
 *     mddev->disks and rejected with -EINVAL on failure.  The device is
 *     then bound to the array.
 *
 *  2. mddev->pers != NULL: the personality must provide hot_add_disk.
 *     The device is imported, its flags are set from info->state, and it
 *     is bound to the array and handed to add_bound_rdev() (with extra
 *     cluster callbacks when the array is clustered).
 *
 *  3. Otherwise: only allowed when mddev->major_version == 0.  Unless
 *     MD_DISK_FAULTY is set in info->state (in which case nothing is
 *     done), the device is imported and bound using the slot and flags
 *     given in @info.
 *
 * Returns 0 on success or a negative errno.  On every failure path after
 * a successful import the rdev is released with export_rdev() (or kicked
 * from the array if it had already been bound).
 */
int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
{
	struct md_rdev *rdev;
	dev_t dev = MKDEV(info->major,info->minor);

	/* clustered arrays only accept CLUSTER_ADD or CANDIDATE requests */
	if (mddev_is_clustered(mddev) &&
		!(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) {
		pr_warn("%s: Cannot add to clustered mddev.\n",
			mdname(mddev));
		return -EINVAL;
	}

	/* reject major/minor values that do not survive the dev_t round trip */
	if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
		return -EOVERFLOW;

	if (!mddev->raid_disks) {
		int err;
		/* expecting a device which has a superblock */
		rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
		if (IS_ERR(rdev)) {
			pr_warn("md: md_import_device returned %ld\n",
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		if (!list_empty(&mddev->disks)) {
			/* compare against the first device already in the array */
			struct md_rdev *rdev0
				= list_entry(mddev->disks.next,
					     struct md_rdev, same_set);
			err = super_types[mddev->major_version]
				.load_super(rdev, rdev0, mddev->minor_version);
			if (err < 0) {
				pr_warn("md: %pg has different UUID to %pg\n",
					rdev->bdev,
					rdev0->bdev);
				export_rdev(rdev, mddev);
				return -EINVAL;
			}
		}
		err = bind_rdev_to_array(rdev, mddev);
		if (err)
			export_rdev(rdev, mddev);
		return err;
	}

	/*
	 * md_add_new_disk can be used once the array is assembled
	 * to add "hot spares".  They must already have a superblock
	 * written
	 */
	if (mddev->pers) {
		int err;
		if (!mddev->pers->hot_add_disk) {
			pr_warn("%s: personality does not support diskops!\n",
				mdname(mddev));
			return -EINVAL;
		}
		if (mddev->persistent)
			rdev = md_import_device(dev, mddev->major_version,
						mddev->minor_version);
		else
			rdev = md_import_device(dev, -1, -1);
		if (IS_ERR(rdev)) {
			pr_warn("md: md_import_device returned %ld\n",
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		/* set saved_raid_disk if appropriate */
		if (!mddev->persistent) {
			/* no superblock to consult: trust the caller's slot */
			if (info->state & (1<<MD_DISK_SYNC)  &&
			    info->raid_disk < mddev->raid_disks) {
				rdev->raid_disk = info->raid_disk;
				clear_bit(Bitmap_sync, &rdev->flags);
			} else
				rdev->raid_disk = -1;
			rdev->saved_raid_disk = rdev->raid_disk;
		} else
			super_types[mddev->major_version].
				validate_super(mddev, NULL/*freshest*/, rdev);
		if ((info->state & (1<<MD_DISK_SYNC)) &&
		     rdev->raid_disk != info->raid_disk) {
			/* This was a hot-add request, but events doesn't
			 * match, so reject it.
			 */
			export_rdev(rdev, mddev);
			return -EINVAL;
		}

		clear_bit(In_sync, &rdev->flags); /* just to be sure */
		/* WriteMostly and FailFast mirror the request exactly */
		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
		else
			clear_bit(WriteMostly, &rdev->flags);
		if (info->state & (1<<MD_DISK_FAILFAST))
			set_bit(FailFast, &rdev->flags);
		else
			clear_bit(FailFast, &rdev->flags);

		if (info->state & (1<<MD_DISK_JOURNAL)) {
			struct md_rdev *rdev2;
			bool has_journal = false;

			/* make sure no existing journal disk */
			rdev_for_each(rdev2, mddev) {
				if (test_bit(Journal, &rdev2->flags)) {
					has_journal = true;
					break;
				}
			}
			/* a journal is also refused while a bitmap is present */
			if (has_journal || mddev->bitmap) {
				export_rdev(rdev, mddev);
				return -EBUSY;
			}
			set_bit(Journal, &rdev->flags);
		}
		/*
		 * check whether the device shows up in other nodes
		 */
		if (mddev_is_clustered(mddev)) {
			if (info->state & (1 << MD_DISK_CANDIDATE))
				set_bit(Candidate, &rdev->flags);
			else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
				/* --add initiated by this node */
				err = mddev->cluster_ops->add_new_disk(mddev, rdev);
				if (err) {
					export_rdev(rdev, mddev);
					return err;
				}
			}
		}

		/* the device is bound with no slot assigned */
		rdev->raid_disk = -1;
		err = bind_rdev_to_array(rdev, mddev);

		if (err)
			export_rdev(rdev, mddev);

		/*
		 * NOTE: rdev has been exported when err != 0; the code below
		 * only dereferences rdev on the !err paths.
		 */
		if (mddev_is_clustered(mddev)) {
			if (info->state & (1 << MD_DISK_CANDIDATE)) {
				if (!err) {
					/* err is 0 here, so "err == 0" passes true */
					err = mddev->cluster_ops->new_disk_ack(
							mddev, err == 0);
					if (err)
						md_kick_rdev_from_array(rdev);
				}
			} else {
				if (err)
					mddev->cluster_ops->add_new_disk_cancel(mddev);
				else
					err = add_bound_rdev(rdev);
			}

		} else if (!err)
			err = add_bound_rdev(rdev);

		return err;
	}

	/* otherwise, md_add_new_disk is only allowed
	 * for major_version==0 superblocks
	 */
	if (mddev->major_version != 0) {
		pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev));
		return -EINVAL;
	}

	/* a request with MD_DISK_FAULTY set is accepted but adds nothing */
	if (!(info->state & (1<<MD_DISK_FAULTY))) {
		int err;
		rdev = md_import_device(dev, -1, 0);
		if (IS_ERR(rdev)) {
			pr_warn("md: error, md_import_device() returned %ld\n",
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		rdev->desc_nr = info->number;
		/* out-of-range slots are treated as "no slot" */
		if (info->raid_disk < mddev->raid_disks)
			rdev->raid_disk = info->raid_disk;
		else
			rdev->raid_disk = -1;

		if (rdev->raid_disk < mddev->raid_disks)
			if (info->state & (1<<MD_DISK_SYNC))
				set_bit(In_sync, &rdev->flags);

		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
		if (info->state & (1<<MD_DISK_FAILFAST))
			set_bit(FailFast, &rdev->flags);

		if (!mddev->persistent) {
			pr_debug("md: nonpersistent superblock ...\n");
			rdev->sb_start = bdev_nr_sectors(rdev->bdev);
		} else
			rdev->sb_start = calc_dev_sboffset(rdev);
		/* usable size is everything up to sb_start */
		rdev->sectors = rdev->sb_start;

		err = bind_rdev_to_array(rdev, mddev);
		if (err) {
			export_rdev(rdev, mddev);
			return err;
		}
	}

	return 0;
}
7511
/*
 * Remove the member device @dev from the running array @mddev.
 *
 * A device that still occupies a slot (raid_disk >= 0) first has its
 * Blocked flag cleared and is passed to remove_and_add_spares(); if it
 * still occupies a slot afterwards the removal is refused.  On a
 * clustered array the cluster's remove_disk() callback must also
 * succeed.
 *
 * Returns 0 on success, -ENODEV if the array has no personality,
 * -ENXIO if @dev is not a member, or -EBUSY if the device cannot be
 * removed.
 */
static int hot_remove_disk(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;

	if (!mddev->pers)
		return -ENODEV;

	rdev = find_rdev(mddev, dev);
	if (!rdev)
		return -ENXIO;

	if (rdev->raid_disk >= 0) {
		/* try to get the device out of its slot first */
		clear_bit(Blocked, &rdev->flags);
		remove_and_add_spares(mddev, rdev);

		if (rdev->raid_disk >= 0)
			goto busy;
	}

	if (mddev_is_clustered(mddev) &&
	    mddev->cluster_ops->remove_disk(mddev, rdev))
		goto busy;

	md_kick_rdev_from_array(rdev);
	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
	/* without an md thread the superblock is written right here */
	if (!mddev->thread)
		md_update_sb(mddev, 1);
	md_new_event();

	return 0;

busy:
	pr_debug("md: cannot remove active disk %pg from %s ...\n",
		 rdev->bdev, mdname(mddev));
	return -EBUSY;
}
7549
/*
 * Add device @dev to the running array @mddev as a device without an
 * assigned slot (raid_disk == -1) and request recovery.
 *
 * Only arrays with major_version 0 whose personality provides
 * hot_add_disk are accepted.
 *
 * Returns 0 on success, -ENODEV if the array has no personality,
 * -EINVAL if the request is not supported, the import fails or the
 * device is Faulty, or the error from bind_rdev_to_array().
 */
static int hot_add_disk(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;
	int ret;

	if (!mddev->pers)
		return -ENODEV;

	if (mddev->major_version != 0) {
		pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n",
			mdname(mddev));
		return -EINVAL;
	}
	if (!mddev->pers->hot_add_disk) {
		pr_warn("%s: personality does not support diskops!\n",
			mdname(mddev));
		return -EINVAL;
	}

	rdev = md_import_device(dev, -1, 0);
	if (IS_ERR(rdev)) {
		pr_warn("md: error, md_import_device() returned %ld\n",
			PTR_ERR(rdev));
		return -EINVAL;
	}

	/* usable size is everything up to sb_start */
	rdev->sb_start = mddev->persistent ? calc_dev_sboffset(rdev) :
					     bdev_nr_sectors(rdev->bdev);
	rdev->sectors = rdev->sb_start;

	if (test_bit(Faulty, &rdev->flags)) {
		pr_warn("md: can not hot-add faulty %pg disk to %s!\n",
			rdev->bdev, mdname(mddev));
		ret = -EINVAL;
		goto abort_export;
	}

	clear_bit(In_sync, &rdev->flags);
	rdev->desc_nr = -1;
	rdev->saved_raid_disk = -1;

	ret = bind_rdev_to_array(rdev, mddev);
	if (ret)
		goto abort_export;

	/*
	 * The rest should better be atomic, we can have disk failures
	 * noticed in interrupt contexts ...
	 */
	rdev->raid_disk = -1;

	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
	if (!mddev->thread)
		md_update_sb(mddev, 1);

	/*
	 * Kick recovery, maybe this spare has to be added to the
	 * array immediately.
	 */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_new_event();
	return 0;

abort_export:
	export_rdev(rdev, mddev);
	return ret;
}
7619
set_bitmap_file(struct mddev * mddev,int fd)7620 static int set_bitmap_file(struct mddev *mddev, int fd)
7621 {
7622 int err = 0;
7623
7624 if (!md_bitmap_registered(mddev))
7625 return -EINVAL;
7626
7627 if (mddev->pers) {
7628 if (!mddev->pers->quiesce || !mddev->thread)
7629 return -EBUSY;
7630 if (mddev->recovery || mddev->sync_thread)
7631 return -EBUSY;
7632 /* we should be able to change the bitmap.. */
7633 }
7634
7635 if (fd >= 0) {
7636 struct inode *inode;
7637 struct file *f;
7638
7639 if (mddev->bitmap || mddev->bitmap_info.file)
7640 return -EEXIST; /* cannot add when bitmap is present */
7641
7642 if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) {
7643 pr_warn("%s: bitmap files not supported by this kernel\n",
7644 mdname(mddev));
7645 return -EINVAL;
7646 }
7647 pr_warn("%s: using deprecated bitmap file support\n",
7648 mdname(mddev));
7649
7650 f = fget(fd);
7651
7652 if (f == NULL) {
7653 pr_warn("%s: error: failed to get bitmap file\n",
7654 mdname(mddev));
7655 return -EBADF;
7656 }
7657
7658 inode = f->f_mapping->host;
7659 if (!S_ISREG(inode->i_mode)) {
7660 pr_warn("%s: error: bitmap file must be a regular file\n",
7661 mdname(mddev));
7662 err = -EBADF;
7663 } else if (!(f->f_mode & FMODE_WRITE)) {
7664 pr_warn("%s: error: bitmap file must open for write\n",
7665 mdname(mddev));
7666 err = -EBADF;
7667 } else if (atomic_read(&inode->i_writecount) != 1) {
7668 pr_warn("%s: error: bitmap file is already in use\n",
7669 mdname(mddev));
7670 err = -EBUSY;
7671 }
7672 if (err) {
7673 fput(f);
7674 return err;
7675 }
7676 mddev->bitmap_info.file = f;
7677 mddev->bitmap_info.offset = 0; /* file overrides offset */
7678 } else if (mddev->bitmap == NULL)
7679 return -ENOENT; /* cannot remove what isn't there */
7680 err = 0;
7681 if (mddev->pers) {
7682 if (fd >= 0) {
7683 err = md_bitmap_create(mddev);
7684 if (!err)
7685 err = mddev->bitmap_ops->load(mddev);
7686
7687 if (err) {
7688 md_bitmap_destroy(mddev);
7689 fd = -1;
7690 }
7691 } else if (fd < 0) {
7692 md_bitmap_destroy(mddev);
7693 }
7694 }
7695
7696 if (fd < 0) {
7697 struct file *f = mddev->bitmap_info.file;
7698 if (f) {
7699 spin_lock(&mddev->lock);
7700 mddev->bitmap_info.file = NULL;
7701 spin_unlock(&mddev->lock);
7702 fput(f);
7703 }
7704 }
7705
7706 return err;
7707 }
7708
/*
 * md_set_array_info is used in two different ways.
 * The original usage is when creating a new array.
 * In this usage, raid_disks is > 0 and it, together with
 *  level, size, not_persistent, layout and chunksize, determines the
 *  shape of the array.
 *  This will always create an array with a type-0.90.0 superblock.
 * The newer usage is when assembling an array.
 *  In this case raid_disks will be 0, and the major_version field is
 *  used to determine which style of super-blocks are to be found on the
 *  devices.
 *  The minor and patch _version numbers are also kept in case the
 *  super_block handler wishes to interpret them.
 */
md_set_array_info(struct mddev * mddev,struct mdu_array_info_s * info)7722 int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info)
7723 {
7724 if (info->raid_disks == 0) {
7725 /* just setting version number for superblock loading */
7726 if (info->major_version < 0 ||
7727 info->major_version >= ARRAY_SIZE(super_types) ||
7728 super_types[info->major_version].name == NULL) {
7729 /* maybe try to auto-load a module? */
7730 pr_warn("md: superblock version %d not known\n",
7731 info->major_version);
7732 return -EINVAL;
7733 }
7734 mddev->major_version = info->major_version;
7735 mddev->minor_version = info->minor_version;
7736 mddev->patch_version = info->patch_version;
7737 mddev->persistent = !info->not_persistent;
7738 /* ensure mddev_put doesn't delete this now that there
7739 * is some minimal configuration.
7740 */
7741 mddev->ctime = ktime_get_real_seconds();
7742 return 0;
7743 }
7744 mddev->major_version = MD_MAJOR_VERSION;
7745 mddev->minor_version = MD_MINOR_VERSION;
7746 mddev->patch_version = MD_PATCHLEVEL_VERSION;
7747 mddev->ctime = ktime_get_real_seconds();
7748
7749 mddev->level = info->level;
7750 mddev->clevel[0] = 0;
7751 mddev->dev_sectors = 2 * (sector_t)info->size;
7752 mddev->raid_disks = info->raid_disks;
7753 /* don't set md_minor, it is determined by which /dev/md* was
7754 * openned
7755 */
7756 if (info->state & (1<<MD_SB_CLEAN))
7757 mddev->resync_offset = MaxSector;
7758 else
7759 mddev->resync_offset = 0;
7760 mddev->persistent = ! info->not_persistent;
7761 mddev->external = 0;
7762
7763 mddev->layout = info->layout;
7764 if (mddev->level == 0)
7765 /* Cannot trust RAID0 layout info here */
7766 mddev->layout = -1;
7767 mddev->chunk_sectors = info->chunk_size >> 9;
7768
7769 if (mddev->persistent) {
7770 mddev->max_disks = MD_SB_DISKS;
7771 mddev->flags = 0;
7772 mddev->sb_flags = 0;
7773 }
7774 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
7775
7776 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
7777 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
7778 mddev->bitmap_info.offset = 0;
7779
7780 mddev->reshape_position = MaxSector;
7781
7782 /*
7783 * Generate a 128 bit UUID
7784 */
7785 get_random_bytes(mddev->uuid, 16);
7786
7787 mddev->new_level = mddev->level;
7788 mddev->new_chunk_sectors = mddev->chunk_sectors;
7789 mddev->new_layout = mddev->layout;
7790 mddev->delta_disks = 0;
7791 mddev->reshape_backwards = 0;
7792
7793 return 0;
7794 }
7795
/*
 * Record @array_sectors as the array size, unless mddev->external_size
 * is set, in which case the current value is left untouched.
 *
 * The caller must hold mddev->reconfig_mutex.
 */
void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
{
	lockdep_assert_held(&mddev->reconfig_mutex);

	if (!mddev->external_size)
		mddev->array_sectors = array_sectors;
}
EXPORT_SYMBOL(md_set_array_sectors);
7806
/*
 * Change the number of sectors used on each member device.
 *
 * @num_sectors: requested per-device size; 0 means "use the largest
 *               size that fits on every device".
 *
 * Returns 0 on success, -EINVAL if the personality cannot resize,
 * -EBUSY while recovery is running, -EROFS if the array is not
 * read-write, -ENOSPC if a device is too small, or the error from the
 * personality's resize().
 */
static int update_size(struct mddev *mddev, sector_t num_sectors)
{
	sector_t old_dev_sectors = mddev->dev_sectors;
	int fit = (num_sectors == 0);
	struct md_rdev *rdev;
	int rv;

	if (!mddev->pers->resize)
		return -EINVAL;
	/* The "num_sectors" is the number of sectors of each device that
	 * is used.  This can only make sense for arrays with redundancy.
	 * linear and raid0 always use whatever space is available. We can only
	 * consider changing this number if no resync or reconstruction is
	 * happening, and if the new size is acceptable. It must fit before the
	 * sb_start or, if that is <data_offset, it must fit before the size
	 * of each device.  If num_sectors is zero, we find the largest size
	 * that fits.
	 */
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		return -EBUSY;
	if (!md_is_rdwr(mddev))
		return -EROFS;

	rdev_for_each(rdev, mddev) {
		sector_t avail = rdev->sectors;

		/* in "fit" mode shrink the target to the smallest device */
		if (fit && (num_sectors == 0 || num_sectors > avail))
			num_sectors = avail;
		if (avail < num_sectors)
			return -ENOSPC;
	}

	rv = mddev->pers->resize(mddev, num_sectors);
	if (rv)
		return rv;

	if (mddev_is_clustered(mddev))
		mddev->cluster_ops->update_size(mddev, old_dev_sectors);
	else if (!mddev_is_dm(mddev))
		set_capacity_and_notify(mddev->gendisk,
					mddev->array_sectors);

	return 0;
}
7848
update_raid_disks(struct mddev * mddev,int raid_disks)7849 static int update_raid_disks(struct mddev *mddev, int raid_disks)
7850 {
7851 int rv;
7852 struct md_rdev *rdev;
7853 /* change the number of raid disks */
7854 if (mddev->pers->check_reshape == NULL)
7855 return -EINVAL;
7856 if (!md_is_rdwr(mddev))
7857 return -EROFS;
7858 if (raid_disks <= 0 ||
7859 (mddev->max_disks && raid_disks >= mddev->max_disks))
7860 return -EINVAL;
7861 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
7862 test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) ||
7863 mddev->reshape_position != MaxSector)
7864 return -EBUSY;
7865
7866 rdev_for_each(rdev, mddev) {
7867 if (mddev->raid_disks < raid_disks &&
7868 rdev->data_offset < rdev->new_data_offset)
7869 return -EINVAL;
7870 if (mddev->raid_disks > raid_disks &&
7871 rdev->data_offset > rdev->new_data_offset)
7872 return -EINVAL;
7873 }
7874
7875 mddev->delta_disks = raid_disks - mddev->raid_disks;
7876 if (mddev->delta_disks < 0)
7877 mddev->reshape_backwards = 1;
7878 else if (mddev->delta_disks > 0)
7879 mddev->reshape_backwards = 0;
7880
7881 rv = mddev->pers->check_reshape(mddev);
7882 if (rv < 0) {
7883 mddev->delta_disks = 0;
7884 mddev->reshape_backwards = 0;
7885 }
7886 return rv;
7887 }
7888
get_cluster_ops(struct mddev * mddev)7889 static int get_cluster_ops(struct mddev *mddev)
7890 {
7891 xa_lock(&md_submodule);
7892 mddev->cluster_ops = xa_load(&md_submodule, ID_CLUSTER);
7893 if (mddev->cluster_ops &&
7894 !try_module_get(mddev->cluster_ops->head.owner))
7895 mddev->cluster_ops = NULL;
7896 xa_unlock(&md_submodule);
7897
7898 return mddev->cluster_ops == NULL ? -ENOENT : 0;
7899 }
7900
put_cluster_ops(struct mddev * mddev)7901 static void put_cluster_ops(struct mddev *mddev)
7902 {
7903 if (!mddev->cluster_ops)
7904 return;
7905
7906 mddev->cluster_ops->leave(mddev);
7907 module_put(mddev->cluster_ops->head.owner);
7908 mddev->cluster_ops = NULL;
7909 }
7910
7911 /*
7912 * update_array_info is used to change the configuration of an
7913 * on-line array.
7914 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size
7915 * fields in the info are checked against the array.
7916 * Any differences that cannot be handled will cause an error.
7917 * Normally, only one change can be managed at a time.
7918 */
static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
{
	int rv = 0;
	int cnt = 0;	/* number of fields that differ from the array */
	int state = 0;	/* state bits the array currently implies */

	/* calculate expected state,ignoring low bits */
	if (mddev->bitmap && mddev->bitmap_info.offset)
		state |= (1 << MD_SB_BITMAP_PRESENT);

	/*
	 * Fields that can never be changed through this path must match
	 * exactly; any mismatch rejects the whole request.
	 */
	if (mddev->major_version != info->major_version ||
	    mddev->minor_version != info->minor_version ||
/*	    mddev->patch_version != info->patch_version || */
	    mddev->ctime         != info->ctime         ||
	    mddev->level         != info->level         ||
/*	    mddev->layout        != info->layout        || */
	    mddev->persistent	 != !info->not_persistent ||
	    mddev->chunk_sectors != info->chunk_size >> 9 ||
	    /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
	    /* (the mask 0xfffffe00 leaves bits 0-8 out of the comparison) */
	    ((state^info->state) & 0xfffffe00)
		)
		return -EINVAL;
	/* Check there is only one change */
	/* a negative info->size means "size not specified" and is ignored */
	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
		cnt++;
	if (mddev->raid_disks != info->raid_disks)
		cnt++;
	if (mddev->layout != info->layout)
		cnt++;
	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
		cnt++;
	if (cnt == 0)
		return 0;
	if (cnt > 1)
		return -EINVAL;

	/* Exactly one of the four changes below applies from here on. */
	if (mddev->layout != info->layout) {
		/* Change layout
		 * we don't need to do anything at the md level, the
		 * personality will take care of it all.
		 */
		if (mddev->pers->check_reshape == NULL)
			return -EINVAL;
		else {
			mddev->new_layout = info->layout;
			rv = mddev->pers->check_reshape(mddev);
			/* undo the tentative new_layout on failure */
			if (rv)
				mddev->new_layout = mddev->layout;
			/* note: this path returns without md_update_sb() */
			return rv;
		}
	}
	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
		rv = update_size(mddev, (sector_t)info->size * 2);

	if (mddev->raid_disks != info->raid_disks)
		rv = update_raid_disks(mddev, info->raid_disks);

	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
		/* same preconditions as set_bitmap_file(), but -EINVAL here */
		if (mddev->pers->quiesce == NULL || mddev->thread == NULL) {
			rv = -EINVAL;
			goto err;
		}
		if (mddev->recovery || mddev->sync_thread) {
			rv = -EBUSY;
			goto err;
		}
		if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
			/* add the bitmap */
			if (mddev->bitmap) {
				rv = -EEXIST;
				goto err;
			}
			if (mddev->bitmap_info.default_offset == 0) {
				rv = -EINVAL;
				goto err;
			}
			mddev->bitmap_info.offset =
				mddev->bitmap_info.default_offset;
			mddev->bitmap_info.space =
				mddev->bitmap_info.default_space;
			rv = md_bitmap_create(mddev);
			if (!rv)
				rv = mddev->bitmap_ops->load(mddev);

			/* create or load failed: tear the bitmap down again */
			if (rv)
				md_bitmap_destroy(mddev);
		} else {
			/* remove the bitmap */
			struct md_bitmap_stats stats;

			rv = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
			if (rv)
				goto err;

			/* a bitmap for which stats.file is set is not removed here */
			if (stats.file) {
				rv = -EINVAL;
				goto err;
			}

			if (mddev->bitmap_info.nodes) {
				/* hold PW on all the bitmap lock */
				if (mddev->cluster_ops->lock_all_bitmaps(mddev) <= 0) {
					pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n");
					rv = -EPERM;
					mddev->cluster_ops->unlock_all_bitmaps(mddev);
					goto err;
				}

				/* the array stops being clustered */
				mddev->bitmap_info.nodes = 0;
				put_cluster_ops(mddev);
				mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
			}
			md_bitmap_destroy(mddev);
			mddev->bitmap_info.offset = 0;
		}
	}
	/*
	 * Reached for size, raid_disks and bitmap changes whether or not rv
	 * is 0; only the "goto err" paths skip the superblock update.
	 */
	md_update_sb(mddev, 1);
	return rv;
err:
	return rv;
}
8039
/*
 * Report an error on member device @dev of the running array @mddev by
 * calling md_error() on it.  The lookup and md_error() run under
 * rcu_read_lock().
 *
 * Returns 0 on success, -ENODEV if the array has no personality or @dev
 * is not a member, or -EBUSY if MD_BROKEN is set in mddev->flags after
 * md_error().
 */
static int set_disk_faulty(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;
	int err = 0;

	if (!mddev->pers)
		return -ENODEV;

	rcu_read_lock();
	rdev = md_find_rdev_rcu(mddev, dev);
	if (rdev) {
		md_error(mddev, rdev);
		if (test_bit(MD_BROKEN, &mddev->flags))
			err = -EBUSY;
	} else {
		err = -ENODEV;
	}
	rcu_read_unlock();

	return err;
}
8060
8061 /*
8062 * We have a problem here : there is no easy way to give a CHS
8063 * virtual geometry. We currently pretend that we have a 2 heads
8064 * 4 sectors (with a BIG number of cylinders...). This drives
8065 * dosfs just mad... ;-)
8066 */
md_getgeo(struct gendisk * disk,struct hd_geometry * geo)8067 static int md_getgeo(struct gendisk *disk, struct hd_geometry *geo)
8068 {
8069 struct mddev *mddev = disk->private_data;
8070
8071 geo->heads = 2;
8072 geo->sectors = 4;
8073 geo->cylinders = mddev->array_sectors / 8;
8074 return 0;
8075 }
8076
md_ioctl_valid(unsigned int cmd)8077 static inline int md_ioctl_valid(unsigned int cmd)
8078 {
8079 switch (cmd) {
8080 case GET_ARRAY_INFO:
8081 case GET_DISK_INFO:
8082 case RAID_VERSION:
8083 return 0;
8084 case ADD_NEW_DISK:
8085 case GET_BITMAP_FILE:
8086 case HOT_ADD_DISK:
8087 case HOT_REMOVE_DISK:
8088 case RESTART_ARRAY_RW:
8089 case RUN_ARRAY:
8090 case SET_ARRAY_INFO:
8091 case SET_BITMAP_FILE:
8092 case SET_DISK_FAULTY:
8093 case STOP_ARRAY:
8094 case STOP_ARRAY_RO:
8095 case CLUSTERED_DISK_NACK:
8096 if (!capable(CAP_SYS_ADMIN))
8097 return -EACCES;
8098 return 0;
8099 default:
8100 return -ENOTTY;
8101 }
8102 }
8103
md_ioctl_need_suspend(unsigned int cmd)8104 static bool md_ioctl_need_suspend(unsigned int cmd)
8105 {
8106 switch (cmd) {
8107 case ADD_NEW_DISK:
8108 case HOT_ADD_DISK:
8109 case HOT_REMOVE_DISK:
8110 case SET_BITMAP_FILE:
8111 case SET_ARRAY_INFO:
8112 return true;
8113 default:
8114 return false;
8115 }
8116 }
8117
/*
 * SET_ARRAY_INFO worker.
 *
 * @argp: user pointer to an mdu_array_info_t, or NULL to use an
 *        all-zero structure.
 *
 * For a running array (mddev->pers set) the request is passed to
 * update_array_info().  Otherwise the array must have no devices and
 * must not be initialised yet, and the request is passed to
 * md_set_array_info().
 *
 * Returns 0 on success, -EFAULT if the user copy fails, -EBUSY if the
 * array already has disks or is already initialised, or the error of
 * the called helper.
 */
static int __md_set_array_info(struct mddev *mddev, void __user *argp)
{
	mdu_array_info_t info;
	int err;

	if (argp) {
		if (copy_from_user(&info, argp, sizeof(info)))
			return -EFAULT;
	} else {
		memset(&info, 0, sizeof(info));
	}

	if (mddev->pers) {
		err = update_array_info(mddev, &info);
		if (err)
			pr_warn("md: couldn't update array info. %d\n", err);
		return err;
	}

	if (!list_empty(&mddev->disks)) {
		pr_warn("md: array %s already has disks!\n", mdname(mddev));
		return -EBUSY;
	}

	if (mddev->raid_disks) {
		pr_warn("md: array %s already initialised!\n", mdname(mddev));
		return -EBUSY;
	}

	err = md_set_array_info(mddev, &info);
	if (err)
		pr_warn("md: couldn't set array info. %d\n", err);

	return err;
}
8151
/*
 * md_ioctl() - ioctl entry point for md block devices.
 *
 * @bdev: block device the ioctl was issued on; its gendisk's
 *        private_data is the mddev.
 * @mode: open mode (not used in this function).
 * @cmd:  ioctl command.
 * @arg:  command argument; a user pointer for most commands, an encoded
 *        device number or file descriptor for others.
 *
 * Processing order:
 *  1. md_ioctl_valid() rejects unknown commands and missing privileges.
 *  2. RAID_VERSION is answered without looking at the array.
 *  3. GET_ARRAY_INFO, GET_DISK_INFO, SET_DISK_FAULTY and GET_BITMAP_FILE
 *     are handled before any lock is taken.
 *  4. Everything else runs with the mddev locked (and additionally
 *     suspended for the commands named by md_ioctl_need_suspend()).
 *
 * Returns 0 on success or a negative errno.
 */
static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
			unsigned int cmd, unsigned long arg)
{
	int err = 0;
	void __user *argp = (void __user *)arg;
	struct mddev *mddev = NULL;

	err = md_ioctl_valid(cmd);
	if (err)
		return err;

	/*
	 * Commands dealing with the RAID driver but not any
	 * particular array:
	 */
	if (cmd == RAID_VERSION)
		return get_version(argp);

	/*
	 * Commands creating/starting a new array:
	 */

	mddev = bdev->bd_disk->private_data;

	/* Some actions do not require the mutex */
	switch (cmd) {
	case GET_ARRAY_INFO:
		if (!mddev->raid_disks && !mddev->external)
			return -ENODEV;
		return get_array_info(mddev, argp);

	case GET_DISK_INFO:
		if (!mddev->raid_disks && !mddev->external)
			return -ENODEV;
		return get_disk_info(mddev, argp);

	case SET_DISK_FAULTY:
		return set_disk_faulty(mddev, new_decode_dev(arg));

	case GET_BITMAP_FILE:
		return get_bitmap_file(mddev, argp);
	}

	if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
		/* Need to flush page cache, and ensure no-one else opens
		 * and writes
		 */
		err = mddev_set_closing_and_sync_blockdev(mddev, 1);
		if (err)
			return err;
	}

	if (!md_is_rdwr(mddev))
		flush_work(&mddev->sync_work);

	/* the unlock at the "unlock" label must use the matching variant */
	err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev) :
					   mddev_lock(mddev);
	if (err) {
		pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n",
			 err, cmd);
		goto out;
	}

	if (cmd == SET_ARRAY_INFO) {
		err = __md_set_array_info(mddev, argp);
		goto unlock;
	}

	/*
	 * Commands querying/configuring an existing array:
	 */
	/* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
	 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
	if ((!mddev->raid_disks && !mddev->external)
	    && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
	    && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
	    && cmd != GET_BITMAP_FILE) {
		err = -ENODEV;
		goto unlock;
	}

	/*
	 * Commands even a read-only array can execute:
	 */
	switch (cmd) {
	case RESTART_ARRAY_RW:
		err = restart_array(mddev);
		goto unlock;

	case STOP_ARRAY:
		err = do_md_stop(mddev, 0);
		goto unlock;

	case STOP_ARRAY_RO:
		/* without a personality this is a no-op that returns 0 */
		if (mddev->pers)
			err = md_set_readonly(mddev);
		goto unlock;

	case HOT_REMOVE_DISK:
		err = hot_remove_disk(mddev, new_decode_dev(arg));
		goto unlock;

	case ADD_NEW_DISK:
		/* We can support ADD_NEW_DISK on read-only arrays
		 * only if we are re-adding a preexisting device.
		 * So require mddev->pers and MD_DISK_SYNC.
		 */
		if (mddev->pers) {
			mdu_disk_info_t info;
			if (copy_from_user(&info, argp, sizeof(info)))
				err = -EFAULT;
			else if (!(info.state & (1<<MD_DISK_SYNC)))
				/* Need to clear read-only for this */
				break;
			else
				err = md_add_new_disk(mddev, &info);
			goto unlock;
		}
		break;
	}

	/*
	 * The remaining ioctls are changing the state of the
	 * superblock, so we do not allow them on read-only arrays.
	 */
	if (!md_is_rdwr(mddev) && mddev->pers) {
		/* only an MD_AUTO_READ array is switched to read-write here */
		if (mddev->ro != MD_AUTO_READ) {
			err = -EROFS;
			goto unlock;
		}
		mddev->ro = MD_RDWR;
		sysfs_notify_dirent_safe(mddev->sysfs_state);
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		/* mddev_unlock will wake thread */
		/* If a device failed while we were read-only, we
		 * need to make sure the metadata is updated now.
		 */
		if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) {
			/*
			 * Drop the lock while waiting for both superblock
			 * flags to clear, then retake it uninterruptibly.
			 */
			mddev_unlock(mddev);
			wait_event(mddev->sb_wait,
				   !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) &&
				   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
			mddev_lock_nointr(mddev);
		}
	}

	switch (cmd) {
	case ADD_NEW_DISK:
	{
		mdu_disk_info_t info;
		if (copy_from_user(&info, argp, sizeof(info)))
			err = -EFAULT;
		else
			err = md_add_new_disk(mddev, &info);
		goto unlock;
	}

	case CLUSTERED_DISK_NACK:
		/* the return value of new_disk_ack() is not propagated */
		if (mddev_is_clustered(mddev))
			mddev->cluster_ops->new_disk_ack(mddev, false);
		else
			err = -EINVAL;
		goto unlock;

	case HOT_ADD_DISK:
		err = hot_add_disk(mddev, new_decode_dev(arg));
		goto unlock;

	case RUN_ARRAY:
		err = do_md_run(mddev);
		goto unlock;

	case SET_BITMAP_FILE:
		err = set_bitmap_file(mddev, (int)arg);
		goto unlock;

	default:
		err = -EINVAL;
		goto unlock;
	}

unlock:
	/* any result other than -EINVAL ends an UNTIL_IOCTL hold */
	if (mddev->hold_active == UNTIL_IOCTL &&
	    err != -EINVAL)
		mddev->hold_active = 0;

	md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) :
				     mddev_unlock(mddev);

out:
	/*
	 * Clear MD_CLOSING always for STOP_ARRAY_RO, and for STOP_ARRAY only
	 * when it failed.
	 * NOTE(review): presumably the flag was set by
	 * mddev_set_closing_and_sync_blockdev() above - confirm.
	 */
	if (cmd == STOP_ARRAY_RO || (err && cmd == STOP_ARRAY))
		clear_bit(MD_CLOSING, &mddev->flags);
	return err;
}
8346 #ifdef CONFIG_COMPAT
/*
 * Compat ioctl entry point (installed as .compat_ioctl when CONFIG_COMPAT
 * is set).  Commands whose @arg is a plain integer are passed through
 * untouched; for every other command @arg is treated as a user pointer
 * and converted with compat_ptr() before being handed to md_ioctl().
 */
static int md_compat_ioctl(struct block_device *bdev, blk_mode_t mode,
			   unsigned int cmd, unsigned long arg)
{
	/* These take in integer arg, do not convert */
	bool int_arg = cmd == HOT_REMOVE_DISK ||
		       cmd == HOT_ADD_DISK ||
		       cmd == SET_DISK_FAULTY ||
		       cmd == SET_BITMAP_FILE;

	if (!int_arg)
		arg = (unsigned long)compat_ptr(arg);

	return md_ioctl(bdev, mode, cmd, arg);
}
8364 #endif /* CONFIG_COMPAT */
8365
/*
 * .set_read_only handler for md disks.
 *
 * Returns the error from mddev_lock() if the lock cannot be taken,
 * -ENODEV when the array has neither raid_disks nor external metadata,
 * the error from restart_array() if that fails, and 0 otherwise.
 */
static int md_set_read_only(struct block_device *bdev, bool ro)
{
	struct mddev *mddev = bdev->bd_disk->private_data;
	int ret = mddev_lock(mddev);

	if (ret)
		return ret;

	if (!mddev->raid_disks && !mddev->external) {
		ret = -ENODEV;
	} else if (!ro && mddev->ro == MD_RDONLY && mddev->pers) {
		/*
		 * Transitioning to read-auto need only happen for arrays
		 * that call md_write_start and which are not ready for
		 * writes yet.
		 */
		ret = restart_array(mddev);
		if (!ret)
			mddev->ro = MD_AUTO_READ;
	}

	mddev_unlock(mddev);
	return ret;
}
8395
/*
 * .open handler for md disks.
 *
 * Takes a reference on the mddev (under all_mddevs_lock) and, unless the
 * array is flagged MD_CLOSING, bumps the opener count.  On success the
 * reference is kept and 0 is returned.  Returns -ENODEV if no reference
 * could be obtained or the array is closing, or the error from
 * mutex_lock_interruptible(); the reference is dropped on every failure
 * after it was taken.
 */
static int md_open(struct gendisk *disk, blk_mode_t mode)
{
	struct mddev *mddev;
	int ret;

	spin_lock(&all_mddevs_lock);
	mddev = mddev_get(disk->private_data);
	spin_unlock(&all_mddevs_lock);
	if (!mddev)
		return -ENODEV;

	ret = mutex_lock_interruptible(&mddev->open_mutex);
	if (ret) {
		mddev_put(mddev);
		return ret;
	}

	if (test_bit(MD_CLOSING, &mddev->flags)) {
		mutex_unlock(&mddev->open_mutex);
		mddev_put(mddev);
		return -ENODEV;
	}

	atomic_inc(&mddev->openers);
	mutex_unlock(&mddev->open_mutex);

	disk_check_media_change(disk);
	return 0;
}
8427
md_release(struct gendisk * disk)8428 static void md_release(struct gendisk *disk)
8429 {
8430 struct mddev *mddev = disk->private_data;
8431
8432 BUG_ON(!mddev);
8433 atomic_dec(&mddev->openers);
8434 mddev_put(mddev);
8435 }
8436
md_check_events(struct gendisk * disk,unsigned int clearing)8437 static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing)
8438 {
8439 struct mddev *mddev = disk->private_data;
8440 unsigned int ret = 0;
8441
8442 if (mddev->changed)
8443 ret = DISK_EVENT_MEDIA_CHANGE;
8444 mddev->changed = 0;
8445 return ret;
8446 }
8447
md_free_disk(struct gendisk * disk)8448 static void md_free_disk(struct gendisk *disk)
8449 {
8450 struct mddev *mddev = disk->private_data;
8451
8452 mddev_free(mddev);
8453 }
8454
/* Block device operations table for md disks. */
const struct block_device_operations md_fops =
{
	.owner		= THIS_MODULE,
	.submit_bio	= md_submit_bio,
	.open		= md_open,
	.release	= md_release,
	.ioctl		= md_ioctl,
#ifdef CONFIG_COMPAT
	/* wrapper that converts pointer arguments before calling md_ioctl() */
	.compat_ioctl	= md_compat_ioctl,
#endif
	.getgeo		= md_getgeo,
	.check_events	= md_check_events,
	.set_read_only	= md_set_read_only,
	.free_disk	= md_free_disk,
};
8470
/*
 * Main loop run by every md kernel thread; @arg is the struct md_thread
 * passed to kthread_run() by md_register_thread().  Each pass sleeps
 * until THREAD_WAKEUP is set, a stop or park is requested, or
 * thread->timeout elapses, then calls thread->run() unless a stop is
 * pending.  Always returns 0.
 */
static int md_thread(void *arg)
{
	struct md_thread *thread = arg;

	/*
	 * md_thread is a 'system-thread', its priority should be very
	 * high. We avoid resource deadlocks individually in each
	 * raid personality. (RAID5 does preallocation) We also use RR and
	 * the very same RT priority as kswapd, thus we will never get
	 * into a priority inversion deadlock.
	 *
	 * we definitely have to have equal or higher priority than
	 * bdflush, otherwise bdflush will deadlock if there are too
	 * many dirty RAID5 blocks.
	 */

	allow_signal(SIGKILL);
	while (!kthread_should_stop()) {

		/* We need to wait INTERRUPTIBLE so that
		 * we don't add to the load-average.
		 * That means we need to be sure no signals are
		 * pending
		 */
		if (signal_pending(current))
			flush_signals(current);

		wait_event_interruptible_timeout
			(thread->wqueue,
			 test_bit(THREAD_WAKEUP, &thread->flags)
			 || kthread_should_stop() || kthread_should_park(),
			 thread->timeout);

		/* Consume the wakeup request before running the handler. */
		clear_bit(THREAD_WAKEUP, &thread->flags);
		if (kthread_should_park())
			kthread_parkme();
		/* Re-check: a stop may have been requested while sleeping or parked. */
		if (!kthread_should_stop())
			thread->run(thread);
	}

	return 0;
}
8513
/*
 * Wake the task behind *@thread with wake_up_process(), without setting
 * THREAD_WAKEUP.  The pointer is sampled under rcu_read_lock(); nothing
 * happens if it is NULL.
 */
static void md_wakeup_thread_directly(struct md_thread __rcu **thread)
{
	struct md_thread *target;

	rcu_read_lock();
	target = rcu_dereference(*thread);
	if (target != NULL)
		wake_up_process(target->tsk);
	rcu_read_unlock();
}
8524
/*
 * Request a run of the md thread @thread: set THREAD_WAKEUP and, if a
 * sleeper is present on its wait queue, wake it.  A NULL thread is
 * ignored.  Uses rcu_dereference(), so presumably the caller provides
 * the RCU read-side protection -- confirm against the callers.
 */
void __md_wakeup_thread(struct md_thread __rcu *thread)
{
	struct md_thread *target = rcu_dereference(thread);

	if (!target)
		return;

	pr_debug("md: waking up MD thread %s.\n", target->tsk->comm);
	set_bit(THREAD_WAKEUP, &target->flags);
	if (wq_has_sleeper(&target->wqueue))
		wake_up(&target->wqueue);
}
EXPORT_SYMBOL(__md_wakeup_thread);
8538
md_register_thread(void (* run)(struct md_thread *),struct mddev * mddev,const char * name)8539 struct md_thread *md_register_thread(void (*run) (struct md_thread *),
8540 struct mddev *mddev, const char *name)
8541 {
8542 struct md_thread *thread;
8543
8544 thread = kzalloc_obj(struct md_thread);
8545 if (!thread)
8546 return NULL;
8547
8548 init_waitqueue_head(&thread->wqueue);
8549
8550 thread->run = run;
8551 thread->mddev = mddev;
8552 thread->timeout = MAX_SCHEDULE_TIMEOUT;
8553 thread->tsk = kthread_run(md_thread, thread,
8554 "%s_%s",
8555 mdname(thread->mddev),
8556 name);
8557 if (IS_ERR(thread->tsk)) {
8558 kfree(thread);
8559 return NULL;
8560 }
8561 return thread;
8562 }
8563 EXPORT_SYMBOL(md_register_thread);
8564
/*
 * Stop and free the md thread published in *@threadp, if any.
 * The rcu_dereference_protected() check expects mddev->reconfig_mutex
 * to be held by the caller.
 */
void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp)
{
	struct md_thread *thread = rcu_dereference_protected(*threadp,
					lockdep_is_held(&mddev->reconfig_mutex));

	if (!thread)
		return;

	/*
	 * Unpublish the pointer first and wait for an RCU grace period so
	 * that RCU readers no longer see the thread before it is stopped
	 * and freed.
	 */
	rcu_assign_pointer(*threadp, NULL);
	synchronize_rcu();

	pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
	kthread_stop(thread->tsk);
	kfree(thread);
}
EXPORT_SYMBOL(md_unregister_thread);
8581
/*
 * Report a failure of @rdev to the array's personality and kick off the
 * follow-up work (recovery flags, thread wakeup, sysfs/event
 * notification).
 */
void md_error(struct mddev *mddev, struct md_rdev *rdev)
{
	/* Nothing to do without an rdev or for one already marked Faulty. */
	if (!rdev || test_bit(Faulty, &rdev->flags))
		return;

	/* No personality, or one that provides no error handler. */
	if (!mddev->pers || !mddev->pers->error_handler)
		return;
	mddev->pers->error_handler(mddev, rdev);

	/* RAID0 and linear stop here: none of the recovery handling below applies. */
	if (mddev->pers->head.id == ID_RAID0 ||
	    mddev->pers->head.id == ID_LINEAR)
		return;

	if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags))
		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
	sysfs_notify_dirent_safe(rdev->sysfs_state);
	/* Interrupt any sync action that is currently running. */
	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
	/* Only ask the md thread to look at recovery if the array is not broken. */
	if (!test_bit(MD_BROKEN, &mddev->flags)) {
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	}
	if (mddev->event_work.func)
		queue_work(md_misc_wq, &mddev->event_work);
	md_new_event();
}
EXPORT_SYMBOL(md_error);
8608
8609 /* seq_file implementation /proc/mdstat */
8610
status_unused(struct seq_file * seq)8611 static void status_unused(struct seq_file *seq)
8612 {
8613 int i = 0;
8614 struct md_rdev *rdev;
8615
8616 seq_printf(seq, "unused devices: ");
8617
8618 list_for_each_entry(rdev, &pending_raid_disks, same_set) {
8619 i++;
8620 seq_printf(seq, "%pg ", rdev->bdev);
8621 }
8622 if (!i)
8623 seq_printf(seq, "<none>");
8624
8625 seq_printf(seq, "\n");
8626 }
8627
status_personalities(struct seq_file * seq)8628 static void status_personalities(struct seq_file *seq)
8629 {
8630 struct md_submodule_head *head;
8631 unsigned long i;
8632
8633 seq_puts(seq, "Personalities : ");
8634
8635 xa_lock(&md_submodule);
8636 xa_for_each(&md_submodule, i, head)
8637 if (head->type == MD_PERSONALITY)
8638 seq_printf(seq, "[%s] ", head->name);
8639 xa_unlock(&md_submodule);
8640
8641 seq_puts(seq, "\n");
8642 }
8643
status_resync(struct seq_file * seq,struct mddev * mddev)8644 static int status_resync(struct seq_file *seq, struct mddev *mddev)
8645 {
8646 sector_t max_sectors, resync, res;
8647 unsigned long dt, db = 0;
8648 sector_t rt, curr_mark_cnt, resync_mark_cnt;
8649 int scale, recovery_active;
8650 unsigned int per_milli;
8651
8652 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
8653 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
8654 max_sectors = mddev->resync_max_sectors;
8655 else
8656 max_sectors = mddev->dev_sectors;
8657
8658 resync = mddev->curr_resync;
8659 if (resync < MD_RESYNC_ACTIVE) {
8660 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
8661 /* Still cleaning up */
8662 resync = max_sectors;
8663 } else if (resync > max_sectors) {
8664 resync = max_sectors;
8665 } else {
8666 res = atomic_read(&mddev->recovery_active);
8667 /*
8668 * Resync has started, but the subtraction has overflowed or
8669 * yielded one of the special values. Force it to active to
8670 * ensure the status reports an active resync.
8671 */
8672 if (resync < res || resync - res < MD_RESYNC_ACTIVE)
8673 resync = MD_RESYNC_ACTIVE;
8674 else
8675 resync -= res;
8676 }
8677
8678 if (resync == MD_RESYNC_NONE) {
8679 if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) {
8680 struct md_rdev *rdev;
8681
8682 rdev_for_each(rdev, mddev)
8683 if (rdev->raid_disk >= 0 &&
8684 !test_bit(Faulty, &rdev->flags) &&
8685 rdev->recovery_offset != MaxSector &&
8686 rdev->recovery_offset) {
8687 seq_printf(seq, "\trecover=REMOTE");
8688 return 1;
8689 }
8690 if (mddev->reshape_position != MaxSector)
8691 seq_printf(seq, "\treshape=REMOTE");
8692 else
8693 seq_printf(seq, "\tresync=REMOTE");
8694 return 1;
8695 }
8696 if (mddev->resync_offset < MaxSector) {
8697 seq_printf(seq, "\tresync=PENDING");
8698 return 1;
8699 }
8700 return 0;
8701 }
8702 if (resync < MD_RESYNC_ACTIVE) {
8703 seq_printf(seq, "\tresync=DELAYED");
8704 return 1;
8705 }
8706
8707 WARN_ON(max_sectors == 0);
8708 /* Pick 'scale' such that (resync>>scale)*1000 will fit
8709 * in a sector_t, and (max_sectors>>scale) will fit in a
8710 * u32, as those are the requirements for sector_div.
8711 * Thus 'scale' must be at least 10
8712 */
8713 scale = 10;
8714 if (sizeof(sector_t) > sizeof(unsigned long)) {
8715 while ( max_sectors/2 > (1ULL<<(scale+32)))
8716 scale++;
8717 }
8718 res = (resync>>scale)*1000;
8719 sector_div(res, (u32)((max_sectors>>scale)+1));
8720
8721 per_milli = res;
8722 {
8723 int i, x = per_milli/50, y = 20-x;
8724 seq_printf(seq, "[");
8725 for (i = 0; i < x; i++)
8726 seq_printf(seq, "=");
8727 seq_printf(seq, ">");
8728 for (i = 0; i < y; i++)
8729 seq_printf(seq, ".");
8730 seq_printf(seq, "] ");
8731 }
8732 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
8733 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
8734 "reshape" :
8735 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
8736 "check" :
8737 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
8738 "resync" : "recovery"))),
8739 per_milli/10, per_milli % 10,
8740 (unsigned long long) resync/2,
8741 (unsigned long long) max_sectors/2);
8742
8743 /*
8744 * dt: time from mark until now
8745 * db: blocks written from mark until now
8746 * rt: remaining time
8747 *
8748 * rt is a sector_t, which is always 64bit now. We are keeping
8749 * the original algorithm, but it is not really necessary.
8750 *
8751 * Original algorithm:
8752 * So we divide before multiply in case it is 32bit and close
8753 * to the limit.
8754 * We scale the divisor (db) by 32 to avoid losing precision
8755 * near the end of resync when the number of remaining sectors
8756 * is close to 'db'.
8757 * We then divide rt by 32 after multiplying by db to compensate.
8758 * The '+1' avoids division by zero if db is very small.
8759 */
8760 dt = ((jiffies - mddev->resync_mark) / HZ);
8761 if (!dt) dt++;
8762
8763 curr_mark_cnt = mddev->curr_mark_cnt;
8764 recovery_active = atomic_read(&mddev->recovery_active);
8765 resync_mark_cnt = mddev->resync_mark_cnt;
8766
8767 if (curr_mark_cnt >= (recovery_active + resync_mark_cnt))
8768 db = curr_mark_cnt - (recovery_active + resync_mark_cnt);
8769
8770 rt = max_sectors - resync; /* number of remaining sectors */
8771 rt = div64_u64(rt, db/32+1);
8772 rt *= dt;
8773 rt >>= 5;
8774
8775 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
8776 ((unsigned long)rt % 60)/6);
8777
8778 seq_printf(seq, " speed=%ldK/sec", db/2/dt);
8779 return 1;
8780 }
8781
/*
 * seq_file ->start: snapshot md_event_count into seq->poll_event, take
 * all_mddevs_lock (released in md_seq_stop()) and return the entry at
 * *@pos.  seq_list_start_head() is used, so position 0 is the list head
 * itself, which md_seq_show() recognises as v == &all_mddevs.
 */
static void *md_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(&all_mddevs_lock)
{
	seq->poll_event = atomic_read(&md_event_count);
	spin_lock(&all_mddevs_lock);

	return seq_list_start_head(&all_mddevs, *pos);
}
8790
/* seq_file ->next: advance to the entry after @v on the all_mddevs list. */
static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &all_mddevs, pos);
}
8795
/* seq_file ->stop: release all_mddevs_lock taken in md_seq_start(). */
static void md_seq_stop(struct seq_file *seq, void *v)
	__releases(&all_mddevs_lock)
{
	spin_unlock(&all_mddevs_lock);
}
8801
md_bitmap_status(struct seq_file * seq,struct mddev * mddev)8802 static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev)
8803 {
8804 struct md_bitmap_stats stats;
8805 unsigned long used_pages;
8806 unsigned long chunk_kb;
8807 int err;
8808
8809 if (!md_bitmap_enabled(mddev, false))
8810 return;
8811
8812 err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
8813 if (err)
8814 return;
8815
8816 chunk_kb = mddev->bitmap_info.chunksize >> 10;
8817 used_pages = stats.pages - stats.missing_pages;
8818
8819 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], %lu%s chunk",
8820 used_pages, stats.pages, used_pages << (PAGE_SHIFT - 10),
8821 chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize,
8822 chunk_kb ? "KB" : "B");
8823
8824 if (stats.file) {
8825 seq_puts(seq, ", file: ");
8826 seq_file_path(seq, stats.file, " \t\n");
8827 }
8828
8829 seq_putc(seq, '\n');
8830 }
8831
/*
 * seq_file ->show for /proc/mdstat.
 *
 * Called with all_mddevs_lock held (taken in md_seq_start()).  For the
 * list head it prints the personalities line; for an mddev it drops
 * all_mddevs_lock while printing that array's status and re-takes it
 * before returning.  Always returns 0.
 */
static int md_seq_show(struct seq_file *seq, void *v)
{
	struct mddev *mddev;
	sector_t sectors;
	struct md_rdev *rdev;

	/* The list head stands for the header of the file. */
	if (v == &all_mddevs) {
		status_personalities(seq);
		if (list_empty(&all_mddevs))
			status_unused(seq);
		return 0;
	}

	mddev = list_entry(v, struct mddev, all_mddevs);
	/* Skip arrays for which no reference can be obtained. */
	if (!mddev_get(mddev))
		return 0;

	/* The reference taken above is held while the list lock is dropped. */
	spin_unlock(&all_mddevs_lock);

	/* prevent bitmap to be freed after checking */
	mutex_lock(&mddev->bitmap_info.mutex);

	spin_lock(&mddev->lock);
	if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
		seq_printf(seq, "%s : ", mdname(mddev));
		if (mddev->pers) {
			if (test_bit(MD_BROKEN, &mddev->flags))
				seq_printf(seq, "broken");
			else
				seq_printf(seq, "active");
			if (mddev->ro == MD_RDONLY)
				seq_printf(seq, " (read-only)");
			if (mddev->ro == MD_AUTO_READ)
				seq_printf(seq, " (auto-read-only)");
			seq_printf(seq, " %s", mddev->pers->head.name);
		} else {
			seq_printf(seq, "inactive");
		}

		/* List the member devices; sum the sizes of the non-Faulty ones. */
		sectors = 0;
		rcu_read_lock();
		rdev_for_each_rcu(rdev, mddev) {
			seq_printf(seq, " %pg[%d]", rdev->bdev, rdev->desc_nr);

			if (test_bit(WriteMostly, &rdev->flags))
				seq_printf(seq, "(W)");
			if (test_bit(Journal, &rdev->flags))
				seq_printf(seq, "(J)");
			if (test_bit(Faulty, &rdev->flags)) {
				seq_printf(seq, "(F)");
				continue;
			}
			if (rdev->raid_disk < 0)
				seq_printf(seq, "(S)"); /* spare */
			if (test_bit(Replacement, &rdev->flags))
				seq_printf(seq, "(R)");
			sectors += rdev->sectors;
		}
		rcu_read_unlock();

		/* Running arrays report array_sectors, others the member sum. */
		if (!list_empty(&mddev->disks)) {
			if (mddev->pers)
				seq_printf(seq, "\n %llu blocks",
					   (unsigned long long)
					   mddev->array_sectors / 2);
			else
				seq_printf(seq, "\n %llu blocks",
					   (unsigned long long)sectors / 2);
		}
		if (mddev->persistent) {
			/* Version 0.90 metadata is not printed. */
			if (mddev->major_version != 0 ||
			    mddev->minor_version != 90) {
				seq_printf(seq," super %d.%d",
					   mddev->major_version,
					   mddev->minor_version);
			}
		} else if (mddev->external)
			seq_printf(seq, " super external:%s",
				   mddev->metadata_type);
		else
			seq_printf(seq, " super non-persistent");

		if (mddev->pers) {
			mddev->pers->status(seq, mddev);
			seq_printf(seq, "\n ");
			/* Sync progress is only shown for personalities with sync_request. */
			if (mddev->pers->sync_request) {
				if (status_resync(seq, mddev))
					seq_printf(seq, "\n ");
			}
		} else
			seq_printf(seq, "\n ");

		md_bitmap_status(seq, mddev);

		seq_printf(seq, "\n");
	}
	spin_unlock(&mddev->lock);
	mutex_unlock(&mddev->bitmap_info.mutex);
	/* Re-take the list lock: md_seq_next()/md_seq_stop() expect it held. */
	spin_lock(&all_mddevs_lock);

	/* The unused-devices line follows the last array on the list. */
	if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs))
		status_unused(seq);

	mddev_put_locked(mddev);
	return 0;
}
8938
/* seq_file iterator over the all_mddevs list, installed by md_seq_open(). */
static const struct seq_operations md_seq_ops = {
	.start  = md_seq_start,
	.next   = md_seq_next,
	.stop   = md_seq_stop,
	.show   = md_seq_show,
};
8945
md_seq_open(struct inode * inode,struct file * file)8946 static int md_seq_open(struct inode *inode, struct file *file)
8947 {
8948 struct seq_file *seq;
8949 int error;
8950
8951 error = seq_open(file, &md_seq_ops);
8952 if (error)
8953 return error;
8954
8955 seq = file->private_data;
8956 seq->poll_event = atomic_read(&md_event_count);
8957 return error;
8958 }
8959
8960 static int md_unloading;
mdstat_poll(struct file * filp,poll_table * wait)8961 static __poll_t mdstat_poll(struct file *filp, poll_table *wait)
8962 {
8963 struct seq_file *seq = filp->private_data;
8964 __poll_t mask;
8965
8966 if (md_unloading)
8967 return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
8968 poll_wait(filp, &md_event_waiters, wait);
8969
8970 /* always allow read */
8971 mask = EPOLLIN | EPOLLRDNORM;
8972
8973 if (seq->poll_event != atomic_read(&md_event_count))
8974 mask |= EPOLLERR | EPOLLPRI;
8975 return mask;
8976 }
8977
/* proc file operations: seq_file helpers plus the md-specific open and poll. */
static const struct proc_ops mdstat_proc_ops = {
	.proc_open	= md_seq_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
	.proc_release	= seq_release,
	.proc_poll	= mdstat_poll,
};
8985
/*
 * Insert @msh into the md_submodule xarray, indexed by msh->id.
 * Returns the result of xa_insert().
 */
int register_md_submodule(struct md_submodule_head *msh)
{
	return xa_insert(&md_submodule, msh->id, msh, GFP_KERNEL);
}
EXPORT_SYMBOL_GPL(register_md_submodule);
8991
/* Remove the md_submodule xarray entry stored at index msh->id. */
void unregister_md_submodule(struct md_submodule_head *msh)
{
	xa_erase(&md_submodule, msh->id);
}
EXPORT_SYMBOL_GPL(unregister_md_submodule);
8997
md_setup_cluster(struct mddev * mddev,int nodes)8998 int md_setup_cluster(struct mddev *mddev, int nodes)
8999 {
9000 int ret = get_cluster_ops(mddev);
9001
9002 if (ret) {
9003 request_module("md-cluster");
9004 ret = get_cluster_ops(mddev);
9005 }
9006
9007 /* ensure module won't be unloaded */
9008 if (ret) {
9009 pr_warn("can't find md-cluster module or get its reference.\n");
9010 return ret;
9011 }
9012
9013 ret = mddev->cluster_ops->join(mddev, nodes);
9014 if (!ret)
9015 mddev->safemode_delay = 0;
9016 return ret;
9017 }
9018
/* Release the cluster operations obtained for @mddev via put_cluster_ops(). */
void md_cluster_stop(struct mddev *mddev)
{
	put_cluster_ops(mddev);
}
9023
/*
 * Check whether the disk holding @rdev saw IO on its *other* partitions
 * since the previous call, and refresh rdev->last_events.
 *
 * Returns true for a non-partition rdev (last_events is left untouched),
 * true when @init is set, and otherwise true only if the counter did not
 * grow.
 */
static bool is_rdev_holder_idle(struct md_rdev *rdev, bool init)
{
	unsigned long prev_events = rdev->last_events;

	if (!bdev_is_partition(rdev->bdev))
		return true;

	/*
	 * If rdev is partition, and user doesn't issue IO to the array, the
	 * array is still not idle if user issues IO to other partitions.
	 * The counter is whole-disk sectors minus this partition's sectors.
	 */
	rdev->last_events = part_stat_read_accum(rdev->bdev->bd_disk->part0,
						 sectors) -
			    part_stat_read_accum(rdev->bdev, sectors);

	if (init)
		return true;

	return rdev->last_events <= prev_events;
}
9041
9042 /*
9043 * mddev is idle if following conditions are matched since last check:
9044 * 1) mddev doesn't have normal IO completed;
9045 * 2) mddev doesn't have inflight normal IO;
9046 * 3) if any member disk is partition, and other partitions don't have IO
9047 * completed;
9048 *
9049 * Noted this checking rely on IO accounting is enabled.
9050 */
is_mddev_idle(struct mddev * mddev,int init)9051 static bool is_mddev_idle(struct mddev *mddev, int init)
9052 {
9053 unsigned long last_events = mddev->normal_io_events;
9054 struct gendisk *disk;
9055 struct md_rdev *rdev;
9056 bool idle = true;
9057
9058 disk = mddev_is_dm(mddev) ? mddev->dm_gendisk : mddev->gendisk;
9059 if (!disk)
9060 return true;
9061
9062 mddev->normal_io_events = part_stat_read_accum(disk->part0, sectors);
9063 if (!init && (mddev->normal_io_events > last_events ||
9064 bdev_count_inflight(disk->part0)))
9065 idle = false;
9066
9067 rcu_read_lock();
9068 rdev_for_each_rcu(rdev, mddev)
9069 if (!is_rdev_holder_idle(rdev, init))
9070 idle = false;
9071 rcu_read_unlock();
9072
9073 return idle;
9074 }
9075
/*
 * Account @blocks as completed sync IO: subtract them from
 * mddev->recovery_active and wake anyone waiting on recovery_wait.
 */
void md_done_sync(struct mddev *mddev, int blocks)
{
	/* another "blocks" (512byte) blocks have been synced */
	atomic_sub(blocks, &mddev->recovery_active);
	wake_up(&mddev->recovery_wait);
}
EXPORT_SYMBOL(md_done_sync);
9083
/*
 * Flag the running sync action as interrupted (MD_RECOVERY_INTR) and
 * wake the md thread.
 */
void md_sync_error(struct mddev *mddev)
{
	// stop recovery, signal do_sync ....
	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
}
EXPORT_SYMBOL(md_sync_error);
9091
/* md_write_start(mddev, bi)
 * If we need to update some array metadata (e.g. 'active' flag
 * in superblock) before writing, schedule a superblock update
 * and wait for it to complete.
 * Bios that are not writes are ignored.  The function returns no
 * value; for every write it takes a reference on
 * mddev->writes_pending, which md_write_end() drops.
 */
void md_write_start(struct mddev *mddev, struct bio *bi)
{
	int did_change = 0;

	if (bio_data_dir(bi) != WRITE)
		return;

	BUG_ON(mddev->ro == MD_RDONLY);
	if (mddev->ro == MD_AUTO_READ) {
		/* need to switch to read/write */
		mddev->ro = MD_RDWR;
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
		md_wakeup_thread(mddev->sync_thread);
		did_change = 1;
	}
	rcu_read_lock();
	percpu_ref_get(&mddev->writes_pending);
	smp_mb(); /* Match smp_mb in set_in_sync() */
	if (mddev->safemode == 1)
		mddev->safemode = 0;
	/* sync_checkers is always 0 when writes_pending is in per-cpu mode */
	if (mddev->in_sync || mddev->sync_checkers) {
		spin_lock(&mddev->lock);
		/* Re-check under the lock before marking the array dirty. */
		if (mddev->in_sync) {
			mddev->in_sync = 0;
			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
			set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
			md_wakeup_thread(mddev->thread);
			did_change = 1;
		}
		spin_unlock(&mddev->lock);
	}
	rcu_read_unlock();
	if (did_change)
		sysfs_notify_dirent_safe(mddev->sysfs_state);
	/* Without MD_HAS_SUPERBLOCK there is no superblock update to wait for. */
	if (!test_bit(MD_HAS_SUPERBLOCK, &mddev->flags))
		return;
	wait_event(mddev->sb_wait,
		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
}
EXPORT_SYMBOL(md_write_start);
9141
9142 /* md_write_inc can only be called when md_write_start() has
9143 * already been called at least once of the current request.
9144 * It increments the counter and is useful when a single request
9145 * is split into several parts. Each part causes an increment and
9146 * so needs a matching md_write_end().
9147 * Unlike md_write_start(), it is safe to call md_write_inc() inside
9148 * a spinlocked region.
9149 */
md_write_inc(struct mddev * mddev,struct bio * bi)9150 void md_write_inc(struct mddev *mddev, struct bio *bi)
9151 {
9152 if (bio_data_dir(bi) != WRITE)
9153 return;
9154 WARN_ON_ONCE(mddev->in_sync || !md_is_rdwr(mddev));
9155 percpu_ref_get(&mddev->writes_pending);
9156 }
9157 EXPORT_SYMBOL(md_write_inc);
9158
/*
 * Drop one writes_pending reference.  Afterwards either wake the md
 * thread (safemode == 2) or, when a safemode delay is configured,
 * re-arm the safemode timer.
 */
void md_write_end(struct mddev *mddev)
{
	percpu_ref_put(&mddev->writes_pending);

	if (mddev->safemode == 2)
		md_wakeup_thread(mddev->thread);
	else if (mddev->safemode_delay)
		/* The roundup() ensures this only performs locking once
		 * every ->safemode_delay jiffies
		 */
		mod_timer(&mddev->safemode_timer,
			  roundup(jiffies, mddev->safemode_delay) +
			  mddev->safemode_delay);
}

EXPORT_SYMBOL(md_write_end);
9175
9176 /* This is used by raid0 and raid10 */
md_submit_discard_bio(struct mddev * mddev,struct md_rdev * rdev,struct bio * bio,sector_t start,sector_t size)9177 void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
9178 struct bio *bio, sector_t start, sector_t size)
9179 {
9180 struct bio *discard_bio = NULL;
9181
9182 __blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO, &discard_bio);
9183 if (!discard_bio)
9184 return;
9185
9186 bio_chain(discard_bio, bio);
9187 bio_clone_blkg_association(discard_bio, bio);
9188 mddev_trace_remap(mddev, discard_bio, bio->bi_iter.bi_sector);
9189 submit_bio_noacct(discard_bio);
9190 }
9191 EXPORT_SYMBOL_GPL(md_submit_discard_bio);
9192
md_bitmap_start(struct mddev * mddev,struct md_io_clone * md_io_clone)9193 static void md_bitmap_start(struct mddev *mddev,
9194 struct md_io_clone *md_io_clone)
9195 {
9196 md_bitmap_fn *fn = unlikely(md_io_clone->rw == STAT_DISCARD) ?
9197 mddev->bitmap_ops->start_discard :
9198 mddev->bitmap_ops->start_write;
9199
9200 if (mddev->pers->bitmap_sector)
9201 mddev->pers->bitmap_sector(mddev, &md_io_clone->offset,
9202 &md_io_clone->sectors);
9203
9204 fn(mddev, md_io_clone->offset, md_io_clone->sectors);
9205 }
9206
md_bitmap_end(struct mddev * mddev,struct md_io_clone * md_io_clone)9207 static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone)
9208 {
9209 md_bitmap_fn *fn = unlikely(md_io_clone->rw == STAT_DISCARD) ?
9210 mddev->bitmap_ops->end_discard :
9211 mddev->bitmap_ops->end_write;
9212
9213 fn(mddev, md_io_clone->offset, md_io_clone->sectors);
9214 }
9215
/*
 * Completion handler installed on clones by md_clone_bio(): finish the
 * bitmap and IO accounting, propagate the first error to the original
 * bio, complete it, and drop the active_io reference.
 */
static void md_end_clone_io(struct bio *bio)
{
	struct md_io_clone *md_io_clone = bio->bi_private;
	struct bio *orig_bio = md_io_clone->orig_bio;
	struct mddev *mddev = md_io_clone->mddev;

	if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false))
		md_bitmap_end(mddev, md_io_clone);

	/* Keep an error already recorded on the original bio. */
	if (bio->bi_status && !orig_bio->bi_status)
		orig_bio->bi_status = bio->bi_status;

	/* A zero start_time means accounting was not started for this IO. */
	if (md_io_clone->start_time)
		bio_end_io_acct(orig_bio, md_io_clone->start_time);

	/* md_io_clone and mddev were read above; bio is not used after this. */
	bio_put(bio);
	bio_endio(orig_bio);
	percpu_ref_put(&mddev->active_io);
}
9235
md_clone_bio(struct mddev * mddev,struct bio ** bio)9236 static void md_clone_bio(struct mddev *mddev, struct bio **bio)
9237 {
9238 struct block_device *bdev = (*bio)->bi_bdev;
9239 struct md_io_clone *md_io_clone;
9240 struct bio *clone =
9241 bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set);
9242
9243 md_io_clone = container_of(clone, struct md_io_clone, bio_clone);
9244 md_io_clone->orig_bio = *bio;
9245 md_io_clone->mddev = mddev;
9246 if (blk_queue_io_stat(bdev->bd_disk->queue))
9247 md_io_clone->start_time = bio_start_io_acct(*bio);
9248
9249 if (bio_data_dir(*bio) == WRITE && md_bitmap_enabled(mddev, false)) {
9250 md_io_clone->offset = (*bio)->bi_iter.bi_sector;
9251 md_io_clone->sectors = bio_sectors(*bio);
9252 md_io_clone->rw = op_stat_group(bio_op(*bio));
9253 md_bitmap_start(mddev, md_io_clone);
9254 }
9255
9256 clone->bi_end_io = md_end_clone_io;
9257 clone->bi_private = md_io_clone;
9258 *bio = clone;
9259 }
9260
/*
 * Take a reference on mddev->active_io and replace *@bio with a clone
 * built by md_clone_bio().  The reference is dropped again in
 * md_end_clone_io() or md_free_cloned_bio().
 */
void md_account_bio(struct mddev *mddev, struct bio **bio)
{
	percpu_ref_get(&mddev->active_io);
	md_clone_bio(mddev, bio);
}
EXPORT_SYMBOL_GPL(md_account_bio);
9267
/*
 * Tear down a clone created by md_clone_bio().  This performs the same
 * steps as md_end_clone_io() except that bio_endio() is NOT called on
 * the original bio.
 */
void md_free_cloned_bio(struct bio *bio)
{
	struct md_io_clone *md_io_clone = bio->bi_private;
	struct bio *orig_bio = md_io_clone->orig_bio;
	struct mddev *mddev = md_io_clone->mddev;

	if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false))
		md_bitmap_end(mddev, md_io_clone);

	/* Keep an error already recorded on the original bio. */
	if (bio->bi_status && !orig_bio->bi_status)
		orig_bio->bi_status = bio->bi_status;

	/* A zero start_time means accounting was not started for this IO. */
	if (md_io_clone->start_time)
		bio_end_io_acct(orig_bio, md_io_clone->start_time);

	bio_put(bio);
	percpu_ref_put(&mddev->active_io);
}
EXPORT_SYMBOL_GPL(md_free_cloned_bio);
9287
9288 /* md_allow_write(mddev)
9289 * Calling this ensures that the array is marked 'active' so that writes
9290 * may proceed without blocking. It is important to call this before
9291 * attempting a GFP_KERNEL allocation while holding the mddev lock.
9292 * Must be called with mddev_lock held.
9293 */
md_allow_write(struct mddev * mddev)9294 void md_allow_write(struct mddev *mddev)
9295 {
9296 if (!mddev->pers)
9297 return;
9298 if (!md_is_rdwr(mddev))
9299 return;
9300 if (!mddev->pers->sync_request)
9301 return;
9302
9303 spin_lock(&mddev->lock);
9304 if (mddev->in_sync) {
9305 mddev->in_sync = 0;
9306 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
9307 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
9308 if (mddev->safemode_delay &&
9309 mddev->safemode == 0)
9310 mddev->safemode = 1;
9311 spin_unlock(&mddev->lock);
9312 md_update_sb(mddev, 0);
9313 sysfs_notify_dirent_safe(mddev->sysfs_state);
9314 /* wait for the dirty state to be recorded in the metadata */
9315 wait_event(mddev->sb_wait,
9316 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
9317 } else
9318 spin_unlock(&mddev->lock);
9319 }
9320 EXPORT_SYMBOL_GPL(md_allow_write);
9321
md_sync_max_sectors(struct mddev * mddev,enum sync_action action)9322 static sector_t md_sync_max_sectors(struct mddev *mddev,
9323 enum sync_action action)
9324 {
9325 switch (action) {
9326 case ACTION_RESYNC:
9327 case ACTION_CHECK:
9328 case ACTION_REPAIR:
9329 atomic64_set(&mddev->resync_mismatches, 0);
9330 fallthrough;
9331 case ACTION_RESHAPE:
9332 return mddev->resync_max_sectors;
9333 case ACTION_RECOVER:
9334 return mddev->dev_sectors;
9335 default:
9336 return 0;
9337 }
9338 }
9339
/*
 * If lazy recovery is requested and all rdevs are in sync, select the rdev with
 * the highest index to perform recovery to build initial xor data, this is the
 * same as old bitmap.
 *
 * Returns true if an rdev was selected (its In_sync flag is cleared),
 * false otherwise.
 *
 * NOTE(review): the scan stops at the first active member that is Faulty
 * or not In_sync, but a candidate found before that point is still
 * selected -- confirm this matches the "all rdevs are in sync" intent.
 */
static bool mddev_select_lazy_recover_rdev(struct mddev *mddev)
{
	struct md_rdev *recover_rdev = NULL;
	struct md_rdev *rdev;
	bool ret = false;

	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev) {
		/* Skip devices that hold no slot in the array. */
		if (rdev->raid_disk < 0)
			continue;

		if (test_bit(Faulty, &rdev->flags) ||
		    !test_bit(In_sync, &rdev->flags))
			break;

		/* Track the member with the largest raid_disk seen so far. */
		if (!recover_rdev || recover_rdev->raid_disk < rdev->raid_disk)
			recover_rdev = rdev;
	}

	if (recover_rdev) {
		clear_bit(In_sync, &recover_rdev->flags);
		ret = true;
	}

	rcu_read_unlock();
	return ret;
}
9372
md_sync_position(struct mddev * mddev,enum sync_action action)9373 static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
9374 {
9375 sector_t start = 0;
9376 struct md_rdev *rdev;
9377
9378 switch (action) {
9379 case ACTION_CHECK:
9380 case ACTION_REPAIR:
9381 return mddev->resync_min;
9382 case ACTION_RESYNC:
9383 if (!mddev->bitmap)
9384 return mddev->resync_offset;
9385 return 0;
9386 case ACTION_RESHAPE:
9387 /*
9388 * If the original node aborts reshaping then we continue the
9389 * reshaping, so set again to avoid restart reshape from the
9390 * first beginning
9391 */
9392 if (mddev_is_clustered(mddev) &&
9393 mddev->reshape_position != MaxSector)
9394 return mddev->reshape_position;
9395 return 0;
9396 case ACTION_RECOVER:
9397 start = MaxSector;
9398 rcu_read_lock();
9399 rdev_for_each_rcu(rdev, mddev)
9400 if (rdev_needs_recovery(rdev, start))
9401 start = rdev->recovery_offset;
9402 rcu_read_unlock();
9403
9404 /*
9405 * If there are no spares, and raid456 lazy initial recover is
9406 * requested.
9407 */
9408 if (test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery) &&
9409 start == MaxSector && mddev_select_lazy_recover_rdev(mddev))
9410 start = 0;
9411
9412 /* If there is a bitmap, we need to make sure all
9413 * writes that started before we added a spare
9414 * complete before we start doing a recovery.
9415 * Otherwise the write might complete and (via
9416 * bitmap_endwrite) set a bit in the bitmap after the
9417 * recovery has checked that bit and skipped that
9418 * region.
9419 */
9420 if (mddev->bitmap) {
9421 mddev->pers->quiesce(mddev, 1);
9422 mddev->pers->quiesce(mddev, 0);
9423 }
9424 return start;
9425 default:
9426 return MaxSector;
9427 }
9428 }
9429
sync_io_within_limit(struct mddev * mddev)9430 static bool sync_io_within_limit(struct mddev *mddev)
9431 {
9432 /*
9433 * For raid456, sync IO is stripe(4k) per IO, for other levels, it's
9434 * RESYNC_PAGES(64k) per IO.
9435 */
9436 return atomic_read(&mddev->recovery_active) <
9437 (raid_is_456(mddev) ? 8 : 128) * sync_io_depth(mddev);
9438 }
9439
9440 /*
9441 * Update sync offset and mddev status when sync completes
9442 */
md_finish_sync(struct mddev * mddev,enum sync_action action)9443 static void md_finish_sync(struct mddev *mddev, enum sync_action action)
9444 {
9445 struct md_rdev *rdev;
9446
9447 switch (action) {
9448 case ACTION_RESYNC:
9449 case ACTION_REPAIR:
9450 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9451 mddev->curr_resync = MaxSector;
9452 mddev->resync_offset = mddev->curr_resync;
9453 break;
9454 case ACTION_RECOVER:
9455 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9456 mddev->curr_resync = MaxSector;
9457 rcu_read_lock();
9458 rdev_for_each_rcu(rdev, mddev)
9459 if (mddev->delta_disks >= 0 &&
9460 rdev_needs_recovery(rdev, mddev->curr_resync))
9461 rdev->recovery_offset = mddev->curr_resync;
9462 rcu_read_unlock();
9463 break;
9464 case ACTION_RESHAPE:
9465 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
9466 mddev->delta_disks > 0 &&
9467 mddev->pers->finish_reshape &&
9468 mddev->pers->size &&
9469 !mddev_is_dm(mddev)) {
9470 mddev_lock_nointr(mddev);
9471 md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
9472 mddev_unlock(mddev);
9473 if (!mddev_is_clustered(mddev))
9474 set_capacity_and_notify(mddev->gendisk,
9475 mddev->array_sectors);
9476 }
9477 if (mddev->pers->finish_reshape)
9478 mddev->pers->finish_reshape(mddev);
9479 break;
9480 /* */
9481 case ACTION_CHECK:
9482 default:
9483 break;
9484 }
9485 }
9486
/* Number of (time, sector count) samples kept in mark[] / mark_cnt[]. */
#define SYNC_MARKS	10
/* Minimum time, in jiffies, between two consecutive samples. */
#define	SYNC_MARK_STEP	(3*HZ)
/* curr_resync_completed is refreshed at least this often (in jiffies). */
#define UPDATE_FREQUENCY (5*60*HZ)
/*
 * md_do_sync() - thread function that performs one sync pass on an array.
 * @thread: the md thread running this function; thread->mddev is the array.
 *
 * The kind of pass (resync, check, repair, recover, reshape) is obtained from
 * md_sync_action(). The function:
 *  - returns immediately if MD_RECOVERY_DONE is already set;
 *  - gives up early (jumping to "skip") if the pass was interrupted, must
 *    wait, the array is not read-write, or the cluster resync_start failed;
 *  - waits for other arrays that match_mddev_units() reports as conflicting,
 *    unless mddev->parallel_resync is set;
 *  - repeatedly calls pers->sync_request() from the start position up to
 *    max_sectors, throttling itself between speed_min() and speed_max();
 *  - waits for outstanding sync IO, calls md_finish_sync() and finally sets
 *    MD_RECOVERY_DONE, resets curr_resync and wakes mddev->thread.
 */
void md_do_sync(struct md_thread *thread)
{
	struct mddev *mddev = thread->mddev;
	struct mddev *mddev2;
	unsigned int currspeed = 0, window;
	sector_t max_sectors,j, io_sectors, recovery_done;
	unsigned long mark[SYNC_MARKS];
	unsigned long update_time;
	sector_t mark_cnt[SYNC_MARKS];
	int last_mark,m;
	sector_t last_check;
	int skipped = 0;
	enum sync_action action;
	const char *desc;
	struct blk_plug plug;
	int ret;

	/* just in case the thread restarts... */
	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
		return;

	if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
		goto skip;

	if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) ||
	    !md_is_rdwr(mddev)) {/* never try to sync a read-only array */
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
		goto skip;
	}

	if (mddev_is_clustered(mddev)) {
		ret = mddev->cluster_ops->resync_start(mddev);
		if (ret)
			goto skip;

		set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags);
		/*
		 * Skip when none of SYNC/RESHAPE/RECOVER is set and
		 * curr_resync_completed is still below resync_max_sectors.
		 */
		if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
			test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ||
			test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
		     && ((unsigned long long)mddev->curr_resync_completed
			 < (unsigned long long)mddev->resync_max_sectors))
			goto skip;
	}

	action = md_sync_action(mddev);
	if (action == ACTION_FROZEN || action == ACTION_IDLE) {
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
		goto skip;
	}

	desc = md_sync_action_name(action);
	mddev->last_sync_action = action;

	/*
	 * Before starting a resync we must have set curr_resync to
	 * MD_RESYNC_DELAYED, and then checked that every "conflicting" array
	 * has curr_resync less than ours.  When we find one that is the same
	 * or higher we wait on resync_wait.  To avoid deadlock, we reduce
	 * curr_resync to MD_RESYNC_YIELDED if we choose to yield (based
	 * arbitrarily on address of mddev structure).
	 * This will mean we have to start checking from the beginning again.
	 *
	 */
	if (mddev_is_clustered(mddev))
		mddev->cluster_ops->resync_start_notify(mddev);
	do {
		/* minor of the last array we reported waiting for, -1 if none */
		int mddev2_minor = -1;
		mddev->curr_resync = MD_RESYNC_DELAYED;

	try_again:
		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			goto skip;
		spin_lock(&all_mddevs_lock);
		list_for_each_entry(mddev2, &all_mddevs, all_mddevs) {
			if (test_bit(MD_DELETED, &mddev2->flags))
				continue;
			if (mddev2 == mddev)
				continue;
			if (!mddev->parallel_resync
			&&  mddev2->curr_resync
			&&  match_mddev_units(mddev, mddev2)) {
				DEFINE_WAIT(wq);
				if (mddev < mddev2 &&
				    mddev->curr_resync == MD_RESYNC_DELAYED) {
					/* arbitrarily yield */
					mddev->curr_resync = MD_RESYNC_YIELDED;
					wake_up(&resync_wait);
				}
				if (mddev > mddev2 &&
				    mddev->curr_resync == MD_RESYNC_YIELDED)
					/* no need to wait here, we can wait the next
					 * time 'round when curr_resync is
					 * MD_RESYNC_DELAYED again
					 */
					continue;
				/* We need to wait 'interruptible' so as not to
				 * contribute to the load average, and not to
				 * be caught by 'softlockup'
				 */
				prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
				if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
				    mddev2->curr_resync >= mddev->curr_resync) {
					/* log only once per array we wait for */
					if (mddev2_minor != mddev2->md_minor) {
						mddev2_minor = mddev2->md_minor;
						pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n",
							desc, mdname(mddev),
							mdname(mddev2));
					}
					/* drop the list lock before sleeping */
					spin_unlock(&all_mddevs_lock);

					if (signal_pending(current))
						flush_signals(current);
					schedule();
					finish_wait(&resync_wait, &wq);
					/* the list may have changed: rescan it */
					goto try_again;
				}
				finish_wait(&resync_wait, &wq);
			}
		}
		spin_unlock(&all_mddevs_lock);
	} while (mddev->curr_resync < MD_RESYNC_DELAYED);

	/* upper bound and starting sector for this pass */
	max_sectors = md_sync_max_sectors(mddev, action);
	j = md_sync_position(mddev, action);

	pr_info("md: %s of RAID array %s\n", desc, mdname(mddev));
	pr_debug("md: minimum _guaranteed_  speed: %d KB/sec/disk.\n", speed_min(mddev));
	pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n",
		 speed_max(mddev), desc);

	is_mddev_idle(mddev, 1); /* this initializes IO event counters */

	/* start every speed sample at "now" with zero sectors issued */
	io_sectors = 0;
	for (m = 0; m < SYNC_MARKS; m++) {
		mark[m] = jiffies;
		mark_cnt[m] = io_sectors;
	}
	last_mark = 0;
	mddev->resync_mark = mark[last_mark];
	mddev->resync_mark_cnt = mark_cnt[last_mark];

	/*
	 * Tune reconstruction:
	 */
	window = 32 * (PAGE_SIZE / 512);
	pr_debug("md: using %dk window, over a total of %lluk.\n",
		 window/2, (unsigned long long)max_sectors/2);

	atomic_set(&mddev->recovery_active, 0);
	last_check = 0;

	if (j >= MD_RESYNC_ACTIVE) {
		pr_debug("md: resuming %s of %s from checkpoint.\n",
			 desc, mdname(mddev));
		mddev->curr_resync = j;
	} else
		mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */
	mddev->curr_resync_completed = j;
	sysfs_notify_dirent_safe(mddev->sysfs_completed);
	md_new_event();
	update_time = jiffies;

	blk_start_plug(&plug);
	while (j < max_sectors) {
		sector_t sectors;

		skipped = 0;

		/*
		 * Outside of a reshape, refresh curr_resync_completed when
		 * enough progress was made since the last refresh, when
		 * UPDATE_FREQUENCY has elapsed, or when we are at least
		 * half way to (or already past) resync_max.
		 */
		if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
		    ((mddev->curr_resync > mddev->curr_resync_completed &&
		      (mddev->curr_resync - mddev->curr_resync_completed)
		      > (max_sectors >> 4)) ||
		     time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
		     (j - mddev->curr_resync_completed)*2
		     >= mddev->resync_max - mddev->curr_resync_completed ||
		     mddev->curr_resync_completed > mddev->resync_max
			    )) {
			/* time to update curr_resync_completed */
			wait_event(mddev->recovery_wait,
				   atomic_read(&mddev->recovery_active) == 0);
			mddev->curr_resync_completed = j;
			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
			    j > mddev->resync_offset)
				mddev->resync_offset = j;
			update_time = jiffies;
			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
			sysfs_notify_dirent_safe(mddev->sysfs_completed);
		}

		while (j >= mddev->resync_max &&
		       !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
			/* As this condition is controlled by user-space,
			 * we can block indefinitely, so use '_interruptible'
			 * to avoid triggering warnings.
			 */
			flush_signals(current); /* just in case */
			wait_event_interruptible(mddev->recovery_wait,
						 mddev->resync_max > j
						 || test_bit(MD_RECOVERY_INTR,
							     &mddev->recovery));
		}

		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			break;

		/* let the bitmap implementation skip ranges if it wants to */
		if (mddev->bitmap_ops && mddev->bitmap_ops->skip_sync_blocks) {
			sectors = mddev->bitmap_ops->skip_sync_blocks(mddev, j);
			if (sectors)
				goto update;
		}

		sectors = mddev->pers->sync_request(mddev, j, max_sectors,
						    &skipped);
		/* a zero return from the personality aborts the pass */
		if (sectors == 0) {
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			break;
		}

		if (!skipped) { /* actual IO requested */
			io_sectors += sectors;
			atomic_add(sectors, &mddev->recovery_active);
		}

		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			break;

update:
		j += sectors;
		if (j > max_sectors)
			/* when skipping, extra large numbers can be returned. */
			j = max_sectors;
		if (j >= MD_RESYNC_ACTIVE)
			mddev->curr_resync = j;
		mddev->curr_mark_cnt = io_sectors;
		if (last_check == 0)
			/* this is the earliest that rebuild will be
			 * visible in /proc/mdstat
			 */
			md_new_event();

		/* only re-evaluate the speed once per 'window' sectors of IO */
		if (last_check + window > io_sectors || j == max_sectors)
			continue;

		last_check = io_sectors;
	repeat:
		if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
			/* step marks */
			int next = (last_mark+1) % SYNC_MARKS;

			/* publish the oldest sample, then overwrite it */
			mddev->resync_mark = mark[next];
			mddev->resync_mark_cnt = mark_cnt[next];
			mark[next] = jiffies;
			mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
			last_mark = next;
		}

		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			break;

		/*
		 * The throttling below is left only when we are slower than
		 * the 'hard' speed limit, or the system was IO-idle for
		 * a jiffy.
		 * The system might be non-idle CPU-wise, but we only care
		 * about not overloading the IO subsystem. (things like an
		 * e2fsck being done on the RAID array should execute fast)
		 */
		cond_resched();

		/*
		 * Sectors completed since resync_mark, halved, divided by the
		 * elapsed seconds (+1 to avoid dividing by zero), then +1.
		 */
		recovery_done = io_sectors - atomic_read(&mddev->recovery_active);
		currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2
			/((jiffies-mddev->resync_mark)/HZ +1) +1;

		if (currspeed > speed_min(mddev)) {
			if (currspeed > speed_max(mddev)) {
				/* too fast: back off and sample again */
				msleep(500);
				goto repeat;
			}
			if (!sync_io_within_limit(mddev) &&
			    !is_mddev_idle(mddev, 0)) {
				/*
				 * Give other IO more of a chance.
				 * The faster the devices, the less we wait.
				 */
				wait_event(mddev->recovery_wait,
					   !atomic_read(&mddev->recovery_active));
			}
		}
	}
	pr_info("md: %s: %s %s.\n",mdname(mddev), desc,
		test_bit(MD_RECOVERY_INTR, &mddev->recovery)
		? "interrupted" : "done");
	/*
	 * this also signals 'finished resyncing' to md_stop
	 */
	blk_finish_plug(&plug);
	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));

	if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
	    mddev->curr_resync >= MD_RESYNC_ACTIVE) {
		/* All sync IO completes after recovery_active becomes 0 */
		mddev->curr_resync_completed = mddev->curr_resync;
		sysfs_notify_dirent_safe(mddev->sysfs_completed);
	}
	/* final call with sector_nr == max_sectors, after all IO has drained */
	mddev->pers->sync_request(mddev, max_sectors, max_sectors, &skipped);

	if (mddev->curr_resync > MD_RESYNC_ACTIVE)
		md_finish_sync(mddev, action);
 skip:
	/* set CHANGE_PENDING here since maybe another update is needed,
	 * so other nodes are informed. It should be harmless for normal
	 * raid */
	set_mask_bits(&mddev->sb_flags, 0,
		      BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));
	spin_lock(&mddev->lock);
	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
		/* We completed so min/max setting can be forgotten if used. */
		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
			mddev->resync_min = 0;
		mddev->resync_max = MaxSector;
	} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
		/* interrupted: remember how far the requested pass got */
		mddev->resync_min = mddev->curr_resync_completed;
	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
	mddev->curr_resync = MD_RESYNC_NONE;
	spin_unlock(&mddev->lock);

	/* release arrays waiting on us and let mddev->thread clean up */
	wake_up(&resync_wait);
	md_wakeup_thread(mddev->thread);
	return;
}
EXPORT_SYMBOL_GPL(md_do_sync);
9819
rdev_removeable(struct md_rdev * rdev)9820 static bool rdev_removeable(struct md_rdev *rdev)
9821 {
9822 /* rdev is not used. */
9823 if (rdev->raid_disk < 0)
9824 return false;
9825
9826 /* There are still inflight io, don't remove this rdev. */
9827 if (atomic_read(&rdev->nr_pending))
9828 return false;
9829
9830 /*
9831 * An error occurred but has not yet been acknowledged by the metadata
9832 * handler, don't remove this rdev.
9833 */
9834 if (test_bit(Blocked, &rdev->flags))
9835 return false;
9836
9837 /* Fautly rdev is not used, it's safe to remove it. */
9838 if (test_bit(Faulty, &rdev->flags))
9839 return true;
9840
9841 /* Journal disk can only be removed if it's faulty. */
9842 if (test_bit(Journal, &rdev->flags))
9843 return false;
9844
9845 /*
9846 * 'In_sync' is cleared while 'raid_disk' is valid, which means
9847 * replacement has just become active from pers->spare_active(), and
9848 * then pers->hot_remove_disk() will replace this rdev with replacement.
9849 */
9850 if (!test_bit(In_sync, &rdev->flags))
9851 return true;
9852
9853 return false;
9854 }
9855
rdev_is_spare(struct md_rdev * rdev)9856 static bool rdev_is_spare(struct md_rdev *rdev)
9857 {
9858 return !test_bit(Candidate, &rdev->flags) && rdev->raid_disk >= 0 &&
9859 !test_bit(In_sync, &rdev->flags) &&
9860 !test_bit(Journal, &rdev->flags) &&
9861 !test_bit(Faulty, &rdev->flags);
9862 }
9863
rdev_addable(struct md_rdev * rdev)9864 static bool rdev_addable(struct md_rdev *rdev)
9865 {
9866 struct mddev *mddev;
9867
9868 mddev = READ_ONCE(rdev->mddev);
9869 if (!mddev)
9870 return false;
9871
9872 /* rdev is already used, don't add it again. */
9873 if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 ||
9874 test_bit(Faulty, &rdev->flags))
9875 return false;
9876
9877 /* Allow to add journal disk. */
9878 if (test_bit(Journal, &rdev->flags))
9879 return true;
9880
9881 /* Allow to add if array is read-write. */
9882 if (md_is_rdwr(mddev))
9883 return true;
9884
9885 /*
9886 * For read-only array, only allow to readd a rdev. And if bitmap is
9887 * used, don't allow to readd a rdev that is too old.
9888 */
9889 if (rdev->saved_raid_disk >= 0 && !test_bit(Bitmap_sync, &rdev->flags))
9890 return true;
9891
9892 return false;
9893 }
9894
md_spares_need_change(struct mddev * mddev)9895 static bool md_spares_need_change(struct mddev *mddev)
9896 {
9897 struct md_rdev *rdev;
9898
9899 rcu_read_lock();
9900 rdev_for_each_rcu(rdev, mddev) {
9901 if (rdev_removeable(rdev) || rdev_addable(rdev)) {
9902 rcu_read_unlock();
9903 return true;
9904 }
9905 }
9906 rcu_read_unlock();
9907 return false;
9908 }
9909
remove_spares(struct mddev * mddev,struct md_rdev * this)9910 static int remove_spares(struct mddev *mddev, struct md_rdev *this)
9911 {
9912 struct md_rdev *rdev;
9913 int removed = 0;
9914
9915 rdev_for_each(rdev, mddev) {
9916 if ((this == NULL || rdev == this) && rdev_removeable(rdev) &&
9917 !mddev->pers->hot_remove_disk(mddev, rdev)) {
9918 sysfs_unlink_rdev(mddev, rdev);
9919 rdev->saved_raid_disk = rdev->raid_disk;
9920 rdev->raid_disk = -1;
9921 removed++;
9922 }
9923 }
9924
9925 if (removed && mddev->kobj.sd)
9926 sysfs_notify_dirent_safe(mddev->sysfs_degraded);
9927
9928 return removed;
9929 }
9930
remove_and_add_spares(struct mddev * mddev,struct md_rdev * this)9931 static int remove_and_add_spares(struct mddev *mddev,
9932 struct md_rdev *this)
9933 {
9934 struct md_rdev *rdev;
9935 int spares = 0;
9936 int removed = 0;
9937
9938 if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
9939 /* Mustn't remove devices when resync thread is running */
9940 return 0;
9941
9942 removed = remove_spares(mddev, this);
9943 if (this && removed)
9944 goto no_add;
9945
9946 rdev_for_each(rdev, mddev) {
9947 if (this && this != rdev)
9948 continue;
9949 if (rdev_is_spare(rdev))
9950 spares++;
9951 if (!rdev_addable(rdev))
9952 continue;
9953 if (!test_bit(Journal, &rdev->flags))
9954 rdev->recovery_offset = 0;
9955 if (mddev->pers->hot_add_disk(mddev, rdev) == 0) {
9956 /* failure here is OK */
9957 sysfs_link_rdev(mddev, rdev);
9958 if (!test_bit(Journal, &rdev->flags))
9959 spares++;
9960 md_new_event();
9961 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
9962 }
9963 }
9964 no_add:
9965 if (removed)
9966 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
9967 return spares;
9968 }
9969
md_choose_sync_action(struct mddev * mddev,int * spares)9970 static bool md_choose_sync_action(struct mddev *mddev, int *spares)
9971 {
9972 /* Check if reshape is in progress first. */
9973 if (mddev->reshape_position != MaxSector) {
9974 if (mddev->pers->check_reshape == NULL ||
9975 mddev->pers->check_reshape(mddev) != 0)
9976 return false;
9977
9978 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
9979 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9980 clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery);
9981 return true;
9982 }
9983
9984 /* Check if resync is in progress. */
9985 if (mddev->resync_offset < MaxSector) {
9986 remove_spares(mddev, NULL);
9987 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9988 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9989 clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery);
9990 return true;
9991 }
9992
9993 /*
9994 * Remove any failed drives, then add spares if possible. Spares are
9995 * also removed and re-added, to allow the personality to fail the
9996 * re-add.
9997 */
9998 *spares = remove_and_add_spares(mddev, NULL);
9999 if (*spares || test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery)) {
10000 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
10001 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
10002 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
10003
10004 /* Start new recovery. */
10005 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
10006 return true;
10007 }
10008
10009 /* Delay to choose resync/check/repair in md_do_sync(). */
10010 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
10011 return true;
10012
10013 /* Nothing to be done */
10014 return false;
10015 }
10016
/*
 * Work handler for mddev->sync_work: decide whether a sync pass is needed
 * and, if so, register and wake the sync thread running md_do_sync().
 * @ws: the sync_work member embedded in the struct mddev to operate on.
 *
 * On every path that does not end with a running sync thread, the
 * MD_RECOVERY_{SYNC,RESHAPE,REQUESTED,CHECK,RUNNING} bits are cleared and
 * waiters on resync_wait are woken.
 */
static void md_start_sync(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, sync_work);
	int spares = 0;
	bool suspend = false;
	char *name;

	/*
	 * If reshape is still in progress, spares won't be added or removed
	 * from conf until reshape is done.
	 */
	if (mddev->reshape_position == MaxSector &&
	    md_spares_need_change(mddev)) {
		/* the array is suspended while rdevs are removed or added */
		suspend = true;
		mddev_suspend(mddev, false);
	}

	mddev_lock_nointr(mddev);
	if (!md_is_rdwr(mddev)) {
		/*
		 * On a read-only array we can:
		 * - remove failed devices
		 * - add already-in_sync devices if the array itself is in-sync.
		 * As we only add devices that are already in-sync, we can
		 * activate the spares immediately.
		 */
		remove_and_add_spares(mddev, NULL);
		goto not_running;
	}

	if (!md_choose_sync_action(mddev, &spares))
		goto not_running;

	/* a personality without sync_request cannot run a sync thread */
	if (!mddev->pers->sync_request)
		goto not_running;

	/*
	 * We are adding a device or devices to an array which has the bitmap
	 * stored on all devices. So make sure all bitmap pages get written.
	 */
	if (spares && md_bitmap_enabled(mddev, true))
		mddev->bitmap_ops->write_all(mddev);

	/* thread name: "reshape" for a reshape, "resync" for anything else */
	name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ?
			"reshape" : "resync";
	rcu_assign_pointer(mddev->sync_thread,
			   md_register_thread(md_do_sync, mddev, name));
	if (!mddev->sync_thread) {
		pr_warn("%s: could not start resync thread...\n",
			mdname(mddev));
		/* leave the spares where they are, it shouldn't hurt */
		goto not_running;
	}

	mddev_unlock(mddev);
	/*
	 * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should
	 * not set it again. Otherwise, we may cause issue like this one:
	 *     https://bugzilla.kernel.org/show_bug.cgi?id=218200
	 * Therefore, use __mddev_resume(mddev, false).
	 */
	if (suspend)
		__mddev_resume(mddev, false);
	md_wakeup_thread(mddev->sync_thread);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	md_new_event();
	return;

not_running:
	/* no sync thread was started: drop the bits describing a pass */
	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
	clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
	mddev_unlock(mddev);
	/*
	 * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should
	 * not set it again. Otherwise, we may cause issue like this one:
	 *     https://bugzilla.kernel.org/show_bug.cgi?id=218200
	 * Therefore, use __mddev_resume(mddev, false).
	 */
	if (suspend)
		__mddev_resume(mddev, false);

	wake_up(&resync_wait);
	/* only notify sysfs_action if RECOVER had been set and it exists */
	if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
	    mddev->sysfs_action)
		sysfs_notify_dirent_safe(mddev->sysfs_action);
}
10106
unregister_sync_thread(struct mddev * mddev)10107 static void unregister_sync_thread(struct mddev *mddev)
10108 {
10109 if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
10110 /* resync/recovery still happening */
10111 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
10112 return;
10113 }
10114
10115 if (WARN_ON_ONCE(!mddev->sync_thread))
10116 return;
10117
10118 md_reap_sync_thread(mddev);
10119 }
10120
md_should_do_recovery(struct mddev * mddev)10121 static bool md_should_do_recovery(struct mddev *mddev)
10122 {
10123 /*
10124 * As long as one of the following flags is set,
10125 * recovery needs to do or cleanup.
10126 */
10127 if (test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
10128 test_bit(MD_RECOVERY_DONE, &mddev->recovery))
10129 return true;
10130
10131 /*
10132 * If no flags are set and it is in read-only status,
10133 * there is nothing to do.
10134 */
10135 if (!md_is_rdwr(mddev))
10136 return false;
10137
10138 /*
10139 * MD_SB_CHANGE_PENDING indicates that the array is switching from clean to
10140 * active, and no action is needed for now.
10141 * All other MD_SB_* flags require to update the superblock.
10142 */
10143 if (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING))
10144 return true;
10145
10146 /*
10147 * If the array is not using external metadata and there has been no data
10148 * written for some time, then the array's status needs to be set to
10149 * in_sync.
10150 */
10151 if (mddev->external == 0 && mddev->safemode == 1)
10152 return true;
10153
10154 /*
10155 * When the system is about to restart or the process receives an signal,
10156 * the array needs to be synchronized as soon as possible.
10157 * Once the data synchronization is completed, need to change the array
10158 * status to in_sync.
10159 */
10160 if (mddev->safemode == 2 && !mddev->in_sync &&
10161 mddev->resync_offset == MaxSector)
10162 return true;
10163
10164 return false;
10165 }
10166
/*
 * This routine is regularly called by all per-raid-array threads to
 * deal with generic issues like resync and super-block update.
 * Raid personalities that don't have a thread (linear/raid0) do not
 * need this as they never do any recovery or update the superblock.
 *
 * It does not do any resync itself, but rather "forks" off other threads
 * to do that as needed.
 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
 * "->recovery" and create a thread at ->sync_thread.
 * When the thread finishes it sets MD_RECOVERY_DONE
 * and wakes up this thread which will reap the thread and finish up.
 * This thread also removes any faulty devices (with nr_pending == 0).
 *
 * The overall approach is:
 *  1/ if the superblock needs updating, update it.
 *  2/ If a recovery thread is running, don't do anything else.
 *  3/ If recovery has finished, clean up, possibly marking spares active.
 *  4/ If there are any faulty devices, remove them.
 *  5/ If array is degraded, try to add spares devices
 *  6/ If array has spares or is not in-sync, start a resync thread.
 *
 * Everything after the initial checks runs only if mddev_trylock() succeeds;
 * if the lock cannot be taken this call does nothing further.
 */
void md_check_recovery(struct mddev *mddev)
{
	if (md_bitmap_enabled(mddev, false) && mddev->bitmap_ops->daemon_work)
		mddev->bitmap_ops->daemon_work(mddev);

	if (signal_pending(current)) {
		/* a pending signal requests immediate safe mode (safemode 2) */
		if (mddev->pers->sync_request && !mddev->external) {
			pr_debug("md: %s in immediate safe mode\n",
				 mdname(mddev));
			mddev->safemode = 2;
		}
		flush_signals(current);
	}

	if (!md_should_do_recovery(mddev))
		return;

	if (mddev_trylock(mddev)) {
		/* sample safemode before it is reset just below */
		bool try_set_sync = mddev->safemode != 0;

		if (!mddev->external && mddev->safemode == 1)
			mddev->safemode = 0;

		if (!md_is_rdwr(mddev)) {
			struct md_rdev *rdev;

			if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
				unregister_sync_thread(mddev);
				goto unlock;
			}

			if (!mddev->external && mddev->in_sync)
				/*
				 * 'Blocked' flag not needed as failed devices
				 * will be recorded if array switched to read/write.
				 * Leaving it set will prevent the device
				 * from being removed.
				 */
				rdev_for_each(rdev, mddev)
					clear_bit(Blocked, &rdev->flags);

			/*
			 * There is no thread, but we need to call
			 * ->spare_active and clear saved_raid_disk
			 */
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			md_reap_sync_thread(mddev);

			/*
			 * Let md_start_sync() to remove and add rdevs to the
			 * array.
			 */
			if (md_spares_need_change(mddev)) {
				set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
				queue_work(md_misc_wq, &mddev->sync_work);
			}

			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
			clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery);
			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);

			goto unlock;
		}

		if (mddev_is_clustered(mddev)) {
			struct md_rdev *rdev, *tmp;
			/* kick the device if another node issued a
			 * remove disk.
			 */
			rdev_for_each_safe(rdev, tmp, mddev) {
				if (rdev->raid_disk < 0 &&
				    test_and_clear_bit(ClusterRemove, &rdev->flags))
					md_kick_rdev_from_array(rdev);
			}
		}

		/* safe mode was requested: try to mark the array in_sync */
		if (try_set_sync && !mddev->external && !mddev->in_sync) {
			spin_lock(&mddev->lock);
			set_in_sync(mddev);
			spin_unlock(&mddev->lock);
		}

		if (mddev->sb_flags)
			md_update_sb(mddev, 0);

		/*
		 * Never start a new sync thread if MD_RECOVERY_RUNNING is
		 * still set.
		 */
		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
			unregister_sync_thread(mddev);
			goto unlock;
		}

		/* Set RUNNING before clearing NEEDED to avoid
		 * any transients in the value of "sync_action".
		 */
		mddev->curr_resync_completed = 0;
		spin_lock(&mddev->lock);
		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
		spin_unlock(&mddev->lock);
		/* Clear some bits that don't mean anything, but
		 * might be left set
		 */
		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);

		/*
		 * Hand over to md_start_sync() only if recovery was needed
		 * and is not frozen; otherwise undo RUNNING and wake waiters.
		 */
		if (test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) &&
		    !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
			queue_work(md_misc_wq, &mddev->sync_work);
		} else {
			clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
			wake_up(&resync_wait);
		}

	unlock:
		wake_up(&mddev->sb_wait);
		mddev_unlock(mddev);
	}
}
EXPORT_SYMBOL(md_check_recovery);
10311
/*
 * md_reap_sync_thread() - collect the result of a finished sync pass.
 *
 * Unregisters mddev->sync_thread, activates spares when the pass was neither
 * interrupted nor user-requested, writes the superblock, clears the
 * MD_RECOVERY_* bits describing the pass, sets MD_RECOVERY_NEEDED again and
 * notifies sysfs and waiters on resync_wait.
 */
void md_reap_sync_thread(struct mddev *mddev)
{
	struct md_rdev *rdev;
	/* sampled up front: used for the cluster size update further down */
	sector_t old_dev_sectors = mddev->dev_sectors;
	bool is_reshaped = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);

	/* resync has finished, collect result */
	md_unregister_thread(mddev, &mddev->sync_thread);
	atomic_inc(&mddev->sync_seq);

	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
	    mddev->degraded != mddev->raid_disks) {
		/* success...*/
		/* activate any spares */
		if (mddev->pers->spare_active(mddev)) {
			sysfs_notify_dirent_safe(mddev->sysfs_degraded);
			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		}
	}

	/* If array is no-longer degraded, then any saved_raid_disk
	 * information must be scrapped.
	 */
	if (!mddev->degraded)
		rdev_for_each(rdev, mddev)
			rdev->saved_raid_disk = -1;

	md_update_sb(mddev, 1);
	/* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can
	 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by
	 * clustered raid */
	if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
		mddev->cluster_ops->resync_finish(mddev);
	/* the pass is over: drop every bit that described it */
	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
	clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
	clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery);
	/*
	 * We call mddev->cluster_ops->update_size here because sync_size could
	 * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared,
	 * so it is time to update size across cluster.
	 */
	if (mddev_is_clustered(mddev) && is_reshaped &&
	    mddev->pers->finish_reshape &&
	    !test_bit(MD_CLOSING, &mddev->flags))
		mddev->cluster_ops->update_size(mddev, old_dev_sectors);
	/* flag recovery needed just to double check */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	sysfs_notify_dirent_safe(mddev->sysfs_completed);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	md_new_event();
	/* event_work is queued only when a handler function has been set */
	if (mddev->event_work.func)
		queue_work(md_misc_wq, &mddev->event_work);
	wake_up(&resync_wait);
}
EXPORT_SYMBOL(md_reap_sync_thread);
10372
md_wait_for_blocked_rdev(struct md_rdev * rdev,struct mddev * mddev)10373 void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
10374 {
10375 sysfs_notify_dirent_safe(rdev->sysfs_state);
10376 wait_event_timeout(rdev->blocked_wait, !rdev_blocked(rdev),
10377 msecs_to_jiffies(5000));
10378 rdev_dec_pending(rdev, mddev);
10379 }
10380 EXPORT_SYMBOL(md_wait_for_blocked_rdev);
10381
md_finish_reshape(struct mddev * mddev)10382 void md_finish_reshape(struct mddev *mddev)
10383 {
10384 /* called be personality module when reshape completes. */
10385 struct md_rdev *rdev;
10386
10387 rdev_for_each(rdev, mddev) {
10388 if (rdev->data_offset > rdev->new_data_offset)
10389 rdev->sectors += rdev->data_offset - rdev->new_data_offset;
10390 else
10391 rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
10392 rdev->data_offset = rdev->new_data_offset;
10393 }
10394 }
10395 EXPORT_SYMBOL(md_finish_reshape);
10396
10397 /* Bad block management */
10398
10399 /* Returns true on success, false on failure */
rdev_set_badblocks(struct md_rdev * rdev,sector_t s,int sectors,int is_new)10400 bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
10401 int is_new)
10402 {
10403 struct mddev *mddev = rdev->mddev;
10404
10405 /*
10406 * Recording new badblocks for faulty rdev will force unnecessary
10407 * super block updating. This is fragile for external management because
10408 * userspace daemon may trying to remove this device and deadlock may
10409 * occur. This will be probably solved in the mdadm, but it is safer to
10410 * avoid it.
10411 */
10412 if (test_bit(Faulty, &rdev->flags))
10413 return true;
10414
10415 if (is_new)
10416 s += rdev->new_data_offset;
10417 else
10418 s += rdev->data_offset;
10419
10420 if (!badblocks_set(&rdev->badblocks, s, sectors, 0)) {
10421 /*
10422 * Mark the disk as Faulty when setting badblocks fails,
10423 * otherwise, bad sectors may be read.
10424 */
10425 md_error(mddev, rdev);
10426 return false;
10427 }
10428
10429 /* Make sure they get written out promptly */
10430 if (test_bit(ExternalBbl, &rdev->flags))
10431 sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks);
10432 sysfs_notify_dirent_safe(rdev->sysfs_state);
10433 set_mask_bits(&mddev->sb_flags, 0,
10434 BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
10435 md_wakeup_thread(rdev->mddev->thread);
10436 return true;
10437 }
10438 EXPORT_SYMBOL_GPL(rdev_set_badblocks);
10439
/*
 * Clear @sectors bad-block entries starting at @s on @rdev.
 *
 * @s is given without the data offset; new_data_offset is added when @is_new
 * is non-zero, data_offset otherwise. sysfs is only notified when
 * badblocks_clear() returns non-zero and the device has ExternalBbl set.
 */
void rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
			  int is_new)
{
	s += is_new ? rdev->new_data_offset : rdev->data_offset;

	if (badblocks_clear(&rdev->badblocks, s, sectors) &&
	    test_bit(ExternalBbl, &rdev->flags))
		sysfs_notify_dirent_safe(rdev->sysfs_badblocks);
}
EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
10455
md_notify_reboot(struct notifier_block * this,unsigned long code,void * x)10456 static int md_notify_reboot(struct notifier_block *this,
10457 unsigned long code, void *x)
10458 {
10459 struct mddev *mddev;
10460
10461 spin_lock(&all_mddevs_lock);
10462 list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
10463 if (!mddev_get(mddev))
10464 continue;
10465 spin_unlock(&all_mddevs_lock);
10466 if (mddev_trylock(mddev)) {
10467 if (mddev->pers)
10468 __md_stop_writes(mddev);
10469 if (mddev->persistent)
10470 mddev->safemode = 2;
10471 mddev_unlock(mddev);
10472 }
10473 spin_lock(&all_mddevs_lock);
10474 mddev_put_locked(mddev);
10475 }
10476 spin_unlock(&all_mddevs_lock);
10477
10478 return NOTIFY_DONE;
10479 }
10480
10481 static struct notifier_block md_notifier = {
10482 .notifier_call = md_notify_reboot,
10483 .next = NULL,
10484 .priority = INT_MAX, /* before any real devices */
10485 };
10486
md_geninit(void)10487 static void md_geninit(void)
10488 {
10489 pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
10490
10491 proc_create("mdstat", S_IRUGO, NULL, &mdstat_proc_ops);
10492 }
10493
md_init(void)10494 static int __init md_init(void)
10495 {
10496 int ret = md_bitmap_init();
10497
10498 if (ret)
10499 return ret;
10500
10501 ret = md_llbitmap_init();
10502 if (ret)
10503 goto err_bitmap;
10504
10505 ret = -ENOMEM;
10506 md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM | WQ_PERCPU, 0);
10507 if (!md_wq)
10508 goto err_wq;
10509
10510 md_misc_wq = alloc_workqueue("md_misc", WQ_PERCPU, 0);
10511 if (!md_misc_wq)
10512 goto err_misc_wq;
10513
10514 ret = __register_blkdev(MD_MAJOR, "md", md_probe);
10515 if (ret < 0)
10516 goto err_md;
10517
10518 ret = __register_blkdev(0, "mdp", md_probe);
10519 if (ret < 0)
10520 goto err_mdp;
10521 mdp_major = ret;
10522
10523 register_reboot_notifier(&md_notifier);
10524 raid_table_header = register_sysctl("dev/raid", raid_table);
10525
10526 md_geninit();
10527 return 0;
10528
10529 err_mdp:
10530 unregister_blkdev(MD_MAJOR, "md");
10531 err_md:
10532 destroy_workqueue(md_misc_wq);
10533 err_misc_wq:
10534 destroy_workqueue(md_wq);
10535 err_wq:
10536 md_llbitmap_exit();
10537 err_bitmap:
10538 md_bitmap_exit();
10539 return ret;
10540 }
10541
check_sb_changes(struct mddev * mddev,struct md_rdev * rdev)10542 static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
10543 {
10544 struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
10545 struct md_rdev *rdev2, *tmp;
10546 int role, ret;
10547
10548 /*
10549 * If size is changed in another node then we need to
10550 * do resize as well.
10551 */
10552 if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
10553 ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
10554 if (ret)
10555 pr_info("md-cluster: resize failed\n");
10556 else if (md_bitmap_enabled(mddev, false))
10557 mddev->bitmap_ops->update_sb(mddev->bitmap);
10558 }
10559
10560 /* Check for change of roles in the active devices */
10561 rdev_for_each_safe(rdev2, tmp, mddev) {
10562 if (test_bit(Faulty, &rdev2->flags)) {
10563 if (test_bit(ClusterRemove, &rdev2->flags))
10564 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
10565 continue;
10566 }
10567
10568 /* Check if the roles changed */
10569 role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);
10570
10571 if (test_bit(Candidate, &rdev2->flags)) {
10572 if (role == MD_DISK_ROLE_FAULTY) {
10573 pr_info("md: Removing Candidate device %pg because add failed\n",
10574 rdev2->bdev);
10575 md_kick_rdev_from_array(rdev2);
10576 continue;
10577 }
10578 else
10579 clear_bit(Candidate, &rdev2->flags);
10580 }
10581
10582 if (role != rdev2->raid_disk) {
10583 /*
10584 * got activated except reshape is happening.
10585 */
10586 if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE &&
10587 !(le32_to_cpu(sb->feature_map) &
10588 MD_FEATURE_RESHAPE_ACTIVE) &&
10589 !mddev->cluster_ops->resync_status_get(mddev)) {
10590 /*
10591 * -1 to make raid1_add_disk() set conf->fullsync
10592 * to 1. This could avoid skipping sync when the
10593 * remote node is down during resyncing.
10594 */
10595 if ((le32_to_cpu(sb->feature_map)
10596 & MD_FEATURE_RECOVERY_OFFSET))
10597 rdev2->saved_raid_disk = -1;
10598 else
10599 rdev2->saved_raid_disk = role;
10600 ret = remove_and_add_spares(mddev, rdev2);
10601 pr_info("Activated spare: %pg\n",
10602 rdev2->bdev);
10603 /* wakeup mddev->thread here, so array could
10604 * perform resync with the new activated disk */
10605 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
10606 md_wakeup_thread(mddev->thread);
10607 }
10608 /* device faulty
10609 * We just want to do the minimum to mark the disk
10610 * as faulty. The recovery is performed by the
10611 * one who initiated the error.
10612 */
10613 if (role == MD_DISK_ROLE_FAULTY ||
10614 role == MD_DISK_ROLE_JOURNAL) {
10615 md_error(mddev, rdev2);
10616 clear_bit(Blocked, &rdev2->flags);
10617 }
10618 }
10619 }
10620
10621 if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) {
10622 ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
10623 if (ret)
10624 pr_warn("md: updating array disks failed. %d\n", ret);
10625 }
10626
10627 /*
10628 * Since mddev->delta_disks has already updated in update_raid_disks,
10629 * so it is time to check reshape.
10630 */
10631 if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
10632 (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
10633 /*
10634 * reshape is happening in the remote node, we need to
10635 * update reshape_position and call start_reshape.
10636 */
10637 mddev->reshape_position = le64_to_cpu(sb->reshape_position);
10638 if (mddev->pers->update_reshape_pos)
10639 mddev->pers->update_reshape_pos(mddev);
10640 if (mddev->pers->start_reshape)
10641 mddev->pers->start_reshape(mddev);
10642 } else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
10643 mddev->reshape_position != MaxSector &&
10644 !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
10645 /* reshape is just done in another node. */
10646 mddev->reshape_position = MaxSector;
10647 if (mddev->pers->update_reshape_pos)
10648 mddev->pers->update_reshape_pos(mddev);
10649 }
10650
10651 /* Finally set the event to be up to date */
10652 mddev->events = le64_to_cpu(sb->events);
10653 }
10654
read_rdev(struct mddev * mddev,struct md_rdev * rdev)10655 static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
10656 {
10657 int err;
10658 struct page *swapout = rdev->sb_page;
10659 struct mdp_superblock_1 *sb;
10660
10661 /* Store the sb page of the rdev in the swapout temporary
10662 * variable in case we err in the future
10663 */
10664 rdev->sb_page = NULL;
10665 err = alloc_disk_sb(rdev);
10666 if (err == 0) {
10667 ClearPageUptodate(rdev->sb_page);
10668 rdev->sb_loaded = 0;
10669 err = super_types[mddev->major_version].
10670 load_super(rdev, NULL, mddev->minor_version);
10671 }
10672 if (err < 0) {
10673 pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
10674 __func__, __LINE__, rdev->desc_nr, err);
10675 if (rdev->sb_page)
10676 put_page(rdev->sb_page);
10677 rdev->sb_page = swapout;
10678 rdev->sb_loaded = 1;
10679 return err;
10680 }
10681
10682 sb = page_address(rdev->sb_page);
10683 /* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET
10684 * is not set
10685 */
10686
10687 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
10688 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
10689
10690 /* The other node finished recovery, call spare_active to set
10691 * device In_sync and mddev->degraded
10692 */
10693 if (rdev->recovery_offset == MaxSector &&
10694 !test_bit(In_sync, &rdev->flags) &&
10695 mddev->pers->spare_active(mddev))
10696 sysfs_notify_dirent_safe(mddev->sysfs_degraded);
10697
10698 put_page(swapout);
10699 return 0;
10700 }
10701
md_reload_sb(struct mddev * mddev,int nr)10702 void md_reload_sb(struct mddev *mddev, int nr)
10703 {
10704 struct md_rdev *rdev = NULL, *iter;
10705 int err;
10706
10707 /* Find the rdev */
10708 rdev_for_each_rcu(iter, mddev) {
10709 if (iter->desc_nr == nr) {
10710 rdev = iter;
10711 break;
10712 }
10713 }
10714
10715 if (!rdev) {
10716 pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
10717 return;
10718 }
10719
10720 err = read_rdev(mddev, rdev);
10721 if (err < 0)
10722 return;
10723
10724 check_sb_changes(mddev, rdev);
10725
10726 /* Read all rdev's to update recovery_offset */
10727 rdev_for_each_rcu(rdev, mddev) {
10728 if (!test_bit(Faulty, &rdev->flags))
10729 read_rdev(mddev, rdev);
10730 }
10731 }
10732 EXPORT_SYMBOL(md_reload_sb);
10733
10734 #ifndef MODULE
10735
10736 /*
10737 * Searches all registered partitions for autorun RAID arrays
10738 * at boot time.
10739 */
10740
10741 static DEFINE_MUTEX(detected_devices_mutex);
10742 static LIST_HEAD(all_detected_devices);
10743 struct detected_devices_node {
10744 struct list_head list;
10745 dev_t dev;
10746 };
10747
/*
 * Append @dev to the tail of all_detected_devices so that
 * md_autostart_arrays() can import it later. If the node cannot be
 * allocated the device is silently left out.
 */
void md_autodetect_dev(dev_t dev)
{
	struct detected_devices_node *node;

	node = kzalloc_obj(*node);
	if (!node)
		return;

	node->dev = dev;

	mutex_lock(&detected_devices_mutex);
	list_add_tail(&node->list, &all_detected_devices);
	mutex_unlock(&detected_devices_mutex);
}
10760
md_autostart_arrays(int part)10761 void md_autostart_arrays(int part)
10762 {
10763 struct md_rdev *rdev;
10764 struct detected_devices_node *node_detected_dev;
10765 dev_t dev;
10766 int i_scanned, i_passed;
10767
10768 i_scanned = 0;
10769 i_passed = 0;
10770
10771 pr_info("md: Autodetecting RAID arrays.\n");
10772
10773 mutex_lock(&detected_devices_mutex);
10774 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
10775 i_scanned++;
10776 node_detected_dev = list_entry(all_detected_devices.next,
10777 struct detected_devices_node, list);
10778 list_del(&node_detected_dev->list);
10779 dev = node_detected_dev->dev;
10780 kfree(node_detected_dev);
10781 mutex_unlock(&detected_devices_mutex);
10782 rdev = md_import_device(dev,0, 90);
10783 mutex_lock(&detected_devices_mutex);
10784 if (IS_ERR(rdev))
10785 continue;
10786
10787 if (test_bit(Faulty, &rdev->flags))
10788 continue;
10789
10790 set_bit(AutoDetected, &rdev->flags);
10791 list_add(&rdev->same_set, &pending_raid_disks);
10792 i_passed++;
10793 }
10794 mutex_unlock(&detected_devices_mutex);
10795
10796 pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed);
10797
10798 autorun_devices(part);
10799 }
10800
10801 #endif /* !MODULE */
10802
md_exit(void)10803 static __exit void md_exit(void)
10804 {
10805 struct mddev *mddev;
10806 int delay = 1;
10807
10808 unregister_blkdev(MD_MAJOR,"md");
10809 unregister_blkdev(mdp_major, "mdp");
10810 unregister_reboot_notifier(&md_notifier);
10811 unregister_sysctl_table(raid_table_header);
10812
10813 /* We cannot unload the modules while some process is
10814 * waiting for us in select() or poll() - wake them up
10815 */
10816 md_unloading = 1;
10817 while (waitqueue_active(&md_event_waiters)) {
10818 /* not safe to leave yet */
10819 wake_up(&md_event_waiters);
10820 msleep(delay);
10821 delay += delay;
10822 }
10823 remove_proc_entry("mdstat", NULL);
10824
10825 spin_lock(&all_mddevs_lock);
10826 list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
10827 if (!mddev_get(mddev))
10828 continue;
10829 spin_unlock(&all_mddevs_lock);
10830 export_array(mddev);
10831 mddev->ctime = 0;
10832 mddev->hold_active = 0;
10833 /*
10834 * As the mddev is now fully clear, mddev_put will schedule
10835 * the mddev for destruction by a workqueue, and the
10836 * destroy_workqueue() below will wait for that to complete.
10837 */
10838 spin_lock(&all_mddevs_lock);
10839 mddev_put_locked(mddev);
10840 }
10841 spin_unlock(&all_mddevs_lock);
10842
10843 destroy_workqueue(md_misc_wq);
10844 destroy_workqueue(md_wq);
10845 md_bitmap_exit();
10846 }
10847
10848 subsys_initcall(md_init);
module_exit(md_exit)10849 module_exit(md_exit)
10850
10851 static int get_ro(char *buffer, const struct kernel_param *kp)
10852 {
10853 return sprintf(buffer, "%d\n", start_readonly);
10854 }
set_ro(const char * val,const struct kernel_param * kp)10855 static int set_ro(const char *val, const struct kernel_param *kp)
10856 {
10857 return kstrtouint(val, 10, (unsigned int *)&start_readonly);
10858 }
10859
10860 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
10861 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
10862 module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
10863 module_param(create_on_open, bool, S_IRUSR|S_IWUSR);
10864 module_param(legacy_async_del_gendisk, bool, 0600);
10865 module_param(check_new_feature, bool, 0600);
10866
10867 MODULE_LICENSE("GPL");
10868 MODULE_DESCRIPTION("MD RAID framework");
10869 MODULE_ALIAS("md");
10870 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
10871