1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 md.c : Multiple Devices driver for Linux
4 Copyright (C) 1998, 1999, 2000 Ingo Molnar
5
6 completely rewritten, based on the MD driver code from Marc Zyngier
7
8 Changes:
9
10 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
11 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
12 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
13 - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
14 - kmod support by: Cyrus Durgin
15 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
16 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
17
18 - lots of fixes and improvements to the RAID1/RAID5 and generic
19 RAID code (such as request based resynchronization):
20
21 Neil Brown <neilb@cse.unsw.edu.au>.
22
23 - persistent bitmap code
24 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
25
26
27 Errors, Warnings, etc.
28 Please use:
29 pr_crit() for error conditions that risk data loss
30 pr_err() for error conditions that are unexpected, like an IO error
31 or internal inconsistency
     pr_warn() for error conditions that could have been predicted, like
         adding a device to an array when it has incompatible metadata
     pr_info() for interesting, very rare events, like an array starting
         or stopping, or resync starting or stopping
36 pr_debug() for everything else.
37
38 */
39
40 #include <linux/sched/mm.h>
41 #include <linux/sched/signal.h>
42 #include <linux/kthread.h>
43 #include <linux/blkdev.h>
44 #include <linux/blk-integrity.h>
45 #include <linux/badblocks.h>
46 #include <linux/sysctl.h>
47 #include <linux/seq_file.h>
48 #include <linux/fs.h>
49 #include <linux/poll.h>
50 #include <linux/ctype.h>
51 #include <linux/string.h>
52 #include <linux/hdreg.h>
53 #include <linux/proc_fs.h>
54 #include <linux/random.h>
55 #include <linux/major.h>
56 #include <linux/module.h>
57 #include <linux/reboot.h>
58 #include <linux/file.h>
59 #include <linux/compat.h>
60 #include <linux/delay.h>
61 #include <linux/raid/md_p.h>
62 #include <linux/raid/md_u.h>
63 #include <linux/raid/detect.h>
64 #include <linux/slab.h>
65 #include <linux/percpu-refcount.h>
66 #include <linux/part_stat.h>
67
68 #include "md.h"
69 #include "md-bitmap.h"
70 #include "md-cluster.h"
71
72 static const char *action_name[NR_SYNC_ACTIONS] = {
73 [ACTION_RESYNC] = "resync",
74 [ACTION_RECOVER] = "recover",
75 [ACTION_CHECK] = "check",
76 [ACTION_REPAIR] = "repair",
77 [ACTION_RESHAPE] = "reshape",
78 [ACTION_FROZEN] = "frozen",
79 [ACTION_IDLE] = "idle",
80 };
81
82 static DEFINE_XARRAY(md_submodule);
83
84 static const struct kobj_type md_ktype;
85
86 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
87 static struct workqueue_struct *md_wq;
88
89 /*
90 * This workqueue is used for sync_work to register new sync_thread, and for
91 * del_work to remove rdev, and for event_work that is only set by dm-raid.
92 *
 * Note that sync_work will grab reconfig_mutex, hence never flush this
 * workqueue with reconfig_mutex grabbed.
95 */
96 static struct workqueue_struct *md_misc_wq;
97
98 static int remove_and_add_spares(struct mddev *mddev,
99 struct md_rdev *this);
100 static void mddev_detach(struct mddev *mddev);
101 static void export_rdev(struct md_rdev *rdev, struct mddev *mddev);
102 static void md_wakeup_thread_directly(struct md_thread __rcu **thread);
103
104 /*
105 * Default number of read corrections we'll attempt on an rdev
106 * before ejecting it from the array. We divide the read error
107 * count by 2 for every hour elapsed between read errors.
108 */
109 #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
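
/*
 * Illustrative sketch only (not an in-tree helper): personalities that track
 * corrected read errors apply the policy above roughly like this, halving the
 * stored count once per hour elapsed since the last read error before
 * comparing it with MD_DEFAULT_MAX_CORRECTED_READ_ERRORS. The helper name and
 * the hours_since_last_error parameter are hypothetical.
 */
static inline int example_decayed_read_errors(struct md_rdev *rdev,
					      unsigned int hours_since_last_error)
{
	/* Halve the recorded error count for every elapsed hour. */
	unsigned int shift = min(hours_since_last_error, 31U);

	return atomic_read(&rdev->corrected_errors) >> shift;
}
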
110 /* Default safemode delay: 200 msec */
111 #define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
112 /*
113 * Current RAID-1,4,5,6,10 parallel reconstruction 'guaranteed speed limit'
114 * is sysctl_speed_limit_min, 1000 KB/sec by default, so the extra system load
115 * does not show up that much. Increase it if you want to have more guaranteed
116 * speed. Note that the RAID driver will use the maximum bandwidth
117 * sysctl_speed_limit_max, 200 MB/sec by default, if the IO subsystem is idle.
118 *
119 * Background sync IO speed control:
120 *
121 * - below speed min:
122 * no limit;
123 * - above speed min and below speed max:
124 * a) if mddev is idle, then no limit;
125 * b) if mddev is busy handling normal IO, then limit inflight sync IO
126 * to sync_io_depth;
127 * - above speed max:
128 * sync IO can't be issued;
129 *
130 * Following configurations can be changed via /proc/sys/dev/raid/ for system
131 * or /sys/block/mdX/md/ for one array.
132 */
133 static int sysctl_speed_limit_min = 1000;
134 static int sysctl_speed_limit_max = 200000;
135 static int sysctl_sync_io_depth = 32;
136
static int speed_min(struct mddev *mddev)
138 {
139 return mddev->sync_speed_min ?
140 mddev->sync_speed_min : sysctl_speed_limit_min;
141 }
142
static int speed_max(struct mddev *mddev)
144 {
145 return mddev->sync_speed_max ?
146 mddev->sync_speed_max : sysctl_speed_limit_max;
147 }
148
static int sync_io_depth(struct mddev *mddev)
150 {
151 return mddev->sync_io_depth ?
152 mddev->sync_io_depth : sysctl_sync_io_depth;
153 }
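
/*
 * Illustrative sketch only (not the in-tree throttling code, which lives in
 * md_do_sync()): this shows how the three knobs above are meant to combine,
 * following the policy described in the comment block before them. The helper
 * name and the current_speed/array_idle/inflight_sync_io parameters are
 * hypothetical.
 */
static inline bool example_sync_io_allowed(struct mddev *mddev,
					   int current_speed,
					   bool array_idle,
					   int inflight_sync_io)
{
	if (current_speed < speed_min(mddev))
		return true;		/* below speed min: no limit */
	if (current_speed > speed_max(mddev))
		return false;		/* above speed max: hold off */
	if (array_idle)
		return true;		/* idle array: no limit */
	/* busy with normal IO: limit inflight sync IO to sync_io_depth */
	return inflight_sync_io < sync_io_depth(mddev);
}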
154
static void rdev_uninit_serial(struct md_rdev *rdev)
156 {
157 if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
158 return;
159
160 kvfree(rdev->serial);
161 rdev->serial = NULL;
162 }
163
static void rdevs_uninit_serial(struct mddev *mddev)
165 {
166 struct md_rdev *rdev;
167
168 rdev_for_each(rdev, mddev)
169 rdev_uninit_serial(rdev);
170 }
171
static int rdev_init_serial(struct md_rdev *rdev)
173 {
	/* serial_nums equals BARRIER_BUCKETS_NR */
175 int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t))));
176 struct serial_in_rdev *serial = NULL;
177
178 if (test_bit(CollisionCheck, &rdev->flags))
179 return 0;
180
181 serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums,
182 GFP_KERNEL);
183 if (!serial)
184 return -ENOMEM;
185
186 for (i = 0; i < serial_nums; i++) {
187 struct serial_in_rdev *serial_tmp = &serial[i];
188
189 spin_lock_init(&serial_tmp->serial_lock);
190 serial_tmp->serial_rb = RB_ROOT_CACHED;
191 init_waitqueue_head(&serial_tmp->serial_io_wait);
192 }
193
194 rdev->serial = serial;
195 set_bit(CollisionCheck, &rdev->flags);
196
197 return 0;
198 }
199
static int rdevs_init_serial(struct mddev *mddev)
201 {
202 struct md_rdev *rdev;
203 int ret = 0;
204
205 rdev_for_each(rdev, mddev) {
206 ret = rdev_init_serial(rdev);
207 if (ret)
208 break;
209 }
210
	/* Free all resources if the pool does not exist */
212 if (ret && !mddev->serial_info_pool)
213 rdevs_uninit_serial(mddev);
214
215 return ret;
216 }
217
218 /*
 * rdev needs to enable serialization if it meets both conditions:
 * 1. it is a multi-queue device flagged with writemostly.
221 * 2. the write-behind mode is enabled.
222 */
static int rdev_need_serial(struct md_rdev *rdev)
224 {
225 return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 &&
226 rdev->bdev->bd_disk->queue->nr_hw_queues != 1 &&
227 test_bit(WriteMostly, &rdev->flags));
228 }
229
230 /*
231 * Init resource for rdev(s), then create serial_info_pool if:
 * 1. rdev is the first device that returns true from rdev_need_serial.
 * 2. rdev is NULL, meaning we want to enable serialization for all rdevs.
234 */
void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
236 {
237 int ret = 0;
238
239 if (rdev && !rdev_need_serial(rdev) &&
240 !test_bit(CollisionCheck, &rdev->flags))
241 return;
242
243 if (!rdev)
244 ret = rdevs_init_serial(mddev);
245 else
246 ret = rdev_init_serial(rdev);
247 if (ret)
248 return;
249
250 if (mddev->serial_info_pool == NULL) {
251 /*
252 * already in memalloc noio context by
253 * mddev_suspend()
254 */
255 mddev->serial_info_pool =
256 mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
257 sizeof(struct serial_info));
258 if (!mddev->serial_info_pool) {
259 rdevs_uninit_serial(mddev);
260 pr_err("can't alloc memory pool for serialization\n");
261 }
262 }
263 }
264
265 /*
266 * Free resource from rdev(s), and destroy serial_info_pool under conditions:
 * 1. rdev is the last device flagged with CollisionCheck.
268 * 2. when bitmap is destroyed while policy is not enabled.
269 * 3. for disable policy, the pool is destroyed only when no rdev needs it.
270 */
void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
272 {
273 if (rdev && !test_bit(CollisionCheck, &rdev->flags))
274 return;
275
276 if (mddev->serial_info_pool) {
277 struct md_rdev *temp;
278 int num = 0; /* used to track if other rdevs need the pool */
279
280 rdev_for_each(temp, mddev) {
281 if (!rdev) {
282 if (!mddev->serialize_policy ||
283 !rdev_need_serial(temp))
284 rdev_uninit_serial(temp);
285 else
286 num++;
287 } else if (temp != rdev &&
288 test_bit(CollisionCheck, &temp->flags))
289 num++;
290 }
291
292 if (rdev)
293 rdev_uninit_serial(rdev);
294
295 if (num)
296 pr_info("The mempool could be used by other devices\n");
297 else {
298 mempool_destroy(mddev->serial_info_pool);
299 mddev->serial_info_pool = NULL;
300 }
301 }
302 }
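
/*
 * Illustrative sketch only: callers such as the bitmap code are expected to
 * toggle serialization with the array suspended, so that
 * mddev_create_serial_pool() runs in the memalloc noio scope set up by
 * mddev_suspend() (see the comment above the mempool allocation). The helper
 * below is hypothetical and only demonstrates the pairing.
 */
static inline void example_toggle_serial_pool(struct mddev *mddev,
					      struct md_rdev *rdev,
					      bool enable)
{
	mddev_suspend(mddev, false);
	if (enable)
		mddev_create_serial_pool(mddev, rdev);
	else
		mddev_destroy_serial_pool(mddev, rdev);
	mddev_resume(mddev);
}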
303
304 static struct ctl_table_header *raid_table_header;
305
306 static const struct ctl_table raid_table[] = {
307 {
308 .procname = "speed_limit_min",
309 .data = &sysctl_speed_limit_min,
310 .maxlen = sizeof(int),
311 .mode = 0644,
312 .proc_handler = proc_dointvec,
313 },
314 {
315 .procname = "speed_limit_max",
316 .data = &sysctl_speed_limit_max,
317 .maxlen = sizeof(int),
318 .mode = 0644,
319 .proc_handler = proc_dointvec,
320 },
321 {
322 .procname = "sync_io_depth",
323 .data = &sysctl_sync_io_depth,
324 .maxlen = sizeof(int),
325 .mode = 0644,
326 .proc_handler = proc_dointvec,
327 },
328 };
329
330 static int start_readonly;
331
332 /*
333 * The original mechanism for creating an md device is to create
334 * a device node in /dev and to open it. This causes races with device-close.
335 * The preferred method is to write to the "new_array" module parameter.
336 * This can avoid races.
337 * Setting create_on_open to false disables the original mechanism
338 * so all the races disappear.
339 */
340 static bool create_on_open = true;
341 static bool legacy_async_del_gendisk = true;
342 static bool check_new_feature = true;
343
344 /*
345 * We have a system wide 'event count' that is incremented
346 * on any 'interesting' event, and readers of /proc/mdstat
347 * can use 'poll' or 'select' to find out when the event
348 * count increases.
349 *
350 * Events are:
351 * start array, stop array, error, add device, remove device,
352 * start build, activate spare
353 */
354 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
355 static atomic_t md_event_count;
void md_new_event(void)
357 {
358 atomic_inc(&md_event_count);
359 wake_up(&md_event_waiters);
360 }
361 EXPORT_SYMBOL_GPL(md_new_event);
362
363 /*
 * Enables iteration over all existing md arrays.
365 * all_mddevs_lock protects this list.
366 */
367 static LIST_HEAD(all_mddevs);
368 static DEFINE_SPINLOCK(all_mddevs_lock);
369
static bool is_md_suspended(struct mddev *mddev)
371 {
372 return percpu_ref_is_dying(&mddev->active_io);
373 }
374 /* Rather than calling directly into the personality make_request function,
375 * IO requests come here first so that we can check if the device is
376 * being suspended pending a reconfiguration.
377 * We hold a refcount over the call to ->make_request. By the time that
378 * call has finished, the bio has been linked into some internal structure
379 * and so is visible to ->quiesce(), so we don't need the refcount any more.
380 */
static bool is_suspended(struct mddev *mddev, struct bio *bio)
382 {
383 if (is_md_suspended(mddev))
384 return true;
385 if (bio_data_dir(bio) != WRITE)
386 return false;
387 if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi))
388 return false;
389 if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi))
390 return false;
391 if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo))
392 return false;
393 return true;
394 }
395
bool md_handle_request(struct mddev *mddev, struct bio *bio)
397 {
398 check_suspended:
399 if (is_suspended(mddev, bio)) {
400 DEFINE_WAIT(__wait);
401 /* Bail out if REQ_NOWAIT is set for the bio */
402 if (bio->bi_opf & REQ_NOWAIT) {
403 bio_wouldblock_error(bio);
404 return true;
405 }
406 for (;;) {
407 prepare_to_wait(&mddev->sb_wait, &__wait,
408 TASK_UNINTERRUPTIBLE);
409 if (!is_suspended(mddev, bio))
410 break;
411 schedule();
412 }
413 finish_wait(&mddev->sb_wait, &__wait);
414 }
415 if (!percpu_ref_tryget_live(&mddev->active_io))
416 goto check_suspended;
417
418 if (!mddev->pers->make_request(mddev, bio)) {
419 percpu_ref_put(&mddev->active_io);
420 if (!mddev->gendisk && mddev->pers->prepare_suspend)
421 return false;
422 goto check_suspended;
423 }
424
425 percpu_ref_put(&mddev->active_io);
426 return true;
427 }
428 EXPORT_SYMBOL(md_handle_request);
429
static void md_submit_bio(struct bio *bio)
431 {
432 const int rw = bio_data_dir(bio);
433 struct mddev *mddev = bio->bi_bdev->bd_disk->private_data;
434
435 if (mddev == NULL || mddev->pers == NULL) {
436 bio_io_error(bio);
437 return;
438 }
439
440 if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
441 bio_io_error(bio);
442 return;
443 }
444
445 bio = bio_split_to_limits(bio);
446 if (!bio)
447 return;
448
449 if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) {
450 if (bio_sectors(bio) != 0)
451 bio->bi_status = BLK_STS_IOERR;
452 bio_endio(bio);
453 return;
454 }
455
	/* bio could be mergeable after passing to the underlying layer */
457 bio->bi_opf &= ~REQ_NOMERGE;
458
459 md_handle_request(mddev, bio);
460 }
461
462 /*
463 * Make sure no new requests are submitted to the device, and any requests that
464 * have been submitted are completely handled.
465 */
int mddev_suspend(struct mddev *mddev, bool interruptible)
467 {
468 int err = 0;
469
470 /*
	 * Holding reconfig_mutex while waiting for normal io to complete
	 * would deadlock, because other contexts couldn't update the
	 * super_block, and normal io can rely on the super_block being
	 * updated.
474 */
475 lockdep_assert_not_held(&mddev->reconfig_mutex);
476
477 if (interruptible)
478 err = mutex_lock_interruptible(&mddev->suspend_mutex);
479 else
480 mutex_lock(&mddev->suspend_mutex);
481 if (err)
482 return err;
483
484 if (mddev->suspended) {
485 WRITE_ONCE(mddev->suspended, mddev->suspended + 1);
486 mutex_unlock(&mddev->suspend_mutex);
487 return 0;
488 }
489
490 percpu_ref_kill(&mddev->active_io);
491 if (interruptible)
492 err = wait_event_interruptible(mddev->sb_wait,
493 percpu_ref_is_zero(&mddev->active_io));
494 else
495 wait_event(mddev->sb_wait,
496 percpu_ref_is_zero(&mddev->active_io));
497 if (err) {
498 percpu_ref_resurrect(&mddev->active_io);
499 mutex_unlock(&mddev->suspend_mutex);
500 return err;
501 }
502
503 /*
	 * For raid456, io might be waiting for reshape to make progress;
	 * allow a new reshape to start while waiting for io to complete,
	 * to prevent deadlock.
507 */
508 WRITE_ONCE(mddev->suspended, mddev->suspended + 1);
509
	/* restrict memory reclaim I/O while the raid array is suspended */
511 mddev->noio_flag = memalloc_noio_save();
512
513 mutex_unlock(&mddev->suspend_mutex);
514 return 0;
515 }
516 EXPORT_SYMBOL_GPL(mddev_suspend);
517
static void __mddev_resume(struct mddev *mddev, bool recovery_needed)
519 {
520 lockdep_assert_not_held(&mddev->reconfig_mutex);
521
522 mutex_lock(&mddev->suspend_mutex);
523 WRITE_ONCE(mddev->suspended, mddev->suspended - 1);
524 if (mddev->suspended) {
525 mutex_unlock(&mddev->suspend_mutex);
526 return;
527 }
528
	/* entered the memalloc scope from mddev_suspend() */
530 memalloc_noio_restore(mddev->noio_flag);
531
532 percpu_ref_resurrect(&mddev->active_io);
533 wake_up(&mddev->sb_wait);
534
535 if (recovery_needed)
536 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
537 md_wakeup_thread(mddev->thread);
538 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
539
540 mutex_unlock(&mddev->suspend_mutex);
541 }
542
void mddev_resume(struct mddev *mddev)
544 {
545 return __mddev_resume(mddev, true);
546 }
547 EXPORT_SYMBOL_GPL(mddev_resume);
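
/*
 * Illustrative sketch only: the typical pattern for reconfiguration paths is
 * to suspend the array, change whatever requires quiesced io, then resume.
 * mddev_suspend() must not be called with reconfig_mutex held (see the
 * lockdep assertion above). The helper and the reconfigure callback used
 * here are hypothetical.
 */
static inline int example_reconfigure_quiesced(struct mddev *mddev,
					       int (*reconfigure)(struct mddev *))
{
	int err = mddev_suspend(mddev, true);

	if (err)
		return err;
	err = reconfigure(mddev);
	mddev_resume(mddev);
	return err;
}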
548
/* sync bdev before setting device to readonly or stopping raid */
static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_num)
551 {
552 mutex_lock(&mddev->open_mutex);
553 if (mddev->pers && atomic_read(&mddev->openers) > opener_num) {
554 mutex_unlock(&mddev->open_mutex);
555 return -EBUSY;
556 }
557 if (test_and_set_bit(MD_CLOSING, &mddev->flags)) {
558 mutex_unlock(&mddev->open_mutex);
559 return -EBUSY;
560 }
561 mutex_unlock(&mddev->open_mutex);
562
563 sync_blockdev(mddev->gendisk->part0);
564 return 0;
565 }
566
567 /*
568 * The only difference from bio_chain_endio() is that the current
569 * bi_status of bio does not affect the bi_status of parent.
570 */
static void md_end_flush(struct bio *bio)
572 {
573 struct bio *parent = bio->bi_private;
574
575 /*
	 * If any flush io fails before a power failure,
	 * disk data may be lost.
578 */
579 if (bio->bi_status)
580 pr_err("md: %pg flush io error %d\n", bio->bi_bdev,
581 blk_status_to_errno(bio->bi_status));
582
583 bio_put(bio);
584 bio_endio(parent);
585 }
586
bool md_flush_request(struct mddev *mddev, struct bio *bio)
588 {
589 struct md_rdev *rdev;
590 struct bio *new;
591
592 /*
	 * md_flush_request() should be called under md_handle_request(), and
594 * 'active_io' is already grabbed. Hence it's safe to get rdev directly
595 * without rcu protection.
596 */
597 WARN_ON(percpu_ref_is_zero(&mddev->active_io));
598
599 rdev_for_each(rdev, mddev) {
600 if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
601 continue;
602
603 new = bio_alloc_bioset(rdev->bdev, 0,
604 REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO,
605 &mddev->bio_set);
606 new->bi_private = bio;
607 new->bi_end_io = md_end_flush;
608 bio_inc_remaining(bio);
609 submit_bio(new);
610 }
611
612 if (bio_sectors(bio) == 0) {
613 bio_endio(bio);
614 return true;
615 }
616
617 bio->bi_opf &= ~REQ_PREFLUSH;
618 return false;
619 }
620 EXPORT_SYMBOL(md_flush_request);
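
/*
 * Illustrative sketch only: a personality's ->make_request() is expected to
 * hand PREFLUSH bios to md_flush_request() first. If it returns true the bio
 * was a pure flush and has been completed; otherwise REQ_PREFLUSH has been
 * stripped and the data portion should be processed as usual. The function
 * below is hypothetical.
 */
static inline bool example_personality_make_request(struct mddev *mddev,
						    struct bio *bio)
{
	if (unlikely(bio->bi_opf & REQ_PREFLUSH) &&
	    md_flush_request(mddev, bio))
		return true;

	/* ... handle the (possibly trimmed) data bio here ... */
	return true;
}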
621
static inline struct mddev *mddev_get(struct mddev *mddev)
623 {
624 lockdep_assert_held(&all_mddevs_lock);
625
626 if (test_bit(MD_DELETED, &mddev->flags))
627 return NULL;
628 atomic_inc(&mddev->active);
629 return mddev;
630 }
631
632 static void mddev_delayed_delete(struct work_struct *ws);
633
static void __mddev_put(struct mddev *mddev)
635 {
636 if (mddev->raid_disks || !list_empty(&mddev->disks) ||
637 mddev->ctime || mddev->hold_active)
638 return;
639
640 /*
	 * If the array is freed by stopping it, MD_DELETED is set by
	 * do_md_stop(); MD_DELETED is still set here in case the mddev is
	 * freed directly by closing an mddev that was created by
	 * create_on_open.
644 */
645 set_bit(MD_DELETED, &mddev->flags);
646 /*
647 * Call queue_work inside the spinlock so that flush_workqueue() after
648 * mddev_find will succeed in waiting for the work to be done.
649 */
650 queue_work(md_misc_wq, &mddev->del_work);
651 }
652
static void mddev_put_locked(struct mddev *mddev)
654 {
655 if (atomic_dec_and_test(&mddev->active))
656 __mddev_put(mddev);
657 }
658
void mddev_put(struct mddev *mddev)
660 {
661 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
662 return;
663
664 __mddev_put(mddev);
665 spin_unlock(&all_mddevs_lock);
666 }
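
/*
 * Illustrative sketch only: walking all_mddevs requires taking a reference
 * under all_mddevs_lock with mddev_get() (which fails for arrays already
 * marked MD_DELETED) and dropping it later with mddev_put(). The function
 * below is hypothetical and only demonstrates the locking rules.
 */
static inline struct mddev *example_get_first_mddev(void)
{
	struct mddev *mddev, *found = NULL;

	spin_lock(&all_mddevs_lock);
	list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
		found = mddev_get(mddev);
		if (found)
			break;
	}
	spin_unlock(&all_mddevs_lock);

	return found;	/* caller must eventually call mddev_put(found) */
}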
667
668 static void md_safemode_timeout(struct timer_list *t);
669 static void md_start_sync(struct work_struct *ws);
670
static void active_io_release(struct percpu_ref *ref)
672 {
673 struct mddev *mddev = container_of(ref, struct mddev, active_io);
674
675 wake_up(&mddev->sb_wait);
676 }
677
static void no_op(struct percpu_ref *r) {}
679
static bool mddev_set_bitmap_ops(struct mddev *mddev)
681 {
682 struct bitmap_operations *old = mddev->bitmap_ops;
683 struct md_submodule_head *head;
684
685 if (mddev->bitmap_id == ID_BITMAP_NONE ||
686 (old && old->head.id == mddev->bitmap_id))
687 return true;
688
689 xa_lock(&md_submodule);
690 head = xa_load(&md_submodule, mddev->bitmap_id);
691
692 if (!head) {
693 pr_warn("md: can't find bitmap id %d\n", mddev->bitmap_id);
694 goto err;
695 }
696
697 if (head->type != MD_BITMAP) {
698 pr_warn("md: invalid bitmap id %d\n", mddev->bitmap_id);
699 goto err;
700 }
701
702 mddev->bitmap_ops = (void *)head;
703 xa_unlock(&md_submodule);
704
705 if (!mddev_is_dm(mddev) && mddev->bitmap_ops->group) {
706 if (sysfs_create_group(&mddev->kobj, mddev->bitmap_ops->group))
707 pr_warn("md: cannot register extra bitmap attributes for %s\n",
708 mdname(mddev));
709 else
710 /*
711 * Inform user with KOBJ_CHANGE about new bitmap
712 * attributes.
713 */
714 kobject_uevent(&mddev->kobj, KOBJ_CHANGE);
715 }
716 return true;
717
718 err:
719 xa_unlock(&md_submodule);
720 return false;
721 }
722
static void mddev_clear_bitmap_ops(struct mddev *mddev)
724 {
725 if (!mddev_is_dm(mddev) && mddev->bitmap_ops &&
726 mddev->bitmap_ops->group)
727 sysfs_remove_group(&mddev->kobj, mddev->bitmap_ops->group);
728
729 mddev->bitmap_ops = NULL;
730 }
731
int mddev_init(struct mddev *mddev)
733 {
734 int err = 0;
735
736 if (!IS_ENABLED(CONFIG_MD_BITMAP))
737 mddev->bitmap_id = ID_BITMAP_NONE;
738 else
739 mddev->bitmap_id = ID_BITMAP;
740
741 if (percpu_ref_init(&mddev->active_io, active_io_release,
742 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
743 return -ENOMEM;
744
745 if (percpu_ref_init(&mddev->writes_pending, no_op,
746 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
747 err = -ENOMEM;
		goto exit_active_io;
749 }
750
751 err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
752 if (err)
753 goto exit_writes_pending;
754
755 err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
756 if (err)
757 goto exit_bio_set;
758
759 err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE,
760 offsetof(struct md_io_clone, bio_clone), 0);
761 if (err)
762 goto exit_sync_set;
763
764 /* We want to start with the refcount at zero */
765 percpu_ref_put(&mddev->writes_pending);
766
767 mutex_init(&mddev->open_mutex);
768 mutex_init(&mddev->reconfig_mutex);
769 mutex_init(&mddev->suspend_mutex);
770 mutex_init(&mddev->bitmap_info.mutex);
771 INIT_LIST_HEAD(&mddev->disks);
772 INIT_LIST_HEAD(&mddev->all_mddevs);
773 INIT_LIST_HEAD(&mddev->deleting);
774 timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
775 atomic_set(&mddev->active, 1);
776 atomic_set(&mddev->openers, 0);
777 atomic_set(&mddev->sync_seq, 0);
778 spin_lock_init(&mddev->lock);
779 init_waitqueue_head(&mddev->sb_wait);
780 init_waitqueue_head(&mddev->recovery_wait);
781 mddev->reshape_position = MaxSector;
782 mddev->reshape_backwards = 0;
783 mddev->last_sync_action = ACTION_IDLE;
784 mddev->resync_min = 0;
785 mddev->resync_max = MaxSector;
786 mddev->level = LEVEL_NONE;
787
788 INIT_WORK(&mddev->sync_work, md_start_sync);
789 INIT_WORK(&mddev->del_work, mddev_delayed_delete);
790
791 return 0;
792
793 exit_sync_set:
794 bioset_exit(&mddev->sync_set);
795 exit_bio_set:
796 bioset_exit(&mddev->bio_set);
797 exit_writes_pending:
798 percpu_ref_exit(&mddev->writes_pending);
exit_active_io:
800 percpu_ref_exit(&mddev->active_io);
801 return err;
802 }
803 EXPORT_SYMBOL_GPL(mddev_init);
804
void mddev_destroy(struct mddev *mddev)
806 {
807 bioset_exit(&mddev->bio_set);
808 bioset_exit(&mddev->sync_set);
809 bioset_exit(&mddev->io_clone_set);
810 percpu_ref_exit(&mddev->active_io);
811 percpu_ref_exit(&mddev->writes_pending);
812 }
813 EXPORT_SYMBOL_GPL(mddev_destroy);
814
static struct mddev *mddev_find_locked(dev_t unit)
816 {
817 struct mddev *mddev;
818
819 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
820 if (mddev->unit == unit)
821 return mddev;
822
823 return NULL;
824 }
825
826 /* find an unused unit number */
static dev_t mddev_alloc_unit(void)
828 {
829 static int next_minor = 512;
830 int start = next_minor;
	bool is_free = false;
832 dev_t dev = 0;
833
834 while (!is_free) {
835 dev = MKDEV(MD_MAJOR, next_minor);
836 next_minor++;
837 if (next_minor > MINORMASK)
838 next_minor = 0;
839 if (next_minor == start)
840 return 0; /* Oh dear, all in use. */
841 is_free = !mddev_find_locked(dev);
842 }
843
844 return dev;
845 }
846
static struct mddev *mddev_alloc(dev_t unit)
848 {
849 struct mddev *new;
850 int error;
851
852 if (unit && MAJOR(unit) != MD_MAJOR)
853 unit &= ~((1 << MdpMinorShift) - 1);
854
855 new = kzalloc(sizeof(*new), GFP_KERNEL);
856 if (!new)
857 return ERR_PTR(-ENOMEM);
858
859 error = mddev_init(new);
860 if (error)
861 goto out_free_new;
862
863 spin_lock(&all_mddevs_lock);
864 if (unit) {
865 error = -EEXIST;
866 if (mddev_find_locked(unit))
867 goto out_destroy_new;
868 new->unit = unit;
869 if (MAJOR(unit) == MD_MAJOR)
870 new->md_minor = MINOR(unit);
871 else
872 new->md_minor = MINOR(unit) >> MdpMinorShift;
873 new->hold_active = UNTIL_IOCTL;
874 } else {
875 error = -ENODEV;
876 new->unit = mddev_alloc_unit();
877 if (!new->unit)
878 goto out_destroy_new;
879 new->md_minor = MINOR(new->unit);
880 new->hold_active = UNTIL_STOP;
881 }
882
883 list_add(&new->all_mddevs, &all_mddevs);
884 spin_unlock(&all_mddevs_lock);
885 return new;
886
887 out_destroy_new:
888 spin_unlock(&all_mddevs_lock);
889 mddev_destroy(new);
890 out_free_new:
891 kfree(new);
892 return ERR_PTR(error);
893 }
894
static void mddev_free(struct mddev *mddev)
896 {
897 spin_lock(&all_mddevs_lock);
898 list_del(&mddev->all_mddevs);
899 spin_unlock(&all_mddevs_lock);
900
901 mddev_destroy(mddev);
902 kfree(mddev);
903 }
904
905 static const struct attribute_group md_redundancy_group;
906
void mddev_unlock(struct mddev *mddev)
908 {
909 struct md_rdev *rdev;
910 struct md_rdev *tmp;
911 LIST_HEAD(delete);
912
913 if (!list_empty(&mddev->deleting))
914 list_splice_init(&mddev->deleting, &delete);
915
916 if (mddev->to_remove) {
917 /* These cannot be removed under reconfig_mutex as
918 * an access to the files will try to take reconfig_mutex
919 * while holding the file unremovable, which leads to
920 * a deadlock.
		 * So set sysfs_active while the remove is happening,
		 * and anything else which might set ->to_remove or may
923 * otherwise change the sysfs namespace will fail with
924 * -EBUSY if sysfs_active is still set.
925 * We set sysfs_active under reconfig_mutex and elsewhere
926 * test it under the same mutex to ensure its correct value
927 * is seen.
928 */
929 const struct attribute_group *to_remove = mddev->to_remove;
930 mddev->to_remove = NULL;
931 mddev->sysfs_active = 1;
932 mutex_unlock(&mddev->reconfig_mutex);
933
934 if (mddev->kobj.sd) {
935 if (to_remove != &md_redundancy_group)
936 sysfs_remove_group(&mddev->kobj, to_remove);
937 if (mddev->pers == NULL ||
938 mddev->pers->sync_request == NULL) {
939 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
940 if (mddev->sysfs_action)
941 sysfs_put(mddev->sysfs_action);
942 if (mddev->sysfs_completed)
943 sysfs_put(mddev->sysfs_completed);
944 if (mddev->sysfs_degraded)
945 sysfs_put(mddev->sysfs_degraded);
946 mddev->sysfs_action = NULL;
947 mddev->sysfs_completed = NULL;
948 mddev->sysfs_degraded = NULL;
949 }
950 }
951 mddev->sysfs_active = 0;
952 } else
953 mutex_unlock(&mddev->reconfig_mutex);
954
955 md_wakeup_thread(mddev->thread);
956 wake_up(&mddev->sb_wait);
957
958 list_for_each_entry_safe(rdev, tmp, &delete, same_set) {
959 list_del_init(&rdev->same_set);
960 kobject_del(&rdev->kobj);
961 export_rdev(rdev, mddev);
962 }
963
964 if (!legacy_async_del_gendisk) {
965 /*
		 * Call del_gendisk after releasing reconfig_mutex to avoid
		 * deadlock (e.g. calling del_gendisk under the lock while an
		 * access to sysfs files waits for the lock).
		 * MD_DELETED is only used for md raid and is set in
		 * do_md_stop. dm raid only uses md_stop to stop, so dm raid
		 * doesn't need to check MD_DELETED when taking the reconfig lock.
972 */
973 if (test_bit(MD_DELETED, &mddev->flags) &&
974 !test_and_set_bit(MD_DO_DELETE, &mddev->flags)) {
975 kobject_del(&mddev->kobj);
976 del_gendisk(mddev->gendisk);
977 }
978 }
979 }
980 EXPORT_SYMBOL_GPL(mddev_unlock);
981
struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
983 {
984 struct md_rdev *rdev;
985
986 rdev_for_each_rcu(rdev, mddev)
987 if (rdev->desc_nr == nr)
988 return rdev;
989
990 return NULL;
991 }
992 EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);
993
static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
995 {
996 struct md_rdev *rdev;
997
998 rdev_for_each(rdev, mddev)
999 if (rdev->bdev->bd_dev == dev)
1000 return rdev;
1001
1002 return NULL;
1003 }
1004
struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev)
1006 {
1007 struct md_rdev *rdev;
1008
1009 rdev_for_each_rcu(rdev, mddev)
1010 if (rdev->bdev->bd_dev == dev)
1011 return rdev;
1012
1013 return NULL;
1014 }
1015 EXPORT_SYMBOL_GPL(md_find_rdev_rcu);
1016
static struct md_personality *get_pers(int level, char *clevel)
1018 {
1019 struct md_personality *ret = NULL;
1020 struct md_submodule_head *head;
1021 unsigned long i;
1022
1023 xa_lock(&md_submodule);
1024 xa_for_each(&md_submodule, i, head) {
1025 if (head->type != MD_PERSONALITY)
1026 continue;
1027 if ((level != LEVEL_NONE && head->id == level) ||
1028 !strcmp(head->name, clevel)) {
1029 if (try_module_get(head->owner))
1030 ret = (void *)head;
1031 break;
1032 }
1033 }
1034 xa_unlock(&md_submodule);
1035
1036 if (!ret) {
1037 if (level != LEVEL_NONE)
1038 pr_warn("md: personality for level %d is not loaded!\n",
1039 level);
1040 else
1041 pr_warn("md: personality for level %s is not loaded!\n",
1042 clevel);
1043 }
1044
1045 return ret;
1046 }
1047
static void put_pers(struct md_personality *pers)
1049 {
1050 module_put(pers->head.owner);
1051 }
1052
1053 /* return the offset of the super block in 512byte sectors */
static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
1055 {
1056 return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev));
1057 }
1058
static int alloc_disk_sb(struct md_rdev *rdev)
1060 {
1061 rdev->sb_page = alloc_page(GFP_KERNEL);
1062 if (!rdev->sb_page)
1063 return -ENOMEM;
1064 return 0;
1065 }
1066
void md_rdev_clear(struct md_rdev *rdev)
1068 {
1069 if (rdev->sb_page) {
1070 put_page(rdev->sb_page);
1071 rdev->sb_loaded = 0;
1072 rdev->sb_page = NULL;
1073 rdev->sb_start = 0;
1074 rdev->sectors = 0;
1075 }
1076 if (rdev->bb_page) {
1077 put_page(rdev->bb_page);
1078 rdev->bb_page = NULL;
1079 }
1080 badblocks_exit(&rdev->badblocks);
1081 }
1082 EXPORT_SYMBOL_GPL(md_rdev_clear);
1083
static void super_written(struct bio *bio)
1085 {
1086 struct md_rdev *rdev = bio->bi_private;
1087 struct mddev *mddev = rdev->mddev;
1088
1089 if (bio->bi_status) {
1090 pr_err("md: %s gets error=%d\n", __func__,
1091 blk_status_to_errno(bio->bi_status));
1092 md_error(mddev, rdev);
1093 if (!test_bit(Faulty, &rdev->flags)
1094 && (bio->bi_opf & MD_FAILFAST)) {
1095 set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
1096 set_bit(LastDev, &rdev->flags);
1097 }
1098 } else
1099 clear_bit(LastDev, &rdev->flags);
1100
1101 bio_put(bio);
1102
1103 rdev_dec_pending(rdev, mddev);
1104
1105 if (atomic_dec_and_test(&mddev->pending_writes))
1106 wake_up(&mddev->sb_wait);
1107 }
1108
1109 /**
1110 * md_write_metadata - write metadata to underlying disk, including
1111 * array superblock, badblocks, bitmap superblock and bitmap bits.
1112 * @mddev: the array to write
1113 * @rdev: the underlying disk to write
1114 * @sector: the offset to @rdev
1115 * @size: the length of the metadata
1116 * @page: the metadata
1117 * @offset: the offset to @page
1118 *
 * Write @size bytes of @page, starting at @offset, to @sector of @rdev.
 * Increment mddev->pending_writes before returning, and decrement it on
 * completion, waking up sb_wait. Caller must call md_super_wait() after
 * issuing io to all rdevs. If an error occurs, md_error() will be called,
 * and the @rdev will be kicked out of @mddev.
1124 */
void md_write_metadata(struct mddev *mddev, struct md_rdev *rdev,
		       sector_t sector, int size, struct page *page,
		       unsigned int offset)
1128 {
1129 struct bio *bio;
1130
1131 if (!page)
1132 return;
1133
1134 if (test_bit(Faulty, &rdev->flags))
1135 return;
1136
1137 bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev,
1138 1,
1139 REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_META
1140 | REQ_PREFLUSH | REQ_FUA,
1141 GFP_NOIO, &mddev->sync_set);
1142
1143 atomic_inc(&rdev->nr_pending);
1144
1145 bio->bi_iter.bi_sector = sector;
1146 __bio_add_page(bio, page, size, offset);
1147 bio->bi_private = rdev;
1148 bio->bi_end_io = super_written;
1149
1150 if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
1151 test_bit(FailFast, &rdev->flags) &&
1152 !test_bit(LastDev, &rdev->flags))
1153 bio->bi_opf |= MD_FAILFAST;
1154
1155 atomic_inc(&mddev->pending_writes);
1156 submit_bio(bio);
1157 }
1158
int md_super_wait(struct mddev *mddev)
1160 {
1161 /* wait for all superblock writes that were scheduled to complete */
1162 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
1163 if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags))
1164 return -EAGAIN;
1165 return 0;
1166 }
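
/*
 * Illustrative sketch only: as documented above, md_write_metadata() is
 * asynchronous, so a caller writes the metadata to every rdev and then waits
 * for all of it with md_super_wait(). The function below is hypothetical;
 * the real superblock update path adds more state handling.
 */
static inline int example_write_sb_to_all_rdevs(struct mddev *mddev)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev)
		md_write_metadata(mddev, rdev, rdev->sb_start,
				  rdev->sb_size, rdev->sb_page, 0);

	/* -EAGAIN here means a failfast write needs to be retried. */
	return md_super_wait(mddev);
}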
1167
int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
		 struct page *page, blk_opf_t opf, bool metadata_op)
1170 {
1171 struct bio bio;
1172 struct bio_vec bvec;
1173
1174 if (metadata_op && rdev->meta_bdev)
1175 bio_init(&bio, rdev->meta_bdev, &bvec, 1, opf);
1176 else
1177 bio_init(&bio, rdev->bdev, &bvec, 1, opf);
1178
1179 if (metadata_op)
1180 bio.bi_iter.bi_sector = sector + rdev->sb_start;
1181 else if (rdev->mddev->reshape_position != MaxSector &&
1182 (rdev->mddev->reshape_backwards ==
1183 (sector >= rdev->mddev->reshape_position)))
1184 bio.bi_iter.bi_sector = sector + rdev->new_data_offset;
1185 else
1186 bio.bi_iter.bi_sector = sector + rdev->data_offset;
1187 __bio_add_page(&bio, page, size, 0);
1188
1189 submit_bio_wait(&bio);
1190
1191 return !bio.bi_status;
1192 }
1193 EXPORT_SYMBOL_GPL(sync_page_io);
1194
static int read_disk_sb(struct md_rdev *rdev, int size)
1196 {
1197 if (rdev->sb_loaded)
1198 return 0;
1199
1200 if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true))
1201 goto fail;
1202 rdev->sb_loaded = 1;
1203 return 0;
1204
1205 fail:
1206 pr_err("md: disabled device %pg, could not read superblock.\n",
1207 rdev->bdev);
1208 return -EINVAL;
1209 }
1210
static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
1212 {
1213 return sb1->set_uuid0 == sb2->set_uuid0 &&
1214 sb1->set_uuid1 == sb2->set_uuid1 &&
1215 sb1->set_uuid2 == sb2->set_uuid2 &&
1216 sb1->set_uuid3 == sb2->set_uuid3;
1217 }
1218
static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
1220 {
1221 int ret;
1222 mdp_super_t *tmp1, *tmp2;
1223
1224 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
1225 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
1226
1227 if (!tmp1 || !tmp2) {
1228 ret = 0;
1229 goto abort;
1230 }
1231
1232 *tmp1 = *sb1;
1233 *tmp2 = *sb2;
1234
1235 /*
1236 * nr_disks is not constant
1237 */
1238 tmp1->nr_disks = 0;
1239 tmp2->nr_disks = 0;
1240
1241 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
1242 abort:
1243 kfree(tmp1);
1244 kfree(tmp2);
1245 return ret;
1246 }
1247
static u32 md_csum_fold(u32 csum)
1249 {
1250 csum = (csum & 0xffff) + (csum >> 16);
1251 return (csum & 0xffff) + (csum >> 16);
1252 }
1253
static unsigned int calc_sb_csum(mdp_super_t *sb)
1255 {
1256 u64 newcsum = 0;
1257 u32 *sb32 = (u32*)sb;
1258 int i;
1259 unsigned int disk_csum, csum;
1260
1261 disk_csum = sb->sb_csum;
1262 sb->sb_csum = 0;
1263
1264 for (i = 0; i < MD_SB_BYTES/4 ; i++)
1265 newcsum += sb32[i];
1266 csum = (newcsum & 0xffffffff) + (newcsum>>32);
1267
1268 #ifdef CONFIG_ALPHA
1269 /* This used to use csum_partial, which was wrong for several
1270 * reasons including that different results are returned on
1271 * different architectures. It isn't critical that we get exactly
1272 * the same return value as before (we always csum_fold before
1273 * testing, and that removes any differences). However as we
1274 * know that csum_partial always returned a 16bit value on
1275 * alphas, do a fold to maximise conformity to previous behaviour.
1276 */
1277 sb->sb_csum = md_csum_fold(disk_csum);
1278 #else
1279 sb->sb_csum = disk_csum;
1280 #endif
1281 return csum;
1282 }
1283
1284 /*
1285 * Handle superblock details.
1286 * We want to be able to handle multiple superblock formats
1287 * so we have a common interface to them all, and an array of
1288 * different handlers.
1289 * We rely on user-space to write the initial superblock, and support
1290 * reading and updating of superblocks.
1291 * Interface methods are:
1292 * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
1293 * loads and validates a superblock on dev.
1294 * if refdev != NULL, compare superblocks on both devices
1295 * Return:
1296 * 0 - dev has a superblock that is compatible with refdev
1297 * 1 - dev has a superblock that is compatible and newer than refdev
1298 * so dev should be used as the refdev in future
1299 * -EINVAL superblock incompatible or invalid
1300 * -othererror e.g. -EIO
1301 *
1302 * int validate_super(struct mddev *mddev, struct md_rdev *dev)
1303 * Verify that dev is acceptable into mddev.
1304 * The first time, mddev->raid_disks will be 0, and data from
1305 * dev should be merged in. Subsequent calls check that dev
1306 * is new enough. Return 0 or -EINVAL
1307 *
1308 * void sync_super(struct mddev *mddev, struct md_rdev *dev)
1309 * Update the superblock for rdev with data in mddev
1310 * This does not write to disc.
1311 *
1312 */
1313
1314 struct super_type {
1315 char *name;
1316 struct module *owner;
1317 int (*load_super)(struct md_rdev *rdev,
1318 struct md_rdev *refdev,
1319 int minor_version);
1320 int (*validate_super)(struct mddev *mddev,
1321 struct md_rdev *freshest,
1322 struct md_rdev *rdev);
1323 void (*sync_super)(struct mddev *mddev,
1324 struct md_rdev *rdev);
1325 unsigned long long (*rdev_size_change)(struct md_rdev *rdev,
1326 sector_t num_sectors);
1327 int (*allow_new_offset)(struct md_rdev *rdev,
1328 unsigned long long new_offset);
1329 };
1330
1331 /*
1332 * Check that the given mddev has no bitmap.
1333 *
1334 * This function is called from the run method of all personalities that do not
1335 * support bitmaps. It prints an error message and returns non-zero if mddev
1336 * has a bitmap. Otherwise, it returns 0.
1337 *
1338 */
int md_check_no_bitmap(struct mddev *mddev)
1340 {
1341 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
1342 return 0;
1343 pr_warn("%s: bitmaps are not supported for %s\n",
1344 mdname(mddev), mddev->pers->head.name);
1345 return 1;
1346 }
1347 EXPORT_SYMBOL(md_check_no_bitmap);
1348
1349 /*
1350 * load_super for 0.90.0
1351 */
static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1353 {
1354 mdp_super_t *sb;
1355 int ret;
1356 bool spare_disk = true;
1357
1358 /*
1359 * Calculate the position of the superblock (512byte sectors),
1360 * it's at the end of the disk.
1361 *
1362 * It also happens to be a multiple of 4Kb.
1363 */
1364 rdev->sb_start = calc_dev_sboffset(rdev);
1365
1366 ret = read_disk_sb(rdev, MD_SB_BYTES);
1367 if (ret)
1368 return ret;
1369
1370 ret = -EINVAL;
1371
1372 sb = page_address(rdev->sb_page);
1373
1374 if (sb->md_magic != MD_SB_MAGIC) {
1375 pr_warn("md: invalid raid superblock magic on %pg\n",
1376 rdev->bdev);
1377 goto abort;
1378 }
1379
1380 if (sb->major_version != 0 ||
1381 sb->minor_version < 90 ||
1382 sb->minor_version > 91) {
1383 pr_warn("Bad version number %d.%d on %pg\n",
1384 sb->major_version, sb->minor_version, rdev->bdev);
1385 goto abort;
1386 }
1387
1388 if (sb->raid_disks <= 0)
1389 goto abort;
1390
1391 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
1392 pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev);
1393 goto abort;
1394 }
1395
1396 rdev->preferred_minor = sb->md_minor;
1397 rdev->data_offset = 0;
1398 rdev->new_data_offset = 0;
1399 rdev->sb_size = MD_SB_BYTES;
1400 rdev->badblocks.shift = -1;
1401
1402 rdev->desc_nr = sb->this_disk.number;
1403
1404 /* not spare disk */
1405 if (rdev->desc_nr >= 0 && rdev->desc_nr < MD_SB_DISKS &&
1406 sb->disks[rdev->desc_nr].state & ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1407 spare_disk = false;
1408
1409 if (!refdev) {
1410 if (!spare_disk)
1411 ret = 1;
1412 else
1413 ret = 0;
1414 } else {
1415 __u64 ev1, ev2;
1416 mdp_super_t *refsb = page_address(refdev->sb_page);
1417 if (!md_uuid_equal(refsb, sb)) {
1418 pr_warn("md: %pg has different UUID to %pg\n",
1419 rdev->bdev, refdev->bdev);
1420 goto abort;
1421 }
1422 if (!md_sb_equal(refsb, sb)) {
1423 pr_warn("md: %pg has same UUID but different superblock to %pg\n",
1424 rdev->bdev, refdev->bdev);
1425 goto abort;
1426 }
1427 ev1 = md_event(sb);
1428 ev2 = md_event(refsb);
1429
1430 if (!spare_disk && ev1 > ev2)
1431 ret = 1;
1432 else
1433 ret = 0;
1434 }
1435 rdev->sectors = rdev->sb_start;
1436 /* Limit to 4TB as metadata cannot record more than that.
1437 * (not needed for Linear and RAID0 as metadata doesn't
1438 * record this size)
1439 */
1440 if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1)
1441 rdev->sectors = (sector_t)(2ULL << 32) - 2;
1442
1443 if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
1444 /* "this cannot possibly happen" ... */
1445 ret = -EINVAL;
1446
1447 abort:
1448 return ret;
1449 }
1450
static u64 md_bitmap_events_cleared(struct mddev *mddev)
1452 {
1453 struct md_bitmap_stats stats;
1454 int err;
1455
1456 if (!md_bitmap_enabled(mddev, false))
1457 return 0;
1458
1459 err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
1460 if (err)
1461 return 0;
1462
1463 return stats.events_cleared;
1464 }
1465
1466 /*
1467 * validate_super for 0.90.0
1468 * note: we are not using "freshest" for 0.9 superblock
1469 */
static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
1471 {
1472 mdp_disk_t *desc;
1473 mdp_super_t *sb = page_address(rdev->sb_page);
1474 __u64 ev1 = md_event(sb);
1475
1476 rdev->raid_disk = -1;
1477 clear_bit(Faulty, &rdev->flags);
1478 clear_bit(In_sync, &rdev->flags);
1479 clear_bit(Bitmap_sync, &rdev->flags);
1480 clear_bit(WriteMostly, &rdev->flags);
1481
1482 if (mddev->raid_disks == 0) {
1483 mddev->major_version = 0;
1484 mddev->minor_version = sb->minor_version;
1485 mddev->patch_version = sb->patch_version;
1486 mddev->external = 0;
1487 mddev->chunk_sectors = sb->chunk_size >> 9;
1488 mddev->ctime = sb->ctime;
1489 mddev->utime = sb->utime;
1490 mddev->level = sb->level;
1491 mddev->clevel[0] = 0;
1492 mddev->layout = sb->layout;
1493 mddev->raid_disks = sb->raid_disks;
1494 mddev->dev_sectors = ((sector_t)sb->size) * 2;
1495 mddev->events = ev1;
1496 mddev->bitmap_info.offset = 0;
1497 mddev->bitmap_info.space = 0;
1498 /* bitmap can use 60 K after the 4K superblocks */
1499 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
1500 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
1501 mddev->reshape_backwards = 0;
1502
1503 if (mddev->minor_version >= 91) {
1504 mddev->reshape_position = sb->reshape_position;
1505 mddev->delta_disks = sb->delta_disks;
1506 mddev->new_level = sb->new_level;
1507 mddev->new_layout = sb->new_layout;
1508 mddev->new_chunk_sectors = sb->new_chunk >> 9;
1509 if (mddev->delta_disks < 0)
1510 mddev->reshape_backwards = 1;
1511 } else {
1512 mddev->reshape_position = MaxSector;
1513 mddev->delta_disks = 0;
1514 mddev->new_level = mddev->level;
1515 mddev->new_layout = mddev->layout;
1516 mddev->new_chunk_sectors = mddev->chunk_sectors;
1517 }
1518 if (mddev->level == 0)
1519 mddev->layout = -1;
1520
1521 if (sb->state & (1<<MD_SB_CLEAN))
1522 mddev->resync_offset = MaxSector;
1523 else {
1524 if (sb->events_hi == sb->cp_events_hi &&
1525 sb->events_lo == sb->cp_events_lo) {
1526 mddev->resync_offset = sb->recovery_cp;
1527 } else
1528 mddev->resync_offset = 0;
1529 }
1530
1531 memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
1532 memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
1533 memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
1534 memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
1535
1536 mddev->max_disks = MD_SB_DISKS;
1537
1538 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
1539 mddev->bitmap_info.file == NULL) {
1540 mddev->bitmap_info.offset =
1541 mddev->bitmap_info.default_offset;
1542 mddev->bitmap_info.space =
1543 mddev->bitmap_info.default_space;
1544 }
1545
1546 } else if (mddev->pers == NULL) {
1547 /* Insist on good event counter while assembling, except
1548 * for spares (which don't need an event count) */
1549 ++ev1;
1550 if (sb->disks[rdev->desc_nr].state & (
1551 (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1552 if (ev1 < mddev->events)
1553 return -EINVAL;
1554 } else if (mddev->bitmap) {
1555 /* if adding to array with a bitmap, then we can accept an
1556 * older device ... but not too old.
1557 */
1558 if (ev1 < md_bitmap_events_cleared(mddev))
1559 return 0;
1560 if (ev1 < mddev->events)
1561 set_bit(Bitmap_sync, &rdev->flags);
1562 } else {
1563 if (ev1 < mddev->events)
1564 /* just a hot-add of a new device, leave raid_disk at -1 */
1565 return 0;
1566 }
1567
1568 desc = sb->disks + rdev->desc_nr;
1569
1570 if (desc->state & (1<<MD_DISK_FAULTY))
1571 set_bit(Faulty, &rdev->flags);
1572 else if (desc->state & (1<<MD_DISK_SYNC)) {
1573 set_bit(In_sync, &rdev->flags);
1574 rdev->raid_disk = desc->raid_disk;
1575 rdev->saved_raid_disk = desc->raid_disk;
1576 } else if (desc->state & (1<<MD_DISK_ACTIVE)) {
1577 /* active but not in sync implies recovery up to
1578 * reshape position. We don't know exactly where
1579 * that is, so set to zero for now
1580 */
1581 if (mddev->minor_version >= 91) {
1582 rdev->recovery_offset = 0;
1583 rdev->raid_disk = desc->raid_disk;
1584 }
1585 }
1586 if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
1587 set_bit(WriteMostly, &rdev->flags);
1588 if (desc->state & (1<<MD_DISK_FAILFAST))
1589 set_bit(FailFast, &rdev->flags);
1590 return 0;
1591 }
1592
1593 /*
1594 * sync_super for 0.90.0
1595 */
static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
1597 {
1598 mdp_super_t *sb;
1599 struct md_rdev *rdev2;
1600 int next_spare = mddev->raid_disks;
1601
1602 /* make rdev->sb match mddev data..
1603 *
1604 * 1/ zero out disks
1605 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
1606 * 3/ any empty disks < next_spare become removed
1607 *
1608 * disks[0] gets initialised to REMOVED because
1609 * we cannot be sure from other fields if it has
1610 * been initialised or not.
1611 */
1612 int i;
1613 int active=0, working=0,failed=0,spare=0,nr_disks=0;
1614
1615 rdev->sb_size = MD_SB_BYTES;
1616
1617 sb = page_address(rdev->sb_page);
1618
1619 memset(sb, 0, sizeof(*sb));
1620
1621 sb->md_magic = MD_SB_MAGIC;
1622 sb->major_version = mddev->major_version;
1623 sb->patch_version = mddev->patch_version;
1624 sb->gvalid_words = 0; /* ignored */
1625 memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
1626 memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
1627 memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
1628 memcpy(&sb->set_uuid3, mddev->uuid+12,4);
1629
1630 sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
1631 sb->level = mddev->level;
1632 sb->size = mddev->dev_sectors / 2;
1633 sb->raid_disks = mddev->raid_disks;
1634 sb->md_minor = mddev->md_minor;
1635 sb->not_persistent = 0;
1636 sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
1637 sb->state = 0;
1638 sb->events_hi = (mddev->events>>32);
1639 sb->events_lo = (u32)mddev->events;
1640
1641 if (mddev->reshape_position == MaxSector)
1642 sb->minor_version = 90;
1643 else {
1644 sb->minor_version = 91;
1645 sb->reshape_position = mddev->reshape_position;
1646 sb->new_level = mddev->new_level;
1647 sb->delta_disks = mddev->delta_disks;
1648 sb->new_layout = mddev->new_layout;
1649 sb->new_chunk = mddev->new_chunk_sectors << 9;
1650 }
1651 mddev->minor_version = sb->minor_version;
1652 if (mddev->in_sync)
1653 {
1654 sb->recovery_cp = mddev->resync_offset;
1655 sb->cp_events_hi = (mddev->events>>32);
1656 sb->cp_events_lo = (u32)mddev->events;
1657 if (mddev->resync_offset == MaxSector)
1658 sb->state = (1<< MD_SB_CLEAN);
1659 } else
1660 sb->recovery_cp = 0;
1661
1662 sb->layout = mddev->layout;
1663 sb->chunk_size = mddev->chunk_sectors << 9;
1664
1665 if (mddev->bitmap && mddev->bitmap_info.file == NULL)
1666 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1667
1668 sb->disks[0].state = (1<<MD_DISK_REMOVED);
1669 rdev_for_each(rdev2, mddev) {
1670 mdp_disk_t *d;
1671 int desc_nr;
1672 int is_active = test_bit(In_sync, &rdev2->flags);
1673
1674 if (rdev2->raid_disk >= 0 &&
1675 sb->minor_version >= 91)
1676 /* we have nowhere to store the recovery_offset,
1677 * but if it is not below the reshape_position,
1678 * we can piggy-back on that.
1679 */
1680 is_active = 1;
1681 if (rdev2->raid_disk < 0 ||
1682 test_bit(Faulty, &rdev2->flags))
1683 is_active = 0;
1684 if (is_active)
1685 desc_nr = rdev2->raid_disk;
1686 else
1687 desc_nr = next_spare++;
1688 rdev2->desc_nr = desc_nr;
1689 d = &sb->disks[rdev2->desc_nr];
1690 nr_disks++;
1691 d->number = rdev2->desc_nr;
1692 d->major = MAJOR(rdev2->bdev->bd_dev);
1693 d->minor = MINOR(rdev2->bdev->bd_dev);
1694 if (is_active)
1695 d->raid_disk = rdev2->raid_disk;
1696 else
1697 d->raid_disk = rdev2->desc_nr; /* compatibility */
1698 if (test_bit(Faulty, &rdev2->flags))
1699 d->state = (1<<MD_DISK_FAULTY);
1700 else if (is_active) {
1701 d->state = (1<<MD_DISK_ACTIVE);
1702 if (test_bit(In_sync, &rdev2->flags))
1703 d->state |= (1<<MD_DISK_SYNC);
1704 active++;
1705 working++;
1706 } else {
1707 d->state = 0;
1708 spare++;
1709 working++;
1710 }
1711 if (test_bit(WriteMostly, &rdev2->flags))
1712 d->state |= (1<<MD_DISK_WRITEMOSTLY);
1713 if (test_bit(FailFast, &rdev2->flags))
1714 d->state |= (1<<MD_DISK_FAILFAST);
1715 }
1716 /* now set the "removed" and "faulty" bits on any missing devices */
1717 for (i=0 ; i < mddev->raid_disks ; i++) {
1718 mdp_disk_t *d = &sb->disks[i];
1719 if (d->state == 0 && d->number == 0) {
1720 d->number = i;
1721 d->raid_disk = i;
1722 d->state = (1<<MD_DISK_REMOVED);
1723 d->state |= (1<<MD_DISK_FAULTY);
1724 failed++;
1725 }
1726 }
1727 sb->nr_disks = nr_disks;
1728 sb->active_disks = active;
1729 sb->working_disks = working;
1730 sb->failed_disks = failed;
1731 sb->spare_disks = spare;
1732
1733 sb->this_disk = sb->disks[rdev->desc_nr];
1734 sb->sb_csum = calc_sb_csum(sb);
1735 }
1736
1737 /*
1738 * rdev_size_change for 0.90.0
1739 */
1740 static unsigned long long
super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1742 {
1743 if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1744 return 0; /* component must fit device */
1745 if (rdev->mddev->bitmap_info.offset)
1746 return 0; /* can't move bitmap */
1747 rdev->sb_start = calc_dev_sboffset(rdev);
1748 if (!num_sectors || num_sectors > rdev->sb_start)
1749 num_sectors = rdev->sb_start;
1750 /* Limit to 4TB as metadata cannot record more than that.
1751 * 4TB == 2^32 KB, or 2*2^32 sectors.
1752 */
1753 if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
1754 num_sectors = (sector_t)(2ULL << 32) - 2;
1755 do {
1756 md_write_metadata(rdev->mddev, rdev, rdev->sb_start,
1757 rdev->sb_size, rdev->sb_page, 0);
1758 } while (md_super_wait(rdev->mddev) < 0);
1759 return num_sectors;
1760 }
1761
1762 static int
super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
1764 {
1765 /* non-zero offset changes not possible with v0.90 */
1766 return new_offset == 0;
1767 }
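
/*
 * Illustrative sketch only: handlers like the 0.90 routines above are wired
 * into the superblock-format table through struct super_type (defined earlier
 * in this file). An entry would look roughly like this; the actual table in
 * md.c also carries the v1.x handlers.
 *
 *	{
 *		.name			= "0.90.0",
 *		.owner			= THIS_MODULE,
 *		.load_super		= super_90_load,
 *		.validate_super		= super_90_validate,
 *		.sync_super		= super_90_sync,
 *		.rdev_size_change	= super_90_rdev_size_change,
 *		.allow_new_offset	= super_90_allow_new_offset,
 *	},
 */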
1768
1769 /*
1770 * version 1 superblock
1771 */
1772
static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
1774 {
1775 __le32 disk_csum;
1776 u32 csum;
1777 unsigned long long newcsum;
1778 int size = 256 + le32_to_cpu(sb->max_dev)*2;
1779 __le32 *isuper = (__le32*)sb;
1780
1781 disk_csum = sb->sb_csum;
1782 sb->sb_csum = 0;
1783 newcsum = 0;
1784 for (; size >= 4; size -= 4)
1785 newcsum += le32_to_cpu(*isuper++);
1786
1787 if (size == 2)
1788 newcsum += le16_to_cpu(*(__le16*) isuper);
1789
1790 csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1791 sb->sb_csum = disk_csum;
1792 return cpu_to_le32(csum);
1793 }
1794
static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1796 {
1797 struct mdp_superblock_1 *sb;
1798 int ret;
1799 sector_t sb_start;
1800 sector_t sectors;
1801 int bmask;
1802 bool spare_disk = true;
1803
1804 /*
1805 * Calculate the position of the superblock in 512byte sectors.
1806 * It is always aligned to a 4K boundary and
	 * depending on minor_version, it can be:
1808 * 0: At least 8K, but less than 12K, from end of device
1809 * 1: At start of device
1810 * 2: 4K from start of device.
1811 */
1812 switch(minor_version) {
1813 case 0:
1814 sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2;
1815 sb_start &= ~(sector_t)(4*2-1);
1816 break;
1817 case 1:
1818 sb_start = 0;
1819 break;
1820 case 2:
1821 sb_start = 8;
1822 break;
1823 default:
1824 return -EINVAL;
1825 }
1826 rdev->sb_start = sb_start;
1827
1828 /* superblock is rarely larger than 1K, but it can be larger,
1829 * and it is safe to read 4k, so we do that
1830 */
1831 ret = read_disk_sb(rdev, 4096);
1832 if (ret) return ret;
1833
1834 sb = page_address(rdev->sb_page);
1835
1836 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1837 sb->major_version != cpu_to_le32(1) ||
1838 le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1839 le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1840 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1841 return -EINVAL;
1842
1843 if (calc_sb_1_csum(sb) != sb->sb_csum) {
1844 pr_warn("md: invalid superblock checksum on %pg\n",
1845 rdev->bdev);
1846 return -EINVAL;
1847 }
1848 if (le64_to_cpu(sb->data_size) < 10) {
1849 pr_warn("md: data_size too small on %pg\n",
1850 rdev->bdev);
1851 return -EINVAL;
1852 }
1853 if (sb->pad0 ||
1854 sb->pad3[0] ||
1855 memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) {
1856 pr_warn("Some padding is non-zero on %pg, might be a new feature\n",
1857 rdev->bdev);
1858 if (check_new_feature)
1859 return -EINVAL;
1860 pr_warn("check_new_feature is disabled, data corruption possible\n");
1861 }
1862
1863 rdev->preferred_minor = 0xffff;
1864 rdev->data_offset = le64_to_cpu(sb->data_offset);
1865 rdev->new_data_offset = rdev->data_offset;
1866 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
1867 (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
1868 rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
1869 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1870
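/*
 * sb_size is the 256-byte fixed part plus 2 bytes per possible device,
 * rounded up below to the bdev's logical block size: e.g. with
 * max_dev == 384 that is 1024 bytes, rounded up to 4096 on a 4K device.
 */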
1871 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1872 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1873 if (rdev->sb_size & bmask)
1874 rdev->sb_size = (rdev->sb_size | bmask) + 1;
1875
1876 if (minor_version
1877 && rdev->data_offset < sb_start + (rdev->sb_size/512))
1878 return -EINVAL;
1879 if (minor_version
1880 && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
1881 return -EINVAL;
1882
1883 rdev->desc_nr = le32_to_cpu(sb->dev_number);
1884
1885 if (!rdev->bb_page) {
1886 rdev->bb_page = alloc_page(GFP_KERNEL);
1887 if (!rdev->bb_page)
1888 return -ENOMEM;
1889 }
1890 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
1891 rdev->badblocks.count == 0) {
1892 /* need to load the bad block list.
1893 * Currently we limit it to one page.
1894 */
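/*
 * Each entry is a little-endian u64: the high 54 bits hold the start
 * sector and the low 10 bits the length, both scaled by bblog_shift;
 * an all-ones entry terminates the list.
 */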
1895 s32 offset;
1896 sector_t bb_sector;
1897 __le64 *bbp;
1898 int i;
1899 int sectors = le16_to_cpu(sb->bblog_size);
1900 if (sectors > (PAGE_SIZE / 512))
1901 return -EINVAL;
1902 offset = le32_to_cpu(sb->bblog_offset);
1903 if (offset == 0)
1904 return -EINVAL;
1905 bb_sector = (long long)offset;
1906 if (!sync_page_io(rdev, bb_sector, sectors << 9,
1907 rdev->bb_page, REQ_OP_READ, true))
1908 return -EIO;
1909 bbp = (__le64 *)page_address(rdev->bb_page);
1910 rdev->badblocks.shift = sb->bblog_shift;
1911 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
1912 u64 bb = le64_to_cpu(*bbp);
1913 int count = bb & (0x3ff);
1914 u64 sector = bb >> 10;
1915 sector <<= sb->bblog_shift;
1916 count <<= sb->bblog_shift;
1917 if (bb + 1 == 0)
1918 break;
1919 if (!badblocks_set(&rdev->badblocks, sector, count, 1))
1920 return -EINVAL;
1921 }
1922 } else if (sb->bblog_offset != 0)
1923 rdev->badblocks.shift = 0;
1924
1925 if ((le32_to_cpu(sb->feature_map) &
1926 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) {
1927 rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
1928 rdev->ppl.size = le16_to_cpu(sb->ppl.size);
1929 rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
1930 }
1931
1932 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) &&
1933 sb->level != 0)
1934 return -EINVAL;
1935
1936 /* not spare disk */
1937 if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1938 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
1939 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
1940 spare_disk = false;
1941
1942 if (!refdev) {
1943 if (!spare_disk)
1944 ret = 1;
1945 else
1946 ret = 0;
1947 } else {
1948 __u64 ev1, ev2;
1949 struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
1950
1951 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1952 sb->level != refsb->level ||
1953 sb->layout != refsb->layout ||
1954 sb->chunksize != refsb->chunksize) {
1955 pr_warn("md: %pg has strangely different superblock to %pg\n",
1956 rdev->bdev,
1957 refdev->bdev);
1958 return -EINVAL;
1959 }
1960 ev1 = le64_to_cpu(sb->events);
1961 ev2 = le64_to_cpu(refsb->events);
1962
1963 if (!spare_disk && ev1 > ev2)
1964 ret = 1;
1965 else
1966 ret = 0;
1967 }
1968 if (minor_version)
1969 sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset;
1970 else
1971 sectors = rdev->sb_start;
1972 if (sectors < le64_to_cpu(sb->data_size))
1973 return -EINVAL;
1974 rdev->sectors = le64_to_cpu(sb->data_size);
1975 return ret;
1976 }
1977
static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
1979 {
1980 struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1981 __u64 ev1 = le64_to_cpu(sb->events);
1982 int role;
1983
1984 rdev->raid_disk = -1;
1985 clear_bit(Faulty, &rdev->flags);
1986 clear_bit(In_sync, &rdev->flags);
1987 clear_bit(Bitmap_sync, &rdev->flags);
1988 clear_bit(WriteMostly, &rdev->flags);
1989
1990 if (mddev->raid_disks == 0) {
1991 mddev->major_version = 1;
1992 mddev->patch_version = 0;
1993 mddev->external = 0;
1994 mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1995 mddev->ctime = le64_to_cpu(sb->ctime);
1996 mddev->utime = le64_to_cpu(sb->utime);
1997 mddev->level = le32_to_cpu(sb->level);
1998 mddev->clevel[0] = 0;
1999 mddev->layout = le32_to_cpu(sb->layout);
2000 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
2001 mddev->dev_sectors = le64_to_cpu(sb->size);
2002 mddev->events = ev1;
2003 mddev->bitmap_info.offset = 0;
2004 mddev->bitmap_info.space = 0;
2005 /* Default location for bitmap is 1K after superblock
2006 * using 3K - total of 4K
2007 */
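/*
 * In sectors: 1024 >> 9 == 2 and (4096 - 1024) >> 9 == 6, so the
 * bitmap defaults to starting 2 sectors after the superblock with up
 * to 6 sectors (3K) of space.
 */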
2008 mddev->bitmap_info.default_offset = 1024 >> 9;
2009 mddev->bitmap_info.default_space = (4096-1024) >> 9;
2010 mddev->reshape_backwards = 0;
2011
2012 mddev->resync_offset = le64_to_cpu(sb->resync_offset);
2013 memcpy(mddev->uuid, sb->set_uuid, 16);
2014
2015 mddev->max_disks = (4096-256)/2;
2016
2017 if (!mddev->logical_block_size)
2018 mddev->logical_block_size = le32_to_cpu(sb->logical_block_size);
2019
2020 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
2021 mddev->bitmap_info.file == NULL) {
2022 mddev->bitmap_info.offset =
2023 (__s32)le32_to_cpu(sb->bitmap_offset);
2024 /* Metadata doesn't record how much space is available.
2025 * For 1.0, we assume we can use up to the superblock
2026 * if before, else to 4K beyond superblock.
2027 * For others, assume no change is possible.
2028 */
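/*
 * For example, 1.0 metadata with bitmap_offset == 2 (1K past the
 * superblock) leaves 8 - 2 == 6 sectors (3K) of bitmap space.
 */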
2029 if (mddev->minor_version > 0)
2030 mddev->bitmap_info.space = 0;
2031 else if (mddev->bitmap_info.offset > 0)
2032 mddev->bitmap_info.space =
2033 8 - mddev->bitmap_info.offset;
2034 else
2035 mddev->bitmap_info.space =
2036 -mddev->bitmap_info.offset;
2037 }
2038
2039 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
2040 mddev->reshape_position = le64_to_cpu(sb->reshape_position);
2041 mddev->delta_disks = le32_to_cpu(sb->delta_disks);
2042 mddev->new_level = le32_to_cpu(sb->new_level);
2043 mddev->new_layout = le32_to_cpu(sb->new_layout);
2044 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
2045 if (mddev->delta_disks < 0 ||
2046 (mddev->delta_disks == 0 &&
2047 (le32_to_cpu(sb->feature_map)
2048 & MD_FEATURE_RESHAPE_BACKWARDS)))
2049 mddev->reshape_backwards = 1;
2050 } else {
2051 mddev->reshape_position = MaxSector;
2052 mddev->delta_disks = 0;
2053 mddev->new_level = mddev->level;
2054 mddev->new_layout = mddev->layout;
2055 mddev->new_chunk_sectors = mddev->chunk_sectors;
2056 }
2057
2058 if (mddev->level == 0 &&
2059 !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT))
2060 mddev->layout = -1;
2061
2062 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
2063 set_bit(MD_HAS_JOURNAL, &mddev->flags);
2064
2065 if (le32_to_cpu(sb->feature_map) &
2066 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) {
2067 if (le32_to_cpu(sb->feature_map) &
2068 (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
2069 return -EINVAL;
2070 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) &&
2071 (le32_to_cpu(sb->feature_map) &
2072 MD_FEATURE_MULTIPLE_PPLS))
2073 return -EINVAL;
2074 set_bit(MD_HAS_PPL, &mddev->flags);
2075 }
2076 } else if (mddev->pers == NULL) {
/* Insist on a good event counter while assembling, except for
2078 * spares (which don't need an event count).
2079 * Similar to mdadm, we allow event counter difference of 1
2080 * from the freshest device.
2081 */
2082 if (rdev->desc_nr >= 0 &&
2083 rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
2084 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
2085 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
2086 if (ev1 + 1 < mddev->events)
2087 return -EINVAL;
2088 } else if (mddev->bitmap) {
2089 /* If adding to array with a bitmap, then we can accept an
2090 * older device, but not too old.
2091 */
2092 if (ev1 < md_bitmap_events_cleared(mddev))
2093 return 0;
2094 if (ev1 < mddev->events)
2095 set_bit(Bitmap_sync, &rdev->flags);
2096 } else {
2097 if (ev1 < mddev->events)
2098 /* just a hot-add of a new device, leave raid_disk at -1 */
2099 return 0;
2100 }
2101
2102 if (rdev->desc_nr < 0 ||
2103 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
2104 role = MD_DISK_ROLE_SPARE;
2105 rdev->desc_nr = -1;
2106 } else if (mddev->pers == NULL && freshest && ev1 < mddev->events) {
2107 /*
2108 * If we are assembling, and our event counter is smaller than the
2109 * highest event counter, we cannot trust our superblock about the role.
2110 * It could happen that our rdev was marked as Faulty, and all other
2111 * superblocks were updated with +1 event counter.
2112 * Then, before the next superblock update, which typically happens when
2113 * remove_and_add_spares() removes the device from the array, there was
2114 * a crash or reboot.
2115 * If we allow current rdev without consulting the freshest superblock,
2116 * we could cause data corruption.
2117 * Note that in this case our event counter is smaller by 1 than the
2118 * highest, otherwise, this rdev would not be allowed into array;
2119 * both kernel and mdadm allow event counter difference of 1.
2120 */
2121 struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page);
2122 u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev);
2123
2124 if (rdev->desc_nr >= freshest_max_dev) {
2125 /* this is unexpected, better not proceed */
2126 pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n",
2127 mdname(mddev), rdev->bdev, rdev->desc_nr,
2128 freshest->bdev, freshest_max_dev);
2129 return -EUCLEAN;
2130 }
2131
2132 role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]);
2133 pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n",
2134 mdname(mddev), rdev->bdev, role, role, freshest->bdev);
2135 } else {
2136 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
2137 }
2138 switch (role) {
2139 case MD_DISK_ROLE_SPARE: /* spare */
2140 break;
2141 case MD_DISK_ROLE_FAULTY: /* faulty */
2142 set_bit(Faulty, &rdev->flags);
2143 break;
2144 case MD_DISK_ROLE_JOURNAL: /* journal device */
2145 if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
2146 /* journal device without journal feature */
2147 pr_warn("md: journal device provided without journal feature, ignoring the device\n");
2148 return -EINVAL;
2149 }
2150 set_bit(Journal, &rdev->flags);
2151 rdev->journal_tail = le64_to_cpu(sb->journal_tail);
2152 rdev->raid_disk = 0;
2153 break;
2154 default:
2155 rdev->saved_raid_disk = role;
2156 if ((le32_to_cpu(sb->feature_map) &
2157 MD_FEATURE_RECOVERY_OFFSET)) {
2158 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
2159 if (!(le32_to_cpu(sb->feature_map) &
2160 MD_FEATURE_RECOVERY_BITMAP))
2161 rdev->saved_raid_disk = -1;
2162 } else {
2163 /*
2164 * If the array is FROZEN, then the device can't
2165 * be in_sync with rest of array.
2166 */
2167 if (!test_bit(MD_RECOVERY_FROZEN,
2168 &mddev->recovery))
2169 set_bit(In_sync, &rdev->flags);
2170 }
2171 rdev->raid_disk = role;
2172 break;
2173 }
2174 if (sb->devflags & WriteMostly1)
2175 set_bit(WriteMostly, &rdev->flags);
2176 if (sb->devflags & FailFast1)
2177 set_bit(FailFast, &rdev->flags);
2178 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
2179 set_bit(Replacement, &rdev->flags);
2180
2181 return 0;
2182 }
2183
static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
2185 {
2186 struct mdp_superblock_1 *sb;
2187 struct md_rdev *rdev2;
2188 int max_dev, i;
2189 /* make rdev->sb match mddev and rdev data. */
2190
2191 sb = page_address(rdev->sb_page);
2192
2193 sb->feature_map = 0;
2194 sb->pad0 = 0;
2195 sb->recovery_offset = cpu_to_le64(0);
2196 memset(sb->pad3, 0, sizeof(sb->pad3));
2197
2198 sb->utime = cpu_to_le64((__u64)mddev->utime);
2199 sb->events = cpu_to_le64(mddev->events);
2200 if (mddev->in_sync)
2201 sb->resync_offset = cpu_to_le64(mddev->resync_offset);
2202 else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
2203 sb->resync_offset = cpu_to_le64(MaxSector);
2204 else
2205 sb->resync_offset = cpu_to_le64(0);
2206
2207 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
2208
2209 sb->raid_disks = cpu_to_le32(mddev->raid_disks);
2210 sb->size = cpu_to_le64(mddev->dev_sectors);
2211 sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
2212 sb->level = cpu_to_le32(mddev->level);
2213 sb->layout = cpu_to_le32(mddev->layout);
2214 sb->logical_block_size = cpu_to_le32(mddev->logical_block_size);
2215 if (test_bit(FailFast, &rdev->flags))
2216 sb->devflags |= FailFast1;
2217 else
2218 sb->devflags &= ~FailFast1;
2219
2220 if (test_bit(WriteMostly, &rdev->flags))
2221 sb->devflags |= WriteMostly1;
2222 else
2223 sb->devflags &= ~WriteMostly1;
2224 sb->data_offset = cpu_to_le64(rdev->data_offset);
2225 sb->data_size = cpu_to_le64(rdev->sectors);
2226
2227 if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
2228 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
2229 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
2230 }
2231
2232 if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
2233 !test_bit(In_sync, &rdev->flags)) {
2234 sb->feature_map |=
2235 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
2236 sb->recovery_offset =
2237 cpu_to_le64(rdev->recovery_offset);
2238 if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
2239 sb->feature_map |=
2240 cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
2241 }
2242 /* Note: recovery_offset and journal_tail share space */
2243 if (test_bit(Journal, &rdev->flags))
2244 sb->journal_tail = cpu_to_le64(rdev->journal_tail);
2245 if (test_bit(Replacement, &rdev->flags))
2246 sb->feature_map |=
2247 cpu_to_le32(MD_FEATURE_REPLACEMENT);
2248
2249 if (mddev->reshape_position != MaxSector) {
2250 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
2251 sb->reshape_position = cpu_to_le64(mddev->reshape_position);
2252 sb->new_layout = cpu_to_le32(mddev->new_layout);
2253 sb->delta_disks = cpu_to_le32(mddev->delta_disks);
2254 sb->new_level = cpu_to_le32(mddev->new_level);
2255 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
2256 if (mddev->delta_disks == 0 &&
2257 mddev->reshape_backwards)
2258 sb->feature_map
2259 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
2260 if (rdev->new_data_offset != rdev->data_offset) {
2261 sb->feature_map
2262 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
2263 sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
2264 - rdev->data_offset));
2265 }
2266 }
2267
2268 if (mddev_is_clustered(mddev))
2269 sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);
2270
2271 if (rdev->badblocks.count == 0)
2272 /* Nothing to do for bad blocks*/ ;
2273 else if (sb->bblog_offset == 0)
2274 /* Cannot record bad blocks on this device */
2275 md_error(mddev, rdev);
2276 else {
2277 struct badblocks *bb = &rdev->badblocks;
2278 __le64 *bbp = (__le64 *)page_address(rdev->bb_page);
2279 u64 *p = bb->page;
2280 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
2281 if (bb->changed) {
2282 unsigned seq;
2283
2284 retry:
2285 seq = read_seqbegin(&bb->lock);
2286
2287 memset(bbp, 0xff, PAGE_SIZE);
2288
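/*
 * Pack each range as (start << 10) | length, matching the on-disk
 * format decoded in super_1_load(); unused slots keep the 0xff
 * pattern from the memset above and act as list terminators.
 */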
2289 for (i = 0 ; i < bb->count ; i++) {
2290 u64 internal_bb = p[i];
2291 u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
2292 | BB_LEN(internal_bb));
2293 bbp[i] = cpu_to_le64(store_bb);
2294 }
2295 bb->changed = 0;
2296 if (read_seqretry(&bb->lock, seq))
2297 goto retry;
2298
2299 bb->sector = (rdev->sb_start +
2300 (int)le32_to_cpu(sb->bblog_offset));
2301 bb->size = le16_to_cpu(sb->bblog_size);
2302 }
2303 }
2304
2305 max_dev = 0;
2306 rdev_for_each(rdev2, mddev)
2307 if (rdev2->desc_nr+1 > max_dev)
2308 max_dev = rdev2->desc_nr+1;
2309
2310 if (max_dev > le32_to_cpu(sb->max_dev)) {
2311 int bmask;
2312 sb->max_dev = cpu_to_le32(max_dev);
2313 rdev->sb_size = max_dev * 2 + 256;
2314 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
2315 if (rdev->sb_size & bmask)
2316 rdev->sb_size = (rdev->sb_size | bmask) + 1;
2317 } else
2318 max_dev = le32_to_cpu(sb->max_dev);
2319
2320 for (i=0; i<max_dev;i++)
2321 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
2322
2323 if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
2324 sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
2325
2326 if (test_bit(MD_HAS_PPL, &mddev->flags)) {
2327 if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags))
2328 sb->feature_map |=
2329 cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS);
2330 else
2331 sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
2332 sb->ppl.offset = cpu_to_le16(rdev->ppl.offset);
2333 sb->ppl.size = cpu_to_le16(rdev->ppl.size);
2334 }
2335
2336 rdev_for_each(rdev2, mddev) {
2337 i = rdev2->desc_nr;
2338 if (test_bit(Faulty, &rdev2->flags))
2339 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
2340 else if (test_bit(In_sync, &rdev2->flags))
2341 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
2342 else if (test_bit(Journal, &rdev2->flags))
2343 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
2344 else if (rdev2->raid_disk >= 0)
2345 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
2346 else
2347 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
2348 }
2349
2350 sb->sb_csum = calc_sb_1_csum(sb);
2351 }
2352
static sector_t super_1_choose_bm_space(sector_t dev_size)
2354 {
2355 sector_t bm_space;
2356
2357 /* if the device is bigger than 8Gig, save 64k for bitmap
2358 * usage, if bigger than 200Gig, save 128k
2359 */
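/*
 * dev_size is in 512-byte sectors, so the thresholds below are 64K
 * (128 sectors), 8Gig (8*1024*1024*2 sectors) and 200Gig
 * (200*1024*1024*2 sectors); the reserved space is likewise returned
 * in sectors, e.g. 128*2 sectors == 128K.
 */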
2360 if (dev_size < 64*2)
2361 bm_space = 0;
2362 else if (dev_size - 64*2 >= 200*1024*1024*2)
2363 bm_space = 128*2;
2364 else if (dev_size - 4*2 > 8*1024*1024*2)
2365 bm_space = 64*2;
2366 else
2367 bm_space = 4*2;
2368 return bm_space;
2369 }
2370
2371 static unsigned long long
super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
2373 {
2374 struct mdp_superblock_1 *sb;
2375 sector_t max_sectors;
2376 if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
2377 return 0; /* component must fit device */
2378 if (rdev->data_offset != rdev->new_data_offset)
2379 return 0; /* too confusing */
2380 if (rdev->sb_start < rdev->data_offset) {
2381 /* minor versions 1 and 2; superblock before data */
2382 max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset;
2383 if (!num_sectors || num_sectors > max_sectors)
2384 num_sectors = max_sectors;
2385 } else if (rdev->mddev->bitmap_info.offset) {
2386 /* minor version 0 with bitmap we can't move */
2387 return 0;
2388 } else {
2389 /* minor version 0; superblock after data */
2390 sector_t sb_start, bm_space;
2391 sector_t dev_size = bdev_nr_sectors(rdev->bdev);
2392
2393 /* 8K is for superblock */
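/*
 * As with minor_version 0 in super_1_load(): step back 8K from the
 * end of the device and round down to a 4K boundary, leaving the
 * superblock 8K-12K from the end.
 */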
2394 sb_start = dev_size - 8*2;
2395 sb_start &= ~(sector_t)(4*2 - 1);
2396
2397 bm_space = super_1_choose_bm_space(dev_size);
2398
/* Space that can be used to store data needs to exclude the
 * superblock, the bitmap space and the bad block space (4K)
 */
2402 max_sectors = sb_start - bm_space - 4*2;
2403
2404 if (!num_sectors || num_sectors > max_sectors)
2405 num_sectors = max_sectors;
2406 rdev->sb_start = sb_start;
2407 }
2408 sb = page_address(rdev->sb_page);
2409 sb->data_size = cpu_to_le64(num_sectors);
2410 sb->super_offset = cpu_to_le64(rdev->sb_start);
2411 sb->sb_csum = calc_sb_1_csum(sb);
2412 do {
2413 md_write_metadata(rdev->mddev, rdev, rdev->sb_start,
2414 rdev->sb_size, rdev->sb_page, 0);
2415 } while (md_super_wait(rdev->mddev) < 0);
2416 return num_sectors;
2417
2418 }
2419
2420 static int
super_1_allow_new_offset(struct md_rdev *rdev,
2422 unsigned long long new_offset)
2423 {
2424 struct mddev *mddev = rdev->mddev;
2425
2426 /* All necessary checks on new >= old have been done */
2427 if (new_offset >= rdev->data_offset)
2428 return 1;
2429
2430 /* with 1.0 metadata, there is no metadata to tread on
2431 * so we can always move back */
2432 if (mddev->minor_version == 0)
2433 return 1;
2434
2435 /* otherwise we must be sure not to step on
2436 * any metadata, so stay:
2437 * 36K beyond start of superblock
2438 * beyond end of badblocks
2439 * beyond write-intent bitmap
2440 */
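/* i.e. at least (32+4)*2 == 72 sectors (36K) past sb_start */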
2441 if (rdev->sb_start + (32+4)*2 > new_offset)
2442 return 0;
2443
2444 if (md_bitmap_registered(mddev) && !mddev->bitmap_info.file) {
2445 struct md_bitmap_stats stats;
2446 int err;
2447
2448 err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
2449 if (!err && rdev->sb_start + mddev->bitmap_info.offset +
2450 stats.file_pages * (PAGE_SIZE >> 9) > new_offset)
2451 return 0;
2452 }
2453
2454 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
2455 return 0;
2456
2457 return 1;
2458 }
2459
2460 static struct super_type super_types[] = {
2461 [0] = {
2462 .name = "0.90.0",
2463 .owner = THIS_MODULE,
2464 .load_super = super_90_load,
2465 .validate_super = super_90_validate,
2466 .sync_super = super_90_sync,
2467 .rdev_size_change = super_90_rdev_size_change,
2468 .allow_new_offset = super_90_allow_new_offset,
2469 },
2470 [1] = {
2471 .name = "md-1",
2472 .owner = THIS_MODULE,
2473 .load_super = super_1_load,
2474 .validate_super = super_1_validate,
2475 .sync_super = super_1_sync,
2476 .rdev_size_change = super_1_rdev_size_change,
2477 .allow_new_offset = super_1_allow_new_offset,
2478 },
2479 };
2480
static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
2482 {
2483 if (mddev->sync_super) {
2484 mddev->sync_super(mddev, rdev);
2485 return;
2486 }
2487
2488 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));
2489
2490 super_types[mddev->major_version].sync_super(mddev, rdev);
2491 }
2492
static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
2494 {
2495 struct md_rdev *rdev, *rdev2;
2496
2497 rcu_read_lock();
2498 rdev_for_each_rcu(rdev, mddev1) {
2499 if (test_bit(Faulty, &rdev->flags) ||
2500 test_bit(Journal, &rdev->flags) ||
2501 rdev->raid_disk == -1)
2502 continue;
2503 rdev_for_each_rcu(rdev2, mddev2) {
2504 if (test_bit(Faulty, &rdev2->flags) ||
2505 test_bit(Journal, &rdev2->flags) ||
2506 rdev2->raid_disk == -1)
2507 continue;
2508 if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) {
2509 rcu_read_unlock();
2510 return 1;
2511 }
2512 }
2513 }
2514 rcu_read_unlock();
2515 return 0;
2516 }
2517
2518 static LIST_HEAD(pending_raid_disks);
2519
2520 /*
2521 * Try to register data integrity profile for an mddev
2522 *
2523 * This is called when an array is started and after a disk has been kicked
2524 * from the array. It only succeeds if all working and active component devices
2525 * are integrity capable with matching profiles.
2526 */
int md_integrity_register(struct mddev *mddev)
2528 {
2529 if (list_empty(&mddev->disks))
2530 return 0; /* nothing to do */
2531 if (mddev_is_dm(mddev) || !blk_get_integrity(mddev->gendisk))
2532 return 0; /* shouldn't register */
2533
2534 pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
2535 return 0;
2536 }
2537 EXPORT_SYMBOL(md_integrity_register);
2538
static bool rdev_read_only(struct md_rdev *rdev)
2540 {
2541 return bdev_read_only(rdev->bdev) ||
2542 (rdev->meta_bdev && bdev_read_only(rdev->meta_bdev));
2543 }
2544
static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
2546 {
2547 char b[BDEVNAME_SIZE];
2548 int err;
2549
2550 /* prevent duplicates */
2551 if (find_rdev(mddev, rdev->bdev->bd_dev))
2552 return -EEXIST;
2553
2554 if (rdev_read_only(rdev) && mddev->pers)
2555 return -EROFS;
2556
2557 /* make sure rdev->sectors exceeds mddev->dev_sectors */
2558 if (!test_bit(Journal, &rdev->flags) &&
2559 rdev->sectors &&
2560 (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) {
2561 if (mddev->pers) {
2562 /* Cannot change size, so fail
2563 * If mddev->level <= 0, then we don't care
2564 * about aligning sizes (e.g. linear)
2565 */
2566 if (mddev->level > 0)
2567 return -ENOSPC;
2568 } else
2569 mddev->dev_sectors = rdev->sectors;
2570 }
2571
2572 /* Verify rdev->desc_nr is unique.
2573 * If it is -1, assign a free number, else
2574 * check number is not in use
2575 */
2576 rcu_read_lock();
2577 if (rdev->desc_nr < 0) {
2578 int choice = 0;
2579 if (mddev->pers)
2580 choice = mddev->raid_disks;
2581 while (md_find_rdev_nr_rcu(mddev, choice))
2582 choice++;
2583 rdev->desc_nr = choice;
2584 } else {
2585 if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
2586 rcu_read_unlock();
2587 return -EBUSY;
2588 }
2589 }
2590 rcu_read_unlock();
2591 if (!test_bit(Journal, &rdev->flags) &&
2592 mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
2593 pr_warn("md: %s: array is limited to %d devices\n",
2594 mdname(mddev), mddev->max_disks);
2595 return -EBUSY;
2596 }
2597 snprintf(b, sizeof(b), "%pg", rdev->bdev);
2598 strreplace(b, '/', '!');
2599
2600 rdev->mddev = mddev;
2601 pr_debug("md: bind<%s>\n", b);
2602
2603 if (mddev->raid_disks)
2604 mddev_create_serial_pool(mddev, rdev);
2605
2606 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
2607 goto fail;
2608
2609 /* failure here is OK */
2610 err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block");
2611 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
2612 rdev->sysfs_unack_badblocks =
2613 sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks");
2614 rdev->sysfs_badblocks =
2615 sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks");
2616
2617 list_add_rcu(&rdev->same_set, &mddev->disks);
2618 bd_link_disk_holder(rdev->bdev, mddev->gendisk);
2619
2620 /* May as well allow recovery to be retried once */
2621 mddev->recovery_disabled++;
2622
2623 return 0;
2624
2625 fail:
2626 pr_warn("md: failed to register dev-%s for %s\n",
2627 b, mdname(mddev));
2628 mddev_destroy_serial_pool(mddev, rdev);
2629 return err;
2630 }
2631
2632 void md_autodetect_dev(dev_t dev);
2633
2634 /* just for claiming the bdev */
2635 static struct md_rdev claim_rdev;
2636
static void export_rdev(struct md_rdev *rdev, struct mddev *mddev)
2638 {
2639 pr_debug("md: export_rdev(%pg)\n", rdev->bdev);
2640 md_rdev_clear(rdev);
2641 #ifndef MODULE
2642 if (test_bit(AutoDetected, &rdev->flags))
2643 md_autodetect_dev(rdev->bdev->bd_dev);
2644 #endif
2645 fput(rdev->bdev_file);
2646 rdev->bdev = NULL;
2647 kobject_put(&rdev->kobj);
2648 }
2649
static void md_kick_rdev_from_array(struct md_rdev *rdev)
2651 {
2652 struct mddev *mddev = rdev->mddev;
2653
2654 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
2655 list_del_rcu(&rdev->same_set);
2656 pr_debug("md: unbind<%pg>\n", rdev->bdev);
2657 mddev_destroy_serial_pool(rdev->mddev, rdev);
2658 WRITE_ONCE(rdev->mddev, NULL);
2659 sysfs_remove_link(&rdev->kobj, "block");
2660 sysfs_put(rdev->sysfs_state);
2661 sysfs_put(rdev->sysfs_unack_badblocks);
2662 sysfs_put(rdev->sysfs_badblocks);
2663 rdev->sysfs_state = NULL;
2664 rdev->sysfs_unack_badblocks = NULL;
2665 rdev->sysfs_badblocks = NULL;
2666 rdev->badblocks.count = 0;
2667
2668 synchronize_rcu();
2669
2670 /*
2671 * kobject_del() will wait for all in progress writers to be done, where
2672 * reconfig_mutex is held, hence it can't be called under
2673 * reconfig_mutex and it's delayed to mddev_unlock().
2674 */
2675 list_add(&rdev->same_set, &mddev->deleting);
2676 }
2677
static void export_array(struct mddev *mddev)
2679 {
2680 struct md_rdev *rdev;
2681
2682 while (!list_empty(&mddev->disks)) {
2683 rdev = list_first_entry(&mddev->disks, struct md_rdev,
2684 same_set);
2685 md_kick_rdev_from_array(rdev);
2686 }
2687 mddev->raid_disks = 0;
2688 mddev->major_version = 0;
2689 }
2690
static bool set_in_sync(struct mddev *mddev)
2692 {
2693 lockdep_assert_held(&mddev->lock);
2694 if (!mddev->in_sync) {
2695 mddev->sync_checkers++;
2696 spin_unlock(&mddev->lock);
2697 percpu_ref_switch_to_atomic_sync(&mddev->writes_pending);
2698 spin_lock(&mddev->lock);
2699 if (!mddev->in_sync &&
2700 percpu_ref_is_zero(&mddev->writes_pending)) {
2701 mddev->in_sync = 1;
2702 /*
2703 * Ensure ->in_sync is visible before we clear
2704 * ->sync_checkers.
2705 */
2706 smp_mb();
2707 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
2708 sysfs_notify_dirent_safe(mddev->sysfs_state);
2709 }
2710 if (--mddev->sync_checkers == 0)
2711 percpu_ref_switch_to_percpu(&mddev->writes_pending);
2712 }
2713 if (mddev->safemode == 1)
2714 mddev->safemode = 0;
2715 return mddev->in_sync;
2716 }
2717
static void sync_sbs(struct mddev *mddev, int nospares)
2719 {
2720 /* Update each superblock (in-memory image), but
2721 * if we are allowed to, skip spares which already
2722 * have the right event counter, or have one earlier
2723 * (which would mean they aren't being marked as dirty
2724 * with the rest of the array)
2725 */
2726 struct md_rdev *rdev;
2727 rdev_for_each(rdev, mddev) {
2728 if (rdev->sb_events == mddev->events ||
2729 (nospares &&
2730 rdev->raid_disk < 0 &&
2731 rdev->sb_events+1 == mddev->events)) {
2732 /* Don't update this superblock */
2733 rdev->sb_loaded = 2;
2734 } else {
2735 sync_super(mddev, rdev);
2736 rdev->sb_loaded = 1;
2737 }
2738 }
2739 }
2740
static bool does_sb_need_changing(struct mddev *mddev)
2742 {
2743 struct md_rdev *rdev = NULL, *iter;
2744 struct mdp_superblock_1 *sb;
2745 int role;
2746
2747 /* Find a good rdev */
2748 rdev_for_each(iter, mddev)
2749 if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) {
2750 rdev = iter;
2751 break;
2752 }
2753
2754 /* No good device found. */
2755 if (!rdev)
2756 return false;
2757
2758 sb = page_address(rdev->sb_page);
2759 /* Check if a device has become faulty or a spare become active */
2760 rdev_for_each(rdev, mddev) {
2761 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
2762 /* Device activated? */
2763 if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 &&
2764 !test_bit(Faulty, &rdev->flags))
2765 return true;
2766 /* Device turned faulty? */
2767 if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX))
2768 return true;
2769 }
2770
2771 /* Check if any mddev parameters have changed */
2772 if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
2773 (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
2774 (mddev->layout != le32_to_cpu(sb->layout)) ||
2775 (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
2776 (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
2777 return true;
2778
2779 return false;
2780 }
2781
void md_update_sb(struct mddev *mddev, int force_change)
2783 {
2784 struct md_rdev *rdev;
2785 int sync_req;
2786 int nospares = 0;
2787 int any_badblocks_changed = 0;
2788 int ret = -1;
2789
2790 if (!md_is_rdwr(mddev)) {
2791 if (force_change)
2792 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2793 pr_err("%s: can't update sb for read-only array %s\n", __func__, mdname(mddev));
2794 return;
2795 }
2796
2797 repeat:
2798 if (mddev_is_clustered(mddev)) {
2799 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2800 force_change = 1;
2801 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
2802 nospares = 1;
2803 ret = mddev->cluster_ops->metadata_update_start(mddev);
/* Has someone else already updated the sb? */
2805 if (!does_sb_need_changing(mddev)) {
2806 if (ret == 0)
2807 mddev->cluster_ops->metadata_update_cancel(mddev);
2808 bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2809 BIT(MD_SB_CHANGE_DEVS) |
2810 BIT(MD_SB_CHANGE_CLEAN));
2811 return;
2812 }
2813 }
2814
2815 /*
2816 * First make sure individual recovery_offsets are correct
2817 * curr_resync_completed can only be used during recovery.
2818 * During reshape/resync it might use array-addresses rather
* than device addresses.
2820 */
2821 rdev_for_each(rdev, mddev) {
2822 if (rdev->raid_disk >= 0 &&
2823 mddev->delta_disks >= 0 &&
2824 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
2825 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
2826 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
2827 !test_bit(Journal, &rdev->flags) &&
2828 !test_bit(In_sync, &rdev->flags) &&
2829 mddev->curr_resync_completed > rdev->recovery_offset)
2830 rdev->recovery_offset = mddev->curr_resync_completed;
2831
2832 }
2833 if (!mddev->persistent) {
2834 clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
2835 clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2836 if (!mddev->external) {
2837 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
2838 rdev_for_each(rdev, mddev) {
2839 if (rdev->badblocks.changed) {
2840 rdev->badblocks.changed = 0;
2841 ack_all_badblocks(&rdev->badblocks);
2842 md_error(mddev, rdev);
2843 }
2844 clear_bit(Blocked, &rdev->flags);
2845 clear_bit(BlockedBadBlocks, &rdev->flags);
2846 wake_up(&rdev->blocked_wait);
2847 }
2848 }
2849 wake_up(&mddev->sb_wait);
2850 return;
2851 }
2852
2853 spin_lock(&mddev->lock);
2854
2855 mddev->utime = ktime_get_real_seconds();
2856
2857 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2858 force_change = 1;
2859 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
/* just a clean <-> dirty transition, possibly leave spares alone,
 * though if events doesn't end up with the right even/odd parity,
 * we will have to update the spares after all
2863 */
2864 nospares = 1;
2865 if (force_change)
2866 nospares = 0;
2867 if (mddev->degraded)
2868 /* If the array is degraded, then skipping spares is both
2869 * dangerous and fairly pointless.
2870 * Dangerous because a device that was removed from the array
* might have an event_count that still looks up-to-date,
2872 * so it can be re-added without a resync.
2873 * Pointless because if there are any spares to skip,
2874 * then a recovery will happen and soon that array won't
2875 * be degraded any more and the spare can go back to sleep then.
2876 */
2877 nospares = 0;
2878
2879 sync_req = mddev->in_sync;
2880
2881 /* If this is just a dirty<->clean transition, and the array is clean
2882 * and 'events' is odd, we can roll back to the previous clean state */
2883 if (nospares
2884 && (mddev->in_sync && mddev->resync_offset == MaxSector)
2885 && mddev->can_decrease_events
2886 && mddev->events != 1) {
2887 mddev->events--;
2888 mddev->can_decrease_events = 0;
2889 } else {
2890 /* otherwise we have to go forward and ... */
2891 mddev->events ++;
2892 mddev->can_decrease_events = nospares;
2893 }
2894
2895 /*
2896 * This 64-bit counter should never wrap.
2897 * Either we are in around ~1 trillion A.C., assuming
2898 * 1 reboot per second, or we have a bug...
2899 */
2900 WARN_ON(mddev->events == 0);
2901
2902 rdev_for_each(rdev, mddev) {
2903 if (rdev->badblocks.changed)
2904 any_badblocks_changed++;
2905 if (test_bit(Faulty, &rdev->flags))
2906 set_bit(FaultRecorded, &rdev->flags);
2907 }
2908
2909 sync_sbs(mddev, nospares);
2910 spin_unlock(&mddev->lock);
2911
2912 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
2913 mdname(mddev), mddev->in_sync);
2914
2915 mddev_add_trace_msg(mddev, "md md_update_sb");
2916 rewrite:
2917 if (md_bitmap_enabled(mddev, false))
2918 mddev->bitmap_ops->update_sb(mddev->bitmap);
2919 rdev_for_each(rdev, mddev) {
2920 if (rdev->sb_loaded != 1)
2921 continue; /* no noise on spare devices */
2922
2923 if (!test_bit(Faulty, &rdev->flags)) {
2924 md_write_metadata(mddev, rdev, rdev->sb_start,
2925 rdev->sb_size, rdev->sb_page, 0);
2926 pr_debug("md: (write) %pg's sb offset: %llu\n",
2927 rdev->bdev,
2928 (unsigned long long)rdev->sb_start);
2929 rdev->sb_events = mddev->events;
2930 if (rdev->badblocks.size) {
2931 md_write_metadata(mddev, rdev,
2932 rdev->badblocks.sector,
2933 rdev->badblocks.size << 9,
2934 rdev->bb_page, 0);
2935 rdev->badblocks.size = 0;
2936 }
2937
2938 } else
2939 pr_debug("md: %pg (skipping faulty)\n",
2940 rdev->bdev);
2941 }
2942 if (md_super_wait(mddev) < 0)
2943 goto rewrite;
2944 /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */
2945
2946 if (mddev_is_clustered(mddev) && ret == 0)
2947 mddev->cluster_ops->metadata_update_finish(mddev);
2948
2949 if (mddev->in_sync != sync_req ||
2950 !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2951 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)))
2952 /* have to write it out again */
2953 goto repeat;
2954 wake_up(&mddev->sb_wait);
2955 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2956 sysfs_notify_dirent_safe(mddev->sysfs_completed);
2957
2958 rdev_for_each(rdev, mddev) {
2959 if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2960 clear_bit(Blocked, &rdev->flags);
2961
2962 if (any_badblocks_changed)
2963 ack_all_badblocks(&rdev->badblocks);
2964 clear_bit(BlockedBadBlocks, &rdev->flags);
2965 wake_up(&rdev->blocked_wait);
2966 }
2967 }
2968 EXPORT_SYMBOL(md_update_sb);
2969
static int add_bound_rdev(struct md_rdev *rdev)
2971 {
2972 struct mddev *mddev = rdev->mddev;
2973 int err = 0;
2974 bool add_journal = test_bit(Journal, &rdev->flags);
2975
2976 if (!mddev->pers->hot_remove_disk || add_journal) {
2977 /* If there is hot_add_disk but no hot_remove_disk
2978 * then added disks for geometry changes,
2979 * and should be added immediately.
2980 */
2981 super_types[mddev->major_version].
2982 validate_super(mddev, NULL/*freshest*/, rdev);
2983 err = mddev->pers->hot_add_disk(mddev, rdev);
2984 if (err) {
2985 md_kick_rdev_from_array(rdev);
2986 return err;
2987 }
2988 }
2989 sysfs_notify_dirent_safe(rdev->sysfs_state);
2990
2991 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2992 if (mddev->degraded)
2993 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
2994 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2995 md_new_event();
2996 return 0;
2997 }
2998
2999 /* words written to sysfs files may, or may not, be \n terminated.
* We want to accept either case. For this we use cmd_match.
3001 */
static int cmd_match(const char *cmd, const char *str)
3003 {
3004 /* See if cmd, written into a sysfs file, matches
3005 * str. They must either be the same, or cmd can
3006 * have a trailing newline
3007 */
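/*
 * e.g. cmd_match("check\n", "check") and cmd_match("check", "check")
 * both return 1, while cmd_match("check", "che") returns 0.
 */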
3008 while (*cmd && *str && *cmd == *str) {
3009 cmd++;
3010 str++;
3011 }
3012 if (*cmd == '\n')
3013 cmd++;
3014 if (*str || *cmd)
3015 return 0;
3016 return 1;
3017 }
3018
3019 struct rdev_sysfs_entry {
3020 struct attribute attr;
3021 ssize_t (*show)(struct md_rdev *, char *);
3022 ssize_t (*store)(struct md_rdev *, const char *, size_t);
3023 };
3024
3025 static ssize_t
state_show(struct md_rdev *rdev, char *page)
3027 {
3028 char *sep = ",";
3029 size_t len = 0;
3030 unsigned long flags = READ_ONCE(rdev->flags);
3031
3032 if (test_bit(Faulty, &flags) ||
3033 (!test_bit(ExternalBbl, &flags) &&
3034 rdev->badblocks.unacked_exist))
3035 len += sprintf(page+len, "faulty%s", sep);
3036 if (test_bit(In_sync, &flags))
3037 len += sprintf(page+len, "in_sync%s", sep);
3038 if (test_bit(Journal, &flags))
3039 len += sprintf(page+len, "journal%s", sep);
3040 if (test_bit(WriteMostly, &flags))
3041 len += sprintf(page+len, "write_mostly%s", sep);
3042 if (test_bit(Blocked, &flags) ||
3043 (rdev->badblocks.unacked_exist
3044 && !test_bit(Faulty, &flags)))
3045 len += sprintf(page+len, "blocked%s", sep);
3046 if (!test_bit(Faulty, &flags) &&
3047 !test_bit(Journal, &flags) &&
3048 !test_bit(In_sync, &flags))
3049 len += sprintf(page+len, "spare%s", sep);
3050 if (test_bit(WriteErrorSeen, &flags))
3051 len += sprintf(page+len, "write_error%s", sep);
3052 if (test_bit(WantReplacement, &flags))
3053 len += sprintf(page+len, "want_replacement%s", sep);
3054 if (test_bit(Replacement, &flags))
3055 len += sprintf(page+len, "replacement%s", sep);
3056 if (test_bit(ExternalBbl, &flags))
3057 len += sprintf(page+len, "external_bbl%s", sep);
3058 if (test_bit(FailFast, &flags))
3059 len += sprintf(page+len, "failfast%s", sep);
3060
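/*
 * Drop the trailing separator left after the last flag, then
 * terminate with a newline.
 */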
3061 if (len)
3062 len -= strlen(sep);
3063
3064 return len+sprintf(page+len, "\n");
3065 }
3066
3067 static ssize_t
state_store(struct md_rdev *rdev, const char *buf, size_t len)
3069 {
3070 /* can write
3071 * faulty - simulates an error
3072 * remove - disconnects the device
3073 * writemostly - sets write_mostly
3074 * -writemostly - clears write_mostly
3075 * blocked - sets the Blocked flags
3076 * -blocked - clears the Blocked and possibly simulates an error
3077 * insync - sets Insync providing device isn't active
3078 * -insync - clear Insync for a device with a slot assigned,
3079 * so that it gets rebuilt based on bitmap
3080 * write_error - sets WriteErrorSeen
3081 * -write_error - clears WriteErrorSeen
3082 * {,-}failfast - set/clear FailFast
3083 */
3084
3085 struct mddev *mddev = rdev->mddev;
3086 int err = -EINVAL;
3087 bool need_update_sb = false;
3088
3089 if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
3090 md_error(rdev->mddev, rdev);
3091
3092 if (test_bit(MD_BROKEN, &rdev->mddev->flags))
3093 err = -EBUSY;
3094 else
3095 err = 0;
3096 } else if (cmd_match(buf, "remove")) {
3097 if (rdev->mddev->pers) {
3098 clear_bit(Blocked, &rdev->flags);
3099 remove_and_add_spares(rdev->mddev, rdev);
3100 }
3101 if (rdev->raid_disk >= 0)
3102 err = -EBUSY;
3103 else {
3104 err = 0;
3105 if (mddev_is_clustered(mddev))
3106 err = mddev->cluster_ops->remove_disk(mddev, rdev);
3107
3108 if (err == 0) {
3109 md_kick_rdev_from_array(rdev);
3110 if (mddev->pers)
3111 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
3112 md_new_event();
3113 }
3114 }
3115 } else if (cmd_match(buf, "writemostly")) {
3116 set_bit(WriteMostly, &rdev->flags);
3117 mddev_create_serial_pool(rdev->mddev, rdev);
3118 need_update_sb = true;
3119 err = 0;
3120 } else if (cmd_match(buf, "-writemostly")) {
3121 mddev_destroy_serial_pool(rdev->mddev, rdev);
3122 clear_bit(WriteMostly, &rdev->flags);
3123 need_update_sb = true;
3124 err = 0;
3125 } else if (cmd_match(buf, "blocked")) {
3126 set_bit(Blocked, &rdev->flags);
3127 err = 0;
3128 } else if (cmd_match(buf, "-blocked")) {
3129 if (!test_bit(Faulty, &rdev->flags) &&
3130 !test_bit(ExternalBbl, &rdev->flags) &&
3131 rdev->badblocks.unacked_exist) {
3132 /* metadata handler doesn't understand badblocks,
3133 * so we need to fail the device
3134 */
3135 md_error(rdev->mddev, rdev);
3136 }
3137 clear_bit(Blocked, &rdev->flags);
3138 clear_bit(BlockedBadBlocks, &rdev->flags);
3139 wake_up(&rdev->blocked_wait);
3140 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
3141
3142 err = 0;
3143 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
3144 set_bit(In_sync, &rdev->flags);
3145 err = 0;
3146 } else if (cmd_match(buf, "failfast")) {
3147 set_bit(FailFast, &rdev->flags);
3148 need_update_sb = true;
3149 err = 0;
3150 } else if (cmd_match(buf, "-failfast")) {
3151 clear_bit(FailFast, &rdev->flags);
3152 need_update_sb = true;
3153 err = 0;
3154 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
3155 !test_bit(Journal, &rdev->flags)) {
3156 if (rdev->mddev->pers == NULL) {
3157 clear_bit(In_sync, &rdev->flags);
3158 rdev->saved_raid_disk = rdev->raid_disk;
3159 rdev->raid_disk = -1;
3160 err = 0;
3161 }
3162 } else if (cmd_match(buf, "write_error")) {
3163 set_bit(WriteErrorSeen, &rdev->flags);
3164 err = 0;
3165 } else if (cmd_match(buf, "-write_error")) {
3166 clear_bit(WriteErrorSeen, &rdev->flags);
3167 err = 0;
3168 } else if (cmd_match(buf, "want_replacement")) {
3169 /* Any non-spare device that is not a replacement can
3170 * become want_replacement at any time, but we then need to
3171 * check if recovery is needed.
3172 */
3173 if (rdev->raid_disk >= 0 &&
3174 !test_bit(Journal, &rdev->flags) &&
3175 !test_bit(Replacement, &rdev->flags))
3176 set_bit(WantReplacement, &rdev->flags);
3177 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
3178 err = 0;
3179 } else if (cmd_match(buf, "-want_replacement")) {
3180 /* Clearing 'want_replacement' is always allowed.
* Once replacement starts it is too late though.
3182 */
3183 err = 0;
3184 clear_bit(WantReplacement, &rdev->flags);
3185 } else if (cmd_match(buf, "replacement")) {
3186 /* Can only set a device as a replacement when array has not
3187 * yet been started. Once running, replacement is automatic
3188 * from spares, or by assigning 'slot'.
3189 */
3190 if (rdev->mddev->pers)
3191 err = -EBUSY;
3192 else {
3193 set_bit(Replacement, &rdev->flags);
3194 err = 0;
3195 }
3196 } else if (cmd_match(buf, "-replacement")) {
3197 /* Similarly, can only clear Replacement before start */
3198 if (rdev->mddev->pers)
3199 err = -EBUSY;
3200 else {
3201 clear_bit(Replacement, &rdev->flags);
3202 err = 0;
3203 }
3204 } else if (cmd_match(buf, "re-add")) {
3205 if (!rdev->mddev->pers)
3206 err = -EINVAL;
3207 else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) &&
3208 rdev->saved_raid_disk >= 0) {
3209 /* clear_bit is performed _after_ all the devices
3210 * have their local Faulty bit cleared. If any writes
3211 * happen in the meantime in the local node, they
3212 * will land in the local bitmap, which will be synced
3213 * by this node eventually
3214 */
3215 if (!mddev_is_clustered(rdev->mddev) ||
3216 (err = mddev->cluster_ops->gather_bitmaps(rdev)) == 0) {
3217 clear_bit(Faulty, &rdev->flags);
3218 err = add_bound_rdev(rdev);
3219 }
3220 } else
3221 err = -EBUSY;
3222 } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) {
3223 set_bit(ExternalBbl, &rdev->flags);
3224 rdev->badblocks.shift = 0;
3225 err = 0;
3226 } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) {
3227 clear_bit(ExternalBbl, &rdev->flags);
3228 err = 0;
3229 }
3230 if (need_update_sb)
3231 md_update_sb(mddev, 1);
3232 if (!err)
3233 sysfs_notify_dirent_safe(rdev->sysfs_state);
3234 return err ? err : len;
3235 }
3236 static struct rdev_sysfs_entry rdev_state =
3237 __ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store);
3238
3239 static ssize_t
errors_show(struct md_rdev *rdev, char *page)
3241 {
3242 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
3243 }
3244
3245 static ssize_t
errors_store(struct md_rdev *rdev, const char *buf, size_t len)
3247 {
3248 unsigned int n;
3249 int rv;
3250
3251 rv = kstrtouint(buf, 10, &n);
3252 if (rv < 0)
3253 return rv;
3254 atomic_set(&rdev->corrected_errors, n);
3255 return len;
3256 }
3257 static struct rdev_sysfs_entry rdev_errors =
3258 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
3259
3260 static ssize_t
slot_show(struct md_rdev *rdev, char *page)
3262 {
3263 if (test_bit(Journal, &rdev->flags))
3264 return sprintf(page, "journal\n");
3265 else if (rdev->raid_disk < 0)
3266 return sprintf(page, "none\n");
3267 else
3268 return sprintf(page, "%d\n", rdev->raid_disk);
3269 }
3270
3271 static ssize_t
slot_store(struct md_rdev *rdev, const char *buf, size_t len)
3273 {
3274 int slot;
3275 int err;
3276
3277 if (test_bit(Journal, &rdev->flags))
3278 return -EBUSY;
3279 if (strncmp(buf, "none", 4)==0)
3280 slot = -1;
3281 else {
3282 err = kstrtouint(buf, 10, (unsigned int *)&slot);
3283 if (err < 0)
3284 return err;
3285 if (slot < 0)
3286 /* overflow */
3287 return -ENOSPC;
3288 }
3289 if (rdev->mddev->pers && slot == -1) {
3290 /* Setting 'slot' on an active array requires also
3291 * updating the 'rd%d' link, and communicating
3292 * with the personality with ->hot_*_disk.
3293 * For now we only support removing
3294 * failed/spare devices. This normally happens automatically,
3295 * but not when the metadata is externally managed.
3296 */
3297 if (rdev->raid_disk == -1)
3298 return -EEXIST;
3299 /* personality does all needed checks */
3300 if (rdev->mddev->pers->hot_remove_disk == NULL)
3301 return -EINVAL;
3302 clear_bit(Blocked, &rdev->flags);
3303 remove_and_add_spares(rdev->mddev, rdev);
3304 if (rdev->raid_disk >= 0)
3305 return -EBUSY;
3306 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
3307 } else if (rdev->mddev->pers) {
3308 /* Activating a spare .. or possibly reactivating
3309 * if we ever get bitmaps working here.
3310 */
3311 int err;
3312
3313 if (rdev->raid_disk != -1)
3314 return -EBUSY;
3315
3316 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
3317 return -EBUSY;
3318
3319 if (rdev->mddev->pers->hot_add_disk == NULL)
3320 return -EINVAL;
3321
3322 if (slot >= rdev->mddev->raid_disks &&
3323 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
3324 return -ENOSPC;
3325
3326 rdev->raid_disk = slot;
3327 if (test_bit(In_sync, &rdev->flags))
3328 rdev->saved_raid_disk = slot;
3329 else
3330 rdev->saved_raid_disk = -1;
3331 clear_bit(In_sync, &rdev->flags);
3332 clear_bit(Bitmap_sync, &rdev->flags);
3333 err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev);
3334 if (err) {
3335 rdev->raid_disk = -1;
3336 return err;
3337 } else
3338 sysfs_notify_dirent_safe(rdev->sysfs_state);
3339 /* failure here is OK */;
3340 sysfs_link_rdev(rdev->mddev, rdev);
3341 /* don't wakeup anyone, leave that to userspace. */
3342 } else {
3343 if (slot >= rdev->mddev->raid_disks &&
3344 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
3345 return -ENOSPC;
3346 rdev->raid_disk = slot;
3347 /* assume it is working */
3348 clear_bit(Faulty, &rdev->flags);
3349 clear_bit(WriteMostly, &rdev->flags);
3350 set_bit(In_sync, &rdev->flags);
3351 sysfs_notify_dirent_safe(rdev->sysfs_state);
3352 }
3353 return len;
3354 }
3355
3356 static struct rdev_sysfs_entry rdev_slot =
3357 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
3358
3359 static ssize_t
offset_show(struct md_rdev *rdev, char *page)
3361 {
3362 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
3363 }
3364
3365 static ssize_t
offset_store(struct md_rdev *rdev, const char *buf, size_t len)
3367 {
3368 unsigned long long offset;
3369 if (kstrtoull(buf, 10, &offset) < 0)
3370 return -EINVAL;
3371 if (rdev->mddev->pers && rdev->raid_disk >= 0)
3372 return -EBUSY;
3373 if (rdev->sectors && rdev->mddev->external)
3374 /* Must set offset before size, so overlap checks
3375 * can be sane */
3376 return -EBUSY;
3377 rdev->data_offset = offset;
3378 rdev->new_data_offset = offset;
3379 return len;
3380 }
3381
3382 static struct rdev_sysfs_entry rdev_offset =
3383 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
3384
static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
3386 {
3387 return sprintf(page, "%llu\n",
3388 (unsigned long long)rdev->new_data_offset);
3389 }
3390
static ssize_t new_offset_store(struct md_rdev *rdev,
3392 const char *buf, size_t len)
3393 {
3394 unsigned long long new_offset;
3395 struct mddev *mddev = rdev->mddev;
3396
3397 if (kstrtoull(buf, 10, &new_offset) < 0)
3398 return -EINVAL;
3399
3400 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3401 return -EBUSY;
3402 if (new_offset == rdev->data_offset)
3403 /* reset is always permitted */
3404 ;
3405 else if (new_offset > rdev->data_offset) {
3406 /* must not push array size beyond rdev_sectors */
3407 if (new_offset - rdev->data_offset
3408 + mddev->dev_sectors > rdev->sectors)
3409 return -E2BIG;
3410 }
3411 /* Metadata worries about other space details. */
3412
3413 /* decreasing the offset is inconsistent with a backwards
3414 * reshape.
3415 */
3416 if (new_offset < rdev->data_offset &&
3417 mddev->reshape_backwards)
3418 return -EINVAL;
3419 /* Increasing offset is inconsistent with forwards
3420 * reshape. reshape_direction should be set to
3421 * 'backwards' first.
3422 */
3423 if (new_offset > rdev->data_offset &&
3424 !mddev->reshape_backwards)
3425 return -EINVAL;
3426
3427 if (mddev->pers && mddev->persistent &&
3428 !super_types[mddev->major_version]
3429 .allow_new_offset(rdev, new_offset))
3430 return -E2BIG;
3431 rdev->new_data_offset = new_offset;
3432 if (new_offset > rdev->data_offset)
3433 mddev->reshape_backwards = 1;
3434 else if (new_offset < rdev->data_offset)
3435 mddev->reshape_backwards = 0;
3436
3437 return len;
3438 }
3439 static struct rdev_sysfs_entry rdev_new_offset =
3440 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
3441
3442 static ssize_t
rdev_size_show(struct md_rdev *rdev, char *page)
3444 {
3445 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
3446 }
3447
static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b)
3449 {
3450 /* check if two start/length pairs overlap */
3451 if (a->data_offset + a->sectors <= b->data_offset)
3452 return false;
3453 if (b->data_offset + b->sectors <= a->data_offset)
3454 return false;
3455 return true;
3456 }
3457
static bool md_rdev_overlaps(struct md_rdev *rdev)
3459 {
3460 struct mddev *mddev;
3461 struct md_rdev *rdev2;
3462
3463 spin_lock(&all_mddevs_lock);
3464 list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
3465 if (test_bit(MD_DELETED, &mddev->flags))
3466 continue;
3467 rdev_for_each(rdev2, mddev) {
3468 if (rdev != rdev2 && rdev->bdev == rdev2->bdev &&
3469 md_rdevs_overlap(rdev, rdev2)) {
3470 spin_unlock(&all_mddevs_lock);
3471 return true;
3472 }
3473 }
3474 }
3475 spin_unlock(&all_mddevs_lock);
3476 return false;
3477 }
3478
static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
3480 {
3481 unsigned long long blocks;
3482 sector_t new;
3483
3484 if (kstrtoull(buf, 10, &blocks) < 0)
3485 return -EINVAL;
3486
3487 if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
3488 return -EINVAL; /* sector conversion overflow */
3489
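/* blocks are 1K units: e.g. 524288 blocks -> 1048576 sectors (512MiB) */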
3490 new = blocks * 2;
3491 if (new != blocks * 2)
3492 return -EINVAL; /* unsigned long long to sector_t overflow */
3493
3494 *sectors = new;
3495 return 0;
3496 }
3497
3498 static ssize_t
rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3500 {
3501 struct mddev *my_mddev = rdev->mddev;
3502 sector_t oldsectors = rdev->sectors;
3503 sector_t sectors;
3504
3505 if (test_bit(Journal, &rdev->flags))
3506 return -EBUSY;
if (strict_blocks_to_sectors(buf, &sectors) < 0)
3508 return -EINVAL;
3509 if (rdev->data_offset != rdev->new_data_offset)
3510 return -EINVAL; /* too confusing */
3511 if (my_mddev->pers && rdev->raid_disk >= 0) {
3512 if (my_mddev->persistent) {
3513 sectors = super_types[my_mddev->major_version].
3514 rdev_size_change(rdev, sectors);
3515 if (!sectors)
3516 return -EBUSY;
3517 } else if (!sectors)
3518 sectors = bdev_nr_sectors(rdev->bdev) -
3519 rdev->data_offset;
3520 if (!my_mddev->pers->resize)
3521 /* Cannot change size for RAID0 or Linear etc */
3522 return -EINVAL;
3523 }
3524 if (sectors < my_mddev->dev_sectors)
3525 return -EINVAL; /* component must fit device */
3526
3527 rdev->sectors = sectors;
3528
3529 /*
3530 * Check that all other rdevs with the same bdev do not overlap. This
3531 * check does not provide a hard guarantee, it just helps avoid
3532 * dangerous mistakes.
3533 */
3534 if (sectors > oldsectors && my_mddev->external &&
3535 md_rdev_overlaps(rdev)) {
3536 /*
3537 * Someone else could have slipped in a size change here, but
3538 * doing so is just silly. We put oldsectors back because we
3539 * know it is safe, and trust userspace not to race with itself.
3540 */
3541 rdev->sectors = oldsectors;
3542 return -EBUSY;
3543 }
3544 return len;
3545 }
3546
3547 static struct rdev_sysfs_entry rdev_size =
3548 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
3549
3550 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page)
3551 {
3552 unsigned long long recovery_start = rdev->recovery_offset;
3553
3554 if (test_bit(In_sync, &rdev->flags) ||
3555 recovery_start == MaxSector)
3556 return sprintf(page, "none\n");
3557
3558 return sprintf(page, "%llu\n", recovery_start);
3559 }
3560
3561 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len)
3562 {
3563 unsigned long long recovery_start;
3564
3565 if (cmd_match(buf, "none"))
3566 recovery_start = MaxSector;
3567 else if (kstrtoull(buf, 10, &recovery_start))
3568 return -EINVAL;
3569
3570 if (rdev->mddev->pers &&
3571 rdev->raid_disk >= 0)
3572 return -EBUSY;
3573
3574 rdev->recovery_offset = recovery_start;
3575 if (recovery_start == MaxSector)
3576 set_bit(In_sync, &rdev->flags);
3577 else
3578 clear_bit(In_sync, &rdev->flags);
3579 return len;
3580 }
3581
3582 static struct rdev_sysfs_entry rdev_recovery_start =
3583 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
3584
3585 /* sysfs access to bad-blocks list.
3586 * We present two files.
3587 * 'bad-blocks' lists sector numbers and lengths of ranges that
3588 * are recorded as bad. The list is truncated to fit within
3589 * the one-page limit of sysfs.
3590 * Writing "sector length" to this file adds a bad-block range
3591 * that is recorded as acknowledged.
3592 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
3593 * been acknowledged. Writing to this file adds bad blocks
3594 * without acknowledging them. This is largely for testing.
3595 */
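/*
 * For example (illustrative only; the sector numbers and the sysfs path
 * are made up and assume the usual /sys/block/mdX/md/dev-YYY/ layout):
 *
 *	echo "1048576 8" > bad_blocks
 *
 * records an acknowledged 8-sector bad range starting at sector 1048576,
 * while the same write to unacknowledged_bad_blocks records the range
 * without acknowledging it.
 */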
3596 static ssize_t bb_show(struct md_rdev *rdev, char *page)
3597 {
3598 return badblocks_show(&rdev->badblocks, page, 0);
3599 }
3600 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len)
3601 {
3602 int rv = badblocks_store(&rdev->badblocks, page, len, 0);
3603 /* Maybe that ack was all we needed */
3604 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
3605 wake_up(&rdev->blocked_wait);
3606 return rv;
3607 }
3608 static struct rdev_sysfs_entry rdev_bad_blocks =
3609 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
3610
3611 static ssize_t ubb_show(struct md_rdev *rdev, char *page)
3612 {
3613 return badblocks_show(&rdev->badblocks, page, 1);
3614 }
3615 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
3616 {
3617 return badblocks_store(&rdev->badblocks, page, len, 1);
3618 }
3619 static struct rdev_sysfs_entry rdev_unack_bad_blocks =
3620 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
3621
3622 static ssize_t
3623 ppl_sector_show(struct md_rdev *rdev, char *page)
3624 {
3625 return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector);
3626 }
3627
3628 static ssize_t
3629 ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len)
3630 {
3631 unsigned long long sector;
3632
3633 if (kstrtoull(buf, 10, &sector) < 0)
3634 return -EINVAL;
3635 if (sector != (sector_t)sector)
3636 return -EINVAL;
3637
3638 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3639 rdev->raid_disk >= 0)
3640 return -EBUSY;
3641
3642 if (rdev->mddev->persistent) {
3643 if (rdev->mddev->major_version == 0)
3644 return -EINVAL;
3645 if ((sector > rdev->sb_start &&
3646 sector - rdev->sb_start > S16_MAX) ||
3647 (sector < rdev->sb_start &&
3648 rdev->sb_start - sector > -S16_MIN))
3649 return -EINVAL;
3650 rdev->ppl.offset = sector - rdev->sb_start;
3651 } else if (!rdev->mddev->external) {
3652 return -EBUSY;
3653 }
3654 rdev->ppl.sector = sector;
3655 return len;
3656 }
3657
3658 static struct rdev_sysfs_entry rdev_ppl_sector =
3659 __ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store);
3660
3661 static ssize_t
3662 ppl_size_show(struct md_rdev *rdev, char *page)
3663 {
3664 return sprintf(page, "%u\n", rdev->ppl.size);
3665 }
3666
3667 static ssize_t
3668 ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3669 {
3670 unsigned int size;
3671
3672 if (kstrtouint(buf, 10, &size) < 0)
3673 return -EINVAL;
3674
3675 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3676 rdev->raid_disk >= 0)
3677 return -EBUSY;
3678
3679 if (rdev->mddev->persistent) {
3680 if (rdev->mddev->major_version == 0)
3681 return -EINVAL;
3682 if (size > U16_MAX)
3683 return -EINVAL;
3684 } else if (!rdev->mddev->external) {
3685 return -EBUSY;
3686 }
3687 rdev->ppl.size = size;
3688 return len;
3689 }
3690
3691 static struct rdev_sysfs_entry rdev_ppl_size =
3692 __ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store);
3693
3694 static struct attribute *rdev_default_attrs[] = {
3695 &rdev_state.attr,
3696 &rdev_errors.attr,
3697 &rdev_slot.attr,
3698 &rdev_offset.attr,
3699 &rdev_new_offset.attr,
3700 &rdev_size.attr,
3701 &rdev_recovery_start.attr,
3702 &rdev_bad_blocks.attr,
3703 &rdev_unack_bad_blocks.attr,
3704 &rdev_ppl_sector.attr,
3705 &rdev_ppl_size.attr,
3706 NULL,
3707 };
3708 ATTRIBUTE_GROUPS(rdev_default);
3709 static ssize_t
3710 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3711 {
3712 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3713 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3714
3715 if (!entry->show)
3716 return -EIO;
3717 if (!rdev->mddev)
3718 return -ENODEV;
3719 return entry->show(rdev, page);
3720 }
3721
3722 static ssize_t
3723 rdev_attr_store(struct kobject *kobj, struct attribute *attr,
3724 const char *page, size_t length)
3725 {
3726 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3727 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3728 struct kernfs_node *kn = NULL;
3729 bool suspend = false;
3730 ssize_t rv;
3731 struct mddev *mddev = READ_ONCE(rdev->mddev);
3732
3733 if (!entry->store)
3734 return -EIO;
3735 if (!capable(CAP_SYS_ADMIN))
3736 return -EACCES;
3737 if (!mddev)
3738 return -ENODEV;
3739
3740 if (entry->store == state_store) {
3741 if (cmd_match(page, "remove"))
3742 kn = sysfs_break_active_protection(kobj, attr);
3743 if (cmd_match(page, "remove") || cmd_match(page, "re-add") ||
3744 cmd_match(page, "writemostly") ||
3745 cmd_match(page, "-writemostly"))
3746 suspend = true;
3747 }
3748
3749 rv = suspend ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev);
3750 if (!rv) {
3751 if (rdev->mddev == NULL)
3752 rv = -ENODEV;
3753 else
3754 rv = entry->store(rdev, page, length);
3755 suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev);
3756 }
3757
3758 if (kn)
3759 sysfs_unbreak_active_protection(kn);
3760
3761 return rv;
3762 }
3763
3764 static void rdev_free(struct kobject *ko)
3765 {
3766 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj);
3767 kfree(rdev);
3768 }
3769 static const struct sysfs_ops rdev_sysfs_ops = {
3770 .show = rdev_attr_show,
3771 .store = rdev_attr_store,
3772 };
3773 static const struct kobj_type rdev_ktype = {
3774 .release = rdev_free,
3775 .sysfs_ops = &rdev_sysfs_ops,
3776 .default_groups = rdev_default_groups,
3777 };
3778
3779 int md_rdev_init(struct md_rdev *rdev)
3780 {
3781 rdev->desc_nr = -1;
3782 rdev->saved_raid_disk = -1;
3783 rdev->raid_disk = -1;
3784 rdev->flags = 0;
3785 rdev->data_offset = 0;
3786 rdev->new_data_offset = 0;
3787 rdev->sb_events = 0;
3788 rdev->last_read_error = 0;
3789 rdev->sb_loaded = 0;
3790 rdev->bb_page = NULL;
3791 atomic_set(&rdev->nr_pending, 0);
3792 atomic_set(&rdev->read_errors, 0);
3793 atomic_set(&rdev->corrected_errors, 0);
3794
3795 INIT_LIST_HEAD(&rdev->same_set);
3796 init_waitqueue_head(&rdev->blocked_wait);
3797
3798 /* Add space to store bad block list.
3799 * This reserves the space even on arrays where it cannot
3800 * be used - I wonder if that matters
3801 */
3802 return badblocks_init(&rdev->badblocks, 0);
3803 }
3804 EXPORT_SYMBOL_GPL(md_rdev_init);
3805
3806 /*
3807 * Import a device. If 'super_format' >= 0, then sanity check the superblock
3808 *
3809 * mark the device faulty if:
3810 *
3811 * - the device is nonexistent (zero size)
3812 * - the device has no valid superblock
3813 *
3814 * a faulty rdev _never_ has rdev->sb set.
3815 */
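/*
 * As used by the callers below: super_format -1 means "no superblock to
 * check" (non-persistent arrays), -2 means externally managed metadata
 * (the bdev is claimed against the shared claim_rdev holder), and a
 * value >= 0 selects an entry in super_types[] whose load_super() must
 * succeed for the import to complete.
 */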
3816 static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
3817 {
3818 struct md_rdev *rdev;
3819 sector_t size;
3820 int err;
3821
3822 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
3823 if (!rdev)
3824 return ERR_PTR(-ENOMEM);
3825
3826 err = md_rdev_init(rdev);
3827 if (err)
3828 goto out_free_rdev;
3829 err = alloc_disk_sb(rdev);
3830 if (err)
3831 goto out_clear_rdev;
3832
3833 rdev->bdev_file = bdev_file_open_by_dev(newdev,
3834 BLK_OPEN_READ | BLK_OPEN_WRITE,
3835 super_format == -2 ? &claim_rdev : rdev, NULL);
3836 if (IS_ERR(rdev->bdev_file)) {
3837 pr_warn("md: could not open device unknown-block(%u,%u).\n",
3838 MAJOR(newdev), MINOR(newdev));
3839 err = PTR_ERR(rdev->bdev_file);
3840 goto out_clear_rdev;
3841 }
3842 rdev->bdev = file_bdev(rdev->bdev_file);
3843
3844 kobject_init(&rdev->kobj, &rdev_ktype);
3845
3846 size = bdev_nr_bytes(rdev->bdev) >> BLOCK_SIZE_BITS;
3847 if (!size) {
3848 pr_warn("md: %pg has zero or unknown size, marking faulty!\n",
3849 rdev->bdev);
3850 err = -EINVAL;
3851 goto out_blkdev_put;
3852 }
3853
3854 if (super_format >= 0) {
3855 err = super_types[super_format].
3856 load_super(rdev, NULL, super_minor);
3857 if (err == -EINVAL) {
3858 pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n",
3859 rdev->bdev,
3860 super_format, super_minor);
3861 goto out_blkdev_put;
3862 }
3863 if (err < 0) {
3864 pr_warn("md: could not read %pg's sb, not importing!\n",
3865 rdev->bdev);
3866 goto out_blkdev_put;
3867 }
3868 }
3869
3870 return rdev;
3871
3872 out_blkdev_put:
3873 fput(rdev->bdev_file);
3874 out_clear_rdev:
3875 md_rdev_clear(rdev);
3876 out_free_rdev:
3877 kfree(rdev);
3878 return ERR_PTR(err);
3879 }
3880
3881 /*
3882 * Check a full RAID array for plausibility
3883 */
3884
3885 static int analyze_sbs(struct mddev *mddev)
3886 {
3887 struct md_rdev *rdev, *freshest, *tmp;
3888
3889 freshest = NULL;
3890 rdev_for_each_safe(rdev, tmp, mddev)
3891 switch (super_types[mddev->major_version].
3892 load_super(rdev, freshest, mddev->minor_version)) {
3893 case 1:
3894 freshest = rdev;
3895 break;
3896 case 0:
3897 break;
3898 default:
3899 pr_warn("md: fatal superblock inconsistency in %pg -- removing from array\n",
3900 rdev->bdev);
3901 md_kick_rdev_from_array(rdev);
3902 }
3903
3904 /* Cannot find a valid fresh disk */
3905 if (!freshest) {
3906 pr_warn("md: cannot find a valid disk\n");
3907 return -EINVAL;
3908 }
3909
3910 super_types[mddev->major_version].
3911 validate_super(mddev, NULL/*freshest*/, freshest);
3912
3913 rdev_for_each_safe(rdev, tmp, mddev) {
3914 if (mddev->max_disks &&
3915 rdev->desc_nr >= mddev->max_disks) {
3916 pr_warn("md: %s: %pg: only %d devices permitted\n",
3917 mdname(mddev), rdev->bdev,
3918 mddev->max_disks);
3919 md_kick_rdev_from_array(rdev);
3920 continue;
3921 }
3922 if (rdev != freshest) {
3923 if (super_types[mddev->major_version].
3924 validate_super(mddev, freshest, rdev)) {
3925 pr_warn("md: kicking non-fresh %pg from array!\n",
3926 rdev->bdev);
3927 md_kick_rdev_from_array(rdev);
3928 continue;
3929 }
3930 }
3931 if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks)) &&
3932 !test_bit(Journal, &rdev->flags)) {
3933 rdev->raid_disk = -1;
3934 clear_bit(In_sync, &rdev->flags);
3935 }
3936 }
3937
3938 return 0;
3939 }
3940
3941 /* Read a fixed-point number.
3942 * Numbers in sysfs attributes should be in "standard" units where
3943 * possible, so time should be in seconds.
3944 * However we internally use a much smaller unit such as
3945 * milliseconds or jiffies.
3946 * This function takes a decimal number with a possible fractional
3947 * component, and produces an integer which is the result of
3948 * multiplying that number by 10^'scale'.
3949 * all without any floating-point arithmetic.
3950 */
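/*
 * A couple of worked examples (illustrative): with scale == 3, "1.25"
 * yields 1250 and "20" yields 20000, i.e. seconds written by userspace
 * become milliseconds internally; fractional digits beyond 'scale' are
 * simply dropped.
 */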
3951 int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
3952 {
3953 unsigned long result = 0;
3954 long decimals = -1;
3955 while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
3956 if (*cp == '.')
3957 decimals = 0;
3958 else if (decimals < scale) {
3959 unsigned int value;
3960 value = *cp - '0';
3961 result = result * 10 + value;
3962 if (decimals >= 0)
3963 decimals++;
3964 }
3965 cp++;
3966 }
3967 if (*cp == '\n')
3968 cp++;
3969 if (*cp)
3970 return -EINVAL;
3971 if (decimals < 0)
3972 decimals = 0;
3973 *res = result * int_pow(10, scale - decimals);
3974 return 0;
3975 }
3976
3977 static ssize_t
3978 safe_delay_show(struct mddev *mddev, char *page)
3979 {
3980 unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ;
3981
3982 return sprintf(page, "%u.%03u\n", msec/1000, msec%1000);
3983 }
3984 static ssize_t
3985 safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
3986 {
3987 unsigned long msec;
3988
3989 if (mddev_is_clustered(mddev)) {
3990 pr_warn("md: Safemode is disabled for clustered mode\n");
3991 return -EINVAL;
3992 }
3993
3994 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ)
3995 return -EINVAL;
3996 if (msec == 0)
3997 mddev->safemode_delay = 0;
3998 else {
3999 unsigned long old_delay = mddev->safemode_delay;
4000 unsigned long new_delay = (msec*HZ)/1000;
4001
4002 if (new_delay == 0)
4003 new_delay = 1;
4004 mddev->safemode_delay = new_delay;
4005 if (new_delay < old_delay || old_delay == 0)
4006 mod_timer(&mddev->safemode_timer, jiffies+1);
4007 }
4008 return len;
4009 }
4010 static struct md_sysfs_entry md_safe_delay =
4011 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
4012
4013 static ssize_t
4014 level_show(struct mddev *mddev, char *page)
4015 {
4016 struct md_personality *p;
4017 int ret;
4018 spin_lock(&mddev->lock);
4019 p = mddev->pers;
4020 if (p)
4021 ret = sprintf(page, "%s\n", p->head.name);
4022 else if (mddev->clevel[0])
4023 ret = sprintf(page, "%s\n", mddev->clevel);
4024 else if (mddev->level != LEVEL_NONE)
4025 ret = sprintf(page, "%d\n", mddev->level);
4026 else
4027 ret = 0;
4028 spin_unlock(&mddev->lock);
4029 return ret;
4030 }
4031
4032 static ssize_t
4033 level_store(struct mddev *mddev, const char *buf, size_t len)
4034 {
4035 char clevel[16];
4036 ssize_t rv;
4037 size_t slen = len;
4038 struct md_personality *pers, *oldpers;
4039 long level;
4040 void *priv, *oldpriv;
4041 struct md_rdev *rdev;
4042
4043 if (slen == 0 || slen >= sizeof(clevel))
4044 return -EINVAL;
4045
4046 rv = mddev_suspend_and_lock(mddev);
4047 if (rv)
4048 return rv;
4049
4050 if (mddev->pers == NULL) {
4051 memcpy(mddev->clevel, buf, slen);
4052 if (mddev->clevel[slen-1] == '\n')
4053 slen--;
4054 mddev->clevel[slen] = 0;
4055 mddev->level = LEVEL_NONE;
4056 rv = len;
4057 goto out_unlock;
4058 }
4059 rv = -EROFS;
4060 if (!md_is_rdwr(mddev))
4061 goto out_unlock;
4062
4063 /* request to change the personality. Need to ensure:
4064 * - array is not engaged in resync/recovery/reshape
4065 * - old personality can be suspended
4066 * - new personality can take over the array.
4067 */
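/*
 * For example (illustrative): "echo raid6 > level" on a clean raid5
 * array lets the raid6 personality take the array over, provided the
 * target personality implements ->takeover for that transition;
 * otherwise the write fails further down.
 */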
4068
4069 rv = -EBUSY;
4070 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
4071 mddev->reshape_position != MaxSector ||
4072 mddev->sysfs_active)
4073 goto out_unlock;
4074
4075 rv = -EINVAL;
4076 if (!mddev->pers->quiesce) {
4077 pr_warn("md: %s: %s does not support online personality change\n",
4078 mdname(mddev), mddev->pers->head.name);
4079 goto out_unlock;
4080 }
4081
4082 /* Now find the new personality */
4083 memcpy(clevel, buf, slen);
4084 if (clevel[slen-1] == '\n')
4085 slen--;
4086 clevel[slen] = 0;
4087 if (kstrtol(clevel, 10, &level))
4088 level = LEVEL_NONE;
4089
4090 if (request_module("md-%s", clevel) != 0)
4091 request_module("md-level-%s", clevel);
4092 pers = get_pers(level, clevel);
4093 if (!pers) {
4094 rv = -EINVAL;
4095 goto out_unlock;
4096 }
4097
4098 if (pers == mddev->pers) {
4099 /* Nothing to do! */
4100 put_pers(pers);
4101 rv = len;
4102 goto out_unlock;
4103 }
4104 if (!pers->takeover) {
4105 put_pers(pers);
4106 pr_warn("md: %s: %s does not support personality takeover\n",
4107 mdname(mddev), clevel);
4108 rv = -EINVAL;
4109 goto out_unlock;
4110 }
4111
4112 rdev_for_each(rdev, mddev)
4113 rdev->new_raid_disk = rdev->raid_disk;
4114
4115 /* ->takeover must set new_* and/or delta_disks
4116 * if it succeeds, and may set them when it fails.
4117 */
4118 priv = pers->takeover(mddev);
4119 if (IS_ERR(priv)) {
4120 mddev->new_level = mddev->level;
4121 mddev->new_layout = mddev->layout;
4122 mddev->new_chunk_sectors = mddev->chunk_sectors;
4123 mddev->raid_disks -= mddev->delta_disks;
4124 mddev->delta_disks = 0;
4125 mddev->reshape_backwards = 0;
4126 put_pers(pers);
4127 pr_warn("md: %s: %s would not accept array\n",
4128 mdname(mddev), clevel);
4129 rv = PTR_ERR(priv);
4130 goto out_unlock;
4131 }
4132
4133 /* Looks like we have a winner */
4134 mddev_detach(mddev);
4135
4136 spin_lock(&mddev->lock);
4137 oldpers = mddev->pers;
4138 oldpriv = mddev->private;
4139 mddev->pers = pers;
4140 mddev->private = priv;
4141 strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel));
4142 mddev->level = mddev->new_level;
4143 mddev->layout = mddev->new_layout;
4144 mddev->chunk_sectors = mddev->new_chunk_sectors;
4145 mddev->delta_disks = 0;
4146 mddev->reshape_backwards = 0;
4147 mddev->degraded = 0;
4148 spin_unlock(&mddev->lock);
4149
4150 if (oldpers->sync_request == NULL &&
4151 mddev->external) {
4152 /* We are converting from a no-redundancy array
4153 * to a redundancy array and metadata is managed
4154 * externally so we need to be sure that writes
4155 * won't block due to a need to transition
4156 * clean->dirty
4157 * until external management is started.
4158 */
4159 mddev->in_sync = 0;
4160 mddev->safemode_delay = 0;
4161 mddev->safemode = 0;
4162 }
4163
4164 oldpers->free(mddev, oldpriv);
4165
4166 if (oldpers->sync_request == NULL &&
4167 pers->sync_request != NULL) {
4168 /* need to add the md_redundancy_group */
4169 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
4170 pr_warn("md: cannot register extra attributes for %s\n",
4171 mdname(mddev));
4172 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
4173 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
4174 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
4175 }
4176 if (oldpers->sync_request != NULL &&
4177 pers->sync_request == NULL) {
4178 /* need to remove the md_redundancy_group */
4179 if (mddev->to_remove == NULL)
4180 mddev->to_remove = &md_redundancy_group;
4181 }
4182
4183 put_pers(oldpers);
4184
4185 rdev_for_each(rdev, mddev) {
4186 if (rdev->raid_disk < 0)
4187 continue;
4188 if (rdev->new_raid_disk >= mddev->raid_disks)
4189 rdev->new_raid_disk = -1;
4190 if (rdev->new_raid_disk == rdev->raid_disk)
4191 continue;
4192 sysfs_unlink_rdev(mddev, rdev);
4193 }
4194 rdev_for_each(rdev, mddev) {
4195 if (rdev->raid_disk < 0)
4196 continue;
4197 if (rdev->new_raid_disk == rdev->raid_disk)
4198 continue;
4199 rdev->raid_disk = rdev->new_raid_disk;
4200 if (rdev->raid_disk < 0)
4201 clear_bit(In_sync, &rdev->flags);
4202 else {
4203 if (sysfs_link_rdev(mddev, rdev))
4204 pr_warn("md: cannot register rd%d for %s after level change\n",
4205 rdev->raid_disk, mdname(mddev));
4206 }
4207 }
4208
4209 if (pers->sync_request == NULL) {
4210 /* this is now an array without redundancy, so
4211 * it must always be in_sync
4212 */
4213 mddev->in_sync = 1;
4214 timer_delete_sync(&mddev->safemode_timer);
4215 }
4216 pers->run(mddev);
4217 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
4218 if (!mddev->thread)
4219 md_update_sb(mddev, 1);
4220 sysfs_notify_dirent_safe(mddev->sysfs_level);
4221 md_new_event();
4222 rv = len;
4223 out_unlock:
4224 mddev_unlock_and_resume(mddev);
4225 return rv;
4226 }
4227
4228 static struct md_sysfs_entry md_level =
4229 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
4230
4231 static ssize_t
4232 new_level_show(struct mddev *mddev, char *page)
4233 {
4234 return sprintf(page, "%d\n", mddev->new_level);
4235 }
4236
4237 static ssize_t
4238 new_level_store(struct mddev *mddev, const char *buf, size_t len)
4239 {
4240 unsigned int n;
4241 int err;
4242
4243 err = kstrtouint(buf, 10, &n);
4244 if (err < 0)
4245 return err;
4246 err = mddev_lock(mddev);
4247 if (err)
4248 return err;
4249
4250 mddev->new_level = n;
4251 md_update_sb(mddev, 1);
4252
4253 mddev_unlock(mddev);
4254 return len;
4255 }
4256 static struct md_sysfs_entry md_new_level =
4257 __ATTR(new_level, 0664, new_level_show, new_level_store);
4258
4259 static ssize_t
4260 bitmap_type_show(struct mddev *mddev, char *page)
4261 {
4262 struct md_submodule_head *head;
4263 unsigned long i;
4264 ssize_t len = 0;
4265
4266 if (mddev->bitmap_id == ID_BITMAP_NONE)
4267 len += sprintf(page + len, "[none] ");
4268 else
4269 len += sprintf(page + len, "none ");
4270
4271 xa_lock(&md_submodule);
4272 xa_for_each(&md_submodule, i, head) {
4273 if (head->type != MD_BITMAP)
4274 continue;
4275
4276 if (mddev->bitmap_id == head->id)
4277 len += sprintf(page + len, "[%s] ", head->name);
4278 else
4279 len += sprintf(page + len, "%s ", head->name);
4280 }
4281 xa_unlock(&md_submodule);
4282
4283 len += sprintf(page + len, "\n");
4284 return len;
4285 }
4286
4287 static ssize_t
4288 bitmap_type_store(struct mddev *mddev, const char *buf, size_t len)
4289 {
4290 struct md_submodule_head *head;
4291 enum md_submodule_id id;
4292 unsigned long i;
4293 int err = 0;
4294
4295 xa_lock(&md_submodule);
4296
4297 if (mddev->bitmap_ops) {
4298 err = -EBUSY;
4299 goto out;
4300 }
4301
4302 if (cmd_match(buf, "none")) {
4303 mddev->bitmap_id = ID_BITMAP_NONE;
4304 goto out;
4305 }
4306
4307 xa_for_each(&md_submodule, i, head) {
4308 if (head->type == MD_BITMAP && cmd_match(buf, head->name)) {
4309 mddev->bitmap_id = head->id;
4310 goto out;
4311 }
4312 }
4313
4314 err = kstrtoint(buf, 10, &id);
4315 if (err)
4316 goto out;
4317
4318 if (id == ID_BITMAP_NONE) {
4319 mddev->bitmap_id = id;
4320 goto out;
4321 }
4322
4323 head = xa_load(&md_submodule, id);
4324 if (head && head->type == MD_BITMAP) {
4325 mddev->bitmap_id = id;
4326 goto out;
4327 }
4328
4329 err = -ENOENT;
4330
4331 out:
4332 xa_unlock(&md_submodule);
4333 return err ? err : len;
4334 }
4335
4336 static struct md_sysfs_entry md_bitmap_type =
4337 __ATTR(bitmap_type, 0664, bitmap_type_show, bitmap_type_store);
4338
4339 static ssize_t
4340 layout_show(struct mddev *mddev, char *page)
4341 {
4342 /* just a number, not meaningful for all levels */
4343 if (mddev->reshape_position != MaxSector &&
4344 mddev->layout != mddev->new_layout)
4345 return sprintf(page, "%d (%d)\n",
4346 mddev->new_layout, mddev->layout);
4347 return sprintf(page, "%d\n", mddev->layout);
4348 }
4349
4350 static ssize_t
4351 layout_store(struct mddev *mddev, const char *buf, size_t len)
4352 {
4353 unsigned int n;
4354 int err;
4355
4356 err = kstrtouint(buf, 10, &n);
4357 if (err < 0)
4358 return err;
4359 err = mddev_lock(mddev);
4360 if (err)
4361 return err;
4362
4363 if (mddev->pers) {
4364 if (mddev->pers->check_reshape == NULL)
4365 err = -EBUSY;
4366 else if (!md_is_rdwr(mddev))
4367 err = -EROFS;
4368 else {
4369 mddev->new_layout = n;
4370 err = mddev->pers->check_reshape(mddev);
4371 if (err)
4372 mddev->new_layout = mddev->layout;
4373 }
4374 } else {
4375 mddev->new_layout = n;
4376 if (mddev->reshape_position == MaxSector)
4377 mddev->layout = n;
4378 }
4379 mddev_unlock(mddev);
4380 return err ?: len;
4381 }
4382 static struct md_sysfs_entry md_layout =
4383 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
4384
4385 static ssize_t
4386 raid_disks_show(struct mddev *mddev, char *page)
4387 {
4388 if (mddev->raid_disks == 0)
4389 return 0;
4390 if (mddev->reshape_position != MaxSector &&
4391 mddev->delta_disks != 0)
4392 return sprintf(page, "%d (%d)\n", mddev->raid_disks,
4393 mddev->raid_disks - mddev->delta_disks);
4394 return sprintf(page, "%d\n", mddev->raid_disks);
4395 }
4396
4397 static int update_raid_disks(struct mddev *mddev, int raid_disks);
4398
4399 static ssize_t
4400 raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
4401 {
4402 unsigned int n;
4403 int err;
4404
4405 err = kstrtouint(buf, 10, &n);
4406 if (err < 0)
4407 return err;
4408
4409 err = mddev_suspend_and_lock(mddev);
4410 if (err)
4411 return err;
4412 if (mddev->pers)
4413 err = update_raid_disks(mddev, n);
4414 else if (mddev->reshape_position != MaxSector) {
4415 struct md_rdev *rdev;
4416 int olddisks = mddev->raid_disks - mddev->delta_disks;
4417
4418 err = -EINVAL;
4419 rdev_for_each(rdev, mddev) {
4420 if (olddisks < n &&
4421 rdev->data_offset < rdev->new_data_offset)
4422 goto out_unlock;
4423 if (olddisks > n &&
4424 rdev->data_offset > rdev->new_data_offset)
4425 goto out_unlock;
4426 }
4427 err = 0;
4428 mddev->delta_disks = n - olddisks;
4429 mddev->raid_disks = n;
4430 mddev->reshape_backwards = (mddev->delta_disks < 0);
4431 } else
4432 mddev->raid_disks = n;
4433 out_unlock:
4434 mddev_unlock_and_resume(mddev);
4435 return err ? err : len;
4436 }
4437 static struct md_sysfs_entry md_raid_disks =
4438 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
4439
4440 static ssize_t
4441 uuid_show(struct mddev *mddev, char *page)
4442 {
4443 return sprintf(page, "%pU\n", mddev->uuid);
4444 }
4445 static struct md_sysfs_entry md_uuid =
4446 __ATTR(uuid, S_IRUGO, uuid_show, NULL);
4447
4448 static ssize_t
4449 chunk_size_show(struct mddev *mddev, char *page)
4450 {
4451 if (mddev->reshape_position != MaxSector &&
4452 mddev->chunk_sectors != mddev->new_chunk_sectors)
4453 return sprintf(page, "%d (%d)\n",
4454 mddev->new_chunk_sectors << 9,
4455 mddev->chunk_sectors << 9);
4456 return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
4457 }
4458
4459 static ssize_t
4460 chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
4461 {
4462 unsigned long n;
4463 int err;
4464
4465 err = kstrtoul(buf, 10, &n);
4466 if (err < 0)
4467 return err;
4468
4469 err = mddev_lock(mddev);
4470 if (err)
4471 return err;
4472 if (mddev->pers) {
4473 if (mddev->pers->check_reshape == NULL)
4474 err = -EBUSY;
4475 else if (!md_is_rdwr(mddev))
4476 err = -EROFS;
4477 else {
4478 mddev->new_chunk_sectors = n >> 9;
4479 err = mddev->pers->check_reshape(mddev);
4480 if (err)
4481 mddev->new_chunk_sectors = mddev->chunk_sectors;
4482 }
4483 } else {
4484 mddev->new_chunk_sectors = n >> 9;
4485 if (mddev->reshape_position == MaxSector)
4486 mddev->chunk_sectors = n >> 9;
4487 }
4488 mddev_unlock(mddev);
4489 return err ?: len;
4490 }
4491 static struct md_sysfs_entry md_chunk_size =
4492 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
4493
4494 static ssize_t
4495 resync_start_show(struct mddev *mddev, char *page)
4496 {
4497 if (mddev->resync_offset == MaxSector)
4498 return sprintf(page, "none\n");
4499 return sprintf(page, "%llu\n", (unsigned long long)mddev->resync_offset);
4500 }
4501
4502 static ssize_t
4503 resync_start_store(struct mddev *mddev, const char *buf, size_t len)
4504 {
4505 unsigned long long n;
4506 int err;
4507
4508 if (cmd_match(buf, "none"))
4509 n = MaxSector;
4510 else {
4511 err = kstrtoull(buf, 10, &n);
4512 if (err < 0)
4513 return err;
4514 if (n != (sector_t)n)
4515 return -EINVAL;
4516 }
4517
4518 err = mddev_lock(mddev);
4519 if (err)
4520 return err;
4521 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
4522 err = -EBUSY;
4523
4524 if (!err) {
4525 mddev->resync_offset = n;
4526 if (mddev->pers)
4527 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
4528 }
4529 mddev_unlock(mddev);
4530 return err ?: len;
4531 }
4532 static struct md_sysfs_entry md_resync_start =
4533 __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR,
4534 resync_start_show, resync_start_store);
4535
4536 /*
4537 * The array state can be:
4538 *
4539 * clear
4540 * No devices, no size, no level
4541 * Equivalent to STOP_ARRAY ioctl
4542 * inactive
4543 * May have some settings, but array is not active
4544 * all IO results in error
4545 * When written, doesn't tear down array, but just stops it
4546 * suspended (not supported yet)
4547 * All IO requests will block. The array can be reconfigured.
4548 * Writing this, if accepted, will block until array is quiescent
4549 * readonly
4550 * no resync can happen. no superblocks get written.
4551 * write requests fail
4552 * read-auto
4553 * like readonly, but behaves like 'clean' on a write request.
4554 *
4555 * clean - no pending writes, but otherwise active.
4556 * When written to inactive array, starts without resync
4557 * If a write request arrives then
4558 * if metadata is known, mark 'dirty' and switch to 'active'.
4559 * if not known, block and switch to write-pending
4560 * If written to an active array that has pending writes, then fails.
4561 * active
4562 * fully active: IO and resync can be happening.
4563 * When written to inactive array, starts with resync
4564 *
4565 * write-pending
4566 * clean, but writes are blocked waiting for 'active' to be written.
4567 *
4568 * active-idle
4569 * like active, but no writes have been seen for a while (100msec).
4570 *
4571 * broken
4572 * Array is failed. It's useful because mounted-arrays aren't stopped
4573 * when array is failed, so this state will at least alert the user that
4574 * something is wrong.
4575 */
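/*
 * Typical usage from userspace (illustrative; the path assumes the
 * usual /sys/block/mdX/md/ layout):
 *
 *	cat /sys/block/md0/md/array_state        -> e.g. "clean"
 *	echo readonly > /sys/block/md0/md/array_state
 *
 * Only the states handled in array_state_store() below can be written;
 * suspended, write-pending, active-idle and broken are reported but
 * rejected on write.
 */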
4576 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
4577 write_pending, active_idle, broken, bad_word};
4578 static char *array_states[] = {
4579 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
4580 "write-pending", "active-idle", "broken", NULL };
4581
4582 static int match_word(const char *word, char **list)
4583 {
4584 int n;
4585 for (n=0; list[n]; n++)
4586 if (cmd_match(word, list[n]))
4587 break;
4588 return n;
4589 }
4590
4591 static ssize_t
4592 array_state_show(struct mddev *mddev, char *page)
4593 {
4594 enum array_state st = inactive;
4595
4596 if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) {
4597 switch(mddev->ro) {
4598 case MD_RDONLY:
4599 st = readonly;
4600 break;
4601 case MD_AUTO_READ:
4602 st = read_auto;
4603 break;
4604 case MD_RDWR:
4605 spin_lock(&mddev->lock);
4606 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
4607 st = write_pending;
4608 else if (mddev->in_sync)
4609 st = clean;
4610 else if (mddev->safemode)
4611 st = active_idle;
4612 else
4613 st = active;
4614 spin_unlock(&mddev->lock);
4615 }
4616
4617 if (test_bit(MD_BROKEN, &mddev->flags) && st == clean)
4618 st = broken;
4619 } else {
4620 if (list_empty(&mddev->disks) &&
4621 mddev->raid_disks == 0 &&
4622 mddev->dev_sectors == 0)
4623 st = clear;
4624 else
4625 st = inactive;
4626 }
4627 return sprintf(page, "%s\n", array_states[st]);
4628 }
4629
4630 static int do_md_stop(struct mddev *mddev, int ro);
4631 static int md_set_readonly(struct mddev *mddev);
4632 static int restart_array(struct mddev *mddev);
4633
4634 static ssize_t
4635 array_state_store(struct mddev *mddev, const char *buf, size_t len)
4636 {
4637 int err = 0;
4638 enum array_state st = match_word(buf, array_states);
4639
4640 /* No lock dependent actions */
4641 switch (st) {
4642 case suspended: /* not supported yet */
4643 case write_pending: /* cannot be set */
4644 case active_idle: /* cannot be set */
4645 case broken: /* cannot be set */
4646 case bad_word:
4647 return -EINVAL;
4648 case clear:
4649 case readonly:
4650 case inactive:
4651 case read_auto:
4652 if (!mddev->pers || !md_is_rdwr(mddev))
4653 break;
4654 /* write sysfs will not open mddev and opener should be 0 */
4655 err = mddev_set_closing_and_sync_blockdev(mddev, 0);
4656 if (err)
4657 return err;
4658 break;
4659 default:
4660 break;
4661 }
4662
4663 if (mddev->pers && (st == active || st == clean) &&
4664 mddev->ro != MD_RDONLY) {
4665 /* don't take reconfig_mutex when toggling between
4666 * clean and active
4667 */
4668 spin_lock(&mddev->lock);
4669 if (st == active) {
4670 restart_array(mddev);
4671 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4672 md_wakeup_thread(mddev->thread);
4673 wake_up(&mddev->sb_wait);
4674 } else /* st == clean */ {
4675 restart_array(mddev);
4676 if (!set_in_sync(mddev))
4677 err = -EBUSY;
4678 }
4679 if (!err)
4680 sysfs_notify_dirent_safe(mddev->sysfs_state);
4681 spin_unlock(&mddev->lock);
4682 return err ?: len;
4683 }
4684 err = mddev_lock(mddev);
4685 if (err)
4686 return err;
4687
4688 switch (st) {
4689 case inactive:
4690 /* stop an active array, return 0 otherwise */
4691 if (mddev->pers)
4692 err = do_md_stop(mddev, 2);
4693 break;
4694 case clear:
4695 err = do_md_stop(mddev, 0);
4696 break;
4697 case readonly:
4698 if (mddev->pers)
4699 err = md_set_readonly(mddev);
4700 else {
4701 mddev->ro = MD_RDONLY;
4702 set_disk_ro(mddev->gendisk, 1);
4703 err = do_md_run(mddev);
4704 }
4705 break;
4706 case read_auto:
4707 if (mddev->pers) {
4708 if (md_is_rdwr(mddev))
4709 err = md_set_readonly(mddev);
4710 else if (mddev->ro == MD_RDONLY)
4711 err = restart_array(mddev);
4712 if (err == 0) {
4713 mddev->ro = MD_AUTO_READ;
4714 set_disk_ro(mddev->gendisk, 0);
4715 }
4716 } else {
4717 mddev->ro = MD_AUTO_READ;
4718 err = do_md_run(mddev);
4719 }
4720 break;
4721 case clean:
4722 if (mddev->pers) {
4723 err = restart_array(mddev);
4724 if (err)
4725 break;
4726 spin_lock(&mddev->lock);
4727 if (!set_in_sync(mddev))
4728 err = -EBUSY;
4729 spin_unlock(&mddev->lock);
4730 } else
4731 err = -EINVAL;
4732 break;
4733 case active:
4734 if (mddev->pers) {
4735 err = restart_array(mddev);
4736 if (err)
4737 break;
4738 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4739 wake_up(&mddev->sb_wait);
4740 err = 0;
4741 } else {
4742 mddev->ro = MD_RDWR;
4743 set_disk_ro(mddev->gendisk, 0);
4744 err = do_md_run(mddev);
4745 }
4746 break;
4747 default:
4748 err = -EINVAL;
4749 break;
4750 }
4751
4752 if (!err) {
4753 if (mddev->hold_active == UNTIL_IOCTL)
4754 mddev->hold_active = 0;
4755 sysfs_notify_dirent_safe(mddev->sysfs_state);
4756 }
4757 mddev_unlock(mddev);
4758
4759 if (st == readonly || st == read_auto || st == inactive ||
4760 (err && st == clear))
4761 clear_bit(MD_CLOSING, &mddev->flags);
4762
4763 return err ?: len;
4764 }
4765 static struct md_sysfs_entry md_array_state =
4766 __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
4767
4768 static ssize_t
4769 max_corrected_read_errors_show(struct mddev *mddev, char *page) {
4770 return sprintf(page, "%d\n",
4771 atomic_read(&mddev->max_corr_read_errors));
4772 }
4773
4774 static ssize_t
4775 max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
4776 {
4777 unsigned int n;
4778 int rv;
4779
4780 rv = kstrtouint(buf, 10, &n);
4781 if (rv < 0)
4782 return rv;
4783 if (n > INT_MAX)
4784 return -EINVAL;
4785 atomic_set(&mddev->max_corr_read_errors, n);
4786 return len;
4787 }
4788
4789 static struct md_sysfs_entry max_corr_read_errors =
4790 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
4791 max_corrected_read_errors_store);
4792
4793 static ssize_t
4794 null_show(struct mddev *mddev, char *page)
4795 {
4796 return -EINVAL;
4797 }
4798
4799 static ssize_t
4800 new_dev_store(struct mddev *mddev, const char *buf, size_t len)
4801 {
4802 /* buf must be %d:%d\n? giving major and minor numbers */
4803 /* The new device is added to the array.
4804 * If the array has a persistent superblock, we read the
4805 * superblock to initialise info and check validity.
4806 * Otherwise, only checking done is that in bind_rdev_to_array,
4807 * which mainly checks size.
4808 */
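/*
 * For example, "echo 8:16 > new_dev" (illustrative device numbers)
 * asks md to import the block device with major 8, minor 16 and bind
 * it to this array.
 */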
4809 char *e;
4810 int major = simple_strtoul(buf, &e, 10);
4811 int minor;
4812 dev_t dev;
4813 struct md_rdev *rdev;
4814 int err;
4815
4816 if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
4817 return -EINVAL;
4818 minor = simple_strtoul(e+1, &e, 10);
4819 if (*e && *e != '\n')
4820 return -EINVAL;
4821 dev = MKDEV(major, minor);
4822 if (major != MAJOR(dev) ||
4823 minor != MINOR(dev))
4824 return -EOVERFLOW;
4825
4826 err = mddev_suspend_and_lock(mddev);
4827 if (err)
4828 return err;
4829 if (mddev->persistent) {
4830 rdev = md_import_device(dev, mddev->major_version,
4831 mddev->minor_version);
4832 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
4833 struct md_rdev *rdev0
4834 = list_entry(mddev->disks.next,
4835 struct md_rdev, same_set);
4836 err = super_types[mddev->major_version]
4837 .load_super(rdev, rdev0, mddev->minor_version);
4838 if (err < 0)
4839 goto out;
4840 }
4841 } else if (mddev->external)
4842 rdev = md_import_device(dev, -2, -1);
4843 else
4844 rdev = md_import_device(dev, -1, -1);
4845
4846 if (IS_ERR(rdev)) {
4847 mddev_unlock_and_resume(mddev);
4848 return PTR_ERR(rdev);
4849 }
4850 err = bind_rdev_to_array(rdev, mddev);
4851 out:
4852 if (err)
4853 export_rdev(rdev, mddev);
4854 mddev_unlock_and_resume(mddev);
4855 if (!err)
4856 md_new_event();
4857 return err ? err : len;
4858 }
4859
4860 static struct md_sysfs_entry md_new_device =
4861 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
4862
4863 static ssize_t
4864 bitmap_store(struct mddev *mddev, const char *buf, size_t len)
4865 {
4866 char *end;
4867 unsigned long chunk, end_chunk;
4868 int err;
4869
4870 if (!md_bitmap_enabled(mddev, false))
4871 return len;
4872
4873 err = mddev_lock(mddev);
4874 if (err)
4875 return err;
4876 if (!mddev->bitmap)
4877 goto out;
4878 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
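/*
 * e.g. writing "100-200 500" (illustrative chunk numbers) dirties
 * bitmap chunks 100 through 200 and chunk 500 in one write.
 */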
4879 while (*buf) {
4880 chunk = end_chunk = simple_strtoul(buf, &end, 0);
4881 if (buf == end)
4882 break;
4883
4884 if (*end == '-') { /* range */
4885 buf = end + 1;
4886 end_chunk = simple_strtoul(buf, &end, 0);
4887 if (buf == end)
4888 break;
4889 }
4890
4891 if (*end && !isspace(*end))
4892 break;
4893
4894 mddev->bitmap_ops->dirty_bits(mddev, chunk, end_chunk);
4895 buf = skip_spaces(end);
4896 }
4897 mddev->bitmap_ops->unplug(mddev, true); /* flush the bits to disk */
4898 out:
4899 mddev_unlock(mddev);
4900 return len;
4901 }
4902
4903 static struct md_sysfs_entry md_bitmap =
4904 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
4905
4906 static ssize_t
4907 size_show(struct mddev *mddev, char *page)
4908 {
4909 return sprintf(page, "%llu\n",
4910 (unsigned long long)mddev->dev_sectors / 2);
4911 }
4912
4913 static int update_size(struct mddev *mddev, sector_t num_sectors);
4914
4915 static ssize_t
4916 size_store(struct mddev *mddev, const char *buf, size_t len)
4917 {
4918 /* If array is inactive, we can reduce the component size, but
4919 * not increase it (except from 0).
4920 * If array is active, we can try an on-line resize
4921 */
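/*
 * The value is in KiB: e.g. "echo 10485760 > component_size"
 * (illustrative) requests 10 GiB per component device, which
 * strict_blocks_to_sectors() converts to 512-byte sectors below.
 */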
4922 sector_t sectors;
4923 int err = strict_blocks_to_sectors(buf, &sectors);
4924
4925 if (err < 0)
4926 return err;
4927 err = mddev_lock(mddev);
4928 if (err)
4929 return err;
4930 if (mddev->pers) {
4931 err = update_size(mddev, sectors);
4932 if (err == 0)
4933 md_update_sb(mddev, 1);
4934 } else {
4935 if (mddev->dev_sectors == 0 ||
4936 mddev->dev_sectors > sectors)
4937 mddev->dev_sectors = sectors;
4938 else
4939 err = -ENOSPC;
4940 }
4941 mddev_unlock(mddev);
4942 return err ? err : len;
4943 }
4944
4945 static struct md_sysfs_entry md_size =
4946 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
4947
4948 /* Metadata version.
4949 * This is one of
4950 * 'none' for arrays with no metadata (good luck...)
4951 * 'external' for arrays with externally managed metadata,
4952 * or N.M for internally known formats
4953 */
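/*
 * For example (illustrative): "echo 1.2 > metadata_version" selects the
 * v1.2 in-kernel superblock format, "echo external:imsm" hands metadata
 * management to userspace, and "echo none" drops metadata entirely.
 * Internal formats can only be changed while no devices are attached.
 */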
4954 static ssize_t
4955 metadata_show(struct mddev *mddev, char *page)
4956 {
4957 if (mddev->persistent)
4958 return sprintf(page, "%d.%d\n",
4959 mddev->major_version, mddev->minor_version);
4960 else if (mddev->external)
4961 return sprintf(page, "external:%s\n", mddev->metadata_type);
4962 else
4963 return sprintf(page, "none\n");
4964 }
4965
4966 static ssize_t
4967 metadata_store(struct mddev *mddev, const char *buf, size_t len)
4968 {
4969 int major, minor;
4970 char *e;
4971 int err;
4972 /* Changing the details of 'external' metadata is
4973 * always permitted. Otherwise there must be
4974 * no devices attached to the array.
4975 */
4976
4977 err = mddev_lock(mddev);
4978 if (err)
4979 return err;
4980 err = -EBUSY;
4981 if (mddev->external && strncmp(buf, "external:", 9) == 0)
4982 ;
4983 else if (!list_empty(&mddev->disks))
4984 goto out_unlock;
4985
4986 err = 0;
4987 if (cmd_match(buf, "none")) {
4988 mddev->persistent = 0;
4989 mddev->external = 0;
4990 mddev->major_version = 0;
4991 mddev->minor_version = 90;
4992 goto out_unlock;
4993 }
4994 if (strncmp(buf, "external:", 9) == 0) {
4995 size_t namelen = len-9;
4996 if (namelen >= sizeof(mddev->metadata_type))
4997 namelen = sizeof(mddev->metadata_type)-1;
4998 memcpy(mddev->metadata_type, buf+9, namelen);
4999 mddev->metadata_type[namelen] = 0;
5000 if (namelen && mddev->metadata_type[namelen-1] == '\n')
5001 mddev->metadata_type[--namelen] = 0;
5002 mddev->persistent = 0;
5003 mddev->external = 1;
5004 mddev->major_version = 0;
5005 mddev->minor_version = 90;
5006 goto out_unlock;
5007 }
5008 major = simple_strtoul(buf, &e, 10);
5009 err = -EINVAL;
5010 if (e==buf || *e != '.')
5011 goto out_unlock;
5012 buf = e+1;
5013 minor = simple_strtoul(buf, &e, 10);
5014 if (e==buf || (*e && *e != '\n') )
5015 goto out_unlock;
5016 err = -ENOENT;
5017 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
5018 goto out_unlock;
5019 mddev->major_version = major;
5020 mddev->minor_version = minor;
5021 mddev->persistent = 1;
5022 mddev->external = 0;
5023 err = 0;
5024 out_unlock:
5025 mddev_unlock(mddev);
5026 return err ?: len;
5027 }
5028
5029 static struct md_sysfs_entry md_metadata =
5030 __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
5031
5032 static bool rdev_needs_recovery(struct md_rdev *rdev, sector_t sectors)
5033 {
5034 return rdev->raid_disk >= 0 &&
5035 !test_bit(Journal, &rdev->flags) &&
5036 !test_bit(Faulty, &rdev->flags) &&
5037 !test_bit(In_sync, &rdev->flags) &&
5038 rdev->recovery_offset < sectors;
5039 }
5040
5041 static enum sync_action md_get_active_sync_action(struct mddev *mddev)
5042 {
5043 struct md_rdev *rdev;
5044 bool is_recover = false;
5045
5046 if (mddev->resync_offset < MaxSector)
5047 return ACTION_RESYNC;
5048
5049 if (mddev->reshape_position != MaxSector)
5050 return ACTION_RESHAPE;
5051
5052 rcu_read_lock();
5053 rdev_for_each_rcu(rdev, mddev) {
5054 if (rdev_needs_recovery(rdev, MaxSector)) {
5055 is_recover = true;
5056 break;
5057 }
5058 }
5059 rcu_read_unlock();
5060
5061 return is_recover ? ACTION_RECOVER : ACTION_IDLE;
5062 }
5063
5064 enum sync_action md_sync_action(struct mddev *mddev)
5065 {
5066 unsigned long recovery = mddev->recovery;
5067 enum sync_action active_action;
5068
5069 /*
5070 * frozen has the highest priority, means running sync_thread will be
5071 * stopped immediately, and no new sync_thread can start.
5072 */
5073 if (test_bit(MD_RECOVERY_FROZEN, &recovery))
5074 return ACTION_FROZEN;
5075
5076 /*
5077 * read-only array can't register sync_thread, and it can only
5078 * add/remove spares.
5079 */
5080 if (!md_is_rdwr(mddev))
5081 return ACTION_IDLE;
5082
5083 /*
5084 * idle means no sync_thread is running, and no new sync_thread is
5085 * requested.
5086 */
5087 if (!test_bit(MD_RECOVERY_RUNNING, &recovery) &&
5088 !test_bit(MD_RECOVERY_NEEDED, &recovery))
5089 return ACTION_IDLE;
5090
5091 /*
5092 * Check if any sync operation (resync/recover/reshape) is
5093 * currently active. This ensures that only one sync operation
5094 * can run at a time. Returns the type of active operation, or
5095 * ACTION_IDLE if none are active.
5096 */
5097 active_action = md_get_active_sync_action(mddev);
5098 if (active_action != ACTION_IDLE)
5099 return active_action;
5100
5101 if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
5102 return ACTION_RESHAPE;
5103
5104 if (test_bit(MD_RECOVERY_RECOVER, &recovery))
5105 return ACTION_RECOVER;
5106
5107 if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
5108 /*
5109 * MD_RECOVERY_CHECK must be paired with
5110 * MD_RECOVERY_REQUESTED.
5111 */
5112 if (test_bit(MD_RECOVERY_CHECK, &recovery))
5113 return ACTION_CHECK;
5114 if (test_bit(MD_RECOVERY_REQUESTED, &recovery))
5115 return ACTION_REPAIR;
5116 return ACTION_RESYNC;
5117 }
5118
5119 /*
5120 * MD_RECOVERY_NEEDED or MD_RECOVERY_RUNNING is set, but no
5121 * sync_action is specified.
5122 */
5123 return ACTION_IDLE;
5124 }
5125
5126 enum sync_action md_sync_action_by_name(const char *page)
5127 {
5128 enum sync_action action;
5129
5130 for (action = 0; action < NR_SYNC_ACTIONS; ++action) {
5131 if (cmd_match(page, action_name[action]))
5132 return action;
5133 }
5134
5135 return NR_SYNC_ACTIONS;
5136 }
5137
5138 const char *md_sync_action_name(enum sync_action action)
5139 {
5140 return action_name[action];
5141 }
5142
5143 static ssize_t
5144 action_show(struct mddev *mddev, char *page)
5145 {
5146 enum sync_action action = md_sync_action(mddev);
5147
5148 return sprintf(page, "%s\n", md_sync_action_name(action));
5149 }
5150
5151 /**
5152 * stop_sync_thread() - wait for sync_thread to stop if it's running.
5153 * @mddev: the array.
5154 * @locked: if set, reconfig_mutex will still be held after this function
5155 * return; if not set, reconfig_mutex will be released after this
5156 * function return.
5157 */
5158 static void stop_sync_thread(struct mddev *mddev, bool locked)
5159 {
5160 int sync_seq = atomic_read(&mddev->sync_seq);
5161
5162 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
5163 if (!locked)
5164 mddev_unlock(mddev);
5165 return;
5166 }
5167
5168 mddev_unlock(mddev);
5169
5170 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5171 /*
5172 * Thread might be blocked waiting for metadata update which will now
5173 * never happen
5174 */
5175 md_wakeup_thread_directly(&mddev->sync_thread);
5176 if (work_pending(&mddev->sync_work))
5177 flush_work(&mddev->sync_work);
5178
5179 wait_event(resync_wait,
5180 !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
5181 (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery) &&
5182 sync_seq != atomic_read(&mddev->sync_seq)));
5183
5184 if (locked)
5185 mddev_lock_nointr(mddev);
5186 }
5187
5188 void md_idle_sync_thread(struct mddev *mddev)
5189 {
5190 lockdep_assert_held(&mddev->reconfig_mutex);
5191
5192 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5193 stop_sync_thread(mddev, true);
5194 }
5195 EXPORT_SYMBOL_GPL(md_idle_sync_thread);
5196
5197 void md_frozen_sync_thread(struct mddev *mddev)
5198 {
5199 lockdep_assert_held(&mddev->reconfig_mutex);
5200
5201 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5202 stop_sync_thread(mddev, true);
5203 }
5204 EXPORT_SYMBOL_GPL(md_frozen_sync_thread);
5205
5206 void md_unfrozen_sync_thread(struct mddev *mddev)
5207 {
5208 lockdep_assert_held(&mddev->reconfig_mutex);
5209
5210 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5211 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5212 md_wakeup_thread(mddev->thread);
5213 sysfs_notify_dirent_safe(mddev->sysfs_action);
5214 }
5215 EXPORT_SYMBOL_GPL(md_unfrozen_sync_thread);
5216
5217 static int mddev_start_reshape(struct mddev *mddev)
5218 {
5219 int ret;
5220
5221 if (mddev->pers->start_reshape == NULL)
5222 return -EINVAL;
5223
5224 if (mddev->reshape_position == MaxSector ||
5225 mddev->pers->check_reshape == NULL ||
5226 mddev->pers->check_reshape(mddev)) {
5227 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5228 ret = mddev->pers->start_reshape(mddev);
5229 if (ret)
5230 return ret;
5231 } else {
5232 /*
5233 * If reshape is still in progress, and md_check_recovery() can
5234 * continue to reshape, don't restart reshape because data can
5235 * be corrupted for raid456.
5236 */
5237 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5238 }
5239
5240 sysfs_notify_dirent_safe(mddev->sysfs_degraded);
5241 return 0;
5242 }
5243
5244 static ssize_t
5245 action_store(struct mddev *mddev, const char *page, size_t len)
5246 {
5247 int ret;
5248 enum sync_action action;
5249
5250 if (!mddev->pers || !mddev->pers->sync_request)
5251 return -EINVAL;
5252
5253 retry:
5254 if (work_busy(&mddev->sync_work))
5255 flush_work(&mddev->sync_work);
5256
5257 ret = mddev_lock(mddev);
5258 if (ret)
5259 return ret;
5260
5261 if (work_busy(&mddev->sync_work)) {
5262 mddev_unlock(mddev);
5263 goto retry;
5264 }
5265
5266 action = md_sync_action_by_name(page);
5267
5268 /* TODO: mdadm relies on "idle" to start sync_thread. */
5269 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
5270 switch (action) {
5271 case ACTION_FROZEN:
5272 md_frozen_sync_thread(mddev);
5273 ret = len;
5274 goto out;
5275 case ACTION_IDLE:
5276 md_idle_sync_thread(mddev);
5277 break;
5278 case ACTION_RESHAPE:
5279 case ACTION_RECOVER:
5280 case ACTION_CHECK:
5281 case ACTION_REPAIR:
5282 case ACTION_RESYNC:
5283 ret = -EBUSY;
5284 goto out;
5285 default:
5286 ret = -EINVAL;
5287 goto out;
5288 }
5289 } else {
5290 switch (action) {
5291 case ACTION_FROZEN:
5292 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5293 ret = len;
5294 goto out;
5295 case ACTION_RESHAPE:
5296 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5297 ret = mddev_start_reshape(mddev);
5298 if (ret)
5299 goto out;
5300 break;
5301 case ACTION_RECOVER:
5302 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5303 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5304 break;
5305 case ACTION_CHECK:
5306 set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
5307 fallthrough;
5308 case ACTION_REPAIR:
5309 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
5310 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5311 fallthrough;
5312 case ACTION_RESYNC:
5313 case ACTION_IDLE:
5314 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5315 break;
5316 default:
5317 ret = -EINVAL;
5318 goto out;
5319 }
5320 }
5321
5322 if (mddev->ro == MD_AUTO_READ) {
5323 /* A write to sync_action is enough to justify
5324 * canceling read-auto mode
5325 */
5326 mddev->ro = MD_RDWR;
5327 md_wakeup_thread(mddev->sync_thread);
5328 }
5329
5330 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5331 md_wakeup_thread(mddev->thread);
5332 sysfs_notify_dirent_safe(mddev->sysfs_action);
5333 ret = len;
5334
5335 out:
5336 mddev_unlock(mddev);
5337 return ret;
5338 }
5339
5340 static struct md_sysfs_entry md_scan_mode =
5341 __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
5342
5343 static ssize_t
5344 last_sync_action_show(struct mddev *mddev, char *page)
5345 {
5346 return sprintf(page, "%s\n",
5347 md_sync_action_name(mddev->last_sync_action));
5348 }
5349
5350 static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action);
5351
5352 static ssize_t
5353 mismatch_cnt_show(struct mddev *mddev, char *page)
5354 {
5355 return sprintf(page, "%llu\n",
5356 (unsigned long long)
5357 atomic64_read(&mddev->resync_mismatches));
5358 }
5359
5360 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
5361
5362 static ssize_t
5363 sync_min_show(struct mddev *mddev, char *page)
5364 {
5365 return sprintf(page, "%d (%s)\n", speed_min(mddev),
5366 mddev->sync_speed_min ? "local" : "system");
5367 }
5368
5369 static ssize_t
5370 sync_min_store(struct mddev *mddev, const char *buf, size_t len)
5371 {
5372 unsigned int min;
5373 int rv;
5374
5375 if (strncmp(buf, "system", 6) == 0) {
5376 min = 0;
5377 } else {
5378 rv = kstrtouint(buf, 10, &min);
5379 if (rv < 0)
5380 return rv;
5381 if (min == 0)
5382 return -EINVAL;
5383 }
5384 mddev->sync_speed_min = min;
5385 return len;
5386 }
5387
5388 static struct md_sysfs_entry md_sync_min =
5389 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
5390
5391 static ssize_t
5392 sync_max_show(struct mddev *mddev, char *page)
5393 {
5394 return sprintf(page, "%d (%s)\n", speed_max(mddev),
5395 mddev->sync_speed_max ? "local" : "system");
5396 }
5397
5398 static ssize_t
5399 sync_max_store(struct mddev *mddev, const char *buf, size_t len)
5400 {
5401 unsigned int max;
5402 int rv;
5403
5404 if (strncmp(buf, "system", 6) == 0) {
5405 max = 0;
5406 } else {
5407 rv = kstrtouint(buf, 10, &max);
5408 if (rv < 0)
5409 return rv;
5410 if (max == 0)
5411 return -EINVAL;
5412 }
5413 mddev->sync_speed_max = max;
5414 return len;
5415 }
5416
5417 static struct md_sysfs_entry md_sync_max =
5418 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
5419
5420 static ssize_t
5421 sync_io_depth_show(struct mddev *mddev, char *page)
5422 {
5423 return sprintf(page, "%d (%s)\n", sync_io_depth(mddev),
5424 mddev->sync_io_depth ? "local" : "system");
5425 }
5426
5427 static ssize_t
5428 sync_io_depth_store(struct mddev *mddev, const char *buf, size_t len)
5429 {
5430 unsigned int max;
5431 int rv;
5432
5433 if (strncmp(buf, "system", 6) == 0) {
5434 max = 0;
5435 } else {
5436 rv = kstrtouint(buf, 10, &max);
5437 if (rv < 0)
5438 return rv;
5439 if (max == 0)
5440 return -EINVAL;
5441 }
5442 mddev->sync_io_depth = max;
5443 return len;
5444 }
5445
5446 static struct md_sysfs_entry md_sync_io_depth =
5447 __ATTR_RW(sync_io_depth);
5448
5449 static ssize_t
5450 degraded_show(struct mddev *mddev, char *page)
5451 {
5452 return sprintf(page, "%d\n", mddev->degraded);
5453 }
5454 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
5455
5456 static ssize_t
5457 sync_force_parallel_show(struct mddev *mddev, char *page)
5458 {
5459 return sprintf(page, "%d\n", mddev->parallel_resync);
5460 }
5461
5462 static ssize_t
5463 sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
5464 {
5465 long n;
5466
5467 if (kstrtol(buf, 10, &n))
5468 return -EINVAL;
5469
5470 if (n != 0 && n != 1)
5471 return -EINVAL;
5472
5473 mddev->parallel_resync = n;
5474
5475 if (mddev->sync_thread)
5476 wake_up(&resync_wait);
5477
5478 return len;
5479 }
5480
5481 /* force parallel resync, even with shared block devices */
5482 static struct md_sysfs_entry md_sync_force_parallel =
5483 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
5484 sync_force_parallel_show, sync_force_parallel_store);
5485
5486 static ssize_t
5487 sync_speed_show(struct mddev *mddev, char *page)
5488 {
5489 unsigned long resync, dt, db;
5490 if (mddev->curr_resync == MD_RESYNC_NONE)
5491 return sprintf(page, "none\n");
5492 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
5493 dt = (jiffies - mddev->resync_mark) / HZ;
5494 if (!dt) dt++;
5495 db = resync - mddev->resync_mark_cnt;
5496 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
5497 }
5498
5499 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
5500
5501 static ssize_t
5502 sync_completed_show(struct mddev *mddev, char *page)
5503 {
5504 unsigned long long max_sectors, resync;
5505
5506 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5507 return sprintf(page, "none\n");
5508
5509 if (mddev->curr_resync == MD_RESYNC_YIELDED ||
5510 mddev->curr_resync == MD_RESYNC_DELAYED)
5511 return sprintf(page, "delayed\n");
5512
5513 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
5514 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
5515 max_sectors = mddev->resync_max_sectors;
5516 else
5517 max_sectors = mddev->dev_sectors;
5518
5519 resync = mddev->curr_resync_completed;
5520 return sprintf(page, "%llu / %llu\n", resync, max_sectors);
5521 }
5522
5523 static struct md_sysfs_entry md_sync_completed =
5524 __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL);
5525
5526 static ssize_t
5527 min_sync_show(struct mddev *mddev, char *page)
5528 {
5529 return sprintf(page, "%llu\n",
5530 (unsigned long long)mddev->resync_min);
5531 }
5532 static ssize_t
5533 min_sync_store(struct mddev *mddev, const char *buf, size_t len)
5534 {
5535 unsigned long long min;
5536 int err;
5537
5538 if (kstrtoull(buf, 10, &min))
5539 return -EINVAL;
5540
5541 spin_lock(&mddev->lock);
5542 err = -EINVAL;
5543 if (min > mddev->resync_max)
5544 goto out_unlock;
5545
5546 err = -EBUSY;
5547 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5548 goto out_unlock;
5549
5550 /* Round down to multiple of 4K for safety */
5551 mddev->resync_min = round_down(min, 8);
5552 err = 0;
5553
5554 out_unlock:
5555 spin_unlock(&mddev->lock);
5556 return err ?: len;
5557 }
5558
5559 static struct md_sysfs_entry md_min_sync =
5560 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
5561
5562 static ssize_t
5563 max_sync_show(struct mddev *mddev, char *page)
5564 {
5565 if (mddev->resync_max == MaxSector)
5566 return sprintf(page, "max\n");
5567 else
5568 return sprintf(page, "%llu\n",
5569 (unsigned long long)mddev->resync_max);
5570 }
5571 static ssize_t
5572 max_sync_store(struct mddev *mddev, const char *buf, size_t len)
5573 {
5574 int err;
5575 spin_lock(&mddev->lock);
5576 if (strncmp(buf, "max", 3) == 0)
5577 mddev->resync_max = MaxSector;
5578 else {
5579 unsigned long long max;
5580 int chunk;
5581
5582 err = -EINVAL;
5583 if (kstrtoull(buf, 10, &max))
5584 goto out_unlock;
5585 if (max < mddev->resync_min)
5586 goto out_unlock;
5587
5588 err = -EBUSY;
5589 if (max < mddev->resync_max && md_is_rdwr(mddev) &&
5590 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5591 goto out_unlock;
5592
5593 /* Must be a multiple of chunk_size */
5594 chunk = mddev->chunk_sectors;
5595 if (chunk) {
5596 sector_t temp = max;
5597
5598 err = -EINVAL;
5599 if (sector_div(temp, chunk))
5600 goto out_unlock;
5601 }
5602 mddev->resync_max = max;
5603 }
5604 wake_up(&mddev->recovery_wait);
5605 err = 0;
5606 out_unlock:
5607 spin_unlock(&mddev->lock);
5608 return err ?: len;
5609 }
5610
5611 static struct md_sysfs_entry md_max_sync =
5612 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
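/*
 * Illustrative usage: sync_min/sync_max bound the sector range of the
 * next resync/check, so a scrub can be limited to part of the array
 * (the sector values below are only example numbers):
 *
 *   echo 0 > /sys/block/md0/md/sync_min
 *   echo 2097152 > /sys/block/md0/md/sync_max
 *   echo check > /sys/block/md0/md/sync_action
 */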
5613
5614 static ssize_t
5615 suspend_lo_show(struct mddev *mddev, char *page)
5616 {
5617 return sprintf(page, "%llu\n",
5618 (unsigned long long)READ_ONCE(mddev->suspend_lo));
5619 }
5620
5621 static ssize_t
5622 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
5623 {
5624 unsigned long long new;
5625 int err;
5626
5627 err = kstrtoull(buf, 10, &new);
5628 if (err < 0)
5629 return err;
5630 if (new != (sector_t)new)
5631 return -EINVAL;
5632
5633 err = mddev_suspend(mddev, true);
5634 if (err)
5635 return err;
5636
5637 WRITE_ONCE(mddev->suspend_lo, new);
5638 mddev_resume(mddev);
5639
5640 return len;
5641 }
5642 static struct md_sysfs_entry md_suspend_lo =
5643 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
5644
5645 static ssize_t
5646 suspend_hi_show(struct mddev *mddev, char *page)
5647 {
5648 return sprintf(page, "%llu\n",
5649 (unsigned long long)READ_ONCE(mddev->suspend_hi));
5650 }
5651
5652 static ssize_t
5653 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
5654 {
5655 unsigned long long new;
5656 int err;
5657
5658 err = kstrtoull(buf, 10, &new);
5659 if (err < 0)
5660 return err;
5661 if (new != (sector_t)new)
5662 return -EINVAL;
5663
5664 err = mddev_suspend(mddev, true);
5665 if (err)
5666 return err;
5667
5668 WRITE_ONCE(mddev->suspend_hi, new);
5669 mddev_resume(mddev);
5670
5671 return len;
5672 }
5673 static struct md_sysfs_entry md_suspend_hi =
5674 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
5675
5676 static ssize_t
5677 reshape_position_show(struct mddev *mddev, char *page)
5678 {
5679 if (mddev->reshape_position != MaxSector)
5680 return sprintf(page, "%llu\n",
5681 (unsigned long long)mddev->reshape_position);
5682 strcpy(page, "none\n");
5683 return 5;
5684 }
5685
5686 static ssize_t
5687 reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
5688 {
5689 struct md_rdev *rdev;
5690 unsigned long long new;
5691 int err;
5692
5693 err = kstrtoull(buf, 10, &new);
5694 if (err < 0)
5695 return err;
5696 if (new != (sector_t)new)
5697 return -EINVAL;
5698 err = mddev_lock(mddev);
5699 if (err)
5700 return err;
5701 err = -EBUSY;
5702 if (mddev->pers)
5703 goto unlock;
5704 mddev->reshape_position = new;
5705 mddev->delta_disks = 0;
5706 mddev->reshape_backwards = 0;
5707 mddev->new_level = mddev->level;
5708 mddev->new_layout = mddev->layout;
5709 mddev->new_chunk_sectors = mddev->chunk_sectors;
5710 rdev_for_each(rdev, mddev)
5711 rdev->new_data_offset = rdev->data_offset;
5712 err = 0;
5713 unlock:
5714 mddev_unlock(mddev);
5715 return err ?: len;
5716 }
5717
5718 static struct md_sysfs_entry md_reshape_position =
5719 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
5720 reshape_position_store);
5721
5722 static ssize_t
5723 reshape_direction_show(struct mddev *mddev, char *page)
5724 {
5725 return sprintf(page, "%s\n",
5726 mddev->reshape_backwards ? "backwards" : "forwards");
5727 }
5728
5729 static ssize_t
5730 reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
5731 {
5732 int backwards = 0;
5733 int err;
5734
5735 if (cmd_match(buf, "forwards"))
5736 backwards = 0;
5737 else if (cmd_match(buf, "backwards"))
5738 backwards = 1;
5739 else
5740 return -EINVAL;
5741 if (mddev->reshape_backwards == backwards)
5742 return len;
5743
5744 err = mddev_lock(mddev);
5745 if (err)
5746 return err;
5747 /* check if we are allowed to change */
5748 if (mddev->delta_disks)
5749 err = -EBUSY;
5750 else if (mddev->persistent &&
5751 mddev->major_version == 0)
5752 err = -EINVAL;
5753 else
5754 mddev->reshape_backwards = backwards;
5755 mddev_unlock(mddev);
5756 return err ?: len;
5757 }
5758
5759 static struct md_sysfs_entry md_reshape_direction =
5760 __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
5761 reshape_direction_store);
5762
5763 static ssize_t
5764 array_size_show(struct mddev *mddev, char *page)
5765 {
5766 if (mddev->external_size)
5767 return sprintf(page, "%llu\n",
5768 (unsigned long long)mddev->array_sectors/2);
5769 else
5770 return sprintf(page, "default\n");
5771 }
5772
5773 static ssize_t
5774 array_size_store(struct mddev *mddev, const char *buf, size_t len)
5775 {
5776 sector_t sectors;
5777 int err;
5778
5779 err = mddev_lock(mddev);
5780 if (err)
5781 return err;
5782
5783 /* cluster raid doesn't support change array_sectors */
5784 if (mddev_is_clustered(mddev)) {
5785 mddev_unlock(mddev);
5786 return -EINVAL;
5787 }
5788
5789 if (strncmp(buf, "default", 7) == 0) {
5790 if (mddev->pers)
5791 sectors = mddev->pers->size(mddev, 0, 0);
5792 else
5793 sectors = mddev->array_sectors;
5794
5795 mddev->external_size = 0;
5796 } else {
5797 if (strict_blocks_to_sectors(buf, &sectors) < 0)
5798 err = -EINVAL;
5799 else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
5800 err = -E2BIG;
5801 else
5802 mddev->external_size = 1;
5803 }
5804
5805 if (!err) {
5806 mddev->array_sectors = sectors;
5807 if (mddev->pers)
5808 set_capacity_and_notify(mddev->gendisk,
5809 mddev->array_sectors);
5810 }
5811 mddev_unlock(mddev);
5812 return err ?: len;
5813 }
5814
5815 static struct md_sysfs_entry md_array_size =
5816 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
5817 array_size_store);
5818
5819 static ssize_t
5820 consistency_policy_show(struct mddev *mddev, char *page)
5821 {
5822 int ret;
5823
5824 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
5825 ret = sprintf(page, "journal\n");
5826 } else if (test_bit(MD_HAS_PPL, &mddev->flags)) {
5827 ret = sprintf(page, "ppl\n");
5828 } else if (mddev->bitmap) {
5829 ret = sprintf(page, "bitmap\n");
5830 } else if (mddev->pers) {
5831 if (mddev->pers->sync_request)
5832 ret = sprintf(page, "resync\n");
5833 else
5834 ret = sprintf(page, "none\n");
5835 } else {
5836 ret = sprintf(page, "unknown\n");
5837 }
5838
5839 return ret;
5840 }
5841
5842 static ssize_t
5843 consistency_policy_store(struct mddev *mddev, const char *buf, size_t len)
5844 {
5845 int err = 0;
5846
5847 if (mddev->pers) {
5848 if (mddev->pers->change_consistency_policy)
5849 err = mddev->pers->change_consistency_policy(mddev, buf);
5850 else
5851 err = -EBUSY;
5852 } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) {
5853 set_bit(MD_HAS_PPL, &mddev->flags);
5854 } else {
5855 err = -EINVAL;
5856 }
5857
5858 return err ? err : len;
5859 }
5860
5861 static struct md_sysfs_entry md_consistency_policy =
5862 __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
5863 consistency_policy_store);
5864
5865 static ssize_t fail_last_dev_show(struct mddev *mddev, char *page)
5866 {
5867 return sprintf(page, "%d\n", mddev->fail_last_dev);
5868 }
5869
5870 /*
5871 * Setting fail_last_dev to true allows the last device to be forcibly
5872 * removed from RAID1/RAID10.
5873 */
5874 static ssize_t
5875 fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len)
5876 {
5877 int ret;
5878 bool value;
5879
5880 ret = kstrtobool(buf, &value);
5881 if (ret)
5882 return ret;
5883
5884 if (value != mddev->fail_last_dev)
5885 mddev->fail_last_dev = value;
5886
5887 return len;
5888 }
5889 static struct md_sysfs_entry md_fail_last_dev =
5890 __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show,
5891 fail_last_dev_store);
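/*
 * Illustrative usage (path assumed to follow the usual md sysfs layout):
 *
 *   echo 1 > /sys/block/md0/md/fail_last_dev
 */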
5892
5893 static ssize_t serialize_policy_show(struct mddev *mddev, char *page)
5894 {
5895 if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1))
5896 return sprintf(page, "n/a\n");
5897 else
5898 return sprintf(page, "%d\n", mddev->serialize_policy);
5899 }
5900
5901 /*
5902 * Setting serialize_policy to true enforces that write IO is not
5903 * reordered for raid1.
5904 */
5905 static ssize_t
5906 serialize_policy_store(struct mddev *mddev, const char *buf, size_t len)
5907 {
5908 int err;
5909 bool value;
5910
5911 err = kstrtobool(buf, &value);
5912 if (err)
5913 return err;
5914
5915 if (value == mddev->serialize_policy)
5916 return len;
5917
5918 err = mddev_suspend_and_lock(mddev);
5919 if (err)
5920 return err;
5921 if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) {
5922 pr_err("md: serialize_policy is only effective for raid1\n");
5923 err = -EINVAL;
5924 goto unlock;
5925 }
5926
5927 if (value)
5928 mddev_create_serial_pool(mddev, NULL);
5929 else
5930 mddev_destroy_serial_pool(mddev, NULL);
5931 mddev->serialize_policy = value;
5932 unlock:
5933 mddev_unlock_and_resume(mddev);
5934 return err ?: len;
5935 }
5936
5937 static struct md_sysfs_entry md_serialize_policy =
5938 __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show,
5939 serialize_policy_store);
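/*
 * Illustrative usage (raid1 arrays only; path assumed):
 *
 *   echo 1 > /sys/block/md0/md/serialize_policy
 */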
5940
5941 static int mddev_set_logical_block_size(struct mddev *mddev,
5942 unsigned int lbs)
5943 {
5944 int err = 0;
5945 struct queue_limits lim;
5946
5947 if (queue_logical_block_size(mddev->gendisk->queue) >= lbs) {
5948 pr_err("%s: Cannot set LBS smaller than mddev LBS %u\n",
5949 mdname(mddev), lbs);
5950 return -EINVAL;
5951 }
5952
5953 lim = queue_limits_start_update(mddev->gendisk->queue);
5954 lim.logical_block_size = lbs;
5955 pr_info("%s: logical_block_size is changed, data may be lost\n",
5956 mdname(mddev));
5957 err = queue_limits_commit_update(mddev->gendisk->queue, &lim);
5958 if (err)
5959 return err;
5960
5961 mddev->logical_block_size = lbs;
5962 /* New lbs will be written to superblock after array is running */
5963 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
5964 return 0;
5965 }
5966
5967 static ssize_t
5968 lbs_show(struct mddev *mddev, char *page)
5969 {
5970 return sprintf(page, "%u\n", mddev->logical_block_size);
5971 }
5972
5973 static ssize_t
5974 lbs_store(struct mddev *mddev, const char *buf, size_t len)
5975 {
5976 unsigned int lbs;
5977 int err = -EBUSY;
5978
5979 /* Only 1.x meta supports configurable LBS */
5980 if (mddev->major_version == 0)
5981 return -EINVAL;
5982
5983 err = kstrtouint(buf, 10, &lbs);
5984 if (err < 0)
5985 return -EINVAL;
5986
5987 if (mddev->pers) {
5988 unsigned int curr_lbs;
5989
5990 if (mddev->logical_block_size)
5991 return -EBUSY;
5992 /*
5993 * For forward compatibility, LBS is not configured by default
5994 * for arrays created by old kernels (<= 6.18).
5995 * If the user confirms there will be no rollback to an old kernel,
5996 * LBS can be enabled by writing the current LBS back; requiring the
5997 * current value prevents data loss from an LBS change.
5998 */
5999 curr_lbs = queue_logical_block_size(mddev->gendisk->queue);
6000 if (lbs != curr_lbs)
6001 return -EINVAL;
6002
6003 mddev->logical_block_size = curr_lbs;
6004 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6005 pr_info("%s: logical block size configured successfully, array will not be assembled in old kernels (<= 6.18)\n",
6006 mdname(mddev));
6007 return len;
6008 }
6009
6010 err = mddev_lock(mddev);
6011 if (err)
6012 goto unlock;
6013
6014 err = mddev_set_logical_block_size(mddev, lbs);
6015
6016 unlock:
6017 mddev_unlock(mddev);
6018 return err ?: len;
6019 }
6020
6021 static struct md_sysfs_entry md_logical_block_size =
6022 __ATTR(logical_block_size, 0644, lbs_show, lbs_store);
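/*
 * Illustrative usage: for a running array assembled from an old
 * superblock, write the current LBS back to opt in to persisting it
 * (the value 4096 below is just an example):
 *
 *   cat /sys/block/md0/md/logical_block_size
 *   echo 4096 > /sys/block/md0/md/logical_block_size
 */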
6023
6024 static struct attribute *md_default_attrs[] = {
6025 &md_level.attr,
6026 &md_new_level.attr,
6027 &md_bitmap_type.attr,
6028 &md_layout.attr,
6029 &md_raid_disks.attr,
6030 &md_uuid.attr,
6031 &md_chunk_size.attr,
6032 &md_size.attr,
6033 &md_resync_start.attr,
6034 &md_metadata.attr,
6035 &md_new_device.attr,
6036 &md_safe_delay.attr,
6037 &md_array_state.attr,
6038 &md_reshape_position.attr,
6039 &md_reshape_direction.attr,
6040 &md_array_size.attr,
6041 &max_corr_read_errors.attr,
6042 &md_consistency_policy.attr,
6043 &md_fail_last_dev.attr,
6044 &md_serialize_policy.attr,
6045 &md_logical_block_size.attr,
6046 NULL,
6047 };
6048
6049 static const struct attribute_group md_default_group = {
6050 .attrs = md_default_attrs,
6051 };
6052
6053 static struct attribute *md_redundancy_attrs[] = {
6054 &md_scan_mode.attr,
6055 &md_last_scan_mode.attr,
6056 &md_mismatches.attr,
6057 &md_sync_min.attr,
6058 &md_sync_max.attr,
6059 &md_sync_io_depth.attr,
6060 &md_sync_speed.attr,
6061 &md_sync_force_parallel.attr,
6062 &md_sync_completed.attr,
6063 &md_min_sync.attr,
6064 &md_max_sync.attr,
6065 &md_suspend_lo.attr,
6066 &md_suspend_hi.attr,
6067 &md_bitmap.attr,
6068 &md_degraded.attr,
6069 NULL,
6070 };
6071 static const struct attribute_group md_redundancy_group = {
6072 .name = NULL,
6073 .attrs = md_redundancy_attrs,
6074 };
6075
6076 static const struct attribute_group *md_attr_groups[] = {
6077 &md_default_group,
6078 NULL,
6079 };
6080
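/*
 * sysfs show/store entry points for the per-array "md" kobject. Both
 * take a reference on the mddev under all_mddevs_lock so the attribute
 * method never runs against an mddev that is being torn down.
 */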
6081 static ssize_t
6082 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
6083 {
6084 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
6085 struct mddev *mddev = container_of(kobj, struct mddev, kobj);
6086 ssize_t rv;
6087
6088 if (!entry->show)
6089 return -EIO;
6090 spin_lock(&all_mddevs_lock);
6091 if (!mddev_get(mddev)) {
6092 spin_unlock(&all_mddevs_lock);
6093 return -EBUSY;
6094 }
6095 spin_unlock(&all_mddevs_lock);
6096
6097 rv = entry->show(mddev, page);
6098 mddev_put(mddev);
6099 return rv;
6100 }
6101
6102 static ssize_t
6103 md_attr_store(struct kobject *kobj, struct attribute *attr,
6104 const char *page, size_t length)
6105 {
6106 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
6107 struct mddev *mddev = container_of(kobj, struct mddev, kobj);
6108 ssize_t rv;
6109 struct kernfs_node *kn = NULL;
6110
6111 if (!entry->store)
6112 return -EIO;
6113 if (!capable(CAP_SYS_ADMIN))
6114 return -EACCES;
6115
6116 if (entry->store == array_state_store && cmd_match(page, "clear"))
6117 kn = sysfs_break_active_protection(kobj, attr);
6118
6119 spin_lock(&all_mddevs_lock);
6120 if (!mddev_get(mddev)) {
6121 spin_unlock(&all_mddevs_lock);
6122 if (kn)
6123 sysfs_unbreak_active_protection(kn);
6124 return -EBUSY;
6125 }
6126 spin_unlock(&all_mddevs_lock);
6127 rv = entry->store(mddev, page, length);
6128 mddev_put(mddev);
6129
6130 if (kn)
6131 sysfs_unbreak_active_protection(kn);
6132
6133 return rv;
6134 }
6135
6136 static void md_kobj_release(struct kobject *ko)
6137 {
6138 struct mddev *mddev = container_of(ko, struct mddev, kobj);
6139
6140 if (legacy_async_del_gendisk) {
6141 if (mddev->sysfs_state)
6142 sysfs_put(mddev->sysfs_state);
6143 if (mddev->sysfs_level)
6144 sysfs_put(mddev->sysfs_level);
6145 del_gendisk(mddev->gendisk);
6146 }
6147 put_disk(mddev->gendisk);
6148 }
6149
6150 static const struct sysfs_ops md_sysfs_ops = {
6151 .show = md_attr_show,
6152 .store = md_attr_store,
6153 };
6154 static const struct kobj_type md_ktype = {
6155 .release = md_kobj_release,
6156 .sysfs_ops = &md_sysfs_ops,
6157 .default_groups = md_attr_groups,
6158 };
6159
6160 int mdp_major = 0;
6161
6162 /* stack the limit for all rdevs into lim */
6163 int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim,
6164 unsigned int flags)
6165 {
6166 struct md_rdev *rdev;
6167
6168 rdev_for_each(rdev, mddev) {
6169 queue_limits_stack_bdev(lim, rdev->bdev, rdev->data_offset,
6170 mddev->gendisk->disk_name);
6171 if ((flags & MDDEV_STACK_INTEGRITY) &&
6172 !queue_limits_stack_integrity_bdev(lim, rdev->bdev))
6173 return -EINVAL;
6174 }
6175
6176 /*
6177 * Until RAID gains folio support, the logical_block_size
6178 * must not be larger than the page size.
6179 */
6180 if (lim->logical_block_size > PAGE_SIZE) {
6181 pr_err("%s: logical_block_size must not be larger than PAGE_SIZE\n",
6182 mdname(mddev));
6183 return -EINVAL;
6184 }
6185
6186 /* Only 1.x meta needs to set logical block size */
6187 if (mddev->major_version == 0)
6188 return 0;
6189
6190 /*
6191 * Fix a forward compatibility issue: only set LBS by default for
6192 * new arrays (mddev->events == 0 indicates the array was just
6193 * created). When assembling an existing array, read LBS from the
6194 * superblock instead; LBS is 0 in superblocks created by old kernels.
6195 */
6196 if (!mddev->events) {
6197 pr_info("%s: array will not be assembled in old kernels that lack configurable LBS support (<= 6.18)\n",
6198 mdname(mddev));
6199 mddev->logical_block_size = lim->logical_block_size;
6200 }
6201
6202 if (!mddev->logical_block_size)
6203 pr_warn("%s: echo current LBS to md/logical_block_size to prevent data loss issues from LBS changes.\n"
6204 "\tNote: After setting, array will not be assembled in old kernels (<= 6.18)\n",
6205 mdname(mddev));
6206
6207 return 0;
6208 }
6209 EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits);
6210
6211 /* apply the extra stacking limits from a new rdev into mddev */
6212 int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev)
6213 {
6214 struct queue_limits lim;
6215
6216 if (mddev_is_dm(mddev))
6217 return 0;
6218
6219 if (queue_logical_block_size(rdev->bdev->bd_disk->queue) >
6220 queue_logical_block_size(mddev->gendisk->queue)) {
6221 pr_err("%s: incompatible logical_block_size, can not add\n",
6222 mdname(mddev));
6223 return -EINVAL;
6224 }
6225
6226 lim = queue_limits_start_update(mddev->gendisk->queue);
6227 queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset,
6228 mddev->gendisk->disk_name);
6229
6230 if (!queue_limits_stack_integrity_bdev(&lim, rdev->bdev)) {
6231 pr_err("%s: incompatible integrity profile for %pg\n",
6232 mdname(mddev), rdev->bdev);
6233 queue_limits_cancel_update(mddev->gendisk->queue);
6234 return -ENXIO;
6235 }
6236
6237 return queue_limits_commit_update(mddev->gendisk->queue, &lim);
6238 }
6239 EXPORT_SYMBOL_GPL(mddev_stack_new_rdev);
6240
6241 /* update the optimal I/O size after a reshape */
6242 void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes)
6243 {
6244 struct queue_limits lim;
6245
6246 if (mddev_is_dm(mddev))
6247 return;
6248
6249 /* don't bother updating io_opt if we can't suspend the array */
6250 if (mddev_suspend(mddev, false) < 0)
6251 return;
6252 lim = queue_limits_start_update(mddev->gendisk->queue);
6253 lim.io_opt = lim.io_min * nr_stripes;
6254 queue_limits_commit_update(mddev->gendisk->queue, &lim);
6255 mddev_resume(mddev);
6256 }
6257 EXPORT_SYMBOL_GPL(mddev_update_io_opt);
6258
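/*
 * Runs from md_misc_wq to drop the mddev's kobject reference; the
 * kobject release callback (md_kobj_release) then puts the gendisk.
 */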
6259 static void mddev_delayed_delete(struct work_struct *ws)
6260 {
6261 struct mddev *mddev = container_of(ws, struct mddev, del_work);
6262
6263 kobject_put(&mddev->kobj);
6264 }
6265
6266 void md_init_stacking_limits(struct queue_limits *lim)
6267 {
6268 blk_set_stacking_limits(lim);
6269 lim->features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA |
6270 BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT;
6271 }
6272 EXPORT_SYMBOL_GPL(md_init_stacking_limits);
6273
6274 struct mddev *md_alloc(dev_t dev, char *name)
6275 {
6276 /*
6277 * If dev is zero, name is the name of a device to allocate with
6278 * an arbitrary minor number. It will be "md_???"
6279 * If dev is non-zero it must be a device number with a MAJOR of
6280 * MD_MAJOR or mdp_major. In this case, if "name" is NULL, then
6281 * the device is being created by opening a node in /dev.
6282 * If "name" is not NULL, the device is being created by
6283 * writing to /sys/module/md_mod/parameters/new_array.
6284 */
6285 static DEFINE_MUTEX(disks_mutex);
6286 struct mddev *mddev;
6287 struct gendisk *disk;
6288 int partitioned;
6289 int shift;
6290 int unit;
6291 int error;
6292
6293 /*
6294 * Wait for any previous instance of this device to be completely
6295 * removed (mddev_delayed_delete).
6296 */
6297 flush_workqueue(md_misc_wq);
6298
6299 mutex_lock(&disks_mutex);
6300 mddev = mddev_alloc(dev);
6301 if (IS_ERR(mddev)) {
6302 error = PTR_ERR(mddev);
6303 goto out_unlock;
6304 }
6305
6306 partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
6307 shift = partitioned ? MdpMinorShift : 0;
6308 unit = MINOR(mddev->unit) >> shift;
6309
6310 if (name && !dev) {
6311 /* Need to ensure that 'name' is not a duplicate.
6312 */
6313 struct mddev *mddev2;
6314 spin_lock(&all_mddevs_lock);
6315
6316 list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
6317 if (mddev2->gendisk &&
6318 strcmp(mddev2->gendisk->disk_name, name) == 0) {
6319 spin_unlock(&all_mddevs_lock);
6320 error = -EEXIST;
6321 goto out_free_mddev;
6322 }
6323 spin_unlock(&all_mddevs_lock);
6324 }
6325 if (name && dev)
6326 /*
6327 * Creating /dev/mdNNN via "new_array", so adjust hold_active.
6328 */
6329 mddev->hold_active = UNTIL_STOP;
6330
6331 disk = blk_alloc_disk(NULL, NUMA_NO_NODE);
6332 if (IS_ERR(disk)) {
6333 error = PTR_ERR(disk);
6334 goto out_free_mddev;
6335 }
6336
6337 disk->major = MAJOR(mddev->unit);
6338 disk->first_minor = unit << shift;
6339 disk->minors = 1 << shift;
6340 if (name)
6341 strcpy(disk->disk_name, name);
6342 else if (partitioned)
6343 sprintf(disk->disk_name, "md_d%d", unit);
6344 else
6345 sprintf(disk->disk_name, "md%d", unit);
6346 disk->fops = &md_fops;
6347 disk->private_data = mddev;
6348
6349 disk->events |= DISK_EVENT_MEDIA_CHANGE;
6350 mddev->gendisk = disk;
6351 error = add_disk(disk);
6352 if (error)
6353 goto out_put_disk;
6354
6355 kobject_init(&mddev->kobj, &md_ktype);
6356 error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md");
6357 if (error) {
6358 /*
6359 * The disk is already live at this point. Clear the hold flag
6360 * and let mddev_put take care of the deletion, as it isn't any
6361 * different from a normal close on last release now.
6362 */
6363 mddev->hold_active = 0;
6364 mutex_unlock(&disks_mutex);
6365 mddev_put(mddev);
6366 return ERR_PTR(error);
6367 }
6368
6369 kobject_uevent(&mddev->kobj, KOBJ_ADD);
6370 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
6371 mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level");
6372 mutex_unlock(&disks_mutex);
6373 return mddev;
6374
6375 out_put_disk:
6376 put_disk(disk);
6377 out_free_mddev:
6378 mddev_free(mddev);
6379 out_unlock:
6380 mutex_unlock(&disks_mutex);
6381 return ERR_PTR(error);
6382 }
6383
6384 static int md_alloc_and_put(dev_t dev, char *name)
6385 {
6386 struct mddev *mddev = md_alloc(dev, name);
6387
6388 if (legacy_async_del_gendisk)
6389 pr_warn("md: async del_gendisk mode will be removed in future, please upgrade to mdadm-4.5+\n");
6390
6391 if (IS_ERR(mddev))
6392 return PTR_ERR(mddev);
6393 mddev_put(mddev);
6394 return 0;
6395 }
6396
6397 static void md_probe(dev_t dev)
6398 {
6399 if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512)
6400 return;
6401 if (create_on_open)
6402 md_alloc_and_put(dev, NULL);
6403 }
6404
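/*
 * Illustrative usage: arrays can be pre-created from user space by
 * writing a name to the module parameter (example names only):
 *
 *   echo md_home > /sys/module/md_mod/parameters/new_array
 *   echo md127 > /sys/module/md_mod/parameters/new_array
 */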
6405 static int add_named_array(const char *val, const struct kernel_param *kp)
6406 {
6407 /*
6408 * val must be "md_*" or "mdNNN".
6409 * For "md_*" we allocate an array with a large free minor number, and
6410 * set the name to val. val must not already be an active name.
6411 * For "mdNNN" we allocate an array with the minor number NNN
6412 * which must not already be in use.
6413 */
6414 int len = strlen(val);
6415 char buf[DISK_NAME_LEN];
6416 unsigned long devnum;
6417
6418 while (len && val[len-1] == '\n')
6419 len--;
6420 if (len >= DISK_NAME_LEN)
6421 return -E2BIG;
6422 strscpy(buf, val, len+1);
6423 if (strncmp(buf, "md_", 3) == 0)
6424 return md_alloc_and_put(0, buf);
6425 if (strncmp(buf, "md", 2) == 0 &&
6426 isdigit(buf[2]) &&
6427 kstrtoul(buf+2, 10, &devnum) == 0 &&
6428 devnum <= MINORMASK)
6429 return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL);
6430
6431 return -EINVAL;
6432 }
6433
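/*
 * Safemode timer: fires after a period with no writes. It sets
 * mddev->safemode and wakes the md thread, which can then mark the
 * array clean; externally managed metadata is notified via sysfs.
 */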
6434 static void md_safemode_timeout(struct timer_list *t)
6435 {
6436 struct mddev *mddev = timer_container_of(mddev, t, safemode_timer);
6437
6438 mddev->safemode = 1;
6439 if (mddev->external)
6440 sysfs_notify_dirent_safe(mddev->sysfs_state);
6441
6442 md_wakeup_thread(mddev->thread);
6443 }
6444
6445 static int start_dirty_degraded;
6446
6447 static int md_bitmap_create(struct mddev *mddev)
6448 {
6449 if (mddev->bitmap_id == ID_BITMAP_NONE)
6450 return -EINVAL;
6451
6452 if (!mddev_set_bitmap_ops(mddev))
6453 return -ENOENT;
6454
6455 return mddev->bitmap_ops->create(mddev);
6456 }
6457
6458 static void md_bitmap_destroy(struct mddev *mddev)
6459 {
6460 if (!md_bitmap_registered(mddev))
6461 return;
6462
6463 mddev->bitmap_ops->destroy(mddev);
6464 mddev_clear_bitmap_ops(mddev);
6465 }
6466
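/*
 * Start the array: analyze member superblocks, bind the personality,
 * create the bitmap and register the redundancy sysfs attributes.
 * The caller is expected to hold the reconfig mutex; do_md_run()
 * finishes the remaining setup (bitmap load, capacity notification).
 */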
6467 int md_run(struct mddev *mddev)
6468 {
6469 int err;
6470 struct md_rdev *rdev;
6471 struct md_personality *pers;
6472 bool nowait = true;
6473
6474 if (list_empty(&mddev->disks))
6475 /* cannot run an array with no devices.. */
6476 return -EINVAL;
6477
6478 if (mddev->pers)
6479 return -EBUSY;
6480 /* Cannot run until previous stop completes properly */
6481 if (mddev->sysfs_active)
6482 return -EBUSY;
6483
6484 /*
6485 * Analyze all RAID superblock(s)
6486 */
6487 if (!mddev->raid_disks) {
6488 if (!mddev->persistent)
6489 return -EINVAL;
6490 err = analyze_sbs(mddev);
6491 if (err)
6492 return -EINVAL;
6493 }
6494
6495 if (mddev->level != LEVEL_NONE)
6496 request_module("md-level-%d", mddev->level);
6497 else if (mddev->clevel[0])
6498 request_module("md-%s", mddev->clevel);
6499
6500 /*
6501 * Drop all container device buffers, from now on
6502 * the only valid external interface is through the md
6503 * device.
6504 */
6505 mddev->has_superblocks = false;
6506 rdev_for_each(rdev, mddev) {
6507 if (test_bit(Faulty, &rdev->flags))
6508 continue;
6509 sync_blockdev(rdev->bdev);
6510 invalidate_bdev(rdev->bdev);
6511 if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) {
6512 mddev->ro = MD_RDONLY;
6513 if (!mddev_is_dm(mddev))
6514 set_disk_ro(mddev->gendisk, 1);
6515 }
6516
6517 if (rdev->sb_page)
6518 mddev->has_superblocks = true;
6519
6520 /* Perform some consistency tests on the device.
6521 * We don't want the data to overlap the metadata.
6522 * Internal bitmap issues have been handled elsewhere.
6523 */
6524 if (rdev->meta_bdev) {
6525 /* Nothing to check */;
6526 } else if (rdev->data_offset < rdev->sb_start) {
6527 if (mddev->dev_sectors &&
6528 rdev->data_offset + mddev->dev_sectors
6529 > rdev->sb_start) {
6530 pr_warn("md: %s: data overlaps metadata\n",
6531 mdname(mddev));
6532 return -EINVAL;
6533 }
6534 } else {
6535 if (rdev->sb_start + rdev->sb_size/512
6536 > rdev->data_offset) {
6537 pr_warn("md: %s: metadata overlaps data\n",
6538 mdname(mddev));
6539 return -EINVAL;
6540 }
6541 }
6542 sysfs_notify_dirent_safe(rdev->sysfs_state);
6543 nowait = nowait && bdev_nowait(rdev->bdev);
6544 }
6545
6546 pers = get_pers(mddev->level, mddev->clevel);
6547 if (!pers)
6548 return -EINVAL;
6549 if (mddev->level != pers->head.id) {
6550 mddev->level = pers->head.id;
6551 mddev->new_level = pers->head.id;
6552 }
6553 strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel));
6554
6555 if (mddev->reshape_position != MaxSector &&
6556 pers->start_reshape == NULL) {
6557 /* This personality cannot handle reshaping... */
6558 put_pers(pers);
6559 return -EINVAL;
6560 }
6561
6562 if (pers->sync_request) {
6563 /* Warn if this is a potentially silly
6564 * configuration.
6565 */
6566 struct md_rdev *rdev2;
6567 int warned = 0;
6568
6569 rdev_for_each(rdev, mddev)
6570 rdev_for_each(rdev2, mddev) {
6571 if (rdev < rdev2 &&
6572 rdev->bdev->bd_disk ==
6573 rdev2->bdev->bd_disk) {
6574 pr_warn("%s: WARNING: %pg appears to be on the same physical disk as %pg.\n",
6575 mdname(mddev),
6576 rdev->bdev,
6577 rdev2->bdev);
6578 warned = 1;
6579 }
6580 }
6581
6582 if (warned)
6583 pr_warn("True protection against single-disk failure might be compromised.\n");
6584 }
6585
6586 /* dm-raid expects sync_thread to be frozen until resume */
6587 if (mddev->gendisk)
6588 mddev->recovery = 0;
6589
6590 /* may be over-ridden by personality */
6591 mddev->resync_max_sectors = mddev->dev_sectors;
6592
6593 mddev->ok_start_degraded = start_dirty_degraded;
6594
6595 if (start_readonly && md_is_rdwr(mddev))
6596 mddev->ro = MD_AUTO_READ; /* read-only, but switch on first write */
6597
6598 err = pers->run(mddev);
6599 if (err)
6600 pr_warn("md: pers->run() failed ...\n");
6601 else if (pers->size(mddev, 0, 0) < mddev->array_sectors) {
6602 WARN_ONCE(!mddev->external_size,
6603 "%s: default size too small, but 'external_size' not in effect?\n",
6604 __func__);
6605 pr_warn("md: invalid array_size %llu > default size %llu\n",
6606 (unsigned long long)mddev->array_sectors / 2,
6607 (unsigned long long)pers->size(mddev, 0, 0) / 2);
6608 err = -EINVAL;
6609 }
6610 if (err == 0 && pers->sync_request &&
6611 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
6612 err = md_bitmap_create(mddev);
6613 if (err)
6614 pr_warn("%s: failed to create bitmap (%d)\n",
6615 mdname(mddev), err);
6616 }
6617 if (err)
6618 goto bitmap_abort;
6619
6620 if (mddev->bitmap_info.max_write_behind > 0) {
6621 bool create_pool = false;
6622
6623 rdev_for_each(rdev, mddev) {
6624 if (test_bit(WriteMostly, &rdev->flags) &&
6625 rdev_init_serial(rdev))
6626 create_pool = true;
6627 }
6628 if (create_pool && mddev->serial_info_pool == NULL) {
6629 mddev->serial_info_pool =
6630 mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
6631 sizeof(struct serial_info));
6632 if (!mddev->serial_info_pool) {
6633 err = -ENOMEM;
6634 goto bitmap_abort;
6635 }
6636 }
6637 }
6638
6639 if (pers->sync_request) {
6640 if (mddev->kobj.sd &&
6641 sysfs_create_group(&mddev->kobj, &md_redundancy_group))
6642 pr_warn("md: cannot register extra attributes for %s\n",
6643 mdname(mddev));
6644 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
6645 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
6646 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
6647 } else if (mddev->ro == MD_AUTO_READ)
6648 mddev->ro = MD_RDWR;
6649
6650 atomic_set(&mddev->max_corr_read_errors,
6651 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
6652 mddev->safemode = 0;
6653 if (mddev_is_clustered(mddev))
6654 mddev->safemode_delay = 0;
6655 else
6656 mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
6657 mddev->in_sync = 1;
6658 smp_wmb();
6659 spin_lock(&mddev->lock);
6660 mddev->pers = pers;
6661 spin_unlock(&mddev->lock);
6662 rdev_for_each(rdev, mddev)
6663 if (rdev->raid_disk >= 0)
6664 sysfs_link_rdev(mddev, rdev); /* failure here is OK */
6665
6666 if (mddev->degraded && md_is_rdwr(mddev))
6667 /* This ensures that recovering status is reported immediately
6668 * via sysfs - until a lack of spares is confirmed.
6669 */
6670 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6671 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6672
6673 if (mddev->sb_flags)
6674 md_update_sb(mddev, 0);
6675
6676 md_new_event();
6677 return 0;
6678
6679 bitmap_abort:
6680 mddev_detach(mddev);
6681 if (mddev->private)
6682 pers->free(mddev, mddev->private);
6683 mddev->private = NULL;
6684 put_pers(pers);
6685 md_bitmap_destroy(mddev);
6686 return err;
6687 }
6688 EXPORT_SYMBOL_GPL(md_run);
6689
6690 int do_md_run(struct mddev *mddev)
6691 {
6692 int err;
6693
6694 set_bit(MD_NOT_READY, &mddev->flags);
6695 err = md_run(mddev);
6696 if (err)
6697 goto out;
6698
6699 if (md_bitmap_registered(mddev)) {
6700 err = mddev->bitmap_ops->load(mddev);
6701 if (err) {
6702 md_bitmap_destroy(mddev);
6703 goto out;
6704 }
6705 }
6706
6707 if (mddev_is_clustered(mddev))
6708 md_allow_write(mddev);
6709
6710 /* run start up tasks that require md_thread */
6711 md_start(mddev);
6712
6713 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
6714
6715 set_capacity_and_notify(mddev->gendisk, mddev->array_sectors);
6716 clear_bit(MD_NOT_READY, &mddev->flags);
6717 mddev->changed = 1;
6718 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
6719 sysfs_notify_dirent_safe(mddev->sysfs_state);
6720 sysfs_notify_dirent_safe(mddev->sysfs_action);
6721 sysfs_notify_dirent_safe(mddev->sysfs_degraded);
6722 out:
6723 clear_bit(MD_NOT_READY, &mddev->flags);
6724 return err;
6725 }
6726
6727 int md_start(struct mddev *mddev)
6728 {
6729 int ret = 0;
6730
6731 if (mddev->pers->start) {
6732 set_bit(MD_RECOVERY_WAIT, &mddev->recovery);
6733 ret = mddev->pers->start(mddev);
6734 clear_bit(MD_RECOVERY_WAIT, &mddev->recovery);
6735 md_wakeup_thread(mddev->sync_thread);
6736 }
6737 return ret;
6738 }
6739 EXPORT_SYMBOL_GPL(md_start);
6740
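/*
 * Switch an assembled read-only array back to read-write: refuse if any
 * member device is read-only or if a required journal device is missing,
 * otherwise clear safemode and kick off any pending recovery.
 */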
6741 static int restart_array(struct mddev *mddev)
6742 {
6743 struct gendisk *disk = mddev->gendisk;
6744 struct md_rdev *rdev;
6745 bool has_journal = false;
6746 bool has_readonly = false;
6747
6748 /* Complain if it has no devices */
6749 if (list_empty(&mddev->disks))
6750 return -ENXIO;
6751 if (!mddev->pers)
6752 return -EINVAL;
6753 if (md_is_rdwr(mddev))
6754 return -EBUSY;
6755
6756 rcu_read_lock();
6757 rdev_for_each_rcu(rdev, mddev) {
6758 if (test_bit(Journal, &rdev->flags) &&
6759 !test_bit(Faulty, &rdev->flags))
6760 has_journal = true;
6761 if (rdev_read_only(rdev))
6762 has_readonly = true;
6763 }
6764 rcu_read_unlock();
6765 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal)
6766 /* Don't restart rw with journal missing/faulty */
6767 return -EINVAL;
6768 if (has_readonly)
6769 return -EROFS;
6770
6771 mddev->safemode = 0;
6772 mddev->ro = MD_RDWR;
6773 set_disk_ro(disk, 0);
6774 pr_debug("md: %s switched to read-write mode.\n", mdname(mddev));
6775 /* Kick recovery or resync if necessary */
6776 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6777 md_wakeup_thread(mddev->sync_thread);
6778 sysfs_notify_dirent_safe(mddev->sysfs_state);
6779 return 0;
6780 }
6781
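/*
 * Reset (almost) every mddev field to its default after the array has
 * been stopped and its member devices exported, so the same mddev can
 * be reused for a new array.
 */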
6782 static void md_clean(struct mddev *mddev)
6783 {
6784 mddev->array_sectors = 0;
6785 mddev->external_size = 0;
6786 mddev->dev_sectors = 0;
6787 mddev->raid_disks = 0;
6788 mddev->resync_offset = 0;
6789 mddev->resync_min = 0;
6790 mddev->resync_max = MaxSector;
6791 mddev->reshape_position = MaxSector;
6792 /* we still need mddev->external in export_rdev, do not clear it yet */
6793 mddev->persistent = 0;
6794 mddev->level = LEVEL_NONE;
6795 mddev->clevel[0] = 0;
6796
6797 /*
6798 * For legacy_async_del_gendisk mode, it can stop the array in the
6799 * middle of assembling it, then it still can access the array. So
6800 * it needs to clear MD_CLOSING. If not legacy_async_del_gendisk,
6801 * it can't open the array again after stopping it. So it doesn't
6802 * clear MD_CLOSING.
6803 */
6804 if (legacy_async_del_gendisk && mddev->hold_active) {
6805 clear_bit(MD_CLOSING, &mddev->flags);
6806 } else {
6807 /* if UNTIL_STOP is set, it's cleared here */
6808 mddev->hold_active = 0;
6809 /* Don't clear MD_CLOSING, or mddev can be opened again. */
6810 mddev->flags &= BIT_ULL_MASK(MD_CLOSING);
6811 }
6812 mddev->sb_flags = 0;
6813 mddev->ro = MD_RDWR;
6814 mddev->metadata_type[0] = 0;
6815 mddev->chunk_sectors = 0;
6816 mddev->ctime = mddev->utime = 0;
6817 mddev->layout = 0;
6818 mddev->logical_block_size = 0;
6819 mddev->max_disks = 0;
6820 mddev->events = 0;
6821 mddev->can_decrease_events = 0;
6822 mddev->delta_disks = 0;
6823 mddev->reshape_backwards = 0;
6824 mddev->new_level = LEVEL_NONE;
6825 mddev->new_layout = 0;
6826 mddev->new_chunk_sectors = 0;
6827 mddev->curr_resync = MD_RESYNC_NONE;
6828 atomic64_set(&mddev->resync_mismatches, 0);
6829 mddev->suspend_lo = mddev->suspend_hi = 0;
6830 mddev->sync_speed_min = mddev->sync_speed_max = 0;
6831 mddev->recovery = 0;
6832 mddev->in_sync = 0;
6833 mddev->changed = 0;
6834 mddev->degraded = 0;
6835 mddev->safemode = 0;
6836 mddev->private = NULL;
6837 mddev->cluster_info = NULL;
6838 mddev->bitmap_info.offset = 0;
6839 mddev->bitmap_info.default_offset = 0;
6840 mddev->bitmap_info.default_space = 0;
6841 mddev->bitmap_info.chunksize = 0;
6842 mddev->bitmap_info.daemon_sleep = 0;
6843 mddev->bitmap_info.max_write_behind = 0;
6844 mddev->bitmap_info.nodes = 0;
6845 }
6846
6847 static void __md_stop_writes(struct mddev *mddev)
6848 {
6849 timer_delete_sync(&mddev->safemode_timer);
6850
6851 if (mddev->pers && mddev->pers->quiesce) {
6852 mddev->pers->quiesce(mddev, 1);
6853 mddev->pers->quiesce(mddev, 0);
6854 }
6855
6856 if (md_bitmap_enabled(mddev, true))
6857 mddev->bitmap_ops->flush(mddev);
6858
6859 if (md_is_rdwr(mddev) &&
6860 ((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
6861 mddev->sb_flags)) {
6862 /* mark array as shutdown cleanly */
6863 if (!mddev_is_clustered(mddev))
6864 mddev->in_sync = 1;
6865 md_update_sb(mddev, 1);
6866 }
6867 /* disable policy to guarantee rdevs free resources for serialization */
6868 mddev->serialize_policy = 0;
6869 mddev_destroy_serial_pool(mddev, NULL);
6870 }
6871
6872 void md_stop_writes(struct mddev *mddev)
6873 {
6874 mddev_lock_nointr(mddev);
6875 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6876 stop_sync_thread(mddev, true);
6877 __md_stop_writes(mddev);
6878 mddev_unlock(mddev);
6879 }
6880 EXPORT_SYMBOL_GPL(md_stop_writes);
6881
6882 static void mddev_detach(struct mddev *mddev)
6883 {
6884 if (md_bitmap_enabled(mddev, false))
6885 mddev->bitmap_ops->wait_behind_writes(mddev);
6886 if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) {
6887 mddev->pers->quiesce(mddev, 1);
6888 mddev->pers->quiesce(mddev, 0);
6889 }
6890 md_unregister_thread(mddev, &mddev->thread);
6891
6892 /* the unplug fn references 'conf' */
6893 if (!mddev_is_dm(mddev))
6894 blk_sync_queue(mddev->gendisk->queue);
6895 }
6896
6897 static void __md_stop(struct mddev *mddev)
6898 {
6899 struct md_personality *pers = mddev->pers;
6900
6901 md_bitmap_destroy(mddev);
6902 mddev_detach(mddev);
6903 spin_lock(&mddev->lock);
6904 mddev->pers = NULL;
6905 spin_unlock(&mddev->lock);
6906 if (mddev->private)
6907 pers->free(mddev, mddev->private);
6908 mddev->private = NULL;
6909 put_pers(pers);
6910 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6911 }
6912
6913 void md_stop(struct mddev *mddev)
6914 {
6915 lockdep_assert_held(&mddev->reconfig_mutex);
6916
6917 /* Stop the array and free any attached data structures.
6918 * This is called from dm-raid.
6919 */
6920 __md_stop_writes(mddev);
6921 __md_stop(mddev);
6922 }
6923
6924 EXPORT_SYMBOL_GPL(md_stop);
6925
6926 /* ensure 'mddev->pers' exist before calling md_set_readonly() */
6927 static int md_set_readonly(struct mddev *mddev)
6928 {
6929 int err = 0;
6930 int did_freeze = 0;
6931
6932 if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
6933 return -EBUSY;
6934
6935 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
6936 did_freeze = 1;
6937 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6938 }
6939
6940 stop_sync_thread(mddev, false);
6941 wait_event(mddev->sb_wait,
6942 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
6943 mddev_lock_nointr(mddev);
6944
6945 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
6946 pr_warn("md: %s still in use.\n",mdname(mddev));
6947 err = -EBUSY;
6948 goto out;
6949 }
6950
6951 __md_stop_writes(mddev);
6952
6953 if (mddev->ro == MD_RDONLY) {
6954 err = -ENXIO;
6955 goto out;
6956 }
6957
6958 mddev->ro = MD_RDONLY;
6959 set_disk_ro(mddev->gendisk, 1);
6960
6961 out:
6962 if (!err || did_freeze) {
6963 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6964 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6965 sysfs_notify_dirent_safe(mddev->sysfs_state);
6966 }
6967
6968 return err;
6969 }
6970
6971 /* mode:
6972 * 0 - completely stop and dis-assemble array
6973 * 2 - stop but do not disassemble array
6974 */
6975 static int do_md_stop(struct mddev *mddev, int mode)
6976 {
6977 struct gendisk *disk = mddev->gendisk;
6978 struct md_rdev *rdev;
6979 int did_freeze = 0;
6980
6981 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
6982 did_freeze = 1;
6983 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6984 }
6985
6986 stop_sync_thread(mddev, true);
6987
6988 if (mddev->sysfs_active ||
6989 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
6990 pr_warn("md: %s still in use.\n",mdname(mddev));
6991 if (did_freeze) {
6992 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6993 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6994 }
6995 return -EBUSY;
6996 }
6997 if (mddev->pers) {
6998 if (!md_is_rdwr(mddev))
6999 set_disk_ro(disk, 0);
7000
7001 if (mode == 2 && mddev->pers->sync_request &&
7002 mddev->to_remove == NULL)
7003 mddev->to_remove = &md_redundancy_group;
7004
7005 __md_stop_writes(mddev);
7006 __md_stop(mddev);
7007
7008 /* tell userspace to handle 'inactive' */
7009 sysfs_notify_dirent_safe(mddev->sysfs_state);
7010
7011 rdev_for_each(rdev, mddev)
7012 if (rdev->raid_disk >= 0)
7013 sysfs_unlink_rdev(mddev, rdev);
7014
7015 set_capacity_and_notify(disk, 0);
7016 mddev->changed = 1;
7017
7018 if (!md_is_rdwr(mddev))
7019 mddev->ro = MD_RDWR;
7020 }
7021 /*
7022 * Free resources if final stop
7023 */
7024 if (mode == 0) {
7025 pr_info("md: %s stopped.\n", mdname(mddev));
7026
7027 if (mddev->bitmap_info.file) {
7028 struct file *f = mddev->bitmap_info.file;
7029 spin_lock(&mddev->lock);
7030 mddev->bitmap_info.file = NULL;
7031 spin_unlock(&mddev->lock);
7032 fput(f);
7033 }
7034 mddev->bitmap_info.offset = 0;
7035
7036 export_array(mddev);
7037 md_clean(mddev);
7038 if (!legacy_async_del_gendisk)
7039 set_bit(MD_DELETED, &mddev->flags);
7040 }
7041 md_new_event();
7042 sysfs_notify_dirent_safe(mddev->sysfs_state);
7043 return 0;
7044 }
7045
7046 #ifndef MODULE
7047 static void autorun_array(struct mddev *mddev)
7048 {
7049 struct md_rdev *rdev;
7050 int err;
7051
7052 if (list_empty(&mddev->disks))
7053 return;
7054
7055 pr_info("md: running: ");
7056
7057 rdev_for_each(rdev, mddev) {
7058 pr_cont("<%pg>", rdev->bdev);
7059 }
7060 pr_cont("\n");
7061
7062 err = do_md_run(mddev);
7063 if (err) {
7064 pr_warn("md: do_md_run() returned %d\n", err);
7065 do_md_stop(mddev, 0);
7066 }
7067 }
7068
7069 /*
7070 * lets try to run arrays based on all disks that have arrived
7071 * until now. (those are in pending_raid_disks)
7072 *
7073 * the method: pick the first pending disk, collect all disks with
7074 * the same UUID, remove all from the pending list and put them into
7075 * the 'same_array' list. Then order this list based on superblock
7076 * update time (freshest comes first), kick out 'old' disks and
7077 * compare superblocks. If everything's fine then run it.
7078 *
7079 * If "unit" is allocated, then bump its reference count
7080 */
7081 static void autorun_devices(int part)
7082 {
7083 struct md_rdev *rdev0, *rdev, *tmp;
7084 struct mddev *mddev;
7085
7086 pr_info("md: autorun ...\n");
7087 while (!list_empty(&pending_raid_disks)) {
7088 int unit;
7089 dev_t dev;
7090 LIST_HEAD(candidates);
7091 rdev0 = list_entry(pending_raid_disks.next,
7092 struct md_rdev, same_set);
7093
7094 pr_debug("md: considering %pg ...\n", rdev0->bdev);
7095 INIT_LIST_HEAD(&candidates);
7096 rdev_for_each_list(rdev, tmp, &pending_raid_disks)
7097 if (super_90_load(rdev, rdev0, 0) >= 0) {
7098 pr_debug("md: adding %pg ...\n",
7099 rdev->bdev);
7100 list_move(&rdev->same_set, &candidates);
7101 }
7102 /*
7103 * now we have a set of devices, with all of them having
7104 * mostly sane superblocks. It's time to allocate the
7105 * mddev.
7106 */
7107 if (part) {
7108 dev = MKDEV(mdp_major,
7109 rdev0->preferred_minor << MdpMinorShift);
7110 unit = MINOR(dev) >> MdpMinorShift;
7111 } else {
7112 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
7113 unit = MINOR(dev);
7114 }
7115 if (rdev0->preferred_minor != unit) {
7116 pr_warn("md: unit number in %pg is bad: %d\n",
7117 rdev0->bdev, rdev0->preferred_minor);
7118 break;
7119 }
7120
7121 mddev = md_alloc(dev, NULL);
7122 if (IS_ERR(mddev))
7123 break;
7124
7125 if (mddev_suspend_and_lock(mddev))
7126 pr_warn("md: %s locked, cannot run\n", mdname(mddev));
7127 else if (mddev->raid_disks || mddev->major_version
7128 || !list_empty(&mddev->disks)) {
7129 pr_warn("md: %s already running, cannot run %pg\n",
7130 mdname(mddev), rdev0->bdev);
7131 mddev_unlock_and_resume(mddev);
7132 } else {
7133 pr_debug("md: created %s\n", mdname(mddev));
7134 mddev->persistent = 1;
7135 rdev_for_each_list(rdev, tmp, &candidates) {
7136 list_del_init(&rdev->same_set);
7137 if (bind_rdev_to_array(rdev, mddev))
7138 export_rdev(rdev, mddev);
7139 }
7140 autorun_array(mddev);
7141 mddev_unlock_and_resume(mddev);
7142 }
7143 /* on success, candidates will be empty; on error
7144 * it won't be...
7145 */
7146 rdev_for_each_list(rdev, tmp, &candidates) {
7147 list_del_init(&rdev->same_set);
7148 export_rdev(rdev, mddev);
7149 }
7150 mddev_put(mddev);
7151 }
7152 pr_info("md: ... autorun DONE.\n");
7153 }
7154 #endif /* !MODULE */
7155
7156 static int get_version(void __user *arg)
7157 {
7158 mdu_version_t ver;
7159
7160 ver.major = MD_MAJOR_VERSION;
7161 ver.minor = MD_MINOR_VERSION;
7162 ver.patchlevel = MD_PATCHLEVEL_VERSION;
7163
7164 if (copy_to_user(arg, &ver, sizeof(ver)))
7165 return -EFAULT;
7166
7167 return 0;
7168 }
7169
7170 static int get_array_info(struct mddev *mddev, void __user *arg)
7171 {
7172 mdu_array_info_t info;
7173 int nr,working,insync,failed,spare;
7174 struct md_rdev *rdev;
7175
7176 nr = working = insync = failed = spare = 0;
7177 rcu_read_lock();
7178 rdev_for_each_rcu(rdev, mddev) {
7179 nr++;
7180 if (test_bit(Faulty, &rdev->flags))
7181 failed++;
7182 else {
7183 working++;
7184 if (test_bit(In_sync, &rdev->flags))
7185 insync++;
7186 else if (test_bit(Journal, &rdev->flags))
7187 /* TODO: add journal count to md_u.h */
7188 ;
7189 else
7190 spare++;
7191 }
7192 }
7193 rcu_read_unlock();
7194
7195 info.major_version = mddev->major_version;
7196 info.minor_version = mddev->minor_version;
7197 info.patch_version = MD_PATCHLEVEL_VERSION;
7198 info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
7199 info.level = mddev->level;
7200 info.size = mddev->dev_sectors / 2;
7201 if (info.size != mddev->dev_sectors / 2) /* overflow */
7202 info.size = -1;
7203 info.nr_disks = nr;
7204 info.raid_disks = mddev->raid_disks;
7205 info.md_minor = mddev->md_minor;
7206 info.not_persistent= !mddev->persistent;
7207
7208 info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
7209 info.state = 0;
7210 if (mddev->in_sync)
7211 info.state = (1<<MD_SB_CLEAN);
7212 if (mddev->bitmap && mddev->bitmap_info.offset)
7213 info.state |= (1<<MD_SB_BITMAP_PRESENT);
7214 if (mddev_is_clustered(mddev))
7215 info.state |= (1<<MD_SB_CLUSTERED);
7216 info.active_disks = insync;
7217 info.working_disks = working;
7218 info.failed_disks = failed;
7219 info.spare_disks = spare;
7220
7221 info.layout = mddev->layout;
7222 info.chunk_size = mddev->chunk_sectors << 9;
7223
7224 if (copy_to_user(arg, &info, sizeof(info)))
7225 return -EFAULT;
7226
7227 return 0;
7228 }
7229
7230 static int get_bitmap_file(struct mddev *mddev, void __user * arg)
7231 {
7232 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
7233 char *ptr;
7234 int err;
7235
7236 file = kzalloc(sizeof(*file), GFP_NOIO);
7237 if (!file)
7238 return -ENOMEM;
7239
7240 err = 0;
7241 spin_lock(&mddev->lock);
7242 /* bitmap enabled */
7243 if (mddev->bitmap_info.file) {
7244 ptr = file_path(mddev->bitmap_info.file, file->pathname,
7245 sizeof(file->pathname));
7246 if (IS_ERR(ptr))
7247 err = PTR_ERR(ptr);
7248 else
7249 memmove(file->pathname, ptr,
7250 sizeof(file->pathname)-(ptr-file->pathname));
7251 }
7252 spin_unlock(&mddev->lock);
7253
7254 if (err == 0 &&
7255 copy_to_user(arg, file, sizeof(*file)))
7256 err = -EFAULT;
7257
7258 kfree(file);
7259 return err;
7260 }
7261
7262 static int get_disk_info(struct mddev *mddev, void __user * arg)
7263 {
7264 mdu_disk_info_t info;
7265 struct md_rdev *rdev;
7266
7267 if (copy_from_user(&info, arg, sizeof(info)))
7268 return -EFAULT;
7269
7270 rcu_read_lock();
7271 rdev = md_find_rdev_nr_rcu(mddev, info.number);
7272 if (rdev) {
7273 info.major = MAJOR(rdev->bdev->bd_dev);
7274 info.minor = MINOR(rdev->bdev->bd_dev);
7275 info.raid_disk = rdev->raid_disk;
7276 info.state = 0;
7277 if (test_bit(Faulty, &rdev->flags))
7278 info.state |= (1<<MD_DISK_FAULTY);
7279 else if (test_bit(In_sync, &rdev->flags)) {
7280 info.state |= (1<<MD_DISK_ACTIVE);
7281 info.state |= (1<<MD_DISK_SYNC);
7282 }
7283 if (test_bit(Journal, &rdev->flags))
7284 info.state |= (1<<MD_DISK_JOURNAL);
7285 if (test_bit(WriteMostly, &rdev->flags))
7286 info.state |= (1<<MD_DISK_WRITEMOSTLY);
7287 if (test_bit(FailFast, &rdev->flags))
7288 info.state |= (1<<MD_DISK_FAILFAST);
7289 } else {
7290 info.major = info.minor = 0;
7291 info.raid_disk = -1;
7292 info.state = (1<<MD_DISK_REMOVED);
7293 }
7294 rcu_read_unlock();
7295
7296 if (copy_to_user(arg, &info, sizeof(info)))
7297 return -EFAULT;
7298
7299 return 0;
7300 }
7301
7302 int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
7303 {
7304 struct md_rdev *rdev;
7305 dev_t dev = MKDEV(info->major,info->minor);
7306
7307 if (mddev_is_clustered(mddev) &&
7308 !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) {
7309 pr_warn("%s: Cannot add to clustered mddev.\n",
7310 mdname(mddev));
7311 return -EINVAL;
7312 }
7313
7314 if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
7315 return -EOVERFLOW;
7316
7317 if (!mddev->raid_disks) {
7318 int err;
7319 /* expecting a device which has a superblock */
7320 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
7321 if (IS_ERR(rdev)) {
7322 pr_warn("md: md_import_device returned %ld\n",
7323 PTR_ERR(rdev));
7324 return PTR_ERR(rdev);
7325 }
7326 if (!list_empty(&mddev->disks)) {
7327 struct md_rdev *rdev0
7328 = list_entry(mddev->disks.next,
7329 struct md_rdev, same_set);
7330 err = super_types[mddev->major_version]
7331 .load_super(rdev, rdev0, mddev->minor_version);
7332 if (err < 0) {
7333 pr_warn("md: %pg has different UUID to %pg\n",
7334 rdev->bdev,
7335 rdev0->bdev);
7336 export_rdev(rdev, mddev);
7337 return -EINVAL;
7338 }
7339 }
7340 err = bind_rdev_to_array(rdev, mddev);
7341 if (err)
7342 export_rdev(rdev, mddev);
7343 return err;
7344 }
7345
7346 /*
7347 * md_add_new_disk can be used once the array is assembled
7348 * to add "hot spares". They must already have a superblock
7349 * written.  (A userspace sketch of this hot-add path follows the function.)
7350 */
7351 if (mddev->pers) {
7352 int err;
7353 if (!mddev->pers->hot_add_disk) {
7354 pr_warn("%s: personality does not support diskops!\n",
7355 mdname(mddev));
7356 return -EINVAL;
7357 }
7358 if (mddev->persistent)
7359 rdev = md_import_device(dev, mddev->major_version,
7360 mddev->minor_version);
7361 else
7362 rdev = md_import_device(dev, -1, -1);
7363 if (IS_ERR(rdev)) {
7364 pr_warn("md: md_import_device returned %ld\n",
7365 PTR_ERR(rdev));
7366 return PTR_ERR(rdev);
7367 }
7368 /* set saved_raid_disk if appropriate */
7369 if (!mddev->persistent) {
7370 if (info->state & (1<<MD_DISK_SYNC) &&
7371 info->raid_disk < mddev->raid_disks) {
7372 rdev->raid_disk = info->raid_disk;
7373 clear_bit(Bitmap_sync, &rdev->flags);
7374 } else
7375 rdev->raid_disk = -1;
7376 rdev->saved_raid_disk = rdev->raid_disk;
7377 } else
7378 super_types[mddev->major_version].
7379 validate_super(mddev, NULL/*freshest*/, rdev);
7380 if ((info->state & (1<<MD_DISK_SYNC)) &&
7381 rdev->raid_disk != info->raid_disk) {
7382 /* This was a hot-add request, but the event
7383 * counts don't match, so reject it.
7384 */
7385 export_rdev(rdev, mddev);
7386 return -EINVAL;
7387 }
7388
7389 clear_bit(In_sync, &rdev->flags); /* just to be sure */
7390 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
7391 set_bit(WriteMostly, &rdev->flags);
7392 else
7393 clear_bit(WriteMostly, &rdev->flags);
7394 if (info->state & (1<<MD_DISK_FAILFAST))
7395 set_bit(FailFast, &rdev->flags);
7396 else
7397 clear_bit(FailFast, &rdev->flags);
7398
7399 if (info->state & (1<<MD_DISK_JOURNAL)) {
7400 struct md_rdev *rdev2;
7401 bool has_journal = false;
7402
7403 /* make sure there is no existing journal disk */
7404 rdev_for_each(rdev2, mddev) {
7405 if (test_bit(Journal, &rdev2->flags)) {
7406 has_journal = true;
7407 break;
7408 }
7409 }
7410 if (has_journal || mddev->bitmap) {
7411 export_rdev(rdev, mddev);
7412 return -EBUSY;
7413 }
7414 set_bit(Journal, &rdev->flags);
7415 }
7416 /*
7417 * check whether the device shows up in other nodes
7418 */
7419 if (mddev_is_clustered(mddev)) {
7420 if (info->state & (1 << MD_DISK_CANDIDATE))
7421 set_bit(Candidate, &rdev->flags);
7422 else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
7423 /* --add initiated by this node */
7424 err = mddev->cluster_ops->add_new_disk(mddev, rdev);
7425 if (err) {
7426 export_rdev(rdev, mddev);
7427 return err;
7428 }
7429 }
7430 }
7431
7432 rdev->raid_disk = -1;
7433 err = bind_rdev_to_array(rdev, mddev);
7434
7435 if (err)
7436 export_rdev(rdev, mddev);
7437
7438 if (mddev_is_clustered(mddev)) {
7439 if (info->state & (1 << MD_DISK_CANDIDATE)) {
7440 if (!err) {
7441 err = mddev->cluster_ops->new_disk_ack(
7442 mddev, err == 0);
7443 if (err)
7444 md_kick_rdev_from_array(rdev);
7445 }
7446 } else {
7447 if (err)
7448 mddev->cluster_ops->add_new_disk_cancel(mddev);
7449 else
7450 err = add_bound_rdev(rdev);
7451 }
7452
7453 } else if (!err)
7454 err = add_bound_rdev(rdev);
7455
7456 return err;
7457 }
7458
7459 /* otherwise, md_add_new_disk is only allowed
7460 * for major_version==0 superblocks
7461 */
7462 if (mddev->major_version != 0) {
7463 pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev));
7464 return -EINVAL;
7465 }
7466
7467 if (!(info->state & (1<<MD_DISK_FAULTY))) {
7468 int err;
7469 rdev = md_import_device(dev, -1, 0);
7470 if (IS_ERR(rdev)) {
7471 pr_warn("md: error, md_import_device() returned %ld\n",
7472 PTR_ERR(rdev));
7473 return PTR_ERR(rdev);
7474 }
7475 rdev->desc_nr = info->number;
7476 if (info->raid_disk < mddev->raid_disks)
7477 rdev->raid_disk = info->raid_disk;
7478 else
7479 rdev->raid_disk = -1;
7480
7481 if (rdev->raid_disk < mddev->raid_disks)
7482 if (info->state & (1<<MD_DISK_SYNC))
7483 set_bit(In_sync, &rdev->flags);
7484
7485 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
7486 set_bit(WriteMostly, &rdev->flags);
7487 if (info->state & (1<<MD_DISK_FAILFAST))
7488 set_bit(FailFast, &rdev->flags);
7489
7490 if (!mddev->persistent) {
7491 pr_debug("md: nonpersistent superblock ...\n");
7492 rdev->sb_start = bdev_nr_sectors(rdev->bdev);
7493 } else
7494 rdev->sb_start = calc_dev_sboffset(rdev);
7495 rdev->sectors = rdev->sb_start;
7496
7497 err = bind_rdev_to_array(rdev, mddev);
7498 if (err) {
7499 export_rdev(rdev, mddev);
7500 return err;
7501 }
7502 }
7503
7504 return 0;
7505 }
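
/*
 * A minimal userspace sketch (not kernel code, and not built here) of how
 * the hot-spare path of md_add_new_disk() above is typically driven: stat
 * the component device, fill in an mdu_disk_info_t and issue ADD_NEW_DISK
 * on the array device.  For arrays with persistent metadata the component
 * must already carry a suitable superblock, as noted above.  The helper
 * name and the minimal error handling are made up for illustration only.
 */
#if 0	/* userspace illustration, not part of this driver */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <linux/major.h>
#include <linux/raid/md_u.h>

static int add_spare(const char *array, const char *component)
{
	struct stat st;
	mdu_disk_info_t dinfo = { .raid_disk = -1 };
	int fd = open(array, O_RDWR);

	if (fd < 0 || stat(component, &st) < 0)
		return -1;

	dinfo.major = major(st.st_rdev);
	dinfo.minor = minor(st.st_rdev);
	/* state left 0: the device is treated as a spare */

	return ioctl(fd, ADD_NEW_DISK, &dinfo);	/* close(fd) omitted */
}
#endif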
7506
7507 static int hot_remove_disk(struct mddev *mddev, dev_t dev)
7508 {
7509 struct md_rdev *rdev;
7510
7511 if (!mddev->pers)
7512 return -ENODEV;
7513
7514 rdev = find_rdev(mddev, dev);
7515 if (!rdev)
7516 return -ENXIO;
7517
7518 if (rdev->raid_disk < 0)
7519 goto kick_rdev;
7520
7521 clear_bit(Blocked, &rdev->flags);
7522 remove_and_add_spares(mddev, rdev);
7523
7524 if (rdev->raid_disk >= 0)
7525 goto busy;
7526
7527 kick_rdev:
7528 if (mddev_is_clustered(mddev) &&
7529 mddev->cluster_ops->remove_disk(mddev, rdev))
7530 goto busy;
7531
7532 md_kick_rdev_from_array(rdev);
7533 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
7534 if (!mddev->thread)
7535 md_update_sb(mddev, 1);
7536 md_new_event();
7537
7538 return 0;
7539 busy:
7540 pr_debug("md: cannot remove active disk %pg from %s ...\n",
7541 rdev->bdev, mdname(mddev));
7542 return -EBUSY;
7543 }
7544
7545 static int hot_add_disk(struct mddev *mddev, dev_t dev)
7546 {
7547 int err;
7548 struct md_rdev *rdev;
7549
7550 if (!mddev->pers)
7551 return -ENODEV;
7552
7553 if (mddev->major_version != 0) {
7554 pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n",
7555 mdname(mddev));
7556 return -EINVAL;
7557 }
7558 if (!mddev->pers->hot_add_disk) {
7559 pr_warn("%s: personality does not support diskops!\n",
7560 mdname(mddev));
7561 return -EINVAL;
7562 }
7563
7564 rdev = md_import_device(dev, -1, 0);
7565 if (IS_ERR(rdev)) {
7566 pr_warn("md: error, md_import_device() returned %ld\n",
7567 PTR_ERR(rdev));
7568 return -EINVAL;
7569 }
7570
7571 if (mddev->persistent)
7572 rdev->sb_start = calc_dev_sboffset(rdev);
7573 else
7574 rdev->sb_start = bdev_nr_sectors(rdev->bdev);
7575
7576 rdev->sectors = rdev->sb_start;
7577
7578 if (test_bit(Faulty, &rdev->flags)) {
7579 pr_warn("md: can not hot-add faulty %pg disk to %s!\n",
7580 rdev->bdev, mdname(mddev));
7581 err = -EINVAL;
7582 goto abort_export;
7583 }
7584
7585 clear_bit(In_sync, &rdev->flags);
7586 rdev->desc_nr = -1;
7587 rdev->saved_raid_disk = -1;
7588 err = bind_rdev_to_array(rdev, mddev);
7589 if (err)
7590 goto abort_export;
7591
7592 /*
7593 * The rest had better be atomic, as disk failures can be
7594 * noticed in interrupt contexts ...
7595 */
7596
7597 rdev->raid_disk = -1;
7598
7599 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
7600 if (!mddev->thread)
7601 md_update_sb(mddev, 1);
7602 /*
7603 * Kick recovery, maybe this spare has to be added to the
7604 * array immediately.
7605 */
7606 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7607 md_new_event();
7608 return 0;
7609
7610 abort_export:
7611 export_rdev(rdev, mddev);
7612 return err;
7613 }
7614
7615 static int set_bitmap_file(struct mddev *mddev, int fd)
7616 {
7617 int err = 0;
7618
7619 if (!md_bitmap_registered(mddev))
7620 return -EINVAL;
7621
7622 if (mddev->pers) {
7623 if (!mddev->pers->quiesce || !mddev->thread)
7624 return -EBUSY;
7625 if (mddev->recovery || mddev->sync_thread)
7626 return -EBUSY;
7627 /* we should be able to change the bitmap.. */
7628 }
7629
7630 if (fd >= 0) {
7631 struct inode *inode;
7632 struct file *f;
7633
7634 if (mddev->bitmap || mddev->bitmap_info.file)
7635 return -EEXIST; /* cannot add when bitmap is present */
7636
7637 if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) {
7638 pr_warn("%s: bitmap files not supported by this kernel\n",
7639 mdname(mddev));
7640 return -EINVAL;
7641 }
7642 pr_warn("%s: using deprecated bitmap file support\n",
7643 mdname(mddev));
7644
7645 f = fget(fd);
7646
7647 if (f == NULL) {
7648 pr_warn("%s: error: failed to get bitmap file\n",
7649 mdname(mddev));
7650 return -EBADF;
7651 }
7652
7653 inode = f->f_mapping->host;
7654 if (!S_ISREG(inode->i_mode)) {
7655 pr_warn("%s: error: bitmap file must be a regular file\n",
7656 mdname(mddev));
7657 err = -EBADF;
7658 } else if (!(f->f_mode & FMODE_WRITE)) {
7659 pr_warn("%s: error: bitmap file must open for write\n",
7660 mdname(mddev));
7661 err = -EBADF;
7662 } else if (atomic_read(&inode->i_writecount) != 1) {
7663 pr_warn("%s: error: bitmap file is already in use\n",
7664 mdname(mddev));
7665 err = -EBUSY;
7666 }
7667 if (err) {
7668 fput(f);
7669 return err;
7670 }
7671 mddev->bitmap_info.file = f;
7672 mddev->bitmap_info.offset = 0; /* file overrides offset */
7673 } else if (mddev->bitmap == NULL)
7674 return -ENOENT; /* cannot remove what isn't there */
7675 err = 0;
7676 if (mddev->pers) {
7677 if (fd >= 0) {
7678 err = md_bitmap_create(mddev);
7679 if (!err)
7680 err = mddev->bitmap_ops->load(mddev);
7681
7682 if (err) {
7683 md_bitmap_destroy(mddev);
7684 fd = -1;
7685 }
7686 } else if (fd < 0) {
7687 md_bitmap_destroy(mddev);
7688 }
7689 }
7690
7691 if (fd < 0) {
7692 struct file *f = mddev->bitmap_info.file;
7693 if (f) {
7694 spin_lock(&mddev->lock);
7695 mddev->bitmap_info.file = NULL;
7696 spin_unlock(&mddev->lock);
7697 fput(f);
7698 }
7699 }
7700
7701 return err;
7702 }
7703
7704 /*
7705 * md_set_array_info is used in two different ways.
7706 * The original usage is when creating a new array.
7707 * In this usage, raid_disks is > 0 and it together with
7708 * level, size, not_persistent, layout and chunk_size determines the
7709 * shape of the array.
7710 * This will always create an array with a type-0.90.0 superblock.
7711 * The newer usage is when assembling an array.
7712 * In this case raid_disks will be 0, and the major_version field is
7713 * used to determine which style super-blocks are to be found on the devices.
7714 * The minor and patch _version numbers are also kept in case the
7715 * super_block handler wishes to interpret them.  (A userspace sketch of the assembly usage follows the function below.)
7716 */
7717 int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info)
7718 {
7719 if (info->raid_disks == 0) {
7720 /* just setting version number for superblock loading */
7721 if (info->major_version < 0 ||
7722 info->major_version >= ARRAY_SIZE(super_types) ||
7723 super_types[info->major_version].name == NULL) {
7724 /* maybe try to auto-load a module? */
7725 pr_warn("md: superblock version %d not known\n",
7726 info->major_version);
7727 return -EINVAL;
7728 }
7729 mddev->major_version = info->major_version;
7730 mddev->minor_version = info->minor_version;
7731 mddev->patch_version = info->patch_version;
7732 mddev->persistent = !info->not_persistent;
7733 /* ensure mddev_put doesn't delete this now that there
7734 * is some minimal configuration.
7735 */
7736 mddev->ctime = ktime_get_real_seconds();
7737 return 0;
7738 }
7739 mddev->major_version = MD_MAJOR_VERSION;
7740 mddev->minor_version = MD_MINOR_VERSION;
7741 mddev->patch_version = MD_PATCHLEVEL_VERSION;
7742 mddev->ctime = ktime_get_real_seconds();
7743
7744 mddev->level = info->level;
7745 mddev->clevel[0] = 0;
7746 mddev->dev_sectors = 2 * (sector_t)info->size;
7747 mddev->raid_disks = info->raid_disks;
7748 /* don't set md_minor, it is determined by which /dev/md* was
7749 * opened
7750 */
7751 if (info->state & (1<<MD_SB_CLEAN))
7752 mddev->resync_offset = MaxSector;
7753 else
7754 mddev->resync_offset = 0;
7755 mddev->persistent = ! info->not_persistent;
7756 mddev->external = 0;
7757
7758 mddev->layout = info->layout;
7759 if (mddev->level == 0)
7760 /* Cannot trust RAID0 layout info here */
7761 mddev->layout = -1;
7762 mddev->chunk_sectors = info->chunk_size >> 9;
7763
7764 if (mddev->persistent) {
7765 mddev->max_disks = MD_SB_DISKS;
7766 mddev->flags = 0;
7767 mddev->sb_flags = 0;
7768 }
7769 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
7770
7771 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
7772 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
7773 mddev->bitmap_info.offset = 0;
7774
7775 mddev->reshape_position = MaxSector;
7776
7777 /*
7778 * Generate a 128 bit UUID
7779 */
7780 get_random_bytes(mddev->uuid, 16);
7781
7782 mddev->new_level = mddev->level;
7783 mddev->new_chunk_sectors = mddev->chunk_sectors;
7784 mddev->new_layout = mddev->layout;
7785 mddev->delta_disks = 0;
7786 mddev->reshape_backwards = 0;
7787
7788 return 0;
7789 }
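
/*
 * A hedged userspace sketch (not kernel code) of the second usage described
 * in the comment above md_set_array_info(): issuing SET_ARRAY_INFO with
 * raid_disks == 0 only tells the kernel which superblock version to expect;
 * the member devices are then added with ADD_NEW_DISK and the array is
 * started with RUN_ARRAY.  The helper name and the chosen version numbers
 * are illustrative assumptions.
 */
#if 0	/* userspace illustration, not part of this driver */
#include <string.h>
#include <sys/ioctl.h>
#include <linux/major.h>
#include <linux/raid/md_u.h>

static int prepare_assembly(int md_fd)
{
	mdu_array_info_t ainfo;

	memset(&ainfo, 0, sizeof(ainfo));
	ainfo.major_version = 1;	/* e.g. v1.x superblocks */
	ainfo.minor_version = 2;	/* superblock 4K from the device start */
	/* ainfo.raid_disks == 0 selects the "assemble" path above */

	return ioctl(md_fd, SET_ARRAY_INFO, &ainfo);
}
#endif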
7790
7791 void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
7792 {
7793 lockdep_assert_held(&mddev->reconfig_mutex);
7794
7795 if (mddev->external_size)
7796 return;
7797
7798 mddev->array_sectors = array_sectors;
7799 }
7800 EXPORT_SYMBOL(md_set_array_sectors);
7801
7802 static int update_size(struct mddev *mddev, sector_t num_sectors)
7803 {
7804 struct md_rdev *rdev;
7805 int rv;
7806 int fit = (num_sectors == 0);
7807 sector_t old_dev_sectors = mddev->dev_sectors;
7808
7809 if (mddev->pers->resize == NULL)
7810 return -EINVAL;
7811 /* The "num_sectors" is the number of sectors of each device that
7812 * is used. This can only make sense for arrays with redundancy.
7813 * linear and raid0 always use whatever space is available. We can only
7814 * consider changing this number if no resync or reconstruction is
7815 * happening, and if the new size is acceptable. It must fit before the
7816 * sb_start or, if that is <data_offset, it must fit before the size
7817 * of each device. If num_sectors is zero, we find the largest size
7818 * that fits.
7819 */
7820 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
7821 return -EBUSY;
7822 if (!md_is_rdwr(mddev))
7823 return -EROFS;
7824
7825 rdev_for_each(rdev, mddev) {
7826 sector_t avail = rdev->sectors;
7827
7828 if (fit && (num_sectors == 0 || num_sectors > avail))
7829 num_sectors = avail;
7830 if (avail < num_sectors)
7831 return -ENOSPC;
7832 }
7833 rv = mddev->pers->resize(mddev, num_sectors);
7834 if (!rv) {
7835 if (mddev_is_clustered(mddev))
7836 mddev->cluster_ops->update_size(mddev, old_dev_sectors);
7837 else if (!mddev_is_dm(mddev))
7838 set_capacity_and_notify(mddev->gendisk,
7839 mddev->array_sectors);
7840 }
7841 return rv;
7842 }
7843
7844 static int update_raid_disks(struct mddev *mddev, int raid_disks)
7845 {
7846 int rv;
7847 struct md_rdev *rdev;
7848 /* change the number of raid disks */
7849 if (mddev->pers->check_reshape == NULL)
7850 return -EINVAL;
7851 if (!md_is_rdwr(mddev))
7852 return -EROFS;
7853 if (raid_disks <= 0 ||
7854 (mddev->max_disks && raid_disks >= mddev->max_disks))
7855 return -EINVAL;
7856 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
7857 test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) ||
7858 mddev->reshape_position != MaxSector)
7859 return -EBUSY;
7860
7861 rdev_for_each(rdev, mddev) {
7862 if (mddev->raid_disks < raid_disks &&
7863 rdev->data_offset < rdev->new_data_offset)
7864 return -EINVAL;
7865 if (mddev->raid_disks > raid_disks &&
7866 rdev->data_offset > rdev->new_data_offset)
7867 return -EINVAL;
7868 }
7869
7870 mddev->delta_disks = raid_disks - mddev->raid_disks;
7871 if (mddev->delta_disks < 0)
7872 mddev->reshape_backwards = 1;
7873 else if (mddev->delta_disks > 0)
7874 mddev->reshape_backwards = 0;
7875
7876 rv = mddev->pers->check_reshape(mddev);
7877 if (rv < 0) {
7878 mddev->delta_disks = 0;
7879 mddev->reshape_backwards = 0;
7880 }
7881 return rv;
7882 }
7883
7884 static int get_cluster_ops(struct mddev *mddev)
7885 {
7886 xa_lock(&md_submodule);
7887 mddev->cluster_ops = xa_load(&md_submodule, ID_CLUSTER);
7888 if (mddev->cluster_ops &&
7889 !try_module_get(mddev->cluster_ops->head.owner))
7890 mddev->cluster_ops = NULL;
7891 xa_unlock(&md_submodule);
7892
7893 return mddev->cluster_ops == NULL ? -ENOENT : 0;
7894 }
7895
7896 static void put_cluster_ops(struct mddev *mddev)
7897 {
7898 if (!mddev->cluster_ops)
7899 return;
7900
7901 mddev->cluster_ops->leave(mddev);
7902 module_put(mddev->cluster_ops->head.owner);
7903 mddev->cluster_ops = NULL;
7904 }
7905
7906 /*
7907 * update_array_info is used to change the configuration of an
7908 * on-line array.
7909 * The version, ctime, level, size, raid_disks, not_persistent, layout and chunk_size
7910 * fields in the info are checked against the array.
7911 * Any differences that cannot be handled will cause an error.
7912 * Normally, only one change can be managed at a time.
7913 */
7914 static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
7915 {
7916 int rv = 0;
7917 int cnt = 0;
7918 int state = 0;
7919
7920 /* calculate expected state, ignoring low bits */
7921 if (mddev->bitmap && mddev->bitmap_info.offset)
7922 state |= (1 << MD_SB_BITMAP_PRESENT);
7923
7924 if (mddev->major_version != info->major_version ||
7925 mddev->minor_version != info->minor_version ||
7926 /* mddev->patch_version != info->patch_version || */
7927 mddev->ctime != info->ctime ||
7928 mddev->level != info->level ||
7929 /* mddev->layout != info->layout || */
7930 mddev->persistent != !info->not_persistent ||
7931 mddev->chunk_sectors != info->chunk_size >> 9 ||
7932 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
7933 ((state^info->state) & 0xfffffe00)
7934 )
7935 return -EINVAL;
7936 /* Check there is only one change */
7937 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
7938 cnt++;
7939 if (mddev->raid_disks != info->raid_disks)
7940 cnt++;
7941 if (mddev->layout != info->layout)
7942 cnt++;
7943 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
7944 cnt++;
7945 if (cnt == 0)
7946 return 0;
7947 if (cnt > 1)
7948 return -EINVAL;
7949
7950 if (mddev->layout != info->layout) {
7951 /* Change layout
7952 * we don't need to do anything at the md level, the
7953 * personality will take care of it all.
7954 */
7955 if (mddev->pers->check_reshape == NULL)
7956 return -EINVAL;
7957 else {
7958 mddev->new_layout = info->layout;
7959 rv = mddev->pers->check_reshape(mddev);
7960 if (rv)
7961 mddev->new_layout = mddev->layout;
7962 return rv;
7963 }
7964 }
7965 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
7966 rv = update_size(mddev, (sector_t)info->size * 2);
7967
7968 if (mddev->raid_disks != info->raid_disks)
7969 rv = update_raid_disks(mddev, info->raid_disks);
7970
7971 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
7972 if (mddev->pers->quiesce == NULL || mddev->thread == NULL) {
7973 rv = -EINVAL;
7974 goto err;
7975 }
7976 if (mddev->recovery || mddev->sync_thread) {
7977 rv = -EBUSY;
7978 goto err;
7979 }
7980 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
7981 /* add the bitmap */
7982 if (mddev->bitmap) {
7983 rv = -EEXIST;
7984 goto err;
7985 }
7986 if (mddev->bitmap_info.default_offset == 0) {
7987 rv = -EINVAL;
7988 goto err;
7989 }
7990 mddev->bitmap_info.offset =
7991 mddev->bitmap_info.default_offset;
7992 mddev->bitmap_info.space =
7993 mddev->bitmap_info.default_space;
7994 rv = md_bitmap_create(mddev);
7995 if (!rv)
7996 rv = mddev->bitmap_ops->load(mddev);
7997
7998 if (rv)
7999 md_bitmap_destroy(mddev);
8000 } else {
8001 struct md_bitmap_stats stats;
8002
8003 rv = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
8004 if (rv)
8005 goto err;
8006
8007 if (stats.file) {
8008 rv = -EINVAL;
8009 goto err;
8010 }
8011
8012 if (mddev->bitmap_info.nodes) {
8013 /* hold the PW lock on all the bitmaps */
8014 if (mddev->cluster_ops->lock_all_bitmaps(mddev) <= 0) {
8015 pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n");
8016 rv = -EPERM;
8017 mddev->cluster_ops->unlock_all_bitmaps(mddev);
8018 goto err;
8019 }
8020
8021 mddev->bitmap_info.nodes = 0;
8022 put_cluster_ops(mddev);
8023 mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
8024 }
8025 md_bitmap_destroy(mddev);
8026 mddev->bitmap_info.offset = 0;
8027 }
8028 }
8029 md_update_sb(mddev, 1);
8030 return rv;
8031 err:
8032 return rv;
8033 }
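
/*
 * A minimal userspace sketch (not kernel code) of how update_array_info()
 * is exercised on a running array: read back the current mdu_array_info_t
 * with GET_ARRAY_INFO, change exactly one of size / raid_disks / layout /
 * bitmap state, and write it back with SET_ARRAY_INFO.  Changing more than
 * one of these in a single call is rejected with -EINVAL, as enforced
 * above.  The helper name is an illustrative assumption.
 */
#if 0	/* userspace illustration, not part of this driver */
#include <sys/ioctl.h>
#include <linux/major.h>
#include <linux/raid/md_u.h>

static int grow_to_max(int md_fd)
{
	mdu_array_info_t ainfo;

	if (ioctl(md_fd, GET_ARRAY_INFO, &ainfo) < 0)
		return -1;

	ainfo.size = 0;		/* 0 asks for the largest size that fits */

	return ioctl(md_fd, SET_ARRAY_INFO, &ainfo);
}
#endif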
8034
8035 static int set_disk_faulty(struct mddev *mddev, dev_t dev)
8036 {
8037 struct md_rdev *rdev;
8038 int err = 0;
8039
8040 if (mddev->pers == NULL)
8041 return -ENODEV;
8042
8043 rcu_read_lock();
8044 rdev = md_find_rdev_rcu(mddev, dev);
8045 if (!rdev)
8046 err = -ENODEV;
8047 else {
8048 md_error(mddev, rdev);
8049 if (test_bit(MD_BROKEN, &mddev->flags))
8050 err = -EBUSY;
8051 }
8052 rcu_read_unlock();
8053 return err;
8054 }
8055
8056 /*
8057 * We have a problem here: there is no easy way to give a CHS
8058 * virtual geometry. We currently pretend that we have 2 heads and
8059 * 4 sectors (with a BIG number of cylinders...). This drives
8060 * dosfs just mad... ;-)
8061 */
8062 static int md_getgeo(struct gendisk *disk, struct hd_geometry *geo)
8063 {
8064 struct mddev *mddev = disk->private_data;
8065
8066 geo->heads = 2;
8067 geo->sectors = 4;
8068 geo->cylinders = mddev->array_sectors / 8;
8069 return 0;
8070 }
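
/*
 * Hedged userspace illustration of the fake geometry reported above: with
 * 2 heads and 4 sectors per track, one cylinder is 8 sectors (4 KiB), so
 * geo.cylinders is array_sectors / 8.  Note that struct hd_geometry only
 * has a 16-bit cylinders field, so the value is truncated for large
 * arrays.  The helper name is an illustrative assumption.
 */
#if 0	/* userspace illustration, not part of this driver */
#include <sys/ioctl.h>
#include <linux/hdreg.h>

static unsigned int md_fake_cylinders(int md_fd)
{
	struct hd_geometry geo;

	if (ioctl(md_fd, HDIO_GETGEO, &geo) < 0)
		return 0;
	/* geo.heads == 2, geo.sectors == 4 for md arrays */
	return geo.cylinders;
}
#endif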
8071
8072 static inline int md_ioctl_valid(unsigned int cmd)
8073 {
8074 switch (cmd) {
8075 case GET_ARRAY_INFO:
8076 case GET_DISK_INFO:
8077 case RAID_VERSION:
8078 return 0;
8079 case ADD_NEW_DISK:
8080 case GET_BITMAP_FILE:
8081 case HOT_ADD_DISK:
8082 case HOT_REMOVE_DISK:
8083 case RESTART_ARRAY_RW:
8084 case RUN_ARRAY:
8085 case SET_ARRAY_INFO:
8086 case SET_BITMAP_FILE:
8087 case SET_DISK_FAULTY:
8088 case STOP_ARRAY:
8089 case STOP_ARRAY_RO:
8090 case CLUSTERED_DISK_NACK:
8091 if (!capable(CAP_SYS_ADMIN))
8092 return -EACCES;
8093 return 0;
8094 default:
8095 return -ENOTTY;
8096 }
8097 }
8098
8099 static bool md_ioctl_need_suspend(unsigned int cmd)
8100 {
8101 switch (cmd) {
8102 case ADD_NEW_DISK:
8103 case HOT_ADD_DISK:
8104 case HOT_REMOVE_DISK:
8105 case SET_BITMAP_FILE:
8106 case SET_ARRAY_INFO:
8107 return true;
8108 default:
8109 return false;
8110 }
8111 }
8112
8113 static int __md_set_array_info(struct mddev *mddev, void __user *argp)
8114 {
8115 mdu_array_info_t info;
8116 int err;
8117
8118 if (!argp)
8119 memset(&info, 0, sizeof(info));
8120 else if (copy_from_user(&info, argp, sizeof(info)))
8121 return -EFAULT;
8122
8123 if (mddev->pers) {
8124 err = update_array_info(mddev, &info);
8125 if (err)
8126 pr_warn("md: couldn't update array info. %d\n", err);
8127 return err;
8128 }
8129
8130 if (!list_empty(&mddev->disks)) {
8131 pr_warn("md: array %s already has disks!\n", mdname(mddev));
8132 return -EBUSY;
8133 }
8134
8135 if (mddev->raid_disks) {
8136 pr_warn("md: array %s already initialised!\n", mdname(mddev));
8137 return -EBUSY;
8138 }
8139
8140 err = md_set_array_info(mddev, &info);
8141 if (err)
8142 pr_warn("md: couldn't set array info. %d\n", err);
8143
8144 return err;
8145 }
8146
8147 static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
8148 unsigned int cmd, unsigned long arg)
8149 {
8150 int err = 0;
8151 void __user *argp = (void __user *)arg;
8152 struct mddev *mddev = NULL;
8153
8154 err = md_ioctl_valid(cmd);
8155 if (err)
8156 return err;
8157
8158 /*
8159 * Commands dealing with the RAID driver but not any
8160 * particular array:
8161 */
8162 if (cmd == RAID_VERSION)
8163 return get_version(argp);
8164
8165 /*
8166 * Commands creating/starting a new array:
8167 */
8168
8169 mddev = bdev->bd_disk->private_data;
8170
8171 /* Some actions do not require the mutex */
8172 switch (cmd) {
8173 case GET_ARRAY_INFO:
8174 if (!mddev->raid_disks && !mddev->external)
8175 return -ENODEV;
8176 return get_array_info(mddev, argp);
8177
8178 case GET_DISK_INFO:
8179 if (!mddev->raid_disks && !mddev->external)
8180 return -ENODEV;
8181 return get_disk_info(mddev, argp);
8182
8183 case SET_DISK_FAULTY:
8184 return set_disk_faulty(mddev, new_decode_dev(arg));
8185
8186 case GET_BITMAP_FILE:
8187 return get_bitmap_file(mddev, argp);
8188 }
8189
8190 if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
8191 /* Need to flush page cache, and ensure no-one else opens
8192 * and writes
8193 */
8194 err = mddev_set_closing_and_sync_blockdev(mddev, 1);
8195 if (err)
8196 return err;
8197 }
8198
8199 if (!md_is_rdwr(mddev))
8200 flush_work(&mddev->sync_work);
8201
8202 err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev) :
8203 mddev_lock(mddev);
8204 if (err) {
8205 pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n",
8206 err, cmd);
8207 goto out;
8208 }
8209
8210 if (cmd == SET_ARRAY_INFO) {
8211 err = __md_set_array_info(mddev, argp);
8212 goto unlock;
8213 }
8214
8215 /*
8216 * Commands querying/configuring an existing array:
8217 */
8218 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
8219 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
8220 if ((!mddev->raid_disks && !mddev->external)
8221 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
8222 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
8223 && cmd != GET_BITMAP_FILE) {
8224 err = -ENODEV;
8225 goto unlock;
8226 }
8227
8228 /*
8229 * Commands even a read-only array can execute:
8230 */
8231 switch (cmd) {
8232 case RESTART_ARRAY_RW:
8233 err = restart_array(mddev);
8234 goto unlock;
8235
8236 case STOP_ARRAY:
8237 err = do_md_stop(mddev, 0);
8238 goto unlock;
8239
8240 case STOP_ARRAY_RO:
8241 if (mddev->pers)
8242 err = md_set_readonly(mddev);
8243 goto unlock;
8244
8245 case HOT_REMOVE_DISK:
8246 err = hot_remove_disk(mddev, new_decode_dev(arg));
8247 goto unlock;
8248
8249 case ADD_NEW_DISK:
8250 /* We can support ADD_NEW_DISK on read-only arrays
8251 * only if we are re-adding a preexisting device.
8252 * So require mddev->pers and MD_DISK_SYNC.
8253 */
8254 if (mddev->pers) {
8255 mdu_disk_info_t info;
8256 if (copy_from_user(&info, argp, sizeof(info)))
8257 err = -EFAULT;
8258 else if (!(info.state & (1<<MD_DISK_SYNC)))
8259 /* Need to clear read-only for this */
8260 break;
8261 else
8262 err = md_add_new_disk(mddev, &info);
8263 goto unlock;
8264 }
8265 break;
8266 }
8267
8268 /*
8269 * The remaining ioctls are changing the state of the
8270 * superblock, so we do not allow them on read-only arrays.
8271 */
8272 if (!md_is_rdwr(mddev) && mddev->pers) {
8273 if (mddev->ro != MD_AUTO_READ) {
8274 err = -EROFS;
8275 goto unlock;
8276 }
8277 mddev->ro = MD_RDWR;
8278 sysfs_notify_dirent_safe(mddev->sysfs_state);
8279 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8280 /* mddev_unlock will wake thread */
8281 /* If a device failed while we were read-only, we
8282 * need to make sure the metadata is updated now.
8283 */
8284 if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) {
8285 mddev_unlock(mddev);
8286 wait_event(mddev->sb_wait,
8287 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) &&
8288 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
8289 mddev_lock_nointr(mddev);
8290 }
8291 }
8292
8293 switch (cmd) {
8294 case ADD_NEW_DISK:
8295 {
8296 mdu_disk_info_t info;
8297 if (copy_from_user(&info, argp, sizeof(info)))
8298 err = -EFAULT;
8299 else
8300 err = md_add_new_disk(mddev, &info);
8301 goto unlock;
8302 }
8303
8304 case CLUSTERED_DISK_NACK:
8305 if (mddev_is_clustered(mddev))
8306 mddev->cluster_ops->new_disk_ack(mddev, false);
8307 else
8308 err = -EINVAL;
8309 goto unlock;
8310
8311 case HOT_ADD_DISK:
8312 err = hot_add_disk(mddev, new_decode_dev(arg));
8313 goto unlock;
8314
8315 case RUN_ARRAY:
8316 err = do_md_run(mddev);
8317 goto unlock;
8318
8319 case SET_BITMAP_FILE:
8320 err = set_bitmap_file(mddev, (int)arg);
8321 goto unlock;
8322
8323 default:
8324 err = -EINVAL;
8325 goto unlock;
8326 }
8327
8328 unlock:
8329 if (mddev->hold_active == UNTIL_IOCTL &&
8330 err != -EINVAL)
8331 mddev->hold_active = 0;
8332
8333 md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) :
8334 mddev_unlock(mddev);
8335
8336 out:
8337 if (cmd == STOP_ARRAY_RO || (err && cmd == STOP_ARRAY))
8338 clear_bit(MD_CLOSING, &mddev->flags);
8339 return err;
8340 }
8341 #ifdef CONFIG_COMPAT
8342 static int md_compat_ioctl(struct block_device *bdev, blk_mode_t mode,
8343 unsigned int cmd, unsigned long arg)
8344 {
8345 switch (cmd) {
8346 case HOT_REMOVE_DISK:
8347 case HOT_ADD_DISK:
8348 case SET_DISK_FAULTY:
8349 case SET_BITMAP_FILE:
8350 /* These take in integer arg, do not convert */
8351 break;
8352 default:
8353 arg = (unsigned long)compat_ptr(arg);
8354 break;
8355 }
8356
8357 return md_ioctl(bdev, mode, cmd, arg);
8358 }
8359 #endif /* CONFIG_COMPAT */
8360
8361 static int md_set_read_only(struct block_device *bdev, bool ro)
8362 {
8363 struct mddev *mddev = bdev->bd_disk->private_data;
8364 int err;
8365
8366 err = mddev_lock(mddev);
8367 if (err)
8368 return err;
8369
8370 if (!mddev->raid_disks && !mddev->external) {
8371 err = -ENODEV;
8372 goto out_unlock;
8373 }
8374
8375 /*
8376 * Transitioning to read-auto need only happen for arrays that call
8377 * md_write_start and which are not ready for writes yet.
8378 */
8379 if (!ro && mddev->ro == MD_RDONLY && mddev->pers) {
8380 err = restart_array(mddev);
8381 if (err)
8382 goto out_unlock;
8383 mddev->ro = MD_AUTO_READ;
8384 }
8385
8386 out_unlock:
8387 mddev_unlock(mddev);
8388 return err;
8389 }
8390
8391 static int md_open(struct gendisk *disk, blk_mode_t mode)
8392 {
8393 struct mddev *mddev;
8394 int err;
8395
8396 spin_lock(&all_mddevs_lock);
8397 mddev = mddev_get(disk->private_data);
8398 spin_unlock(&all_mddevs_lock);
8399 if (!mddev)
8400 return -ENODEV;
8401
8402 err = mutex_lock_interruptible(&mddev->open_mutex);
8403 if (err)
8404 goto out;
8405
8406 err = -ENODEV;
8407 if (test_bit(MD_CLOSING, &mddev->flags))
8408 goto out_unlock;
8409
8410 atomic_inc(&mddev->openers);
8411 mutex_unlock(&mddev->open_mutex);
8412
8413 disk_check_media_change(disk);
8414 return 0;
8415
8416 out_unlock:
8417 mutex_unlock(&mddev->open_mutex);
8418 out:
8419 mddev_put(mddev);
8420 return err;
8421 }
8422
8423 static void md_release(struct gendisk *disk)
8424 {
8425 struct mddev *mddev = disk->private_data;
8426
8427 BUG_ON(!mddev);
8428 atomic_dec(&mddev->openers);
8429 mddev_put(mddev);
8430 }
8431
8432 static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing)
8433 {
8434 struct mddev *mddev = disk->private_data;
8435 unsigned int ret = 0;
8436
8437 if (mddev->changed)
8438 ret = DISK_EVENT_MEDIA_CHANGE;
8439 mddev->changed = 0;
8440 return ret;
8441 }
8442
8443 static void md_free_disk(struct gendisk *disk)
8444 {
8445 struct mddev *mddev = disk->private_data;
8446
8447 mddev_free(mddev);
8448 }
8449
8450 const struct block_device_operations md_fops =
8451 {
8452 .owner = THIS_MODULE,
8453 .submit_bio = md_submit_bio,
8454 .open = md_open,
8455 .release = md_release,
8456 .ioctl = md_ioctl,
8457 #ifdef CONFIG_COMPAT
8458 .compat_ioctl = md_compat_ioctl,
8459 #endif
8460 .getgeo = md_getgeo,
8461 .check_events = md_check_events,
8462 .set_read_only = md_set_read_only,
8463 .free_disk = md_free_disk,
8464 };
8465
8466 static int md_thread(void *arg)
8467 {
8468 struct md_thread *thread = arg;
8469
8470 /*
8471 * md_thread is a 'system-thread', its priority should be very
8472 * high. We avoid resource deadlocks individually in each
8473 * raid personality. (RAID5 does preallocation) We also use RR and
8474 * the very same RT priority as kswapd, thus we will never get
8475 * into a priority inversion deadlock.
8476 *
8477 * we definitely have to have equal or higher priority than
8478 * bdflush, otherwise bdflush will deadlock if there are too
8479 * many dirty RAID5 blocks.
8480 */
8481
8482 allow_signal(SIGKILL);
8483 while (!kthread_should_stop()) {
8484
8485 /* We need to wait INTERRUPTIBLE so that
8486 * we don't add to the load-average.
8487 * That means we need to be sure no signals are
8488 * pending
8489 */
8490 if (signal_pending(current))
8491 flush_signals(current);
8492
8493 wait_event_interruptible_timeout
8494 (thread->wqueue,
8495 test_bit(THREAD_WAKEUP, &thread->flags)
8496 || kthread_should_stop() || kthread_should_park(),
8497 thread->timeout);
8498
8499 clear_bit(THREAD_WAKEUP, &thread->flags);
8500 if (kthread_should_park())
8501 kthread_parkme();
8502 if (!kthread_should_stop())
8503 thread->run(thread);
8504 }
8505
8506 return 0;
8507 }
8508
8509 static void md_wakeup_thread_directly(struct md_thread __rcu **thread)
8510 {
8511 struct md_thread *t;
8512
8513 rcu_read_lock();
8514 t = rcu_dereference(*thread);
8515 if (t)
8516 wake_up_process(t->tsk);
8517 rcu_read_unlock();
8518 }
8519
8520 void __md_wakeup_thread(struct md_thread __rcu *thread)
8521 {
8522 struct md_thread *t;
8523
8524 t = rcu_dereference(thread);
8525 if (t) {
8526 pr_debug("md: waking up MD thread %s.\n", t->tsk->comm);
8527 set_bit(THREAD_WAKEUP, &t->flags);
8528 if (wq_has_sleeper(&t->wqueue))
8529 wake_up(&t->wqueue);
8530 }
8531 }
8532 EXPORT_SYMBOL(__md_wakeup_thread);
8533
8534 struct md_thread *md_register_thread(void (*run) (struct md_thread *),
8535 struct mddev *mddev, const char *name)
8536 {
8537 struct md_thread *thread;
8538
8539 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL);
8540 if (!thread)
8541 return NULL;
8542
8543 init_waitqueue_head(&thread->wqueue);
8544
8545 thread->run = run;
8546 thread->mddev = mddev;
8547 thread->timeout = MAX_SCHEDULE_TIMEOUT;
8548 thread->tsk = kthread_run(md_thread, thread,
8549 "%s_%s",
8550 mdname(thread->mddev),
8551 name);
8552 if (IS_ERR(thread->tsk)) {
8553 kfree(thread);
8554 return NULL;
8555 }
8556 return thread;
8557 }
8558 EXPORT_SYMBOL(md_register_thread);
8559
8560 void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp)
8561 {
8562 struct md_thread *thread = rcu_dereference_protected(*threadp,
8563 lockdep_is_held(&mddev->reconfig_mutex));
8564
8565 if (!thread)
8566 return;
8567
8568 rcu_assign_pointer(*threadp, NULL);
8569 synchronize_rcu();
8570
8571 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
8572 kthread_stop(thread->tsk);
8573 kfree(thread);
8574 }
8575 EXPORT_SYMBOL(md_unregister_thread);
8576
8577 void md_error(struct mddev *mddev, struct md_rdev *rdev)
8578 {
8579 if (!rdev || test_bit(Faulty, &rdev->flags))
8580 return;
8581
8582 if (!mddev->pers || !mddev->pers->error_handler)
8583 return;
8584 mddev->pers->error_handler(mddev, rdev);
8585
8586 if (mddev->pers->head.id == ID_RAID0 ||
8587 mddev->pers->head.id == ID_LINEAR)
8588 return;
8589
8590 if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags))
8591 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
8592 sysfs_notify_dirent_safe(rdev->sysfs_state);
8593 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8594 if (!test_bit(MD_BROKEN, &mddev->flags)) {
8595 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8596 md_wakeup_thread(mddev->thread);
8597 }
8598 if (mddev->event_work.func)
8599 queue_work(md_misc_wq, &mddev->event_work);
8600 md_new_event();
8601 }
8602 EXPORT_SYMBOL(md_error);
8603
8604 /* seq_file implementation /proc/mdstat */
8605
8606 static void status_unused(struct seq_file *seq)
8607 {
8608 int i = 0;
8609 struct md_rdev *rdev;
8610
8611 seq_printf(seq, "unused devices: ");
8612
8613 list_for_each_entry(rdev, &pending_raid_disks, same_set) {
8614 i++;
8615 seq_printf(seq, "%pg ", rdev->bdev);
8616 }
8617 if (!i)
8618 seq_printf(seq, "<none>");
8619
8620 seq_printf(seq, "\n");
8621 }
8622
8623 static void status_personalities(struct seq_file *seq)
8624 {
8625 struct md_submodule_head *head;
8626 unsigned long i;
8627
8628 seq_puts(seq, "Personalities : ");
8629
8630 xa_lock(&md_submodule);
8631 xa_for_each(&md_submodule, i, head)
8632 if (head->type == MD_PERSONALITY)
8633 seq_printf(seq, "[%s] ", head->name);
8634 xa_unlock(&md_submodule);
8635
8636 seq_puts(seq, "\n");
8637 }
8638
8639 static int status_resync(struct seq_file *seq, struct mddev *mddev)
8640 {
8641 sector_t max_sectors, resync, res;
8642 unsigned long dt, db = 0;
8643 sector_t rt, curr_mark_cnt, resync_mark_cnt;
8644 int scale, recovery_active;
8645 unsigned int per_milli;
8646
8647 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
8648 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
8649 max_sectors = mddev->resync_max_sectors;
8650 else
8651 max_sectors = mddev->dev_sectors;
8652
8653 resync = mddev->curr_resync;
8654 if (resync < MD_RESYNC_ACTIVE) {
8655 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
8656 /* Still cleaning up */
8657 resync = max_sectors;
8658 } else if (resync > max_sectors) {
8659 resync = max_sectors;
8660 } else {
8661 res = atomic_read(&mddev->recovery_active);
8662 /*
8663 * Resync has started, but the subtraction has overflowed or
8664 * yielded one of the special values. Force it to active to
8665 * ensure the status reports an active resync.
8666 */
8667 if (resync < res || resync - res < MD_RESYNC_ACTIVE)
8668 resync = MD_RESYNC_ACTIVE;
8669 else
8670 resync -= res;
8671 }
8672
8673 if (resync == MD_RESYNC_NONE) {
8674 if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) {
8675 struct md_rdev *rdev;
8676
8677 rdev_for_each(rdev, mddev)
8678 if (rdev->raid_disk >= 0 &&
8679 !test_bit(Faulty, &rdev->flags) &&
8680 rdev->recovery_offset != MaxSector &&
8681 rdev->recovery_offset) {
8682 seq_printf(seq, "\trecover=REMOTE");
8683 return 1;
8684 }
8685 if (mddev->reshape_position != MaxSector)
8686 seq_printf(seq, "\treshape=REMOTE");
8687 else
8688 seq_printf(seq, "\tresync=REMOTE");
8689 return 1;
8690 }
8691 if (mddev->resync_offset < MaxSector) {
8692 seq_printf(seq, "\tresync=PENDING");
8693 return 1;
8694 }
8695 return 0;
8696 }
8697 if (resync < MD_RESYNC_ACTIVE) {
8698 seq_printf(seq, "\tresync=DELAYED");
8699 return 1;
8700 }
8701
8702 WARN_ON(max_sectors == 0);
8703 /* Pick 'scale' such that (resync>>scale)*1000 will fit
8704 * in a sector_t, and (max_sectors>>scale) will fit in a
8705 * u32, as those are the requirements for sector_div.
8706 * Thus 'scale' must be at least 10
8707 */
8708 scale = 10;
8709 if (sizeof(sector_t) > sizeof(unsigned long)) {
8710 while ( max_sectors/2 > (1ULL<<(scale+32)))
8711 scale++;
8712 }
8713 res = (resync>>scale)*1000;
8714 sector_div(res, (u32)((max_sectors>>scale)+1));
8715
8716 per_milli = res;
8717 {
8718 int i, x = per_milli/50, y = 20-x;
8719 seq_printf(seq, "[");
8720 for (i = 0; i < x; i++)
8721 seq_printf(seq, "=");
8722 seq_printf(seq, ">");
8723 for (i = 0; i < y; i++)
8724 seq_printf(seq, ".");
8725 seq_printf(seq, "] ");
8726 }
8727 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
8728 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
8729 "reshape" :
8730 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
8731 "check" :
8732 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
8733 "resync" : "recovery"))),
8734 per_milli/10, per_milli % 10,
8735 (unsigned long long) resync/2,
8736 (unsigned long long) max_sectors/2);
8737
8738 /*
8739 * dt: time from mark until now
8740 * db: blocks written from mark until now
8741 * rt: remaining time
8742 *
8743 * rt is a sector_t, which is always 64bit now. We are keeping
8744 * the original algorithm, but it is not really necessary.
8745 *
8746 * Original algorithm:
8747 * So we divide before multiply in case it is 32bit and close
8748 * to the limit.
8749 * We scale the divisor (db) by 32 to avoid losing precision
8750 * near the end of resync when the number of remaining sectors
8751 * is close to 'db'.
8752 * We then divide rt by 32 after multiplying by db to compensate.
8753 * The '+1' avoids division by zero if db is very small.
8754 */
8755 dt = ((jiffies - mddev->resync_mark) / HZ);
8756 if (!dt) dt++;
8757
8758 curr_mark_cnt = mddev->curr_mark_cnt;
8759 recovery_active = atomic_read(&mddev->recovery_active);
8760 resync_mark_cnt = mddev->resync_mark_cnt;
8761
8762 if (curr_mark_cnt >= (recovery_active + resync_mark_cnt))
8763 db = curr_mark_cnt - (recovery_active + resync_mark_cnt);
8764
8765 rt = max_sectors - resync; /* number of remaining sectors */
8766 rt = div64_u64(rt, db/32+1);
8767 rt *= dt;
8768 rt >>= 5;
8769
8770 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
8771 ((unsigned long)rt % 60)/6);
8772
8773 seq_printf(seq, " speed=%ldK/sec", db/2/dt);
8774 return 1;
8775 }
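
/*
 * A self-contained sketch (userspace, not kernel code) of the progress and
 * ETA arithmetic used in status_resync() above, written with plain 64-bit
 * types instead of sector_t/sector_div/div64_u64.  'resync' and
 * 'max_sectors' are in 512-byte sectors, 'db' is sectors moved since the
 * last mark and 'dt' is the elapsed seconds, mirroring the variables above.
 * The function names are illustrative assumptions.
 */
#if 0	/* illustration only */
#include <stdint.h>

static unsigned int progress_per_milli(uint64_t resync, uint64_t max_sectors)
{
	int scale = 10;

	/* keep (resync >> scale) * 1000 in 64 bits and the divisor in 32 */
	while (max_sectors / 2 > (1ULL << (scale + 32)))
		scale++;

	return (unsigned int)(((resync >> scale) * 1000) /
			      ((max_sectors >> scale) + 1));
}

static uint64_t eta_seconds(uint64_t resync, uint64_t max_sectors,
			    uint64_t db, uint64_t dt)
{
	uint64_t rt = max_sectors - resync;	/* remaining sectors */

	/* divide by (db/32 + 1) first to limit overflow, then compensate */
	rt /= db / 32 + 1;
	rt *= dt;
	return rt >> 5;
}
#endif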
8776
8777 static void *md_seq_start(struct seq_file *seq, loff_t *pos)
8778 __acquires(&all_mddevs_lock)
8779 {
8780 seq->poll_event = atomic_read(&md_event_count);
8781 spin_lock(&all_mddevs_lock);
8782
8783 return seq_list_start_head(&all_mddevs, *pos);
8784 }
8785
8786 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
8787 {
8788 return seq_list_next(v, &all_mddevs, pos);
8789 }
8790
8791 static void md_seq_stop(struct seq_file *seq, void *v)
8792 __releases(&all_mddevs_lock)
8793 {
8794 spin_unlock(&all_mddevs_lock);
8795 }
8796
8797 static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev)
8798 {
8799 struct md_bitmap_stats stats;
8800 unsigned long used_pages;
8801 unsigned long chunk_kb;
8802 int err;
8803
8804 if (!md_bitmap_enabled(mddev, false))
8805 return;
8806
8807 err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
8808 if (err)
8809 return;
8810
8811 chunk_kb = mddev->bitmap_info.chunksize >> 10;
8812 used_pages = stats.pages - stats.missing_pages;
8813
8814 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], %lu%s chunk",
8815 used_pages, stats.pages, used_pages << (PAGE_SHIFT - 10),
8816 chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize,
8817 chunk_kb ? "KB" : "B");
8818
8819 if (stats.file) {
8820 seq_puts(seq, ", file: ");
8821 seq_file_path(seq, stats.file, " \t\n");
8822 }
8823
8824 seq_putc(seq, '\n');
8825 }
8826
8827 static int md_seq_show(struct seq_file *seq, void *v)
8828 {
8829 struct mddev *mddev;
8830 sector_t sectors;
8831 struct md_rdev *rdev;
8832
8833 if (v == &all_mddevs) {
8834 status_personalities(seq);
8835 if (list_empty(&all_mddevs))
8836 status_unused(seq);
8837 return 0;
8838 }
8839
8840 mddev = list_entry(v, struct mddev, all_mddevs);
8841 if (!mddev_get(mddev))
8842 return 0;
8843
8844 spin_unlock(&all_mddevs_lock);
8845
8846 /* prevent the bitmap from being freed after checking */
8847 mutex_lock(&mddev->bitmap_info.mutex);
8848
8849 spin_lock(&mddev->lock);
8850 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
8851 seq_printf(seq, "%s : ", mdname(mddev));
8852 if (mddev->pers) {
8853 if (test_bit(MD_BROKEN, &mddev->flags))
8854 seq_printf(seq, "broken");
8855 else
8856 seq_printf(seq, "active");
8857 if (mddev->ro == MD_RDONLY)
8858 seq_printf(seq, " (read-only)");
8859 if (mddev->ro == MD_AUTO_READ)
8860 seq_printf(seq, " (auto-read-only)");
8861 seq_printf(seq, " %s", mddev->pers->head.name);
8862 } else {
8863 seq_printf(seq, "inactive");
8864 }
8865
8866 sectors = 0;
8867 rcu_read_lock();
8868 rdev_for_each_rcu(rdev, mddev) {
8869 seq_printf(seq, " %pg[%d]", rdev->bdev, rdev->desc_nr);
8870
8871 if (test_bit(WriteMostly, &rdev->flags))
8872 seq_printf(seq, "(W)");
8873 if (test_bit(Journal, &rdev->flags))
8874 seq_printf(seq, "(J)");
8875 if (test_bit(Faulty, &rdev->flags)) {
8876 seq_printf(seq, "(F)");
8877 continue;
8878 }
8879 if (rdev->raid_disk < 0)
8880 seq_printf(seq, "(S)"); /* spare */
8881 if (test_bit(Replacement, &rdev->flags))
8882 seq_printf(seq, "(R)");
8883 sectors += rdev->sectors;
8884 }
8885 rcu_read_unlock();
8886
8887 if (!list_empty(&mddev->disks)) {
8888 if (mddev->pers)
8889 seq_printf(seq, "\n %llu blocks",
8890 (unsigned long long)
8891 mddev->array_sectors / 2);
8892 else
8893 seq_printf(seq, "\n %llu blocks",
8894 (unsigned long long)sectors / 2);
8895 }
8896 if (mddev->persistent) {
8897 if (mddev->major_version != 0 ||
8898 mddev->minor_version != 90) {
8899 seq_printf(seq," super %d.%d",
8900 mddev->major_version,
8901 mddev->minor_version);
8902 }
8903 } else if (mddev->external)
8904 seq_printf(seq, " super external:%s",
8905 mddev->metadata_type);
8906 else
8907 seq_printf(seq, " super non-persistent");
8908
8909 if (mddev->pers) {
8910 mddev->pers->status(seq, mddev);
8911 seq_printf(seq, "\n ");
8912 if (mddev->pers->sync_request) {
8913 if (status_resync(seq, mddev))
8914 seq_printf(seq, "\n ");
8915 }
8916 } else
8917 seq_printf(seq, "\n ");
8918
8919 md_bitmap_status(seq, mddev);
8920
8921 seq_printf(seq, "\n");
8922 }
8923 spin_unlock(&mddev->lock);
8924 mutex_unlock(&mddev->bitmap_info.mutex);
8925 spin_lock(&all_mddevs_lock);
8926
8927 if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs))
8928 status_unused(seq);
8929
8930 mddev_put_locked(mddev);
8931 return 0;
8932 }
8933
8934 static const struct seq_operations md_seq_ops = {
8935 .start = md_seq_start,
8936 .next = md_seq_next,
8937 .stop = md_seq_stop,
8938 .show = md_seq_show,
8939 };
8940
8941 static int md_seq_open(struct inode *inode, struct file *file)
8942 {
8943 struct seq_file *seq;
8944 int error;
8945
8946 error = seq_open(file, &md_seq_ops);
8947 if (error)
8948 return error;
8949
8950 seq = file->private_data;
8951 seq->poll_event = atomic_read(&md_event_count);
8952 return error;
8953 }
8954
8955 static int md_unloading;
8956 static __poll_t mdstat_poll(struct file *filp, poll_table *wait)
8957 {
8958 struct seq_file *seq = filp->private_data;
8959 __poll_t mask;
8960
8961 if (md_unloading)
8962 return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
8963 poll_wait(filp, &md_event_waiters, wait);
8964
8965 /* always allow read */
8966 mask = EPOLLIN | EPOLLRDNORM;
8967
8968 if (seq->poll_event != atomic_read(&md_event_count))
8969 mask |= EPOLLERR | EPOLLPRI;
8970 return mask;
8971 }
8972
8973 static const struct proc_ops mdstat_proc_ops = {
8974 .proc_open = md_seq_open,
8975 .proc_read = seq_read,
8976 .proc_lseek = seq_lseek,
8977 .proc_release = seq_release,
8978 .proc_poll = mdstat_poll,
8979 };
8980
8981 int register_md_submodule(struct md_submodule_head *msh)
8982 {
8983 return xa_insert(&md_submodule, msh->id, msh, GFP_KERNEL);
8984 }
8985 EXPORT_SYMBOL_GPL(register_md_submodule);
8986
8987 void unregister_md_submodule(struct md_submodule_head *msh)
8988 {
8989 xa_erase(&md_submodule, msh->id);
8990 }
8991 EXPORT_SYMBOL_GPL(unregister_md_submodule);
8992
8993 int md_setup_cluster(struct mddev *mddev, int nodes)
8994 {
8995 int ret = get_cluster_ops(mddev);
8996
8997 if (ret) {
8998 request_module("md-cluster");
8999 ret = get_cluster_ops(mddev);
9000 }
9001
9002 /* ensure module won't be unloaded */
9003 if (ret) {
9004 pr_warn("can't find md-cluster module or get its reference.\n");
9005 return ret;
9006 }
9007
9008 ret = mddev->cluster_ops->join(mddev, nodes);
9009 if (!ret)
9010 mddev->safemode_delay = 0;
9011 return ret;
9012 }
9013
9014 void md_cluster_stop(struct mddev *mddev)
9015 {
9016 put_cluster_ops(mddev);
9017 }
9018
9019 static bool is_rdev_holder_idle(struct md_rdev *rdev, bool init)
9020 {
9021 unsigned long last_events = rdev->last_events;
9022
9023 if (!bdev_is_partition(rdev->bdev))
9024 return true;
9025
9026 /*
9027 * If rdev is a partition and the user doesn't issue IO to the array, the
9028 * array is still not idle if the user issues IO to other partitions.
9029 */
9030 rdev->last_events = part_stat_read_accum(rdev->bdev->bd_disk->part0,
9031 sectors) -
9032 part_stat_read_accum(rdev->bdev, sectors);
9033
9034 return init || rdev->last_events <= last_events;
9035 }
9036
9037 /*
9038 * mddev is idle if the following conditions have held since the last check:
9039 * 1) mddev has not completed any normal IO;
9040 * 2) mddev has no inflight normal IO;
9041 * 3) if any member disk is a partition, the other partitions of that disk
9042 * have not completed any IO;
9043 *
9044 * Note that this check relies on IO accounting being enabled.
9045 */
9046 static bool is_mddev_idle(struct mddev *mddev, int init)
9047 {
9048 unsigned long last_events = mddev->normal_io_events;
9049 struct gendisk *disk;
9050 struct md_rdev *rdev;
9051 bool idle = true;
9052
9053 disk = mddev_is_dm(mddev) ? mddev->dm_gendisk : mddev->gendisk;
9054 if (!disk)
9055 return true;
9056
9057 mddev->normal_io_events = part_stat_read_accum(disk->part0, sectors);
9058 if (!init && (mddev->normal_io_events > last_events ||
9059 bdev_count_inflight(disk->part0)))
9060 idle = false;
9061
9062 rcu_read_lock();
9063 rdev_for_each_rcu(rdev, mddev)
9064 if (!is_rdev_holder_idle(rdev, init))
9065 idle = false;
9066 rcu_read_unlock();
9067
9068 return idle;
9069 }
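
/*
 * A small standalone sketch (not kernel code) of the bookkeeping done in
 * is_rdev_holder_idle() above: IO issued to *other* partitions of the same
 * disk is the whole-disk sector count minus this partition's count, and
 * the member only counts as idle if that delta did not grow since the
 * previous sample.  The 'init' special case is omitted and the names are
 * illustrative assumptions.
 */
#if 0	/* illustration only */
#include <stdbool.h>
#include <stdint.h>

struct holder_sample {
	uint64_t last_other_sectors;	/* delta seen at the previous check */
};

static bool holder_idle(struct holder_sample *s, uint64_t disk_sectors,
			uint64_t part_sectors)
{
	uint64_t other = disk_sectors - part_sectors;
	bool idle = other <= s->last_other_sectors;

	s->last_other_sectors = other;
	return idle;
}
#endif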
9070
9071 void md_done_sync(struct mddev *mddev, int blocks, int ok)
9072 {
9073 /* another "blocks" (512byte) blocks have been synced */
9074 atomic_sub(blocks, &mddev->recovery_active);
9075 wake_up(&mddev->recovery_wait);
9076 if (!ok) {
9077 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
9078 set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
9079 md_wakeup_thread(mddev->thread);
9080 // stop recovery, signal do_sync ....
9081 }
9082 }
9083 EXPORT_SYMBOL(md_done_sync);
9084
9085 /* md_write_start(mddev, bi)
9086 * If we need to update some array metadata (e.g. 'active' flag
9087 * in superblock) before writing, schedule a superblock update
9088 * and wait for it to complete.
9089 * A return value of 'false' means that the write wasn't recorded
9090 * and cannot proceed as the array is being suspended.
9091 */
9092 void md_write_start(struct mddev *mddev, struct bio *bi)
9093 {
9094 int did_change = 0;
9095
9096 if (bio_data_dir(bi) != WRITE)
9097 return;
9098
9099 BUG_ON(mddev->ro == MD_RDONLY);
9100 if (mddev->ro == MD_AUTO_READ) {
9101 /* need to switch to read/write */
9102 mddev->ro = MD_RDWR;
9103 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9104 md_wakeup_thread(mddev->thread);
9105 md_wakeup_thread(mddev->sync_thread);
9106 did_change = 1;
9107 }
9108 rcu_read_lock();
9109 percpu_ref_get(&mddev->writes_pending);
9110 smp_mb(); /* Match smp_mb in set_in_sync() */
9111 if (mddev->safemode == 1)
9112 mddev->safemode = 0;
9113 /* sync_checkers is always 0 when writes_pending is in per-cpu mode */
9114 if (mddev->in_sync || mddev->sync_checkers) {
9115 spin_lock(&mddev->lock);
9116 if (mddev->in_sync) {
9117 mddev->in_sync = 0;
9118 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
9119 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
9120 md_wakeup_thread(mddev->thread);
9121 did_change = 1;
9122 }
9123 spin_unlock(&mddev->lock);
9124 }
9125 rcu_read_unlock();
9126 if (did_change)
9127 sysfs_notify_dirent_safe(mddev->sysfs_state);
9128 if (!mddev->has_superblocks)
9129 return;
9130 wait_event(mddev->sb_wait,
9131 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
9132 }
9133 EXPORT_SYMBOL(md_write_start);
9134
9135 /* md_write_inc can only be called when md_write_start() has
9136 * already been called at least once for the current request.
9137 * It increments the counter and is useful when a single request
9138 * is split into several parts. Each part causes an increment and
9139 * so needs a matching md_write_end().
9140 * Unlike md_write_start(), it is safe to call md_write_inc() inside
9141 * a spinlocked region.
9142 */
9143 void md_write_inc(struct mddev *mddev, struct bio *bi)
9144 {
9145 if (bio_data_dir(bi) != WRITE)
9146 return;
9147 WARN_ON_ONCE(mddev->in_sync || !md_is_rdwr(mddev));
9148 percpu_ref_get(&mddev->writes_pending);
9149 }
9150 EXPORT_SYMBOL(md_write_inc);
9151
9152 void md_write_end(struct mddev *mddev)
9153 {
9154 percpu_ref_put(&mddev->writes_pending);
9155
9156 if (mddev->safemode == 2)
9157 md_wakeup_thread(mddev->thread);
9158 else if (mddev->safemode_delay)
9159 /* The roundup() ensures this only performs locking once
9160 * every ->safemode_delay jiffies
9161 */
9162 mod_timer(&mddev->safemode_timer,
9163 roundup(jiffies, mddev->safemode_delay) +
9164 mddev->safemode_delay);
9165 }
9166
9167 EXPORT_SYMBOL(md_write_end);
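
/*
 * A hypothetical sketch of how a personality is expected to pair the calls
 * above in its write path: md_write_start() once up front (it may sleep
 * waiting for the superblock update), md_write_inc() for each additional
 * split taken inside a locked region, and one md_write_end() per
 * start/inc when the corresponding part completes.  This is not a real
 * personality; the function names are illustrative assumptions.
 */
#if 0	/* illustration only */
static void example_submit_write(struct mddev *mddev, struct bio *bio)
{
	md_write_start(mddev, bio);	/* may sleep; marks the array active */
	/* each additional split of the request takes one extra reference: */
	md_write_inc(mddev, bio);	/* safe under a spinlock */
	/* ... remap and submit the parts ... */
}

static void example_write_done(struct mddev *mddev, struct bio *bio)
{
	/* called once per md_write_start()/md_write_inc() taken above */
	md_write_end(mddev);
	bio_endio(bio);
}
#endif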
9168
9169 /* This is used by raid0 and raid10 */
9170 void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
9171 struct bio *bio, sector_t start, sector_t size)
9172 {
9173 struct bio *discard_bio = NULL;
9174
9175 if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO,
9176 &discard_bio) || !discard_bio)
9177 return;
9178
9179 bio_chain(discard_bio, bio);
9180 bio_clone_blkg_association(discard_bio, bio);
9181 mddev_trace_remap(mddev, discard_bio, bio->bi_iter.bi_sector);
9182 submit_bio_noacct(discard_bio);
9183 }
9184 EXPORT_SYMBOL_GPL(md_submit_discard_bio);
9185
9186 static void md_bitmap_start(struct mddev *mddev,
9187 struct md_io_clone *md_io_clone)
9188 {
9189 md_bitmap_fn *fn = unlikely(md_io_clone->rw == STAT_DISCARD) ?
9190 mddev->bitmap_ops->start_discard :
9191 mddev->bitmap_ops->start_write;
9192
9193 if (mddev->pers->bitmap_sector)
9194 mddev->pers->bitmap_sector(mddev, &md_io_clone->offset,
9195 &md_io_clone->sectors);
9196
9197 fn(mddev, md_io_clone->offset, md_io_clone->sectors);
9198 }
9199
9200 static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone)
9201 {
9202 md_bitmap_fn *fn = unlikely(md_io_clone->rw == STAT_DISCARD) ?
9203 mddev->bitmap_ops->end_discard :
9204 mddev->bitmap_ops->end_write;
9205
9206 fn(mddev, md_io_clone->offset, md_io_clone->sectors);
9207 }
9208
9209 static void md_end_clone_io(struct bio *bio)
9210 {
9211 struct md_io_clone *md_io_clone = bio->bi_private;
9212 struct bio *orig_bio = md_io_clone->orig_bio;
9213 struct mddev *mddev = md_io_clone->mddev;
9214
9215 if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false))
9216 md_bitmap_end(mddev, md_io_clone);
9217
9218 if (bio->bi_status && !orig_bio->bi_status)
9219 orig_bio->bi_status = bio->bi_status;
9220
9221 if (md_io_clone->start_time)
9222 bio_end_io_acct(orig_bio, md_io_clone->start_time);
9223
9224 bio_put(bio);
9225 bio_endio(orig_bio);
9226 percpu_ref_put(&mddev->active_io);
9227 }
9228
9229 static void md_clone_bio(struct mddev *mddev, struct bio **bio)
9230 {
9231 struct block_device *bdev = (*bio)->bi_bdev;
9232 struct md_io_clone *md_io_clone;
9233 struct bio *clone =
9234 bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set);
9235
9236 md_io_clone = container_of(clone, struct md_io_clone, bio_clone);
9237 md_io_clone->orig_bio = *bio;
9238 md_io_clone->mddev = mddev;
9239 if (blk_queue_io_stat(bdev->bd_disk->queue))
9240 md_io_clone->start_time = bio_start_io_acct(*bio);
9241
9242 if (bio_data_dir(*bio) == WRITE && md_bitmap_enabled(mddev, false)) {
9243 md_io_clone->offset = (*bio)->bi_iter.bi_sector;
9244 md_io_clone->sectors = bio_sectors(*bio);
9245 md_io_clone->rw = op_stat_group(bio_op(*bio));
9246 md_bitmap_start(mddev, md_io_clone);
9247 }
9248
9249 clone->bi_end_io = md_end_clone_io;
9250 clone->bi_private = md_io_clone;
9251 *bio = clone;
9252 }
9253
9254 void md_account_bio(struct mddev *mddev, struct bio **bio)
9255 {
9256 percpu_ref_get(&mddev->active_io);
9257 md_clone_bio(mddev, bio);
9258 }
9259 EXPORT_SYMBOL_GPL(md_account_bio);
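
/*
 * Usage sketch (illustrative): a personality calls this early in its
 * make_request path so that the original bio pointer is swapped for the
 * accounting clone before the IO is remapped; when the clone completes,
 * md_end_clone_io() takes care of IO stats, the bitmap end hook and the
 * active_io reference taken here.
 *
 *	md_account_bio(mddev, &bio);	// 'bio' now points at the clone
 *	// remap and submit 'bio' as usual; never touch the original again
 */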
9260
9261 void md_free_cloned_bio(struct bio *bio)
9262 {
9263 struct md_io_clone *md_io_clone = bio->bi_private;
9264 struct bio *orig_bio = md_io_clone->orig_bio;
9265 struct mddev *mddev = md_io_clone->mddev;
9266
9267 if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false))
9268 md_bitmap_end(mddev, md_io_clone);
9269
9270 if (bio->bi_status && !orig_bio->bi_status)
9271 orig_bio->bi_status = bio->bi_status;
9272
9273 if (md_io_clone->start_time)
9274 bio_end_io_acct(orig_bio, md_io_clone->start_time);
9275
9276 bio_put(bio);
9277 percpu_ref_put(&mddev->active_io);
9278 }
9279 EXPORT_SYMBOL_GPL(md_free_cloned_bio);
9280
9281 /* md_allow_write(mddev)
9282 * Calling this ensures that the array is marked 'active' so that writes
9283 * may proceed without blocking. It is important to call this before
9284 * attempting a GFP_KERNEL allocation while holding the mddev lock.
9285 * Must be called with mddev_lock held.
9286 */
9287 void md_allow_write(struct mddev *mddev)
9288 {
9289 if (!mddev->pers)
9290 return;
9291 if (!md_is_rdwr(mddev))
9292 return;
9293 if (!mddev->pers->sync_request)
9294 return;
9295
9296 spin_lock(&mddev->lock);
9297 if (mddev->in_sync) {
9298 mddev->in_sync = 0;
9299 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
9300 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
9301 if (mddev->safemode_delay &&
9302 mddev->safemode == 0)
9303 mddev->safemode = 1;
9304 spin_unlock(&mddev->lock);
9305 md_update_sb(mddev, 0);
9306 sysfs_notify_dirent_safe(mddev->sysfs_state);
9307 /* wait for the dirty state to be recorded in the metadata */
9308 wait_event(mddev->sb_wait,
9309 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
9310 } else
9311 spin_unlock(&mddev->lock);
9312 }
9313 EXPORT_SYMBOL_GPL(md_allow_write);
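
/*
 * Usage sketch (illustrative): a caller that needs a GFP_KERNEL allocation
 * while holding the mddev lock marks the array active first, so that any
 * writeback triggered by memory reclaim does not stall behind a superblock
 * update that would itself need this lock.  'new_conf' is a hypothetical
 * per-personality structure and error handling is omitted.
 *
 *	mddev_lock(mddev);
 *	md_allow_write(mddev);
 *	new_conf = kzalloc(sizeof(*new_conf), GFP_KERNEL);
 *	mddev_unlock(mddev);
 */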
9314
9315 static sector_t md_sync_max_sectors(struct mddev *mddev,
9316 enum sync_action action)
9317 {
9318 switch (action) {
9319 case ACTION_RESYNC:
9320 case ACTION_CHECK:
9321 case ACTION_REPAIR:
9322 atomic64_set(&mddev->resync_mismatches, 0);
9323 fallthrough;
9324 case ACTION_RESHAPE:
9325 return mddev->resync_max_sectors;
9326 case ACTION_RECOVER:
9327 return mddev->dev_sectors;
9328 default:
9329 return 0;
9330 }
9331 }
9332
9333 /*
9334 * If lazy recovery is requested and all rdevs are in sync, select the rdev with
9335 * the highest index to perform recovery and build the initial xor data; this
9336 * matches the behaviour of the old bitmap.
9337 */
9338 static bool mddev_select_lazy_recover_rdev(struct mddev *mddev)
9339 {
9340 struct md_rdev *recover_rdev = NULL;
9341 struct md_rdev *rdev;
9342 bool ret = false;
9343
9344 rcu_read_lock();
9345 rdev_for_each_rcu(rdev, mddev) {
9346 if (rdev->raid_disk < 0)
9347 continue;
9348
9349 if (test_bit(Faulty, &rdev->flags) ||
9350 !test_bit(In_sync, &rdev->flags))
9351 break;
9352
9353 if (!recover_rdev || recover_rdev->raid_disk < rdev->raid_disk)
9354 recover_rdev = rdev;
9355 }
9356
9357 if (recover_rdev) {
9358 clear_bit(In_sync, &recover_rdev->flags);
9359 ret = true;
9360 }
9361
9362 rcu_read_unlock();
9363 return ret;
9364 }
9365
9366 static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
9367 {
9368 sector_t start = 0;
9369 struct md_rdev *rdev;
9370
9371 switch (action) {
9372 case ACTION_CHECK:
9373 case ACTION_REPAIR:
9374 return mddev->resync_min;
9375 case ACTION_RESYNC:
9376 if (!mddev->bitmap)
9377 return mddev->resync_offset;
9378 return 0;
9379 case ACTION_RESHAPE:
9380 /*
9381 * If the original node aborts the reshape then we continue it,
9382 * so set the position again to avoid restarting the reshape
9383 * from the very beginning.
9384 */
9385 if (mddev_is_clustered(mddev) &&
9386 mddev->reshape_position != MaxSector)
9387 return mddev->reshape_position;
9388 return 0;
9389 case ACTION_RECOVER:
9390 start = MaxSector;
9391 rcu_read_lock();
9392 rdev_for_each_rcu(rdev, mddev)
9393 if (rdev_needs_recovery(rdev, start))
9394 start = rdev->recovery_offset;
9395 rcu_read_unlock();
9396
9397 /*
9398 * If there are no spares and a raid456 lazy initial recovery is
9399 * requested, pick an in-sync rdev and start recovery from sector 0.
9400 */
9401 if (test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery) &&
9402 start == MaxSector && mddev_select_lazy_recover_rdev(mddev))
9403 start = 0;
9404
9405 /* If there is a bitmap, we need to make sure all
9406 * writes that started before we added a spare
9407 * complete before we start doing a recovery.
9408 * Otherwise the write might complete and (via
9409 * bitmap_endwrite) set a bit in the bitmap after the
9410 * recovery has checked that bit and skipped that
9411 * region.
9412 */
9413 if (mddev->bitmap) {
9414 mddev->pers->quiesce(mddev, 1);
9415 mddev->pers->quiesce(mddev, 0);
9416 }
9417 return start;
9418 default:
9419 return MaxSector;
9420 }
9421 }
9422
9423 static bool sync_io_within_limit(struct mddev *mddev)
9424 {
9425 /*
9426 * For raid456, sync IO is stripe(4k) per IO, for other levels, it's
9427 * RESYNC_PAGES(64k) per IO.
9428 */
9429 return atomic_read(&mddev->recovery_active) <
9430 (raid_is_456(mddev) ? 8 : 128) * sync_io_depth(mddev);
9431 }
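
/*
 * Worked example for the limit above (illustrative numbers): with
 * sync_io_depth(mddev) == 32, a raid456 array stays within the limit while
 * fewer than 8 * 32 = 256 sectors (32 stripe-sized 4k IOs) are in
 * recovery_active; other levels stay within it while fewer than
 * 128 * 32 = 4096 sectors (32 IOs of RESYNC_PAGES == 64k each) are in flight.
 */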
9432
9433 #define SYNC_MARKS 10
9434 #define SYNC_MARK_STEP (3*HZ)
9435 #define UPDATE_FREQUENCY (5*60*HZ)
9436 void md_do_sync(struct md_thread *thread)
9437 {
9438 struct mddev *mddev = thread->mddev;
9439 struct mddev *mddev2;
9440 unsigned int currspeed = 0, window;
9441 sector_t max_sectors,j, io_sectors, recovery_done;
9442 unsigned long mark[SYNC_MARKS];
9443 unsigned long update_time;
9444 sector_t mark_cnt[SYNC_MARKS];
9445 int last_mark,m;
9446 sector_t last_check;
9447 int skipped = 0;
9448 struct md_rdev *rdev;
9449 enum sync_action action;
9450 const char *desc;
9451 struct blk_plug plug;
9452 int ret;
9453
9454 /* just in case thread restarts... */
9455 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
9456 return;
9457
9458 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9459 goto skip;
9460
9461 if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) ||
9462 !md_is_rdwr(mddev)) {/* never try to sync a read-only array */
9463 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
9464 goto skip;
9465 }
9466
9467 if (mddev_is_clustered(mddev)) {
9468 ret = mddev->cluster_ops->resync_start(mddev);
9469 if (ret)
9470 goto skip;
9471
9472 set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags);
9473 if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
9474 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ||
9475 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
9476 && ((unsigned long long)mddev->curr_resync_completed
9477 < (unsigned long long)mddev->resync_max_sectors))
9478 goto skip;
9479 }
9480
9481 action = md_sync_action(mddev);
9482 if (action == ACTION_FROZEN || action == ACTION_IDLE) {
9483 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
9484 goto skip;
9485 }
9486
9487 desc = md_sync_action_name(action);
9488 mddev->last_sync_action = action;
9489
9490 /*
9491 * Before starting a resync we must have set curr_resync to
9492 * 2, and then checked that every "conflicting" array has curr_resync
9493 * less than ours. When we find one that is the same or higher
9494 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync
9495 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
9496 * This will mean we have to start checking from the beginning again.
9497 *
9498 */
9499 if (mddev_is_clustered(mddev))
9500 mddev->cluster_ops->resync_start_notify(mddev);
9501 do {
9502 int mddev2_minor = -1;
9503 mddev->curr_resync = MD_RESYNC_DELAYED;
9504
9505 try_again:
9506 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9507 goto skip;
9508 spin_lock(&all_mddevs_lock);
9509 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) {
9510 if (test_bit(MD_DELETED, &mddev2->flags))
9511 continue;
9512 if (mddev2 == mddev)
9513 continue;
9514 if (!mddev->parallel_resync
9515 && mddev2->curr_resync
9516 && match_mddev_units(mddev, mddev2)) {
9517 DEFINE_WAIT(wq);
9518 if (mddev < mddev2 &&
9519 mddev->curr_resync == MD_RESYNC_DELAYED) {
9520 /* arbitrarily yield */
9521 mddev->curr_resync = MD_RESYNC_YIELDED;
9522 wake_up(&resync_wait);
9523 }
9524 if (mddev > mddev2 &&
9525 mddev->curr_resync == MD_RESYNC_YIELDED)
9526 /* no need to wait here, we can wait the next
9527 * time 'round when curr_resync == 2
9528 */
9529 continue;
9530 /* We need to wait 'interruptible' so as not to
9531 * contribute to the load average, and not to
9532 * be caught by 'softlockup'
9533 */
9534 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
9535 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
9536 mddev2->curr_resync >= mddev->curr_resync) {
9537 if (mddev2_minor != mddev2->md_minor) {
9538 mddev2_minor = mddev2->md_minor;
9539 pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n",
9540 desc, mdname(mddev),
9541 mdname(mddev2));
9542 }
9543 spin_unlock(&all_mddevs_lock);
9544
9545 if (signal_pending(current))
9546 flush_signals(current);
9547 schedule();
9548 finish_wait(&resync_wait, &wq);
9549 goto try_again;
9550 }
9551 finish_wait(&resync_wait, &wq);
9552 }
9553 }
9554 spin_unlock(&all_mddevs_lock);
9555 } while (mddev->curr_resync < MD_RESYNC_DELAYED);
9556
9557 max_sectors = md_sync_max_sectors(mddev, action);
9558 j = md_sync_position(mddev, action);
9559
9560 pr_info("md: %s of RAID array %s\n", desc, mdname(mddev));
9561 pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev));
9562 pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n",
9563 speed_max(mddev), desc);
9564
9565 is_mddev_idle(mddev, 1); /* this initializes IO event counters */
9566
9567 io_sectors = 0;
9568 for (m = 0; m < SYNC_MARKS; m++) {
9569 mark[m] = jiffies;
9570 mark_cnt[m] = io_sectors;
9571 }
9572 last_mark = 0;
9573 mddev->resync_mark = mark[last_mark];
9574 mddev->resync_mark_cnt = mark_cnt[last_mark];
9575
9576 /*
9577 * Tune reconstruction:
9578 */
9579 window = 32 * (PAGE_SIZE / 512);
9580 pr_debug("md: using %dk window, over a total of %lluk.\n",
9581 window/2, (unsigned long long)max_sectors/2);
9582
9583 atomic_set(&mddev->recovery_active, 0);
9584 last_check = 0;
9585
9586 if (j >= MD_RESYNC_ACTIVE) {
9587 pr_debug("md: resuming %s of %s from checkpoint.\n",
9588 desc, mdname(mddev));
9589 mddev->curr_resync = j;
9590 } else
9591 mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */
9592 mddev->curr_resync_completed = j;
9593 sysfs_notify_dirent_safe(mddev->sysfs_completed);
9594 md_new_event();
9595 update_time = jiffies;
9596
9597 blk_start_plug(&plug);
9598 while (j < max_sectors) {
9599 sector_t sectors;
9600
9601 skipped = 0;
9602
9603 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9604 ((mddev->curr_resync > mddev->curr_resync_completed &&
9605 (mddev->curr_resync - mddev->curr_resync_completed)
9606 > (max_sectors >> 4)) ||
9607 time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
9608 (j - mddev->curr_resync_completed)*2
9609 >= mddev->resync_max - mddev->curr_resync_completed ||
9610 mddev->curr_resync_completed > mddev->resync_max
9611 )) {
9612 /* time to update curr_resync_completed */
9613 wait_event(mddev->recovery_wait,
9614 atomic_read(&mddev->recovery_active) == 0);
9615 mddev->curr_resync_completed = j;
9616 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
9617 j > mddev->resync_offset)
9618 mddev->resync_offset = j;
9619 update_time = jiffies;
9620 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
9621 sysfs_notify_dirent_safe(mddev->sysfs_completed);
9622 }
9623
9624 while (j >= mddev->resync_max &&
9625 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
9626 /* As this condition is controlled by user-space,
9627 * we can block indefinitely, so use '_interruptible'
9628 * to avoid triggering warnings.
9629 */
9630 flush_signals(current); /* just in case */
9631 wait_event_interruptible(mddev->recovery_wait,
9632 mddev->resync_max > j
9633 || test_bit(MD_RECOVERY_INTR,
9634 &mddev->recovery));
9635 }
9636
9637 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9638 break;
9639
9640 if (mddev->bitmap_ops && mddev->bitmap_ops->skip_sync_blocks) {
9641 sectors = mddev->bitmap_ops->skip_sync_blocks(mddev, j);
9642 if (sectors)
9643 goto update;
9644 }
9645
9646 sectors = mddev->pers->sync_request(mddev, j, max_sectors,
9647 &skipped);
9648 if (sectors == 0) {
9649 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
9650 break;
9651 }
9652
9653 if (!skipped) { /* actual IO requested */
9654 io_sectors += sectors;
9655 atomic_add(sectors, &mddev->recovery_active);
9656 }
9657
9658 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9659 break;
9660
9661 update:
9662 j += sectors;
9663 if (j > max_sectors)
9664 /* when skipping, extra large numbers can be returned. */
9665 j = max_sectors;
9666 if (j >= MD_RESYNC_ACTIVE)
9667 mddev->curr_resync = j;
9668 mddev->curr_mark_cnt = io_sectors;
9669 if (last_check == 0)
9670 /* this is the earliest that rebuild will be
9671 * visible in /proc/mdstat
9672 */
9673 md_new_event();
9674
9675 if (last_check + window > io_sectors || j == max_sectors)
9676 continue;
9677
9678 last_check = io_sectors;
9679 repeat:
9680 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
9681 /* step marks */
9682 int next = (last_mark+1) % SYNC_MARKS;
9683
9684 mddev->resync_mark = mark[next];
9685 mddev->resync_mark_cnt = mark_cnt[next];
9686 mark[next] = jiffies;
9687 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
9688 last_mark = next;
9689 }
9690
9691 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9692 break;
9693
9694 /*
9695 * this loop exits only when either we are slower than the
9696 * 'hard' speed limit, or the system was IO-idle for
9697 * a jiffy.
9698 * the system might be non-idle CPU-wise, but we only care
9699 * about not overloading the IO subsystem. (things like an
9700 * e2fsck being done on the RAID array should execute fast)
9701 */
9702 cond_resched();
9703
9704 recovery_done = io_sectors - atomic_read(&mddev->recovery_active);
9705 currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2
9706 /((jiffies-mddev->resync_mark)/HZ +1) +1;
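/*
 * Illustrative numbers for the formula above: if 204800 sectors (100 MiB)
 * have completed since resync_mark and about 9 seconds have elapsed,
 * currspeed is 102400 / (9 + 1) + 1 = 10241 KiB/sec, which is then compared
 * against speed_min()/speed_max() below; the "+1" terms only guard against
 * a zero divisor and a zero speed.
 */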
9707
9708 if (currspeed > speed_min(mddev)) {
9709 if (currspeed > speed_max(mddev)) {
9710 msleep(500);
9711 goto repeat;
9712 }
9713 if (!sync_io_within_limit(mddev) &&
9714 !is_mddev_idle(mddev, 0)) {
9715 /*
9716 * Give other IO more of a chance.
9717 * The faster the devices, the less we wait.
9718 */
9719 wait_event(mddev->recovery_wait,
9720 !atomic_read(&mddev->recovery_active));
9721 }
9722 }
9723 }
9724 pr_info("md: %s: %s %s.\n",mdname(mddev), desc,
9725 test_bit(MD_RECOVERY_INTR, &mddev->recovery)
9726 ? "interrupted" : "done");
9727 /*
9728 * this also signals 'finished resyncing' to md_stop
9729 */
9730 blk_finish_plug(&plug);
9731 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
9732
9733 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9734 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
9735 mddev->curr_resync >= MD_RESYNC_ACTIVE) {
9736 mddev->curr_resync_completed = mddev->curr_resync;
9737 sysfs_notify_dirent_safe(mddev->sysfs_completed);
9738 }
9739 mddev->pers->sync_request(mddev, max_sectors, max_sectors, &skipped);
9740
9741 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
9742 mddev->curr_resync > MD_RESYNC_ACTIVE) {
9743 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
9744 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
9745 if (mddev->curr_resync >= mddev->resync_offset) {
9746 pr_debug("md: checkpointing %s of %s.\n",
9747 desc, mdname(mddev));
9748 if (test_bit(MD_RECOVERY_ERROR,
9749 &mddev->recovery))
9750 mddev->resync_offset =
9751 mddev->curr_resync_completed;
9752 else
9753 mddev->resync_offset =
9754 mddev->curr_resync;
9755 }
9756 } else
9757 mddev->resync_offset = MaxSector;
9758 } else {
9759 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9760 mddev->curr_resync = MaxSector;
9761 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9762 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
9763 rcu_read_lock();
9764 rdev_for_each_rcu(rdev, mddev)
9765 if (mddev->delta_disks >= 0 &&
9766 rdev_needs_recovery(rdev, mddev->curr_resync))
9767 rdev->recovery_offset = mddev->curr_resync;
9768 rcu_read_unlock();
9769 }
9770 }
9771 }
9772 skip:
9773 /* set CHANGE_PENDING here since maybe another update is needed,
9774 * so other nodes are informed. It should be harmless for normal
9775 * raid */
9776 set_mask_bits(&mddev->sb_flags, 0,
9777 BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));
9778
9779 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9780 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
9781 mddev->delta_disks > 0 &&
9782 mddev->pers->finish_reshape &&
9783 mddev->pers->size &&
9784 !mddev_is_dm(mddev)) {
9785 mddev_lock_nointr(mddev);
9786 md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
9787 mddev_unlock(mddev);
9788 if (!mddev_is_clustered(mddev))
9789 set_capacity_and_notify(mddev->gendisk,
9790 mddev->array_sectors);
9791 }
9792
9793 spin_lock(&mddev->lock);
9794 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
9795 /* We completed so min/max setting can be forgotten if used. */
9796 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
9797 mddev->resync_min = 0;
9798 mddev->resync_max = MaxSector;
9799 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
9800 mddev->resync_min = mddev->curr_resync_completed;
9801 set_bit(MD_RECOVERY_DONE, &mddev->recovery);
9802 mddev->curr_resync = MD_RESYNC_NONE;
9803 spin_unlock(&mddev->lock);
9804
9805 wake_up(&resync_wait);
9806 md_wakeup_thread(mddev->thread);
9807 return;
9808 }
9809 EXPORT_SYMBOL_GPL(md_do_sync);
9810
9811 static bool rdev_removeable(struct md_rdev *rdev)
9812 {
9813 /* rdev is not used. */
9814 if (rdev->raid_disk < 0)
9815 return false;
9816
9817 /* There are still inflight io, don't remove this rdev. */
9818 if (atomic_read(&rdev->nr_pending))
9819 return false;
9820
9821 /*
9822 * An error occurred but has not yet been acknowledged by the metadata
9823 * handler, don't remove this rdev.
9824 */
9825 if (test_bit(Blocked, &rdev->flags))
9826 return false;
9827
9828 /* Faulty rdev is not used, it's safe to remove it. */
9829 if (test_bit(Faulty, &rdev->flags))
9830 return true;
9831
9832 /* Journal disk can only be removed if it's faulty. */
9833 if (test_bit(Journal, &rdev->flags))
9834 return false;
9835
9836 /*
9837 * 'In_sync' is cleared while 'raid_disk' is valid, which means
9838 * replacement has just become active from pers->spare_active(), and
9839 * then pers->hot_remove_disk() will replace this rdev with replacement.
9840 */
9841 if (!test_bit(In_sync, &rdev->flags))
9842 return true;
9843
9844 return false;
9845 }
9846
9847 static bool rdev_is_spare(struct md_rdev *rdev)
9848 {
9849 return !test_bit(Candidate, &rdev->flags) && rdev->raid_disk >= 0 &&
9850 !test_bit(In_sync, &rdev->flags) &&
9851 !test_bit(Journal, &rdev->flags) &&
9852 !test_bit(Faulty, &rdev->flags);
9853 }
9854
9855 static bool rdev_addable(struct md_rdev *rdev)
9856 {
9857 struct mddev *mddev;
9858
9859 mddev = READ_ONCE(rdev->mddev);
9860 if (!mddev)
9861 return false;
9862
9863 /* rdev is already used, don't add it again. */
9864 if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 ||
9865 test_bit(Faulty, &rdev->flags))
9866 return false;
9867
9868 /* Allow to add journal disk. */
9869 if (test_bit(Journal, &rdev->flags))
9870 return true;
9871
9872 /* Allow to add if array is read-write. */
9873 if (md_is_rdwr(mddev))
9874 return true;
9875
9876 /*
9877 * For a read-only array, only allow re-adding an rdev. And if a bitmap
9878 * is used, don't allow re-adding an rdev that is too old.
9879 */
9880 if (rdev->saved_raid_disk >= 0 && !test_bit(Bitmap_sync, &rdev->flags))
9881 return true;
9882
9883 return false;
9884 }
9885
9886 static bool md_spares_need_change(struct mddev *mddev)
9887 {
9888 struct md_rdev *rdev;
9889
9890 rcu_read_lock();
9891 rdev_for_each_rcu(rdev, mddev) {
9892 if (rdev_removeable(rdev) || rdev_addable(rdev)) {
9893 rcu_read_unlock();
9894 return true;
9895 }
9896 }
9897 rcu_read_unlock();
9898 return false;
9899 }
9900
9901 static int remove_spares(struct mddev *mddev, struct md_rdev *this)
9902 {
9903 struct md_rdev *rdev;
9904 int removed = 0;
9905
9906 rdev_for_each(rdev, mddev) {
9907 if ((this == NULL || rdev == this) && rdev_removeable(rdev) &&
9908 !mddev->pers->hot_remove_disk(mddev, rdev)) {
9909 sysfs_unlink_rdev(mddev, rdev);
9910 rdev->saved_raid_disk = rdev->raid_disk;
9911 rdev->raid_disk = -1;
9912 removed++;
9913 }
9914 }
9915
9916 if (removed && mddev->kobj.sd)
9917 sysfs_notify_dirent_safe(mddev->sysfs_degraded);
9918
9919 return removed;
9920 }
9921
9922 static int remove_and_add_spares(struct mddev *mddev,
9923 struct md_rdev *this)
9924 {
9925 struct md_rdev *rdev;
9926 int spares = 0;
9927 int removed = 0;
9928
9929 if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
9930 /* Mustn't remove devices when resync thread is running */
9931 return 0;
9932
9933 removed = remove_spares(mddev, this);
9934 if (this && removed)
9935 goto no_add;
9936
9937 rdev_for_each(rdev, mddev) {
9938 if (this && this != rdev)
9939 continue;
9940 if (rdev_is_spare(rdev))
9941 spares++;
9942 if (!rdev_addable(rdev))
9943 continue;
9944 if (!test_bit(Journal, &rdev->flags))
9945 rdev->recovery_offset = 0;
9946 if (mddev->pers->hot_add_disk(mddev, rdev) == 0) {
9947 /* failure here is OK */
9948 sysfs_link_rdev(mddev, rdev);
9949 if (!test_bit(Journal, &rdev->flags))
9950 spares++;
9951 md_new_event();
9952 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
9953 }
9954 }
9955 no_add:
9956 if (removed)
9957 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
9958 return spares;
9959 }
9960
9961 static bool md_choose_sync_action(struct mddev *mddev, int *spares)
9962 {
9963 /* Check if reshape is in progress first. */
9964 if (mddev->reshape_position != MaxSector) {
9965 if (mddev->pers->check_reshape == NULL ||
9966 mddev->pers->check_reshape(mddev) != 0)
9967 return false;
9968
9969 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
9970 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9971 clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery);
9972 return true;
9973 }
9974
9975 /* Check if resync is in progress. */
9976 if (mddev->resync_offset < MaxSector) {
9977 remove_spares(mddev, NULL);
9978 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9979 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9980 clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery);
9981 return true;
9982 }
9983
9984 /*
9985 * Remove any failed drives, then add spares if possible. Spares are
9986 * also removed and re-added, to allow the personality to fail the
9987 * re-add.
9988 */
9989 *spares = remove_and_add_spares(mddev, NULL);
9990 if (*spares || test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery)) {
9991 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9992 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
9993 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
9994
9995 /* Start new recovery. */
9996 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9997 return true;
9998 }
9999
10000 /* Delay to choose resync/check/repair in md_do_sync(). */
10001 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
10002 return true;
10003
10004 /* Nothing to be done */
10005 return false;
10006 }
10007
10008 static void md_start_sync(struct work_struct *ws)
10009 {
10010 struct mddev *mddev = container_of(ws, struct mddev, sync_work);
10011 int spares = 0;
10012 bool suspend = false;
10013 char *name;
10014
10015 /*
10016 * If reshape is still in progress, spares won't be added or removed
10017 * from conf until reshape is done.
10018 */
10019 if (mddev->reshape_position == MaxSector &&
10020 md_spares_need_change(mddev)) {
10021 suspend = true;
10022 mddev_suspend(mddev, false);
10023 }
10024
10025 mddev_lock_nointr(mddev);
10026 if (!md_is_rdwr(mddev)) {
10027 /*
10028 * On a read-only array we can:
10029 * - remove failed devices
10030 * - add already-in_sync devices if the array itself is in-sync.
10031 * As we only add devices that are already in-sync, we can
10032 * activate the spares immediately.
10033 */
10034 remove_and_add_spares(mddev, NULL);
10035 goto not_running;
10036 }
10037
10038 if (!md_choose_sync_action(mddev, &spares))
10039 goto not_running;
10040
10041 if (!mddev->pers->sync_request)
10042 goto not_running;
10043
10044 /*
10045 * We are adding a device or devices to an array which has the bitmap
10046 * stored on all devices. So make sure all bitmap pages get written.
10047 */
10048 if (spares && md_bitmap_enabled(mddev, true))
10049 mddev->bitmap_ops->write_all(mddev);
10050
10051 name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ?
10052 "reshape" : "resync";
10053 rcu_assign_pointer(mddev->sync_thread,
10054 md_register_thread(md_do_sync, mddev, name));
10055 if (!mddev->sync_thread) {
10056 pr_warn("%s: could not start resync thread...\n",
10057 mdname(mddev));
10058 /* leave the spares where they are, it shouldn't hurt */
10059 goto not_running;
10060 }
10061
10062 mddev_unlock(mddev);
10063 /*
10064 * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should
10065 * not set it again. Otherwise, we may cause issue like this one:
10066 * https://bugzilla.kernel.org/show_bug.cgi?id=218200
10067 * Therefore, use __mddev_resume(mddev, false).
10068 */
10069 if (suspend)
10070 __mddev_resume(mddev, false);
10071 md_wakeup_thread(mddev->sync_thread);
10072 sysfs_notify_dirent_safe(mddev->sysfs_action);
10073 md_new_event();
10074 return;
10075
10076 not_running:
10077 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
10078 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
10079 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
10080 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
10081 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
10082 mddev_unlock(mddev);
10083 /*
10084 * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should
10085 * not set it again. Otherwise, we may cause issue like this one:
10086 * https://bugzilla.kernel.org/show_bug.cgi?id=218200
10087 * Therefore, use __mddev_resume(mddev, false).
10088 */
10089 if (suspend)
10090 __mddev_resume(mddev, false);
10091
10092 wake_up(&resync_wait);
10093 if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
10094 mddev->sysfs_action)
10095 sysfs_notify_dirent_safe(mddev->sysfs_action);
10096 }
10097
10098 static void unregister_sync_thread(struct mddev *mddev)
10099 {
10100 if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
10101 /* resync/recovery still happening */
10102 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
10103 return;
10104 }
10105
10106 if (WARN_ON_ONCE(!mddev->sync_thread))
10107 return;
10108
10109 md_reap_sync_thread(mddev);
10110 }
10111
10112 static bool md_should_do_recovery(struct mddev *mddev)
10113 {
10114 /*
10115 * As long as one of the following flags is set,
10116 * recovery work needs to be done or cleaned up.
10117 */
10118 if (test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
10119 test_bit(MD_RECOVERY_DONE, &mddev->recovery))
10120 return true;
10121
10122 /*
10123 * If no flags are set and the array is read-only,
10124 * there is nothing to do.
10125 */
10126 if (!md_is_rdwr(mddev))
10127 return false;
10128
10129 /*
10130 * MD_SB_CHANGE_PENDING indicates that the array is switching from clean to
10131 * active, and no action is needed for now.
10132 * All other MD_SB_* flags require the superblock to be updated.
10133 */
10134 if (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING))
10135 return true;
10136
10137 /*
10138 * If the array is not using external metadata and there has been no data
10139 * written for some time, then the array's status needs to be set to
10140 * in_sync.
10141 */
10142 if (mddev->external == 0 && mddev->safemode == 1)
10143 return true;
10144
10145 /*
10146 * When the system is about to restart or the process receives a signal,
10147 * the array needs to be synchronized as soon as possible.
10148 * Once the data synchronization is completed, the array status needs to
10149 * be changed to in_sync.
10150 */
10151 if (mddev->safemode == 2 && !mddev->in_sync &&
10152 mddev->resync_offset == MaxSector)
10153 return true;
10154
10155 return false;
10156 }
10157
10158 /*
10159 * This routine is regularly called by all per-raid-array threads to
10160 * deal with generic issues like resync and super-block update.
10161 * Raid personalities that don't have a thread (linear/raid0) do not
10162 * need this as they never do any recovery or update the superblock.
10163 *
10164 * It does not do any resync itself, but rather "forks" off other threads
10165 * to do that as needed.
10166 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
10167 * "->recovery" and create a thread at ->sync_thread.
10168 * When the thread finishes it sets MD_RECOVERY_DONE
10169 * and wakes up this thread, which will reap the sync thread and finish up.
10170 * This thread also removes any faulty devices (with nr_pending == 0).
10171 *
10172 * The overall approach is:
10173 * 1/ if the superblock needs updating, update it.
10174 * 2/ If a recovery thread is running, don't do anything else.
10175 * 3/ If recovery has finished, clean up, possibly marking spares active.
10176 * 4/ If there are any faulty devices, remove them.
10177 * 5/ If array is degraded, try to add spare devices
10178 * 6/ If array has spares or is not in-sync, start a resync thread.
10179 */
10180 void md_check_recovery(struct mddev *mddev)
10181 {
10182 if (md_bitmap_enabled(mddev, false) && mddev->bitmap_ops->daemon_work)
10183 mddev->bitmap_ops->daemon_work(mddev);
10184
10185 if (signal_pending(current)) {
10186 if (mddev->pers->sync_request && !mddev->external) {
10187 pr_debug("md: %s in immediate safe mode\n",
10188 mdname(mddev));
10189 mddev->safemode = 2;
10190 }
10191 flush_signals(current);
10192 }
10193
10194 if (!md_should_do_recovery(mddev))
10195 return;
10196
10197 if (mddev_trylock(mddev)) {
10198 bool try_set_sync = mddev->safemode != 0;
10199
10200 if (!mddev->external && mddev->safemode == 1)
10201 mddev->safemode = 0;
10202
10203 if (!md_is_rdwr(mddev)) {
10204 struct md_rdev *rdev;
10205
10206 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
10207 unregister_sync_thread(mddev);
10208 goto unlock;
10209 }
10210
10211 if (!mddev->external && mddev->in_sync)
10212 /*
10213 * 'Blocked' flag not needed as failed devices
10214 * will be recorded if array switched to read/write.
10215 * Leaving it set will prevent the device
10216 * from being removed.
10217 */
10218 rdev_for_each(rdev, mddev)
10219 clear_bit(Blocked, &rdev->flags);
10220
10221 /*
10222 * There is no thread, but we need to call
10223 * ->spare_active and clear saved_raid_disk
10224 */
10225 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
10226 md_reap_sync_thread(mddev);
10227
10228 /*
10229 * Let md_start_sync() to remove and add rdevs to the
10230 * array.
10231 */
10232 if (md_spares_need_change(mddev)) {
10233 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
10234 queue_work(md_misc_wq, &mddev->sync_work);
10235 }
10236
10237 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
10238 clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery);
10239 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
10240 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
10241
10242 goto unlock;
10243 }
10244
10245 if (mddev_is_clustered(mddev)) {
10246 struct md_rdev *rdev, *tmp;
10247 /* kick the device if another node issued a
10248 * remove disk.
10249 */
10250 rdev_for_each_safe(rdev, tmp, mddev) {
10251 if (rdev->raid_disk < 0 &&
10252 test_and_clear_bit(ClusterRemove, &rdev->flags))
10253 md_kick_rdev_from_array(rdev);
10254 }
10255 }
10256
10257 if (try_set_sync && !mddev->external && !mddev->in_sync) {
10258 spin_lock(&mddev->lock);
10259 set_in_sync(mddev);
10260 spin_unlock(&mddev->lock);
10261 }
10262
10263 if (mddev->sb_flags)
10264 md_update_sb(mddev, 0);
10265
10266 /*
10267 * Never start a new sync thread if MD_RECOVERY_RUNNING is
10268 * still set.
10269 */
10270 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
10271 unregister_sync_thread(mddev);
10272 goto unlock;
10273 }
10274
10275 /* Set RUNNING before clearing NEEDED to avoid
10276 * any transients in the value of "sync_action".
10277 */
10278 mddev->curr_resync_completed = 0;
10279 spin_lock(&mddev->lock);
10280 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
10281 spin_unlock(&mddev->lock);
10282 /* Clear some bits that don't mean anything, but
10283 * might be left set
10284 */
10285 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
10286 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
10287
10288 if (test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) &&
10289 !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
10290 queue_work(md_misc_wq, &mddev->sync_work);
10291 } else {
10292 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
10293 wake_up(&resync_wait);
10294 }
10295
10296 unlock:
10297 wake_up(&mddev->sb_wait);
10298 mddev_unlock(mddev);
10299 }
10300 }
10301 EXPORT_SYMBOL(md_check_recovery);
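
/*
 * Usage sketch (illustrative): every personality daemon is expected to call
 * md_check_recovery() on each wakeup before handling its own work, so that
 * superblock updates and sync-thread management happen even when the
 * personality has nothing queued.  The daemon below is hypothetical and
 * heavily trimmed.
 *
 *	static void exampled(struct md_thread *thread)
 *	{
 *		struct mddev *mddev = thread->mddev;
 *
 *		md_check_recovery(mddev);
 *		// handle the personality's own retry/pending lists here
 *	}
 */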
10302
10303 void md_reap_sync_thread(struct mddev *mddev)
10304 {
10305 struct md_rdev *rdev;
10306 sector_t old_dev_sectors = mddev->dev_sectors;
10307 bool is_reshaped = false;
10308
10309 /* resync has finished, collect result */
10310 md_unregister_thread(mddev, &mddev->sync_thread);
10311 atomic_inc(&mddev->sync_seq);
10312
10313 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
10314 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
10315 mddev->degraded != mddev->raid_disks) {
10316 /* success...*/
10317 /* activate any spares */
10318 if (mddev->pers->spare_active(mddev)) {
10319 sysfs_notify_dirent_safe(mddev->sysfs_degraded);
10320 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
10321 }
10322 }
10323 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
10324 mddev->pers->finish_reshape) {
10325 mddev->pers->finish_reshape(mddev);
10326 if (mddev_is_clustered(mddev))
10327 is_reshaped = true;
10328 }
10329
10330 /* If array is no-longer degraded, then any saved_raid_disk
10331 * information must be scrapped.
10332 */
10333 if (!mddev->degraded)
10334 rdev_for_each(rdev, mddev)
10335 rdev->saved_raid_disk = -1;
10336
10337 md_update_sb(mddev, 1);
10338 /* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can
10339 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by
10340 * clustered raid */
10341 if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
10342 mddev->cluster_ops->resync_finish(mddev);
10343 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
10344 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
10345 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
10346 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
10347 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
10348 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
10349 clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery);
10350 /*
10351 * We call mddev->cluster_ops->update_size here because sync_size could
10352 * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared,
10353 * so it is time to update the size across the cluster.
10354 */
10355 if (mddev_is_clustered(mddev) && is_reshaped
10356 && !test_bit(MD_CLOSING, &mddev->flags))
10357 mddev->cluster_ops->update_size(mddev, old_dev_sectors);
10358 /* flag recovery needed just to double check */
10359 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
10360 sysfs_notify_dirent_safe(mddev->sysfs_completed);
10361 sysfs_notify_dirent_safe(mddev->sysfs_action);
10362 md_new_event();
10363 if (mddev->event_work.func)
10364 queue_work(md_misc_wq, &mddev->event_work);
10365 wake_up(&resync_wait);
10366 }
10367 EXPORT_SYMBOL(md_reap_sync_thread);
10368
10369 void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
10370 {
10371 sysfs_notify_dirent_safe(rdev->sysfs_state);
10372 wait_event_timeout(rdev->blocked_wait, !rdev_blocked(rdev),
10373 msecs_to_jiffies(5000));
10374 rdev_dec_pending(rdev, mddev);
10375 }
10376 EXPORT_SYMBOL(md_wait_for_blocked_rdev);
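
/*
 * Usage sketch (illustrative): a personality that finds a Blocked rdev while
 * preparing a write takes a nr_pending reference, backs out, and waits for
 * the metadata handler to acknowledge the failure before retrying.  Note
 * that md_wait_for_blocked_rdev() drops the nr_pending reference itself.
 *
 *	if (rdev_blocked(rdev)) {
 *		atomic_inc(&rdev->nr_pending);
 *		md_wait_for_blocked_rdev(rdev, mddev);
 *		// re-evaluate and retry the request afterwards
 *	}
 */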
10377
10378 void md_finish_reshape(struct mddev *mddev)
10379 {
10380 /* called by the personality module when reshape completes. */
10381 struct md_rdev *rdev;
10382
10383 rdev_for_each(rdev, mddev) {
10384 if (rdev->data_offset > rdev->new_data_offset)
10385 rdev->sectors += rdev->data_offset - rdev->new_data_offset;
10386 else
10387 rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
10388 rdev->data_offset = rdev->new_data_offset;
10389 }
10390 }
10391 EXPORT_SYMBOL(md_finish_reshape);
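
/*
 * Worked example for the adjustment above (illustrative numbers): if the
 * reshape moved data_offset from 2048 to 4096 sectors, new_data_offset is
 * larger, so rdev->sectors shrinks by 2048 sectors (1 MiB); a reshape that
 * moves the data start the other way grows rdev->sectors by the difference.
 */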
10392
10393 /* Bad block management */
10394
10395 /* Returns true on success, false on failure */
10396 bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
10397 int is_new)
10398 {
10399 struct mddev *mddev = rdev->mddev;
10400
10401 /*
10402 * Recording new badblocks for a faulty rdev will force an unnecessary
10403 * superblock update. This is fragile for external management because a
10404 * userspace daemon may be trying to remove this device and a deadlock may
10405 * occur. This will probably be solved in mdadm, but it is safer to
10406 * avoid it here.
10407 */
10408 if (test_bit(Faulty, &rdev->flags))
10409 return true;
10410
10411 if (is_new)
10412 s += rdev->new_data_offset;
10413 else
10414 s += rdev->data_offset;
10415
10416 if (!badblocks_set(&rdev->badblocks, s, sectors, 0))
10417 return false;
10418
10419 /* Make sure they get written out promptly */
10420 if (test_bit(ExternalBbl, &rdev->flags))
10421 sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks);
10422 sysfs_notify_dirent_safe(rdev->sysfs_state);
10423 set_mask_bits(&mddev->sb_flags, 0,
10424 BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
10425 md_wakeup_thread(rdev->mddev->thread);
10426 return true;
10427 }
10428 EXPORT_SYMBOL_GPL(rdev_set_badblocks);
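
/*
 * Usage sketch (illustrative): on a failed write a personality typically
 * tries to record just the affected region as bad so the rest of the device
 * stays usable, and only fails the whole device if that is not possible
 * (e.g. the bad-block table is full).
 *
 *	if (!rdev_set_badblocks(rdev, sector, nr_sectors, 0))
 *		md_error(mddev, rdev);
 */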
10429
10430 void rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
10431 int is_new)
10432 {
10433 if (is_new)
10434 s += rdev->new_data_offset;
10435 else
10436 s += rdev->data_offset;
10437
10438 if (!badblocks_clear(&rdev->badblocks, s, sectors))
10439 return;
10440
10441 if (test_bit(ExternalBbl, &rdev->flags))
10442 sysfs_notify_dirent_safe(rdev->sysfs_badblocks);
10443 }
10444 EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
10445
10446 static int md_notify_reboot(struct notifier_block *this,
10447 unsigned long code, void *x)
10448 {
10449 struct mddev *mddev;
10450
10451 spin_lock(&all_mddevs_lock);
10452 list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
10453 if (!mddev_get(mddev))
10454 continue;
10455 spin_unlock(&all_mddevs_lock);
10456 if (mddev_trylock(mddev)) {
10457 if (mddev->pers)
10458 __md_stop_writes(mddev);
10459 if (mddev->persistent)
10460 mddev->safemode = 2;
10461 mddev_unlock(mddev);
10462 }
10463 spin_lock(&all_mddevs_lock);
10464 mddev_put_locked(mddev);
10465 }
10466 spin_unlock(&all_mddevs_lock);
10467
10468 return NOTIFY_DONE;
10469 }
10470
10471 static struct notifier_block md_notifier = {
10472 .notifier_call = md_notify_reboot,
10473 .next = NULL,
10474 .priority = INT_MAX, /* before any real devices */
10475 };
10476
10477 static void md_geninit(void)
10478 {
10479 pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
10480
10481 proc_create("mdstat", S_IRUGO, NULL, &mdstat_proc_ops);
10482 }
10483
10484 static int __init md_init(void)
10485 {
10486 int ret = md_bitmap_init();
10487
10488 if (ret)
10489 return ret;
10490
10491 ret = md_llbitmap_init();
10492 if (ret)
10493 goto err_bitmap;
10494
10495 ret = -ENOMEM;
10496 md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
10497 if (!md_wq)
10498 goto err_wq;
10499
10500 md_misc_wq = alloc_workqueue("md_misc", 0, 0);
10501 if (!md_misc_wq)
10502 goto err_misc_wq;
10503
10504 ret = __register_blkdev(MD_MAJOR, "md", md_probe);
10505 if (ret < 0)
10506 goto err_md;
10507
10508 ret = __register_blkdev(0, "mdp", md_probe);
10509 if (ret < 0)
10510 goto err_mdp;
10511 mdp_major = ret;
10512
10513 register_reboot_notifier(&md_notifier);
10514 raid_table_header = register_sysctl("dev/raid", raid_table);
10515
10516 md_geninit();
10517 return 0;
10518
10519 err_mdp:
10520 unregister_blkdev(MD_MAJOR, "md");
10521 err_md:
10522 destroy_workqueue(md_misc_wq);
10523 err_misc_wq:
10524 destroy_workqueue(md_wq);
10525 err_wq:
10526 md_llbitmap_exit();
10527 err_bitmap:
10528 md_bitmap_exit();
10529 return ret;
10530 }
10531
10532 static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
10533 {
10534 struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
10535 struct md_rdev *rdev2, *tmp;
10536 int role, ret;
10537
10538 /*
10539 * If size is changed in another node then we need to
10540 * do resize as well.
10541 */
10542 if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
10543 ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
10544 if (ret)
10545 pr_info("md-cluster: resize failed\n");
10546 else if (md_bitmap_enabled(mddev, false))
10547 mddev->bitmap_ops->update_sb(mddev->bitmap);
10548 }
10549
10550 /* Check for change of roles in the active devices */
10551 rdev_for_each_safe(rdev2, tmp, mddev) {
10552 if (test_bit(Faulty, &rdev2->flags)) {
10553 if (test_bit(ClusterRemove, &rdev2->flags))
10554 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
10555 continue;
10556 }
10557
10558 /* Check if the roles changed */
10559 role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);
10560
10561 if (test_bit(Candidate, &rdev2->flags)) {
10562 if (role == MD_DISK_ROLE_FAULTY) {
10563 pr_info("md: Removing Candidate device %pg because add failed\n",
10564 rdev2->bdev);
10565 md_kick_rdev_from_array(rdev2);
10566 continue;
10567 }
10568 else
10569 clear_bit(Candidate, &rdev2->flags);
10570 }
10571
10572 if (role != rdev2->raid_disk) {
10573 /*
10574 * the device got activated, unless a reshape is happening.
10575 */
10576 if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE &&
10577 !(le32_to_cpu(sb->feature_map) &
10578 MD_FEATURE_RESHAPE_ACTIVE) &&
10579 !mddev->cluster_ops->resync_status_get(mddev)) {
10580 /*
10581 * -1 to make raid1_add_disk() set conf->fullsync
10582 * to 1. This could avoid skipping sync when the
10583 * remote node is down during resyncing.
10584 */
10585 if ((le32_to_cpu(sb->feature_map)
10586 & MD_FEATURE_RECOVERY_OFFSET))
10587 rdev2->saved_raid_disk = -1;
10588 else
10589 rdev2->saved_raid_disk = role;
10590 ret = remove_and_add_spares(mddev, rdev2);
10591 pr_info("Activated spare: %pg\n",
10592 rdev2->bdev);
10593 /* wake up mddev->thread here, so the array can
10594 * perform resync with the newly activated disk */
10595 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
10596 md_wakeup_thread(mddev->thread);
10597 }
10598 /* device faulty
10599 * We just want to do the minimum to mark the disk
10600 * as faulty. The recovery is performed by the
10601 * one who initiated the error.
10602 */
10603 if (role == MD_DISK_ROLE_FAULTY ||
10604 role == MD_DISK_ROLE_JOURNAL) {
10605 md_error(mddev, rdev2);
10606 clear_bit(Blocked, &rdev2->flags);
10607 }
10608 }
10609 }
10610
10611 if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) {
10612 ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
10613 if (ret)
10614 pr_warn("md: updating array disks failed. %d\n", ret);
10615 }
10616
10617 /*
10618 * Since mddev->delta_disks has already been updated in update_raid_disks,
10619 * it is time to check reshape.
10620 */
10621 if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
10622 (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
10623 /*
10624 * reshape is happening in the remote node, we need to
10625 * update reshape_position and call start_reshape.
10626 */
10627 mddev->reshape_position = le64_to_cpu(sb->reshape_position);
10628 if (mddev->pers->update_reshape_pos)
10629 mddev->pers->update_reshape_pos(mddev);
10630 if (mddev->pers->start_reshape)
10631 mddev->pers->start_reshape(mddev);
10632 } else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
10633 mddev->reshape_position != MaxSector &&
10634 !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
10635 /* reshape is just done in another node. */
10636 mddev->reshape_position = MaxSector;
10637 if (mddev->pers->update_reshape_pos)
10638 mddev->pers->update_reshape_pos(mddev);
10639 }
10640
10641 /* Finally set the event to be up to date */
10642 mddev->events = le64_to_cpu(sb->events);
10643 }
10644
10645 static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
10646 {
10647 int err;
10648 struct page *swapout = rdev->sb_page;
10649 struct mdp_superblock_1 *sb;
10650
10651 /* Store the sb page of the rdev in the swapout temporary
10652 * variable in case we err in the future
10653 */
10654 rdev->sb_page = NULL;
10655 err = alloc_disk_sb(rdev);
10656 if (err == 0) {
10657 ClearPageUptodate(rdev->sb_page);
10658 rdev->sb_loaded = 0;
10659 err = super_types[mddev->major_version].
10660 load_super(rdev, NULL, mddev->minor_version);
10661 }
10662 if (err < 0) {
10663 pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
10664 __func__, __LINE__, rdev->desc_nr, err);
10665 if (rdev->sb_page)
10666 put_page(rdev->sb_page);
10667 rdev->sb_page = swapout;
10668 rdev->sb_loaded = 1;
10669 return err;
10670 }
10671
10672 sb = page_address(rdev->sb_page);
10673 /* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET
10674 * is not set
10675 */
10676
10677 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
10678 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
10679
10680 /* The other node finished recovery, call spare_active to set
10681 * device In_sync and mddev->degraded
10682 */
10683 if (rdev->recovery_offset == MaxSector &&
10684 !test_bit(In_sync, &rdev->flags) &&
10685 mddev->pers->spare_active(mddev))
10686 sysfs_notify_dirent_safe(mddev->sysfs_degraded);
10687
10688 put_page(swapout);
10689 return 0;
10690 }
10691
10692 void md_reload_sb(struct mddev *mddev, int nr)
10693 {
10694 struct md_rdev *rdev = NULL, *iter;
10695 int err;
10696
10697 /* Find the rdev */
10698 rdev_for_each_rcu(iter, mddev) {
10699 if (iter->desc_nr == nr) {
10700 rdev = iter;
10701 break;
10702 }
10703 }
10704
10705 if (!rdev) {
10706 pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
10707 return;
10708 }
10709
10710 err = read_rdev(mddev, rdev);
10711 if (err < 0)
10712 return;
10713
10714 check_sb_changes(mddev, rdev);
10715
10716 /* Read all rdev's to update recovery_offset */
10717 rdev_for_each_rcu(rdev, mddev) {
10718 if (!test_bit(Faulty, &rdev->flags))
10719 read_rdev(mddev, rdev);
10720 }
10721 }
10722 EXPORT_SYMBOL(md_reload_sb);
10723
10724 #ifndef MODULE
10725
10726 /*
10727 * Searches all registered partitions for autorun RAID arrays
10728 * at boot time.
10729 */
10730
10731 static DEFINE_MUTEX(detected_devices_mutex);
10732 static LIST_HEAD(all_detected_devices);
10733 struct detected_devices_node {
10734 struct list_head list;
10735 dev_t dev;
10736 };
10737
10738 void md_autodetect_dev(dev_t dev)
10739 {
10740 struct detected_devices_node *node_detected_dev;
10741
10742 node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
10743 if (node_detected_dev) {
10744 node_detected_dev->dev = dev;
10745 mutex_lock(&detected_devices_mutex);
10746 list_add_tail(&node_detected_dev->list, &all_detected_devices);
10747 mutex_unlock(&detected_devices_mutex);
10748 }
10749 }
10750
10751 void md_autostart_arrays(int part)
10752 {
10753 struct md_rdev *rdev;
10754 struct detected_devices_node *node_detected_dev;
10755 dev_t dev;
10756 int i_scanned, i_passed;
10757
10758 i_scanned = 0;
10759 i_passed = 0;
10760
10761 pr_info("md: Autodetecting RAID arrays.\n");
10762
10763 mutex_lock(&detected_devices_mutex);
10764 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
10765 i_scanned++;
10766 node_detected_dev = list_entry(all_detected_devices.next,
10767 struct detected_devices_node, list);
10768 list_del(&node_detected_dev->list);
10769 dev = node_detected_dev->dev;
10770 kfree(node_detected_dev);
10771 mutex_unlock(&detected_devices_mutex);
10772 rdev = md_import_device(dev, 0, 90);
10773 mutex_lock(&detected_devices_mutex);
10774 if (IS_ERR(rdev))
10775 continue;
10776
10777 if (test_bit(Faulty, &rdev->flags))
10778 continue;
10779
10780 set_bit(AutoDetected, &rdev->flags);
10781 list_add(&rdev->same_set, &pending_raid_disks);
10782 i_passed++;
10783 }
10784 mutex_unlock(&detected_devices_mutex);
10785
10786 pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed);
10787
10788 autorun_devices(part);
10789 }
10790
10791 #endif /* !MODULE */
10792
10793 static __exit void md_exit(void)
10794 {
10795 struct mddev *mddev;
10796 int delay = 1;
10797
10798 unregister_blkdev(MD_MAJOR,"md");
10799 unregister_blkdev(mdp_major, "mdp");
10800 unregister_reboot_notifier(&md_notifier);
10801 unregister_sysctl_table(raid_table_header);
10802
10803 /* We cannot unload the modules while some process is
10804 * waiting for us in select() or poll() - wake them up
10805 */
10806 md_unloading = 1;
10807 while (waitqueue_active(&md_event_waiters)) {
10808 /* not safe to leave yet */
10809 wake_up(&md_event_waiters);
10810 msleep(delay);
10811 delay += delay;
10812 }
10813 remove_proc_entry("mdstat", NULL);
10814
10815 spin_lock(&all_mddevs_lock);
10816 list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
10817 if (!mddev_get(mddev))
10818 continue;
10819 spin_unlock(&all_mddevs_lock);
10820 export_array(mddev);
10821 mddev->ctime = 0;
10822 mddev->hold_active = 0;
10823 /*
10824 * As the mddev is now fully clear, mddev_put will schedule
10825 * the mddev for destruction by a workqueue, and the
10826 * destroy_workqueue() below will wait for that to complete.
10827 */
10828 spin_lock(&all_mddevs_lock);
10829 mddev_put_locked(mddev);
10830 }
10831 spin_unlock(&all_mddevs_lock);
10832
10833 destroy_workqueue(md_misc_wq);
10834 destroy_workqueue(md_wq);
10835 md_bitmap_exit();
10836 }
10837
10838 subsys_initcall(md_init);
10839 module_exit(md_exit)
10840
10841 static int get_ro(char *buffer, const struct kernel_param *kp)
10842 {
10843 return sprintf(buffer, "%d\n", start_readonly);
10844 }
10845 static int set_ro(const char *val, const struct kernel_param *kp)
10846 {
10847 return kstrtouint(val, 10, (unsigned int *)&start_readonly);
10848 }
10849
10850 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
10851 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
10852 module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
10853 module_param(create_on_open, bool, S_IRUSR|S_IWUSR);
10854 module_param(legacy_async_del_gendisk, bool, 0600);
10855 module_param(check_new_feature, bool, 0600);
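
/*
 * Usage note (illustrative, assuming the md core is built as the usual
 * md_mod module): the parameters above can be set at load time, e.g.
 * "modprobe md_mod start_ro=1", or, for the writable ones, at runtime via
 * /sys/module/md_mod/parameters/.
 */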
10856
10857 MODULE_LICENSE("GPL");
10858 MODULE_DESCRIPTION("MD RAID framework");
10859 MODULE_ALIAS("md");
10860 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
10861