1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 md.c : Multiple Devices driver for Linux
4 Copyright (C) 1998, 1999, 2000 Ingo Molnar
5
6 completely rewritten, based on the MD driver code from Marc Zyngier
7
8 Changes:
9
10 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
11 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
12 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
13 - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
14 - kmod support by: Cyrus Durgin
15 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
16 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
17
18 - lots of fixes and improvements to the RAID1/RAID5 and generic
19 RAID code (such as request based resynchronization):
20
21 Neil Brown <neilb@cse.unsw.edu.au>.
22
23 - persistent bitmap code
24 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
25
26
27 Errors, Warnings, etc.
28 Please use:
29 pr_crit() for error conditions that risk data loss
30 pr_err() for error conditions that are unexpected, like an IO error
31 or internal inconsistency
32 pr_warn() for error conditions that could have been predicted, like
33 adding a device to an array when it has incompatible metadata
34 pr_info() for interesting, very rare events, like an array starting
35 or stopping, or resync starting or stopping
36 pr_debug() for everything else.
37
38 */
39
40 #include <linux/sched/mm.h>
41 #include <linux/sched/signal.h>
42 #include <linux/kthread.h>
43 #include <linux/blkdev.h>
44 #include <linux/blk-integrity.h>
45 #include <linux/badblocks.h>
46 #include <linux/sysctl.h>
47 #include <linux/seq_file.h>
48 #include <linux/fs.h>
49 #include <linux/poll.h>
50 #include <linux/ctype.h>
51 #include <linux/string.h>
52 #include <linux/hdreg.h>
53 #include <linux/proc_fs.h>
54 #include <linux/random.h>
55 #include <linux/major.h>
56 #include <linux/module.h>
57 #include <linux/reboot.h>
58 #include <linux/file.h>
59 #include <linux/compat.h>
60 #include <linux/delay.h>
61 #include <linux/raid/md_p.h>
62 #include <linux/raid/md_u.h>
63 #include <linux/raid/detect.h>
64 #include <linux/slab.h>
65 #include <linux/percpu-refcount.h>
66 #include <linux/part_stat.h>
67
68 #include "md.h"
69 #include "md-bitmap.h"
70 #include "md-cluster.h"
71
72 static const char *action_name[NR_SYNC_ACTIONS] = {
73 [ACTION_RESYNC] = "resync",
74 [ACTION_RECOVER] = "recover",
75 [ACTION_CHECK] = "check",
76 [ACTION_REPAIR] = "repair",
77 [ACTION_RESHAPE] = "reshape",
78 [ACTION_FROZEN] = "frozen",
79 [ACTION_IDLE] = "idle",
80 };
81
82 static DEFINE_XARRAY(md_submodule);
83
84 static const struct kobj_type md_ktype;
85
86 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
87 static struct workqueue_struct *md_wq;
88
89 /*
90 * This workqueue is used for sync_work to register new sync_thread, and for
91 * del_work to remove rdev, and for event_work that is only set by dm-raid.
92 *
93 * Note that sync_work will grab reconfig_mutex, hence never flush this
94 * workqueue with reconfig_mutex grabbed.
95 */
96 static struct workqueue_struct *md_misc_wq;
97 struct workqueue_struct *md_bitmap_wq;
98
99 static int remove_and_add_spares(struct mddev *mddev,
100 struct md_rdev *this);
101 static void mddev_detach(struct mddev *mddev);
102 static void export_rdev(struct md_rdev *rdev, struct mddev *mddev);
103 static void md_wakeup_thread_directly(struct md_thread __rcu *thread);
104
105 /*
106 * Default number of read corrections we'll attempt on an rdev
107 * before ejecting it from the array. We divide the read error
108 * count by 2 for every hour elapsed between read errors.
109 */
110 #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
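/*
 * Worked example of the halving rule above (a sketch, not normative): if an
 * rdev has 12 recorded corrected read errors and two error-free hours pass
 * before the next one, the stored count is treated as 12 / 2 / 2 = 3 before
 * the new error is added, so only sustained error bursts approach the
 * default limit of 20.
 */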
111 /* Default safemode delay: 200 msec */
112 #define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
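/*
 * For example, with HZ == 1000 this is (200 * 1000) / 1000 + 1 = 201 jiffies,
 * and with HZ == 250 it is 51 jiffies; the "+ 1" rounds up so the delay can
 * never collapse to zero.
 */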
113 /*
114 * Current RAID-1,4,5,6,10 parallel reconstruction 'guaranteed speed limit'
115 * is sysctl_speed_limit_min, 1000 KB/sec by default, so the extra system load
116 * does not show up that much. Increase it if you want to have more guaranteed
117 * speed. Note that the RAID driver will use the maximum bandwidth
118 * sysctl_speed_limit_max, 200 MB/sec by default, if the IO subsystem is idle.
119 *
120 * Background sync IO speed control:
121 *
122 * - below speed min:
123 * no limit;
124 * - above speed min and below speed max:
125 * a) if mddev is idle, then no limit;
126 * b) if mddev is busy handling normal IO, then limit inflight sync IO
127 * to sync_io_depth;
128 * - above speed max:
129 * sync IO can't be issued;
130 *
131 * Following configurations can be changed via /proc/sys/dev/raid/ for system
132 * or /sys/block/mdX/md/ for one array.
133 */
134 static int sysctl_speed_limit_min = 1000;
135 static int sysctl_speed_limit_max = 200000;
136 static int sysctl_sync_io_depth = 32;
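/*
 * As the comment above notes, these defaults can be tuned system-wide via
 * /proc/sys/dev/raid/ or per array via /sys/block/mdX/md/. Illustrative
 * shell usage (a sketch; the per-array attribute names mirror the
 * sync_speed_min/sync_speed_max/sync_io_depth fields used below):
 *
 *	echo 5000   > /proc/sys/dev/raid/speed_limit_min
 *	echo 500000 > /proc/sys/dev/raid/speed_limit_max
 *	echo 64     > /sys/block/md0/md/sync_io_depth
 */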
137
138 static int speed_min(struct mddev *mddev)
139 {
140 return mddev->sync_speed_min ?
141 mddev->sync_speed_min : sysctl_speed_limit_min;
142 }
143
144 static int speed_max(struct mddev *mddev)
145 {
146 return mddev->sync_speed_max ?
147 mddev->sync_speed_max : sysctl_speed_limit_max;
148 }
149
150 static int sync_io_depth(struct mddev *mddev)
151 {
152 return mddev->sync_io_depth ?
153 mddev->sync_io_depth : sysctl_sync_io_depth;
154 }
155
156 static void rdev_uninit_serial(struct md_rdev *rdev)
157 {
158 if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
159 return;
160
161 kvfree(rdev->serial);
162 rdev->serial = NULL;
163 }
164
165 static void rdevs_uninit_serial(struct mddev *mddev)
166 {
167 struct md_rdev *rdev;
168
169 rdev_for_each(rdev, mddev)
170 rdev_uninit_serial(rdev);
171 }
172
173 static int rdev_init_serial(struct md_rdev *rdev)
174 {
175 /* serial_nums equals BARRIER_BUCKETS_NR */
176 int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t))));
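/*
 * e.g. with 4KiB pages (PAGE_SHIFT == 12) and a 4-byte atomic_t this is
 * 1 << (12 - 2) == 1024 buckets; both values are architecture dependent,
 * so this is only an illustration.
 */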
177 struct serial_in_rdev *serial = NULL;
178
179 if (test_bit(CollisionCheck, &rdev->flags))
180 return 0;
181
182 serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums,
183 GFP_KERNEL);
184 if (!serial)
185 return -ENOMEM;
186
187 for (i = 0; i < serial_nums; i++) {
188 struct serial_in_rdev *serial_tmp = &serial[i];
189
190 spin_lock_init(&serial_tmp->serial_lock);
191 serial_tmp->serial_rb = RB_ROOT_CACHED;
192 init_waitqueue_head(&serial_tmp->serial_io_wait);
193 }
194
195 rdev->serial = serial;
196 set_bit(CollisionCheck, &rdev->flags);
197
198 return 0;
199 }
200
201 static int rdevs_init_serial(struct mddev *mddev)
202 {
203 struct md_rdev *rdev;
204 int ret = 0;
205
206 rdev_for_each(rdev, mddev) {
207 ret = rdev_init_serial(rdev);
208 if (ret)
209 break;
210 }
211
212 /* Free all resources if the pool does not exist */
213 if (ret && !mddev->serial_info_pool)
214 rdevs_uninit_serial(mddev);
215
216 return ret;
217 }
218
219 /*
220 * rdev needs to enable serialization if it meets both conditions:
221 * 1. it is a multi-queue device flagged with WriteMostly.
222 * 2. the write-behind mode is enabled.
223 */
224 static int rdev_need_serial(struct md_rdev *rdev)
225 {
226 return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 &&
227 rdev->bdev->bd_disk->queue->nr_hw_queues != 1 &&
228 test_bit(WriteMostly, &rdev->flags));
229 }
230
231 /*
232 * Init resource for rdev(s), then create serial_info_pool if:
233 * 1. rdev is the first device which returns true from rdev_need_serial.
234 * 2. rdev is NULL, meaning we want to enable serialization for all rdevs.
235 */
236 void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
237 {
238 int ret = 0;
239
240 if (rdev && !rdev_need_serial(rdev) &&
241 !test_bit(CollisionCheck, &rdev->flags))
242 return;
243
244 if (!rdev)
245 ret = rdevs_init_serial(mddev);
246 else
247 ret = rdev_init_serial(rdev);
248 if (ret)
249 return;
250
251 if (mddev->serial_info_pool == NULL) {
252 /*
253 * already in memalloc noio context by
254 * mddev_suspend()
255 */
256 mddev->serial_info_pool =
257 mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
258 sizeof(struct serial_info));
259 if (!mddev->serial_info_pool) {
260 rdevs_uninit_serial(mddev);
261 pr_err("can't alloc memory pool for serialization\n");
262 }
263 }
264 }
265
266 /*
267 * Free resource from rdev(s), and destroy serial_info_pool under conditions:
268 * 1. rdev is the last device flagged with CollisionCheck.
269 * 2. when bitmap is destroyed while policy is not enabled.
270 * 3. for disable policy, the pool is destroyed only when no rdev needs it.
271 */
272 void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
273 {
274 if (rdev && !test_bit(CollisionCheck, &rdev->flags))
275 return;
276
277 if (mddev->serial_info_pool) {
278 struct md_rdev *temp;
279 int num = 0; /* used to track if other rdevs need the pool */
280
281 rdev_for_each(temp, mddev) {
282 if (!rdev) {
283 if (!mddev->serialize_policy ||
284 !rdev_need_serial(temp))
285 rdev_uninit_serial(temp);
286 else
287 num++;
288 } else if (temp != rdev &&
289 test_bit(CollisionCheck, &temp->flags))
290 num++;
291 }
292
293 if (rdev)
294 rdev_uninit_serial(rdev);
295
296 if (num)
297 pr_info("The mempool could be used by other devices\n");
298 else {
299 mempool_destroy(mddev->serial_info_pool);
300 mddev->serial_info_pool = NULL;
301 }
302 }
303 }
304
305 static struct ctl_table_header *raid_table_header;
306
307 static const struct ctl_table raid_table[] = {
308 {
309 .procname = "speed_limit_min",
310 .data = &sysctl_speed_limit_min,
311 .maxlen = sizeof(int),
312 .mode = 0644,
313 .proc_handler = proc_dointvec,
314 },
315 {
316 .procname = "speed_limit_max",
317 .data = &sysctl_speed_limit_max,
318 .maxlen = sizeof(int),
319 .mode = 0644,
320 .proc_handler = proc_dointvec,
321 },
322 {
323 .procname = "sync_io_depth",
324 .data = &sysctl_sync_io_depth,
325 .maxlen = sizeof(int),
326 .mode = 0644,
327 .proc_handler = proc_dointvec,
328 },
329 };
330
331 static int start_readonly;
332
333 /*
334 * The original mechanism for creating an md device is to create
335 * a device node in /dev and to open it. This causes races with device-close.
336 * The preferred method is to write to the "new_array" module parameter.
337 * This can avoid races.
338 * Setting create_on_open to false disables the original mechanism
339 * so all the races disappear.
340 */
341 static bool create_on_open = true;
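/*
 * With the preferred mechanism, userspace requests the array node up front,
 * e.g. (illustrative only; the exact name format is checked by the
 * "new_array" parameter handler elsewhere in this file):
 *
 *	echo md_test > /sys/module/md_mod/parameters/new_array
 */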
342
343 /*
344 * We have a system wide 'event count' that is incremented
345 * on any 'interesting' event, and readers of /proc/mdstat
346 * can use 'poll' or 'select' to find out when the event
347 * count increases.
348 *
349 * Events are:
350 * start array, stop array, error, add device, remove device,
351 * start build, activate spare
352 */
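/*
 * Consumer sketch (an assumption about intended usage, e.g. by
 * "mdadm --monitor"): open /proc/mdstat, read it, then poll() the fd; a
 * priority/exceptional poll event means md_event_count has changed and the
 * file should be re-read.
 */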
353 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
354 static atomic_t md_event_count;
355 void md_new_event(void)
356 {
357 atomic_inc(&md_event_count);
358 wake_up(&md_event_waiters);
359 }
360 EXPORT_SYMBOL_GPL(md_new_event);
361
362 /*
363 * Enables iteration over all existing md arrays.
364 * all_mddevs_lock protects this list.
365 */
366 static LIST_HEAD(all_mddevs);
367 static DEFINE_SPINLOCK(all_mddevs_lock);
368
369 static bool is_md_suspended(struct mddev *mddev)
370 {
371 return percpu_ref_is_dying(&mddev->active_io);
372 }
373 /* Rather than calling directly into the personality make_request function,
374 * IO requests come here first so that we can check if the device is
375 * being suspended pending a reconfiguration.
376 * We hold a refcount over the call to ->make_request. By the time that
377 * call has finished, the bio has been linked into some internal structure
378 * and so is visible to ->quiesce(), so we don't need the refcount any more.
379 */
380 static bool is_suspended(struct mddev *mddev, struct bio *bio)
381 {
382 if (is_md_suspended(mddev))
383 return true;
384 if (bio_data_dir(bio) != WRITE)
385 return false;
386 if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi))
387 return false;
388 if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi))
389 return false;
390 if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo))
391 return false;
392 return true;
393 }
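/*
 * Example of the window check above: with suspend_lo == 1000 and
 * suspend_hi == 2000, a WRITE covering sectors [500, 900) or [2000, 2100)
 * proceeds, while one covering [1500, 1600), or any range overlapping
 * [1000, 2000), waits as suspended.
 */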
394
395 bool md_handle_request(struct mddev *mddev, struct bio *bio)
396 {
397 check_suspended:
398 if (is_suspended(mddev, bio)) {
399 DEFINE_WAIT(__wait);
400 /* Bail out if REQ_NOWAIT is set for the bio */
401 if (bio->bi_opf & REQ_NOWAIT) {
402 bio_wouldblock_error(bio);
403 return true;
404 }
405 for (;;) {
406 prepare_to_wait(&mddev->sb_wait, &__wait,
407 TASK_UNINTERRUPTIBLE);
408 if (!is_suspended(mddev, bio))
409 break;
410 schedule();
411 }
412 finish_wait(&mddev->sb_wait, &__wait);
413 }
414 if (!percpu_ref_tryget_live(&mddev->active_io))
415 goto check_suspended;
416
417 if (!mddev->pers->make_request(mddev, bio)) {
418 percpu_ref_put(&mddev->active_io);
419 if (!mddev->gendisk && mddev->pers->prepare_suspend)
420 return false;
421 goto check_suspended;
422 }
423
424 percpu_ref_put(&mddev->active_io);
425 return true;
426 }
427 EXPORT_SYMBOL(md_handle_request);
428
429 static void md_submit_bio(struct bio *bio)
430 {
431 const int rw = bio_data_dir(bio);
432 struct mddev *mddev = bio->bi_bdev->bd_disk->private_data;
433
434 if (mddev == NULL || mddev->pers == NULL) {
435 bio_io_error(bio);
436 return;
437 }
438
439 if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
440 bio_io_error(bio);
441 return;
442 }
443
444 bio = bio_split_to_limits(bio);
445 if (!bio)
446 return;
447
448 if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) {
449 if (bio_sectors(bio) != 0)
450 bio->bi_status = BLK_STS_IOERR;
451 bio_endio(bio);
452 return;
453 }
454
455 /* the bio could become mergeable after being passed to the underlying layer */
456 bio->bi_opf &= ~REQ_NOMERGE;
457
458 md_handle_request(mddev, bio);
459 }
460
461 /*
462 * Make sure no new requests are submitted to the device, and any requests that
463 * have been submitted are completely handled.
464 */
465 int mddev_suspend(struct mddev *mddev, bool interruptible)
466 {
467 int err = 0;
468
469 /*
470 * Holding reconfig_mutex while waiting for normal IO will deadlock, because
471 * other contexts can't update the super_block, and normal IO can rely on
472 * the super_block being updated.
473 */
474 lockdep_assert_not_held(&mddev->reconfig_mutex);
475
476 if (interruptible)
477 err = mutex_lock_interruptible(&mddev->suspend_mutex);
478 else
479 mutex_lock(&mddev->suspend_mutex);
480 if (err)
481 return err;
482
483 if (mddev->suspended) {
484 WRITE_ONCE(mddev->suspended, mddev->suspended + 1);
485 mutex_unlock(&mddev->suspend_mutex);
486 return 0;
487 }
488
489 percpu_ref_kill(&mddev->active_io);
490 if (interruptible)
491 err = wait_event_interruptible(mddev->sb_wait,
492 percpu_ref_is_zero(&mddev->active_io));
493 else
494 wait_event(mddev->sb_wait,
495 percpu_ref_is_zero(&mddev->active_io));
496 if (err) {
497 percpu_ref_resurrect(&mddev->active_io);
498 mutex_unlock(&mddev->suspend_mutex);
499 return err;
500 }
501
502 /*
503 * For raid456, IO might be waiting for reshape to make progress;
504 * allow a new reshape to start while waiting for IO to be done, to
505 * prevent deadlock.
506 */
507 WRITE_ONCE(mddev->suspended, mddev->suspended + 1);
508
509 /* restrict memory reclaim I/O while the raid array is suspended */
510 mddev->noio_flag = memalloc_noio_save();
511
512 mutex_unlock(&mddev->suspend_mutex);
513 return 0;
514 }
515 EXPORT_SYMBOL_GPL(mddev_suspend);
516
517 static void __mddev_resume(struct mddev *mddev, bool recovery_needed)
518 {
519 lockdep_assert_not_held(&mddev->reconfig_mutex);
520
521 mutex_lock(&mddev->suspend_mutex);
522 WRITE_ONCE(mddev->suspended, mddev->suspended - 1);
523 if (mddev->suspended) {
524 mutex_unlock(&mddev->suspend_mutex);
525 return;
526 }
527
528 /* entered the memalloc scope from mddev_suspend() */
529 memalloc_noio_restore(mddev->noio_flag);
530
531 percpu_ref_resurrect(&mddev->active_io);
532 wake_up(&mddev->sb_wait);
533
534 if (recovery_needed)
535 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
536 md_wakeup_thread(mddev->thread);
537 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
538
539 mutex_unlock(&mddev->suspend_mutex);
540 }
541
542 void mddev_resume(struct mddev *mddev)
543 {
544 return __mddev_resume(mddev, true);
545 }
546 EXPORT_SYMBOL_GPL(mddev_resume);
547
548 /* sync bdev before setting device to readonly or stopping raid */
549 static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_num)
550 {
551 mutex_lock(&mddev->open_mutex);
552 if (mddev->pers && atomic_read(&mddev->openers) > opener_num) {
553 mutex_unlock(&mddev->open_mutex);
554 return -EBUSY;
555 }
556 if (test_and_set_bit(MD_CLOSING, &mddev->flags)) {
557 mutex_unlock(&mddev->open_mutex);
558 return -EBUSY;
559 }
560 mutex_unlock(&mddev->open_mutex);
561
562 sync_blockdev(mddev->gendisk->part0);
563 return 0;
564 }
565
566 /*
567 * The only difference from bio_chain_endio() is that the current
568 * bi_status of bio does not affect the bi_status of parent.
569 */
570 static void md_end_flush(struct bio *bio)
571 {
572 struct bio *parent = bio->bi_private;
573
574 /*
575 * If any flush IO fails before a power failure,
576 * disk data may be lost.
577 */
578 if (bio->bi_status)
579 pr_err("md: %pg flush io error %d\n", bio->bi_bdev,
580 blk_status_to_errno(bio->bi_status));
581
582 bio_put(bio);
583 bio_endio(parent);
584 }
585
586 bool md_flush_request(struct mddev *mddev, struct bio *bio)
587 {
588 struct md_rdev *rdev;
589 struct bio *new;
590
591 /*
592 * md_flush_request() should be called under md_handle_request(), with
593 * 'active_io' already grabbed. Hence it's safe to get rdev directly
594 * without rcu protection.
595 */
596 WARN_ON(percpu_ref_is_zero(&mddev->active_io));
597
598 rdev_for_each(rdev, mddev) {
599 if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
600 continue;
601
602 new = bio_alloc_bioset(rdev->bdev, 0,
603 REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO,
604 &mddev->bio_set);
605 new->bi_private = bio;
606 new->bi_end_io = md_end_flush;
607 bio_inc_remaining(bio);
608 submit_bio(new);
609 }
610
611 if (bio_sectors(bio) == 0) {
612 bio_endio(bio);
613 return true;
614 }
615
616 bio->bi_opf &= ~REQ_PREFLUSH;
617 return false;
618 }
619 EXPORT_SYMBOL(md_flush_request);
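/*
 * Typical caller pattern (a sketch based on how the RAID personalities use
 * this helper; individual personalities may differ):
 *
 *	if (unlikely(bio->bi_opf & REQ_PREFLUSH) &&
 *	    md_flush_request(mddev, bio))
 *		return true;
 *
 * A false return means REQ_PREFLUSH has been stripped and the data portion
 * of the bio still has to be issued by the personality itself.
 */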
620
621 static inline struct mddev *mddev_get(struct mddev *mddev)
622 {
623 lockdep_assert_held(&all_mddevs_lock);
624
625 if (test_bit(MD_DELETED, &mddev->flags))
626 return NULL;
627 atomic_inc(&mddev->active);
628 return mddev;
629 }
630
631 static void mddev_delayed_delete(struct work_struct *ws);
632
633 static void __mddev_put(struct mddev *mddev)
634 {
635 if (mddev->raid_disks || !list_empty(&mddev->disks) ||
636 mddev->ctime || mddev->hold_active)
637 return;
638
639 /* Array is not configured at all, and not held active, so destroy it */
640 set_bit(MD_DELETED, &mddev->flags);
641
642 /*
643 * Call queue_work inside the spinlock so that flush_workqueue() after
644 * mddev_find will succeed in waiting for the work to be done.
645 */
646 queue_work(md_misc_wq, &mddev->del_work);
647 }
648
649 static void mddev_put_locked(struct mddev *mddev)
650 {
651 if (atomic_dec_and_test(&mddev->active))
652 __mddev_put(mddev);
653 }
654
655 void mddev_put(struct mddev *mddev)
656 {
657 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
658 return;
659
660 __mddev_put(mddev);
661 spin_unlock(&all_mddevs_lock);
662 }
663
664 static void md_safemode_timeout(struct timer_list *t);
665 static void md_start_sync(struct work_struct *ws);
666
667 static void active_io_release(struct percpu_ref *ref)
668 {
669 struct mddev *mddev = container_of(ref, struct mddev, active_io);
670
671 wake_up(&mddev->sb_wait);
672 }
673
674 static void no_op(struct percpu_ref *r) {}
675
676 int mddev_init(struct mddev *mddev)
677 {
678
679 if (percpu_ref_init(&mddev->active_io, active_io_release,
680 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
681 return -ENOMEM;
682
683 if (percpu_ref_init(&mddev->writes_pending, no_op,
684 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
685 percpu_ref_exit(&mddev->active_io);
686 return -ENOMEM;
687 }
688
689 /* We want to start with the refcount at zero */
690 percpu_ref_put(&mddev->writes_pending);
691
692 mutex_init(&mddev->open_mutex);
693 mutex_init(&mddev->reconfig_mutex);
694 mutex_init(&mddev->suspend_mutex);
695 mutex_init(&mddev->bitmap_info.mutex);
696 INIT_LIST_HEAD(&mddev->disks);
697 INIT_LIST_HEAD(&mddev->all_mddevs);
698 INIT_LIST_HEAD(&mddev->deleting);
699 timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
700 atomic_set(&mddev->active, 1);
701 atomic_set(&mddev->openers, 0);
702 atomic_set(&mddev->sync_seq, 0);
703 spin_lock_init(&mddev->lock);
704 init_waitqueue_head(&mddev->sb_wait);
705 init_waitqueue_head(&mddev->recovery_wait);
706 mddev->reshape_position = MaxSector;
707 mddev->reshape_backwards = 0;
708 mddev->last_sync_action = ACTION_IDLE;
709 mddev->resync_min = 0;
710 mddev->resync_max = MaxSector;
711 mddev->level = LEVEL_NONE;
712 mddev_set_bitmap_ops(mddev);
713
714 INIT_WORK(&mddev->sync_work, md_start_sync);
715 INIT_WORK(&mddev->del_work, mddev_delayed_delete);
716
717 return 0;
718 }
719 EXPORT_SYMBOL_GPL(mddev_init);
720
721 void mddev_destroy(struct mddev *mddev)
722 {
723 percpu_ref_exit(&mddev->active_io);
724 percpu_ref_exit(&mddev->writes_pending);
725 }
726 EXPORT_SYMBOL_GPL(mddev_destroy);
727
728 static struct mddev *mddev_find_locked(dev_t unit)
729 {
730 struct mddev *mddev;
731
732 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
733 if (mddev->unit == unit)
734 return mddev;
735
736 return NULL;
737 }
738
739 /* find an unused unit number */
740 static dev_t mddev_alloc_unit(void)
741 {
742 static int next_minor = 512;
743 int start = next_minor;
744 bool is_free = 0;
745 dev_t dev = 0;
746
747 while (!is_free) {
748 dev = MKDEV(MD_MAJOR, next_minor);
749 next_minor++;
750 if (next_minor > MINORMASK)
751 next_minor = 0;
752 if (next_minor == start)
753 return 0; /* Oh dear, all in use. */
754 is_free = !mddev_find_locked(dev);
755 }
756
757 return dev;
758 }
759
760 static struct mddev *mddev_alloc(dev_t unit)
761 {
762 struct mddev *new;
763 int error;
764
765 if (unit && MAJOR(unit) != MD_MAJOR)
766 unit &= ~((1 << MdpMinorShift) - 1);
767
768 new = kzalloc(sizeof(*new), GFP_KERNEL);
769 if (!new)
770 return ERR_PTR(-ENOMEM);
771
772 error = mddev_init(new);
773 if (error)
774 goto out_free_new;
775
776 spin_lock(&all_mddevs_lock);
777 if (unit) {
778 error = -EEXIST;
779 if (mddev_find_locked(unit))
780 goto out_destroy_new;
781 new->unit = unit;
782 if (MAJOR(unit) == MD_MAJOR)
783 new->md_minor = MINOR(unit);
784 else
785 new->md_minor = MINOR(unit) >> MdpMinorShift;
786 new->hold_active = UNTIL_IOCTL;
787 } else {
788 error = -ENODEV;
789 new->unit = mddev_alloc_unit();
790 if (!new->unit)
791 goto out_destroy_new;
792 new->md_minor = MINOR(new->unit);
793 new->hold_active = UNTIL_STOP;
794 }
795
796 list_add(&new->all_mddevs, &all_mddevs);
797 spin_unlock(&all_mddevs_lock);
798 return new;
799
800 out_destroy_new:
801 spin_unlock(&all_mddevs_lock);
802 mddev_destroy(new);
803 out_free_new:
804 kfree(new);
805 return ERR_PTR(error);
806 }
807
808 static void mddev_free(struct mddev *mddev)
809 {
810 spin_lock(&all_mddevs_lock);
811 list_del(&mddev->all_mddevs);
812 spin_unlock(&all_mddevs_lock);
813
814 mddev_destroy(mddev);
815 kfree(mddev);
816 }
817
818 static const struct attribute_group md_redundancy_group;
819
820 void mddev_unlock(struct mddev *mddev)
821 {
822 struct md_rdev *rdev;
823 struct md_rdev *tmp;
824 LIST_HEAD(delete);
825
826 if (!list_empty(&mddev->deleting))
827 list_splice_init(&mddev->deleting, &delete);
828
829 if (mddev->to_remove) {
830 /* These cannot be removed under reconfig_mutex as
831 * an access to the files will try to take reconfig_mutex
832 * while holding the file unremovable, which leads to
833 * a deadlock.
834 * So set sysfs_active while the removal is happening,
835 * and anything else which might set ->to_remove or may
836 * otherwise change the sysfs namespace will fail with
837 * -EBUSY if sysfs_active is still set.
838 * We set sysfs_active under reconfig_mutex and elsewhere
839 * test it under the same mutex to ensure its correct value
840 * is seen.
841 */
842 const struct attribute_group *to_remove = mddev->to_remove;
843 mddev->to_remove = NULL;
844 mddev->sysfs_active = 1;
845 mutex_unlock(&mddev->reconfig_mutex);
846
847 if (mddev->kobj.sd) {
848 if (to_remove != &md_redundancy_group)
849 sysfs_remove_group(&mddev->kobj, to_remove);
850 if (mddev->pers == NULL ||
851 mddev->pers->sync_request == NULL) {
852 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
853 if (mddev->sysfs_action)
854 sysfs_put(mddev->sysfs_action);
855 if (mddev->sysfs_completed)
856 sysfs_put(mddev->sysfs_completed);
857 if (mddev->sysfs_degraded)
858 sysfs_put(mddev->sysfs_degraded);
859 mddev->sysfs_action = NULL;
860 mddev->sysfs_completed = NULL;
861 mddev->sysfs_degraded = NULL;
862 }
863 }
864 mddev->sysfs_active = 0;
865 } else
866 mutex_unlock(&mddev->reconfig_mutex);
867
868 md_wakeup_thread(mddev->thread);
869 wake_up(&mddev->sb_wait);
870
871 list_for_each_entry_safe(rdev, tmp, &delete, same_set) {
872 list_del_init(&rdev->same_set);
873 kobject_del(&rdev->kobj);
874 export_rdev(rdev, mddev);
875 }
876 }
877 EXPORT_SYMBOL_GPL(mddev_unlock);
878
879 struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
880 {
881 struct md_rdev *rdev;
882
883 rdev_for_each_rcu(rdev, mddev)
884 if (rdev->desc_nr == nr)
885 return rdev;
886
887 return NULL;
888 }
889 EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);
890
891 static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
892 {
893 struct md_rdev *rdev;
894
895 rdev_for_each(rdev, mddev)
896 if (rdev->bdev->bd_dev == dev)
897 return rdev;
898
899 return NULL;
900 }
901
902 struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev)
903 {
904 struct md_rdev *rdev;
905
906 rdev_for_each_rcu(rdev, mddev)
907 if (rdev->bdev->bd_dev == dev)
908 return rdev;
909
910 return NULL;
911 }
912 EXPORT_SYMBOL_GPL(md_find_rdev_rcu);
913
914 static struct md_personality *get_pers(int level, char *clevel)
915 {
916 struct md_personality *ret = NULL;
917 struct md_submodule_head *head;
918 unsigned long i;
919
920 xa_lock(&md_submodule);
921 xa_for_each(&md_submodule, i, head) {
922 if (head->type != MD_PERSONALITY)
923 continue;
924 if ((level != LEVEL_NONE && head->id == level) ||
925 !strcmp(head->name, clevel)) {
926 if (try_module_get(head->owner))
927 ret = (void *)head;
928 break;
929 }
930 }
931 xa_unlock(&md_submodule);
932
933 if (!ret) {
934 if (level != LEVEL_NONE)
935 pr_warn("md: personality for level %d is not loaded!\n",
936 level);
937 else
938 pr_warn("md: personality for level %s is not loaded!\n",
939 clevel);
940 }
941
942 return ret;
943 }
944
945 static void put_pers(struct md_personality *pers)
946 {
947 module_put(pers->head.owner);
948 }
949
950 /* return the offset of the super block in 512byte sectors */
951 static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
952 {
953 return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev));
954 }
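/*
 * MD_NEW_SIZE_SECTORS() (from md_p.h) rounds the device size down to a
 * 64KiB boundary and then subtracts one 64KiB block, so the 0.90 superblock
 * lives in the last aligned 64KiB of the device. Illustration (assuming the
 * usual 128-sector reservation): a 1000000-sector device gives
 * 999936 - 128 = 999808.
 */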
955
956 static int alloc_disk_sb(struct md_rdev *rdev)
957 {
958 rdev->sb_page = alloc_page(GFP_KERNEL);
959 if (!rdev->sb_page)
960 return -ENOMEM;
961 return 0;
962 }
963
964 void md_rdev_clear(struct md_rdev *rdev)
965 {
966 if (rdev->sb_page) {
967 put_page(rdev->sb_page);
968 rdev->sb_loaded = 0;
969 rdev->sb_page = NULL;
970 rdev->sb_start = 0;
971 rdev->sectors = 0;
972 }
973 if (rdev->bb_page) {
974 put_page(rdev->bb_page);
975 rdev->bb_page = NULL;
976 }
977 badblocks_exit(&rdev->badblocks);
978 }
979 EXPORT_SYMBOL_GPL(md_rdev_clear);
980
981 static void super_written(struct bio *bio)
982 {
983 struct md_rdev *rdev = bio->bi_private;
984 struct mddev *mddev = rdev->mddev;
985
986 if (bio->bi_status) {
987 pr_err("md: %s gets error=%d\n", __func__,
988 blk_status_to_errno(bio->bi_status));
989 md_error(mddev, rdev);
990 if (!test_bit(Faulty, &rdev->flags)
991 && (bio->bi_opf & MD_FAILFAST)) {
992 set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
993 set_bit(LastDev, &rdev->flags);
994 }
995 } else
996 clear_bit(LastDev, &rdev->flags);
997
998 bio_put(bio);
999
1000 rdev_dec_pending(rdev, mddev);
1001
1002 if (atomic_dec_and_test(&mddev->pending_writes))
1003 wake_up(&mddev->sb_wait);
1004 }
1005
1006 void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
1007 sector_t sector, int size, struct page *page)
1008 {
1009 /* write the first 'size' bytes of 'page' to 'sector' of rdev.
1010 * Increment mddev->pending_writes before returning
1011 * and decrement it on completion, waking up sb_wait
1012 * if zero is reached.
1013 * If an error occurred, call md_error
1014 */
1015 struct bio *bio;
1016
1017 if (!page)
1018 return;
1019
1020 if (test_bit(Faulty, &rdev->flags))
1021 return;
1022
1023 bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev,
1024 1,
1025 REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_META
1026 | REQ_PREFLUSH | REQ_FUA,
1027 GFP_NOIO, &mddev->sync_set);
1028
1029 atomic_inc(&rdev->nr_pending);
1030
1031 bio->bi_iter.bi_sector = sector;
1032 __bio_add_page(bio, page, size, 0);
1033 bio->bi_private = rdev;
1034 bio->bi_end_io = super_written;
1035
1036 if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
1037 test_bit(FailFast, &rdev->flags) &&
1038 !test_bit(LastDev, &rdev->flags))
1039 bio->bi_opf |= MD_FAILFAST;
1040
1041 atomic_inc(&mddev->pending_writes);
1042 submit_bio(bio);
1043 }
1044
1045 int md_super_wait(struct mddev *mddev)
1046 {
1047 /* wait for all superblock writes that were scheduled to complete */
1048 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
1049 if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags))
1050 return -EAGAIN;
1051 return 0;
1052 }
1053
1054 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
1055 struct page *page, blk_opf_t opf, bool metadata_op)
1056 {
1057 struct bio bio;
1058 struct bio_vec bvec;
1059
1060 if (metadata_op && rdev->meta_bdev)
1061 bio_init(&bio, rdev->meta_bdev, &bvec, 1, opf);
1062 else
1063 bio_init(&bio, rdev->bdev, &bvec, 1, opf);
1064
1065 if (metadata_op)
1066 bio.bi_iter.bi_sector = sector + rdev->sb_start;
1067 else if (rdev->mddev->reshape_position != MaxSector &&
1068 (rdev->mddev->reshape_backwards ==
1069 (sector >= rdev->mddev->reshape_position)))
1070 bio.bi_iter.bi_sector = sector + rdev->new_data_offset;
1071 else
1072 bio.bi_iter.bi_sector = sector + rdev->data_offset;
1073 __bio_add_page(&bio, page, size, 0);
1074
1075 submit_bio_wait(&bio);
1076
1077 return !bio.bi_status;
1078 }
1079 EXPORT_SYMBOL_GPL(sync_page_io);
1080
1081 static int read_disk_sb(struct md_rdev *rdev, int size)
1082 {
1083 if (rdev->sb_loaded)
1084 return 0;
1085
1086 if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true))
1087 goto fail;
1088 rdev->sb_loaded = 1;
1089 return 0;
1090
1091 fail:
1092 pr_err("md: disabled device %pg, could not read superblock.\n",
1093 rdev->bdev);
1094 return -EINVAL;
1095 }
1096
1097 static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
1098 {
1099 return sb1->set_uuid0 == sb2->set_uuid0 &&
1100 sb1->set_uuid1 == sb2->set_uuid1 &&
1101 sb1->set_uuid2 == sb2->set_uuid2 &&
1102 sb1->set_uuid3 == sb2->set_uuid3;
1103 }
1104
1105 static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
1106 {
1107 int ret;
1108 mdp_super_t *tmp1, *tmp2;
1109
1110 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
1111 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
1112
1113 if (!tmp1 || !tmp2) {
1114 ret = 0;
1115 goto abort;
1116 }
1117
1118 *tmp1 = *sb1;
1119 *tmp2 = *sb2;
1120
1121 /*
1122 * nr_disks is not constant
1123 */
1124 tmp1->nr_disks = 0;
1125 tmp2->nr_disks = 0;
1126
1127 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
1128 abort:
1129 kfree(tmp1);
1130 kfree(tmp2);
1131 return ret;
1132 }
1133
1134 static u32 md_csum_fold(u32 csum)
1135 {
1136 csum = (csum & 0xffff) + (csum >> 16);
1137 return (csum & 0xffff) + (csum >> 16);
1138 }
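/*
 * e.g. md_csum_fold(0x1234abcd): 0xabcd + 0x1234 = 0xbe01, and the second
 * pass leaves 0xbe01 unchanged, i.e. the 32-bit checksum is reduced to
 * 16 bits with end-around carry.
 */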
1139
1140 static unsigned int calc_sb_csum(mdp_super_t *sb)
1141 {
1142 u64 newcsum = 0;
1143 u32 *sb32 = (u32*)sb;
1144 int i;
1145 unsigned int disk_csum, csum;
1146
1147 disk_csum = sb->sb_csum;
1148 sb->sb_csum = 0;
1149
1150 for (i = 0; i < MD_SB_BYTES/4 ; i++)
1151 newcsum += sb32[i];
1152 csum = (newcsum & 0xffffffff) + (newcsum>>32);
1153
1154 #ifdef CONFIG_ALPHA
1155 /* This used to use csum_partial, which was wrong for several
1156 * reasons including that different results are returned on
1157 * different architectures. It isn't critical that we get exactly
1158 * the same return value as before (we always csum_fold before
1159 * testing, and that removes any differences). However as we
1160 * know that csum_partial always returned a 16bit value on
1161 * alphas, do a fold to maximise conformity to previous behaviour.
1162 */
1163 sb->sb_csum = md_csum_fold(disk_csum);
1164 #else
1165 sb->sb_csum = disk_csum;
1166 #endif
1167 return csum;
1168 }
1169
1170 /*
1171 * Handle superblock details.
1172 * We want to be able to handle multiple superblock formats
1173 * so we have a common interface to them all, and an array of
1174 * different handlers.
1175 * We rely on user-space to write the initial superblock, and support
1176 * reading and updating of superblocks.
1177 * Interface methods are:
1178 * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
1179 * loads and validates a superblock on dev.
1180 * if refdev != NULL, compare superblocks on both devices
1181 * Return:
1182 * 0 - dev has a superblock that is compatible with refdev
1183 * 1 - dev has a superblock that is compatible and newer than refdev
1184 * so dev should be used as the refdev in future
1185 * -EINVAL superblock incompatible or invalid
1186 * -othererror e.g. -EIO
1187 *
1188 * int validate_super(struct mddev *mddev, struct md_rdev *dev)
1189 * Verify that dev is acceptable into mddev.
1190 * The first time, mddev->raid_disks will be 0, and data from
1191 * dev should be merged in. Subsequent calls check that dev
1192 * is new enough. Return 0 or -EINVAL
1193 *
1194 * void sync_super(struct mddev *mddev, struct md_rdev *dev)
1195 * Update the superblock for rdev with data in mddev
1196 * This does not write to disc.
1197 *
1198 */
1199
1200 struct super_type {
1201 char *name;
1202 struct module *owner;
1203 int (*load_super)(struct md_rdev *rdev,
1204 struct md_rdev *refdev,
1205 int minor_version);
1206 int (*validate_super)(struct mddev *mddev,
1207 struct md_rdev *freshest,
1208 struct md_rdev *rdev);
1209 void (*sync_super)(struct mddev *mddev,
1210 struct md_rdev *rdev);
1211 unsigned long long (*rdev_size_change)(struct md_rdev *rdev,
1212 sector_t num_sectors);
1213 int (*allow_new_offset)(struct md_rdev *rdev,
1214 unsigned long long new_offset);
1215 };
1216
1217 /*
1218 * Check that the given mddev has no bitmap.
1219 *
1220 * This function is called from the run method of all personalities that do not
1221 * support bitmaps. It prints an error message and returns non-zero if mddev
1222 * has a bitmap. Otherwise, it returns 0.
1223 *
1224 */
1225 int md_check_no_bitmap(struct mddev *mddev)
1226 {
1227 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
1228 return 0;
1229 pr_warn("%s: bitmaps are not supported for %s\n",
1230 mdname(mddev), mddev->pers->head.name);
1231 return 1;
1232 }
1233 EXPORT_SYMBOL(md_check_no_bitmap);
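/*
 * Typical use in a personality's ->run() method (a sketch; raid0-style
 * callers look roughly like this):
 *
 *	if (md_check_no_bitmap(mddev))
 *		return -EINVAL;
 */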
1234
1235 /*
1236 * load_super for 0.90.0
1237 */
1238 static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1239 {
1240 mdp_super_t *sb;
1241 int ret;
1242 bool spare_disk = true;
1243
1244 /*
1245 * Calculate the position of the superblock (512byte sectors),
1246 * it's at the end of the disk.
1247 *
1248 * It also happens to be a multiple of 4Kb.
1249 */
1250 rdev->sb_start = calc_dev_sboffset(rdev);
1251
1252 ret = read_disk_sb(rdev, MD_SB_BYTES);
1253 if (ret)
1254 return ret;
1255
1256 ret = -EINVAL;
1257
1258 sb = page_address(rdev->sb_page);
1259
1260 if (sb->md_magic != MD_SB_MAGIC) {
1261 pr_warn("md: invalid raid superblock magic on %pg\n",
1262 rdev->bdev);
1263 goto abort;
1264 }
1265
1266 if (sb->major_version != 0 ||
1267 sb->minor_version < 90 ||
1268 sb->minor_version > 91) {
1269 pr_warn("Bad version number %d.%d on %pg\n",
1270 sb->major_version, sb->minor_version, rdev->bdev);
1271 goto abort;
1272 }
1273
1274 if (sb->raid_disks <= 0)
1275 goto abort;
1276
1277 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
1278 pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev);
1279 goto abort;
1280 }
1281
1282 rdev->preferred_minor = sb->md_minor;
1283 rdev->data_offset = 0;
1284 rdev->new_data_offset = 0;
1285 rdev->sb_size = MD_SB_BYTES;
1286 rdev->badblocks.shift = -1;
1287
1288 rdev->desc_nr = sb->this_disk.number;
1289
1290 /* not spare disk */
1291 if (rdev->desc_nr >= 0 && rdev->desc_nr < MD_SB_DISKS &&
1292 sb->disks[rdev->desc_nr].state & ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1293 spare_disk = false;
1294
1295 if (!refdev) {
1296 if (!spare_disk)
1297 ret = 1;
1298 else
1299 ret = 0;
1300 } else {
1301 __u64 ev1, ev2;
1302 mdp_super_t *refsb = page_address(refdev->sb_page);
1303 if (!md_uuid_equal(refsb, sb)) {
1304 pr_warn("md: %pg has different UUID to %pg\n",
1305 rdev->bdev, refdev->bdev);
1306 goto abort;
1307 }
1308 if (!md_sb_equal(refsb, sb)) {
1309 pr_warn("md: %pg has same UUID but different superblock to %pg\n",
1310 rdev->bdev, refdev->bdev);
1311 goto abort;
1312 }
1313 ev1 = md_event(sb);
1314 ev2 = md_event(refsb);
1315
1316 if (!spare_disk && ev1 > ev2)
1317 ret = 1;
1318 else
1319 ret = 0;
1320 }
1321 rdev->sectors = rdev->sb_start;
1322 /* Limit to 4TB as metadata cannot record more than that.
1323 * (not needed for Linear and RAID0 as metadata doesn't
1324 * record this size)
1325 */
1326 if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1)
1327 rdev->sectors = (sector_t)(2ULL << 32) - 2;
1328
1329 if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
1330 /* "this cannot possibly happen" ... */
1331 ret = -EINVAL;
1332
1333 abort:
1334 return ret;
1335 }
1336
1337 static u64 md_bitmap_events_cleared(struct mddev *mddev)
1338 {
1339 struct md_bitmap_stats stats;
1340 int err;
1341
1342 err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
1343 if (err)
1344 return 0;
1345
1346 return stats.events_cleared;
1347 }
1348
1349 /*
1350 * validate_super for 0.90.0
1351 * note: we are not using "freshest" for 0.9 superblock
1352 */
1353 static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
1354 {
1355 mdp_disk_t *desc;
1356 mdp_super_t *sb = page_address(rdev->sb_page);
1357 __u64 ev1 = md_event(sb);
1358
1359 rdev->raid_disk = -1;
1360 clear_bit(Faulty, &rdev->flags);
1361 clear_bit(In_sync, &rdev->flags);
1362 clear_bit(Bitmap_sync, &rdev->flags);
1363 clear_bit(WriteMostly, &rdev->flags);
1364
1365 if (mddev->raid_disks == 0) {
1366 mddev->major_version = 0;
1367 mddev->minor_version = sb->minor_version;
1368 mddev->patch_version = sb->patch_version;
1369 mddev->external = 0;
1370 mddev->chunk_sectors = sb->chunk_size >> 9;
1371 mddev->ctime = sb->ctime;
1372 mddev->utime = sb->utime;
1373 mddev->level = sb->level;
1374 mddev->clevel[0] = 0;
1375 mddev->layout = sb->layout;
1376 mddev->raid_disks = sb->raid_disks;
1377 mddev->dev_sectors = ((sector_t)sb->size) * 2;
1378 mddev->events = ev1;
1379 mddev->bitmap_info.offset = 0;
1380 mddev->bitmap_info.space = 0;
1381 /* bitmap can use 60 K after the 4K superblocks */
1382 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
1383 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
1384 mddev->reshape_backwards = 0;
1385
1386 if (mddev->minor_version >= 91) {
1387 mddev->reshape_position = sb->reshape_position;
1388 mddev->delta_disks = sb->delta_disks;
1389 mddev->new_level = sb->new_level;
1390 mddev->new_layout = sb->new_layout;
1391 mddev->new_chunk_sectors = sb->new_chunk >> 9;
1392 if (mddev->delta_disks < 0)
1393 mddev->reshape_backwards = 1;
1394 } else {
1395 mddev->reshape_position = MaxSector;
1396 mddev->delta_disks = 0;
1397 mddev->new_level = mddev->level;
1398 mddev->new_layout = mddev->layout;
1399 mddev->new_chunk_sectors = mddev->chunk_sectors;
1400 }
1401 if (mddev->level == 0)
1402 mddev->layout = -1;
1403
1404 if (sb->state & (1<<MD_SB_CLEAN))
1405 mddev->recovery_cp = MaxSector;
1406 else {
1407 if (sb->events_hi == sb->cp_events_hi &&
1408 sb->events_lo == sb->cp_events_lo) {
1409 mddev->recovery_cp = sb->recovery_cp;
1410 } else
1411 mddev->recovery_cp = 0;
1412 }
1413
1414 memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
1415 memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
1416 memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
1417 memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
1418
1419 mddev->max_disks = MD_SB_DISKS;
1420
1421 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
1422 mddev->bitmap_info.file == NULL) {
1423 mddev->bitmap_info.offset =
1424 mddev->bitmap_info.default_offset;
1425 mddev->bitmap_info.space =
1426 mddev->bitmap_info.default_space;
1427 }
1428
1429 } else if (mddev->pers == NULL) {
1430 /* Insist on good event counter while assembling, except
1431 * for spares (which don't need an event count) */
1432 ++ev1;
1433 if (sb->disks[rdev->desc_nr].state & (
1434 (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1435 if (ev1 < mddev->events)
1436 return -EINVAL;
1437 } else if (mddev->bitmap) {
1438 /* if adding to array with a bitmap, then we can accept an
1439 * older device ... but not too old.
1440 */
1441 if (ev1 < md_bitmap_events_cleared(mddev))
1442 return 0;
1443 if (ev1 < mddev->events)
1444 set_bit(Bitmap_sync, &rdev->flags);
1445 } else {
1446 if (ev1 < mddev->events)
1447 /* just a hot-add of a new device, leave raid_disk at -1 */
1448 return 0;
1449 }
1450
1451 desc = sb->disks + rdev->desc_nr;
1452
1453 if (desc->state & (1<<MD_DISK_FAULTY))
1454 set_bit(Faulty, &rdev->flags);
1455 else if (desc->state & (1<<MD_DISK_SYNC)) {
1456 set_bit(In_sync, &rdev->flags);
1457 rdev->raid_disk = desc->raid_disk;
1458 rdev->saved_raid_disk = desc->raid_disk;
1459 } else if (desc->state & (1<<MD_DISK_ACTIVE)) {
1460 /* active but not in sync implies recovery up to
1461 * reshape position. We don't know exactly where
1462 * that is, so set to zero for now
1463 */
1464 if (mddev->minor_version >= 91) {
1465 rdev->recovery_offset = 0;
1466 rdev->raid_disk = desc->raid_disk;
1467 }
1468 }
1469 if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
1470 set_bit(WriteMostly, &rdev->flags);
1471 if (desc->state & (1<<MD_DISK_FAILFAST))
1472 set_bit(FailFast, &rdev->flags);
1473 return 0;
1474 }
1475
1476 /*
1477 * sync_super for 0.90.0
1478 */
1479 static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
1480 {
1481 mdp_super_t *sb;
1482 struct md_rdev *rdev2;
1483 int next_spare = mddev->raid_disks;
1484
1485 /* make rdev->sb match mddev data..
1486 *
1487 * 1/ zero out disks
1488 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
1489 * 3/ any empty disks < next_spare become removed
1490 *
1491 * disks[0] gets initialised to REMOVED because
1492 * we cannot be sure from other fields if it has
1493 * been initialised or not.
1494 */
1495 int i;
1496 int active=0, working=0,failed=0,spare=0,nr_disks=0;
1497
1498 rdev->sb_size = MD_SB_BYTES;
1499
1500 sb = page_address(rdev->sb_page);
1501
1502 memset(sb, 0, sizeof(*sb));
1503
1504 sb->md_magic = MD_SB_MAGIC;
1505 sb->major_version = mddev->major_version;
1506 sb->patch_version = mddev->patch_version;
1507 sb->gvalid_words = 0; /* ignored */
1508 memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
1509 memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
1510 memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
1511 memcpy(&sb->set_uuid3, mddev->uuid+12,4);
1512
1513 sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
1514 sb->level = mddev->level;
1515 sb->size = mddev->dev_sectors / 2;
1516 sb->raid_disks = mddev->raid_disks;
1517 sb->md_minor = mddev->md_minor;
1518 sb->not_persistent = 0;
1519 sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
1520 sb->state = 0;
1521 sb->events_hi = (mddev->events>>32);
1522 sb->events_lo = (u32)mddev->events;
1523
1524 if (mddev->reshape_position == MaxSector)
1525 sb->minor_version = 90;
1526 else {
1527 sb->minor_version = 91;
1528 sb->reshape_position = mddev->reshape_position;
1529 sb->new_level = mddev->new_level;
1530 sb->delta_disks = mddev->delta_disks;
1531 sb->new_layout = mddev->new_layout;
1532 sb->new_chunk = mddev->new_chunk_sectors << 9;
1533 }
1534 mddev->minor_version = sb->minor_version;
1535 if (mddev->in_sync)
1536 {
1537 sb->recovery_cp = mddev->recovery_cp;
1538 sb->cp_events_hi = (mddev->events>>32);
1539 sb->cp_events_lo = (u32)mddev->events;
1540 if (mddev->recovery_cp == MaxSector)
1541 sb->state = (1<< MD_SB_CLEAN);
1542 } else
1543 sb->recovery_cp = 0;
1544
1545 sb->layout = mddev->layout;
1546 sb->chunk_size = mddev->chunk_sectors << 9;
1547
1548 if (mddev->bitmap && mddev->bitmap_info.file == NULL)
1549 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1550
1551 sb->disks[0].state = (1<<MD_DISK_REMOVED);
1552 rdev_for_each(rdev2, mddev) {
1553 mdp_disk_t *d;
1554 int desc_nr;
1555 int is_active = test_bit(In_sync, &rdev2->flags);
1556
1557 if (rdev2->raid_disk >= 0 &&
1558 sb->minor_version >= 91)
1559 /* we have nowhere to store the recovery_offset,
1560 * but if it is not below the reshape_position,
1561 * we can piggy-back on that.
1562 */
1563 is_active = 1;
1564 if (rdev2->raid_disk < 0 ||
1565 test_bit(Faulty, &rdev2->flags))
1566 is_active = 0;
1567 if (is_active)
1568 desc_nr = rdev2->raid_disk;
1569 else
1570 desc_nr = next_spare++;
1571 rdev2->desc_nr = desc_nr;
1572 d = &sb->disks[rdev2->desc_nr];
1573 nr_disks++;
1574 d->number = rdev2->desc_nr;
1575 d->major = MAJOR(rdev2->bdev->bd_dev);
1576 d->minor = MINOR(rdev2->bdev->bd_dev);
1577 if (is_active)
1578 d->raid_disk = rdev2->raid_disk;
1579 else
1580 d->raid_disk = rdev2->desc_nr; /* compatibility */
1581 if (test_bit(Faulty, &rdev2->flags))
1582 d->state = (1<<MD_DISK_FAULTY);
1583 else if (is_active) {
1584 d->state = (1<<MD_DISK_ACTIVE);
1585 if (test_bit(In_sync, &rdev2->flags))
1586 d->state |= (1<<MD_DISK_SYNC);
1587 active++;
1588 working++;
1589 } else {
1590 d->state = 0;
1591 spare++;
1592 working++;
1593 }
1594 if (test_bit(WriteMostly, &rdev2->flags))
1595 d->state |= (1<<MD_DISK_WRITEMOSTLY);
1596 if (test_bit(FailFast, &rdev2->flags))
1597 d->state |= (1<<MD_DISK_FAILFAST);
1598 }
1599 /* now set the "removed" and "faulty" bits on any missing devices */
1600 for (i=0 ; i < mddev->raid_disks ; i++) {
1601 mdp_disk_t *d = &sb->disks[i];
1602 if (d->state == 0 && d->number == 0) {
1603 d->number = i;
1604 d->raid_disk = i;
1605 d->state = (1<<MD_DISK_REMOVED);
1606 d->state |= (1<<MD_DISK_FAULTY);
1607 failed++;
1608 }
1609 }
1610 sb->nr_disks = nr_disks;
1611 sb->active_disks = active;
1612 sb->working_disks = working;
1613 sb->failed_disks = failed;
1614 sb->spare_disks = spare;
1615
1616 sb->this_disk = sb->disks[rdev->desc_nr];
1617 sb->sb_csum = calc_sb_csum(sb);
1618 }
1619
1620 /*
1621 * rdev_size_change for 0.90.0
1622 */
1623 static unsigned long long
1624 super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1625 {
1626 if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1627 return 0; /* component must fit device */
1628 if (rdev->mddev->bitmap_info.offset)
1629 return 0; /* can't move bitmap */
1630 rdev->sb_start = calc_dev_sboffset(rdev);
1631 if (!num_sectors || num_sectors > rdev->sb_start)
1632 num_sectors = rdev->sb_start;
1633 /* Limit to 4TB as metadata cannot record more than that.
1634 * 4TB == 2^32 KB, or 2*2^32 sectors.
1635 */
1636 if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
1637 num_sectors = (sector_t)(2ULL << 32) - 2;
1638 do {
1639 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1640 rdev->sb_page);
1641 } while (md_super_wait(rdev->mddev) < 0);
1642 return num_sectors;
1643 }
1644
1645 static int
1646 super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
1647 {
1648 /* non-zero offset changes not possible with v0.90 */
1649 return new_offset == 0;
1650 }
1651
1652 /*
1653 * version 1 superblock
1654 */
1655
1656 static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
1657 {
1658 __le32 disk_csum;
1659 u32 csum;
1660 unsigned long long newcsum;
1661 int size = 256 + le32_to_cpu(sb->max_dev)*2;
1662 __le32 *isuper = (__le32*)sb;
1663
1664 disk_csum = sb->sb_csum;
1665 sb->sb_csum = 0;
1666 newcsum = 0;
1667 for (; size >= 4; size -= 4)
1668 newcsum += le32_to_cpu(*isuper++);
1669
1670 if (size == 2)
1671 newcsum += le16_to_cpu(*(__le16*) isuper);
1672
1673 csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1674 sb->sb_csum = disk_csum;
1675 return cpu_to_le32(csum);
1676 }
1677
1678 static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1679 {
1680 struct mdp_superblock_1 *sb;
1681 int ret;
1682 sector_t sb_start;
1683 sector_t sectors;
1684 int bmask;
1685 bool spare_disk = true;
1686
1687 /*
1688 * Calculate the position of the superblock in 512byte sectors.
1689 * It is always aligned to a 4K boundary and
1690 * depending on minor_version, it can be:
1691 * 0: At least 8K, but less than 12K, from end of device
1692 * 1: At start of device
1693 * 2: 4K from start of device.
1694 */
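/*
 * Worked example for minor_version 0 (a sketch): on a 1000000-sector
 * device, sb_start = (1000000 - 16) & ~7 = 999984, i.e. 8KiB from the end
 * of the device, rounded down to a 4KiB boundary.
 */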
1695 switch(minor_version) {
1696 case 0:
1697 sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2;
1698 sb_start &= ~(sector_t)(4*2-1);
1699 break;
1700 case 1:
1701 sb_start = 0;
1702 break;
1703 case 2:
1704 sb_start = 8;
1705 break;
1706 default:
1707 return -EINVAL;
1708 }
1709 rdev->sb_start = sb_start;
1710
1711 /* superblock is rarely larger than 1K, but it can be larger,
1712 * and it is safe to read 4k, so we do that
1713 */
1714 ret = read_disk_sb(rdev, 4096);
1715 if (ret) return ret;
1716
1717 sb = page_address(rdev->sb_page);
1718
1719 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1720 sb->major_version != cpu_to_le32(1) ||
1721 le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1722 le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1723 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1724 return -EINVAL;
1725
1726 if (calc_sb_1_csum(sb) != sb->sb_csum) {
1727 pr_warn("md: invalid superblock checksum on %pg\n",
1728 rdev->bdev);
1729 return -EINVAL;
1730 }
1731 if (le64_to_cpu(sb->data_size) < 10) {
1732 pr_warn("md: data_size too small on %pg\n",
1733 rdev->bdev);
1734 return -EINVAL;
1735 }
1736 if (sb->pad0 ||
1737 sb->pad3[0] ||
1738 memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
1739 /* Some padding is non-zero, might be a new feature */
1740 return -EINVAL;
1741
1742 rdev->preferred_minor = 0xffff;
1743 rdev->data_offset = le64_to_cpu(sb->data_offset);
1744 rdev->new_data_offset = rdev->data_offset;
1745 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
1746 (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
1747 rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
1748 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1749
1750 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1751 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1752 if (rdev->sb_size & bmask)
1753 rdev->sb_size = (rdev->sb_size | bmask) + 1;
1754
1755 if (minor_version
1756 && rdev->data_offset < sb_start + (rdev->sb_size/512))
1757 return -EINVAL;
1758 if (minor_version
1759 && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
1760 return -EINVAL;
1761
1762 rdev->desc_nr = le32_to_cpu(sb->dev_number);
1763
1764 if (!rdev->bb_page) {
1765 rdev->bb_page = alloc_page(GFP_KERNEL);
1766 if (!rdev->bb_page)
1767 return -ENOMEM;
1768 }
1769 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
1770 rdev->badblocks.count == 0) {
1771 /* need to load the bad block list.
1772 * Currently we limit it to one page.
1773 */
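/*
 * Layout decoded by the loop below: each little-endian 64-bit entry packs
 * a count in the low 10 bits and a start sector in the remaining high bits,
 * both scaled by bblog_shift; an all-ones entry terminates the list.
 */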
1774 s32 offset;
1775 sector_t bb_sector;
1776 __le64 *bbp;
1777 int i;
1778 int sectors = le16_to_cpu(sb->bblog_size);
1779 if (sectors > (PAGE_SIZE / 512))
1780 return -EINVAL;
1781 offset = le32_to_cpu(sb->bblog_offset);
1782 if (offset == 0)
1783 return -EINVAL;
1784 bb_sector = (long long)offset;
1785 if (!sync_page_io(rdev, bb_sector, sectors << 9,
1786 rdev->bb_page, REQ_OP_READ, true))
1787 return -EIO;
1788 bbp = (__le64 *)page_address(rdev->bb_page);
1789 rdev->badblocks.shift = sb->bblog_shift;
1790 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
1791 u64 bb = le64_to_cpu(*bbp);
1792 int count = bb & (0x3ff);
1793 u64 sector = bb >> 10;
1794 sector <<= sb->bblog_shift;
1795 count <<= sb->bblog_shift;
1796 if (bb + 1 == 0)
1797 break;
1798 if (!badblocks_set(&rdev->badblocks, sector, count, 1))
1799 return -EINVAL;
1800 }
1801 } else if (sb->bblog_offset != 0)
1802 rdev->badblocks.shift = 0;
1803
1804 if ((le32_to_cpu(sb->feature_map) &
1805 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) {
1806 rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
1807 rdev->ppl.size = le16_to_cpu(sb->ppl.size);
1808 rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
1809 }
1810
1811 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) &&
1812 sb->level != 0)
1813 return -EINVAL;
1814
1815 /* not spare disk */
1816 if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1817 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
1818 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
1819 spare_disk = false;
1820
1821 if (!refdev) {
1822 if (!spare_disk)
1823 ret = 1;
1824 else
1825 ret = 0;
1826 } else {
1827 __u64 ev1, ev2;
1828 struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
1829
1830 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1831 sb->level != refsb->level ||
1832 sb->layout != refsb->layout ||
1833 sb->chunksize != refsb->chunksize) {
1834 pr_warn("md: %pg has strangely different superblock to %pg\n",
1835 rdev->bdev,
1836 refdev->bdev);
1837 return -EINVAL;
1838 }
1839 ev1 = le64_to_cpu(sb->events);
1840 ev2 = le64_to_cpu(refsb->events);
1841
1842 if (!spare_disk && ev1 > ev2)
1843 ret = 1;
1844 else
1845 ret = 0;
1846 }
1847 if (minor_version)
1848 sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset;
1849 else
1850 sectors = rdev->sb_start;
1851 if (sectors < le64_to_cpu(sb->data_size))
1852 return -EINVAL;
1853 rdev->sectors = le64_to_cpu(sb->data_size);
1854 return ret;
1855 }
1856
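/*
 * Validate a freshly loaded 1.x superblock against the array: initialise
 * mddev fields on first use, enforce the event-counter rules while
 * assembling, and work out this device's role and flags.
 */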
1857 static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
1858 {
1859 struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1860 __u64 ev1 = le64_to_cpu(sb->events);
1861 int role;
1862
1863 rdev->raid_disk = -1;
1864 clear_bit(Faulty, &rdev->flags);
1865 clear_bit(In_sync, &rdev->flags);
1866 clear_bit(Bitmap_sync, &rdev->flags);
1867 clear_bit(WriteMostly, &rdev->flags);
1868
1869 if (mddev->raid_disks == 0) {
1870 mddev->major_version = 1;
1871 mddev->patch_version = 0;
1872 mddev->external = 0;
1873 mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1874 mddev->ctime = le64_to_cpu(sb->ctime);
1875 mddev->utime = le64_to_cpu(sb->utime);
1876 mddev->level = le32_to_cpu(sb->level);
1877 mddev->clevel[0] = 0;
1878 mddev->layout = le32_to_cpu(sb->layout);
1879 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1880 mddev->dev_sectors = le64_to_cpu(sb->size);
1881 mddev->events = ev1;
1882 mddev->bitmap_info.offset = 0;
1883 mddev->bitmap_info.space = 0;
1884 /* Default location for bitmap is 1K after superblock
1885 * using 3K - total of 4K
1886 */
1887 mddev->bitmap_info.default_offset = 1024 >> 9;
1888 mddev->bitmap_info.default_space = (4096-1024) >> 9;
1889 mddev->reshape_backwards = 0;
1890
1891 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1892 memcpy(mddev->uuid, sb->set_uuid, 16);
1893
1894 mddev->max_disks = (4096-256)/2;
1895
1896 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1897 mddev->bitmap_info.file == NULL) {
1898 mddev->bitmap_info.offset =
1899 (__s32)le32_to_cpu(sb->bitmap_offset);
1900 /* Metadata doesn't record how much space is available.
1901 * For 1.0, we assume we can use up to the superblock
1902 * if before, else to 4K beyond superblock.
1903 * For others, assume no change is possible.
1904 */
1905 if (mddev->minor_version > 0)
1906 mddev->bitmap_info.space = 0;
1907 else if (mddev->bitmap_info.offset > 0)
1908 mddev->bitmap_info.space =
1909 8 - mddev->bitmap_info.offset;
1910 else
1911 mddev->bitmap_info.space =
1912 -mddev->bitmap_info.offset;
1913 }
1914
1915 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1916 mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1917 mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1918 mddev->new_level = le32_to_cpu(sb->new_level);
1919 mddev->new_layout = le32_to_cpu(sb->new_layout);
1920 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1921 if (mddev->delta_disks < 0 ||
1922 (mddev->delta_disks == 0 &&
1923 (le32_to_cpu(sb->feature_map)
1924 & MD_FEATURE_RESHAPE_BACKWARDS)))
1925 mddev->reshape_backwards = 1;
1926 } else {
1927 mddev->reshape_position = MaxSector;
1928 mddev->delta_disks = 0;
1929 mddev->new_level = mddev->level;
1930 mddev->new_layout = mddev->layout;
1931 mddev->new_chunk_sectors = mddev->chunk_sectors;
1932 }
1933
1934 if (mddev->level == 0 &&
1935 !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT))
1936 mddev->layout = -1;
1937
1938 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
1939 set_bit(MD_HAS_JOURNAL, &mddev->flags);
1940
1941 if (le32_to_cpu(sb->feature_map) &
1942 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) {
1943 if (le32_to_cpu(sb->feature_map) &
1944 (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
1945 return -EINVAL;
1946 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) &&
1947 (le32_to_cpu(sb->feature_map) &
1948 MD_FEATURE_MULTIPLE_PPLS))
1949 return -EINVAL;
1950 set_bit(MD_HAS_PPL, &mddev->flags);
1951 }
1952 } else if (mddev->pers == NULL) {
1953 /* Insist on a good event counter while assembling, except for
1954 * spares (which don't need an event count).
1955 * Similar to mdadm, we allow event counter difference of 1
1956 * from the freshest device.
1957 */
1958 if (rdev->desc_nr >= 0 &&
1959 rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1960 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
1961 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
1962 if (ev1 + 1 < mddev->events)
1963 return -EINVAL;
1964 } else if (mddev->bitmap) {
1965 /* If adding to array with a bitmap, then we can accept an
1966 * older device, but not too old.
1967 */
1968 if (ev1 < md_bitmap_events_cleared(mddev))
1969 return 0;
1970 if (ev1 < mddev->events)
1971 set_bit(Bitmap_sync, &rdev->flags);
1972 } else {
1973 if (ev1 < mddev->events)
1974 /* just a hot-add of a new device, leave raid_disk at -1 */
1975 return 0;
1976 }
1977
1978 if (rdev->desc_nr < 0 ||
1979 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1980 role = MD_DISK_ROLE_SPARE;
1981 rdev->desc_nr = -1;
1982 } else if (mddev->pers == NULL && freshest && ev1 < mddev->events) {
1983 /*
1984 * If we are assembling, and our event counter is smaller than the
1985 * highest event counter, we cannot trust our superblock about the role.
1986 * It could happen that our rdev was marked as Faulty, and all other
1987 * superblocks were updated with +1 event counter.
1988 * Then, before the next superblock update, which typically happens when
1989 * remove_and_add_spares() removes the device from the array, there was
1990 * a crash or reboot.
1991 * If we allow current rdev without consulting the freshest superblock,
1992 * we could cause data corruption.
1993 * Note that in this case our event counter is smaller by 1 than the
1994 * highest, otherwise, this rdev would not be allowed into array;
1995 * both kernel and mdadm allow event counter difference of 1.
1996 */
1997 struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page);
1998 u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev);
1999
2000 if (rdev->desc_nr >= freshest_max_dev) {
2001 /* this is unexpected, better not proceed */
2002 pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n",
2003 mdname(mddev), rdev->bdev, rdev->desc_nr,
2004 freshest->bdev, freshest_max_dev);
2005 return -EUCLEAN;
2006 }
2007
2008 role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]);
2009 pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n",
2010 mdname(mddev), rdev->bdev, role, role, freshest->bdev);
2011 } else {
2012 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
2013 }
2014 switch (role) {
2015 case MD_DISK_ROLE_SPARE: /* spare */
2016 break;
2017 case MD_DISK_ROLE_FAULTY: /* faulty */
2018 set_bit(Faulty, &rdev->flags);
2019 break;
2020 case MD_DISK_ROLE_JOURNAL: /* journal device */
2021 if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
2022 /* journal device without journal feature */
2023 pr_warn("md: journal device provided without journal feature, ignoring the device\n");
2024 return -EINVAL;
2025 }
2026 set_bit(Journal, &rdev->flags);
2027 rdev->journal_tail = le64_to_cpu(sb->journal_tail);
2028 rdev->raid_disk = 0;
2029 break;
2030 default:
2031 rdev->saved_raid_disk = role;
2032 if ((le32_to_cpu(sb->feature_map) &
2033 MD_FEATURE_RECOVERY_OFFSET)) {
2034 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
2035 if (!(le32_to_cpu(sb->feature_map) &
2036 MD_FEATURE_RECOVERY_BITMAP))
2037 rdev->saved_raid_disk = -1;
2038 } else {
2039 /*
2040 * If the array is FROZEN, then the device can't
2041 * be in_sync with rest of array.
2042 */
2043 if (!test_bit(MD_RECOVERY_FROZEN,
2044 &mddev->recovery))
2045 set_bit(In_sync, &rdev->flags);
2046 }
2047 rdev->raid_disk = role;
2048 break;
2049 }
2050 if (sb->devflags & WriteMostly1)
2051 set_bit(WriteMostly, &rdev->flags);
2052 if (sb->devflags & FailFast1)
2053 set_bit(FailFast, &rdev->flags);
2054 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
2055 set_bit(Replacement, &rdev->flags);
2056
2057 return 0;
2058 }
2059
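/*
 * Refresh the in-memory 1.x superblock of @rdev so that it reflects the
 * current array state (features, reshape, bad blocks, device roles)
 * before it is written out.
 */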
2060 static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
2061 {
2062 struct mdp_superblock_1 *sb;
2063 struct md_rdev *rdev2;
2064 int max_dev, i;
2065 /* make rdev->sb match mddev and rdev data. */
2066
2067 sb = page_address(rdev->sb_page);
2068
2069 sb->feature_map = 0;
2070 sb->pad0 = 0;
2071 sb->recovery_offset = cpu_to_le64(0);
2072 memset(sb->pad3, 0, sizeof(sb->pad3));
2073
2074 sb->utime = cpu_to_le64((__u64)mddev->utime);
2075 sb->events = cpu_to_le64(mddev->events);
2076 if (mddev->in_sync)
2077 sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
2078 else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
2079 sb->resync_offset = cpu_to_le64(MaxSector);
2080 else
2081 sb->resync_offset = cpu_to_le64(0);
2082
2083 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
2084
2085 sb->raid_disks = cpu_to_le32(mddev->raid_disks);
2086 sb->size = cpu_to_le64(mddev->dev_sectors);
2087 sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
2088 sb->level = cpu_to_le32(mddev->level);
2089 sb->layout = cpu_to_le32(mddev->layout);
2090 if (test_bit(FailFast, &rdev->flags))
2091 sb->devflags |= FailFast1;
2092 else
2093 sb->devflags &= ~FailFast1;
2094
2095 if (test_bit(WriteMostly, &rdev->flags))
2096 sb->devflags |= WriteMostly1;
2097 else
2098 sb->devflags &= ~WriteMostly1;
2099 sb->data_offset = cpu_to_le64(rdev->data_offset);
2100 sb->data_size = cpu_to_le64(rdev->sectors);
2101
2102 if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
2103 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
2104 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
2105 }
2106
2107 if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
2108 !test_bit(In_sync, &rdev->flags)) {
2109 sb->feature_map |=
2110 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
2111 sb->recovery_offset =
2112 cpu_to_le64(rdev->recovery_offset);
2113 if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
2114 sb->feature_map |=
2115 cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
2116 }
2117 /* Note: recovery_offset and journal_tail share space */
2118 if (test_bit(Journal, &rdev->flags))
2119 sb->journal_tail = cpu_to_le64(rdev->journal_tail);
2120 if (test_bit(Replacement, &rdev->flags))
2121 sb->feature_map |=
2122 cpu_to_le32(MD_FEATURE_REPLACEMENT);
2123
2124 if (mddev->reshape_position != MaxSector) {
2125 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
2126 sb->reshape_position = cpu_to_le64(mddev->reshape_position);
2127 sb->new_layout = cpu_to_le32(mddev->new_layout);
2128 sb->delta_disks = cpu_to_le32(mddev->delta_disks);
2129 sb->new_level = cpu_to_le32(mddev->new_level);
2130 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
2131 if (mddev->delta_disks == 0 &&
2132 mddev->reshape_backwards)
2133 sb->feature_map
2134 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
2135 if (rdev->new_data_offset != rdev->data_offset) {
2136 sb->feature_map
2137 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
2138 sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
2139 - rdev->data_offset));
2140 }
2141 }
2142
2143 if (mddev_is_clustered(mddev))
2144 sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);
2145
2146 if (rdev->badblocks.count == 0)
2147 /* Nothing to do for bad blocks */ ;
2148 else if (sb->bblog_offset == 0)
2149 /* Cannot record bad blocks on this device */
2150 md_error(mddev, rdev);
2151 else {
2152 struct badblocks *bb = &rdev->badblocks;
2153 __le64 *bbp = (__le64 *)page_address(rdev->bb_page);
2154 u64 *p = bb->page;
2155 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
2156 if (bb->changed) {
2157 unsigned seq;
2158
2159 retry:
2160 seq = read_seqbegin(&bb->lock);
2161
2162 memset(bbp, 0xff, PAGE_SIZE);
2163
2164 for (i = 0 ; i < bb->count ; i++) {
2165 u64 internal_bb = p[i];
2166 u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
2167 | BB_LEN(internal_bb));
2168 bbp[i] = cpu_to_le64(store_bb);
2169 }
2170 bb->changed = 0;
2171 if (read_seqretry(&bb->lock, seq))
2172 goto retry;
2173
2174 bb->sector = (rdev->sb_start +
2175 (int)le32_to_cpu(sb->bblog_offset));
2176 bb->size = le16_to_cpu(sb->bblog_size);
2177 }
2178 }
2179
2180 max_dev = 0;
2181 rdev_for_each(rdev2, mddev)
2182 if (rdev2->desc_nr+1 > max_dev)
2183 max_dev = rdev2->desc_nr+1;
2184
2185 if (max_dev > le32_to_cpu(sb->max_dev)) {
2186 int bmask;
2187 sb->max_dev = cpu_to_le32(max_dev);
2188 rdev->sb_size = max_dev * 2 + 256;
2189 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
2190 if (rdev->sb_size & bmask)
2191 rdev->sb_size = (rdev->sb_size | bmask) + 1;
2192 } else
2193 max_dev = le32_to_cpu(sb->max_dev);
2194
2195 for (i=0; i<max_dev;i++)
2196 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
2197
2198 if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
2199 sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
2200
2201 if (test_bit(MD_HAS_PPL, &mddev->flags)) {
2202 if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags))
2203 sb->feature_map |=
2204 cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS);
2205 else
2206 sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
2207 sb->ppl.offset = cpu_to_le16(rdev->ppl.offset);
2208 sb->ppl.size = cpu_to_le16(rdev->ppl.size);
2209 }
2210
2211 rdev_for_each(rdev2, mddev) {
2212 i = rdev2->desc_nr;
2213 if (test_bit(Faulty, &rdev2->flags))
2214 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
2215 else if (test_bit(In_sync, &rdev2->flags))
2216 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
2217 else if (test_bit(Journal, &rdev2->flags))
2218 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
2219 else if (rdev2->raid_disk >= 0)
2220 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
2221 else
2222 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
2223 }
2224
2225 sb->sb_csum = calc_sb_1_csum(sb);
2226 }
2227
2228 static sector_t super_1_choose_bm_space(sector_t dev_size)
2229 {
2230 sector_t bm_space;
2231
2232 /* if the device is bigger than 8Gig, save 64k for bitmap
2233 * usage, if bigger than 200Gig, save 128k
2234 */
2235 if (dev_size < 64*2)
2236 bm_space = 0;
2237 else if (dev_size - 64*2 >= 200*1024*1024*2)
2238 bm_space = 128*2;
2239 else if (dev_size - 4*2 > 8*1024*1024*2)
2240 bm_space = 64*2;
2241 else
2242 bm_space = 4*2;
2243 return bm_space;
2244 }
2245
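/*
 * Resize @rdev to @num_sectors, or to the largest size that fits if
 * @num_sectors is zero.  For 1.0 metadata the superblock is relocated to
 * the new end of the device.  Returns the new size, or 0 if the change
 * is not possible.
 */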
2246 static unsigned long long
2247 super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
2248 {
2249 struct mdp_superblock_1 *sb;
2250 sector_t max_sectors;
2251 if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
2252 return 0; /* component must fit device */
2253 if (rdev->data_offset != rdev->new_data_offset)
2254 return 0; /* too confusing */
2255 if (rdev->sb_start < rdev->data_offset) {
2256 /* minor versions 1 and 2; superblock before data */
2257 max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset;
2258 if (!num_sectors || num_sectors > max_sectors)
2259 num_sectors = max_sectors;
2260 } else if (rdev->mddev->bitmap_info.offset) {
2261 /* minor version 0 with bitmap we can't move */
2262 return 0;
2263 } else {
2264 /* minor version 0; superblock after data */
2265 sector_t sb_start, bm_space;
2266 sector_t dev_size = bdev_nr_sectors(rdev->bdev);
2267
2268 /* 8K is for superblock */
2269 sb_start = dev_size - 8*2;
2270 sb_start &= ~(sector_t)(4*2 - 1);
2271
2272 bm_space = super_1_choose_bm_space(dev_size);
2273
2274 /* Space that can be used to store data needs to exclude the
2275 * superblock, bitmap space and bad block space (4K)
2276 */
2277 max_sectors = sb_start - bm_space - 4*2;
2278
2279 if (!num_sectors || num_sectors > max_sectors)
2280 num_sectors = max_sectors;
2281 rdev->sb_start = sb_start;
2282 }
2283 sb = page_address(rdev->sb_page);
2284 sb->data_size = cpu_to_le64(num_sectors);
2285 sb->super_offset = cpu_to_le64(rdev->sb_start);
2286 sb->sb_csum = calc_sb_1_csum(sb);
2287 do {
2288 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
2289 rdev->sb_page);
2290 } while (md_super_wait(rdev->mddev) < 0);
2291 return num_sectors;
2292
2293 }
2294
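/*
 * Check that moving the data of @rdev to @new_offset would not step on
 * 1.x metadata: the superblock, the bad block log or an internal bitmap.
 */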
2295 static int
2296 super_1_allow_new_offset(struct md_rdev *rdev,
2297 unsigned long long new_offset)
2298 {
2299 /* All necessary checks on new >= old have been done */
2300 if (new_offset >= rdev->data_offset)
2301 return 1;
2302
2303 /* with 1.0 metadata, there is no metadata to tread on
2304 * so we can always move back */
2305 if (rdev->mddev->minor_version == 0)
2306 return 1;
2307
2308 /* otherwise we must be sure not to step on
2309 * any metadata, so stay:
2310 * 36K beyond start of superblock
2311 * beyond end of badblocks
2312 * beyond write-intent bitmap
2313 */
2314 if (rdev->sb_start + (32+4)*2 > new_offset)
2315 return 0;
2316
2317 if (!rdev->mddev->bitmap_info.file) {
2318 struct mddev *mddev = rdev->mddev;
2319 struct md_bitmap_stats stats;
2320 int err;
2321
2322 err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
2323 if (!err && rdev->sb_start + mddev->bitmap_info.offset +
2324 stats.file_pages * (PAGE_SIZE >> 9) > new_offset)
2325 return 0;
2326 }
2327
2328 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
2329 return 0;
2330
2331 return 1;
2332 }
2333
2334 static struct super_type super_types[] = {
2335 [0] = {
2336 .name = "0.90.0",
2337 .owner = THIS_MODULE,
2338 .load_super = super_90_load,
2339 .validate_super = super_90_validate,
2340 .sync_super = super_90_sync,
2341 .rdev_size_change = super_90_rdev_size_change,
2342 .allow_new_offset = super_90_allow_new_offset,
2343 },
2344 [1] = {
2345 .name = "md-1",
2346 .owner = THIS_MODULE,
2347 .load_super = super_1_load,
2348 .validate_super = super_1_validate,
2349 .sync_super = super_1_sync,
2350 .rdev_size_change = super_1_rdev_size_change,
2351 .allow_new_offset = super_1_allow_new_offset,
2352 },
2353 };
2354
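/*
 * Update the in-memory superblock of @rdev, either through the mddev's
 * own sync_super hook (if set, e.g. by dm-raid) or through the handler
 * for the array's metadata version.
 */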
2355 static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
2356 {
2357 if (mddev->sync_super) {
2358 mddev->sync_super(mddev, rdev);
2359 return;
2360 }
2361
2362 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));
2363
2364 super_types[mddev->major_version].sync_super(mddev, rdev);
2365 }
2366
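/*
 * Return 1 if the two arrays share an active component device (the same
 * gendisk), 0 otherwise.
 */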
2367 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
2368 {
2369 struct md_rdev *rdev, *rdev2;
2370
2371 rcu_read_lock();
2372 rdev_for_each_rcu(rdev, mddev1) {
2373 if (test_bit(Faulty, &rdev->flags) ||
2374 test_bit(Journal, &rdev->flags) ||
2375 rdev->raid_disk == -1)
2376 continue;
2377 rdev_for_each_rcu(rdev2, mddev2) {
2378 if (test_bit(Faulty, &rdev2->flags) ||
2379 test_bit(Journal, &rdev2->flags) ||
2380 rdev2->raid_disk == -1)
2381 continue;
2382 if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) {
2383 rcu_read_unlock();
2384 return 1;
2385 }
2386 }
2387 }
2388 rcu_read_unlock();
2389 return 0;
2390 }
2391
2392 static LIST_HEAD(pending_raid_disks);
2393
2394 /*
2395 * Try to register data integrity profile for an mddev
2396 *
2397 * This is called when an array is started and after a disk has been kicked
2398 * from the array. It only succeeds if all working and active component devices
2399 * are integrity capable with matching profiles.
2400 */
2401 int md_integrity_register(struct mddev *mddev)
2402 {
2403 if (list_empty(&mddev->disks))
2404 return 0; /* nothing to do */
2405 if (mddev_is_dm(mddev) || !blk_get_integrity(mddev->gendisk))
2406 return 0; /* shouldn't register */
2407
2408 pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
2409 return 0;
2410 }
2411 EXPORT_SYMBOL(md_integrity_register);
2412
2413 static bool rdev_read_only(struct md_rdev *rdev)
2414 {
2415 return bdev_read_only(rdev->bdev) ||
2416 (rdev->meta_bdev && bdev_read_only(rdev->meta_bdev));
2417 }
2418
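/*
 * Attach @rdev to @mddev: reject duplicates and undersized devices, pick
 * a unique desc_nr, create the sysfs entries and add the device to
 * mddev->disks.
 */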
2419 static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
2420 {
2421 char b[BDEVNAME_SIZE];
2422 int err;
2423
2424 /* prevent duplicates */
2425 if (find_rdev(mddev, rdev->bdev->bd_dev))
2426 return -EEXIST;
2427
2428 if (rdev_read_only(rdev) && mddev->pers)
2429 return -EROFS;
2430
2431 /* make sure rdev->sectors exceeds mddev->dev_sectors */
2432 if (!test_bit(Journal, &rdev->flags) &&
2433 rdev->sectors &&
2434 (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) {
2435 if (mddev->pers) {
2436 /* Cannot change size, so fail
2437 * If mddev->level <= 0, then we don't care
2438 * about aligning sizes (e.g. linear)
2439 */
2440 if (mddev->level > 0)
2441 return -ENOSPC;
2442 } else
2443 mddev->dev_sectors = rdev->sectors;
2444 }
2445
2446 /* Verify rdev->desc_nr is unique.
2447 * If it is -1, assign a free number, else
2448 * check number is not in use
2449 */
2450 rcu_read_lock();
2451 if (rdev->desc_nr < 0) {
2452 int choice = 0;
2453 if (mddev->pers)
2454 choice = mddev->raid_disks;
2455 while (md_find_rdev_nr_rcu(mddev, choice))
2456 choice++;
2457 rdev->desc_nr = choice;
2458 } else {
2459 if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
2460 rcu_read_unlock();
2461 return -EBUSY;
2462 }
2463 }
2464 rcu_read_unlock();
2465 if (!test_bit(Journal, &rdev->flags) &&
2466 mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
2467 pr_warn("md: %s: array is limited to %d devices\n",
2468 mdname(mddev), mddev->max_disks);
2469 return -EBUSY;
2470 }
2471 snprintf(b, sizeof(b), "%pg", rdev->bdev);
2472 strreplace(b, '/', '!');
2473
2474 rdev->mddev = mddev;
2475 pr_debug("md: bind<%s>\n", b);
2476
2477 if (mddev->raid_disks)
2478 mddev_create_serial_pool(mddev, rdev);
2479
2480 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
2481 goto fail;
2482
2483 /* failure here is OK */
2484 err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block");
2485 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
2486 rdev->sysfs_unack_badblocks =
2487 sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks");
2488 rdev->sysfs_badblocks =
2489 sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks");
2490
2491 list_add_rcu(&rdev->same_set, &mddev->disks);
2492 bd_link_disk_holder(rdev->bdev, mddev->gendisk);
2493
2494 /* May as well allow recovery to be retried once */
2495 mddev->recovery_disabled++;
2496
2497 return 0;
2498
2499 fail:
2500 pr_warn("md: failed to register dev-%s for %s\n",
2501 b, mdname(mddev));
2502 mddev_destroy_serial_pool(mddev, rdev);
2503 return err;
2504 }
2505
2506 void md_autodetect_dev(dev_t dev);
2507
2508 /* just for claiming the bdev */
2509 static struct md_rdev claim_rdev;
2510
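/* Final teardown of an rdev that is no longer part of any array. */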
2511 static void export_rdev(struct md_rdev *rdev, struct mddev *mddev)
2512 {
2513 pr_debug("md: export_rdev(%pg)\n", rdev->bdev);
2514 md_rdev_clear(rdev);
2515 #ifndef MODULE
2516 if (test_bit(AutoDetected, &rdev->flags))
2517 md_autodetect_dev(rdev->bdev->bd_dev);
2518 #endif
2519 fput(rdev->bdev_file);
2520 rdev->bdev = NULL;
2521 kobject_put(&rdev->kobj);
2522 }
2523
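/*
 * Remove @rdev from its array: unlink it from mddev->disks and sysfs and
 * put it on mddev->deleting so that the final kobject_del() happens after
 * reconfig_mutex has been dropped.
 */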
2524 static void md_kick_rdev_from_array(struct md_rdev *rdev)
2525 {
2526 struct mddev *mddev = rdev->mddev;
2527
2528 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
2529 list_del_rcu(&rdev->same_set);
2530 pr_debug("md: unbind<%pg>\n", rdev->bdev);
2531 mddev_destroy_serial_pool(rdev->mddev, rdev);
2532 WRITE_ONCE(rdev->mddev, NULL);
2533 sysfs_remove_link(&rdev->kobj, "block");
2534 sysfs_put(rdev->sysfs_state);
2535 sysfs_put(rdev->sysfs_unack_badblocks);
2536 sysfs_put(rdev->sysfs_badblocks);
2537 rdev->sysfs_state = NULL;
2538 rdev->sysfs_unack_badblocks = NULL;
2539 rdev->sysfs_badblocks = NULL;
2540 rdev->badblocks.count = 0;
2541
2542 synchronize_rcu();
2543
2544 /*
2545 * kobject_del() will wait for all in progress writers to be done, where
2546 * reconfig_mutex is held, hence it can't be called under
2547 * reconfig_mutex and it's delayed to mddev_unlock().
2548 */
2549 list_add(&rdev->same_set, &mddev->deleting);
2550 }
2551
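/* Kick every remaining device out of the array and clear raid_disks/major_version. */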
2552 static void export_array(struct mddev *mddev)
2553 {
2554 struct md_rdev *rdev;
2555
2556 while (!list_empty(&mddev->disks)) {
2557 rdev = list_first_entry(&mddev->disks, struct md_rdev,
2558 same_set);
2559 md_kick_rdev_from_array(rdev);
2560 }
2561 mddev->raid_disks = 0;
2562 mddev->major_version = 0;
2563 }
2564
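/*
 * Try to mark the array clean: switch writes_pending to atomic mode and,
 * if no writes are pending, set ->in_sync and flag the superblock for an
 * update.  Must be called with mddev->lock held; returns the resulting
 * ->in_sync value.
 */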
2565 static bool set_in_sync(struct mddev *mddev)
2566 {
2567 lockdep_assert_held(&mddev->lock);
2568 if (!mddev->in_sync) {
2569 mddev->sync_checkers++;
2570 spin_unlock(&mddev->lock);
2571 percpu_ref_switch_to_atomic_sync(&mddev->writes_pending);
2572 spin_lock(&mddev->lock);
2573 if (!mddev->in_sync &&
2574 percpu_ref_is_zero(&mddev->writes_pending)) {
2575 mddev->in_sync = 1;
2576 /*
2577 * Ensure ->in_sync is visible before we clear
2578 * ->sync_checkers.
2579 */
2580 smp_mb();
2581 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
2582 sysfs_notify_dirent_safe(mddev->sysfs_state);
2583 }
2584 if (--mddev->sync_checkers == 0)
2585 percpu_ref_switch_to_percpu(&mddev->writes_pending);
2586 }
2587 if (mddev->safemode == 1)
2588 mddev->safemode = 0;
2589 return mddev->in_sync;
2590 }
2591
2592 static void sync_sbs(struct mddev *mddev, int nospares)
2593 {
2594 /* Update each superblock (in-memory image), but
2595 * if we are allowed to, skip spares which already
2596 * have the right event counter, or have one earlier
2597 * (which would mean they aren't being marked as dirty
2598 * with the rest of the array)
2599 */
2600 struct md_rdev *rdev;
2601 rdev_for_each(rdev, mddev) {
2602 if (rdev->sb_events == mddev->events ||
2603 (nospares &&
2604 rdev->raid_disk < 0 &&
2605 rdev->sb_events+1 == mddev->events)) {
2606 /* Don't update this superblock */
2607 rdev->sb_loaded = 2;
2608 } else {
2609 sync_super(mddev, rdev);
2610 rdev->sb_loaded = 1;
2611 }
2612 }
2613 }
2614
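/*
 * Return true if any device role or array parameter recorded in the
 * on-disk superblock differs from the current in-memory state, i.e. the
 * metadata actually needs to be rewritten.
 */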
2615 static bool does_sb_need_changing(struct mddev *mddev)
2616 {
2617 struct md_rdev *rdev = NULL, *iter;
2618 struct mdp_superblock_1 *sb;
2619 int role;
2620
2621 /* Find a good rdev */
2622 rdev_for_each(iter, mddev)
2623 if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) {
2624 rdev = iter;
2625 break;
2626 }
2627
2628 /* No good device found. */
2629 if (!rdev)
2630 return false;
2631
2632 sb = page_address(rdev->sb_page);
2633 /* Check if a device has become faulty or a spare become active */
2634 rdev_for_each(rdev, mddev) {
2635 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
2636 /* Device activated? */
2637 if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 &&
2638 !test_bit(Faulty, &rdev->flags))
2639 return true;
2640 /* Device turned faulty? */
2641 if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX))
2642 return true;
2643 }
2644
2645 /* Check if any mddev parameters have changed */
2646 if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
2647 (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
2648 (mddev->layout != le32_to_cpu(sb->layout)) ||
2649 (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
2650 (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
2651 return true;
2652
2653 return false;
2654 }
2655
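/*
 * Synchronise the on-disk metadata of all member devices with the current
 * array state: bump (or roll back) the event counter, rewrite each
 * superblock and the bitmap superblock, and retry until the update is
 * stable on disk.
 */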
2656 void md_update_sb(struct mddev *mddev, int force_change)
2657 {
2658 struct md_rdev *rdev;
2659 int sync_req;
2660 int nospares = 0;
2661 int any_badblocks_changed = 0;
2662 int ret = -1;
2663
2664 if (!md_is_rdwr(mddev)) {
2665 if (force_change)
2666 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2667 return;
2668 }
2669
2670 repeat:
2671 if (mddev_is_clustered(mddev)) {
2672 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2673 force_change = 1;
2674 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
2675 nospares = 1;
2676 ret = mddev->cluster_ops->metadata_update_start(mddev);
2677 /* Has someone else updated the sb? */
2678 if (!does_sb_need_changing(mddev)) {
2679 if (ret == 0)
2680 mddev->cluster_ops->metadata_update_cancel(mddev);
2681 bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2682 BIT(MD_SB_CHANGE_DEVS) |
2683 BIT(MD_SB_CHANGE_CLEAN));
2684 return;
2685 }
2686 }
2687
2688 /*
2689 * First make sure individual recovery_offsets are correct.
2690 * curr_resync_completed can only be used during recovery.
2691 * During reshape/resync it might use array-addresses rather
2692 * than device addresses.
2693 */
2694 rdev_for_each(rdev, mddev) {
2695 if (rdev->raid_disk >= 0 &&
2696 mddev->delta_disks >= 0 &&
2697 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
2698 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
2699 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
2700 !test_bit(Journal, &rdev->flags) &&
2701 !test_bit(In_sync, &rdev->flags) &&
2702 mddev->curr_resync_completed > rdev->recovery_offset)
2703 rdev->recovery_offset = mddev->curr_resync_completed;
2704
2705 }
2706 if (!mddev->persistent) {
2707 clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
2708 clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2709 if (!mddev->external) {
2710 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
2711 rdev_for_each(rdev, mddev) {
2712 if (rdev->badblocks.changed) {
2713 rdev->badblocks.changed = 0;
2714 ack_all_badblocks(&rdev->badblocks);
2715 md_error(mddev, rdev);
2716 }
2717 clear_bit(Blocked, &rdev->flags);
2718 clear_bit(BlockedBadBlocks, &rdev->flags);
2719 wake_up(&rdev->blocked_wait);
2720 }
2721 }
2722 wake_up(&mddev->sb_wait);
2723 return;
2724 }
2725
2726 spin_lock(&mddev->lock);
2727
2728 mddev->utime = ktime_get_real_seconds();
2729
2730 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2731 force_change = 1;
2732 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
2733 /* just a clean <-> dirty transition, possibly leave spares alone,
2734 * though if events isn't the right even/odd, we will have to do
2735 * spares after all
2736 */
2737 nospares = 1;
2738 if (force_change)
2739 nospares = 0;
2740 if (mddev->degraded)
2741 /* If the array is degraded, then skipping spares is both
2742 * dangerous and fairly pointless.
2743 * Dangerous because a device that was removed from the array
2744 * might have an event_count that still looks up-to-date,
2745 * so it can be re-added without a resync.
2746 * Pointless because if there are any spares to skip,
2747 * then a recovery will happen and soon that array won't
2748 * be degraded any more and the spare can go back to sleep then.
2749 */
2750 nospares = 0;
2751
2752 sync_req = mddev->in_sync;
2753
2754 /* If this is just a dirty<->clean transition, and the array is clean
2755 * and 'events' is odd, we can roll back to the previous clean state */
2756 if (nospares
2757 && (mddev->in_sync && mddev->recovery_cp == MaxSector)
2758 && mddev->can_decrease_events
2759 && mddev->events != 1) {
2760 mddev->events--;
2761 mddev->can_decrease_events = 0;
2762 } else {
2763 /* otherwise we have to go forward and ... */
2764 mddev->events ++;
2765 mddev->can_decrease_events = nospares;
2766 }
2767
2768 /*
2769 * This 64-bit counter should never wrap.
2770 * Either we are in around ~1 trillion A.C., assuming
2771 * 1 reboot per second, or we have a bug...
2772 */
2773 WARN_ON(mddev->events == 0);
2774
2775 rdev_for_each(rdev, mddev) {
2776 if (rdev->badblocks.changed)
2777 any_badblocks_changed++;
2778 if (test_bit(Faulty, &rdev->flags))
2779 set_bit(FaultRecorded, &rdev->flags);
2780 }
2781
2782 sync_sbs(mddev, nospares);
2783 spin_unlock(&mddev->lock);
2784
2785 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
2786 mdname(mddev), mddev->in_sync);
2787
2788 mddev_add_trace_msg(mddev, "md md_update_sb");
2789 rewrite:
2790 mddev->bitmap_ops->update_sb(mddev->bitmap);
2791 rdev_for_each(rdev, mddev) {
2792 if (rdev->sb_loaded != 1)
2793 continue; /* no noise on spare devices */
2794
2795 if (!test_bit(Faulty, &rdev->flags)) {
2796 md_super_write(mddev,rdev,
2797 rdev->sb_start, rdev->sb_size,
2798 rdev->sb_page);
2799 pr_debug("md: (write) %pg's sb offset: %llu\n",
2800 rdev->bdev,
2801 (unsigned long long)rdev->sb_start);
2802 rdev->sb_events = mddev->events;
2803 if (rdev->badblocks.size) {
2804 md_super_write(mddev, rdev,
2805 rdev->badblocks.sector,
2806 rdev->badblocks.size << 9,
2807 rdev->bb_page);
2808 rdev->badblocks.size = 0;
2809 }
2810
2811 } else
2812 pr_debug("md: %pg (skipping faulty)\n",
2813 rdev->bdev);
2814 }
2815 if (md_super_wait(mddev) < 0)
2816 goto rewrite;
2817 /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */
2818
2819 if (mddev_is_clustered(mddev) && ret == 0)
2820 mddev->cluster_ops->metadata_update_finish(mddev);
2821
2822 if (mddev->in_sync != sync_req ||
2823 !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2824 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)))
2825 /* have to write it out again */
2826 goto repeat;
2827 wake_up(&mddev->sb_wait);
2828 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2829 sysfs_notify_dirent_safe(mddev->sysfs_completed);
2830
2831 rdev_for_each(rdev, mddev) {
2832 if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2833 clear_bit(Blocked, &rdev->flags);
2834
2835 if (any_badblocks_changed)
2836 ack_all_badblocks(&rdev->badblocks);
2837 clear_bit(BlockedBadBlocks, &rdev->flags);
2838 wake_up(&rdev->blocked_wait);
2839 }
2840 }
2841 EXPORT_SYMBOL(md_update_sb);
2842
2843 static int add_bound_rdev(struct md_rdev *rdev)
2844 {
2845 struct mddev *mddev = rdev->mddev;
2846 int err = 0;
2847 bool add_journal = test_bit(Journal, &rdev->flags);
2848
2849 if (!mddev->pers->hot_remove_disk || add_journal) {
2850 /* If there is hot_add_disk but no hot_remove_disk
2851 * then added disks are for geometry changes,
2852 * and should be added immediately.
2853 */
2854 super_types[mddev->major_version].
2855 validate_super(mddev, NULL/*freshest*/, rdev);
2856 err = mddev->pers->hot_add_disk(mddev, rdev);
2857 if (err) {
2858 md_kick_rdev_from_array(rdev);
2859 return err;
2860 }
2861 }
2862 sysfs_notify_dirent_safe(rdev->sysfs_state);
2863
2864 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2865 if (mddev->degraded)
2866 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
2867 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2868 md_new_event();
2869 return 0;
2870 }
2871
2872 /* words written to sysfs files may, or may not, be \n terminated.
2873 * We want to accept either case. For this we use cmd_match.
2874 */
2875 static int cmd_match(const char *cmd, const char *str)
2876 {
2877 /* See if cmd, written into a sysfs file, matches
2878 * str. They must either be the same, or cmd can
2879 * have a trailing newline
2880 */
2881 while (*cmd && *str && *cmd == *str) {
2882 cmd++;
2883 str++;
2884 }
2885 if (*cmd == '\n')
2886 cmd++;
2887 if (*str || *cmd)
2888 return 0;
2889 return 1;
2890 }
2891
2892 struct rdev_sysfs_entry {
2893 struct attribute attr;
2894 ssize_t (*show)(struct md_rdev *, char *);
2895 ssize_t (*store)(struct md_rdev *, const char *, size_t);
2896 };
2897
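/* Show the comma-separated list of state flags of this member device. */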
2898 static ssize_t
2899 state_show(struct md_rdev *rdev, char *page)
2900 {
2901 char *sep = ",";
2902 size_t len = 0;
2903 unsigned long flags = READ_ONCE(rdev->flags);
2904
2905 if (test_bit(Faulty, &flags) ||
2906 (!test_bit(ExternalBbl, &flags) &&
2907 rdev->badblocks.unacked_exist))
2908 len += sprintf(page+len, "faulty%s", sep);
2909 if (test_bit(In_sync, &flags))
2910 len += sprintf(page+len, "in_sync%s", sep);
2911 if (test_bit(Journal, &flags))
2912 len += sprintf(page+len, "journal%s", sep);
2913 if (test_bit(WriteMostly, &flags))
2914 len += sprintf(page+len, "write_mostly%s", sep);
2915 if (test_bit(Blocked, &flags) ||
2916 (rdev->badblocks.unacked_exist
2917 && !test_bit(Faulty, &flags)))
2918 len += sprintf(page+len, "blocked%s", sep);
2919 if (!test_bit(Faulty, &flags) &&
2920 !test_bit(Journal, &flags) &&
2921 !test_bit(In_sync, &flags))
2922 len += sprintf(page+len, "spare%s", sep);
2923 if (test_bit(WriteErrorSeen, &flags))
2924 len += sprintf(page+len, "write_error%s", sep);
2925 if (test_bit(WantReplacement, &flags))
2926 len += sprintf(page+len, "want_replacement%s", sep);
2927 if (test_bit(Replacement, &flags))
2928 len += sprintf(page+len, "replacement%s", sep);
2929 if (test_bit(ExternalBbl, &flags))
2930 len += sprintf(page+len, "external_bbl%s", sep);
2931 if (test_bit(FailFast, &flags))
2932 len += sprintf(page+len, "failfast%s", sep);
2933
2934 if (len)
2935 len -= strlen(sep);
2936
2937 return len+sprintf(page+len, "\n");
2938 }
2939
2940 static ssize_t
2941 state_store(struct md_rdev *rdev, const char *buf, size_t len)
2942 {
2943 /* can write
2944 * faulty - simulates an error
2945 * remove - disconnects the device
2946 * writemostly - sets write_mostly
2947 * -writemostly - clears write_mostly
2948 * blocked - sets the Blocked flag
2949 * -blocked - clears the Blocked and possibly simulates an error
2950 * insync - sets Insync providing device isn't active
2951 * -insync - clear Insync for a device with a slot assigned,
2952 * so that it gets rebuilt based on bitmap
2953 * write_error - sets WriteErrorSeen
2954 * -write_error - clears WriteErrorSeen
2955 * {,-}failfast - set/clear FailFast
2956 */
2957
2958 struct mddev *mddev = rdev->mddev;
2959 int err = -EINVAL;
2960 bool need_update_sb = false;
2961
2962 if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
2963 md_error(rdev->mddev, rdev);
2964
2965 if (test_bit(MD_BROKEN, &rdev->mddev->flags))
2966 err = -EBUSY;
2967 else
2968 err = 0;
2969 } else if (cmd_match(buf, "remove")) {
2970 if (rdev->mddev->pers) {
2971 clear_bit(Blocked, &rdev->flags);
2972 remove_and_add_spares(rdev->mddev, rdev);
2973 }
2974 if (rdev->raid_disk >= 0)
2975 err = -EBUSY;
2976 else {
2977 err = 0;
2978 if (mddev_is_clustered(mddev))
2979 err = mddev->cluster_ops->remove_disk(mddev, rdev);
2980
2981 if (err == 0) {
2982 md_kick_rdev_from_array(rdev);
2983 if (mddev->pers)
2984 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2985 md_new_event();
2986 }
2987 }
2988 } else if (cmd_match(buf, "writemostly")) {
2989 set_bit(WriteMostly, &rdev->flags);
2990 mddev_create_serial_pool(rdev->mddev, rdev);
2991 need_update_sb = true;
2992 err = 0;
2993 } else if (cmd_match(buf, "-writemostly")) {
2994 mddev_destroy_serial_pool(rdev->mddev, rdev);
2995 clear_bit(WriteMostly, &rdev->flags);
2996 need_update_sb = true;
2997 err = 0;
2998 } else if (cmd_match(buf, "blocked")) {
2999 set_bit(Blocked, &rdev->flags);
3000 err = 0;
3001 } else if (cmd_match(buf, "-blocked")) {
3002 if (!test_bit(Faulty, &rdev->flags) &&
3003 !test_bit(ExternalBbl, &rdev->flags) &&
3004 rdev->badblocks.unacked_exist) {
3005 /* metadata handler doesn't understand badblocks,
3006 * so we need to fail the device
3007 */
3008 md_error(rdev->mddev, rdev);
3009 }
3010 clear_bit(Blocked, &rdev->flags);
3011 clear_bit(BlockedBadBlocks, &rdev->flags);
3012 wake_up(&rdev->blocked_wait);
3013 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
3014
3015 err = 0;
3016 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
3017 set_bit(In_sync, &rdev->flags);
3018 err = 0;
3019 } else if (cmd_match(buf, "failfast")) {
3020 set_bit(FailFast, &rdev->flags);
3021 need_update_sb = true;
3022 err = 0;
3023 } else if (cmd_match(buf, "-failfast")) {
3024 clear_bit(FailFast, &rdev->flags);
3025 need_update_sb = true;
3026 err = 0;
3027 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
3028 !test_bit(Journal, &rdev->flags)) {
3029 if (rdev->mddev->pers == NULL) {
3030 clear_bit(In_sync, &rdev->flags);
3031 rdev->saved_raid_disk = rdev->raid_disk;
3032 rdev->raid_disk = -1;
3033 err = 0;
3034 }
3035 } else if (cmd_match(buf, "write_error")) {
3036 set_bit(WriteErrorSeen, &rdev->flags);
3037 err = 0;
3038 } else if (cmd_match(buf, "-write_error")) {
3039 clear_bit(WriteErrorSeen, &rdev->flags);
3040 err = 0;
3041 } else if (cmd_match(buf, "want_replacement")) {
3042 /* Any non-spare device that is not a replacement can
3043 * become want_replacement at any time, but we then need to
3044 * check if recovery is needed.
3045 */
3046 if (rdev->raid_disk >= 0 &&
3047 !test_bit(Journal, &rdev->flags) &&
3048 !test_bit(Replacement, &rdev->flags))
3049 set_bit(WantReplacement, &rdev->flags);
3050 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
3051 err = 0;
3052 } else if (cmd_match(buf, "-want_replacement")) {
3053 /* Clearing 'want_replacement' is always allowed.
3054 * Once replacement starts it is too late though.
3055 */
3056 err = 0;
3057 clear_bit(WantReplacement, &rdev->flags);
3058 } else if (cmd_match(buf, "replacement")) {
3059 /* Can only set a device as a replacement when array has not
3060 * yet been started. Once running, replacement is automatic
3061 * from spares, or by assigning 'slot'.
3062 */
3063 if (rdev->mddev->pers)
3064 err = -EBUSY;
3065 else {
3066 set_bit(Replacement, &rdev->flags);
3067 err = 0;
3068 }
3069 } else if (cmd_match(buf, "-replacement")) {
3070 /* Similarly, can only clear Replacement before start */
3071 if (rdev->mddev->pers)
3072 err = -EBUSY;
3073 else {
3074 clear_bit(Replacement, &rdev->flags);
3075 err = 0;
3076 }
3077 } else if (cmd_match(buf, "re-add")) {
3078 if (!rdev->mddev->pers)
3079 err = -EINVAL;
3080 else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) &&
3081 rdev->saved_raid_disk >= 0) {
3082 /* clear_bit is performed _after_ all the devices
3083 * have their local Faulty bit cleared. If any writes
3084 * happen in the meantime in the local node, they
3085 * will land in the local bitmap, which will be synced
3086 * by this node eventually
3087 */
3088 if (!mddev_is_clustered(rdev->mddev) ||
3089 (err = mddev->cluster_ops->gather_bitmaps(rdev)) == 0) {
3090 clear_bit(Faulty, &rdev->flags);
3091 err = add_bound_rdev(rdev);
3092 }
3093 } else
3094 err = -EBUSY;
3095 } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) {
3096 set_bit(ExternalBbl, &rdev->flags);
3097 rdev->badblocks.shift = 0;
3098 err = 0;
3099 } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) {
3100 clear_bit(ExternalBbl, &rdev->flags);
3101 err = 0;
3102 }
3103 if (need_update_sb)
3104 md_update_sb(mddev, 1);
3105 if (!err)
3106 sysfs_notify_dirent_safe(rdev->sysfs_state);
3107 return err ? err : len;
3108 }
3109 static struct rdev_sysfs_entry rdev_state =
3110 __ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store);
3111
3112 static ssize_t
3113 errors_show(struct md_rdev *rdev, char *page)
3114 {
3115 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
3116 }
3117
3118 static ssize_t
3119 errors_store(struct md_rdev *rdev, const char *buf, size_t len)
3120 {
3121 unsigned int n;
3122 int rv;
3123
3124 rv = kstrtouint(buf, 10, &n);
3125 if (rv < 0)
3126 return rv;
3127 atomic_set(&rdev->corrected_errors, n);
3128 return len;
3129 }
3130 static struct rdev_sysfs_entry rdev_errors =
3131 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
3132
3133 static ssize_t
3134 slot_show(struct md_rdev *rdev, char *page)
3135 {
3136 if (test_bit(Journal, &rdev->flags))
3137 return sprintf(page, "journal\n");
3138 else if (rdev->raid_disk < 0)
3139 return sprintf(page, "none\n");
3140 else
3141 return sprintf(page, "%d\n", rdev->raid_disk);
3142 }
3143
3144 static ssize_t
3145 slot_store(struct md_rdev *rdev, const char *buf, size_t len)
3146 {
3147 int slot;
3148 int err;
3149
3150 if (test_bit(Journal, &rdev->flags))
3151 return -EBUSY;
3152 if (strncmp(buf, "none", 4)==0)
3153 slot = -1;
3154 else {
3155 err = kstrtouint(buf, 10, (unsigned int *)&slot);
3156 if (err < 0)
3157 return err;
3158 if (slot < 0)
3159 /* overflow */
3160 return -ENOSPC;
3161 }
3162 if (rdev->mddev->pers && slot == -1) {
3163 /* Setting 'slot' on an active array requires also
3164 * updating the 'rd%d' link, and communicating
3165 * with the personality with ->hot_*_disk.
3166 * For now we only support removing
3167 * failed/spare devices. This normally happens automatically,
3168 * but not when the metadata is externally managed.
3169 */
3170 if (rdev->raid_disk == -1)
3171 return -EEXIST;
3172 /* personality does all needed checks */
3173 if (rdev->mddev->pers->hot_remove_disk == NULL)
3174 return -EINVAL;
3175 clear_bit(Blocked, &rdev->flags);
3176 remove_and_add_spares(rdev->mddev, rdev);
3177 if (rdev->raid_disk >= 0)
3178 return -EBUSY;
3179 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
3180 } else if (rdev->mddev->pers) {
3181 /* Activating a spare .. or possibly reactivating
3182 * if we ever get bitmaps working here.
3183 */
3184 int err;
3185
3186 if (rdev->raid_disk != -1)
3187 return -EBUSY;
3188
3189 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
3190 return -EBUSY;
3191
3192 if (rdev->mddev->pers->hot_add_disk == NULL)
3193 return -EINVAL;
3194
3195 if (slot >= rdev->mddev->raid_disks &&
3196 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
3197 return -ENOSPC;
3198
3199 rdev->raid_disk = slot;
3200 if (test_bit(In_sync, &rdev->flags))
3201 rdev->saved_raid_disk = slot;
3202 else
3203 rdev->saved_raid_disk = -1;
3204 clear_bit(In_sync, &rdev->flags);
3205 clear_bit(Bitmap_sync, &rdev->flags);
3206 err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev);
3207 if (err) {
3208 rdev->raid_disk = -1;
3209 return err;
3210 } else
3211 sysfs_notify_dirent_safe(rdev->sysfs_state);
3212 /* failure here is OK */;
3213 sysfs_link_rdev(rdev->mddev, rdev);
3214 /* don't wakeup anyone, leave that to userspace. */
3215 } else {
3216 if (slot >= rdev->mddev->raid_disks &&
3217 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
3218 return -ENOSPC;
3219 rdev->raid_disk = slot;
3220 /* assume it is working */
3221 clear_bit(Faulty, &rdev->flags);
3222 clear_bit(WriteMostly, &rdev->flags);
3223 set_bit(In_sync, &rdev->flags);
3224 sysfs_notify_dirent_safe(rdev->sysfs_state);
3225 }
3226 return len;
3227 }
3228
3229 static struct rdev_sysfs_entry rdev_slot =
3230 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
3231
3232 static ssize_t
3233 offset_show(struct md_rdev *rdev, char *page)
3234 {
3235 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
3236 }
3237
3238 static ssize_t
3239 offset_store(struct md_rdev *rdev, const char *buf, size_t len)
3240 {
3241 unsigned long long offset;
3242 if (kstrtoull(buf, 10, &offset) < 0)
3243 return -EINVAL;
3244 if (rdev->mddev->pers && rdev->raid_disk >= 0)
3245 return -EBUSY;
3246 if (rdev->sectors && rdev->mddev->external)
3247 /* Must set offset before size, so overlap checks
3248 * can be sane */
3249 return -EBUSY;
3250 rdev->data_offset = offset;
3251 rdev->new_data_offset = offset;
3252 return len;
3253 }
3254
3255 static struct rdev_sysfs_entry rdev_offset =
3256 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
3257
3258 static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
3259 {
3260 return sprintf(page, "%llu\n",
3261 (unsigned long long)rdev->new_data_offset);
3262 }
3263
3264 static ssize_t new_offset_store(struct md_rdev *rdev,
3265 const char *buf, size_t len)
3266 {
3267 unsigned long long new_offset;
3268 struct mddev *mddev = rdev->mddev;
3269
3270 if (kstrtoull(buf, 10, &new_offset) < 0)
3271 return -EINVAL;
3272
3273 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3274 return -EBUSY;
3275 if (new_offset == rdev->data_offset)
3276 /* reset is always permitted */
3277 ;
3278 else if (new_offset > rdev->data_offset) {
3279 /* must not push array size beyond rdev_sectors */
3280 if (new_offset - rdev->data_offset
3281 + mddev->dev_sectors > rdev->sectors)
3282 return -E2BIG;
3283 }
3284 /* Metadata worries about other space details. */
3285
3286 /* decreasing the offset is inconsistent with a backwards
3287 * reshape.
3288 */
3289 if (new_offset < rdev->data_offset &&
3290 mddev->reshape_backwards)
3291 return -EINVAL;
3292 /* Increasing offset is inconsistent with forwards
3293 * reshape. reshape_direction should be set to
3294 * 'backwards' first.
3295 */
3296 if (new_offset > rdev->data_offset &&
3297 !mddev->reshape_backwards)
3298 return -EINVAL;
3299
3300 if (mddev->pers && mddev->persistent &&
3301 !super_types[mddev->major_version]
3302 .allow_new_offset(rdev, new_offset))
3303 return -E2BIG;
3304 rdev->new_data_offset = new_offset;
3305 if (new_offset > rdev->data_offset)
3306 mddev->reshape_backwards = 1;
3307 else if (new_offset < rdev->data_offset)
3308 mddev->reshape_backwards = 0;
3309
3310 return len;
3311 }
3312 static struct rdev_sysfs_entry rdev_new_offset =
3313 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
3314
3315 static ssize_t
3316 rdev_size_show(struct md_rdev *rdev, char *page)
3317 {
3318 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
3319 }
3320
3321 static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b)
3322 {
3323 /* check if two start/length pairs overlap */
3324 if (a->data_offset + a->sectors <= b->data_offset)
3325 return false;
3326 if (b->data_offset + b->sectors <= a->data_offset)
3327 return false;
3328 return true;
3329 }
3330
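/*
 * Check whether @rdev overlaps any other rdev, in any array, that lives
 * on the same block device.  Used to catch unsafe manual size changes.
 */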
3331 static bool md_rdev_overlaps(struct md_rdev *rdev)
3332 {
3333 struct mddev *mddev;
3334 struct md_rdev *rdev2;
3335
3336 spin_lock(&all_mddevs_lock);
3337 list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
3338 if (test_bit(MD_DELETED, &mddev->flags))
3339 continue;
3340 rdev_for_each(rdev2, mddev) {
3341 if (rdev != rdev2 && rdev->bdev == rdev2->bdev &&
3342 md_rdevs_overlap(rdev, rdev2)) {
3343 spin_unlock(&all_mddevs_lock);
3344 return true;
3345 }
3346 }
3347 }
3348 spin_unlock(&all_mddevs_lock);
3349 return false;
3350 }
3351
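/*
 * Parse a decimal count of 1K blocks from sysfs and convert it to
 * sectors, rejecting values that would overflow the conversion.
 */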
3352 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
3353 {
3354 unsigned long long blocks;
3355 sector_t new;
3356
3357 if (kstrtoull(buf, 10, &blocks) < 0)
3358 return -EINVAL;
3359
3360 if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
3361 return -EINVAL; /* sector conversion overflow */
3362
3363 new = blocks * 2;
3364 if (new != blocks * 2)
3365 return -EINVAL; /* unsigned long long to sector_t overflow */
3366
3367 *sectors = new;
3368 return 0;
3369 }
3370
3371 static ssize_t
3372 rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3373 {
3374 struct mddev *my_mddev = rdev->mddev;
3375 sector_t oldsectors = rdev->sectors;
3376 sector_t sectors;
3377
3378 if (test_bit(Journal, &rdev->flags))
3379 return -EBUSY;
3380 if (strict_blocks_to_sectors(buf, &sectors) < 0)
3381 return -EINVAL;
3382 if (rdev->data_offset != rdev->new_data_offset)
3383 return -EINVAL; /* too confusing */
3384 if (my_mddev->pers && rdev->raid_disk >= 0) {
3385 if (my_mddev->persistent) {
3386 sectors = super_types[my_mddev->major_version].
3387 rdev_size_change(rdev, sectors);
3388 if (!sectors)
3389 return -EBUSY;
3390 } else if (!sectors)
3391 sectors = bdev_nr_sectors(rdev->bdev) -
3392 rdev->data_offset;
3393 if (!my_mddev->pers->resize)
3394 /* Cannot change size for RAID0 or Linear etc */
3395 return -EINVAL;
3396 }
3397 if (sectors < my_mddev->dev_sectors)
3398 return -EINVAL; /* component must fit device */
3399
3400 rdev->sectors = sectors;
3401
3402 /*
3403 * Check that all other rdevs with the same bdev do not overlap. This
3404 * check does not provide a hard guarantee, it just helps avoid
3405 * dangerous mistakes.
3406 */
3407 if (sectors > oldsectors && my_mddev->external &&
3408 md_rdev_overlaps(rdev)) {
3409 /*
3410 * Someone else could have slipped in a size change here, but
3411 * doing so is just silly. We put oldsectors back because we
3412 * know it is safe, and trust userspace not to race with itself.
3413 */
3414 rdev->sectors = oldsectors;
3415 return -EBUSY;
3416 }
3417 return len;
3418 }
3419
3420 static struct rdev_sysfs_entry rdev_size =
3421 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
3422
3423 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page)
3424 {
3425 unsigned long long recovery_start = rdev->recovery_offset;
3426
3427 if (test_bit(In_sync, &rdev->flags) ||
3428 recovery_start == MaxSector)
3429 return sprintf(page, "none\n");
3430
3431 return sprintf(page, "%llu\n", recovery_start);
3432 }
3433
3434 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len)
3435 {
3436 unsigned long long recovery_start;
3437
3438 if (cmd_match(buf, "none"))
3439 recovery_start = MaxSector;
3440 else if (kstrtoull(buf, 10, &recovery_start))
3441 return -EINVAL;
3442
3443 if (rdev->mddev->pers &&
3444 rdev->raid_disk >= 0)
3445 return -EBUSY;
3446
3447 rdev->recovery_offset = recovery_start;
3448 if (recovery_start == MaxSector)
3449 set_bit(In_sync, &rdev->flags);
3450 else
3451 clear_bit(In_sync, &rdev->flags);
3452 return len;
3453 }
3454
3455 static struct rdev_sysfs_entry rdev_recovery_start =
3456 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
3457
3458 /* sysfs access to bad-blocks list.
3459 * We present two files.
3460 * 'bad-blocks' lists sector numbers and lengths of ranges that
3461 * are recorded as bad. The list is truncated to fit within
3462 * the one-page limit of sysfs.
3463 * Writing "sector length" to this file adds an acknowledged
3464 * bad block to the list.
3465 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
3466 * been acknowledged. Writing to this file adds bad blocks
3467 * without acknowledging them. This is largely for testing.
3468 */
3469 static ssize_t bb_show(struct md_rdev *rdev, char *page)
3470 {
3471 return badblocks_show(&rdev->badblocks, page, 0);
3472 }
3473 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len)
3474 {
3475 int rv = badblocks_store(&rdev->badblocks, page, len, 0);
3476 /* Maybe that ack was all we needed */
3477 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
3478 wake_up(&rdev->blocked_wait);
3479 return rv;
3480 }
3481 static struct rdev_sysfs_entry rdev_bad_blocks =
3482 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
3483
3484 static ssize_t ubb_show(struct md_rdev *rdev, char *page)
3485 {
3486 return badblocks_show(&rdev->badblocks, page, 1);
3487 }
3488 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
3489 {
3490 return badblocks_store(&rdev->badblocks, page, len, 1);
3491 }
3492 static struct rdev_sysfs_entry rdev_unack_bad_blocks =
3493 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
3494
3495 static ssize_t
3496 ppl_sector_show(struct md_rdev *rdev, char *page)
3497 {
3498 return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector);
3499 }
3500
3501 static ssize_t
3502 ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len)
3503 {
3504 unsigned long long sector;
3505
3506 if (kstrtoull(buf, 10, &sector) < 0)
3507 return -EINVAL;
3508 if (sector != (sector_t)sector)
3509 return -EINVAL;
3510
3511 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3512 rdev->raid_disk >= 0)
3513 return -EBUSY;
3514
3515 if (rdev->mddev->persistent) {
3516 if (rdev->mddev->major_version == 0)
3517 return -EINVAL;
3518 if ((sector > rdev->sb_start &&
3519 sector - rdev->sb_start > S16_MAX) ||
3520 (sector < rdev->sb_start &&
3521 rdev->sb_start - sector > -S16_MIN))
3522 return -EINVAL;
3523 rdev->ppl.offset = sector - rdev->sb_start;
3524 } else if (!rdev->mddev->external) {
3525 return -EBUSY;
3526 }
3527 rdev->ppl.sector = sector;
3528 return len;
3529 }
3530
3531 static struct rdev_sysfs_entry rdev_ppl_sector =
3532 __ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store);
3533
3534 static ssize_t
ppl_size_show(struct md_rdev *rdev, char *page)
3536 {
3537 return sprintf(page, "%u\n", rdev->ppl.size);
3538 }
3539
3540 static ssize_t
ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3542 {
3543 unsigned int size;
3544
3545 if (kstrtouint(buf, 10, &size) < 0)
3546 return -EINVAL;
3547
3548 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3549 rdev->raid_disk >= 0)
3550 return -EBUSY;
3551
3552 if (rdev->mddev->persistent) {
3553 if (rdev->mddev->major_version == 0)
3554 return -EINVAL;
3555 if (size > U16_MAX)
3556 return -EINVAL;
3557 } else if (!rdev->mddev->external) {
3558 return -EBUSY;
3559 }
3560 rdev->ppl.size = size;
3561 return len;
3562 }
3563
3564 static struct rdev_sysfs_entry rdev_ppl_size =
3565 __ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store);
3566
3567 static struct attribute *rdev_default_attrs[] = {
3568 &rdev_state.attr,
3569 &rdev_errors.attr,
3570 &rdev_slot.attr,
3571 &rdev_offset.attr,
3572 &rdev_new_offset.attr,
3573 &rdev_size.attr,
3574 &rdev_recovery_start.attr,
3575 &rdev_bad_blocks.attr,
3576 &rdev_unack_bad_blocks.attr,
3577 &rdev_ppl_sector.attr,
3578 &rdev_ppl_size.attr,
3579 NULL,
3580 };
3581 ATTRIBUTE_GROUPS(rdev_default);
3582 static ssize_t
rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3584 {
3585 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3586 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3587
3588 if (!entry->show)
3589 return -EIO;
3590 if (!rdev->mddev)
3591 return -ENODEV;
3592 return entry->show(rdev, page);
3593 }
3594
3595 static ssize_t
rdev_attr_store(struct kobject *kobj, struct attribute *attr,
		const char *page, size_t length)
3598 {
3599 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3600 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3601 struct kernfs_node *kn = NULL;
3602 bool suspend = false;
3603 ssize_t rv;
3604 struct mddev *mddev = READ_ONCE(rdev->mddev);
3605
3606 if (!entry->store)
3607 return -EIO;
3608 if (!capable(CAP_SYS_ADMIN))
3609 return -EACCES;
3610 if (!mddev)
3611 return -ENODEV;
3612
3613 if (entry->store == state_store) {
3614 if (cmd_match(page, "remove"))
3615 kn = sysfs_break_active_protection(kobj, attr);
3616 if (cmd_match(page, "remove") || cmd_match(page, "re-add") ||
3617 cmd_match(page, "writemostly") ||
3618 cmd_match(page, "-writemostly"))
3619 suspend = true;
3620 }
3621
3622 rv = suspend ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev);
3623 if (!rv) {
3624 if (rdev->mddev == NULL)
3625 rv = -ENODEV;
3626 else
3627 rv = entry->store(rdev, page, length);
3628 suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev);
3629 }
3630
3631 if (kn)
3632 sysfs_unbreak_active_protection(kn);
3633
3634 return rv;
3635 }
3636
static void rdev_free(struct kobject *ko)
3638 {
3639 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj);
3640 kfree(rdev);
3641 }
3642 static const struct sysfs_ops rdev_sysfs_ops = {
3643 .show = rdev_attr_show,
3644 .store = rdev_attr_store,
3645 };
3646 static const struct kobj_type rdev_ktype = {
3647 .release = rdev_free,
3648 .sysfs_ops = &rdev_sysfs_ops,
3649 .default_groups = rdev_default_groups,
3650 };
3651
int md_rdev_init(struct md_rdev *rdev)
3653 {
3654 rdev->desc_nr = -1;
3655 rdev->saved_raid_disk = -1;
3656 rdev->raid_disk = -1;
3657 rdev->flags = 0;
3658 rdev->data_offset = 0;
3659 rdev->new_data_offset = 0;
3660 rdev->sb_events = 0;
3661 rdev->last_read_error = 0;
3662 rdev->sb_loaded = 0;
3663 rdev->bb_page = NULL;
3664 atomic_set(&rdev->nr_pending, 0);
3665 atomic_set(&rdev->read_errors, 0);
3666 atomic_set(&rdev->corrected_errors, 0);
3667
3668 INIT_LIST_HEAD(&rdev->same_set);
3669 init_waitqueue_head(&rdev->blocked_wait);
3670
3671 /* Add space to store bad block list.
3672 * This reserves the space even on arrays where it cannot
3673 * be used - I wonder if that matters
3674 */
3675 return badblocks_init(&rdev->badblocks, 0);
3676 }
3677 EXPORT_SYMBOL_GPL(md_rdev_init);
3678
3679 /*
3680 * Import a device. If 'super_format' >= 0, then sanity check the superblock
3681 *
3682 * mark the device faulty if:
3683 *
3684 * - the device is nonexistent (zero size)
3685 * - the device has no valid superblock
3686 *
3687 * a faulty rdev _never_ has rdev->sb set.
3688 */
static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
3690 {
3691 struct md_rdev *rdev;
3692 sector_t size;
3693 int err;
3694
3695 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
3696 if (!rdev)
3697 return ERR_PTR(-ENOMEM);
3698
3699 err = md_rdev_init(rdev);
3700 if (err)
3701 goto out_free_rdev;
3702 err = alloc_disk_sb(rdev);
3703 if (err)
3704 goto out_clear_rdev;
3705
3706 rdev->bdev_file = bdev_file_open_by_dev(newdev,
3707 BLK_OPEN_READ | BLK_OPEN_WRITE,
3708 super_format == -2 ? &claim_rdev : rdev, NULL);
3709 if (IS_ERR(rdev->bdev_file)) {
3710 pr_warn("md: could not open device unknown-block(%u,%u).\n",
3711 MAJOR(newdev), MINOR(newdev));
3712 err = PTR_ERR(rdev->bdev_file);
3713 goto out_clear_rdev;
3714 }
3715 rdev->bdev = file_bdev(rdev->bdev_file);
3716
3717 kobject_init(&rdev->kobj, &rdev_ktype);
3718
3719 size = bdev_nr_bytes(rdev->bdev) >> BLOCK_SIZE_BITS;
3720 if (!size) {
3721 pr_warn("md: %pg has zero or unknown size, marking faulty!\n",
3722 rdev->bdev);
3723 err = -EINVAL;
3724 goto out_blkdev_put;
3725 }
3726
3727 if (super_format >= 0) {
3728 err = super_types[super_format].
3729 load_super(rdev, NULL, super_minor);
3730 if (err == -EINVAL) {
3731 pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n",
3732 rdev->bdev,
3733 super_format, super_minor);
3734 goto out_blkdev_put;
3735 }
3736 if (err < 0) {
3737 pr_warn("md: could not read %pg's sb, not importing!\n",
3738 rdev->bdev);
3739 goto out_blkdev_put;
3740 }
3741 }
3742
3743 return rdev;
3744
3745 out_blkdev_put:
3746 fput(rdev->bdev_file);
3747 out_clear_rdev:
3748 md_rdev_clear(rdev);
3749 out_free_rdev:
3750 kfree(rdev);
3751 return ERR_PTR(err);
3752 }
3753
3754 /*
3755 * Check a full RAID array for plausibility
3756 */
3757
static int analyze_sbs(struct mddev *mddev)
3759 {
3760 int i;
3761 struct md_rdev *rdev, *freshest, *tmp;
3762
3763 freshest = NULL;
3764 rdev_for_each_safe(rdev, tmp, mddev)
3765 switch (super_types[mddev->major_version].
3766 load_super(rdev, freshest, mddev->minor_version)) {
3767 case 1:
3768 freshest = rdev;
3769 break;
3770 case 0:
3771 break;
3772 default:
3773 pr_warn("md: fatal superblock inconsistency in %pg -- removing from array\n",
3774 rdev->bdev);
3775 md_kick_rdev_from_array(rdev);
3776 }
3777
3778 /* Cannot find a valid fresh disk */
3779 if (!freshest) {
3780 pr_warn("md: cannot find a valid disk\n");
3781 return -EINVAL;
3782 }
3783
3784 super_types[mddev->major_version].
3785 validate_super(mddev, NULL/*freshest*/, freshest);
3786
3787 i = 0;
3788 rdev_for_each_safe(rdev, tmp, mddev) {
3789 if (mddev->max_disks &&
3790 (rdev->desc_nr >= mddev->max_disks ||
3791 i > mddev->max_disks)) {
3792 pr_warn("md: %s: %pg: only %d devices permitted\n",
3793 mdname(mddev), rdev->bdev,
3794 mddev->max_disks);
3795 md_kick_rdev_from_array(rdev);
3796 continue;
3797 }
3798 if (rdev != freshest) {
3799 if (super_types[mddev->major_version].
3800 validate_super(mddev, freshest, rdev)) {
3801 pr_warn("md: kicking non-fresh %pg from array!\n",
3802 rdev->bdev);
3803 md_kick_rdev_from_array(rdev);
3804 continue;
3805 }
3806 }
3807 if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks)) &&
3808 !test_bit(Journal, &rdev->flags)) {
3809 rdev->raid_disk = -1;
3810 clear_bit(In_sync, &rdev->flags);
3811 }
3812 }
3813
3814 return 0;
3815 }
3816
3817 /* Read a fixed-point number.
3818 * Numbers in sysfs attributes should be in "standard" units where
3819 * possible, so time should be in seconds.
 * However we internally use a much smaller unit such as
3821 * milliseconds or jiffies.
3822 * This function takes a decimal number with a possible fractional
3823 * component, and produces an integer which is the result of
 * multiplying that number by 10^'scale', all without any
 * floating-point arithmetic.
3826 */
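/*
 * Worked example (illustrative): strict_strtoul_scaled("0.30", &res, 3)
 * accumulates result = 30 with decimals = 2, so *res becomes
 * 30 * 10^(3 - 2) = 300, i.e. "0.30" seconds parsed at scale 3 yields
 * 300 milliseconds.
 */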
int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
3828 {
3829 unsigned long result = 0;
3830 long decimals = -1;
3831 while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
3832 if (*cp == '.')
3833 decimals = 0;
3834 else if (decimals < scale) {
3835 unsigned int value;
3836 value = *cp - '0';
3837 result = result * 10 + value;
3838 if (decimals >= 0)
3839 decimals++;
3840 }
3841 cp++;
3842 }
3843 if (*cp == '\n')
3844 cp++;
3845 if (*cp)
3846 return -EINVAL;
3847 if (decimals < 0)
3848 decimals = 0;
3849 *res = result * int_pow(10, scale - decimals);
3850 return 0;
3851 }
3852
3853 static ssize_t
safe_delay_show(struct mddev *mddev, char *page)
3855 {
3856 unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ;
3857
3858 return sprintf(page, "%u.%03u\n", msec/1000, msec%1000);
3859 }
3860 static ssize_t
safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
3862 {
3863 unsigned long msec;
3864
3865 if (mddev_is_clustered(mddev)) {
3866 pr_warn("md: Safemode is disabled for clustered mode\n");
3867 return -EINVAL;
3868 }
3869
3870 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ)
3871 return -EINVAL;
3872 if (msec == 0)
3873 mddev->safemode_delay = 0;
3874 else {
3875 unsigned long old_delay = mddev->safemode_delay;
3876 unsigned long new_delay = (msec*HZ)/1000;
3877
3878 if (new_delay == 0)
3879 new_delay = 1;
3880 mddev->safemode_delay = new_delay;
3881 if (new_delay < old_delay || old_delay == 0)
3882 mod_timer(&mddev->safemode_timer, jiffies+1);
3883 }
3884 return len;
3885 }
3886 static struct md_sysfs_entry md_safe_delay =
3887 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
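
/*
 * Example (illustrative value): writing "0.200" to safe_mode_delay is parsed
 * by strict_strtoul_scaled() with scale 3, giving msec = 200, so the array
 * is marked clean roughly 200ms after writes become idle. Writing "0"
 * disables the safe-mode timer.
 */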
3888
3889 static ssize_t
level_show(struct mddev *mddev, char *page)
3891 {
3892 struct md_personality *p;
3893 int ret;
3894 spin_lock(&mddev->lock);
3895 p = mddev->pers;
3896 if (p)
3897 ret = sprintf(page, "%s\n", p->head.name);
3898 else if (mddev->clevel[0])
3899 ret = sprintf(page, "%s\n", mddev->clevel);
3900 else if (mddev->level != LEVEL_NONE)
3901 ret = sprintf(page, "%d\n", mddev->level);
3902 else
3903 ret = 0;
3904 spin_unlock(&mddev->lock);
3905 return ret;
3906 }
3907
3908 static ssize_t
level_store(struct mddev *mddev, const char *buf, size_t len)
3910 {
3911 char clevel[16];
3912 ssize_t rv;
3913 size_t slen = len;
3914 struct md_personality *pers, *oldpers;
3915 long level;
3916 void *priv, *oldpriv;
3917 struct md_rdev *rdev;
3918
3919 if (slen == 0 || slen >= sizeof(clevel))
3920 return -EINVAL;
3921
3922 rv = mddev_suspend_and_lock(mddev);
3923 if (rv)
3924 return rv;
3925
3926 if (mddev->pers == NULL) {
3927 memcpy(mddev->clevel, buf, slen);
3928 if (mddev->clevel[slen-1] == '\n')
3929 slen--;
3930 mddev->clevel[slen] = 0;
3931 mddev->level = LEVEL_NONE;
3932 rv = len;
3933 goto out_unlock;
3934 }
3935 rv = -EROFS;
3936 if (!md_is_rdwr(mddev))
3937 goto out_unlock;
3938
3939 /* request to change the personality. Need to ensure:
3940 * - array is not engaged in resync/recovery/reshape
3941 * - old personality can be suspended
 * - new personality will accept the array.
3943 */
3944
3945 rv = -EBUSY;
3946 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3947 mddev->reshape_position != MaxSector ||
3948 mddev->sysfs_active)
3949 goto out_unlock;
3950
3951 rv = -EINVAL;
3952 if (!mddev->pers->quiesce) {
3953 pr_warn("md: %s: %s does not support online personality change\n",
3954 mdname(mddev), mddev->pers->head.name);
3955 goto out_unlock;
3956 }
3957
3958 /* Now find the new personality */
3959 memcpy(clevel, buf, slen);
3960 if (clevel[slen-1] == '\n')
3961 slen--;
3962 clevel[slen] = 0;
3963 if (kstrtol(clevel, 10, &level))
3964 level = LEVEL_NONE;
3965
3966 if (request_module("md-%s", clevel) != 0)
3967 request_module("md-level-%s", clevel);
3968 pers = get_pers(level, clevel);
3969 if (!pers) {
3970 rv = -EINVAL;
3971 goto out_unlock;
3972 }
3973
3974 if (pers == mddev->pers) {
3975 /* Nothing to do! */
3976 put_pers(pers);
3977 rv = len;
3978 goto out_unlock;
3979 }
3980 if (!pers->takeover) {
3981 put_pers(pers);
3982 pr_warn("md: %s: %s does not support personality takeover\n",
3983 mdname(mddev), clevel);
3984 rv = -EINVAL;
3985 goto out_unlock;
3986 }
3987
3988 rdev_for_each(rdev, mddev)
3989 rdev->new_raid_disk = rdev->raid_disk;
3990
3991 /* ->takeover must set new_* and/or delta_disks
3992 * if it succeeds, and may set them when it fails.
3993 */
3994 priv = pers->takeover(mddev);
3995 if (IS_ERR(priv)) {
3996 mddev->new_level = mddev->level;
3997 mddev->new_layout = mddev->layout;
3998 mddev->new_chunk_sectors = mddev->chunk_sectors;
3999 mddev->raid_disks -= mddev->delta_disks;
4000 mddev->delta_disks = 0;
4001 mddev->reshape_backwards = 0;
4002 put_pers(pers);
4003 pr_warn("md: %s: %s would not accept array\n",
4004 mdname(mddev), clevel);
4005 rv = PTR_ERR(priv);
4006 goto out_unlock;
4007 }
4008
4009 /* Looks like we have a winner */
4010 mddev_detach(mddev);
4011
4012 spin_lock(&mddev->lock);
4013 oldpers = mddev->pers;
4014 oldpriv = mddev->private;
4015 mddev->pers = pers;
4016 mddev->private = priv;
4017 strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel));
4018 mddev->level = mddev->new_level;
4019 mddev->layout = mddev->new_layout;
4020 mddev->chunk_sectors = mddev->new_chunk_sectors;
4021 mddev->delta_disks = 0;
4022 mddev->reshape_backwards = 0;
4023 mddev->degraded = 0;
4024 spin_unlock(&mddev->lock);
4025
4026 if (oldpers->sync_request == NULL &&
4027 mddev->external) {
4028 /* We are converting from a no-redundancy array
4029 * to a redundancy array and metadata is managed
4030 * externally so we need to be sure that writes
4031 * won't block due to a need to transition
4032 * clean->dirty
4033 * until external management is started.
4034 */
4035 mddev->in_sync = 0;
4036 mddev->safemode_delay = 0;
4037 mddev->safemode = 0;
4038 }
4039
4040 oldpers->free(mddev, oldpriv);
4041
4042 if (oldpers->sync_request == NULL &&
4043 pers->sync_request != NULL) {
4044 /* need to add the md_redundancy_group */
4045 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
4046 pr_warn("md: cannot register extra attributes for %s\n",
4047 mdname(mddev));
4048 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
4049 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
4050 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
4051 }
4052 if (oldpers->sync_request != NULL &&
4053 pers->sync_request == NULL) {
4054 /* need to remove the md_redundancy_group */
4055 if (mddev->to_remove == NULL)
4056 mddev->to_remove = &md_redundancy_group;
4057 }
4058
4059 put_pers(oldpers);
4060
4061 rdev_for_each(rdev, mddev) {
4062 if (rdev->raid_disk < 0)
4063 continue;
4064 if (rdev->new_raid_disk >= mddev->raid_disks)
4065 rdev->new_raid_disk = -1;
4066 if (rdev->new_raid_disk == rdev->raid_disk)
4067 continue;
4068 sysfs_unlink_rdev(mddev, rdev);
4069 }
4070 rdev_for_each(rdev, mddev) {
4071 if (rdev->raid_disk < 0)
4072 continue;
4073 if (rdev->new_raid_disk == rdev->raid_disk)
4074 continue;
4075 rdev->raid_disk = rdev->new_raid_disk;
4076 if (rdev->raid_disk < 0)
4077 clear_bit(In_sync, &rdev->flags);
4078 else {
4079 if (sysfs_link_rdev(mddev, rdev))
4080 pr_warn("md: cannot register rd%d for %s after level change\n",
4081 rdev->raid_disk, mdname(mddev));
4082 }
4083 }
4084
4085 if (pers->sync_request == NULL) {
4086 /* this is now an array without redundancy, so
4087 * it must always be in_sync
4088 */
4089 mddev->in_sync = 1;
4090 timer_delete_sync(&mddev->safemode_timer);
4091 }
4092 pers->run(mddev);
4093 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
4094 if (!mddev->thread)
4095 md_update_sb(mddev, 1);
4096 sysfs_notify_dirent_safe(mddev->sysfs_level);
4097 md_new_event();
4098 rv = len;
4099 out_unlock:
4100 mddev_unlock_and_resume(mddev);
4101 return rv;
4102 }
4103
4104 static struct md_sysfs_entry md_level =
4105 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
4106
4107 static ssize_t
new_level_show(struct mddev *mddev, char *page)
4109 {
4110 return sprintf(page, "%d\n", mddev->new_level);
4111 }
4112
4113 static ssize_t
new_level_store(struct mddev *mddev, const char *buf, size_t len)
4115 {
4116 unsigned int n;
4117 int err;
4118
4119 err = kstrtouint(buf, 10, &n);
4120 if (err < 0)
4121 return err;
4122 err = mddev_lock(mddev);
4123 if (err)
4124 return err;
4125
4126 mddev->new_level = n;
4127 md_update_sb(mddev, 1);
4128
4129 mddev_unlock(mddev);
4130 return len;
4131 }
4132 static struct md_sysfs_entry md_new_level =
4133 __ATTR(new_level, 0664, new_level_show, new_level_store);
4134
4135 static ssize_t
layout_show(struct mddev *mddev, char *page)
4137 {
4138 /* just a number, not meaningful for all levels */
4139 if (mddev->reshape_position != MaxSector &&
4140 mddev->layout != mddev->new_layout)
4141 return sprintf(page, "%d (%d)\n",
4142 mddev->new_layout, mddev->layout);
4143 return sprintf(page, "%d\n", mddev->layout);
4144 }
4145
4146 static ssize_t
layout_store(struct mddev *mddev, const char *buf, size_t len)
4148 {
4149 unsigned int n;
4150 int err;
4151
4152 err = kstrtouint(buf, 10, &n);
4153 if (err < 0)
4154 return err;
4155 err = mddev_lock(mddev);
4156 if (err)
4157 return err;
4158
4159 if (mddev->pers) {
4160 if (mddev->pers->check_reshape == NULL)
4161 err = -EBUSY;
4162 else if (!md_is_rdwr(mddev))
4163 err = -EROFS;
4164 else {
4165 mddev->new_layout = n;
4166 err = mddev->pers->check_reshape(mddev);
4167 if (err)
4168 mddev->new_layout = mddev->layout;
4169 }
4170 } else {
4171 mddev->new_layout = n;
4172 if (mddev->reshape_position == MaxSector)
4173 mddev->layout = n;
4174 }
4175 mddev_unlock(mddev);
4176 return err ?: len;
4177 }
4178 static struct md_sysfs_entry md_layout =
4179 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
4180
4181 static ssize_t
raid_disks_show(struct mddev *mddev, char *page)
4183 {
4184 if (mddev->raid_disks == 0)
4185 return 0;
4186 if (mddev->reshape_position != MaxSector &&
4187 mddev->delta_disks != 0)
4188 return sprintf(page, "%d (%d)\n", mddev->raid_disks,
4189 mddev->raid_disks - mddev->delta_disks);
4190 return sprintf(page, "%d\n", mddev->raid_disks);
4191 }
4192
4193 static int update_raid_disks(struct mddev *mddev, int raid_disks);
4194
4195 static ssize_t
raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
4197 {
4198 unsigned int n;
4199 int err;
4200
4201 err = kstrtouint(buf, 10, &n);
4202 if (err < 0)
4203 return err;
4204
4205 err = mddev_lock(mddev);
4206 if (err)
4207 return err;
4208 if (mddev->pers)
4209 err = update_raid_disks(mddev, n);
4210 else if (mddev->reshape_position != MaxSector) {
4211 struct md_rdev *rdev;
4212 int olddisks = mddev->raid_disks - mddev->delta_disks;
4213
4214 err = -EINVAL;
4215 rdev_for_each(rdev, mddev) {
4216 if (olddisks < n &&
4217 rdev->data_offset < rdev->new_data_offset)
4218 goto out_unlock;
4219 if (olddisks > n &&
4220 rdev->data_offset > rdev->new_data_offset)
4221 goto out_unlock;
4222 }
4223 err = 0;
4224 mddev->delta_disks = n - olddisks;
4225 mddev->raid_disks = n;
4226 mddev->reshape_backwards = (mddev->delta_disks < 0);
4227 } else
4228 mddev->raid_disks = n;
4229 out_unlock:
4230 mddev_unlock(mddev);
4231 return err ? err : len;
4232 }
4233 static struct md_sysfs_entry md_raid_disks =
4234 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
4235
4236 static ssize_t
uuid_show(struct mddev *mddev, char *page)
4238 {
4239 return sprintf(page, "%pU\n", mddev->uuid);
4240 }
4241 static struct md_sysfs_entry md_uuid =
4242 __ATTR(uuid, S_IRUGO, uuid_show, NULL);
4243
4244 static ssize_t
chunk_size_show(struct mddev *mddev, char *page)
4246 {
4247 if (mddev->reshape_position != MaxSector &&
4248 mddev->chunk_sectors != mddev->new_chunk_sectors)
4249 return sprintf(page, "%d (%d)\n",
4250 mddev->new_chunk_sectors << 9,
4251 mddev->chunk_sectors << 9);
4252 return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
4253 }
4254
4255 static ssize_t
chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
4257 {
4258 unsigned long n;
4259 int err;
4260
4261 err = kstrtoul(buf, 10, &n);
4262 if (err < 0)
4263 return err;
4264
4265 err = mddev_lock(mddev);
4266 if (err)
4267 return err;
4268 if (mddev->pers) {
4269 if (mddev->pers->check_reshape == NULL)
4270 err = -EBUSY;
4271 else if (!md_is_rdwr(mddev))
4272 err = -EROFS;
4273 else {
4274 mddev->new_chunk_sectors = n >> 9;
4275 err = mddev->pers->check_reshape(mddev);
4276 if (err)
4277 mddev->new_chunk_sectors = mddev->chunk_sectors;
4278 }
4279 } else {
4280 mddev->new_chunk_sectors = n >> 9;
4281 if (mddev->reshape_position == MaxSector)
4282 mddev->chunk_sectors = n >> 9;
4283 }
4284 mddev_unlock(mddev);
4285 return err ?: len;
4286 }
4287 static struct md_sysfs_entry md_chunk_size =
4288 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
4289
4290 static ssize_t
resync_start_show(struct mddev *mddev, char *page)
4292 {
4293 if (mddev->recovery_cp == MaxSector)
4294 return sprintf(page, "none\n");
4295 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
4296 }
4297
4298 static ssize_t
resync_start_store(struct mddev *mddev, const char *buf, size_t len)
4300 {
4301 unsigned long long n;
4302 int err;
4303
4304 if (cmd_match(buf, "none"))
4305 n = MaxSector;
4306 else {
4307 err = kstrtoull(buf, 10, &n);
4308 if (err < 0)
4309 return err;
4310 if (n != (sector_t)n)
4311 return -EINVAL;
4312 }
4313
4314 err = mddev_lock(mddev);
4315 if (err)
4316 return err;
4317 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
4318 err = -EBUSY;
4319
4320 if (!err) {
4321 mddev->recovery_cp = n;
4322 if (mddev->pers)
4323 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
4324 }
4325 mddev_unlock(mddev);
4326 return err ?: len;
4327 }
4328 static struct md_sysfs_entry md_resync_start =
4329 __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR,
4330 resync_start_show, resync_start_store);
4331
4332 /*
4333 * The array state can be:
4334 *
4335 * clear
4336 * No devices, no size, no level
4337 * Equivalent to STOP_ARRAY ioctl
4338 * inactive
4339 * May have some settings, but array is not active
4340 * all IO results in error
4341 * When written, doesn't tear down array, but just stops it
4342 * suspended (not supported yet)
4343 * All IO requests will block. The array can be reconfigured.
4344 * Writing this, if accepted, will block until array is quiescent
4345 * readonly
4346 * no resync can happen. no superblocks get written.
4347 * write requests fail
4348 * read-auto
4349 * like readonly, but behaves like 'clean' on a write request.
4350 *
4351 * clean - no pending writes, but otherwise active.
4352 * When written to inactive array, starts without resync
4353 * If a write request arrives then
4354 * if metadata is known, mark 'dirty' and switch to 'active'.
4355 * if not known, block and switch to write-pending
4356 * If written to an active array that has pending writes, then fails.
4357 * active
4358 * fully active: IO and resync can be happening.
4359 * When written to inactive array, starts with resync
4360 *
4361 * write-pending
4362 * clean, but writes are blocked waiting for 'active' to be written.
4363 *
4364 * active-idle
4365 * like active, but no writes have been seen for a while (100msec).
4366 *
4367 * broken
4368 * Array is failed. It's useful because mounted-arrays aren't stopped
4369 * when array is failed, so this state will at least alert the user that
4370 * something is wrong.
4371 */
4372 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
4373 write_pending, active_idle, broken, bad_word};
4374 static char *array_states[] = {
4375 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
4376 "write-pending", "active-idle", "broken", NULL };
4377
static int match_word(const char *word, char **list)
4379 {
4380 int n;
4381 for (n=0; list[n]; n++)
4382 if (cmd_match(word, list[n]))
4383 break;
4384 return n;
4385 }
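
/*
 * Note: if the written word matches none of the entries, the loop above
 * stops at the terminating NULL and match_word() returns that index, which
 * array_state_store() below relies on being equal to 'bad_word'.
 */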
4386
4387 static ssize_t
array_state_show(struct mddev *mddev, char *page)
4389 {
4390 enum array_state st = inactive;
4391
4392 if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) {
4393 switch(mddev->ro) {
4394 case MD_RDONLY:
4395 st = readonly;
4396 break;
4397 case MD_AUTO_READ:
4398 st = read_auto;
4399 break;
4400 case MD_RDWR:
4401 spin_lock(&mddev->lock);
4402 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
4403 st = write_pending;
4404 else if (mddev->in_sync)
4405 st = clean;
4406 else if (mddev->safemode)
4407 st = active_idle;
4408 else
4409 st = active;
4410 spin_unlock(&mddev->lock);
4411 }
4412
4413 if (test_bit(MD_BROKEN, &mddev->flags) && st == clean)
4414 st = broken;
4415 } else {
4416 if (list_empty(&mddev->disks) &&
4417 mddev->raid_disks == 0 &&
4418 mddev->dev_sectors == 0)
4419 st = clear;
4420 else
4421 st = inactive;
4422 }
4423 return sprintf(page, "%s\n", array_states[st]);
4424 }
4425
4426 static int do_md_stop(struct mddev *mddev, int ro);
4427 static int md_set_readonly(struct mddev *mddev);
4428 static int restart_array(struct mddev *mddev);
4429
4430 static ssize_t
array_state_store(struct mddev *mddev, const char *buf, size_t len)
4432 {
4433 int err = 0;
4434 enum array_state st = match_word(buf, array_states);
4435
4436 /* No lock dependent actions */
4437 switch (st) {
4438 case suspended: /* not supported yet */
4439 case write_pending: /* cannot be set */
4440 case active_idle: /* cannot be set */
4441 case broken: /* cannot be set */
4442 case bad_word:
4443 return -EINVAL;
4444 case clear:
4445 case readonly:
4446 case inactive:
4447 case read_auto:
4448 if (!mddev->pers || !md_is_rdwr(mddev))
4449 break;
/* writes via sysfs do not open the mddev, so the opener count should be 0 */
4451 err = mddev_set_closing_and_sync_blockdev(mddev, 0);
4452 if (err)
4453 return err;
4454 break;
4455 default:
4456 break;
4457 }
4458
4459 if (mddev->pers && (st == active || st == clean) &&
4460 mddev->ro != MD_RDONLY) {
4461 /* don't take reconfig_mutex when toggling between
4462 * clean and active
4463 */
4464 spin_lock(&mddev->lock);
4465 if (st == active) {
4466 restart_array(mddev);
4467 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4468 md_wakeup_thread(mddev->thread);
4469 wake_up(&mddev->sb_wait);
4470 } else /* st == clean */ {
4471 restart_array(mddev);
4472 if (!set_in_sync(mddev))
4473 err = -EBUSY;
4474 }
4475 if (!err)
4476 sysfs_notify_dirent_safe(mddev->sysfs_state);
4477 spin_unlock(&mddev->lock);
4478 return err ?: len;
4479 }
4480 err = mddev_lock(mddev);
4481 if (err)
4482 return err;
4483
4484 switch (st) {
4485 case inactive:
4486 /* stop an active array, return 0 otherwise */
4487 if (mddev->pers)
4488 err = do_md_stop(mddev, 2);
4489 break;
4490 case clear:
4491 err = do_md_stop(mddev, 0);
4492 break;
4493 case readonly:
4494 if (mddev->pers)
4495 err = md_set_readonly(mddev);
4496 else {
4497 mddev->ro = MD_RDONLY;
4498 set_disk_ro(mddev->gendisk, 1);
4499 err = do_md_run(mddev);
4500 }
4501 break;
4502 case read_auto:
4503 if (mddev->pers) {
4504 if (md_is_rdwr(mddev))
4505 err = md_set_readonly(mddev);
4506 else if (mddev->ro == MD_RDONLY)
4507 err = restart_array(mddev);
4508 if (err == 0) {
4509 mddev->ro = MD_AUTO_READ;
4510 set_disk_ro(mddev->gendisk, 0);
4511 }
4512 } else {
4513 mddev->ro = MD_AUTO_READ;
4514 err = do_md_run(mddev);
4515 }
4516 break;
4517 case clean:
4518 if (mddev->pers) {
4519 err = restart_array(mddev);
4520 if (err)
4521 break;
4522 spin_lock(&mddev->lock);
4523 if (!set_in_sync(mddev))
4524 err = -EBUSY;
4525 spin_unlock(&mddev->lock);
4526 } else
4527 err = -EINVAL;
4528 break;
4529 case active:
4530 if (mddev->pers) {
4531 err = restart_array(mddev);
4532 if (err)
4533 break;
4534 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4535 wake_up(&mddev->sb_wait);
4536 err = 0;
4537 } else {
4538 mddev->ro = MD_RDWR;
4539 set_disk_ro(mddev->gendisk, 0);
4540 err = do_md_run(mddev);
4541 }
4542 break;
4543 default:
4544 err = -EINVAL;
4545 break;
4546 }
4547
4548 if (!err) {
4549 if (mddev->hold_active == UNTIL_IOCTL)
4550 mddev->hold_active = 0;
4551 sysfs_notify_dirent_safe(mddev->sysfs_state);
4552 }
4553 mddev_unlock(mddev);
4554
4555 if (st == readonly || st == read_auto || st == inactive ||
4556 (err && st == clear))
4557 clear_bit(MD_CLOSING, &mddev->flags);
4558
4559 return err ?: len;
4560 }
4561 static struct md_sysfs_entry md_array_state =
4562 __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
4563
4564 static ssize_t
max_corrected_read_errors_show(struct mddev *mddev, char *page)
{
4566 return sprintf(page, "%d\n",
4567 atomic_read(&mddev->max_corr_read_errors));
4568 }
4569
4570 static ssize_t
max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
4572 {
4573 unsigned int n;
4574 int rv;
4575
4576 rv = kstrtouint(buf, 10, &n);
4577 if (rv < 0)
4578 return rv;
4579 if (n > INT_MAX)
4580 return -EINVAL;
4581 atomic_set(&mddev->max_corr_read_errors, n);
4582 return len;
4583 }
4584
4585 static struct md_sysfs_entry max_corr_read_errors =
4586 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
4587 max_corrected_read_errors_store);
4588
4589 static ssize_t
null_show(struct mddev *mddev, char *page)
4591 {
4592 return -EINVAL;
4593 }
4594
4595 static ssize_t
new_dev_store(struct mddev *mddev, const char *buf, size_t len)
4597 {
/* buf must be "%d:%d" with an optional trailing newline, giving major and minor numbers */
4599 /* The new device is added to the array.
4600 * If the array has a persistent superblock, we read the
4601 * superblock to initialise info and check validity.
4602 * Otherwise, only checking done is that in bind_rdev_to_array,
4603 * which mainly checks size.
4604 */
4605 char *e;
4606 int major = simple_strtoul(buf, &e, 10);
4607 int minor;
4608 dev_t dev;
4609 struct md_rdev *rdev;
4610 int err;
4611
4612 if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
4613 return -EINVAL;
4614 minor = simple_strtoul(e+1, &e, 10);
4615 if (*e && *e != '\n')
4616 return -EINVAL;
4617 dev = MKDEV(major, minor);
4618 if (major != MAJOR(dev) ||
4619 minor != MINOR(dev))
4620 return -EOVERFLOW;
4621
4622 err = mddev_suspend_and_lock(mddev);
4623 if (err)
4624 return err;
4625 if (mddev->persistent) {
4626 rdev = md_import_device(dev, mddev->major_version,
4627 mddev->minor_version);
4628 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
4629 struct md_rdev *rdev0
4630 = list_entry(mddev->disks.next,
4631 struct md_rdev, same_set);
4632 err = super_types[mddev->major_version]
4633 .load_super(rdev, rdev0, mddev->minor_version);
4634 if (err < 0)
4635 goto out;
4636 }
4637 } else if (mddev->external)
4638 rdev = md_import_device(dev, -2, -1);
4639 else
4640 rdev = md_import_device(dev, -1, -1);
4641
4642 if (IS_ERR(rdev)) {
4643 mddev_unlock_and_resume(mddev);
4644 return PTR_ERR(rdev);
4645 }
4646 err = bind_rdev_to_array(rdev, mddev);
4647 out:
4648 if (err)
4649 export_rdev(rdev, mddev);
4650 mddev_unlock_and_resume(mddev);
4651 if (!err)
4652 md_new_event();
4653 return err ? err : len;
4654 }
4655
4656 static struct md_sysfs_entry md_new_device =
4657 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
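
/*
 * Illustrative usage (device numbers are examples): writing "8:16" to
 * new_dev asks md to import the block device with major 8, minor 16
 * (typically /dev/sdb) and bind it to this array.
 */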
4658
4659 static ssize_t
bitmap_store(struct mddev *mddev, const char *buf, size_t len)
4661 {
4662 char *end;
4663 unsigned long chunk, end_chunk;
4664 int err;
4665
4666 err = mddev_lock(mddev);
4667 if (err)
4668 return err;
4669 if (!mddev->bitmap)
4670 goto out;
4671 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
4672 while (*buf) {
4673 chunk = end_chunk = simple_strtoul(buf, &end, 0);
4674 if (buf == end)
4675 break;
4676
4677 if (*end == '-') { /* range */
4678 buf = end + 1;
4679 end_chunk = simple_strtoul(buf, &end, 0);
4680 if (buf == end)
4681 break;
4682 }
4683
4684 if (*end && !isspace(*end))
4685 break;
4686
4687 mddev->bitmap_ops->dirty_bits(mddev, chunk, end_chunk);
4688 buf = skip_spaces(end);
4689 }
4690 mddev->bitmap_ops->unplug(mddev, true); /* flush the bits to disk */
4691 out:
4692 mddev_unlock(mddev);
4693 return len;
4694 }
4695
4696 static struct md_sysfs_entry md_bitmap =
4697 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
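
/*
 * Illustrative usage: writing "100" to bitmap_set_bits marks bitmap chunk
 * 100 dirty, and writing "100-200" marks the whole range of chunks 100..200
 * dirty; the values are chunk indexes, not sectors.
 */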
4698
4699 static ssize_t
size_show(struct mddev *mddev, char *page)
4701 {
4702 return sprintf(page, "%llu\n",
4703 (unsigned long long)mddev->dev_sectors / 2);
4704 }
4705
4706 static int update_size(struct mddev *mddev, sector_t num_sectors);
4707
4708 static ssize_t
size_store(struct mddev *mddev, const char *buf, size_t len)
4710 {
4711 /* If array is inactive, we can reduce the component size, but
4712 * not increase it (except from 0).
4713 * If array is active, we can try an on-line resize
4714 */
4715 sector_t sectors;
int err = strict_blocks_to_sectors(buf, &sectors);
4717
4718 if (err < 0)
4719 return err;
4720 err = mddev_lock(mddev);
4721 if (err)
4722 return err;
4723 if (mddev->pers) {
4724 err = update_size(mddev, sectors);
4725 if (err == 0)
4726 md_update_sb(mddev, 1);
4727 } else {
4728 if (mddev->dev_sectors == 0 ||
4729 mddev->dev_sectors > sectors)
4730 mddev->dev_sectors = sectors;
4731 else
4732 err = -ENOSPC;
4733 }
4734 mddev_unlock(mddev);
4735 return err ? err : len;
4736 }
4737
4738 static struct md_sysfs_entry md_size =
4739 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
4740
4741 /* Metadata version.
4742 * This is one of
4743 * 'none' for arrays with no metadata (good luck...)
4744 * 'external' for arrays with externally managed metadata,
4745 * or N.M for internally known formats
4746 */
4747 static ssize_t
metadata_show(struct mddev *mddev, char *page)
4749 {
4750 if (mddev->persistent)
4751 return sprintf(page, "%d.%d\n",
4752 mddev->major_version, mddev->minor_version);
4753 else if (mddev->external)
4754 return sprintf(page, "external:%s\n", mddev->metadata_type);
4755 else
4756 return sprintf(page, "none\n");
4757 }
4758
4759 static ssize_t
metadata_store(struct mddev *mddev, const char *buf, size_t len)
4761 {
4762 int major, minor;
4763 char *e;
4764 int err;
4765 /* Changing the details of 'external' metadata is
4766 * always permitted. Otherwise there must be
4767 * no devices attached to the array.
4768 */
4769
4770 err = mddev_lock(mddev);
4771 if (err)
4772 return err;
4773 err = -EBUSY;
4774 if (mddev->external && strncmp(buf, "external:", 9) == 0)
4775 ;
4776 else if (!list_empty(&mddev->disks))
4777 goto out_unlock;
4778
4779 err = 0;
4780 if (cmd_match(buf, "none")) {
4781 mddev->persistent = 0;
4782 mddev->external = 0;
4783 mddev->major_version = 0;
4784 mddev->minor_version = 90;
4785 goto out_unlock;
4786 }
4787 if (strncmp(buf, "external:", 9) == 0) {
4788 size_t namelen = len-9;
4789 if (namelen >= sizeof(mddev->metadata_type))
4790 namelen = sizeof(mddev->metadata_type)-1;
4791 memcpy(mddev->metadata_type, buf+9, namelen);
4792 mddev->metadata_type[namelen] = 0;
4793 if (namelen && mddev->metadata_type[namelen-1] == '\n')
4794 mddev->metadata_type[--namelen] = 0;
4795 mddev->persistent = 0;
4796 mddev->external = 1;
4797 mddev->major_version = 0;
4798 mddev->minor_version = 90;
4799 goto out_unlock;
4800 }
4801 major = simple_strtoul(buf, &e, 10);
4802 err = -EINVAL;
4803 if (e==buf || *e != '.')
4804 goto out_unlock;
4805 buf = e+1;
4806 minor = simple_strtoul(buf, &e, 10);
4807 if (e==buf || (*e && *e != '\n') )
4808 goto out_unlock;
4809 err = -ENOENT;
4810 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
4811 goto out_unlock;
4812 mddev->major_version = major;
4813 mddev->minor_version = minor;
4814 mddev->persistent = 1;
4815 mddev->external = 0;
4816 err = 0;
4817 out_unlock:
4818 mddev_unlock(mddev);
4819 return err ?: len;
4820 }
4821
4822 static struct md_sysfs_entry md_metadata =
4823 __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
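
/*
 * Examples of accepted values (illustrative): "1.2" selects internal v1.2
 * superblocks, "external:imsm" marks the metadata as externally managed
 * with the given type string, and "none" selects no persistent metadata.
 */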
4824
enum sync_action md_sync_action(struct mddev *mddev)
4826 {
4827 unsigned long recovery = mddev->recovery;
4828
4829 /*
 * frozen has the highest priority: a running sync_thread will be
 * stopped immediately, and no new sync_thread can start.
4832 */
4833 if (test_bit(MD_RECOVERY_FROZEN, &recovery))
4834 return ACTION_FROZEN;
4835
4836 /*
4837 * read-only array can't register sync_thread, and it can only
4838 * add/remove spares.
4839 */
4840 if (!md_is_rdwr(mddev))
4841 return ACTION_IDLE;
4842
4843 /*
4844 * idle means no sync_thread is running, and no new sync_thread is
4845 * requested.
4846 */
4847 if (!test_bit(MD_RECOVERY_RUNNING, &recovery) &&
4848 !test_bit(MD_RECOVERY_NEEDED, &recovery))
4849 return ACTION_IDLE;
4850
4851 if (test_bit(MD_RECOVERY_RESHAPE, &recovery) ||
4852 mddev->reshape_position != MaxSector)
4853 return ACTION_RESHAPE;
4854
4855 if (test_bit(MD_RECOVERY_RECOVER, &recovery))
4856 return ACTION_RECOVER;
4857
4858 if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
4859 /*
4860 * MD_RECOVERY_CHECK must be paired with
4861 * MD_RECOVERY_REQUESTED.
4862 */
4863 if (test_bit(MD_RECOVERY_CHECK, &recovery))
4864 return ACTION_CHECK;
4865 if (test_bit(MD_RECOVERY_REQUESTED, &recovery))
4866 return ACTION_REPAIR;
4867 return ACTION_RESYNC;
4868 }
4869
4870 /*
 * MD_RECOVERY_NEEDED or MD_RECOVERY_RUNNING is set, but no specific
 * sync_action is specified.
4873 */
4874 return ACTION_IDLE;
4875 }
4876
enum sync_action md_sync_action_by_name(const char *page)
4878 {
4879 enum sync_action action;
4880
4881 for (action = 0; action < NR_SYNC_ACTIONS; ++action) {
4882 if (cmd_match(page, action_name[action]))
4883 return action;
4884 }
4885
4886 return NR_SYNC_ACTIONS;
4887 }
4888
const char *md_sync_action_name(enum sync_action action)
4890 {
4891 return action_name[action];
4892 }
4893
4894 static ssize_t
action_show(struct mddev *mddev, char *page)
4896 {
4897 enum sync_action action = md_sync_action(mddev);
4898
4899 return sprintf(page, "%s\n", md_sync_action_name(action));
4900 }
4901
4902 /**
4903 * stop_sync_thread() - wait for sync_thread to stop if it's running.
4904 * @mddev: the array.
 * @locked: if set, reconfig_mutex will still be held after this function
 *          returns; if not set, reconfig_mutex will be released after this
 *          function returns.
4908 */
static void stop_sync_thread(struct mddev *mddev, bool locked)
4910 {
4911 int sync_seq = atomic_read(&mddev->sync_seq);
4912
4913 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
4914 if (!locked)
4915 mddev_unlock(mddev);
4916 return;
4917 }
4918
4919 mddev_unlock(mddev);
4920
4921 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4922 /*
4923 * Thread might be blocked waiting for metadata update which will now
4924 * never happen
4925 */
4926 md_wakeup_thread_directly(mddev->sync_thread);
4927 if (work_pending(&mddev->sync_work))
4928 flush_work(&mddev->sync_work);
4929
4930 wait_event(resync_wait,
4931 !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
4932 (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery) &&
4933 sync_seq != atomic_read(&mddev->sync_seq)));
4934
4935 if (locked)
4936 mddev_lock_nointr(mddev);
4937 }
4938
void md_idle_sync_thread(struct mddev *mddev)
4940 {
4941 lockdep_assert_held(&mddev->reconfig_mutex);
4942
4943 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4944 stop_sync_thread(mddev, true);
4945 }
4946 EXPORT_SYMBOL_GPL(md_idle_sync_thread);
4947
void md_frozen_sync_thread(struct mddev *mddev)
4949 {
4950 lockdep_assert_held(&mddev->reconfig_mutex);
4951
4952 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4953 stop_sync_thread(mddev, true);
4954 }
4955 EXPORT_SYMBOL_GPL(md_frozen_sync_thread);
4956
void md_unfrozen_sync_thread(struct mddev *mddev)
4958 {
4959 lockdep_assert_held(&mddev->reconfig_mutex);
4960
4961 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4962 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4963 md_wakeup_thread(mddev->thread);
4964 sysfs_notify_dirent_safe(mddev->sysfs_action);
4965 }
4966 EXPORT_SYMBOL_GPL(md_unfrozen_sync_thread);
4967
static int mddev_start_reshape(struct mddev *mddev)
4969 {
4970 int ret;
4971
4972 if (mddev->pers->start_reshape == NULL)
4973 return -EINVAL;
4974
4975 if (mddev->reshape_position == MaxSector ||
4976 mddev->pers->check_reshape == NULL ||
4977 mddev->pers->check_reshape(mddev)) {
4978 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4979 ret = mddev->pers->start_reshape(mddev);
4980 if (ret)
4981 return ret;
4982 } else {
4983 /*
4984 * If reshape is still in progress, and md_check_recovery() can
4985 * continue to reshape, don't restart reshape because data can
4986 * be corrupted for raid456.
4987 */
4988 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4989 }
4990
4991 sysfs_notify_dirent_safe(mddev->sysfs_degraded);
4992 return 0;
4993 }
4994
4995 static ssize_t
action_store(struct mddev *mddev, const char *page, size_t len)
4997 {
4998 int ret;
4999 enum sync_action action;
5000
5001 if (!mddev->pers || !mddev->pers->sync_request)
5002 return -EINVAL;
5003
5004 retry:
5005 if (work_busy(&mddev->sync_work))
5006 flush_work(&mddev->sync_work);
5007
5008 ret = mddev_lock(mddev);
5009 if (ret)
5010 return ret;
5011
5012 if (work_busy(&mddev->sync_work)) {
5013 mddev_unlock(mddev);
5014 goto retry;
5015 }
5016
5017 action = md_sync_action_by_name(page);
5018
/* TODO: mdadm relies on "idle" to start the sync_thread. */
5020 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
5021 switch (action) {
5022 case ACTION_FROZEN:
5023 md_frozen_sync_thread(mddev);
5024 ret = len;
5025 goto out;
5026 case ACTION_IDLE:
5027 md_idle_sync_thread(mddev);
5028 break;
5029 case ACTION_RESHAPE:
5030 case ACTION_RECOVER:
5031 case ACTION_CHECK:
5032 case ACTION_REPAIR:
5033 case ACTION_RESYNC:
5034 ret = -EBUSY;
5035 goto out;
5036 default:
5037 ret = -EINVAL;
5038 goto out;
5039 }
5040 } else {
5041 switch (action) {
5042 case ACTION_FROZEN:
5043 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5044 ret = len;
5045 goto out;
5046 case ACTION_RESHAPE:
5047 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5048 ret = mddev_start_reshape(mddev);
5049 if (ret)
5050 goto out;
5051 break;
5052 case ACTION_RECOVER:
5053 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5054 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5055 break;
5056 case ACTION_CHECK:
5057 set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
5058 fallthrough;
5059 case ACTION_REPAIR:
5060 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
5061 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5062 fallthrough;
5063 case ACTION_RESYNC:
5064 case ACTION_IDLE:
5065 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5066 break;
5067 default:
5068 ret = -EINVAL;
5069 goto out;
5070 }
5071 }
5072
5073 if (mddev->ro == MD_AUTO_READ) {
5074 /* A write to sync_action is enough to justify
5075 * canceling read-auto mode
5076 */
5077 mddev->ro = MD_RDWR;
5078 md_wakeup_thread(mddev->sync_thread);
5079 }
5080
5081 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5082 md_wakeup_thread(mddev->thread);
5083 sysfs_notify_dirent_safe(mddev->sysfs_action);
5084 ret = len;
5085
5086 out:
5087 mddev_unlock(mddev);
5088 return ret;
5089 }
5090
5091 static struct md_sysfs_entry md_scan_mode =
5092 __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
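
/*
 * Illustrative usage: writing "check" to sync_action starts a read-only
 * scrub whose mismatches are counted in mismatch_cnt, "repair" additionally
 * rewrites inconsistent stripes, and "idle" or "frozen" interrupts a running
 * sync_thread as handled above.
 */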
5093
5094 static ssize_t
last_sync_action_show(struct mddev *mddev, char *page)
5096 {
5097 return sprintf(page, "%s\n",
5098 md_sync_action_name(mddev->last_sync_action));
5099 }
5100
5101 static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action);
5102
5103 static ssize_t
mismatch_cnt_show(struct mddev *mddev, char *page)
5105 {
5106 return sprintf(page, "%llu\n",
5107 (unsigned long long)
5108 atomic64_read(&mddev->resync_mismatches));
5109 }
5110
5111 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
5112
5113 static ssize_t
sync_min_show(struct mddev *mddev, char *page)
5115 {
5116 return sprintf(page, "%d (%s)\n", speed_min(mddev),
5117 mddev->sync_speed_min ? "local" : "system");
5118 }
5119
5120 static ssize_t
sync_min_store(struct mddev *mddev, const char *buf, size_t len)
5122 {
5123 unsigned int min;
5124 int rv;
5125
5126 if (strncmp(buf, "system", 6) == 0) {
5127 min = 0;
5128 } else {
5129 rv = kstrtouint(buf, 10, &min);
5130 if (rv < 0)
5131 return rv;
5132 if (min == 0)
5133 return -EINVAL;
5134 }
5135 mddev->sync_speed_min = min;
5136 return len;
5137 }
5138
5139 static struct md_sysfs_entry md_sync_min =
5140 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
5141
5142 static ssize_t
sync_max_show(struct mddev *mddev, char *page)
5144 {
5145 return sprintf(page, "%d (%s)\n", speed_max(mddev),
5146 mddev->sync_speed_max ? "local" : "system");
5147 }
5148
5149 static ssize_t
sync_max_store(struct mddev *mddev, const char *buf, size_t len)
5151 {
5152 unsigned int max;
5153 int rv;
5154
5155 if (strncmp(buf, "system", 6) == 0) {
5156 max = 0;
5157 } else {
5158 rv = kstrtouint(buf, 10, &max);
5159 if (rv < 0)
5160 return rv;
5161 if (max == 0)
5162 return -EINVAL;
5163 }
5164 mddev->sync_speed_max = max;
5165 return len;
5166 }
5167
5168 static struct md_sysfs_entry md_sync_max =
5169 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
5170
5171 static ssize_t
sync_io_depth_show(struct mddev *mddev, char *page)
5173 {
5174 return sprintf(page, "%d (%s)\n", sync_io_depth(mddev),
5175 mddev->sync_io_depth ? "local" : "system");
5176 }
5177
5178 static ssize_t
sync_io_depth_store(struct mddev *mddev, const char *buf, size_t len)
5180 {
5181 unsigned int max;
5182 int rv;
5183
5184 if (strncmp(buf, "system", 6) == 0) {
5185 max = 0;
5186 } else {
5187 rv = kstrtouint(buf, 10, &max);
5188 if (rv < 0)
5189 return rv;
5190 if (max == 0)
5191 return -EINVAL;
5192 }
5193 mddev->sync_io_depth = max;
5194 return len;
5195 }
5196
5197 static struct md_sysfs_entry md_sync_io_depth =
5198 __ATTR_RW(sync_io_depth);
5199
5200 static ssize_t
degraded_show(struct mddev *mddev, char *page)
5202 {
5203 return sprintf(page, "%d\n", mddev->degraded);
5204 }
5205 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
5206
5207 static ssize_t
sync_force_parallel_show(struct mddev *mddev, char *page)
5209 {
5210 return sprintf(page, "%d\n", mddev->parallel_resync);
5211 }
5212
5213 static ssize_t
sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
5215 {
5216 long n;
5217
5218 if (kstrtol(buf, 10, &n))
5219 return -EINVAL;
5220
5221 if (n != 0 && n != 1)
5222 return -EINVAL;
5223
5224 mddev->parallel_resync = n;
5225
5226 if (mddev->sync_thread)
5227 wake_up(&resync_wait);
5228
5229 return len;
5230 }
5231
5232 /* force parallel resync, even with shared block devices */
5233 static struct md_sysfs_entry md_sync_force_parallel =
5234 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
5235 sync_force_parallel_show, sync_force_parallel_store);
5236
5237 static ssize_t
sync_speed_show(struct mddev *mddev, char *page)
5239 {
5240 unsigned long resync, dt, db;
5241 if (mddev->curr_resync == MD_RESYNC_NONE)
5242 return sprintf(page, "none\n");
5243 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
5244 dt = (jiffies - mddev->resync_mark) / HZ;
5245 if (!dt) dt++;
5246 db = resync - mddev->resync_mark_cnt;
5247 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
5248 }
5249
5250 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
5251
5252 static ssize_t
sync_completed_show(struct mddev *mddev, char *page)
5254 {
5255 unsigned long long max_sectors, resync;
5256
5257 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5258 return sprintf(page, "none\n");
5259
5260 if (mddev->curr_resync == MD_RESYNC_YIELDED ||
5261 mddev->curr_resync == MD_RESYNC_DELAYED)
5262 return sprintf(page, "delayed\n");
5263
5264 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
5265 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
5266 max_sectors = mddev->resync_max_sectors;
5267 else
5268 max_sectors = mddev->dev_sectors;
5269
5270 resync = mddev->curr_resync_completed;
5271 return sprintf(page, "%llu / %llu\n", resync, max_sectors);
5272 }
5273
5274 static struct md_sysfs_entry md_sync_completed =
5275 __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL);
5276
5277 static ssize_t
min_sync_show(struct mddev *mddev, char *page)
5279 {
5280 return sprintf(page, "%llu\n",
5281 (unsigned long long)mddev->resync_min);
5282 }
5283 static ssize_t
min_sync_store(struct mddev *mddev, const char *buf, size_t len)
5285 {
5286 unsigned long long min;
5287 int err;
5288
5289 if (kstrtoull(buf, 10, &min))
5290 return -EINVAL;
5291
5292 spin_lock(&mddev->lock);
5293 err = -EINVAL;
5294 if (min > mddev->resync_max)
5295 goto out_unlock;
5296
5297 err = -EBUSY;
5298 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5299 goto out_unlock;
5300
5301 /* Round down to multiple of 4K for safety */
5302 mddev->resync_min = round_down(min, 8);
5303 err = 0;
5304
5305 out_unlock:
5306 spin_unlock(&mddev->lock);
5307 return err ?: len;
5308 }
5309
5310 static struct md_sysfs_entry md_min_sync =
5311 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
5312
5313 static ssize_t
max_sync_show(struct mddev *mddev, char *page)
5315 {
5316 if (mddev->resync_max == MaxSector)
5317 return sprintf(page, "max\n");
5318 else
5319 return sprintf(page, "%llu\n",
5320 (unsigned long long)mddev->resync_max);
5321 }
5322 static ssize_t
max_sync_store(struct mddev *mddev, const char *buf, size_t len)
5324 {
5325 int err;
5326 spin_lock(&mddev->lock);
5327 if (strncmp(buf, "max", 3) == 0)
5328 mddev->resync_max = MaxSector;
5329 else {
5330 unsigned long long max;
5331 int chunk;
5332
5333 err = -EINVAL;
5334 if (kstrtoull(buf, 10, &max))
5335 goto out_unlock;
5336 if (max < mddev->resync_min)
5337 goto out_unlock;
5338
5339 err = -EBUSY;
5340 if (max < mddev->resync_max && md_is_rdwr(mddev) &&
5341 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5342 goto out_unlock;
5343
5344 /* Must be a multiple of chunk_size */
5345 chunk = mddev->chunk_sectors;
5346 if (chunk) {
5347 sector_t temp = max;
5348
5349 err = -EINVAL;
5350 if (sector_div(temp, chunk))
5351 goto out_unlock;
5352 }
5353 mddev->resync_max = max;
5354 }
5355 wake_up(&mddev->recovery_wait);
5356 err = 0;
5357 out_unlock:
5358 spin_unlock(&mddev->lock);
5359 return err ?: len;
5360 }
5361
5362 static struct md_sysfs_entry md_max_sync =
5363 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
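
/*
 * Illustrative usage: sync_min and sync_max bound the resync window in
 * sectors. For example "echo max > sync_max" removes the upper bound, while
 * writing a sector count (a multiple of the chunk size, when one is set)
 * limits how far resync may proceed.
 */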
5364
5365 static ssize_t
suspend_lo_show(struct mddev *mddev, char *page)
5367 {
5368 return sprintf(page, "%llu\n",
5369 (unsigned long long)READ_ONCE(mddev->suspend_lo));
5370 }
5371
5372 static ssize_t
suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
5374 {
5375 unsigned long long new;
5376 int err;
5377
5378 err = kstrtoull(buf, 10, &new);
5379 if (err < 0)
5380 return err;
5381 if (new != (sector_t)new)
5382 return -EINVAL;
5383
5384 err = mddev_suspend(mddev, true);
5385 if (err)
5386 return err;
5387
5388 WRITE_ONCE(mddev->suspend_lo, new);
5389 mddev_resume(mddev);
5390
5391 return len;
5392 }
5393 static struct md_sysfs_entry md_suspend_lo =
5394 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
5395
5396 static ssize_t
suspend_hi_show(struct mddev *mddev, char *page)
5398 {
5399 return sprintf(page, "%llu\n",
5400 (unsigned long long)READ_ONCE(mddev->suspend_hi));
5401 }
5402
5403 static ssize_t
suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
5405 {
5406 unsigned long long new;
5407 int err;
5408
5409 err = kstrtoull(buf, 10, &new);
5410 if (err < 0)
5411 return err;
5412 if (new != (sector_t)new)
5413 return -EINVAL;
5414
5415 err = mddev_suspend(mddev, true);
5416 if (err)
5417 return err;
5418
5419 WRITE_ONCE(mddev->suspend_hi, new);
5420 mddev_resume(mddev);
5421
5422 return len;
5423 }
5424 static struct md_sysfs_entry md_suspend_hi =
5425 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
5426
5427 static ssize_t
reshape_position_show(struct mddev *mddev, char *page)
5429 {
5430 if (mddev->reshape_position != MaxSector)
5431 return sprintf(page, "%llu\n",
5432 (unsigned long long)mddev->reshape_position);
5433 strcpy(page, "none\n");
5434 return 5;
5435 }
5436
5437 static ssize_t
reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
5439 {
5440 struct md_rdev *rdev;
5441 unsigned long long new;
5442 int err;
5443
5444 err = kstrtoull(buf, 10, &new);
5445 if (err < 0)
5446 return err;
5447 if (new != (sector_t)new)
5448 return -EINVAL;
5449 err = mddev_lock(mddev);
5450 if (err)
5451 return err;
5452 err = -EBUSY;
5453 if (mddev->pers)
5454 goto unlock;
5455 mddev->reshape_position = new;
5456 mddev->delta_disks = 0;
5457 mddev->reshape_backwards = 0;
5458 mddev->new_level = mddev->level;
5459 mddev->new_layout = mddev->layout;
5460 mddev->new_chunk_sectors = mddev->chunk_sectors;
5461 rdev_for_each(rdev, mddev)
5462 rdev->new_data_offset = rdev->data_offset;
5463 err = 0;
5464 unlock:
5465 mddev_unlock(mddev);
5466 return err ?: len;
5467 }
5468
5469 static struct md_sysfs_entry md_reshape_position =
5470 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
5471 reshape_position_store);
5472
5473 static ssize_t
5474 reshape_direction_show(struct mddev *mddev, char *page)
5475 {
5476 return sprintf(page, "%s\n",
5477 mddev->reshape_backwards ? "backwards" : "forwards");
5478 }
5479
5480 static ssize_t
5481 reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
5482 {
5483 int backwards = 0;
5484 int err;
5485
5486 if (cmd_match(buf, "forwards"))
5487 backwards = 0;
5488 else if (cmd_match(buf, "backwards"))
5489 backwards = 1;
5490 else
5491 return -EINVAL;
5492 if (mddev->reshape_backwards == backwards)
5493 return len;
5494
5495 err = mddev_lock(mddev);
5496 if (err)
5497 return err;
5498 /* check if we are allowed to change */
5499 if (mddev->delta_disks)
5500 err = -EBUSY;
5501 else if (mddev->persistent &&
5502 mddev->major_version == 0)
5503 err = -EINVAL;
5504 else
5505 mddev->reshape_backwards = backwards;
5506 mddev_unlock(mddev);
5507 return err ?: len;
5508 }
5509
5510 static struct md_sysfs_entry md_reshape_direction =
5511 __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
5512 reshape_direction_store);
5513
5514 static ssize_t
5515 array_size_show(struct mddev *mddev, char *page)
5516 {
5517 if (mddev->external_size)
5518 return sprintf(page, "%llu\n",
5519 (unsigned long long)mddev->array_sectors/2);
5520 else
5521 return sprintf(page, "default\n");
5522 }
5523
5524 static ssize_t
5525 array_size_store(struct mddev *mddev, const char *buf, size_t len)
5526 {
5527 sector_t sectors;
5528 int err;
5529
5530 err = mddev_lock(mddev);
5531 if (err)
5532 return err;
5533
5534 /* clustered raid doesn't support changing array_sectors */
5535 if (mddev_is_clustered(mddev)) {
5536 mddev_unlock(mddev);
5537 return -EINVAL;
5538 }
5539
5540 if (strncmp(buf, "default", 7) == 0) {
5541 if (mddev->pers)
5542 sectors = mddev->pers->size(mddev, 0, 0);
5543 else
5544 sectors = mddev->array_sectors;
5545
5546 mddev->external_size = 0;
5547 } else {
5548 if (strict_blocks_to_sectors(buf, &sectors) < 0)
5549 err = -EINVAL;
5550 else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
5551 err = -E2BIG;
5552 else
5553 mddev->external_size = 1;
5554 }
5555
5556 if (!err) {
5557 mddev->array_sectors = sectors;
5558 if (mddev->pers)
5559 set_capacity_and_notify(mddev->gendisk,
5560 mddev->array_sectors);
5561 }
5562 mddev_unlock(mddev);
5563 return err ?: len;
5564 }
5565
5566 static struct md_sysfs_entry md_array_size =
5567 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
5568 array_size_store);
5569
5570 static ssize_t
5571 consistency_policy_show(struct mddev *mddev, char *page)
5572 {
5573 int ret;
5574
5575 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
5576 ret = sprintf(page, "journal\n");
5577 } else if (test_bit(MD_HAS_PPL, &mddev->flags)) {
5578 ret = sprintf(page, "ppl\n");
5579 } else if (mddev->bitmap) {
5580 ret = sprintf(page, "bitmap\n");
5581 } else if (mddev->pers) {
5582 if (mddev->pers->sync_request)
5583 ret = sprintf(page, "resync\n");
5584 else
5585 ret = sprintf(page, "none\n");
5586 } else {
5587 ret = sprintf(page, "unknown\n");
5588 }
5589
5590 return ret;
5591 }
5592
5593 static ssize_t
5594 consistency_policy_store(struct mddev *mddev, const char *buf, size_t len)
5595 {
5596 int err = 0;
5597
5598 if (mddev->pers) {
5599 if (mddev->pers->change_consistency_policy)
5600 err = mddev->pers->change_consistency_policy(mddev, buf);
5601 else
5602 err = -EBUSY;
5603 } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) {
5604 set_bit(MD_HAS_PPL, &mddev->flags);
5605 } else {
5606 err = -EINVAL;
5607 }
5608
5609 return err ? err : len;
5610 }
5611
5612 static struct md_sysfs_entry md_consistency_policy =
5613 __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
5614 consistency_policy_store);
5615
5616 static ssize_t fail_last_dev_show(struct mddev *mddev, char *page)
5617 {
5618 return sprintf(page, "%d\n", mddev->fail_last_dev);
5619 }
5620
5621 /*
5622 * Set fail_last_dev to true to allow the last device to be forcibly removed
5623 * from RAID1/RAID10.
5624 */
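/*
 * Illustrative sysfs usage (the array name md0 is an example only):
 *
 *	echo 1 > /sys/block/md0/md/fail_last_dev
 *
 * fail_last_dev_store() accepts any value understood by kstrtobool(),
 * e.g. "1"/"0", "y"/"n" or "on"/"off".
 */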
5625 static ssize_t
5626 fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len)
5627 {
5628 int ret;
5629 bool value;
5630
5631 ret = kstrtobool(buf, &value);
5632 if (ret)
5633 return ret;
5634
5635 if (value != mddev->fail_last_dev)
5636 mddev->fail_last_dev = value;
5637
5638 return len;
5639 }
5640 static struct md_sysfs_entry md_fail_last_dev =
5641 __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show,
5642 fail_last_dev_store);
5643
5644 static ssize_t serialize_policy_show(struct mddev *mddev, char *page)
5645 {
5646 if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1))
5647 return sprintf(page, "n/a\n");
5648 else
5649 return sprintf(page, "%d\n", mddev->serialize_policy);
5650 }
5651
5652 /*
5653 * Set serialize_policy to true to ensure that write I/O is not reordered
5654 * for raid1.
5655 */
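/*
 * Illustrative sysfs usage (the array name md0 is an example only):
 *
 *	echo 1 > /sys/block/md0/md/serialize_policy
 *
 * serialize_policy_store() below suspends the array, creates or destroys
 * the serial pool accordingly, and rejects the write with -EINVAL unless
 * the active personality is raid1.
 */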
5656 static ssize_t
5657 serialize_policy_store(struct mddev *mddev, const char *buf, size_t len)
5658 {
5659 int err;
5660 bool value;
5661
5662 err = kstrtobool(buf, &value);
5663 if (err)
5664 return err;
5665
5666 if (value == mddev->serialize_policy)
5667 return len;
5668
5669 err = mddev_suspend_and_lock(mddev);
5670 if (err)
5671 return err;
5672 if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) {
5673 pr_err("md: serialize_policy is only effective for raid1\n");
5674 err = -EINVAL;
5675 goto unlock;
5676 }
5677
5678 if (value)
5679 mddev_create_serial_pool(mddev, NULL);
5680 else
5681 mddev_destroy_serial_pool(mddev, NULL);
5682 mddev->serialize_policy = value;
5683 unlock:
5684 mddev_unlock_and_resume(mddev);
5685 return err ?: len;
5686 }
5687
5688 static struct md_sysfs_entry md_serialize_policy =
5689 __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show,
5690 serialize_policy_store);
5691
5692
5693 static struct attribute *md_default_attrs[] = {
5694 &md_level.attr,
5695 &md_new_level.attr,
5696 &md_layout.attr,
5697 &md_raid_disks.attr,
5698 &md_uuid.attr,
5699 &md_chunk_size.attr,
5700 &md_size.attr,
5701 &md_resync_start.attr,
5702 &md_metadata.attr,
5703 &md_new_device.attr,
5704 &md_safe_delay.attr,
5705 &md_array_state.attr,
5706 &md_reshape_position.attr,
5707 &md_reshape_direction.attr,
5708 &md_array_size.attr,
5709 &max_corr_read_errors.attr,
5710 &md_consistency_policy.attr,
5711 &md_fail_last_dev.attr,
5712 &md_serialize_policy.attr,
5713 NULL,
5714 };
5715
5716 static const struct attribute_group md_default_group = {
5717 .attrs = md_default_attrs,
5718 };
5719
5720 static struct attribute *md_redundancy_attrs[] = {
5721 &md_scan_mode.attr,
5722 &md_last_scan_mode.attr,
5723 &md_mismatches.attr,
5724 &md_sync_min.attr,
5725 &md_sync_max.attr,
5726 &md_sync_io_depth.attr,
5727 &md_sync_speed.attr,
5728 &md_sync_force_parallel.attr,
5729 &md_sync_completed.attr,
5730 &md_min_sync.attr,
5731 &md_max_sync.attr,
5732 &md_suspend_lo.attr,
5733 &md_suspend_hi.attr,
5734 &md_bitmap.attr,
5735 &md_degraded.attr,
5736 NULL,
5737 };
5738 static const struct attribute_group md_redundancy_group = {
5739 .name = NULL,
5740 .attrs = md_redundancy_attrs,
5741 };
5742
5743 static const struct attribute_group *md_attr_groups[] = {
5744 &md_default_group,
5745 &md_bitmap_group,
5746 NULL,
5747 };
5748
5749 static ssize_t
5750 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
5751 {
5752 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
5753 struct mddev *mddev = container_of(kobj, struct mddev, kobj);
5754 ssize_t rv;
5755
5756 if (!entry->show)
5757 return -EIO;
5758 spin_lock(&all_mddevs_lock);
5759 if (!mddev_get(mddev)) {
5760 spin_unlock(&all_mddevs_lock);
5761 return -EBUSY;
5762 }
5763 spin_unlock(&all_mddevs_lock);
5764
5765 rv = entry->show(mddev, page);
5766 mddev_put(mddev);
5767 return rv;
5768 }
5769
5770 static ssize_t
5771 md_attr_store(struct kobject *kobj, struct attribute *attr,
5772 const char *page, size_t length)
5773 {
5774 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
5775 struct mddev *mddev = container_of(kobj, struct mddev, kobj);
5776 ssize_t rv;
5777
5778 if (!entry->store)
5779 return -EIO;
5780 if (!capable(CAP_SYS_ADMIN))
5781 return -EACCES;
5782 spin_lock(&all_mddevs_lock);
5783 if (!mddev_get(mddev)) {
5784 spin_unlock(&all_mddevs_lock);
5785 return -EBUSY;
5786 }
5787 spin_unlock(&all_mddevs_lock);
5788 rv = entry->store(mddev, page, length);
5789 mddev_put(mddev);
5790 return rv;
5791 }
5792
5793 static void md_kobj_release(struct kobject *ko)
5794 {
5795 struct mddev *mddev = container_of(ko, struct mddev, kobj);
5796
5797 if (mddev->sysfs_state)
5798 sysfs_put(mddev->sysfs_state);
5799 if (mddev->sysfs_level)
5800 sysfs_put(mddev->sysfs_level);
5801
5802 del_gendisk(mddev->gendisk);
5803 put_disk(mddev->gendisk);
5804 }
5805
5806 static const struct sysfs_ops md_sysfs_ops = {
5807 .show = md_attr_show,
5808 .store = md_attr_store,
5809 };
5810 static const struct kobj_type md_ktype = {
5811 .release = md_kobj_release,
5812 .sysfs_ops = &md_sysfs_ops,
5813 .default_groups = md_attr_groups,
5814 };
5815
5816 int mdp_major = 0;
5817
5818 /* stack the limit for all rdevs into lim */
5819 int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim,
5820 unsigned int flags)
5821 {
5822 struct md_rdev *rdev;
5823
5824 rdev_for_each(rdev, mddev) {
5825 queue_limits_stack_bdev(lim, rdev->bdev, rdev->data_offset,
5826 mddev->gendisk->disk_name);
5827 if ((flags & MDDEV_STACK_INTEGRITY) &&
5828 !queue_limits_stack_integrity_bdev(lim, rdev->bdev))
5829 return -EINVAL;
5830 }
5831
5832 return 0;
5833 }
5834 EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits);
5835
5836 /* apply the extra stacking limits from a new rdev into mddev */
5837 int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev)
5838 {
5839 struct queue_limits lim;
5840
5841 if (mddev_is_dm(mddev))
5842 return 0;
5843
5844 lim = queue_limits_start_update(mddev->gendisk->queue);
5845 queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset,
5846 mddev->gendisk->disk_name);
5847
5848 if (!queue_limits_stack_integrity_bdev(&lim, rdev->bdev)) {
5849 pr_err("%s: incompatible integrity profile for %pg\n",
5850 mdname(mddev), rdev->bdev);
5851 queue_limits_cancel_update(mddev->gendisk->queue);
5852 return -ENXIO;
5853 }
5854
5855 return queue_limits_commit_update(mddev->gendisk->queue, &lim);
5856 }
5857 EXPORT_SYMBOL_GPL(mddev_stack_new_rdev);
5858
5859 /* update the optimal I/O size after a reshape */
5860 void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes)
5861 {
5862 struct queue_limits lim;
5863
5864 if (mddev_is_dm(mddev))
5865 return;
5866
5867 /* don't bother updating io_opt if we can't suspend the array */
5868 if (mddev_suspend(mddev, false) < 0)
5869 return;
5870 lim = queue_limits_start_update(mddev->gendisk->queue);
5871 lim.io_opt = lim.io_min * nr_stripes;
5872 queue_limits_commit_update(mddev->gendisk->queue, &lim);
5873 mddev_resume(mddev);
5874 }
5875 EXPORT_SYMBOL_GPL(mddev_update_io_opt);
5876
5877 static void mddev_delayed_delete(struct work_struct *ws)
5878 {
5879 struct mddev *mddev = container_of(ws, struct mddev, del_work);
5880
5881 kobject_put(&mddev->kobj);
5882 }
5883
5884 void md_init_stacking_limits(struct queue_limits *lim)
5885 {
5886 blk_set_stacking_limits(lim);
5887 lim->features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA |
5888 BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT;
5889 }
5890 EXPORT_SYMBOL_GPL(md_init_stacking_limits);
5891
5892 struct mddev *md_alloc(dev_t dev, char *name)
5893 {
5894 /*
5895 * If dev is zero, name is the name of a device to allocate with
5896 * an arbitrary minor number. It will be "md_???"
5897 * If dev is non-zero it must be a device number with a MAJOR of
5898 * MD_MAJOR or mdp_major. In this case, if "name" is NULL, then
5899 * the device is being created by opening a node in /dev.
5900 * If "name" is not NULL, the device is being created by
5901 * writing to /sys/module/md_mod/parameters/new_array.
5902 */
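	/*
	 * Illustrative callers, not part of this function: opening /dev/md3
	 * with create_on_open enabled reaches md_alloc() through md_probe()
	 * with dev == MKDEV(MD_MAJOR, 3) and name == NULL, while
	 *
	 *	echo md_home > /sys/module/md_mod/parameters/new_array
	 *
	 * (the name md_home is an example) reaches it through
	 * add_named_array() with dev == 0 and name == "md_home".
	 */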
5903 static DEFINE_MUTEX(disks_mutex);
5904 struct mddev *mddev;
5905 struct gendisk *disk;
5906 int partitioned;
5907 int shift;
5908 int unit;
5909 int error;
5910
5911 /*
5912 * Wait for any previous instance of this device to be completely
5913 * removed (mddev_delayed_delete).
5914 */
5915 flush_workqueue(md_misc_wq);
5916
5917 mutex_lock(&disks_mutex);
5918 mddev = mddev_alloc(dev);
5919 if (IS_ERR(mddev)) {
5920 error = PTR_ERR(mddev);
5921 goto out_unlock;
5922 }
5923
5924 partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
5925 shift = partitioned ? MdpMinorShift : 0;
5926 unit = MINOR(mddev->unit) >> shift;
5927
5928 if (name && !dev) {
5929 /* Need to ensure that 'name' is not a duplicate.
5930 */
5931 struct mddev *mddev2;
5932 spin_lock(&all_mddevs_lock);
5933
5934 list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
5935 if (mddev2->gendisk &&
5936 strcmp(mddev2->gendisk->disk_name, name) == 0) {
5937 spin_unlock(&all_mddevs_lock);
5938 error = -EEXIST;
5939 goto out_free_mddev;
5940 }
5941 spin_unlock(&all_mddevs_lock);
5942 }
5943 if (name && dev)
5944 /*
5945 * Creating /dev/mdNNN via "newarray", so adjust hold_active.
5946 */
5947 mddev->hold_active = UNTIL_STOP;
5948
5949 disk = blk_alloc_disk(NULL, NUMA_NO_NODE);
5950 if (IS_ERR(disk)) {
5951 error = PTR_ERR(disk);
5952 goto out_free_mddev;
5953 }
5954
5955 disk->major = MAJOR(mddev->unit);
5956 disk->first_minor = unit << shift;
5957 disk->minors = 1 << shift;
5958 if (name)
5959 strcpy(disk->disk_name, name);
5960 else if (partitioned)
5961 sprintf(disk->disk_name, "md_d%d", unit);
5962 else
5963 sprintf(disk->disk_name, "md%d", unit);
5964 disk->fops = &md_fops;
5965 disk->private_data = mddev;
5966
5967 disk->events |= DISK_EVENT_MEDIA_CHANGE;
5968 mddev->gendisk = disk;
5969 error = add_disk(disk);
5970 if (error)
5971 goto out_put_disk;
5972
5973 kobject_init(&mddev->kobj, &md_ktype);
5974 error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md");
5975 if (error) {
5976 /*
5977 * The disk is already live at this point. Clear the hold flag
5978 * and let mddev_put take care of the deletion, as it isn't any
5979 * different from a normal close on last release now.
5980 */
5981 mddev->hold_active = 0;
5982 mutex_unlock(&disks_mutex);
5983 mddev_put(mddev);
5984 return ERR_PTR(error);
5985 }
5986
5987 kobject_uevent(&mddev->kobj, KOBJ_ADD);
5988 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
5989 mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level");
5990 mutex_unlock(&disks_mutex);
5991 return mddev;
5992
5993 out_put_disk:
5994 put_disk(disk);
5995 out_free_mddev:
5996 mddev_free(mddev);
5997 out_unlock:
5998 mutex_unlock(&disks_mutex);
5999 return ERR_PTR(error);
6000 }
6001
6002 static int md_alloc_and_put(dev_t dev, char *name)
6003 {
6004 struct mddev *mddev = md_alloc(dev, name);
6005
6006 if (IS_ERR(mddev))
6007 return PTR_ERR(mddev);
6008 mddev_put(mddev);
6009 return 0;
6010 }
6011
6012 static void md_probe(dev_t dev)
6013 {
6014 if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512)
6015 return;
6016 if (create_on_open)
6017 md_alloc_and_put(dev, NULL);
6018 }
6019
6020 static int add_named_array(const char *val, const struct kernel_param *kp)
6021 {
6022 /*
6023 * val must be "md_*" or "mdNNN".
6024 * For "md_*" we allocate an array with a large free minor number, and
6025 * set the name to val. val must not already be an active name.
6026 * For "mdNNN" we allocate an array with the minor number NNN
6027 * which must not already be in use.
6028 */
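	/*
	 * Illustrative parameter writes (the names are examples only):
	 *
	 *	echo md_backup > /sys/module/md_mod/parameters/new_array
	 *		-> md_alloc_and_put(0, "md_backup")
	 *	echo md127 > /sys/module/md_mod/parameters/new_array
	 *		-> md_alloc_and_put(MKDEV(MD_MAJOR, 127), NULL)
	 *
	 * Anything else, or a minor number above MINORMASK, is rejected
	 * with -EINVAL below.
	 */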
6029 int len = strlen(val);
6030 char buf[DISK_NAME_LEN];
6031 unsigned long devnum;
6032
6033 while (len && val[len-1] == '\n')
6034 len--;
6035 if (len >= DISK_NAME_LEN)
6036 return -E2BIG;
6037 strscpy(buf, val, len+1);
6038 if (strncmp(buf, "md_", 3) == 0)
6039 return md_alloc_and_put(0, buf);
6040 if (strncmp(buf, "md", 2) == 0 &&
6041 isdigit(buf[2]) &&
6042 kstrtoul(buf+2, 10, &devnum) == 0 &&
6043 devnum <= MINORMASK)
6044 return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL);
6045
6046 return -EINVAL;
6047 }
6048
6049 static void md_safemode_timeout(struct timer_list *t)
6050 {
6051 struct mddev *mddev = timer_container_of(mddev, t, safemode_timer);
6052
6053 mddev->safemode = 1;
6054 if (mddev->external)
6055 sysfs_notify_dirent_safe(mddev->sysfs_state);
6056
6057 md_wakeup_thread(mddev->thread);
6058 }
6059
6060 static int start_dirty_degraded;
6061
6062 int md_run(struct mddev *mddev)
6063 {
6064 int err;
6065 struct md_rdev *rdev;
6066 struct md_personality *pers;
6067 bool nowait = true;
6068
6069 if (list_empty(&mddev->disks))
6070 /* cannot run an array with no devices.. */
6071 return -EINVAL;
6072
6073 if (mddev->pers)
6074 return -EBUSY;
6075 /* Cannot run until previous stop completes properly */
6076 if (mddev->sysfs_active)
6077 return -EBUSY;
6078
6079 /*
6080 * Analyze all RAID superblock(s)
6081 */
6082 if (!mddev->raid_disks) {
6083 if (!mddev->persistent)
6084 return -EINVAL;
6085 err = analyze_sbs(mddev);
6086 if (err)
6087 return -EINVAL;
6088 }
6089
6090 if (mddev->level != LEVEL_NONE)
6091 request_module("md-level-%d", mddev->level);
6092 else if (mddev->clevel[0])
6093 request_module("md-%s", mddev->clevel);
6094
6095 /*
6096 * Drop all container device buffers, from now on
6097 * the only valid external interface is through the md
6098 * device.
6099 */
6100 mddev->has_superblocks = false;
6101 rdev_for_each(rdev, mddev) {
6102 if (test_bit(Faulty, &rdev->flags))
6103 continue;
6104 sync_blockdev(rdev->bdev);
6105 invalidate_bdev(rdev->bdev);
6106 if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) {
6107 mddev->ro = MD_RDONLY;
6108 if (!mddev_is_dm(mddev))
6109 set_disk_ro(mddev->gendisk, 1);
6110 }
6111
6112 if (rdev->sb_page)
6113 mddev->has_superblocks = true;
6114
6115 /* perform some consistency tests on the device.
6116 * We don't want the data to overlap the metadata;
6117 * internal bitmap issues have been handled elsewhere.
6118 */
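		/*
		 * Illustrative example: a 0.90 superblock sits in the last
		 * 64K of the device, so data_offset < sb_start and the first
		 * branch verifies that data_offset + dev_sectors does not run
		 * past sb_start; 1.1/1.2 superblocks sit at or near the start
		 * of the device, so the second branch verifies the opposite
		 * ordering.
		 */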
6119 if (rdev->meta_bdev) {
6120 /* Nothing to check */;
6121 } else if (rdev->data_offset < rdev->sb_start) {
6122 if (mddev->dev_sectors &&
6123 rdev->data_offset + mddev->dev_sectors
6124 > rdev->sb_start) {
6125 pr_warn("md: %s: data overlaps metadata\n",
6126 mdname(mddev));
6127 return -EINVAL;
6128 }
6129 } else {
6130 if (rdev->sb_start + rdev->sb_size/512
6131 > rdev->data_offset) {
6132 pr_warn("md: %s: metadata overlaps data\n",
6133 mdname(mddev));
6134 return -EINVAL;
6135 }
6136 }
6137 sysfs_notify_dirent_safe(rdev->sysfs_state);
6138 nowait = nowait && bdev_nowait(rdev->bdev);
6139 }
6140
6141 if (!bioset_initialized(&mddev->bio_set)) {
6142 err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
6143 if (err)
6144 return err;
6145 }
6146 if (!bioset_initialized(&mddev->sync_set)) {
6147 err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
6148 if (err)
6149 goto exit_bio_set;
6150 }
6151
6152 if (!bioset_initialized(&mddev->io_clone_set)) {
6153 err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE,
6154 offsetof(struct md_io_clone, bio_clone), 0);
6155 if (err)
6156 goto exit_sync_set;
6157 }
6158
6159 pers = get_pers(mddev->level, mddev->clevel);
6160 if (!pers) {
6161 err = -EINVAL;
6162 goto abort;
6163 }
6164 if (mddev->level != pers->head.id) {
6165 mddev->level = pers->head.id;
6166 mddev->new_level = pers->head.id;
6167 }
6168 strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel));
6169
6170 if (mddev->reshape_position != MaxSector &&
6171 pers->start_reshape == NULL) {
6172 /* This personality cannot handle reshaping... */
6173 put_pers(pers);
6174 err = -EINVAL;
6175 goto abort;
6176 }
6177
6178 if (pers->sync_request) {
6179 /* Warn if this is a potentially silly
6180 * configuration.
6181 */
6182 struct md_rdev *rdev2;
6183 int warned = 0;
6184
6185 rdev_for_each(rdev, mddev)
6186 rdev_for_each(rdev2, mddev) {
6187 if (rdev < rdev2 &&
6188 rdev->bdev->bd_disk ==
6189 rdev2->bdev->bd_disk) {
6190 pr_warn("%s: WARNING: %pg appears to be on the same physical disk as %pg.\n",
6191 mdname(mddev),
6192 rdev->bdev,
6193 rdev2->bdev);
6194 warned = 1;
6195 }
6196 }
6197
6198 if (warned)
6199 pr_warn("True protection against single-disk failure might be compromised.\n");
6200 }
6201
6202 /* dm-raid expects sync_thread to be frozen until resume */
6203 if (mddev->gendisk)
6204 mddev->recovery = 0;
6205
6206 /* may be over-ridden by personality */
6207 mddev->resync_max_sectors = mddev->dev_sectors;
6208
6209 mddev->ok_start_degraded = start_dirty_degraded;
6210
6211 if (start_readonly && md_is_rdwr(mddev))
6212 mddev->ro = MD_AUTO_READ; /* read-only, but switch on first write */
6213
6214 err = pers->run(mddev);
6215 if (err)
6216 pr_warn("md: pers->run() failed ...\n");
6217 else if (pers->size(mddev, 0, 0) < mddev->array_sectors) {
6218 WARN_ONCE(!mddev->external_size,
6219 "%s: default size too small, but 'external_size' not in effect?\n",
6220 __func__);
6221 pr_warn("md: invalid array_size %llu > default size %llu\n",
6222 (unsigned long long)mddev->array_sectors / 2,
6223 (unsigned long long)pers->size(mddev, 0, 0) / 2);
6224 err = -EINVAL;
6225 }
6226 if (err == 0 && pers->sync_request &&
6227 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
6228 err = mddev->bitmap_ops->create(mddev);
6229 if (err)
6230 pr_warn("%s: failed to create bitmap (%d)\n",
6231 mdname(mddev), err);
6232 }
6233 if (err)
6234 goto bitmap_abort;
6235
6236 if (mddev->bitmap_info.max_write_behind > 0) {
6237 bool create_pool = false;
6238
6239 rdev_for_each(rdev, mddev) {
6240 if (test_bit(WriteMostly, &rdev->flags) &&
6241 rdev_init_serial(rdev))
6242 create_pool = true;
6243 }
6244 if (create_pool && mddev->serial_info_pool == NULL) {
6245 mddev->serial_info_pool =
6246 mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
6247 sizeof(struct serial_info));
6248 if (!mddev->serial_info_pool) {
6249 err = -ENOMEM;
6250 goto bitmap_abort;
6251 }
6252 }
6253 }
6254
6255 if (pers->sync_request) {
6256 if (mddev->kobj.sd &&
6257 sysfs_create_group(&mddev->kobj, &md_redundancy_group))
6258 pr_warn("md: cannot register extra attributes for %s\n",
6259 mdname(mddev));
6260 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
6261 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
6262 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
6263 } else if (mddev->ro == MD_AUTO_READ)
6264 mddev->ro = MD_RDWR;
6265
6266 atomic_set(&mddev->max_corr_read_errors,
6267 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
6268 mddev->safemode = 0;
6269 if (mddev_is_clustered(mddev))
6270 mddev->safemode_delay = 0;
6271 else
6272 mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
6273 mddev->in_sync = 1;
6274 smp_wmb();
6275 spin_lock(&mddev->lock);
6276 mddev->pers = pers;
6277 spin_unlock(&mddev->lock);
6278 rdev_for_each(rdev, mddev)
6279 if (rdev->raid_disk >= 0)
6280 sysfs_link_rdev(mddev, rdev); /* failure here is OK */
6281
6282 if (mddev->degraded && md_is_rdwr(mddev))
6283 /* This ensures that recovering status is reported immediately
6284 * via sysfs - until a lack of spares is confirmed.
6285 */
6286 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6287 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6288
6289 if (mddev->sb_flags)
6290 md_update_sb(mddev, 0);
6291
6292 md_new_event();
6293 return 0;
6294
6295 bitmap_abort:
6296 mddev_detach(mddev);
6297 if (mddev->private)
6298 pers->free(mddev, mddev->private);
6299 mddev->private = NULL;
6300 put_pers(pers);
6301 mddev->bitmap_ops->destroy(mddev);
6302 abort:
6303 bioset_exit(&mddev->io_clone_set);
6304 exit_sync_set:
6305 bioset_exit(&mddev->sync_set);
6306 exit_bio_set:
6307 bioset_exit(&mddev->bio_set);
6308 return err;
6309 }
6310 EXPORT_SYMBOL_GPL(md_run);
6311
6312 int do_md_run(struct mddev *mddev)
6313 {
6314 int err;
6315
6316 set_bit(MD_NOT_READY, &mddev->flags);
6317 err = md_run(mddev);
6318 if (err)
6319 goto out;
6320
6321 err = mddev->bitmap_ops->load(mddev);
6322 if (err) {
6323 mddev->bitmap_ops->destroy(mddev);
6324 goto out;
6325 }
6326
6327 if (mddev_is_clustered(mddev))
6328 md_allow_write(mddev);
6329
6330 /* run start up tasks that require md_thread */
6331 md_start(mddev);
6332
6333 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
6334
6335 set_capacity_and_notify(mddev->gendisk, mddev->array_sectors);
6336 clear_bit(MD_NOT_READY, &mddev->flags);
6337 mddev->changed = 1;
6338 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
6339 sysfs_notify_dirent_safe(mddev->sysfs_state);
6340 sysfs_notify_dirent_safe(mddev->sysfs_action);
6341 sysfs_notify_dirent_safe(mddev->sysfs_degraded);
6342 out:
6343 clear_bit(MD_NOT_READY, &mddev->flags);
6344 return err;
6345 }
6346
6347 int md_start(struct mddev *mddev)
6348 {
6349 int ret = 0;
6350
6351 if (mddev->pers->start) {
6352 set_bit(MD_RECOVERY_WAIT, &mddev->recovery);
6353 ret = mddev->pers->start(mddev);
6354 clear_bit(MD_RECOVERY_WAIT, &mddev->recovery);
6355 md_wakeup_thread(mddev->sync_thread);
6356 }
6357 return ret;
6358 }
6359 EXPORT_SYMBOL_GPL(md_start);
6360
6361 static int restart_array(struct mddev *mddev)
6362 {
6363 struct gendisk *disk = mddev->gendisk;
6364 struct md_rdev *rdev;
6365 bool has_journal = false;
6366 bool has_readonly = false;
6367
6368 /* Complain if it has no devices */
6369 if (list_empty(&mddev->disks))
6370 return -ENXIO;
6371 if (!mddev->pers)
6372 return -EINVAL;
6373 if (md_is_rdwr(mddev))
6374 return -EBUSY;
6375
6376 rcu_read_lock();
6377 rdev_for_each_rcu(rdev, mddev) {
6378 if (test_bit(Journal, &rdev->flags) &&
6379 !test_bit(Faulty, &rdev->flags))
6380 has_journal = true;
6381 if (rdev_read_only(rdev))
6382 has_readonly = true;
6383 }
6384 rcu_read_unlock();
6385 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal)
6386 /* Don't restart rw with journal missing/faulty */
6387 return -EINVAL;
6388 if (has_readonly)
6389 return -EROFS;
6390
6391 mddev->safemode = 0;
6392 mddev->ro = MD_RDWR;
6393 set_disk_ro(disk, 0);
6394 pr_debug("md: %s switched to read-write mode.\n", mdname(mddev));
6395 /* Kick recovery or resync if necessary */
6396 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6397 md_wakeup_thread(mddev->sync_thread);
6398 sysfs_notify_dirent_safe(mddev->sysfs_state);
6399 return 0;
6400 }
6401
6402 static void md_clean(struct mddev *mddev)
6403 {
6404 mddev->array_sectors = 0;
6405 mddev->external_size = 0;
6406 mddev->dev_sectors = 0;
6407 mddev->raid_disks = 0;
6408 mddev->recovery_cp = 0;
6409 mddev->resync_min = 0;
6410 mddev->resync_max = MaxSector;
6411 mddev->reshape_position = MaxSector;
6412 /* we still need mddev->external in export_rdev, do not clear it yet */
6413 mddev->persistent = 0;
6414 mddev->level = LEVEL_NONE;
6415 mddev->clevel[0] = 0;
6416 /*
6417 * Don't clear MD_CLOSING, or mddev can be opened again.
6418 * 'hold_active != 0' means mddev is still in the creation
6419 * process and will be used later.
6420 */
6421 if (mddev->hold_active)
6422 mddev->flags = 0;
6423 else
6424 mddev->flags &= BIT_ULL_MASK(MD_CLOSING);
6425 mddev->sb_flags = 0;
6426 mddev->ro = MD_RDWR;
6427 mddev->metadata_type[0] = 0;
6428 mddev->chunk_sectors = 0;
6429 mddev->ctime = mddev->utime = 0;
6430 mddev->layout = 0;
6431 mddev->max_disks = 0;
6432 mddev->events = 0;
6433 mddev->can_decrease_events = 0;
6434 mddev->delta_disks = 0;
6435 mddev->reshape_backwards = 0;
6436 mddev->new_level = LEVEL_NONE;
6437 mddev->new_layout = 0;
6438 mddev->new_chunk_sectors = 0;
6439 mddev->curr_resync = MD_RESYNC_NONE;
6440 atomic64_set(&mddev->resync_mismatches, 0);
6441 mddev->suspend_lo = mddev->suspend_hi = 0;
6442 mddev->sync_speed_min = mddev->sync_speed_max = 0;
6443 mddev->recovery = 0;
6444 mddev->in_sync = 0;
6445 mddev->changed = 0;
6446 mddev->degraded = 0;
6447 mddev->safemode = 0;
6448 mddev->private = NULL;
6449 mddev->cluster_info = NULL;
6450 mddev->bitmap_info.offset = 0;
6451 mddev->bitmap_info.default_offset = 0;
6452 mddev->bitmap_info.default_space = 0;
6453 mddev->bitmap_info.chunksize = 0;
6454 mddev->bitmap_info.daemon_sleep = 0;
6455 mddev->bitmap_info.max_write_behind = 0;
6456 mddev->bitmap_info.nodes = 0;
6457 }
6458
6459 static void __md_stop_writes(struct mddev *mddev)
6460 {
6461 timer_delete_sync(&mddev->safemode_timer);
6462
6463 if (mddev->pers && mddev->pers->quiesce) {
6464 mddev->pers->quiesce(mddev, 1);
6465 mddev->pers->quiesce(mddev, 0);
6466 }
6467
6468 mddev->bitmap_ops->flush(mddev);
6469
6470 if (md_is_rdwr(mddev) &&
6471 ((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
6472 mddev->sb_flags)) {
6473 /* mark array as shutdown cleanly */
6474 if (!mddev_is_clustered(mddev))
6475 mddev->in_sync = 1;
6476 md_update_sb(mddev, 1);
6477 }
6478 /* disable policy to guarantee rdevs free resources for serialization */
6479 mddev->serialize_policy = 0;
6480 mddev_destroy_serial_pool(mddev, NULL);
6481 }
6482
6483 void md_stop_writes(struct mddev *mddev)
6484 {
6485 mddev_lock_nointr(mddev);
6486 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6487 stop_sync_thread(mddev, true);
6488 __md_stop_writes(mddev);
6489 mddev_unlock(mddev);
6490 }
6491 EXPORT_SYMBOL_GPL(md_stop_writes);
6492
6493 static void mddev_detach(struct mddev *mddev)
6494 {
6495 mddev->bitmap_ops->wait_behind_writes(mddev);
6496 if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) {
6497 mddev->pers->quiesce(mddev, 1);
6498 mddev->pers->quiesce(mddev, 0);
6499 }
6500 md_unregister_thread(mddev, &mddev->thread);
6501
6502 /* the unplug fn references 'conf' */
6503 if (!mddev_is_dm(mddev))
6504 blk_sync_queue(mddev->gendisk->queue);
6505 }
6506
6507 static void __md_stop(struct mddev *mddev)
6508 {
6509 struct md_personality *pers = mddev->pers;
6510
6511 mddev->bitmap_ops->destroy(mddev);
6512 mddev_detach(mddev);
6513 spin_lock(&mddev->lock);
6514 mddev->pers = NULL;
6515 spin_unlock(&mddev->lock);
6516 if (mddev->private)
6517 pers->free(mddev, mddev->private);
6518 mddev->private = NULL;
6519 if (pers->sync_request && mddev->to_remove == NULL)
6520 mddev->to_remove = &md_redundancy_group;
6521 put_pers(pers);
6522 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6523
6524 bioset_exit(&mddev->bio_set);
6525 bioset_exit(&mddev->sync_set);
6526 bioset_exit(&mddev->io_clone_set);
6527 }
6528
6529 void md_stop(struct mddev *mddev)
6530 {
6531 lockdep_assert_held(&mddev->reconfig_mutex);
6532
6533 /* stop the array and free any attached data structures.
6534 * This is called from dm-raid
6535 */
6536 __md_stop_writes(mddev);
6537 __md_stop(mddev);
6538 }
6539
6540 EXPORT_SYMBOL_GPL(md_stop);
6541
6542 /* ensure 'mddev->pers' exist before calling md_set_readonly() */
6543 static int md_set_readonly(struct mddev *mddev)
6544 {
6545 int err = 0;
6546 int did_freeze = 0;
6547
6548 if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
6549 return -EBUSY;
6550
6551 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
6552 did_freeze = 1;
6553 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6554 }
6555
6556 stop_sync_thread(mddev, false);
6557 wait_event(mddev->sb_wait,
6558 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
6559 mddev_lock_nointr(mddev);
6560
6561 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
6562 pr_warn("md: %s still in use.\n",mdname(mddev));
6563 err = -EBUSY;
6564 goto out;
6565 }
6566
6567 __md_stop_writes(mddev);
6568
6569 if (mddev->ro == MD_RDONLY) {
6570 err = -ENXIO;
6571 goto out;
6572 }
6573
6574 mddev->ro = MD_RDONLY;
6575 set_disk_ro(mddev->gendisk, 1);
6576
6577 out:
6578 if (!err || did_freeze) {
6579 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6580 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6581 sysfs_notify_dirent_safe(mddev->sysfs_state);
6582 }
6583
6584 return err;
6585 }
6586
6587 /* mode:
6588 * 0 - completely stop and dis-assemble array
6589 * 2 - stop but do not disassemble array
6590 */
6591 static int do_md_stop(struct mddev *mddev, int mode)
6592 {
6593 struct gendisk *disk = mddev->gendisk;
6594 struct md_rdev *rdev;
6595 int did_freeze = 0;
6596
6597 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
6598 did_freeze = 1;
6599 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6600 }
6601
6602 stop_sync_thread(mddev, true);
6603
6604 if (mddev->sysfs_active ||
6605 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
6606 pr_warn("md: %s still in use.\n",mdname(mddev));
6607 if (did_freeze) {
6608 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6609 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6610 }
6611 return -EBUSY;
6612 }
6613 if (mddev->pers) {
6614 if (!md_is_rdwr(mddev))
6615 set_disk_ro(disk, 0);
6616
6617 __md_stop_writes(mddev);
6618 __md_stop(mddev);
6619
6620 /* tell userspace to handle 'inactive' */
6621 sysfs_notify_dirent_safe(mddev->sysfs_state);
6622
6623 rdev_for_each(rdev, mddev)
6624 if (rdev->raid_disk >= 0)
6625 sysfs_unlink_rdev(mddev, rdev);
6626
6627 set_capacity_and_notify(disk, 0);
6628 mddev->changed = 1;
6629
6630 if (!md_is_rdwr(mddev))
6631 mddev->ro = MD_RDWR;
6632 }
6633 /*
6634 * Free resources if final stop
6635 */
6636 if (mode == 0) {
6637 pr_info("md: %s stopped.\n", mdname(mddev));
6638
6639 if (mddev->bitmap_info.file) {
6640 struct file *f = mddev->bitmap_info.file;
6641 spin_lock(&mddev->lock);
6642 mddev->bitmap_info.file = NULL;
6643 spin_unlock(&mddev->lock);
6644 fput(f);
6645 }
6646 mddev->bitmap_info.offset = 0;
6647
6648 export_array(mddev);
6649
6650 md_clean(mddev);
6651 if (mddev->hold_active == UNTIL_STOP)
6652 mddev->hold_active = 0;
6653 }
6654 md_new_event();
6655 sysfs_notify_dirent_safe(mddev->sysfs_state);
6656 return 0;
6657 }
6658
6659 #ifndef MODULE
6660 static void autorun_array(struct mddev *mddev)
6661 {
6662 struct md_rdev *rdev;
6663 int err;
6664
6665 if (list_empty(&mddev->disks))
6666 return;
6667
6668 pr_info("md: running: ");
6669
6670 rdev_for_each(rdev, mddev) {
6671 pr_cont("<%pg>", rdev->bdev);
6672 }
6673 pr_cont("\n");
6674
6675 err = do_md_run(mddev);
6676 if (err) {
6677 pr_warn("md: do_md_run() returned %d\n", err);
6678 do_md_stop(mddev, 0);
6679 }
6680 }
6681
6682 /*
6683 * let's try to run arrays based on all disks that have arrived
6684 * until now. (those are in pending_raid_disks)
6685 *
6686 * the method: pick the first pending disk, collect all disks with
6687 * the same UUID, remove all from the pending list and put them into
6688 * the 'same_array' list. Then order this list based on superblock
6689 * update time (freshest comes first), kick out 'old' disks and
6690 * compare superblocks. If everything's fine then run it.
6691 *
6692 * If "unit" is allocated, then bump its reference count
6693 */
6694 static void autorun_devices(int part)
6695 {
6696 struct md_rdev *rdev0, *rdev, *tmp;
6697 struct mddev *mddev;
6698
6699 pr_info("md: autorun ...\n");
6700 while (!list_empty(&pending_raid_disks)) {
6701 int unit;
6702 dev_t dev;
6703 LIST_HEAD(candidates);
6704 rdev0 = list_entry(pending_raid_disks.next,
6705 struct md_rdev, same_set);
6706
6707 pr_debug("md: considering %pg ...\n", rdev0->bdev);
6708 INIT_LIST_HEAD(&candidates);
6709 rdev_for_each_list(rdev, tmp, &pending_raid_disks)
6710 if (super_90_load(rdev, rdev0, 0) >= 0) {
6711 pr_debug("md: adding %pg ...\n",
6712 rdev->bdev);
6713 list_move(&rdev->same_set, &candidates);
6714 }
6715 /*
6716 * now we have a set of devices, with all of them having
6717 * mostly sane superblocks. It's time to allocate the
6718 * mddev.
6719 */
6720 if (part) {
6721 dev = MKDEV(mdp_major,
6722 rdev0->preferred_minor << MdpMinorShift);
6723 unit = MINOR(dev) >> MdpMinorShift;
6724 } else {
6725 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
6726 unit = MINOR(dev);
6727 }
6728 if (rdev0->preferred_minor != unit) {
6729 pr_warn("md: unit number in %pg is bad: %d\n",
6730 rdev0->bdev, rdev0->preferred_minor);
6731 break;
6732 }
6733
6734 mddev = md_alloc(dev, NULL);
6735 if (IS_ERR(mddev))
6736 break;
6737
6738 if (mddev_suspend_and_lock(mddev))
6739 pr_warn("md: %s locked, cannot run\n", mdname(mddev));
6740 else if (mddev->raid_disks || mddev->major_version
6741 || !list_empty(&mddev->disks)) {
6742 pr_warn("md: %s already running, cannot run %pg\n",
6743 mdname(mddev), rdev0->bdev);
6744 mddev_unlock_and_resume(mddev);
6745 } else {
6746 pr_debug("md: created %s\n", mdname(mddev));
6747 mddev->persistent = 1;
6748 rdev_for_each_list(rdev, tmp, &candidates) {
6749 list_del_init(&rdev->same_set);
6750 if (bind_rdev_to_array(rdev, mddev))
6751 export_rdev(rdev, mddev);
6752 }
6753 autorun_array(mddev);
6754 mddev_unlock_and_resume(mddev);
6755 }
6756 /* on success, candidates will be empty; on error
6757 * it won't be...
6758 */
6759 rdev_for_each_list(rdev, tmp, &candidates) {
6760 list_del_init(&rdev->same_set);
6761 export_rdev(rdev, mddev);
6762 }
6763 mddev_put(mddev);
6764 }
6765 pr_info("md: ... autorun DONE.\n");
6766 }
6767 #endif /* !MODULE */
6768
6769 static int get_version(void __user *arg)
6770 {
6771 mdu_version_t ver;
6772
6773 ver.major = MD_MAJOR_VERSION;
6774 ver.minor = MD_MINOR_VERSION;
6775 ver.patchlevel = MD_PATCHLEVEL_VERSION;
6776
6777 if (copy_to_user(arg, &ver, sizeof(ver)))
6778 return -EFAULT;
6779
6780 return 0;
6781 }
6782
6783 static int get_array_info(struct mddev *mddev, void __user *arg)
6784 {
6785 mdu_array_info_t info;
6786 int nr,working,insync,failed,spare;
6787 struct md_rdev *rdev;
6788
6789 nr = working = insync = failed = spare = 0;
6790 rcu_read_lock();
6791 rdev_for_each_rcu(rdev, mddev) {
6792 nr++;
6793 if (test_bit(Faulty, &rdev->flags))
6794 failed++;
6795 else {
6796 working++;
6797 if (test_bit(In_sync, &rdev->flags))
6798 insync++;
6799 else if (test_bit(Journal, &rdev->flags))
6800 /* TODO: add journal count to md_u.h */
6801 ;
6802 else
6803 spare++;
6804 }
6805 }
6806 rcu_read_unlock();
6807
6808 info.major_version = mddev->major_version;
6809 info.minor_version = mddev->minor_version;
6810 info.patch_version = MD_PATCHLEVEL_VERSION;
6811 info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
6812 info.level = mddev->level;
6813 info.size = mddev->dev_sectors / 2;
6814 if (info.size != mddev->dev_sectors / 2) /* overflow */
6815 info.size = -1;
6816 info.nr_disks = nr;
6817 info.raid_disks = mddev->raid_disks;
6818 info.md_minor = mddev->md_minor;
6819 info.not_persistent= !mddev->persistent;
6820
6821 info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
6822 info.state = 0;
6823 if (mddev->in_sync)
6824 info.state = (1<<MD_SB_CLEAN);
6825 if (mddev->bitmap && mddev->bitmap_info.offset)
6826 info.state |= (1<<MD_SB_BITMAP_PRESENT);
6827 if (mddev_is_clustered(mddev))
6828 info.state |= (1<<MD_SB_CLUSTERED);
6829 info.active_disks = insync;
6830 info.working_disks = working;
6831 info.failed_disks = failed;
6832 info.spare_disks = spare;
6833
6834 info.layout = mddev->layout;
6835 info.chunk_size = mddev->chunk_sectors << 9;
6836
6837 if (copy_to_user(arg, &info, sizeof(info)))
6838 return -EFAULT;
6839
6840 return 0;
6841 }
6842
6843 static int get_bitmap_file(struct mddev *mddev, void __user * arg)
6844 {
6845 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
6846 char *ptr;
6847 int err;
6848
6849 file = kzalloc(sizeof(*file), GFP_NOIO);
6850 if (!file)
6851 return -ENOMEM;
6852
6853 err = 0;
6854 spin_lock(&mddev->lock);
6855 /* bitmap enabled */
6856 if (mddev->bitmap_info.file) {
6857 ptr = file_path(mddev->bitmap_info.file, file->pathname,
6858 sizeof(file->pathname));
6859 if (IS_ERR(ptr))
6860 err = PTR_ERR(ptr);
6861 else
6862 memmove(file->pathname, ptr,
6863 sizeof(file->pathname)-(ptr-file->pathname));
6864 }
6865 spin_unlock(&mddev->lock);
6866
6867 if (err == 0 &&
6868 copy_to_user(arg, file, sizeof(*file)))
6869 err = -EFAULT;
6870
6871 kfree(file);
6872 return err;
6873 }
6874
6875 static int get_disk_info(struct mddev *mddev, void __user * arg)
6876 {
6877 mdu_disk_info_t info;
6878 struct md_rdev *rdev;
6879
6880 if (copy_from_user(&info, arg, sizeof(info)))
6881 return -EFAULT;
6882
6883 rcu_read_lock();
6884 rdev = md_find_rdev_nr_rcu(mddev, info.number);
6885 if (rdev) {
6886 info.major = MAJOR(rdev->bdev->bd_dev);
6887 info.minor = MINOR(rdev->bdev->bd_dev);
6888 info.raid_disk = rdev->raid_disk;
6889 info.state = 0;
6890 if (test_bit(Faulty, &rdev->flags))
6891 info.state |= (1<<MD_DISK_FAULTY);
6892 else if (test_bit(In_sync, &rdev->flags)) {
6893 info.state |= (1<<MD_DISK_ACTIVE);
6894 info.state |= (1<<MD_DISK_SYNC);
6895 }
6896 if (test_bit(Journal, &rdev->flags))
6897 info.state |= (1<<MD_DISK_JOURNAL);
6898 if (test_bit(WriteMostly, &rdev->flags))
6899 info.state |= (1<<MD_DISK_WRITEMOSTLY);
6900 if (test_bit(FailFast, &rdev->flags))
6901 info.state |= (1<<MD_DISK_FAILFAST);
6902 } else {
6903 info.major = info.minor = 0;
6904 info.raid_disk = -1;
6905 info.state = (1<<MD_DISK_REMOVED);
6906 }
6907 rcu_read_unlock();
6908
6909 if (copy_to_user(arg, &info, sizeof(info)))
6910 return -EFAULT;
6911
6912 return 0;
6913 }
6914
6915 int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
6916 {
6917 struct md_rdev *rdev;
6918 dev_t dev = MKDEV(info->major,info->minor);
6919
6920 if (mddev_is_clustered(mddev) &&
6921 !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) {
6922 pr_warn("%s: Cannot add to clustered mddev.\n",
6923 mdname(mddev));
6924 return -EINVAL;
6925 }
6926
6927 if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
6928 return -EOVERFLOW;
6929
6930 if (!mddev->raid_disks) {
6931 int err;
6932 /* expecting a device which has a superblock */
6933 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
6934 if (IS_ERR(rdev)) {
6935 pr_warn("md: md_import_device returned %ld\n",
6936 PTR_ERR(rdev));
6937 return PTR_ERR(rdev);
6938 }
6939 if (!list_empty(&mddev->disks)) {
6940 struct md_rdev *rdev0
6941 = list_entry(mddev->disks.next,
6942 struct md_rdev, same_set);
6943 err = super_types[mddev->major_version]
6944 .load_super(rdev, rdev0, mddev->minor_version);
6945 if (err < 0) {
6946 pr_warn("md: %pg has different UUID to %pg\n",
6947 rdev->bdev,
6948 rdev0->bdev);
6949 export_rdev(rdev, mddev);
6950 return -EINVAL;
6951 }
6952 }
6953 err = bind_rdev_to_array(rdev, mddev);
6954 if (err)
6955 export_rdev(rdev, mddev);
6956 return err;
6957 }
6958
6959 /*
6960 * md_add_new_disk can be used once the array is assembled
6961 * to add "hot spares". They must already have a superblock
6962 * written
6963 */
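	/*
	 * Illustrative flow, not part of this function: a hot-add such as
	 * mdadm --add typically arrives here via the ADD_NEW_DISK ioctl
	 * once mddev->pers is set; the branch below imports the device,
	 * validates it against the array's superblock format and binds it
	 * as a spare.
	 */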
6964 if (mddev->pers) {
6965 int err;
6966 if (!mddev->pers->hot_add_disk) {
6967 pr_warn("%s: personality does not support diskops!\n",
6968 mdname(mddev));
6969 return -EINVAL;
6970 }
6971 if (mddev->persistent)
6972 rdev = md_import_device(dev, mddev->major_version,
6973 mddev->minor_version);
6974 else
6975 rdev = md_import_device(dev, -1, -1);
6976 if (IS_ERR(rdev)) {
6977 pr_warn("md: md_import_device returned %ld\n",
6978 PTR_ERR(rdev));
6979 return PTR_ERR(rdev);
6980 }
6981 /* set saved_raid_disk if appropriate */
6982 if (!mddev->persistent) {
6983 if (info->state & (1<<MD_DISK_SYNC) &&
6984 info->raid_disk < mddev->raid_disks) {
6985 rdev->raid_disk = info->raid_disk;
6986 clear_bit(Bitmap_sync, &rdev->flags);
6987 } else
6988 rdev->raid_disk = -1;
6989 rdev->saved_raid_disk = rdev->raid_disk;
6990 } else
6991 super_types[mddev->major_version].
6992 validate_super(mddev, NULL/*freshest*/, rdev);
6993 if ((info->state & (1<<MD_DISK_SYNC)) &&
6994 rdev->raid_disk != info->raid_disk) {
6995 /* This was a hot-add request, but the events don't
6996 * match, so reject it.
6997 */
6998 export_rdev(rdev, mddev);
6999 return -EINVAL;
7000 }
7001
7002 clear_bit(In_sync, &rdev->flags); /* just to be sure */
7003 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
7004 set_bit(WriteMostly, &rdev->flags);
7005 else
7006 clear_bit(WriteMostly, &rdev->flags);
7007 if (info->state & (1<<MD_DISK_FAILFAST))
7008 set_bit(FailFast, &rdev->flags);
7009 else
7010 clear_bit(FailFast, &rdev->flags);
7011
7012 if (info->state & (1<<MD_DISK_JOURNAL)) {
7013 struct md_rdev *rdev2;
7014 bool has_journal = false;
7015
7016 /* make sure no existing journal disk */
7017 rdev_for_each(rdev2, mddev) {
7018 if (test_bit(Journal, &rdev2->flags)) {
7019 has_journal = true;
7020 break;
7021 }
7022 }
7023 if (has_journal || mddev->bitmap) {
7024 export_rdev(rdev, mddev);
7025 return -EBUSY;
7026 }
7027 set_bit(Journal, &rdev->flags);
7028 }
7029 /*
7030 * check whether the device shows up in other nodes
7031 */
7032 if (mddev_is_clustered(mddev)) {
7033 if (info->state & (1 << MD_DISK_CANDIDATE))
7034 set_bit(Candidate, &rdev->flags);
7035 else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
7036 /* --add initiated by this node */
7037 err = mddev->cluster_ops->add_new_disk(mddev, rdev);
7038 if (err) {
7039 export_rdev(rdev, mddev);
7040 return err;
7041 }
7042 }
7043 }
7044
7045 rdev->raid_disk = -1;
7046 err = bind_rdev_to_array(rdev, mddev);
7047
7048 if (err)
7049 export_rdev(rdev, mddev);
7050
7051 if (mddev_is_clustered(mddev)) {
7052 if (info->state & (1 << MD_DISK_CANDIDATE)) {
7053 if (!err) {
7054 err = mddev->cluster_ops->new_disk_ack(
7055 mddev, err == 0);
7056 if (err)
7057 md_kick_rdev_from_array(rdev);
7058 }
7059 } else {
7060 if (err)
7061 mddev->cluster_ops->add_new_disk_cancel(mddev);
7062 else
7063 err = add_bound_rdev(rdev);
7064 }
7065
7066 } else if (!err)
7067 err = add_bound_rdev(rdev);
7068
7069 return err;
7070 }
7071
7072 /* otherwise, md_add_new_disk is only allowed
7073 * for major_version==0 superblocks
7074 */
7075 if (mddev->major_version != 0) {
7076 pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev));
7077 return -EINVAL;
7078 }
7079
7080 if (!(info->state & (1<<MD_DISK_FAULTY))) {
7081 int err;
7082 rdev = md_import_device(dev, -1, 0);
7083 if (IS_ERR(rdev)) {
7084 pr_warn("md: error, md_import_device() returned %ld\n",
7085 PTR_ERR(rdev));
7086 return PTR_ERR(rdev);
7087 }
7088 rdev->desc_nr = info->number;
7089 if (info->raid_disk < mddev->raid_disks)
7090 rdev->raid_disk = info->raid_disk;
7091 else
7092 rdev->raid_disk = -1;
7093
7094 if (rdev->raid_disk < mddev->raid_disks)
7095 if (info->state & (1<<MD_DISK_SYNC))
7096 set_bit(In_sync, &rdev->flags);
7097
7098 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
7099 set_bit(WriteMostly, &rdev->flags);
7100 if (info->state & (1<<MD_DISK_FAILFAST))
7101 set_bit(FailFast, &rdev->flags);
7102
7103 if (!mddev->persistent) {
7104 pr_debug("md: nonpersistent superblock ...\n");
7105 rdev->sb_start = bdev_nr_sectors(rdev->bdev);
7106 } else
7107 rdev->sb_start = calc_dev_sboffset(rdev);
7108 rdev->sectors = rdev->sb_start;
7109
7110 err = bind_rdev_to_array(rdev, mddev);
7111 if (err) {
7112 export_rdev(rdev, mddev);
7113 return err;
7114 }
7115 }
7116
7117 return 0;
7118 }
7119
7120 static int hot_remove_disk(struct mddev *mddev, dev_t dev)
7121 {
7122 struct md_rdev *rdev;
7123
7124 if (!mddev->pers)
7125 return -ENODEV;
7126
7127 rdev = find_rdev(mddev, dev);
7128 if (!rdev)
7129 return -ENXIO;
7130
7131 if (rdev->raid_disk < 0)
7132 goto kick_rdev;
7133
7134 clear_bit(Blocked, &rdev->flags);
7135 remove_and_add_spares(mddev, rdev);
7136
7137 if (rdev->raid_disk >= 0)
7138 goto busy;
7139
7140 kick_rdev:
7141 if (mddev_is_clustered(mddev) &&
7142 mddev->cluster_ops->remove_disk(mddev, rdev))
7143 goto busy;
7144
7145 md_kick_rdev_from_array(rdev);
7146 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
7147 if (!mddev->thread)
7148 md_update_sb(mddev, 1);
7149 md_new_event();
7150
7151 return 0;
7152 busy:
7153 pr_debug("md: cannot remove active disk %pg from %s ...\n",
7154 rdev->bdev, mdname(mddev));
7155 return -EBUSY;
7156 }
7157
7158 static int hot_add_disk(struct mddev *mddev, dev_t dev)
7159 {
7160 int err;
7161 struct md_rdev *rdev;
7162
7163 if (!mddev->pers)
7164 return -ENODEV;
7165
7166 if (mddev->major_version != 0) {
7167 pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n",
7168 mdname(mddev));
7169 return -EINVAL;
7170 }
7171 if (!mddev->pers->hot_add_disk) {
7172 pr_warn("%s: personality does not support diskops!\n",
7173 mdname(mddev));
7174 return -EINVAL;
7175 }
7176
7177 rdev = md_import_device(dev, -1, 0);
7178 if (IS_ERR(rdev)) {
7179 pr_warn("md: error, md_import_device() returned %ld\n",
7180 PTR_ERR(rdev));
7181 return -EINVAL;
7182 }
7183
7184 if (mddev->persistent)
7185 rdev->sb_start = calc_dev_sboffset(rdev);
7186 else
7187 rdev->sb_start = bdev_nr_sectors(rdev->bdev);
7188
7189 rdev->sectors = rdev->sb_start;
7190
7191 if (test_bit(Faulty, &rdev->flags)) {
7192 pr_warn("md: can not hot-add faulty %pg disk to %s!\n",
7193 rdev->bdev, mdname(mddev));
7194 err = -EINVAL;
7195 goto abort_export;
7196 }
7197
7198 clear_bit(In_sync, &rdev->flags);
7199 rdev->desc_nr = -1;
7200 rdev->saved_raid_disk = -1;
7201 err = bind_rdev_to_array(rdev, mddev);
7202 if (err)
7203 goto abort_export;
7204
7205 /*
7206 * The rest had better be atomic, because disk failures can be
7207 * noticed in interrupt contexts ...
7208 */
7209
7210 rdev->raid_disk = -1;
7211
7212 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
7213 if (!mddev->thread)
7214 md_update_sb(mddev, 1);
7215 /*
7216 * Kick recovery, maybe this spare has to be added to the
7217 * array immediately.
7218 */
7219 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7220 md_new_event();
7221 return 0;
7222
7223 abort_export:
7224 export_rdev(rdev, mddev);
7225 return err;
7226 }
7227
7228 static int set_bitmap_file(struct mddev *mddev, int fd)
7229 {
7230 int err = 0;
7231
7232 if (mddev->pers) {
7233 if (!mddev->pers->quiesce || !mddev->thread)
7234 return -EBUSY;
7235 if (mddev->recovery || mddev->sync_thread)
7236 return -EBUSY;
7237 /* we should be able to change the bitmap.. */
7238 }
7239
7240 if (fd >= 0) {
7241 struct inode *inode;
7242 struct file *f;
7243
7244 if (mddev->bitmap || mddev->bitmap_info.file)
7245 return -EEXIST; /* cannot add when bitmap is present */
7246
7247 if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) {
7248 pr_warn("%s: bitmap files not supported by this kernel\n",
7249 mdname(mddev));
7250 return -EINVAL;
7251 }
7252 pr_warn("%s: using deprecated bitmap file support\n",
7253 mdname(mddev));
7254
7255 f = fget(fd);
7256
7257 if (f == NULL) {
7258 pr_warn("%s: error: failed to get bitmap file\n",
7259 mdname(mddev));
7260 return -EBADF;
7261 }
7262
7263 inode = f->f_mapping->host;
7264 if (!S_ISREG(inode->i_mode)) {
7265 pr_warn("%s: error: bitmap file must be a regular file\n",
7266 mdname(mddev));
7267 err = -EBADF;
7268 } else if (!(f->f_mode & FMODE_WRITE)) {
7269 pr_warn("%s: error: bitmap file must open for write\n",
7270 mdname(mddev));
7271 err = -EBADF;
7272 } else if (atomic_read(&inode->i_writecount) != 1) {
7273 pr_warn("%s: error: bitmap file is already in use\n",
7274 mdname(mddev));
7275 err = -EBUSY;
7276 }
7277 if (err) {
7278 fput(f);
7279 return err;
7280 }
7281 mddev->bitmap_info.file = f;
7282 mddev->bitmap_info.offset = 0; /* file overrides offset */
7283 } else if (mddev->bitmap == NULL)
7284 return -ENOENT; /* cannot remove what isn't there */
7285 err = 0;
7286 if (mddev->pers) {
7287 if (fd >= 0) {
7288 err = mddev->bitmap_ops->create(mddev);
7289 if (!err)
7290 err = mddev->bitmap_ops->load(mddev);
7291
7292 if (err) {
7293 mddev->bitmap_ops->destroy(mddev);
7294 fd = -1;
7295 }
7296 } else if (fd < 0) {
7297 mddev->bitmap_ops->destroy(mddev);
7298 }
7299 }
7300
7301 if (fd < 0) {
7302 struct file *f = mddev->bitmap_info.file;
7303 if (f) {
7304 spin_lock(&mddev->lock);
7305 mddev->bitmap_info.file = NULL;
7306 spin_unlock(&mddev->lock);
7307 fput(f);
7308 }
7309 }
7310
7311 return err;
7312 }
7313
7314 /*
7315 * md_set_array_info is used in two different ways.
7316 * The original usage is when creating a new array.
7317 * In this usage, raid_disks is > 0 and, together with
7318 * level, size, not_persistent, layout and chunksize, determines the
7319 * shape of the array.
7320 * This will always create an array with a type-0.90.0 superblock.
7321 * The newer usage is when assembling an array.
7322 * In this case raid_disks will be 0, and the major_version field is
7323 * used to determine which style super-blocks are to be found on the devices.
7324 * The minor and patch _version numbers are also kept in case the
7325 * super_block handler wishes to interpret them.
7326 */
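/*
 * Illustrative userspace sketch, not part of the kernel: both usages go
 * through the SET_ARRAY_INFO ioctl with an mdu_array_info_t (md_fd below
 * stands for an open fd on the md device node). Assembling only selects
 * the superblock format:
 *
 *	mdu_array_info_t info = { 0 };
 *	info.major_version = 1;
 *	ioctl(md_fd, SET_ARRAY_INFO, &info);
 *
 * while creating passes raid_disks > 0 plus the shape of the new array:
 *
 *	info.raid_disks = 2;
 *	info.level = 1;
 *	ioctl(md_fd, SET_ARRAY_INFO, &info);
 */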
7327 int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info)
7328 {
7329 if (info->raid_disks == 0) {
7330 /* just setting version number for superblock loading */
7331 if (info->major_version < 0 ||
7332 info->major_version >= ARRAY_SIZE(super_types) ||
7333 super_types[info->major_version].name == NULL) {
7334 /* maybe try to auto-load a module? */
7335 pr_warn("md: superblock version %d not known\n",
7336 info->major_version);
7337 return -EINVAL;
7338 }
7339 mddev->major_version = info->major_version;
7340 mddev->minor_version = info->minor_version;
7341 mddev->patch_version = info->patch_version;
7342 mddev->persistent = !info->not_persistent;
7343 /* ensure mddev_put doesn't delete this now that there
7344 * is some minimal configuration.
7345 */
7346 mddev->ctime = ktime_get_real_seconds();
7347 return 0;
7348 }
7349 mddev->major_version = MD_MAJOR_VERSION;
7350 mddev->minor_version = MD_MINOR_VERSION;
7351 mddev->patch_version = MD_PATCHLEVEL_VERSION;
7352 mddev->ctime = ktime_get_real_seconds();
7353
7354 mddev->level = info->level;
7355 mddev->clevel[0] = 0;
7356 mddev->dev_sectors = 2 * (sector_t)info->size;
7357 mddev->raid_disks = info->raid_disks;
7358 /* don't set md_minor, it is determined by which /dev/md* was
7359 * opened
7360 */
7361 if (info->state & (1<<MD_SB_CLEAN))
7362 mddev->recovery_cp = MaxSector;
7363 else
7364 mddev->recovery_cp = 0;
7365 mddev->persistent = ! info->not_persistent;
7366 mddev->external = 0;
7367
7368 mddev->layout = info->layout;
7369 if (mddev->level == 0)
7370 /* Cannot trust RAID0 layout info here */
7371 mddev->layout = -1;
7372 mddev->chunk_sectors = info->chunk_size >> 9;
7373
7374 if (mddev->persistent) {
7375 mddev->max_disks = MD_SB_DISKS;
7376 mddev->flags = 0;
7377 mddev->sb_flags = 0;
7378 }
7379 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
7380
7381 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
7382 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
7383 mddev->bitmap_info.offset = 0;
7384
7385 mddev->reshape_position = MaxSector;
7386
7387 /*
7388 * Generate a 128 bit UUID
7389 */
7390 get_random_bytes(mddev->uuid, 16);
7391
7392 mddev->new_level = mddev->level;
7393 mddev->new_chunk_sectors = mddev->chunk_sectors;
7394 mddev->new_layout = mddev->layout;
7395 mddev->delta_disks = 0;
7396 mddev->reshape_backwards = 0;
7397
7398 return 0;
7399 }
7400
7401 void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
7402 {
7403 lockdep_assert_held(&mddev->reconfig_mutex);
7404
7405 if (mddev->external_size)
7406 return;
7407
7408 mddev->array_sectors = array_sectors;
7409 }
7410 EXPORT_SYMBOL(md_set_array_sectors);
7411
7412 static int update_size(struct mddev *mddev, sector_t num_sectors)
7413 {
7414 struct md_rdev *rdev;
7415 int rv;
7416 int fit = (num_sectors == 0);
7417 sector_t old_dev_sectors = mddev->dev_sectors;
7418
7419 if (mddev->pers->resize == NULL)
7420 return -EINVAL;
7421 /* The "num_sectors" is the number of sectors of each device that
7422 * is used. This can only make sense for arrays with redundancy.
7423 * linear and raid0 always use whatever space is available. We can only
7424 * consider changing this number if no resync or reconstruction is
7425 * happening, and if the new size is acceptable. It must fit before the
7426 * sb_start or, if that is <data_offset, it must fit before the size
7427 * of each device. If num_sectors is zero, we find the largest size
7428 * that fits.
7429 */
7430 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
7431 return -EBUSY;
7432 if (!md_is_rdwr(mddev))
7433 return -EROFS;
7434
7435 rdev_for_each(rdev, mddev) {
7436 sector_t avail = rdev->sectors;
7437
7438 if (fit && (num_sectors == 0 || num_sectors > avail))
7439 num_sectors = avail;
7440 if (avail < num_sectors)
7441 return -ENOSPC;
7442 }
7443 rv = mddev->pers->resize(mddev, num_sectors);
7444 if (!rv) {
7445 if (mddev_is_clustered(mddev))
7446 mddev->cluster_ops->update_size(mddev, old_dev_sectors);
7447 else if (!mddev_is_dm(mddev))
7448 set_capacity_and_notify(mddev->gendisk,
7449 mddev->array_sectors);
7450 }
7451 return rv;
7452 }
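/*
 * Worked example for the "fit" case above (a sketch, not real devices):
 * with two member devices offering 1000000 and 1200000 usable sectors and
 * num_sectors == 0, the rdev loop settles on num_sectors = 1000000 (the
 * smallest), so ->resize() is asked for the largest size that fits on
 * every device; any explicit num_sectors above 1000000 would return -ENOSPC.
 */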
7453
7454 static int update_raid_disks(struct mddev *mddev, int raid_disks)
7455 {
7456 int rv;
7457 struct md_rdev *rdev;
7458 /* change the number of raid disks */
7459 if (mddev->pers->check_reshape == NULL)
7460 return -EINVAL;
7461 if (!md_is_rdwr(mddev))
7462 return -EROFS;
7463 if (raid_disks <= 0 ||
7464 (mddev->max_disks && raid_disks >= mddev->max_disks))
7465 return -EINVAL;
7466 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
7467 test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) ||
7468 mddev->reshape_position != MaxSector)
7469 return -EBUSY;
7470
7471 rdev_for_each(rdev, mddev) {
7472 if (mddev->raid_disks < raid_disks &&
7473 rdev->data_offset < rdev->new_data_offset)
7474 return -EINVAL;
7475 if (mddev->raid_disks > raid_disks &&
7476 rdev->data_offset > rdev->new_data_offset)
7477 return -EINVAL;
7478 }
7479
7480 mddev->delta_disks = raid_disks - mddev->raid_disks;
7481 if (mddev->delta_disks < 0)
7482 mddev->reshape_backwards = 1;
7483 else if (mddev->delta_disks > 0)
7484 mddev->reshape_backwards = 0;
7485
7486 rv = mddev->pers->check_reshape(mddev);
7487 if (rv < 0) {
7488 mddev->delta_disks = 0;
7489 mddev->reshape_backwards = 0;
7490 }
7491 return rv;
7492 }
7493
7494 static int get_cluster_ops(struct mddev *mddev)
7495 {
7496 xa_lock(&md_submodule);
7497 mddev->cluster_ops = xa_load(&md_submodule, ID_CLUSTER);
7498 if (mddev->cluster_ops &&
7499 !try_module_get(mddev->cluster_ops->head.owner))
7500 mddev->cluster_ops = NULL;
7501 xa_unlock(&md_submodule);
7502
7503 return mddev->cluster_ops == NULL ? -ENOENT : 0;
7504 }
7505
7506 static void put_cluster_ops(struct mddev *mddev)
7507 {
7508 if (!mddev->cluster_ops)
7509 return;
7510
7511 mddev->cluster_ops->leave(mddev);
7512 module_put(mddev->cluster_ops->head.owner);
7513 mddev->cluster_ops = NULL;
7514 }
7515
7516 /*
7517 * update_array_info is used to change the configuration of an
7518 * on-line array.
7519 * The version, ctime, level, size, raid_disks, not_persistent, layout and
7520 * chunk_size fields in the info are checked against the array.
7521 * Any differences that cannot be handled will cause an error.
7522 * Normally, only one change can be managed at a time.
7523 */
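/*
 * For example (hypothetical values): an ioctl that changes only
 * info->raid_disks is routed to update_raid_disks(), and one that changes
 * only info->size goes to update_size(); a request that changes both in
 * the same call trips the cnt > 1 test below and fails with -EINVAL.
 */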
7524 static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
7525 {
7526 int rv = 0;
7527 int cnt = 0;
7528 int state = 0;
7529
7530 /* calculate expected state, ignoring low bits */
7531 if (mddev->bitmap && mddev->bitmap_info.offset)
7532 state |= (1 << MD_SB_BITMAP_PRESENT);
7533
7534 if (mddev->major_version != info->major_version ||
7535 mddev->minor_version != info->minor_version ||
7536 /* mddev->patch_version != info->patch_version || */
7537 mddev->ctime != info->ctime ||
7538 mddev->level != info->level ||
7539 /* mddev->layout != info->layout || */
7540 mddev->persistent != !info->not_persistent ||
7541 mddev->chunk_sectors != info->chunk_size >> 9 ||
7542 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
7543 ((state^info->state) & 0xfffffe00)
7544 )
7545 return -EINVAL;
7546 /* Check there is only one change */
7547 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
7548 cnt++;
7549 if (mddev->raid_disks != info->raid_disks)
7550 cnt++;
7551 if (mddev->layout != info->layout)
7552 cnt++;
7553 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
7554 cnt++;
7555 if (cnt == 0)
7556 return 0;
7557 if (cnt > 1)
7558 return -EINVAL;
7559
7560 if (mddev->layout != info->layout) {
7561 /* Change layout
7562 * we don't need to do anything at the md level, the
7563 * personality will take care of it all.
7564 */
7565 if (mddev->pers->check_reshape == NULL)
7566 return -EINVAL;
7567 else {
7568 mddev->new_layout = info->layout;
7569 rv = mddev->pers->check_reshape(mddev);
7570 if (rv)
7571 mddev->new_layout = mddev->layout;
7572 return rv;
7573 }
7574 }
7575 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
7576 rv = update_size(mddev, (sector_t)info->size * 2);
7577
7578 if (mddev->raid_disks != info->raid_disks)
7579 rv = update_raid_disks(mddev, info->raid_disks);
7580
7581 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
7582 if (mddev->pers->quiesce == NULL || mddev->thread == NULL) {
7583 rv = -EINVAL;
7584 goto err;
7585 }
7586 if (mddev->recovery || mddev->sync_thread) {
7587 rv = -EBUSY;
7588 goto err;
7589 }
7590 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
7591 /* add the bitmap */
7592 if (mddev->bitmap) {
7593 rv = -EEXIST;
7594 goto err;
7595 }
7596 if (mddev->bitmap_info.default_offset == 0) {
7597 rv = -EINVAL;
7598 goto err;
7599 }
7600 mddev->bitmap_info.offset =
7601 mddev->bitmap_info.default_offset;
7602 mddev->bitmap_info.space =
7603 mddev->bitmap_info.default_space;
7604 rv = mddev->bitmap_ops->create(mddev);
7605 if (!rv)
7606 rv = mddev->bitmap_ops->load(mddev);
7607
7608 if (rv)
7609 mddev->bitmap_ops->destroy(mddev);
7610 } else {
7611 struct md_bitmap_stats stats;
7612
7613 rv = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
7614 if (rv)
7615 goto err;
7616
7617 if (stats.file) {
7618 rv = -EINVAL;
7619 goto err;
7620 }
7621
7622 if (mddev->bitmap_info.nodes) {
7623 /* hold PW on all the bitmap locks */
7624 if (mddev->cluster_ops->lock_all_bitmaps(mddev) <= 0) {
7625 pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n");
7626 rv = -EPERM;
7627 mddev->cluster_ops->unlock_all_bitmaps(mddev);
7628 goto err;
7629 }
7630
7631 mddev->bitmap_info.nodes = 0;
7632 put_cluster_ops(mddev);
7633 mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
7634 }
7635 mddev->bitmap_ops->destroy(mddev);
7636 mddev->bitmap_info.offset = 0;
7637 }
7638 }
7639 md_update_sb(mddev, 1);
7640 return rv;
7641 err:
7642 return rv;
7643 }
7644
7645 static int set_disk_faulty(struct mddev *mddev, dev_t dev)
7646 {
7647 struct md_rdev *rdev;
7648 int err = 0;
7649
7650 if (mddev->pers == NULL)
7651 return -ENODEV;
7652
7653 rcu_read_lock();
7654 rdev = md_find_rdev_rcu(mddev, dev);
7655 if (!rdev)
7656 err = -ENODEV;
7657 else {
7658 md_error(mddev, rdev);
7659 if (test_bit(MD_BROKEN, &mddev->flags))
7660 err = -EBUSY;
7661 }
7662 rcu_read_unlock();
7663 return err;
7664 }
7665
7666 /*
7667 * We have a problem here: there is no easy way to give a CHS
7668 * virtual geometry. We currently pretend that we have 2 heads and
7669 * 4 sectors per track (with a BIG number of cylinders...). This drives
7670 * dosfs just mad... ;-)
7671 */
7672 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
7673 {
7674 struct mddev *mddev = bdev->bd_disk->private_data;
7675
7676 geo->heads = 2;
7677 geo->sectors = 4;
7678 geo->cylinders = mddev->array_sectors / 8;
7679 return 0;
7680 }
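/*
 * Worked example of the fake geometry above: heads * sectors = 2 * 4 = 8,
 * so for a 1 TiB array (2147483648 sectors) md_getgeo() reports
 * cylinders = 2147483648 / 8 = 268435456, with heads = 2 and sectors = 4.
 */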
7681
7682 static inline int md_ioctl_valid(unsigned int cmd)
7683 {
7684 switch (cmd) {
7685 case GET_ARRAY_INFO:
7686 case GET_DISK_INFO:
7687 case RAID_VERSION:
7688 return 0;
7689 case ADD_NEW_DISK:
7690 case GET_BITMAP_FILE:
7691 case HOT_ADD_DISK:
7692 case HOT_REMOVE_DISK:
7693 case RESTART_ARRAY_RW:
7694 case RUN_ARRAY:
7695 case SET_ARRAY_INFO:
7696 case SET_BITMAP_FILE:
7697 case SET_DISK_FAULTY:
7698 case STOP_ARRAY:
7699 case STOP_ARRAY_RO:
7700 case CLUSTERED_DISK_NACK:
7701 if (!capable(CAP_SYS_ADMIN))
7702 return -EACCES;
7703 return 0;
7704 default:
7705 return -ENOTTY;
7706 }
7707 }
7708
7709 static bool md_ioctl_need_suspend(unsigned int cmd)
7710 {
7711 switch (cmd) {
7712 case ADD_NEW_DISK:
7713 case HOT_ADD_DISK:
7714 case HOT_REMOVE_DISK:
7715 case SET_BITMAP_FILE:
7716 case SET_ARRAY_INFO:
7717 return true;
7718 default:
7719 return false;
7720 }
7721 }
7722
7723 static int __md_set_array_info(struct mddev *mddev, void __user *argp)
7724 {
7725 mdu_array_info_t info;
7726 int err;
7727
7728 if (!argp)
7729 memset(&info, 0, sizeof(info));
7730 else if (copy_from_user(&info, argp, sizeof(info)))
7731 return -EFAULT;
7732
7733 if (mddev->pers) {
7734 err = update_array_info(mddev, &info);
7735 if (err)
7736 pr_warn("md: couldn't update array info. %d\n", err);
7737 return err;
7738 }
7739
7740 if (!list_empty(&mddev->disks)) {
7741 pr_warn("md: array %s already has disks!\n", mdname(mddev));
7742 return -EBUSY;
7743 }
7744
7745 if (mddev->raid_disks) {
7746 pr_warn("md: array %s already initialised!\n", mdname(mddev));
7747 return -EBUSY;
7748 }
7749
7750 err = md_set_array_info(mddev, &info);
7751 if (err)
7752 pr_warn("md: couldn't set array info. %d\n", err);
7753
7754 return err;
7755 }
7756
7757 static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
7758 unsigned int cmd, unsigned long arg)
7759 {
7760 int err = 0;
7761 void __user *argp = (void __user *)arg;
7762 struct mddev *mddev = NULL;
7763
7764 err = md_ioctl_valid(cmd);
7765 if (err)
7766 return err;
7767
7768 /*
7769 * Commands dealing with the RAID driver but not any
7770 * particular array:
7771 */
7772 if (cmd == RAID_VERSION)
7773 return get_version(argp);
7774
7775 /*
7776 * Commands creating/starting a new array:
7777 */
7778
7779 mddev = bdev->bd_disk->private_data;
7780
7781 /* Some actions do not require the mutex */
7782 switch (cmd) {
7783 case GET_ARRAY_INFO:
7784 if (!mddev->raid_disks && !mddev->external)
7785 return -ENODEV;
7786 return get_array_info(mddev, argp);
7787
7788 case GET_DISK_INFO:
7789 if (!mddev->raid_disks && !mddev->external)
7790 return -ENODEV;
7791 return get_disk_info(mddev, argp);
7792
7793 case SET_DISK_FAULTY:
7794 return set_disk_faulty(mddev, new_decode_dev(arg));
7795
7796 case GET_BITMAP_FILE:
7797 return get_bitmap_file(mddev, argp);
7798 }
7799
7800 if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
7801 /* Need to flush page cache, and ensure no-one else opens
7802 * and writes
7803 */
7804 err = mddev_set_closing_and_sync_blockdev(mddev, 1);
7805 if (err)
7806 return err;
7807 }
7808
7809 if (!md_is_rdwr(mddev))
7810 flush_work(&mddev->sync_work);
7811
7812 err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev) :
7813 mddev_lock(mddev);
7814 if (err) {
7815 pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n",
7816 err, cmd);
7817 goto out;
7818 }
7819
7820 if (cmd == SET_ARRAY_INFO) {
7821 err = __md_set_array_info(mddev, argp);
7822 goto unlock;
7823 }
7824
7825 /*
7826 * Commands querying/configuring an existing array:
7827 */
7828 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
7829 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
7830 if ((!mddev->raid_disks && !mddev->external)
7831 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
7832 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
7833 && cmd != GET_BITMAP_FILE) {
7834 err = -ENODEV;
7835 goto unlock;
7836 }
7837
7838 /*
7839 * Commands even a read-only array can execute:
7840 */
7841 switch (cmd) {
7842 case RESTART_ARRAY_RW:
7843 err = restart_array(mddev);
7844 goto unlock;
7845
7846 case STOP_ARRAY:
7847 err = do_md_stop(mddev, 0);
7848 goto unlock;
7849
7850 case STOP_ARRAY_RO:
7851 if (mddev->pers)
7852 err = md_set_readonly(mddev);
7853 goto unlock;
7854
7855 case HOT_REMOVE_DISK:
7856 err = hot_remove_disk(mddev, new_decode_dev(arg));
7857 goto unlock;
7858
7859 case ADD_NEW_DISK:
7860 /* We can support ADD_NEW_DISK on read-only arrays
7861 * only if we are re-adding a preexisting device.
7862 * So require mddev->pers and MD_DISK_SYNC.
7863 */
7864 if (mddev->pers) {
7865 mdu_disk_info_t info;
7866 if (copy_from_user(&info, argp, sizeof(info)))
7867 err = -EFAULT;
7868 else if (!(info.state & (1<<MD_DISK_SYNC)))
7869 /* Need to clear read-only for this */
7870 break;
7871 else
7872 err = md_add_new_disk(mddev, &info);
7873 goto unlock;
7874 }
7875 break;
7876 }
7877
7878 /*
7879 * The remaining ioctls are changing the state of the
7880 * superblock, so we do not allow them on read-only arrays.
7881 */
7882 if (!md_is_rdwr(mddev) && mddev->pers) {
7883 if (mddev->ro != MD_AUTO_READ) {
7884 err = -EROFS;
7885 goto unlock;
7886 }
7887 mddev->ro = MD_RDWR;
7888 sysfs_notify_dirent_safe(mddev->sysfs_state);
7889 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7890 /* mddev_unlock will wake thread */
7891 /* If a device failed while we were read-only, we
7892 * need to make sure the metadata is updated now.
7893 */
7894 if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) {
7895 mddev_unlock(mddev);
7896 wait_event(mddev->sb_wait,
7897 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) &&
7898 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
7899 mddev_lock_nointr(mddev);
7900 }
7901 }
7902
7903 switch (cmd) {
7904 case ADD_NEW_DISK:
7905 {
7906 mdu_disk_info_t info;
7907 if (copy_from_user(&info, argp, sizeof(info)))
7908 err = -EFAULT;
7909 else
7910 err = md_add_new_disk(mddev, &info);
7911 goto unlock;
7912 }
7913
7914 case CLUSTERED_DISK_NACK:
7915 if (mddev_is_clustered(mddev))
7916 mddev->cluster_ops->new_disk_ack(mddev, false);
7917 else
7918 err = -EINVAL;
7919 goto unlock;
7920
7921 case HOT_ADD_DISK:
7922 err = hot_add_disk(mddev, new_decode_dev(arg));
7923 goto unlock;
7924
7925 case RUN_ARRAY:
7926 err = do_md_run(mddev);
7927 goto unlock;
7928
7929 case SET_BITMAP_FILE:
7930 err = set_bitmap_file(mddev, (int)arg);
7931 goto unlock;
7932
7933 default:
7934 err = -EINVAL;
7935 goto unlock;
7936 }
7937
7938 unlock:
7939 if (mddev->hold_active == UNTIL_IOCTL &&
7940 err != -EINVAL)
7941 mddev->hold_active = 0;
7942
7943 md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) :
7944 mddev_unlock(mddev);
7945
7946 out:
7947 if (cmd == STOP_ARRAY_RO || (err && cmd == STOP_ARRAY))
7948 clear_bit(MD_CLOSING, &mddev->flags);
7949 return err;
7950 }
7951 #ifdef CONFIG_COMPAT
7952 static int md_compat_ioctl(struct block_device *bdev, blk_mode_t mode,
7953 unsigned int cmd, unsigned long arg)
7954 {
7955 switch (cmd) {
7956 case HOT_REMOVE_DISK:
7957 case HOT_ADD_DISK:
7958 case SET_DISK_FAULTY:
7959 case SET_BITMAP_FILE:
7960 /* These take in integer arg, do not convert */
7961 break;
7962 default:
7963 arg = (unsigned long)compat_ptr(arg);
7964 break;
7965 }
7966
7967 return md_ioctl(bdev, mode, cmd, arg);
7968 }
7969 #endif /* CONFIG_COMPAT */
7970
7971 static int md_set_read_only(struct block_device *bdev, bool ro)
7972 {
7973 struct mddev *mddev = bdev->bd_disk->private_data;
7974 int err;
7975
7976 err = mddev_lock(mddev);
7977 if (err)
7978 return err;
7979
7980 if (!mddev->raid_disks && !mddev->external) {
7981 err = -ENODEV;
7982 goto out_unlock;
7983 }
7984
7985 /*
7986 * Transitioning to read-auto need only happen for arrays that call
7987 * md_write_start and which are not ready for writes yet.
7988 */
7989 if (!ro && mddev->ro == MD_RDONLY && mddev->pers) {
7990 err = restart_array(mddev);
7991 if (err)
7992 goto out_unlock;
7993 mddev->ro = MD_AUTO_READ;
7994 }
7995
7996 out_unlock:
7997 mddev_unlock(mddev);
7998 return err;
7999 }
8000
8001 static int md_open(struct gendisk *disk, blk_mode_t mode)
8002 {
8003 struct mddev *mddev;
8004 int err;
8005
8006 spin_lock(&all_mddevs_lock);
8007 mddev = mddev_get(disk->private_data);
8008 spin_unlock(&all_mddevs_lock);
8009 if (!mddev)
8010 return -ENODEV;
8011
8012 err = mutex_lock_interruptible(&mddev->open_mutex);
8013 if (err)
8014 goto out;
8015
8016 err = -ENODEV;
8017 if (test_bit(MD_CLOSING, &mddev->flags))
8018 goto out_unlock;
8019
8020 atomic_inc(&mddev->openers);
8021 mutex_unlock(&mddev->open_mutex);
8022
8023 disk_check_media_change(disk);
8024 return 0;
8025
8026 out_unlock:
8027 mutex_unlock(&mddev->open_mutex);
8028 out:
8029 mddev_put(mddev);
8030 return err;
8031 }
8032
8033 static void md_release(struct gendisk *disk)
8034 {
8035 struct mddev *mddev = disk->private_data;
8036
8037 BUG_ON(!mddev);
8038 atomic_dec(&mddev->openers);
8039 mddev_put(mddev);
8040 }
8041
8042 static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing)
8043 {
8044 struct mddev *mddev = disk->private_data;
8045 unsigned int ret = 0;
8046
8047 if (mddev->changed)
8048 ret = DISK_EVENT_MEDIA_CHANGE;
8049 mddev->changed = 0;
8050 return ret;
8051 }
8052
8053 static void md_free_disk(struct gendisk *disk)
8054 {
8055 struct mddev *mddev = disk->private_data;
8056
8057 mddev_free(mddev);
8058 }
8059
8060 const struct block_device_operations md_fops =
8061 {
8062 .owner = THIS_MODULE,
8063 .submit_bio = md_submit_bio,
8064 .open = md_open,
8065 .release = md_release,
8066 .ioctl = md_ioctl,
8067 #ifdef CONFIG_COMPAT
8068 .compat_ioctl = md_compat_ioctl,
8069 #endif
8070 .getgeo = md_getgeo,
8071 .check_events = md_check_events,
8072 .set_read_only = md_set_read_only,
8073 .free_disk = md_free_disk,
8074 };
8075
8076 static int md_thread(void *arg)
8077 {
8078 struct md_thread *thread = arg;
8079
8080 /*
8081 * md_thread is a 'system-thread'; its priority should be very
8082 * high. We avoid resource deadlocks individually in each
8083 * raid personality. (RAID5 does preallocation) We also use RR and
8084 * the very same RT priority as kswapd, thus we will never get
8085 * into a priority inversion deadlock.
8086 *
8087 * we definitely have to have equal or higher priority than
8088 * bdflush, otherwise bdflush will deadlock if there are too
8089 * many dirty RAID5 blocks.
8090 */
8091
8092 allow_signal(SIGKILL);
8093 while (!kthread_should_stop()) {
8094
8095 /* We need to wait INTERRUPTIBLE so that
8096 * we don't add to the load-average.
8097 * That means we need to be sure no signals are
8098 * pending
8099 */
8100 if (signal_pending(current))
8101 flush_signals(current);
8102
8103 wait_event_interruptible_timeout
8104 (thread->wqueue,
8105 test_bit(THREAD_WAKEUP, &thread->flags)
8106 || kthread_should_stop() || kthread_should_park(),
8107 thread->timeout);
8108
8109 clear_bit(THREAD_WAKEUP, &thread->flags);
8110 if (kthread_should_park())
8111 kthread_parkme();
8112 if (!kthread_should_stop())
8113 thread->run(thread);
8114 }
8115
8116 return 0;
8117 }
8118
8119 static void md_wakeup_thread_directly(struct md_thread __rcu *thread)
8120 {
8121 struct md_thread *t;
8122
8123 rcu_read_lock();
8124 t = rcu_dereference(thread);
8125 if (t)
8126 wake_up_process(t->tsk);
8127 rcu_read_unlock();
8128 }
8129
8130 void md_wakeup_thread(struct md_thread __rcu *thread)
8131 {
8132 struct md_thread *t;
8133
8134 rcu_read_lock();
8135 t = rcu_dereference(thread);
8136 if (t) {
8137 pr_debug("md: waking up MD thread %s.\n", t->tsk->comm);
8138 set_bit(THREAD_WAKEUP, &t->flags);
8139 if (wq_has_sleeper(&t->wqueue))
8140 wake_up(&t->wqueue);
8141 }
8142 rcu_read_unlock();
8143 }
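/*
 * Typical wakeup pattern elsewhere in this file (see md_error() below):
 * publish the new state first, then wake the managing thread so the event
 * is not missed:
 *
 *	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 *	md_wakeup_thread(mddev->thread);
 */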
8144 EXPORT_SYMBOL(md_wakeup_thread);
8145
8146 struct md_thread *md_register_thread(void (*run) (struct md_thread *),
8147 struct mddev *mddev, const char *name)
8148 {
8149 struct md_thread *thread;
8150
8151 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL);
8152 if (!thread)
8153 return NULL;
8154
8155 init_waitqueue_head(&thread->wqueue);
8156
8157 thread->run = run;
8158 thread->mddev = mddev;
8159 thread->timeout = MAX_SCHEDULE_TIMEOUT;
8160 thread->tsk = kthread_run(md_thread, thread,
8161 "%s_%s",
8162 mdname(thread->mddev),
8163 name);
8164 if (IS_ERR(thread->tsk)) {
8165 kfree(thread);
8166 return NULL;
8167 }
8168 return thread;
8169 }
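/*
 * Sketch of how a personality might register its main thread (names such
 * as raidXd are placeholders and error handling is trimmed):
 *
 *	struct md_thread *t;
 *
 *	t = md_register_thread(raidXd, mddev, "raidX");
 *	if (!t)
 *		return -ENOMEM;
 *	rcu_assign_pointer(mddev->thread, t);
 *	...
 *	md_unregister_thread(mddev, &mddev->thread);
 */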
8170 EXPORT_SYMBOL(md_register_thread);
8171
8172 void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp)
8173 {
8174 struct md_thread *thread = rcu_dereference_protected(*threadp,
8175 lockdep_is_held(&mddev->reconfig_mutex));
8176
8177 if (!thread)
8178 return;
8179
8180 rcu_assign_pointer(*threadp, NULL);
8181 synchronize_rcu();
8182
8183 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
8184 kthread_stop(thread->tsk);
8185 kfree(thread);
8186 }
8187 EXPORT_SYMBOL(md_unregister_thread);
8188
8189 void md_error(struct mddev *mddev, struct md_rdev *rdev)
8190 {
8191 if (!rdev || test_bit(Faulty, &rdev->flags))
8192 return;
8193
8194 if (!mddev->pers || !mddev->pers->error_handler)
8195 return;
8196 mddev->pers->error_handler(mddev, rdev);
8197
8198 if (mddev->pers->head.id == ID_RAID0 ||
8199 mddev->pers->head.id == ID_LINEAR)
8200 return;
8201
8202 if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags))
8203 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
8204 sysfs_notify_dirent_safe(rdev->sysfs_state);
8205 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8206 if (!test_bit(MD_BROKEN, &mddev->flags)) {
8207 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8208 md_wakeup_thread(mddev->thread);
8209 }
8210 if (mddev->event_work.func)
8211 queue_work(md_misc_wq, &mddev->event_work);
8212 md_new_event();
8213 }
8214 EXPORT_SYMBOL(md_error);
8215
8216 /* seq_file implementation /proc/mdstat */
8217
8218 static void status_unused(struct seq_file *seq)
8219 {
8220 int i = 0;
8221 struct md_rdev *rdev;
8222
8223 seq_printf(seq, "unused devices: ");
8224
8225 list_for_each_entry(rdev, &pending_raid_disks, same_set) {
8226 i++;
8227 seq_printf(seq, "%pg ", rdev->bdev);
8228 }
8229 if (!i)
8230 seq_printf(seq, "<none>");
8231
8232 seq_printf(seq, "\n");
8233 }
8234
8235 static void status_personalities(struct seq_file *seq)
8236 {
8237 struct md_submodule_head *head;
8238 unsigned long i;
8239
8240 seq_puts(seq, "Personalities : ");
8241
8242 xa_lock(&md_submodule);
8243 xa_for_each(&md_submodule, i, head)
8244 if (head->type == MD_PERSONALITY)
8245 seq_printf(seq, "[%s] ", head->name);
8246 xa_unlock(&md_submodule);
8247
8248 seq_puts(seq, "\n");
8249 }
8250
8251 static int status_resync(struct seq_file *seq, struct mddev *mddev)
8252 {
8253 sector_t max_sectors, resync, res;
8254 unsigned long dt, db = 0;
8255 sector_t rt, curr_mark_cnt, resync_mark_cnt;
8256 int scale, recovery_active;
8257 unsigned int per_milli;
8258
8259 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
8260 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
8261 max_sectors = mddev->resync_max_sectors;
8262 else
8263 max_sectors = mddev->dev_sectors;
8264
8265 resync = mddev->curr_resync;
8266 if (resync < MD_RESYNC_ACTIVE) {
8267 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
8268 /* Still cleaning up */
8269 resync = max_sectors;
8270 } else if (resync > max_sectors) {
8271 resync = max_sectors;
8272 } else {
8273 res = atomic_read(&mddev->recovery_active);
8274 /*
8275 * Resync has started, but the subtraction has overflowed or
8276 * yielded one of the special values. Force it to active to
8277 * ensure the status reports an active resync.
8278 */
8279 if (resync < res || resync - res < MD_RESYNC_ACTIVE)
8280 resync = MD_RESYNC_ACTIVE;
8281 else
8282 resync -= res;
8283 }
8284
8285 if (resync == MD_RESYNC_NONE) {
8286 if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) {
8287 struct md_rdev *rdev;
8288
8289 rdev_for_each(rdev, mddev)
8290 if (rdev->raid_disk >= 0 &&
8291 !test_bit(Faulty, &rdev->flags) &&
8292 rdev->recovery_offset != MaxSector &&
8293 rdev->recovery_offset) {
8294 seq_printf(seq, "\trecover=REMOTE");
8295 return 1;
8296 }
8297 if (mddev->reshape_position != MaxSector)
8298 seq_printf(seq, "\treshape=REMOTE");
8299 else
8300 seq_printf(seq, "\tresync=REMOTE");
8301 return 1;
8302 }
8303 if (mddev->recovery_cp < MaxSector) {
8304 seq_printf(seq, "\tresync=PENDING");
8305 return 1;
8306 }
8307 return 0;
8308 }
8309 if (resync < MD_RESYNC_ACTIVE) {
8310 seq_printf(seq, "\tresync=DELAYED");
8311 return 1;
8312 }
8313
8314 WARN_ON(max_sectors == 0);
8315 /* Pick 'scale' such that (resync>>scale)*1000 will fit
8316 * in a sector_t, and (max_sectors>>scale) will fit in a
8317 * u32, as those are the requirements for sector_div.
8318 * Thus 'scale' must be at least 10
8319 */
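/* Worked example (illustrative numbers): with resync = 1000000 and
 * max_sectors = 2000000 sectors, scale stays 10, so
 * res = (1000000 >> 10) * 1000 = 976000 and the divisor is
 * (2000000 >> 10) + 1 = 1954, giving per_milli = 499, shown as "49.9%".
 */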
8320 scale = 10;
8321 if (sizeof(sector_t) > sizeof(unsigned long)) {
8322 while ( max_sectors/2 > (1ULL<<(scale+32)))
8323 scale++;
8324 }
8325 res = (resync>>scale)*1000;
8326 sector_div(res, (u32)((max_sectors>>scale)+1));
8327
8328 per_milli = res;
8329 {
8330 int i, x = per_milli/50, y = 20-x;
8331 seq_printf(seq, "[");
8332 for (i = 0; i < x; i++)
8333 seq_printf(seq, "=");
8334 seq_printf(seq, ">");
8335 for (i = 0; i < y; i++)
8336 seq_printf(seq, ".");
8337 seq_printf(seq, "] ");
8338 }
8339 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
8340 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
8341 "reshape" :
8342 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
8343 "check" :
8344 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
8345 "resync" : "recovery"))),
8346 per_milli/10, per_milli % 10,
8347 (unsigned long long) resync/2,
8348 (unsigned long long) max_sectors/2);
8349
8350 /*
8351 * dt: time from mark until now
8352 * db: blocks written from mark until now
8353 * rt: remaining time
8354 *
8355 * rt is a sector_t, which is always 64bit now. We are keeping
8356 * the original algorithm, but it is not really necessary.
8357 *
8358 * Original algorithm:
8359 * So we divide before multiply in case it is 32bit and close
8360 * to the limit.
8361 * We scale the divisor (db) by 32 to avoid losing precision
8362 * near the end of resync when the number of remaining sectors
8363 * is close to 'db'.
8364 * We then divide rt by 32 after multiplying by db to compensate.
8365 * The '+1' avoids division by zero if db is very small.
8366 */
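/* Worked example (illustrative numbers): with 1000000 sectors remaining,
 * db = 10000 sectors written since the mark and dt = 3 seconds,
 * rt = 1000000 / (10000/32 + 1) = 3194, then * 3 = 9582, then >> 5 = 299
 * seconds, i.e. roughly remaining/db * dt, printed as "finish=4.9min".
 */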
8367 dt = ((jiffies - mddev->resync_mark) / HZ);
8368 if (!dt) dt++;
8369
8370 curr_mark_cnt = mddev->curr_mark_cnt;
8371 recovery_active = atomic_read(&mddev->recovery_active);
8372 resync_mark_cnt = mddev->resync_mark_cnt;
8373
8374 if (curr_mark_cnt >= (recovery_active + resync_mark_cnt))
8375 db = curr_mark_cnt - (recovery_active + resync_mark_cnt);
8376
8377 rt = max_sectors - resync; /* number of remaining sectors */
8378 rt = div64_u64(rt, db/32+1);
8379 rt *= dt;
8380 rt >>= 5;
8381
8382 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
8383 ((unsigned long)rt % 60)/6);
8384
8385 seq_printf(seq, " speed=%ldK/sec", db/2/dt);
8386 return 1;
8387 }
8388
8389 static void *md_seq_start(struct seq_file *seq, loff_t *pos)
8390 __acquires(&all_mddevs_lock)
8391 {
8392 seq->poll_event = atomic_read(&md_event_count);
8393 spin_lock(&all_mddevs_lock);
8394
8395 return seq_list_start_head(&all_mddevs, *pos);
8396 }
8397
8398 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
8399 {
8400 return seq_list_next(v, &all_mddevs, pos);
8401 }
8402
8403 static void md_seq_stop(struct seq_file *seq, void *v)
8404 __releases(&all_mddevs_lock)
8405 {
8406 spin_unlock(&all_mddevs_lock);
8407 }
8408
8409 static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev)
8410 {
8411 struct md_bitmap_stats stats;
8412 unsigned long used_pages;
8413 unsigned long chunk_kb;
8414 int err;
8415
8416 err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
8417 if (err)
8418 return;
8419
8420 chunk_kb = mddev->bitmap_info.chunksize >> 10;
8421 used_pages = stats.pages - stats.missing_pages;
8422
8423 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], %lu%s chunk",
8424 used_pages, stats.pages, used_pages << (PAGE_SHIFT - 10),
8425 chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize,
8426 chunk_kb ? "KB" : "B");
8427
8428 if (stats.file) {
8429 seq_puts(seq, ", file: ");
8430 seq_file_path(seq, stats.file, " \t\n");
8431 }
8432
8433 seq_putc(seq, '\n');
8434 }
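/*
 * With the format above, a typical /proc/mdstat bitmap line produced here
 * looks like (values and the file path are illustrative only):
 *
 *	bitmap: 3/8 pages [12KB], 65536KB chunk, file: /bitmap.file
 */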
8435
8436 static int md_seq_show(struct seq_file *seq, void *v)
8437 {
8438 struct mddev *mddev;
8439 sector_t sectors;
8440 struct md_rdev *rdev;
8441
8442 if (v == &all_mddevs) {
8443 status_personalities(seq);
8444 if (list_empty(&all_mddevs))
8445 status_unused(seq);
8446 return 0;
8447 }
8448
8449 mddev = list_entry(v, struct mddev, all_mddevs);
8450 if (!mddev_get(mddev))
8451 return 0;
8452
8453 spin_unlock(&all_mddevs_lock);
8454
8455 /* prevent bitmap to be freed after checking */
8456 mutex_lock(&mddev->bitmap_info.mutex);
8457
8458 spin_lock(&mddev->lock);
8459 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
8460 seq_printf(seq, "%s : ", mdname(mddev));
8461 if (mddev->pers) {
8462 if (test_bit(MD_BROKEN, &mddev->flags))
8463 seq_printf(seq, "broken");
8464 else
8465 seq_printf(seq, "active");
8466 if (mddev->ro == MD_RDONLY)
8467 seq_printf(seq, " (read-only)");
8468 if (mddev->ro == MD_AUTO_READ)
8469 seq_printf(seq, " (auto-read-only)");
8470 seq_printf(seq, " %s", mddev->pers->head.name);
8471 } else {
8472 seq_printf(seq, "inactive");
8473 }
8474
8475 sectors = 0;
8476 rcu_read_lock();
8477 rdev_for_each_rcu(rdev, mddev) {
8478 seq_printf(seq, " %pg[%d]", rdev->bdev, rdev->desc_nr);
8479
8480 if (test_bit(WriteMostly, &rdev->flags))
8481 seq_printf(seq, "(W)");
8482 if (test_bit(Journal, &rdev->flags))
8483 seq_printf(seq, "(J)");
8484 if (test_bit(Faulty, &rdev->flags)) {
8485 seq_printf(seq, "(F)");
8486 continue;
8487 }
8488 if (rdev->raid_disk < 0)
8489 seq_printf(seq, "(S)"); /* spare */
8490 if (test_bit(Replacement, &rdev->flags))
8491 seq_printf(seq, "(R)");
8492 sectors += rdev->sectors;
8493 }
8494 rcu_read_unlock();
8495
8496 if (!list_empty(&mddev->disks)) {
8497 if (mddev->pers)
8498 seq_printf(seq, "\n %llu blocks",
8499 (unsigned long long)
8500 mddev->array_sectors / 2);
8501 else
8502 seq_printf(seq, "\n %llu blocks",
8503 (unsigned long long)sectors / 2);
8504 }
8505 if (mddev->persistent) {
8506 if (mddev->major_version != 0 ||
8507 mddev->minor_version != 90) {
8508 seq_printf(seq," super %d.%d",
8509 mddev->major_version,
8510 mddev->minor_version);
8511 }
8512 } else if (mddev->external)
8513 seq_printf(seq, " super external:%s",
8514 mddev->metadata_type);
8515 else
8516 seq_printf(seq, " super non-persistent");
8517
8518 if (mddev->pers) {
8519 mddev->pers->status(seq, mddev);
8520 seq_printf(seq, "\n ");
8521 if (mddev->pers->sync_request) {
8522 if (status_resync(seq, mddev))
8523 seq_printf(seq, "\n ");
8524 }
8525 } else
8526 seq_printf(seq, "\n ");
8527
8528 md_bitmap_status(seq, mddev);
8529
8530 seq_printf(seq, "\n");
8531 }
8532 spin_unlock(&mddev->lock);
8533 mutex_unlock(&mddev->bitmap_info.mutex);
8534 spin_lock(&all_mddevs_lock);
8535
8536 if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs))
8537 status_unused(seq);
8538
8539 mddev_put_locked(mddev);
8540 return 0;
8541 }
8542
8543 static const struct seq_operations md_seq_ops = {
8544 .start = md_seq_start,
8545 .next = md_seq_next,
8546 .stop = md_seq_stop,
8547 .show = md_seq_show,
8548 };
8549
8550 static int md_seq_open(struct inode *inode, struct file *file)
8551 {
8552 struct seq_file *seq;
8553 int error;
8554
8555 error = seq_open(file, &md_seq_ops);
8556 if (error)
8557 return error;
8558
8559 seq = file->private_data;
8560 seq->poll_event = atomic_read(&md_event_count);
8561 return error;
8562 }
8563
8564 static int md_unloading;
8565 static __poll_t mdstat_poll(struct file *filp, poll_table *wait)
8566 {
8567 struct seq_file *seq = filp->private_data;
8568 __poll_t mask;
8569
8570 if (md_unloading)
8571 return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
8572 poll_wait(filp, &md_event_waiters, wait);
8573
8574 /* always allow read */
8575 mask = EPOLLIN | EPOLLRDNORM;
8576
8577 if (seq->poll_event != atomic_read(&md_event_count))
8578 mask |= EPOLLERR | EPOLLPRI;
8579 return mask;
8580 }
8581
8582 static const struct proc_ops mdstat_proc_ops = {
8583 .proc_open = md_seq_open,
8584 .proc_read = seq_read,
8585 .proc_lseek = seq_lseek,
8586 .proc_release = seq_release,
8587 .proc_poll = mdstat_poll,
8588 };
8589
8590 int register_md_submodule(struct md_submodule_head *msh)
8591 {
8592 return xa_insert(&md_submodule, msh->id, msh, GFP_KERNEL);
8593 }
8594 EXPORT_SYMBOL_GPL(register_md_submodule);
8595
8596 void unregister_md_submodule(struct md_submodule_head *msh)
8597 {
8598 xa_erase(&md_submodule, msh->id);
8599 }
8600 EXPORT_SYMBOL_GPL(unregister_md_submodule);
8601
8602 int md_setup_cluster(struct mddev *mddev, int nodes)
8603 {
8604 int ret = get_cluster_ops(mddev);
8605
8606 if (ret) {
8607 request_module("md-cluster");
8608 ret = get_cluster_ops(mddev);
8609 }
8610
8611 /* ensure module won't be unloaded */
8612 if (ret) {
8613 pr_warn("can't find md-cluster module or get its reference.\n");
8614 return ret;
8615 }
8616
8617 ret = mddev->cluster_ops->join(mddev, nodes);
8618 if (!ret)
8619 mddev->safemode_delay = 0;
8620 return ret;
8621 }
8622
8623 void md_cluster_stop(struct mddev *mddev)
8624 {
8625 put_cluster_ops(mddev);
8626 }
8627
8628 static bool is_rdev_holder_idle(struct md_rdev *rdev, bool init)
8629 {
8630 unsigned long last_events = rdev->last_events;
8631
8632 if (!bdev_is_partition(rdev->bdev))
8633 return true;
8634
8635 /*
8636 * If rdev is a partition, and the user doesn't issue IO to the array, the
8637 * array is still not idle if the user issues IO to other partitions.
8638 */
8639 rdev->last_events = part_stat_read_accum(rdev->bdev->bd_disk->part0,
8640 sectors) -
8641 part_stat_read_accum(rdev->bdev, sectors);
8642
8643 return init || rdev->last_events <= last_events;
8644 }
8645
8646 /*
8647 * mddev is idle if the following conditions have held since the last check:
8648 * 1) mddev has not completed any normal IO;
8649 * 2) mddev has no inflight normal IO;
8650 * 3) if any member disk is a partition, the other partitions on that disk
8651 * have not completed any IO either.
8652 *
8653 * Note that this check relies on IO accounting being enabled.
8654 */
8655 static bool is_mddev_idle(struct mddev *mddev, int init)
8656 {
8657 unsigned long last_events = mddev->normal_io_events;
8658 struct gendisk *disk;
8659 struct md_rdev *rdev;
8660 bool idle = true;
8661
8662 disk = mddev_is_dm(mddev) ? mddev->dm_gendisk : mddev->gendisk;
8663 if (!disk)
8664 return true;
8665
8666 mddev->normal_io_events = part_stat_read_accum(disk->part0, sectors);
8667 if (!init && (mddev->normal_io_events > last_events ||
8668 bdev_count_inflight(disk->part0)))
8669 idle = false;
8670
8671 rcu_read_lock();
8672 rdev_for_each_rcu(rdev, mddev)
8673 if (!is_rdev_holder_idle(rdev, init))
8674 idle = false;
8675 rcu_read_unlock();
8676
8677 return idle;
8678 }
8679
8680 void md_done_sync(struct mddev *mddev, int blocks, int ok)
8681 {
8682 /* another "blocks" (512byte) blocks have been synced */
8683 atomic_sub(blocks, &mddev->recovery_active);
8684 wake_up(&mddev->recovery_wait);
8685 if (!ok) {
8686 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8687 set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
8688 md_wakeup_thread(mddev->thread);
8689 // stop recovery, signal do_sync ....
8690 }
8691 }
8692 EXPORT_SYMBOL(md_done_sync);
8693
8694 /* md_write_start(mddev, bi)
8695 * If we need to update some array metadata (e.g. 'active' flag
8696 * in superblock) before writing, schedule a superblock update
8697 * and wait for it to complete.
8698 * A return value of 'false' means that the write wasn't recorded
8699 * and cannot proceed as the array is being suspended.
8700 */
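/*
 * Sketch of the expected pairing in a personality's write path (the actual
 * I/O submission and error handling are omitted):
 *
 *	md_write_start(mddev, bio);	may wait for the superblock update
 *	... if the bio is split, call md_write_inc(mddev, bio) per extra part ...
 *	md_write_end(mddev);		once for every start/inc when that part completes
 */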
8701 void md_write_start(struct mddev *mddev, struct bio *bi)
8702 {
8703 int did_change = 0;
8704
8705 if (bio_data_dir(bi) != WRITE)
8706 return;
8707
8708 BUG_ON(mddev->ro == MD_RDONLY);
8709 if (mddev->ro == MD_AUTO_READ) {
8710 /* need to switch to read/write */
8711 mddev->ro = MD_RDWR;
8712 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8713 md_wakeup_thread(mddev->thread);
8714 md_wakeup_thread(mddev->sync_thread);
8715 did_change = 1;
8716 }
8717 rcu_read_lock();
8718 percpu_ref_get(&mddev->writes_pending);
8719 smp_mb(); /* Match smp_mb in set_in_sync() */
8720 if (mddev->safemode == 1)
8721 mddev->safemode = 0;
8722 /* sync_checkers is always 0 when writes_pending is in per-cpu mode */
8723 if (mddev->in_sync || mddev->sync_checkers) {
8724 spin_lock(&mddev->lock);
8725 if (mddev->in_sync) {
8726 mddev->in_sync = 0;
8727 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8728 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8729 md_wakeup_thread(mddev->thread);
8730 did_change = 1;
8731 }
8732 spin_unlock(&mddev->lock);
8733 }
8734 rcu_read_unlock();
8735 if (did_change)
8736 sysfs_notify_dirent_safe(mddev->sysfs_state);
8737 if (!mddev->has_superblocks)
8738 return;
8739 wait_event(mddev->sb_wait,
8740 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
8741 }
8742 EXPORT_SYMBOL(md_write_start);
8743
8744 /* md_write_inc can only be called when md_write_start() has
8745 * already been called at least once for the current request.
8746 * It increments the counter and is useful when a single request
8747 * is split into several parts. Each part causes an increment and
8748 * so needs a matching md_write_end().
8749 * Unlike md_write_start(), it is safe to call md_write_inc() inside
8750 * a spinlocked region.
8751 */
8752 void md_write_inc(struct mddev *mddev, struct bio *bi)
8753 {
8754 if (bio_data_dir(bi) != WRITE)
8755 return;
8756 WARN_ON_ONCE(mddev->in_sync || !md_is_rdwr(mddev));
8757 percpu_ref_get(&mddev->writes_pending);
8758 }
8759 EXPORT_SYMBOL(md_write_inc);
8760
8761 void md_write_end(struct mddev *mddev)
8762 {
8763 percpu_ref_put(&mddev->writes_pending);
8764
8765 if (mddev->safemode == 2)
8766 md_wakeup_thread(mddev->thread);
8767 else if (mddev->safemode_delay)
8768 /* The roundup() ensures this only performs locking once
8769 * every ->safemode_delay jiffies
8770 */
8771 mod_timer(&mddev->safemode_timer,
8772 roundup(jiffies, mddev->safemode_delay) +
8773 mddev->safemode_delay);
8774 }
8775
8776 EXPORT_SYMBOL(md_write_end);
8777
8778 /* This is used by raid0 and raid10 */
8779 void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
8780 struct bio *bio, sector_t start, sector_t size)
8781 {
8782 struct bio *discard_bio = NULL;
8783
8784 if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO,
8785 &discard_bio) || !discard_bio)
8786 return;
8787
8788 bio_chain(discard_bio, bio);
8789 bio_clone_blkg_association(discard_bio, bio);
8790 mddev_trace_remap(mddev, discard_bio, bio->bi_iter.bi_sector);
8791 submit_bio_noacct(discard_bio);
8792 }
8793 EXPORT_SYMBOL_GPL(md_submit_discard_bio);
8794
8795 static void md_bitmap_start(struct mddev *mddev,
8796 struct md_io_clone *md_io_clone)
8797 {
8798 if (mddev->pers->bitmap_sector)
8799 mddev->pers->bitmap_sector(mddev, &md_io_clone->offset,
8800 &md_io_clone->sectors);
8801
8802 mddev->bitmap_ops->start_write(mddev, md_io_clone->offset,
8803 md_io_clone->sectors);
8804 }
8805
8806 static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone)
8807 {
8808 mddev->bitmap_ops->end_write(mddev, md_io_clone->offset,
8809 md_io_clone->sectors);
8810 }
8811
8812 static void md_end_clone_io(struct bio *bio)
8813 {
8814 struct md_io_clone *md_io_clone = bio->bi_private;
8815 struct bio *orig_bio = md_io_clone->orig_bio;
8816 struct mddev *mddev = md_io_clone->mddev;
8817
8818 if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap)
8819 md_bitmap_end(mddev, md_io_clone);
8820
8821 if (bio->bi_status && !orig_bio->bi_status)
8822 orig_bio->bi_status = bio->bi_status;
8823
8824 if (md_io_clone->start_time)
8825 bio_end_io_acct(orig_bio, md_io_clone->start_time);
8826
8827 bio_put(bio);
8828 bio_endio(orig_bio);
8829 percpu_ref_put(&mddev->active_io);
8830 }
8831
8832 static void md_clone_bio(struct mddev *mddev, struct bio **bio)
8833 {
8834 struct block_device *bdev = (*bio)->bi_bdev;
8835 struct md_io_clone *md_io_clone;
8836 struct bio *clone =
8837 bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set);
8838
8839 md_io_clone = container_of(clone, struct md_io_clone, bio_clone);
8840 md_io_clone->orig_bio = *bio;
8841 md_io_clone->mddev = mddev;
8842 if (blk_queue_io_stat(bdev->bd_disk->queue))
8843 md_io_clone->start_time = bio_start_io_acct(*bio);
8844
8845 if (bio_data_dir(*bio) == WRITE && mddev->bitmap) {
8846 md_io_clone->offset = (*bio)->bi_iter.bi_sector;
8847 md_io_clone->sectors = bio_sectors(*bio);
8848 md_bitmap_start(mddev, md_io_clone);
8849 }
8850
8851 clone->bi_end_io = md_end_clone_io;
8852 clone->bi_private = md_io_clone;
8853 *bio = clone;
8854 }
8855
8856 void md_account_bio(struct mddev *mddev, struct bio **bio)
8857 {
8858 percpu_ref_get(&mddev->active_io);
8859 md_clone_bio(mddev, bio);
8860 }
8861 EXPORT_SYMBOL_GPL(md_account_bio);
8862
8863 void md_free_cloned_bio(struct bio *bio)
8864 {
8865 struct md_io_clone *md_io_clone = bio->bi_private;
8866 struct bio *orig_bio = md_io_clone->orig_bio;
8867 struct mddev *mddev = md_io_clone->mddev;
8868
8869 if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap)
8870 md_bitmap_end(mddev, md_io_clone);
8871
8872 if (bio->bi_status && !orig_bio->bi_status)
8873 orig_bio->bi_status = bio->bi_status;
8874
8875 if (md_io_clone->start_time)
8876 bio_end_io_acct(orig_bio, md_io_clone->start_time);
8877
8878 bio_put(bio);
8879 percpu_ref_put(&mddev->active_io);
8880 }
8881 EXPORT_SYMBOL_GPL(md_free_cloned_bio);
8882
8883 /* md_allow_write(mddev)
8884 * Calling this ensures that the array is marked 'active' so that writes
8885 * may proceed without blocking. It is important to call this before
8886 * attempting a GFP_KERNEL allocation while holding the mddev lock.
8887 * Must be called with mddev_lock held.
8888 */
8889 void md_allow_write(struct mddev *mddev)
8890 {
8891 if (!mddev->pers)
8892 return;
8893 if (!md_is_rdwr(mddev))
8894 return;
8895 if (!mddev->pers->sync_request)
8896 return;
8897
8898 spin_lock(&mddev->lock);
8899 if (mddev->in_sync) {
8900 mddev->in_sync = 0;
8901 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8902 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8903 if (mddev->safemode_delay &&
8904 mddev->safemode == 0)
8905 mddev->safemode = 1;
8906 spin_unlock(&mddev->lock);
8907 md_update_sb(mddev, 0);
8908 sysfs_notify_dirent_safe(mddev->sysfs_state);
8909 /* wait for the dirty state to be recorded in the metadata */
8910 wait_event(mddev->sb_wait,
8911 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
8912 } else
8913 spin_unlock(&mddev->lock);
8914 }
8915 EXPORT_SYMBOL_GPL(md_allow_write);
8916
8917 static sector_t md_sync_max_sectors(struct mddev *mddev,
8918 enum sync_action action)
8919 {
8920 switch (action) {
8921 case ACTION_RESYNC:
8922 case ACTION_CHECK:
8923 case ACTION_REPAIR:
8924 atomic64_set(&mddev->resync_mismatches, 0);
8925 fallthrough;
8926 case ACTION_RESHAPE:
8927 return mddev->resync_max_sectors;
8928 case ACTION_RECOVER:
8929 return mddev->dev_sectors;
8930 default:
8931 return 0;
8932 }
8933 }
8934
8935 static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
8936 {
8937 sector_t start = 0;
8938 struct md_rdev *rdev;
8939
8940 switch (action) {
8941 case ACTION_CHECK:
8942 case ACTION_REPAIR:
8943 return mddev->resync_min;
8944 case ACTION_RESYNC:
8945 if (!mddev->bitmap)
8946 return mddev->recovery_cp;
8947 return 0;
8948 case ACTION_RESHAPE:
8949 /*
8950 * If the original node aborts reshaping then we continue the
8951 * reshaping, so set the position again to avoid restarting the
8952 * reshape from the very beginning.
8953 */
8954 if (mddev_is_clustered(mddev) &&
8955 mddev->reshape_position != MaxSector)
8956 return mddev->reshape_position;
8957 return 0;
8958 case ACTION_RECOVER:
8959 start = MaxSector;
8960 rcu_read_lock();
8961 rdev_for_each_rcu(rdev, mddev)
8962 if (rdev->raid_disk >= 0 &&
8963 !test_bit(Journal, &rdev->flags) &&
8964 !test_bit(Faulty, &rdev->flags) &&
8965 !test_bit(In_sync, &rdev->flags) &&
8966 rdev->recovery_offset < start)
8967 start = rdev->recovery_offset;
8968 rcu_read_unlock();
8969
8970 /* If there is a bitmap, we need to make sure all
8971 * writes that started before we added a spare
8972 * complete before we start doing a recovery.
8973 * Otherwise the write might complete and (via
8974 * bitmap_endwrite) set a bit in the bitmap after the
8975 * recovery has checked that bit and skipped that
8976 * region.
8977 */
8978 if (mddev->bitmap) {
8979 mddev->pers->quiesce(mddev, 1);
8980 mddev->pers->quiesce(mddev, 0);
8981 }
8982 return start;
8983 default:
8984 return MaxSector;
8985 }
8986 }
8987
8988 static bool sync_io_within_limit(struct mddev *mddev)
8989 {
8990 int io_sectors;
8991
8992 /*
8993 * For raid456, sync IO is stripe(4k) per IO, for other levels, it's
8994 * RESYNC_PAGES(64k) per IO.
8995 */
8996 if (mddev->level == 4 || mddev->level == 5 || mddev->level == 6)
8997 io_sectors = 8;
8998 else
8999 io_sectors = 128;
9000
9001 return atomic_read(&mddev->recovery_active) <
9002 io_sectors * sync_io_depth(mddev);
9003 }
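/*
 * Worked example (the sync_io_depth value is illustrative): for raid5,
 * io_sectors = 8 (4k stripe IOs), so with sync_io_depth(mddev) == 32 the
 * resync may keep up to 8 * 32 = 256 sectors (128 KiB) in flight; other
 * levels use io_sectors = 128 (64k IOs) and allow 128 * 32 = 4096 sectors.
 */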
9004
9005 #define SYNC_MARKS 10
9006 #define SYNC_MARK_STEP (3*HZ)
9007 #define UPDATE_FREQUENCY (5*60*HZ)
9008 void md_do_sync(struct md_thread *thread)
9009 {
9010 struct mddev *mddev = thread->mddev;
9011 struct mddev *mddev2;
9012 unsigned int currspeed = 0, window;
9013 sector_t max_sectors,j, io_sectors, recovery_done;
9014 unsigned long mark[SYNC_MARKS];
9015 unsigned long update_time;
9016 sector_t mark_cnt[SYNC_MARKS];
9017 int last_mark,m;
9018 sector_t last_check;
9019 int skipped = 0;
9020 struct md_rdev *rdev;
9021 enum sync_action action;
9022 const char *desc;
9023 struct blk_plug plug;
9024 int ret;
9025
9026 /* just in case the thread restarts... */
9027 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
9028 return;
9029
9030 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9031 goto skip;
9032
9033 if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) ||
9034 !md_is_rdwr(mddev)) {/* never try to sync a read-only array */
9035 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
9036 goto skip;
9037 }
9038
9039 if (mddev_is_clustered(mddev)) {
9040 ret = mddev->cluster_ops->resync_start(mddev);
9041 if (ret)
9042 goto skip;
9043
9044 set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags);
9045 if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
9046 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ||
9047 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
9048 && ((unsigned long long)mddev->curr_resync_completed
9049 < (unsigned long long)mddev->resync_max_sectors))
9050 goto skip;
9051 }
9052
9053 action = md_sync_action(mddev);
9054 desc = md_sync_action_name(action);
9055 mddev->last_sync_action = action;
9056
9057 /*
9058 * Before starting a resync we must have set curr_resync to
9059 * 2 (MD_RESYNC_DELAYED), and then checked that every "conflicting" array
9060 * has a curr_resync less than ours. When we find one that is the same or
9061 * higher we wait on resync_wait. To avoid deadlock, we reduce curr_resync
9062 * to 1 (MD_RESYNC_YIELDED) if we choose to yield (decided arbitrarily by
9063 * comparing the addresses of the mddev structures).
9064 * This will mean we have to start checking from the beginning again.
9065 */
9066 if (mddev_is_clustered(mddev))
9067 mddev->cluster_ops->resync_start_notify(mddev);
9068 do {
9069 int mddev2_minor = -1;
9070 mddev->curr_resync = MD_RESYNC_DELAYED;
9071
9072 try_again:
9073 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9074 goto skip;
9075 spin_lock(&all_mddevs_lock);
9076 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) {
9077 if (test_bit(MD_DELETED, &mddev2->flags))
9078 continue;
9079 if (mddev2 == mddev)
9080 continue;
9081 if (!mddev->parallel_resync
9082 && mddev2->curr_resync
9083 && match_mddev_units(mddev, mddev2)) {
9084 DEFINE_WAIT(wq);
9085 if (mddev < mddev2 &&
9086 mddev->curr_resync == MD_RESYNC_DELAYED) {
9087 /* arbitrarily yield */
9088 mddev->curr_resync = MD_RESYNC_YIELDED;
9089 wake_up(&resync_wait);
9090 }
9091 if (mddev > mddev2 &&
9092 mddev->curr_resync == MD_RESYNC_YIELDED)
9093 /* no need to wait here, we can wait the next
9094 * time 'round when curr_resync == 2
9095 */
9096 continue;
9097 /* We need to wait 'interruptible' so as not to
9098 * contribute to the load average, and not to
9099 * be caught by 'softlockup'
9100 */
9101 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
9102 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
9103 mddev2->curr_resync >= mddev->curr_resync) {
9104 if (mddev2_minor != mddev2->md_minor) {
9105 mddev2_minor = mddev2->md_minor;
9106 pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n",
9107 desc, mdname(mddev),
9108 mdname(mddev2));
9109 }
9110 spin_unlock(&all_mddevs_lock);
9111
9112 if (signal_pending(current))
9113 flush_signals(current);
9114 schedule();
9115 finish_wait(&resync_wait, &wq);
9116 goto try_again;
9117 }
9118 finish_wait(&resync_wait, &wq);
9119 }
9120 }
9121 spin_unlock(&all_mddevs_lock);
9122 } while (mddev->curr_resync < MD_RESYNC_DELAYED);
9123
9124 max_sectors = md_sync_max_sectors(mddev, action);
9125 j = md_sync_position(mddev, action);
9126
9127 pr_info("md: %s of RAID array %s\n", desc, mdname(mddev));
9128 pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev));
9129 pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n",
9130 speed_max(mddev), desc);
9131
9132 is_mddev_idle(mddev, 1); /* this initializes IO event counters */
9133
9134 io_sectors = 0;
9135 for (m = 0; m < SYNC_MARKS; m++) {
9136 mark[m] = jiffies;
9137 mark_cnt[m] = io_sectors;
9138 }
9139 last_mark = 0;
9140 mddev->resync_mark = mark[last_mark];
9141 mddev->resync_mark_cnt = mark_cnt[last_mark];
9142
9143 /*
9144 * Tune reconstruction:
9145 */
9146 window = 32 * (PAGE_SIZE / 512);
9147 pr_debug("md: using %dk window, over a total of %lluk.\n",
9148 window/2, (unsigned long long)max_sectors/2);
9149
9150 atomic_set(&mddev->recovery_active, 0);
9151 last_check = 0;
9152
9153 if (j >= MD_RESYNC_ACTIVE) {
9154 pr_debug("md: resuming %s of %s from checkpoint.\n",
9155 desc, mdname(mddev));
9156 mddev->curr_resync = j;
9157 } else
9158 mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */
9159 mddev->curr_resync_completed = j;
9160 sysfs_notify_dirent_safe(mddev->sysfs_completed);
9161 md_new_event();
9162 update_time = jiffies;
9163
9164 blk_start_plug(&plug);
9165 while (j < max_sectors) {
9166 sector_t sectors;
9167
9168 skipped = 0;
9169
9170 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9171 ((mddev->curr_resync > mddev->curr_resync_completed &&
9172 (mddev->curr_resync - mddev->curr_resync_completed)
9173 > (max_sectors >> 4)) ||
9174 time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
9175 (j - mddev->curr_resync_completed)*2
9176 >= mddev->resync_max - mddev->curr_resync_completed ||
9177 mddev->curr_resync_completed > mddev->resync_max
9178 )) {
9179 /* time to update curr_resync_completed */
9180 wait_event(mddev->recovery_wait,
9181 atomic_read(&mddev->recovery_active) == 0);
9182 mddev->curr_resync_completed = j;
9183 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
9184 j > mddev->recovery_cp)
9185 mddev->recovery_cp = j;
9186 update_time = jiffies;
9187 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
9188 sysfs_notify_dirent_safe(mddev->sysfs_completed);
9189 }
9190
9191 while (j >= mddev->resync_max &&
9192 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
9193 /* As this condition is controlled by user-space,
9194 * we can block indefinitely, so use '_interruptible'
9195 * to avoid triggering warnings.
9196 */
9197 flush_signals(current); /* just in case */
9198 wait_event_interruptible(mddev->recovery_wait,
9199 mddev->resync_max > j
9200 || test_bit(MD_RECOVERY_INTR,
9201 &mddev->recovery));
9202 }
9203
9204 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9205 break;
9206
9207 sectors = mddev->pers->sync_request(mddev, j, max_sectors,
9208 &skipped);
9209 if (sectors == 0) {
9210 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
9211 break;
9212 }
9213
9214 if (!skipped) { /* actual IO requested */
9215 io_sectors += sectors;
9216 atomic_add(sectors, &mddev->recovery_active);
9217 }
9218
9219 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9220 break;
9221
9222 j += sectors;
9223 if (j > max_sectors)
9224 /* when skipping, extra large numbers can be returned. */
9225 j = max_sectors;
9226 if (j >= MD_RESYNC_ACTIVE)
9227 mddev->curr_resync = j;
9228 mddev->curr_mark_cnt = io_sectors;
9229 if (last_check == 0)
9230 /* this is the earliest that rebuild will be
9231 * visible in /proc/mdstat
9232 */
9233 md_new_event();
9234
9235 if (last_check + window > io_sectors || j == max_sectors)
9236 continue;
9237
9238 last_check = io_sectors;
9239 repeat:
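/*
 * mark[]/mark_cnt[] are a small ring of (jiffies, sectors) samples,
 * advanced at most once per SYNC_MARK_STEP.  resync_mark/resync_mark_cnt
 * always hold the oldest sample, so the speed computed below is averaged
 * over the last few sampling intervals rather than the whole run.
 */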
9240 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP)) {
9241 /* step marks */
9242 int next = (last_mark+1) % SYNC_MARKS;
9243
9244 mddev->resync_mark = mark[next];
9245 mddev->resync_mark_cnt = mark_cnt[next];
9246 mark[next] = jiffies;
9247 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
9248 last_mark = next;
9249 }
9250
9251 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9252 break;
9253
9254 /*
9255 * this loop exits only when either we are slower than
9256 * the 'hard' speed limit, or the system was IO-idle for
9257 * a jiffy.
9258 * the system might be non-idle CPU-wise, but we only care
9259 * about not overloading the IO subsystem. (things like an
9260 * e2fsck being done on the RAID array should execute fast)
9261 */
9262 cond_resched();
9263
9264 recovery_done = io_sectors - atomic_read(&mddev->recovery_active);
9265 currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2
9266 /((jiffies-mddev->resync_mark)/HZ +1) +1;
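/*
 * currspeed is KB/sec over the marked interval: sectors completed since
 * the oldest mark (excluding IO still in flight), halved to get KiB,
 * divided by the elapsed seconds (+1 guards against division by zero).
 * Illustrative example: 20480 sectors against a mark taken 4 seconds
 * ago gives 10240 / (4 + 1) + 1 = 2049 KB/sec.
 */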
9267
9268 if (currspeed > speed_min(mddev)) {
9269 if (currspeed > speed_max(mddev)) {
9270 msleep(500);
9271 goto repeat;
9272 }
9273 if (!sync_io_within_limit(mddev) &&
9274 !is_mddev_idle(mddev, 0)) {
9275 /*
9276 * Give other IO more of a chance.
9277 * The faster the devices, the less we wait.
9278 */
9279 wait_event(mddev->recovery_wait,
9280 !atomic_read(&mddev->recovery_active));
9281 }
9282 }
9283 }
9284 pr_info("md: %s: %s %s.\n", mdname(mddev), desc,
9285 test_bit(MD_RECOVERY_INTR, &mddev->recovery)
9286 ? "interrupted" : "done");
9287 /*
9288 * this also signals 'finished resyncing' to md_stop
9289 */
9290 blk_finish_plug(&plug);
9291 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
9292
9293 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9294 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
9295 mddev->curr_resync >= MD_RESYNC_ACTIVE) {
9296 mddev->curr_resync_completed = mddev->curr_resync;
9297 sysfs_notify_dirent_safe(mddev->sysfs_completed);
9298 }
9299 mddev->pers->sync_request(mddev, max_sectors, max_sectors, &skipped);
9300
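/*
 * On completion or interruption, record how far we got (a check never
 * updates checkpoints): an interrupted resync saves recovery_cp, a
 * completed resync clears it to MaxSector, and a recovery pushes each
 * outstanding rdev's recovery_offset forward to curr_resync.
 */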
9301 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
9302 mddev->curr_resync > MD_RESYNC_ACTIVE) {
9303 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
9304 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
9305 if (mddev->curr_resync >= mddev->recovery_cp) {
9306 pr_debug("md: checkpointing %s of %s.\n",
9307 desc, mdname(mddev));
9308 if (test_bit(MD_RECOVERY_ERROR,
9309 &mddev->recovery))
9310 mddev->recovery_cp =
9311 mddev->curr_resync_completed;
9312 else
9313 mddev->recovery_cp =
9314 mddev->curr_resync;
9315 }
9316 } else
9317 mddev->recovery_cp = MaxSector;
9318 } else {
9319 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9320 mddev->curr_resync = MaxSector;
9321 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9322 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
9323 rcu_read_lock();
9324 rdev_for_each_rcu(rdev, mddev)
9325 if (rdev->raid_disk >= 0 &&
9326 mddev->delta_disks >= 0 &&
9327 !test_bit(Journal, &rdev->flags) &&
9328 !test_bit(Faulty, &rdev->flags) &&
9329 !test_bit(In_sync, &rdev->flags) &&
9330 rdev->recovery_offset < mddev->curr_resync)
9331 rdev->recovery_offset = mddev->curr_resync;
9332 rcu_read_unlock();
9333 }
9334 }
9335 }
9336 skip:
9337 /* set CHANGE_PENDING here since another update may still be needed,
9338 * so other nodes are informed. It should be harmless for normal
9339 * raid */
9340 set_mask_bits(&mddev->sb_flags, 0,
9341 BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));
9342
9343 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9344 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
9345 mddev->delta_disks > 0 &&
9346 mddev->pers->finish_reshape &&
9347 mddev->pers->size &&
9348 !mddev_is_dm(mddev)) {
9349 mddev_lock_nointr(mddev);
9350 md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
9351 mddev_unlock(mddev);
9352 if (!mddev_is_clustered(mddev))
9353 set_capacity_and_notify(mddev->gendisk,
9354 mddev->array_sectors);
9355 }
9356
9357 spin_lock(&mddev->lock);
9358 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
9359 /* We completed so min/max setting can be forgotten if used. */
9360 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
9361 mddev->resync_min = 0;
9362 mddev->resync_max = MaxSector;
9363 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
9364 mddev->resync_min = mddev->curr_resync_completed;
9365 set_bit(MD_RECOVERY_DONE, &mddev->recovery);
9366 mddev->curr_resync = MD_RESYNC_NONE;
9367 spin_unlock(&mddev->lock);
9368
9369 wake_up(&resync_wait);
9370 md_wakeup_thread(mddev->thread);
9371 return;
9372 }
9373 EXPORT_SYMBOL_GPL(md_do_sync);
9374
9375 static bool rdev_removeable(struct md_rdev *rdev)
9376 {
9377 /* rdev is not used. */
9378 if (rdev->raid_disk < 0)
9379 return false;
9380
9381 /* There are still inflight io, don't remove this rdev. */
9382 if (atomic_read(&rdev->nr_pending))
9383 return false;
9384
9385 /*
9386 * An error occurred but has not yet been acknowledged by the metadata
9387 * handler, don't remove this rdev.
9388 */
9389 if (test_bit(Blocked, &rdev->flags))
9390 return false;
9391
9392 /* Faulty rdev is not used, it's safe to remove it. */
9393 if (test_bit(Faulty, &rdev->flags))
9394 return true;
9395
9396 /* Journal disk can only be removed if it's faulty. */
9397 if (test_bit(Journal, &rdev->flags))
9398 return false;
9399
9400 /*
9401 * 'In_sync' is cleared while 'raid_disk' is valid, which means the
9402 * replacement has just become active from pers->spare_active(), and
9403 * then pers->hot_remove_disk() will replace this rdev with the replacement.
9404 */
9405 if (!test_bit(In_sync, &rdev->flags))
9406 return true;
9407
9408 return false;
9409 }
9410
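/*
 * A "spare" here is a device that occupies a raid slot but is not yet
 * in sync, and is not a journal, faulty or cluster-candidate device;
 * i.e. something a recovery could rebuild onto.
 */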
9411 static bool rdev_is_spare(struct md_rdev *rdev)
9412 {
9413 return !test_bit(Candidate, &rdev->flags) && rdev->raid_disk >= 0 &&
9414 !test_bit(In_sync, &rdev->flags) &&
9415 !test_bit(Journal, &rdev->flags) &&
9416 !test_bit(Faulty, &rdev->flags);
9417 }
9418
9419 static bool rdev_addable(struct md_rdev *rdev)
9420 {
9421 /* rdev is already used, don't add it again. */
9422 if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 ||
9423 test_bit(Faulty, &rdev->flags))
9424 return false;
9425
9426 /* Allow adding a journal disk. */
9427 if (test_bit(Journal, &rdev->flags))
9428 return true;
9429
9430 /* Allow adding if the array is read-write. */
9431 if (md_is_rdwr(rdev->mddev))
9432 return true;
9433
9434 /*
9435 * For a read-only array, only allow re-adding an rdev. And if a bitmap
9436 * is in use, don't allow re-adding an rdev that is too old.
9437 */
9438 if (rdev->saved_raid_disk >= 0 && !test_bit(Bitmap_sync, &rdev->flags))
9439 return true;
9440
9441 return false;
9442 }
9443
9444 static bool md_spares_need_change(struct mddev *mddev)
9445 {
9446 struct md_rdev *rdev;
9447
9448 rcu_read_lock();
9449 rdev_for_each_rcu(rdev, mddev) {
9450 if (rdev_removeable(rdev) || rdev_addable(rdev)) {
9451 rcu_read_unlock();
9452 return true;
9453 }
9454 }
9455 rcu_read_unlock();
9456 return false;
9457 }
9458
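/*
 * Hot-remove every device that rdev_removeable() approves of (or only
 * 'this' if one was given), then offer every addable device back to the
 * personality via ->hot_add_disk().  Returns the number of spares that
 * a recovery could now make use of.
 */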
9459 static int remove_and_add_spares(struct mddev *mddev,
9460 struct md_rdev *this)
9461 {
9462 struct md_rdev *rdev;
9463 int spares = 0;
9464 int removed = 0;
9465
9466 if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
9467 /* Mustn't remove devices when resync thread is running */
9468 return 0;
9469
9470 rdev_for_each(rdev, mddev) {
9471 if ((this == NULL || rdev == this) && rdev_removeable(rdev) &&
9472 !mddev->pers->hot_remove_disk(mddev, rdev)) {
9473 sysfs_unlink_rdev(mddev, rdev);
9474 rdev->saved_raid_disk = rdev->raid_disk;
9475 rdev->raid_disk = -1;
9476 removed++;
9477 }
9478 }
9479
9480 if (removed && mddev->kobj.sd)
9481 sysfs_notify_dirent_safe(mddev->sysfs_degraded);
9482
9483 if (this && removed)
9484 goto no_add;
9485
9486 rdev_for_each(rdev, mddev) {
9487 if (this && this != rdev)
9488 continue;
9489 if (rdev_is_spare(rdev))
9490 spares++;
9491 if (!rdev_addable(rdev))
9492 continue;
9493 if (!test_bit(Journal, &rdev->flags))
9494 rdev->recovery_offset = 0;
9495 if (mddev->pers->hot_add_disk(mddev, rdev) == 0) {
9496 /* failure here is OK */
9497 sysfs_link_rdev(mddev, rdev);
9498 if (!test_bit(Journal, &rdev->flags))
9499 spares++;
9500 md_new_event();
9501 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
9502 }
9503 }
9504 no_add:
9505 if (removed)
9506 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
9507 return spares;
9508 }
9509
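/*
 * Pick the next sync action in priority order: finish an in-progress
 * reshape, then an incomplete resync (recovery_cp), then recovery onto
 * spares, and finally a user-requested resync/check/repair that
 * md_do_sync() will refine.  Returns false when there is nothing to do.
 */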
9510 static bool md_choose_sync_action(struct mddev *mddev, int *spares)
9511 {
9512 /* Check if reshape is in progress first. */
9513 if (mddev->reshape_position != MaxSector) {
9514 if (mddev->pers->check_reshape == NULL ||
9515 mddev->pers->check_reshape(mddev) != 0)
9516 return false;
9517
9518 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
9519 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9520 return true;
9521 }
9522
9523 /* Check if resync is in progress. */
9524 if (mddev->recovery_cp < MaxSector) {
9525 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9526 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9527 return true;
9528 }
9529
9530 /*
9531 * Remove any failed drives, then add spares if possible. Spares are
9532 * also removed and re-added, to allow the personality to fail the
9533 * re-add.
9534 */
9535 *spares = remove_and_add_spares(mddev, NULL);
9536 if (*spares) {
9537 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9538 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
9539 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
9540
9541 /* Start new recovery. */
9542 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9543 return true;
9544 }
9545
9546 /* Delay to choose resync/check/repair in md_do_sync(). */
9547 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
9548 return true;
9549
9550 /* Nothing to be done */
9551 return false;
9552 }
9553
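/*
 * sync_work handler, queued on md_misc_wq by md_check_recovery() once
 * MD_RECOVERY_RUNNING is set: suspend the array if the spare set must
 * change, pick an action via md_choose_sync_action(), and register the
 * "resync"/"reshape" sync_thread, or clean the recovery flags back up
 * if nothing can be started.
 */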
9554 static void md_start_sync(struct work_struct *ws)
9555 {
9556 struct mddev *mddev = container_of(ws, struct mddev, sync_work);
9557 int spares = 0;
9558 bool suspend = false;
9559 char *name;
9560
9561 /*
9562 * If reshape is still in progress, spares won't be added or removed
9563 * from conf until reshape is done.
9564 */
9565 if (mddev->reshape_position == MaxSector &&
9566 md_spares_need_change(mddev)) {
9567 suspend = true;
9568 mddev_suspend(mddev, false);
9569 }
9570
9571 mddev_lock_nointr(mddev);
9572 if (!md_is_rdwr(mddev)) {
9573 /*
9574 * On a read-only array we can:
9575 * - remove failed devices
9576 * - add already-in_sync devices if the array itself is in-sync.
9577 * As we only add devices that are already in-sync, we can
9578 * activate the spares immediately.
9579 */
9580 remove_and_add_spares(mddev, NULL);
9581 goto not_running;
9582 }
9583
9584 if (!md_choose_sync_action(mddev, &spares))
9585 goto not_running;
9586
9587 if (!mddev->pers->sync_request)
9588 goto not_running;
9589
9590 /*
9591 * We are adding a device or devices to an array which has the bitmap
9592 * stored on all devices. So make sure all bitmap pages get written.
9593 */
9594 if (spares)
9595 mddev->bitmap_ops->write_all(mddev);
9596
9597 name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ?
9598 "reshape" : "resync";
9599 rcu_assign_pointer(mddev->sync_thread,
9600 md_register_thread(md_do_sync, mddev, name));
9601 if (!mddev->sync_thread) {
9602 pr_warn("%s: could not start resync thread...\n",
9603 mdname(mddev));
9604 /* leave the spares where they are, it shouldn't hurt */
9605 goto not_running;
9606 }
9607
9608 mddev_unlock(mddev);
9609 /*
9610 * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should
9611 * not set it again. Otherwise, we may cause issue like this one:
9612 * https://bugzilla.kernel.org/show_bug.cgi?id=218200
9613 * Therefore, use __mddev_resume(mddev, false).
9614 */
9615 if (suspend)
9616 __mddev_resume(mddev, false);
9617 md_wakeup_thread(mddev->sync_thread);
9618 sysfs_notify_dirent_safe(mddev->sysfs_action);
9619 md_new_event();
9620 return;
9621
9622 not_running:
9623 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9624 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
9625 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
9626 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
9627 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9628 mddev_unlock(mddev);
9629 /*
9630 * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should
9631 * not set it again. Otherwise, we may cause issue like this one:
9632 * https://bugzilla.kernel.org/show_bug.cgi?id=218200
9633 * Therefore, use __mddev_resume(mddev, false).
9634 */
9635 if (suspend)
9636 __mddev_resume(mddev, false);
9637
9638 wake_up(&resync_wait);
9639 if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
9640 mddev->sysfs_action)
9641 sysfs_notify_dirent_safe(mddev->sysfs_action);
9642 }
9643
9644 static void unregister_sync_thread(struct mddev *mddev)
9645 {
9646 if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
9647 /* resync/recovery still happening */
9648 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9649 return;
9650 }
9651
9652 if (WARN_ON_ONCE(!mddev->sync_thread))
9653 return;
9654
9655 md_reap_sync_thread(mddev);
9656 }
9657
9658 /*
9659 * This routine is regularly called by all per-raid-array threads to
9660 * deal with generic issues like resync and super-block update.
9661 * Raid personalities that don't have a thread (linear/raid0) do not
9662 * need this as they never do any recovery or update the superblock.
9663 *
9664 * It does not do any resync itself, but rather "forks" off other threads
9665 * to do that as needed.
9666 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
9667 * "->recovery" and create a thread at ->sync_thread.
9668 * When the thread finishes it sets MD_RECOVERY_DONE
9669 * and wakes up this thread, which will reap the thread and finish up.
9670 * This thread also removes any faulty devices (with nr_pending == 0).
9671 *
9672 * The overall approach is:
9673 * 1/ if the superblock needs updating, update it.
9674 * 2/ If a recovery thread is running, don't do anything else.
9675 * 3/ If recovery has finished, clean up, possibly marking spares active.
9676 * 4/ If there are any faulty devices, remove them.
9677 * 5/ If the array is degraded, try to add spare devices.
9678 * 6/ If array has spares or is not in-sync, start a resync thread.
9679 */
9680 void md_check_recovery(struct mddev *mddev)
9681 {
9682 if (mddev->bitmap)
9683 mddev->bitmap_ops->daemon_work(mddev);
9684
9685 if (signal_pending(current)) {
9686 if (mddev->pers->sync_request && !mddev->external) {
9687 pr_debug("md: %s in immediate safe mode\n",
9688 mdname(mddev));
9689 mddev->safemode = 2;
9690 }
9691 flush_signals(current);
9692 }
9693
9694 if (!md_is_rdwr(mddev) &&
9695 !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) &&
9696 !test_bit(MD_RECOVERY_DONE, &mddev->recovery))
9697 return;
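/*
 * Fast path out: nothing to do unless the superblock needs writing
 * (a bare CHANGE_PENDING does not count), recovery is needed or has
 * just finished, or a safemode transition must be handled.
 */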
9698 if (!(
9699 (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) ||
9700 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
9701 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
9702 (mddev->external == 0 && mddev->safemode == 1) ||
9703 (mddev->safemode == 2 &&
9704 !mddev->in_sync && mddev->recovery_cp == MaxSector)
9705 ))
9706 return;
9707
9708 if (mddev_trylock(mddev)) {
9709 bool try_set_sync = mddev->safemode != 0;
9710
9711 if (!mddev->external && mddev->safemode == 1)
9712 mddev->safemode = 0;
9713
9714 if (!md_is_rdwr(mddev)) {
9715 struct md_rdev *rdev;
9716
9717 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
9718 unregister_sync_thread(mddev);
9719 goto unlock;
9720 }
9721
9722 if (!mddev->external && mddev->in_sync)
9723 /*
9724 * The 'Blocked' flag is not needed, as failed devices
9725 * will be recorded if the array is switched to read/write.
9726 * Leaving it set will prevent the device
9727 * from being removed.
9728 */
9729 rdev_for_each(rdev, mddev)
9730 clear_bit(Blocked, &rdev->flags);
9731
9732 /*
9733 * There is no thread, but we need to call
9734 * ->spare_active and clear saved_raid_disk
9735 */
9736 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
9737 md_reap_sync_thread(mddev);
9738
9739 /*
9740 * Let md_start_sync() remove and add rdevs to the
9741 * array.
9742 */
9743 if (md_spares_need_change(mddev)) {
9744 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9745 queue_work(md_misc_wq, &mddev->sync_work);
9746 }
9747
9748 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9749 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9750 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
9751
9752 goto unlock;
9753 }
9754
9755 if (mddev_is_clustered(mddev)) {
9756 struct md_rdev *rdev, *tmp;
9757 /* kick the device if another node issued a
9758 * remove disk.
9759 */
9760 rdev_for_each_safe(rdev, tmp, mddev) {
9761 if (test_and_clear_bit(ClusterRemove, &rdev->flags) &&
9762 rdev->raid_disk < 0)
9763 md_kick_rdev_from_array(rdev);
9764 }
9765 }
9766
9767 if (try_set_sync && !mddev->external && !mddev->in_sync) {
9768 spin_lock(&mddev->lock);
9769 set_in_sync(mddev);
9770 spin_unlock(&mddev->lock);
9771 }
9772
9773 if (mddev->sb_flags)
9774 md_update_sb(mddev, 0);
9775
9776 /*
9777 * Never start a new sync thread if MD_RECOVERY_RUNNING is
9778 * still set.
9779 */
9780 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
9781 unregister_sync_thread(mddev);
9782 goto unlock;
9783 }
9784
9785 /* Set RUNNING before clearing NEEDED to avoid
9786 * any transients in the value of "sync_action".
9787 */
9788 mddev->curr_resync_completed = 0;
9789 spin_lock(&mddev->lock);
9790 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9791 spin_unlock(&mddev->lock);
9792 /* Clear some bits that don't mean anything, but
9793 * might be left set
9794 */
9795 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
9796 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
9797
9798 if (test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) &&
9799 !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
9800 queue_work(md_misc_wq, &mddev->sync_work);
9801 } else {
9802 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9803 wake_up(&resync_wait);
9804 }
9805
9806 unlock:
9807 wake_up(&mddev->sb_wait);
9808 mddev_unlock(mddev);
9809 }
9810 }
9811 EXPORT_SYMBOL(md_check_recovery);
9812
9813 void md_reap_sync_thread(struct mddev *mddev)
9814 {
9815 struct md_rdev *rdev;
9816 sector_t old_dev_sectors = mddev->dev_sectors;
9817 bool is_reshaped = false;
9818
9819 /* resync has finished, collect result */
9820 md_unregister_thread(mddev, &mddev->sync_thread);
9821 atomic_inc(&mddev->sync_seq);
9822
9823 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
9824 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
9825 mddev->degraded != mddev->raid_disks) {
9826 /* success...*/
9827 /* activate any spares */
9828 if (mddev->pers->spare_active(mddev)) {
9829 sysfs_notify_dirent_safe(mddev->sysfs_degraded);
9830 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
9831 }
9832 }
9833 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9834 mddev->pers->finish_reshape) {
9835 mddev->pers->finish_reshape(mddev);
9836 if (mddev_is_clustered(mddev))
9837 is_reshaped = true;
9838 }
9839
9840 /* If the array is no longer degraded, then any saved_raid_disk
9841 * information must be scrapped.
9842 */
9843 if (!mddev->degraded)
9844 rdev_for_each(rdev, mddev)
9845 rdev->saved_raid_disk = -1;
9846
9847 md_update_sb(mddev, 1);
9848 /* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can
9849 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by
9850 * clustered raid */
9851 if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
9852 mddev->cluster_ops->resync_finish(mddev);
9853 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9854 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
9855 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9856 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
9857 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
9858 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
9859 /*
9860 * We call mddev->cluster_ops->update_size here because sync_size could
9861 * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared,
9862 * so it is time to update the size across the cluster.
9863 */
9864 if (mddev_is_clustered(mddev) && is_reshaped
9865 && !test_bit(MD_CLOSING, &mddev->flags))
9866 mddev->cluster_ops->update_size(mddev, old_dev_sectors);
9867 /* flag recovery needed just to double check */
9868 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9869 sysfs_notify_dirent_safe(mddev->sysfs_completed);
9870 sysfs_notify_dirent_safe(mddev->sysfs_action);
9871 md_new_event();
9872 if (mddev->event_work.func)
9873 queue_work(md_misc_wq, &mddev->event_work);
9874 wake_up(&resync_wait);
9875 }
9876 EXPORT_SYMBOL(md_reap_sync_thread);
9877
9878 void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
9879 {
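/*
 * Wait for the rdev to stop being blocked, bounded to five seconds,
 * then drop the pending reference held by the caller.
 */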
9880 sysfs_notify_dirent_safe(rdev->sysfs_state);
9881 wait_event_timeout(rdev->blocked_wait, !rdev_blocked(rdev),
9882 msecs_to_jiffies(5000));
9883 rdev_dec_pending(rdev, mddev);
9884 }
9885 EXPORT_SYMBOL(md_wait_for_blocked_rdev);
9886
9887 void md_finish_reshape(struct mddev *mddev)
9888 {
9889 /* called by the personality module when reshape completes. */
9890 struct md_rdev *rdev;
9891
9892 rdev_for_each(rdev, mddev) {
9893 if (rdev->data_offset > rdev->new_data_offset)
9894 rdev->sectors += rdev->data_offset - rdev->new_data_offset;
9895 else
9896 rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
9897 rdev->data_offset = rdev->new_data_offset;
9898 }
9899 }
9900 EXPORT_SYMBOL(md_finish_reshape);
9901
9902 /* Bad block management */
9903
9904 /* Returns true on success, false on failure */
9905 bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
9906 int is_new)
9907 {
9908 struct mddev *mddev = rdev->mddev;
9909
9910 /*
9911 * Recording new badblocks for a faulty rdev will force unnecessary
9912 * superblock updates. This is fragile for external management because
9913 * a userspace daemon may be trying to remove this device and a deadlock
9914 * may occur. This will probably be solved in mdadm, but it is safer to
9915 * avoid it.
9916 */
9917 if (test_bit(Faulty, &rdev->flags))
9918 return true;
9919
9920 if (is_new)
9921 s += rdev->new_data_offset;
9922 else
9923 s += rdev->data_offset;
9924
9925 if (!badblocks_set(&rdev->badblocks, s, sectors, 0))
9926 return false;
9927
9928 /* Make sure they get written out promptly */
9929 if (test_bit(ExternalBbl, &rdev->flags))
9930 sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks);
9931 sysfs_notify_dirent_safe(rdev->sysfs_state);
9932 set_mask_bits(&mddev->sb_flags, 0,
9933 BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
9934 md_wakeup_thread(rdev->mddev->thread);
9935 return true;
9936 }
9937 EXPORT_SYMBOL_GPL(rdev_set_badblocks);
9938
9939 void rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
9940 int is_new)
9941 {
9942 if (is_new)
9943 s += rdev->new_data_offset;
9944 else
9945 s += rdev->data_offset;
9946
9947 if (!badblocks_clear(&rdev->badblocks, s, sectors))
9948 return;
9949
9950 if (test_bit(ExternalBbl, &rdev->flags))
9951 sysfs_notify_dirent_safe(rdev->sysfs_badblocks);
9952 }
9953 EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
9954
9955 static int md_notify_reboot(struct notifier_block *this,
9956 unsigned long code, void *x)
9957 {
9958 struct mddev *mddev;
9959 int need_delay = 0;
9960
9961 spin_lock(&all_mddevs_lock);
9962 list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
9963 if (!mddev_get(mddev))
9964 continue;
9965 spin_unlock(&all_mddevs_lock);
9966 if (mddev_trylock(mddev)) {
9967 if (mddev->pers)
9968 __md_stop_writes(mddev);
9969 if (mddev->persistent)
9970 mddev->safemode = 2;
9971 mddev_unlock(mddev);
9972 }
9973 need_delay = 1;
9974 spin_lock(&all_mddevs_lock);
9975 mddev_put_locked(mddev);
9976 }
9977 spin_unlock(&all_mddevs_lock);
9978
9979 /*
9980 * certain more exotic SCSI devices are known to be
9981 * volatile wrt too early system reboots. While the
9982 * right place to handle this issue is the given
9983 * driver, we do want to have a safe RAID driver ...
9984 */
9985 if (need_delay)
9986 msleep(1000);
9987
9988 return NOTIFY_DONE;
9989 }
9990
9991 static struct notifier_block md_notifier = {
9992 .notifier_call = md_notify_reboot,
9993 .next = NULL,
9994 .priority = INT_MAX, /* before any real devices */
9995 };
9996
9997 static void md_geninit(void)
9998 {
9999 pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
10000
10001 proc_create("mdstat", S_IRUGO, NULL, &mdstat_proc_ops);
10002 }
10003
10004 static int __init md_init(void)
10005 {
10006 int ret = -ENOMEM;
10007
10008 md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
10009 if (!md_wq)
10010 goto err_wq;
10011
10012 md_misc_wq = alloc_workqueue("md_misc", 0, 0);
10013 if (!md_misc_wq)
10014 goto err_misc_wq;
10015
10016 md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND,
10017 0);
10018 if (!md_bitmap_wq)
10019 goto err_bitmap_wq;
10020
10021 ret = __register_blkdev(MD_MAJOR, "md", md_probe);
10022 if (ret < 0)
10023 goto err_md;
10024
10025 ret = __register_blkdev(0, "mdp", md_probe);
10026 if (ret < 0)
10027 goto err_mdp;
10028 mdp_major = ret;
10029
10030 register_reboot_notifier(&md_notifier);
10031 raid_table_header = register_sysctl("dev/raid", raid_table);
10032
10033 md_geninit();
10034 return 0;
10035
10036 err_mdp:
10037 unregister_blkdev(MD_MAJOR, "md");
10038 err_md:
10039 destroy_workqueue(md_bitmap_wq);
10040 err_bitmap_wq:
10041 destroy_workqueue(md_misc_wq);
10042 err_misc_wq:
10043 destroy_workqueue(md_wq);
10044 err_wq:
10045 return ret;
10046 }
10047
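/*
 * Called after another cluster node updated the metadata (see
 * md_reload_sb()): apply any size change, pick up per-device role
 * changes (activating spares or marking devices faulty), and follow
 * remote raid_disks and reshape_position updates.
 */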
10048 static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
10049 {
10050 struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
10051 struct md_rdev *rdev2, *tmp;
10052 int role, ret;
10053
10054 /*
10055 * If the size was changed on another node, then we need to
10056 * resize here as well.
10057 */
10058 if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
10059 ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
10060 if (ret)
10061 pr_info("md-cluster: resize failed\n");
10062 else
10063 mddev->bitmap_ops->update_sb(mddev->bitmap);
10064 }
10065
10066 /* Check for change of roles in the active devices */
10067 rdev_for_each_safe(rdev2, tmp, mddev) {
10068 if (test_bit(Faulty, &rdev2->flags))
10069 continue;
10070
10071 /* Check if the roles changed */
10072 role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);
10073
10074 if (test_bit(Candidate, &rdev2->flags)) {
10075 if (role == MD_DISK_ROLE_FAULTY) {
10076 pr_info("md: Removing Candidate device %pg because add failed\n",
10077 rdev2->bdev);
10078 md_kick_rdev_from_array(rdev2);
10079 continue;
10080 }
10081 else
10082 clear_bit(Candidate, &rdev2->flags);
10083 }
10084
10085 if (role != rdev2->raid_disk) {
10086 /*
10087 * The device was activated (on another node), unless a reshape is happening.
10088 */
10089 if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE &&
10090 !(le32_to_cpu(sb->feature_map) &
10091 MD_FEATURE_RESHAPE_ACTIVE) &&
10092 !mddev->cluster_ops->resync_status_get(mddev)) {
10093 /*
10094 * -1 to make raid1_add_disk() set conf->fullsync
10095 * to 1. This could avoid skipping sync when the
10096 * remote node is down during resyncing.
10097 */
10098 if ((le32_to_cpu(sb->feature_map)
10099 & MD_FEATURE_RECOVERY_OFFSET))
10100 rdev2->saved_raid_disk = -1;
10101 else
10102 rdev2->saved_raid_disk = role;
10103 ret = remove_and_add_spares(mddev, rdev2);
10104 pr_info("Activated spare: %pg\n",
10105 rdev2->bdev);
10106 /* wake up mddev->thread here, so the array can
10107 * perform resync with the newly activated disk */
10108 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
10109 md_wakeup_thread(mddev->thread);
10110 }
10111 /* device faulty
10112 * We just want to do the minimum to mark the disk
10113 * as faulty. The recovery is performed by the
10114 * one who initiated the error.
10115 */
10116 if (role == MD_DISK_ROLE_FAULTY ||
10117 role == MD_DISK_ROLE_JOURNAL) {
10118 md_error(mddev, rdev2);
10119 clear_bit(Blocked, &rdev2->flags);
10120 }
10121 }
10122 }
10123
10124 if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) {
10125 ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
10126 if (ret)
10127 pr_warn("md: updating array disks failed. %d\n", ret);
10128 }
10129
10130 /*
10131 * Since mddev->delta_disks has already been updated in update_raid_disks(),
10132 * it is time to check reshape.
10133 */
10134 if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
10135 (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
10136 /*
10137 * reshape is happening on the remote node, we need to
10138 * update reshape_position and call start_reshape.
10139 */
10140 mddev->reshape_position = le64_to_cpu(sb->reshape_position);
10141 if (mddev->pers->update_reshape_pos)
10142 mddev->pers->update_reshape_pos(mddev);
10143 if (mddev->pers->start_reshape)
10144 mddev->pers->start_reshape(mddev);
10145 } else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
10146 mddev->reshape_position != MaxSector &&
10147 !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
10148 /* reshape has just finished on another node. */
10149 mddev->reshape_position = MaxSector;
10150 if (mddev->pers->update_reshape_pos)
10151 mddev->pers->update_reshape_pos(mddev);
10152 }
10153
10154 /* Finally set the event to be up to date */
10155 mddev->events = le64_to_cpu(sb->events);
10156 }
10157
10158 static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
10159 {
10160 int err;
10161 struct page *swapout = rdev->sb_page;
10162 struct mdp_superblock_1 *sb;
10163
10164 /* Store the rdev's sb page in the swapout temporary
10165 * variable in case an error occurs below
10166 */
10167 rdev->sb_page = NULL;
10168 err = alloc_disk_sb(rdev);
10169 if (err == 0) {
10170 ClearPageUptodate(rdev->sb_page);
10171 rdev->sb_loaded = 0;
10172 err = super_types[mddev->major_version].
10173 load_super(rdev, NULL, mddev->minor_version);
10174 }
10175 if (err < 0) {
10176 pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
10177 __func__, __LINE__, rdev->desc_nr, err);
10178 if (rdev->sb_page)
10179 put_page(rdev->sb_page);
10180 rdev->sb_page = swapout;
10181 rdev->sb_loaded = 1;
10182 return err;
10183 }
10184
10185 sb = page_address(rdev->sb_page);
10186 /* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET
10187 * is not set
10188 */
10189
10190 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
10191 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
10192
10193 /* The other node finished recovery, call spare_active to set
10194 * device In_sync and mddev->degraded
10195 */
10196 if (rdev->recovery_offset == MaxSector &&
10197 !test_bit(In_sync, &rdev->flags) &&
10198 mddev->pers->spare_active(mddev))
10199 sysfs_notify_dirent_safe(mddev->sysfs_degraded);
10200
10201 put_page(swapout);
10202 return 0;
10203 }
10204
10205 void md_reload_sb(struct mddev *mddev, int nr)
10206 {
10207 struct md_rdev *rdev = NULL, *iter;
10208 int err;
10209
10210 /* Find the rdev */
10211 rdev_for_each_rcu(iter, mddev) {
10212 if (iter->desc_nr == nr) {
10213 rdev = iter;
10214 break;
10215 }
10216 }
10217
10218 if (!rdev) {
10219 pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
10220 return;
10221 }
10222
10223 err = read_rdev(mddev, rdev);
10224 if (err < 0)
10225 return;
10226
10227 check_sb_changes(mddev, rdev);
10228
10229 /* Read all rdev's to update recovery_offset */
10230 rdev_for_each_rcu(rdev, mddev) {
10231 if (!test_bit(Faulty, &rdev->flags))
10232 read_rdev(mddev, rdev);
10233 }
10234 }
10235 EXPORT_SYMBOL(md_reload_sb);
10236
10237 #ifndef MODULE
10238
10239 /*
10240 * Searches all registered partitions for autorun RAID arrays
10241 * at boot time.
10242 */
10243
10244 static DEFINE_MUTEX(detected_devices_mutex);
10245 static LIST_HEAD(all_detected_devices);
10246 struct detected_devices_node {
10247 struct list_head list;
10248 dev_t dev;
10249 };
10250
10251 void md_autodetect_dev(dev_t dev)
10252 {
10253 struct detected_devices_node *node_detected_dev;
10254
10255 node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
10256 if (node_detected_dev) {
10257 node_detected_dev->dev = dev;
10258 mutex_lock(&detected_devices_mutex);
10259 list_add_tail(&node_detected_dev->list, &all_detected_devices);
10260 mutex_unlock(&detected_devices_mutex);
10261 }
10262 }
10263
10264 void md_autostart_arrays(int part)
10265 {
10266 struct md_rdev *rdev;
10267 struct detected_devices_node *node_detected_dev;
10268 dev_t dev;
10269 int i_scanned, i_passed;
10270
10271 i_scanned = 0;
10272 i_passed = 0;
10273
10274 pr_info("md: Autodetecting RAID arrays.\n");
10275
10276 mutex_lock(&detected_devices_mutex);
10277 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
10278 i_scanned++;
10279 node_detected_dev = list_entry(all_detected_devices.next,
10280 struct detected_devices_node, list);
10281 list_del(&node_detected_dev->list);
10282 dev = node_detected_dev->dev;
10283 kfree(node_detected_dev);
10284 mutex_unlock(&detected_devices_mutex);
10285 rdev = md_import_device(dev, 0, 90);
10286 mutex_lock(&detected_devices_mutex);
10287 if (IS_ERR(rdev))
10288 continue;
10289
10290 if (test_bit(Faulty, &rdev->flags))
10291 continue;
10292
10293 set_bit(AutoDetected, &rdev->flags);
10294 list_add(&rdev->same_set, &pending_raid_disks);
10295 i_passed++;
10296 }
10297 mutex_unlock(&detected_devices_mutex);
10298
10299 pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed);
10300
10301 autorun_devices(part);
10302 }
10303
10304 #endif /* !MODULE */
10305
10306 static __exit void md_exit(void)
10307 {
10308 struct mddev *mddev;
10309 int delay = 1;
10310
10311 unregister_blkdev(MD_MAJOR, "md");
10312 unregister_blkdev(mdp_major, "mdp");
10313 unregister_reboot_notifier(&md_notifier);
10314 unregister_sysctl_table(raid_table_header);
10315
10316 /* We cannot unload the modules while some process is
10317 * waiting for us in select() or poll() - wake them up
10318 */
10319 md_unloading = 1;
10320 while (waitqueue_active(&md_event_waiters)) {
10321 /* not safe to leave yet */
10322 wake_up(&md_event_waiters);
10323 msleep(delay);
10324 delay += delay;
10325 }
10326 remove_proc_entry("mdstat", NULL);
10327
10328 spin_lock(&all_mddevs_lock);
10329 list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
10330 if (!mddev_get(mddev))
10331 continue;
10332 spin_unlock(&all_mddevs_lock);
10333 export_array(mddev);
10334 mddev->ctime = 0;
10335 mddev->hold_active = 0;
10336 /*
10337 * As the mddev is now fully clear, mddev_put will schedule
10338 * the mddev for destruction by a workqueue, and the
10339 * destroy_workqueue() below will wait for that to complete.
10340 */
10341 spin_lock(&all_mddevs_lock);
10342 mddev_put_locked(mddev);
10343 }
10344 spin_unlock(&all_mddevs_lock);
10345
10346 destroy_workqueue(md_misc_wq);
10347 destroy_workqueue(md_bitmap_wq);
10348 destroy_workqueue(md_wq);
10349 }
10350
10351 subsys_initcall(md_init);
10352 module_exit(md_exit)
10353
10354 static int get_ro(char *buffer, const struct kernel_param *kp)
10355 {
10356 return sprintf(buffer, "%d\n", start_readonly);
10357 }
10358 static int set_ro(const char *val, const struct kernel_param *kp)
10359 {
10360 return kstrtouint(val, 10, (unsigned int *)&start_readonly);
10361 }
10362
10363 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
10364 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
10365 module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
10366 module_param(create_on_open, bool, S_IRUSR|S_IWUSR);
10367
10368 MODULE_LICENSE("GPL");
10369 MODULE_DESCRIPTION("MD RAID framework");
10370 MODULE_ALIAS("md");
10371 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
10372