xref: /linux/drivers/md/md.c (revision 6e11664f148454a127dd89e8698c3e3e80e5f62f)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3    md.c : Multiple Devices driver for Linux
4      Copyright (C) 1998, 1999, 2000 Ingo Molnar
5 
6      completely rewritten, based on the MD driver code from Marc Zyngier
7 
8    Changes:
9 
10    - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
11    - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
12    - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
13    - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
14    - kmod support by: Cyrus Durgin
15    - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
16    - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
17 
18    - lots of fixes and improvements to the RAID1/RAID5 and generic
19      RAID code (such as request based resynchronization):
20 
21      Neil Brown <neilb@cse.unsw.edu.au>.
22 
23    - persistent bitmap code
24      Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
25 
26 
27    Errors, Warnings, etc.
28    Please use:
29      pr_crit() for error conditions that risk data loss
30      pr_err() for error conditions that are unexpected, like an IO error
31          or internal inconsistency
32      pr_warn() for error conditions that could have been predicated, like
33          adding a device to an array when it has incompatible metadata
34      pr_info() for every interesting, very rare events, like an array starting
35          or stopping, or resync starting or stopping
36      pr_debug() for everything else.
37 
38 */
39 
40 #include <linux/sched/mm.h>
41 #include <linux/sched/signal.h>
42 #include <linux/kthread.h>
43 #include <linux/blkdev.h>
44 #include <linux/blk-integrity.h>
45 #include <linux/badblocks.h>
46 #include <linux/sysctl.h>
47 #include <linux/seq_file.h>
48 #include <linux/fs.h>
49 #include <linux/poll.h>
50 #include <linux/ctype.h>
51 #include <linux/string.h>
52 #include <linux/hdreg.h>
53 #include <linux/proc_fs.h>
54 #include <linux/random.h>
55 #include <linux/major.h>
56 #include <linux/module.h>
57 #include <linux/reboot.h>
58 #include <linux/file.h>
59 #include <linux/compat.h>
60 #include <linux/delay.h>
61 #include <linux/raid/md_p.h>
62 #include <linux/raid/md_u.h>
63 #include <linux/raid/detect.h>
64 #include <linux/slab.h>
65 #include <linux/percpu-refcount.h>
66 #include <linux/part_stat.h>
67 
68 #include "md.h"
69 #include "md-bitmap.h"
70 #include "md-cluster.h"
71 
72 static const char *action_name[NR_SYNC_ACTIONS] = {
73 	[ACTION_RESYNC]		= "resync",
74 	[ACTION_RECOVER]	= "recover",
75 	[ACTION_CHECK]		= "check",
76 	[ACTION_REPAIR]		= "repair",
77 	[ACTION_RESHAPE]	= "reshape",
78 	[ACTION_FROZEN]		= "frozen",
79 	[ACTION_IDLE]		= "idle",
80 };
81 
82 static DEFINE_XARRAY(md_submodule);
83 
84 static const struct kobj_type md_ktype;
85 
86 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
87 static struct workqueue_struct *md_wq;
88 
89 /*
90  * This workqueue is used for sync_work to register new sync_thread, and for
91  * del_work to remove rdev, and for event_work that is only set by dm-raid.
92  *
93  * Noted that sync_work will grab reconfig_mutex, hence never flush this
94  * workqueue whith reconfig_mutex grabbed.
95  */
96 static struct workqueue_struct *md_misc_wq;
97 struct workqueue_struct *md_bitmap_wq;
98 
99 static int remove_and_add_spares(struct mddev *mddev,
100 				 struct md_rdev *this);
101 static void mddev_detach(struct mddev *mddev);
102 static void export_rdev(struct md_rdev *rdev, struct mddev *mddev);
103 static void md_wakeup_thread_directly(struct md_thread __rcu *thread);
104 
105 /*
106  * Default number of read corrections we'll attempt on an rdev
107  * before ejecting it from the array. We divide the read error
108  * count by 2 for every hour elapsed between read errors.
109  */
110 #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
111 /* Default safemode delay: 200 msec */
112 #define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
113 /*
114  * Current RAID-1,4,5,6,10 parallel reconstruction 'guaranteed speed limit'
115  * is sysctl_speed_limit_min, 1000 KB/sec by default, so the extra system load
116  * does not show up that much. Increase it if you want to have more guaranteed
117  * speed. Note that the RAID driver will use the maximum bandwidth
118  * sysctl_speed_limit_max, 200 MB/sec by default, if the IO subsystem is idle.
119  *
120  * Background sync IO speed control:
121  *
122  * - below speed min:
123  *   no limit;
124  * - above speed min and below speed max:
125  *   a) if mddev is idle, then no limit;
126  *   b) if mddev is busy handling normal IO, then limit inflight sync IO
127  *   to sync_io_depth;
128  * - above speed max:
129  *   sync IO can't be issued;
130  *
131  * Following configurations can be changed via /proc/sys/dev/raid/ for system
132  * or /sys/block/mdX/md/ for one array.
133  */
134 static int sysctl_speed_limit_min = 1000;
135 static int sysctl_speed_limit_max = 200000;
136 static int sysctl_sync_io_depth = 32;
137 
speed_min(struct mddev * mddev)138 static int speed_min(struct mddev *mddev)
139 {
140 	return mddev->sync_speed_min ?
141 		mddev->sync_speed_min : sysctl_speed_limit_min;
142 }
143 
speed_max(struct mddev * mddev)144 static int speed_max(struct mddev *mddev)
145 {
146 	return mddev->sync_speed_max ?
147 		mddev->sync_speed_max : sysctl_speed_limit_max;
148 }
149 
sync_io_depth(struct mddev * mddev)150 static int sync_io_depth(struct mddev *mddev)
151 {
152 	return mddev->sync_io_depth ?
153 		mddev->sync_io_depth : sysctl_sync_io_depth;
154 }
155 
rdev_uninit_serial(struct md_rdev * rdev)156 static void rdev_uninit_serial(struct md_rdev *rdev)
157 {
158 	if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
159 		return;
160 
161 	kvfree(rdev->serial);
162 	rdev->serial = NULL;
163 }
164 
rdevs_uninit_serial(struct mddev * mddev)165 static void rdevs_uninit_serial(struct mddev *mddev)
166 {
167 	struct md_rdev *rdev;
168 
169 	rdev_for_each(rdev, mddev)
170 		rdev_uninit_serial(rdev);
171 }
172 
rdev_init_serial(struct md_rdev * rdev)173 static int rdev_init_serial(struct md_rdev *rdev)
174 {
175 	/* serial_nums equals with BARRIER_BUCKETS_NR */
176 	int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t))));
177 	struct serial_in_rdev *serial = NULL;
178 
179 	if (test_bit(CollisionCheck, &rdev->flags))
180 		return 0;
181 
182 	serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums,
183 			  GFP_KERNEL);
184 	if (!serial)
185 		return -ENOMEM;
186 
187 	for (i = 0; i < serial_nums; i++) {
188 		struct serial_in_rdev *serial_tmp = &serial[i];
189 
190 		spin_lock_init(&serial_tmp->serial_lock);
191 		serial_tmp->serial_rb = RB_ROOT_CACHED;
192 		init_waitqueue_head(&serial_tmp->serial_io_wait);
193 	}
194 
195 	rdev->serial = serial;
196 	set_bit(CollisionCheck, &rdev->flags);
197 
198 	return 0;
199 }
200 
rdevs_init_serial(struct mddev * mddev)201 static int rdevs_init_serial(struct mddev *mddev)
202 {
203 	struct md_rdev *rdev;
204 	int ret = 0;
205 
206 	rdev_for_each(rdev, mddev) {
207 		ret = rdev_init_serial(rdev);
208 		if (ret)
209 			break;
210 	}
211 
212 	/* Free all resources if pool is not existed */
213 	if (ret && !mddev->serial_info_pool)
214 		rdevs_uninit_serial(mddev);
215 
216 	return ret;
217 }
218 
219 /*
220  * rdev needs to enable serial stuffs if it meets the conditions:
221  * 1. it is multi-queue device flaged with writemostly.
222  * 2. the write-behind mode is enabled.
223  */
rdev_need_serial(struct md_rdev * rdev)224 static int rdev_need_serial(struct md_rdev *rdev)
225 {
226 	return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 &&
227 		rdev->bdev->bd_disk->queue->nr_hw_queues != 1 &&
228 		test_bit(WriteMostly, &rdev->flags));
229 }
230 
231 /*
232  * Init resource for rdev(s), then create serial_info_pool if:
233  * 1. rdev is the first device which return true from rdev_enable_serial.
234  * 2. rdev is NULL, means we want to enable serialization for all rdevs.
235  */
mddev_create_serial_pool(struct mddev * mddev,struct md_rdev * rdev)236 void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
237 {
238 	int ret = 0;
239 
240 	if (rdev && !rdev_need_serial(rdev) &&
241 	    !test_bit(CollisionCheck, &rdev->flags))
242 		return;
243 
244 	if (!rdev)
245 		ret = rdevs_init_serial(mddev);
246 	else
247 		ret = rdev_init_serial(rdev);
248 	if (ret)
249 		return;
250 
251 	if (mddev->serial_info_pool == NULL) {
252 		/*
253 		 * already in memalloc noio context by
254 		 * mddev_suspend()
255 		 */
256 		mddev->serial_info_pool =
257 			mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
258 						sizeof(struct serial_info));
259 		if (!mddev->serial_info_pool) {
260 			rdevs_uninit_serial(mddev);
261 			pr_err("can't alloc memory pool for serialization\n");
262 		}
263 	}
264 }
265 
266 /*
267  * Free resource from rdev(s), and destroy serial_info_pool under conditions:
268  * 1. rdev is the last device flaged with CollisionCheck.
269  * 2. when bitmap is destroyed while policy is not enabled.
270  * 3. for disable policy, the pool is destroyed only when no rdev needs it.
271  */
mddev_destroy_serial_pool(struct mddev * mddev,struct md_rdev * rdev)272 void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
273 {
274 	if (rdev && !test_bit(CollisionCheck, &rdev->flags))
275 		return;
276 
277 	if (mddev->serial_info_pool) {
278 		struct md_rdev *temp;
279 		int num = 0; /* used to track if other rdevs need the pool */
280 
281 		rdev_for_each(temp, mddev) {
282 			if (!rdev) {
283 				if (!mddev->serialize_policy ||
284 				    !rdev_need_serial(temp))
285 					rdev_uninit_serial(temp);
286 				else
287 					num++;
288 			} else if (temp != rdev &&
289 				   test_bit(CollisionCheck, &temp->flags))
290 				num++;
291 		}
292 
293 		if (rdev)
294 			rdev_uninit_serial(rdev);
295 
296 		if (num)
297 			pr_info("The mempool could be used by other devices\n");
298 		else {
299 			mempool_destroy(mddev->serial_info_pool);
300 			mddev->serial_info_pool = NULL;
301 		}
302 	}
303 }
304 
305 static struct ctl_table_header *raid_table_header;
306 
307 static const struct ctl_table raid_table[] = {
308 	{
309 		.procname	= "speed_limit_min",
310 		.data		= &sysctl_speed_limit_min,
311 		.maxlen		= sizeof(int),
312 		.mode		= 0644,
313 		.proc_handler	= proc_dointvec,
314 	},
315 	{
316 		.procname	= "speed_limit_max",
317 		.data		= &sysctl_speed_limit_max,
318 		.maxlen		= sizeof(int),
319 		.mode		= 0644,
320 		.proc_handler	= proc_dointvec,
321 	},
322 	{
323 		.procname	= "sync_io_depth",
324 		.data		= &sysctl_sync_io_depth,
325 		.maxlen		= sizeof(int),
326 		.mode		= 0644,
327 		.proc_handler	= proc_dointvec,
328 	},
329 };
330 
331 static int start_readonly;
332 
333 /*
334  * The original mechanism for creating an md device is to create
335  * a device node in /dev and to open it.  This causes races with device-close.
336  * The preferred method is to write to the "new_array" module parameter.
337  * This can avoid races.
338  * Setting create_on_open to false disables the original mechanism
339  * so all the races disappear.
340  */
341 static bool create_on_open = true;
342 
343 /*
344  * We have a system wide 'event count' that is incremented
345  * on any 'interesting' event, and readers of /proc/mdstat
346  * can use 'poll' or 'select' to find out when the event
347  * count increases.
348  *
349  * Events are:
350  *  start array, stop array, error, add device, remove device,
351  *  start build, activate spare
352  */
353 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
354 static atomic_t md_event_count;
md_new_event(void)355 void md_new_event(void)
356 {
357 	atomic_inc(&md_event_count);
358 	wake_up(&md_event_waiters);
359 }
360 EXPORT_SYMBOL_GPL(md_new_event);
361 
362 /*
363  * Enables to iterate over all existing md arrays
364  * all_mddevs_lock protects this list.
365  */
366 static LIST_HEAD(all_mddevs);
367 static DEFINE_SPINLOCK(all_mddevs_lock);
368 
is_md_suspended(struct mddev * mddev)369 static bool is_md_suspended(struct mddev *mddev)
370 {
371 	return percpu_ref_is_dying(&mddev->active_io);
372 }
373 /* Rather than calling directly into the personality make_request function,
374  * IO requests come here first so that we can check if the device is
375  * being suspended pending a reconfiguration.
376  * We hold a refcount over the call to ->make_request.  By the time that
377  * call has finished, the bio has been linked into some internal structure
378  * and so is visible to ->quiesce(), so we don't need the refcount any more.
379  */
is_suspended(struct mddev * mddev,struct bio * bio)380 static bool is_suspended(struct mddev *mddev, struct bio *bio)
381 {
382 	if (is_md_suspended(mddev))
383 		return true;
384 	if (bio_data_dir(bio) != WRITE)
385 		return false;
386 	if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi))
387 		return false;
388 	if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi))
389 		return false;
390 	if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo))
391 		return false;
392 	return true;
393 }
394 
md_handle_request(struct mddev * mddev,struct bio * bio)395 bool md_handle_request(struct mddev *mddev, struct bio *bio)
396 {
397 check_suspended:
398 	if (is_suspended(mddev, bio)) {
399 		DEFINE_WAIT(__wait);
400 		/* Bail out if REQ_NOWAIT is set for the bio */
401 		if (bio->bi_opf & REQ_NOWAIT) {
402 			bio_wouldblock_error(bio);
403 			return true;
404 		}
405 		for (;;) {
406 			prepare_to_wait(&mddev->sb_wait, &__wait,
407 					TASK_UNINTERRUPTIBLE);
408 			if (!is_suspended(mddev, bio))
409 				break;
410 			schedule();
411 		}
412 		finish_wait(&mddev->sb_wait, &__wait);
413 	}
414 	if (!percpu_ref_tryget_live(&mddev->active_io))
415 		goto check_suspended;
416 
417 	if (!mddev->pers->make_request(mddev, bio)) {
418 		percpu_ref_put(&mddev->active_io);
419 		if (!mddev->gendisk && mddev->pers->prepare_suspend)
420 			return false;
421 		goto check_suspended;
422 	}
423 
424 	percpu_ref_put(&mddev->active_io);
425 	return true;
426 }
427 EXPORT_SYMBOL(md_handle_request);
428 
md_submit_bio(struct bio * bio)429 static void md_submit_bio(struct bio *bio)
430 {
431 	const int rw = bio_data_dir(bio);
432 	struct mddev *mddev = bio->bi_bdev->bd_disk->private_data;
433 
434 	if (mddev == NULL || mddev->pers == NULL) {
435 		bio_io_error(bio);
436 		return;
437 	}
438 
439 	if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
440 		bio_io_error(bio);
441 		return;
442 	}
443 
444 	bio = bio_split_to_limits(bio);
445 	if (!bio)
446 		return;
447 
448 	if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) {
449 		if (bio_sectors(bio) != 0)
450 			bio->bi_status = BLK_STS_IOERR;
451 		bio_endio(bio);
452 		return;
453 	}
454 
455 	/* bio could be mergeable after passing to underlayer */
456 	bio->bi_opf &= ~REQ_NOMERGE;
457 
458 	md_handle_request(mddev, bio);
459 }
460 
461 /*
462  * Make sure no new requests are submitted to the device, and any requests that
463  * have been submitted are completely handled.
464  */
mddev_suspend(struct mddev * mddev,bool interruptible)465 int mddev_suspend(struct mddev *mddev, bool interruptible)
466 {
467 	int err = 0;
468 
469 	/*
470 	 * hold reconfig_mutex to wait for normal io will deadlock, because
471 	 * other context can't update super_block, and normal io can rely on
472 	 * updating super_block.
473 	 */
474 	lockdep_assert_not_held(&mddev->reconfig_mutex);
475 
476 	if (interruptible)
477 		err = mutex_lock_interruptible(&mddev->suspend_mutex);
478 	else
479 		mutex_lock(&mddev->suspend_mutex);
480 	if (err)
481 		return err;
482 
483 	if (mddev->suspended) {
484 		WRITE_ONCE(mddev->suspended, mddev->suspended + 1);
485 		mutex_unlock(&mddev->suspend_mutex);
486 		return 0;
487 	}
488 
489 	percpu_ref_kill(&mddev->active_io);
490 	if (interruptible)
491 		err = wait_event_interruptible(mddev->sb_wait,
492 				percpu_ref_is_zero(&mddev->active_io));
493 	else
494 		wait_event(mddev->sb_wait,
495 				percpu_ref_is_zero(&mddev->active_io));
496 	if (err) {
497 		percpu_ref_resurrect(&mddev->active_io);
498 		mutex_unlock(&mddev->suspend_mutex);
499 		return err;
500 	}
501 
502 	/*
503 	 * For raid456, io might be waiting for reshape to make progress,
504 	 * allow new reshape to start while waiting for io to be done to
505 	 * prevent deadlock.
506 	 */
507 	WRITE_ONCE(mddev->suspended, mddev->suspended + 1);
508 
509 	/* restrict memory reclaim I/O during raid array is suspend */
510 	mddev->noio_flag = memalloc_noio_save();
511 
512 	mutex_unlock(&mddev->suspend_mutex);
513 	return 0;
514 }
515 EXPORT_SYMBOL_GPL(mddev_suspend);
516 
__mddev_resume(struct mddev * mddev,bool recovery_needed)517 static void __mddev_resume(struct mddev *mddev, bool recovery_needed)
518 {
519 	lockdep_assert_not_held(&mddev->reconfig_mutex);
520 
521 	mutex_lock(&mddev->suspend_mutex);
522 	WRITE_ONCE(mddev->suspended, mddev->suspended - 1);
523 	if (mddev->suspended) {
524 		mutex_unlock(&mddev->suspend_mutex);
525 		return;
526 	}
527 
528 	/* entred the memalloc scope from mddev_suspend() */
529 	memalloc_noio_restore(mddev->noio_flag);
530 
531 	percpu_ref_resurrect(&mddev->active_io);
532 	wake_up(&mddev->sb_wait);
533 
534 	if (recovery_needed)
535 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
536 	md_wakeup_thread(mddev->thread);
537 	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
538 
539 	mutex_unlock(&mddev->suspend_mutex);
540 }
541 
mddev_resume(struct mddev * mddev)542 void mddev_resume(struct mddev *mddev)
543 {
544 	return __mddev_resume(mddev, true);
545 }
546 EXPORT_SYMBOL_GPL(mddev_resume);
547 
548 /* sync bdev before setting device to readonly or stopping raid*/
mddev_set_closing_and_sync_blockdev(struct mddev * mddev,int opener_num)549 static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_num)
550 {
551 	mutex_lock(&mddev->open_mutex);
552 	if (mddev->pers && atomic_read(&mddev->openers) > opener_num) {
553 		mutex_unlock(&mddev->open_mutex);
554 		return -EBUSY;
555 	}
556 	if (test_and_set_bit(MD_CLOSING, &mddev->flags)) {
557 		mutex_unlock(&mddev->open_mutex);
558 		return -EBUSY;
559 	}
560 	mutex_unlock(&mddev->open_mutex);
561 
562 	sync_blockdev(mddev->gendisk->part0);
563 	return 0;
564 }
565 
566 /*
567  * The only difference from bio_chain_endio() is that the current
568  * bi_status of bio does not affect the bi_status of parent.
569  */
md_end_flush(struct bio * bio)570 static void md_end_flush(struct bio *bio)
571 {
572 	struct bio *parent = bio->bi_private;
573 
574 	/*
575 	 * If any flush io error before the power failure,
576 	 * disk data may be lost.
577 	 */
578 	if (bio->bi_status)
579 		pr_err("md: %pg flush io error %d\n", bio->bi_bdev,
580 			blk_status_to_errno(bio->bi_status));
581 
582 	bio_put(bio);
583 	bio_endio(parent);
584 }
585 
md_flush_request(struct mddev * mddev,struct bio * bio)586 bool md_flush_request(struct mddev *mddev, struct bio *bio)
587 {
588 	struct md_rdev *rdev;
589 	struct bio *new;
590 
591 	/*
592 	 * md_flush_reqeust() should be called under md_handle_request() and
593 	 * 'active_io' is already grabbed. Hence it's safe to get rdev directly
594 	 * without rcu protection.
595 	 */
596 	WARN_ON(percpu_ref_is_zero(&mddev->active_io));
597 
598 	rdev_for_each(rdev, mddev) {
599 		if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
600 			continue;
601 
602 		new = bio_alloc_bioset(rdev->bdev, 0,
603 				       REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO,
604 				       &mddev->bio_set);
605 		new->bi_private = bio;
606 		new->bi_end_io = md_end_flush;
607 		bio_inc_remaining(bio);
608 		submit_bio(new);
609 	}
610 
611 	if (bio_sectors(bio) == 0) {
612 		bio_endio(bio);
613 		return true;
614 	}
615 
616 	bio->bi_opf &= ~REQ_PREFLUSH;
617 	return false;
618 }
619 EXPORT_SYMBOL(md_flush_request);
620 
mddev_get(struct mddev * mddev)621 static inline struct mddev *mddev_get(struct mddev *mddev)
622 {
623 	lockdep_assert_held(&all_mddevs_lock);
624 
625 	if (test_bit(MD_DELETED, &mddev->flags))
626 		return NULL;
627 	atomic_inc(&mddev->active);
628 	return mddev;
629 }
630 
631 static void mddev_delayed_delete(struct work_struct *ws);
632 
__mddev_put(struct mddev * mddev)633 static void __mddev_put(struct mddev *mddev)
634 {
635 	if (mddev->raid_disks || !list_empty(&mddev->disks) ||
636 	    mddev->ctime || mddev->hold_active)
637 		return;
638 
639 	/*
640 	 * Call queue_work inside the spinlock so that flush_workqueue() after
641 	 * mddev_find will succeed in waiting for the work to be done.
642 	 */
643 	queue_work(md_misc_wq, &mddev->del_work);
644 }
645 
mddev_put_locked(struct mddev * mddev)646 static void mddev_put_locked(struct mddev *mddev)
647 {
648 	if (atomic_dec_and_test(&mddev->active))
649 		__mddev_put(mddev);
650 }
651 
mddev_put(struct mddev * mddev)652 void mddev_put(struct mddev *mddev)
653 {
654 	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
655 		return;
656 
657 	__mddev_put(mddev);
658 	spin_unlock(&all_mddevs_lock);
659 }
660 
661 static void md_safemode_timeout(struct timer_list *t);
662 static void md_start_sync(struct work_struct *ws);
663 
active_io_release(struct percpu_ref * ref)664 static void active_io_release(struct percpu_ref *ref)
665 {
666 	struct mddev *mddev = container_of(ref, struct mddev, active_io);
667 
668 	wake_up(&mddev->sb_wait);
669 }
670 
no_op(struct percpu_ref * r)671 static void no_op(struct percpu_ref *r) {}
672 
mddev_init(struct mddev * mddev)673 int mddev_init(struct mddev *mddev)
674 {
675 
676 	if (percpu_ref_init(&mddev->active_io, active_io_release,
677 			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
678 		return -ENOMEM;
679 
680 	if (percpu_ref_init(&mddev->writes_pending, no_op,
681 			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
682 		percpu_ref_exit(&mddev->active_io);
683 		return -ENOMEM;
684 	}
685 
686 	/* We want to start with the refcount at zero */
687 	percpu_ref_put(&mddev->writes_pending);
688 
689 	mutex_init(&mddev->open_mutex);
690 	mutex_init(&mddev->reconfig_mutex);
691 	mutex_init(&mddev->suspend_mutex);
692 	mutex_init(&mddev->bitmap_info.mutex);
693 	INIT_LIST_HEAD(&mddev->disks);
694 	INIT_LIST_HEAD(&mddev->all_mddevs);
695 	INIT_LIST_HEAD(&mddev->deleting);
696 	timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
697 	atomic_set(&mddev->active, 1);
698 	atomic_set(&mddev->openers, 0);
699 	atomic_set(&mddev->sync_seq, 0);
700 	spin_lock_init(&mddev->lock);
701 	init_waitqueue_head(&mddev->sb_wait);
702 	init_waitqueue_head(&mddev->recovery_wait);
703 	mddev->reshape_position = MaxSector;
704 	mddev->reshape_backwards = 0;
705 	mddev->last_sync_action = ACTION_IDLE;
706 	mddev->resync_min = 0;
707 	mddev->resync_max = MaxSector;
708 	mddev->level = LEVEL_NONE;
709 	mddev_set_bitmap_ops(mddev);
710 
711 	INIT_WORK(&mddev->sync_work, md_start_sync);
712 	INIT_WORK(&mddev->del_work, mddev_delayed_delete);
713 
714 	return 0;
715 }
716 EXPORT_SYMBOL_GPL(mddev_init);
717 
mddev_destroy(struct mddev * mddev)718 void mddev_destroy(struct mddev *mddev)
719 {
720 	percpu_ref_exit(&mddev->active_io);
721 	percpu_ref_exit(&mddev->writes_pending);
722 }
723 EXPORT_SYMBOL_GPL(mddev_destroy);
724 
mddev_find_locked(dev_t unit)725 static struct mddev *mddev_find_locked(dev_t unit)
726 {
727 	struct mddev *mddev;
728 
729 	list_for_each_entry(mddev, &all_mddevs, all_mddevs)
730 		if (mddev->unit == unit)
731 			return mddev;
732 
733 	return NULL;
734 }
735 
736 /* find an unused unit number */
mddev_alloc_unit(void)737 static dev_t mddev_alloc_unit(void)
738 {
739 	static int next_minor = 512;
740 	int start = next_minor;
741 	bool is_free = 0;
742 	dev_t dev = 0;
743 
744 	while (!is_free) {
745 		dev = MKDEV(MD_MAJOR, next_minor);
746 		next_minor++;
747 		if (next_minor > MINORMASK)
748 			next_minor = 0;
749 		if (next_minor == start)
750 			return 0;		/* Oh dear, all in use. */
751 		is_free = !mddev_find_locked(dev);
752 	}
753 
754 	return dev;
755 }
756 
mddev_alloc(dev_t unit)757 static struct mddev *mddev_alloc(dev_t unit)
758 {
759 	struct mddev *new;
760 	int error;
761 
762 	if (unit && MAJOR(unit) != MD_MAJOR)
763 		unit &= ~((1 << MdpMinorShift) - 1);
764 
765 	new = kzalloc(sizeof(*new), GFP_KERNEL);
766 	if (!new)
767 		return ERR_PTR(-ENOMEM);
768 
769 	error = mddev_init(new);
770 	if (error)
771 		goto out_free_new;
772 
773 	spin_lock(&all_mddevs_lock);
774 	if (unit) {
775 		error = -EEXIST;
776 		if (mddev_find_locked(unit))
777 			goto out_destroy_new;
778 		new->unit = unit;
779 		if (MAJOR(unit) == MD_MAJOR)
780 			new->md_minor = MINOR(unit);
781 		else
782 			new->md_minor = MINOR(unit) >> MdpMinorShift;
783 		new->hold_active = UNTIL_IOCTL;
784 	} else {
785 		error = -ENODEV;
786 		new->unit = mddev_alloc_unit();
787 		if (!new->unit)
788 			goto out_destroy_new;
789 		new->md_minor = MINOR(new->unit);
790 		new->hold_active = UNTIL_STOP;
791 	}
792 
793 	list_add(&new->all_mddevs, &all_mddevs);
794 	spin_unlock(&all_mddevs_lock);
795 	return new;
796 
797 out_destroy_new:
798 	spin_unlock(&all_mddevs_lock);
799 	mddev_destroy(new);
800 out_free_new:
801 	kfree(new);
802 	return ERR_PTR(error);
803 }
804 
mddev_free(struct mddev * mddev)805 static void mddev_free(struct mddev *mddev)
806 {
807 	spin_lock(&all_mddevs_lock);
808 	list_del(&mddev->all_mddevs);
809 	spin_unlock(&all_mddevs_lock);
810 
811 	mddev_destroy(mddev);
812 	kfree(mddev);
813 }
814 
815 static const struct attribute_group md_redundancy_group;
816 
mddev_unlock(struct mddev * mddev)817 void mddev_unlock(struct mddev *mddev)
818 {
819 	struct md_rdev *rdev;
820 	struct md_rdev *tmp;
821 	LIST_HEAD(delete);
822 
823 	if (!list_empty(&mddev->deleting))
824 		list_splice_init(&mddev->deleting, &delete);
825 
826 	if (mddev->to_remove) {
827 		/* These cannot be removed under reconfig_mutex as
828 		 * an access to the files will try to take reconfig_mutex
829 		 * while holding the file unremovable, which leads to
830 		 * a deadlock.
831 		 * So hold set sysfs_active while the remove in happeing,
832 		 * and anything else which might set ->to_remove or my
833 		 * otherwise change the sysfs namespace will fail with
834 		 * -EBUSY if sysfs_active is still set.
835 		 * We set sysfs_active under reconfig_mutex and elsewhere
836 		 * test it under the same mutex to ensure its correct value
837 		 * is seen.
838 		 */
839 		const struct attribute_group *to_remove = mddev->to_remove;
840 		mddev->to_remove = NULL;
841 		mddev->sysfs_active = 1;
842 		mutex_unlock(&mddev->reconfig_mutex);
843 
844 		if (mddev->kobj.sd) {
845 			if (to_remove != &md_redundancy_group)
846 				sysfs_remove_group(&mddev->kobj, to_remove);
847 			if (mddev->pers == NULL ||
848 			    mddev->pers->sync_request == NULL) {
849 				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
850 				if (mddev->sysfs_action)
851 					sysfs_put(mddev->sysfs_action);
852 				if (mddev->sysfs_completed)
853 					sysfs_put(mddev->sysfs_completed);
854 				if (mddev->sysfs_degraded)
855 					sysfs_put(mddev->sysfs_degraded);
856 				mddev->sysfs_action = NULL;
857 				mddev->sysfs_completed = NULL;
858 				mddev->sysfs_degraded = NULL;
859 			}
860 		}
861 		mddev->sysfs_active = 0;
862 	} else
863 		mutex_unlock(&mddev->reconfig_mutex);
864 
865 	md_wakeup_thread(mddev->thread);
866 	wake_up(&mddev->sb_wait);
867 
868 	list_for_each_entry_safe(rdev, tmp, &delete, same_set) {
869 		list_del_init(&rdev->same_set);
870 		kobject_del(&rdev->kobj);
871 		export_rdev(rdev, mddev);
872 	}
873 
874 	/* Call del_gendisk after release reconfig_mutex to avoid
875 	 * deadlock (e.g. call del_gendisk under the lock and an
876 	 * access to sysfs files waits the lock)
877 	 * And MD_DELETED is only used for md raid which is set in
878 	 * do_md_stop. dm raid only uses md_stop to stop. So dm raid
879 	 * doesn't need to check MD_DELETED when getting reconfig lock
880 	 */
881 	if (test_bit(MD_DELETED, &mddev->flags))
882 		del_gendisk(mddev->gendisk);
883 }
884 EXPORT_SYMBOL_GPL(mddev_unlock);
885 
md_find_rdev_nr_rcu(struct mddev * mddev,int nr)886 struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
887 {
888 	struct md_rdev *rdev;
889 
890 	rdev_for_each_rcu(rdev, mddev)
891 		if (rdev->desc_nr == nr)
892 			return rdev;
893 
894 	return NULL;
895 }
896 EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);
897 
find_rdev(struct mddev * mddev,dev_t dev)898 static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
899 {
900 	struct md_rdev *rdev;
901 
902 	rdev_for_each(rdev, mddev)
903 		if (rdev->bdev->bd_dev == dev)
904 			return rdev;
905 
906 	return NULL;
907 }
908 
md_find_rdev_rcu(struct mddev * mddev,dev_t dev)909 struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev)
910 {
911 	struct md_rdev *rdev;
912 
913 	rdev_for_each_rcu(rdev, mddev)
914 		if (rdev->bdev->bd_dev == dev)
915 			return rdev;
916 
917 	return NULL;
918 }
919 EXPORT_SYMBOL_GPL(md_find_rdev_rcu);
920 
get_pers(int level,char * clevel)921 static struct md_personality *get_pers(int level, char *clevel)
922 {
923 	struct md_personality *ret = NULL;
924 	struct md_submodule_head *head;
925 	unsigned long i;
926 
927 	xa_lock(&md_submodule);
928 	xa_for_each(&md_submodule, i, head) {
929 		if (head->type != MD_PERSONALITY)
930 			continue;
931 		if ((level != LEVEL_NONE && head->id == level) ||
932 		    !strcmp(head->name, clevel)) {
933 			if (try_module_get(head->owner))
934 				ret = (void *)head;
935 			break;
936 		}
937 	}
938 	xa_unlock(&md_submodule);
939 
940 	if (!ret) {
941 		if (level != LEVEL_NONE)
942 			pr_warn("md: personality for level %d is not loaded!\n",
943 				level);
944 		else
945 			pr_warn("md: personality for level %s is not loaded!\n",
946 				clevel);
947 	}
948 
949 	return ret;
950 }
951 
put_pers(struct md_personality * pers)952 static void put_pers(struct md_personality *pers)
953 {
954 	module_put(pers->head.owner);
955 }
956 
957 /* return the offset of the super block in 512byte sectors */
calc_dev_sboffset(struct md_rdev * rdev)958 static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
959 {
960 	return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev));
961 }
962 
alloc_disk_sb(struct md_rdev * rdev)963 static int alloc_disk_sb(struct md_rdev *rdev)
964 {
965 	rdev->sb_page = alloc_page(GFP_KERNEL);
966 	if (!rdev->sb_page)
967 		return -ENOMEM;
968 	return 0;
969 }
970 
md_rdev_clear(struct md_rdev * rdev)971 void md_rdev_clear(struct md_rdev *rdev)
972 {
973 	if (rdev->sb_page) {
974 		put_page(rdev->sb_page);
975 		rdev->sb_loaded = 0;
976 		rdev->sb_page = NULL;
977 		rdev->sb_start = 0;
978 		rdev->sectors = 0;
979 	}
980 	if (rdev->bb_page) {
981 		put_page(rdev->bb_page);
982 		rdev->bb_page = NULL;
983 	}
984 	badblocks_exit(&rdev->badblocks);
985 }
986 EXPORT_SYMBOL_GPL(md_rdev_clear);
987 
super_written(struct bio * bio)988 static void super_written(struct bio *bio)
989 {
990 	struct md_rdev *rdev = bio->bi_private;
991 	struct mddev *mddev = rdev->mddev;
992 
993 	if (bio->bi_status) {
994 		pr_err("md: %s gets error=%d\n", __func__,
995 		       blk_status_to_errno(bio->bi_status));
996 		md_error(mddev, rdev);
997 		if (!test_bit(Faulty, &rdev->flags)
998 		    && (bio->bi_opf & MD_FAILFAST)) {
999 			set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
1000 			set_bit(LastDev, &rdev->flags);
1001 		}
1002 	} else
1003 		clear_bit(LastDev, &rdev->flags);
1004 
1005 	bio_put(bio);
1006 
1007 	rdev_dec_pending(rdev, mddev);
1008 
1009 	if (atomic_dec_and_test(&mddev->pending_writes))
1010 		wake_up(&mddev->sb_wait);
1011 }
1012 
md_super_write(struct mddev * mddev,struct md_rdev * rdev,sector_t sector,int size,struct page * page)1013 void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
1014 		   sector_t sector, int size, struct page *page)
1015 {
1016 	/* write first size bytes of page to sector of rdev
1017 	 * Increment mddev->pending_writes before returning
1018 	 * and decrement it on completion, waking up sb_wait
1019 	 * if zero is reached.
1020 	 * If an error occurred, call md_error
1021 	 */
1022 	struct bio *bio;
1023 
1024 	if (!page)
1025 		return;
1026 
1027 	if (test_bit(Faulty, &rdev->flags))
1028 		return;
1029 
1030 	bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev,
1031 			      1,
1032 			      REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_META
1033 				  | REQ_PREFLUSH | REQ_FUA,
1034 			      GFP_NOIO, &mddev->sync_set);
1035 
1036 	atomic_inc(&rdev->nr_pending);
1037 
1038 	bio->bi_iter.bi_sector = sector;
1039 	__bio_add_page(bio, page, size, 0);
1040 	bio->bi_private = rdev;
1041 	bio->bi_end_io = super_written;
1042 
1043 	if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
1044 	    test_bit(FailFast, &rdev->flags) &&
1045 	    !test_bit(LastDev, &rdev->flags))
1046 		bio->bi_opf |= MD_FAILFAST;
1047 
1048 	atomic_inc(&mddev->pending_writes);
1049 	submit_bio(bio);
1050 }
1051 
md_super_wait(struct mddev * mddev)1052 int md_super_wait(struct mddev *mddev)
1053 {
1054 	/* wait for all superblock writes that were scheduled to complete */
1055 	wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
1056 	if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags))
1057 		return -EAGAIN;
1058 	return 0;
1059 }
1060 
sync_page_io(struct md_rdev * rdev,sector_t sector,int size,struct page * page,blk_opf_t opf,bool metadata_op)1061 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
1062 		 struct page *page, blk_opf_t opf, bool metadata_op)
1063 {
1064 	struct bio bio;
1065 	struct bio_vec bvec;
1066 
1067 	if (metadata_op && rdev->meta_bdev)
1068 		bio_init(&bio, rdev->meta_bdev, &bvec, 1, opf);
1069 	else
1070 		bio_init(&bio, rdev->bdev, &bvec, 1, opf);
1071 
1072 	if (metadata_op)
1073 		bio.bi_iter.bi_sector = sector + rdev->sb_start;
1074 	else if (rdev->mddev->reshape_position != MaxSector &&
1075 		 (rdev->mddev->reshape_backwards ==
1076 		  (sector >= rdev->mddev->reshape_position)))
1077 		bio.bi_iter.bi_sector = sector + rdev->new_data_offset;
1078 	else
1079 		bio.bi_iter.bi_sector = sector + rdev->data_offset;
1080 	__bio_add_page(&bio, page, size, 0);
1081 
1082 	submit_bio_wait(&bio);
1083 
1084 	return !bio.bi_status;
1085 }
1086 EXPORT_SYMBOL_GPL(sync_page_io);
1087 
read_disk_sb(struct md_rdev * rdev,int size)1088 static int read_disk_sb(struct md_rdev *rdev, int size)
1089 {
1090 	if (rdev->sb_loaded)
1091 		return 0;
1092 
1093 	if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true))
1094 		goto fail;
1095 	rdev->sb_loaded = 1;
1096 	return 0;
1097 
1098 fail:
1099 	pr_err("md: disabled device %pg, could not read superblock.\n",
1100 	       rdev->bdev);
1101 	return -EINVAL;
1102 }
1103 
md_uuid_equal(mdp_super_t * sb1,mdp_super_t * sb2)1104 static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
1105 {
1106 	return	sb1->set_uuid0 == sb2->set_uuid0 &&
1107 		sb1->set_uuid1 == sb2->set_uuid1 &&
1108 		sb1->set_uuid2 == sb2->set_uuid2 &&
1109 		sb1->set_uuid3 == sb2->set_uuid3;
1110 }
1111 
md_sb_equal(mdp_super_t * sb1,mdp_super_t * sb2)1112 static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
1113 {
1114 	int ret;
1115 	mdp_super_t *tmp1, *tmp2;
1116 
1117 	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
1118 	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
1119 
1120 	if (!tmp1 || !tmp2) {
1121 		ret = 0;
1122 		goto abort;
1123 	}
1124 
1125 	*tmp1 = *sb1;
1126 	*tmp2 = *sb2;
1127 
1128 	/*
1129 	 * nr_disks is not constant
1130 	 */
1131 	tmp1->nr_disks = 0;
1132 	tmp2->nr_disks = 0;
1133 
1134 	ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
1135 abort:
1136 	kfree(tmp1);
1137 	kfree(tmp2);
1138 	return ret;
1139 }
1140 
md_csum_fold(u32 csum)1141 static u32 md_csum_fold(u32 csum)
1142 {
1143 	csum = (csum & 0xffff) + (csum >> 16);
1144 	return (csum & 0xffff) + (csum >> 16);
1145 }
1146 
calc_sb_csum(mdp_super_t * sb)1147 static unsigned int calc_sb_csum(mdp_super_t *sb)
1148 {
1149 	u64 newcsum = 0;
1150 	u32 *sb32 = (u32*)sb;
1151 	int i;
1152 	unsigned int disk_csum, csum;
1153 
1154 	disk_csum = sb->sb_csum;
1155 	sb->sb_csum = 0;
1156 
1157 	for (i = 0; i < MD_SB_BYTES/4 ; i++)
1158 		newcsum += sb32[i];
1159 	csum = (newcsum & 0xffffffff) + (newcsum>>32);
1160 
1161 #ifdef CONFIG_ALPHA
1162 	/* This used to use csum_partial, which was wrong for several
1163 	 * reasons including that different results are returned on
1164 	 * different architectures.  It isn't critical that we get exactly
1165 	 * the same return value as before (we always csum_fold before
1166 	 * testing, and that removes any differences).  However as we
1167 	 * know that csum_partial always returned a 16bit value on
1168 	 * alphas, do a fold to maximise conformity to previous behaviour.
1169 	 */
1170 	sb->sb_csum = md_csum_fold(disk_csum);
1171 #else
1172 	sb->sb_csum = disk_csum;
1173 #endif
1174 	return csum;
1175 }
1176 
1177 /*
1178  * Handle superblock details.
1179  * We want to be able to handle multiple superblock formats
1180  * so we have a common interface to them all, and an array of
1181  * different handlers.
1182  * We rely on user-space to write the initial superblock, and support
1183  * reading and updating of superblocks.
1184  * Interface methods are:
1185  *   int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
1186  *      loads and validates a superblock on dev.
1187  *      if refdev != NULL, compare superblocks on both devices
1188  *    Return:
1189  *      0 - dev has a superblock that is compatible with refdev
1190  *      1 - dev has a superblock that is compatible and newer than refdev
1191  *          so dev should be used as the refdev in future
1192  *     -EINVAL superblock incompatible or invalid
1193  *     -othererror e.g. -EIO
1194  *
1195  *   int validate_super(struct mddev *mddev, struct md_rdev *dev)
1196  *      Verify that dev is acceptable into mddev.
1197  *       The first time, mddev->raid_disks will be 0, and data from
1198  *       dev should be merged in.  Subsequent calls check that dev
1199  *       is new enough.  Return 0 or -EINVAL
1200  *
1201  *   void sync_super(struct mddev *mddev, struct md_rdev *dev)
1202  *     Update the superblock for rdev with data in mddev
1203  *     This does not write to disc.
1204  *
1205  */
1206 
1207 struct super_type  {
1208 	char		    *name;
1209 	struct module	    *owner;
1210 	int		    (*load_super)(struct md_rdev *rdev,
1211 					  struct md_rdev *refdev,
1212 					  int minor_version);
1213 	int		    (*validate_super)(struct mddev *mddev,
1214 					      struct md_rdev *freshest,
1215 					      struct md_rdev *rdev);
1216 	void		    (*sync_super)(struct mddev *mddev,
1217 					  struct md_rdev *rdev);
1218 	unsigned long long  (*rdev_size_change)(struct md_rdev *rdev,
1219 						sector_t num_sectors);
1220 	int		    (*allow_new_offset)(struct md_rdev *rdev,
1221 						unsigned long long new_offset);
1222 };
1223 
1224 /*
1225  * Check that the given mddev has no bitmap.
1226  *
1227  * This function is called from the run method of all personalities that do not
1228  * support bitmaps. It prints an error message and returns non-zero if mddev
1229  * has a bitmap. Otherwise, it returns 0.
1230  *
1231  */
md_check_no_bitmap(struct mddev * mddev)1232 int md_check_no_bitmap(struct mddev *mddev)
1233 {
1234 	if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
1235 		return 0;
1236 	pr_warn("%s: bitmaps are not supported for %s\n",
1237 		mdname(mddev), mddev->pers->head.name);
1238 	return 1;
1239 }
1240 EXPORT_SYMBOL(md_check_no_bitmap);
1241 
1242 /*
1243  * load_super for 0.90.0
1244  */
super_90_load(struct md_rdev * rdev,struct md_rdev * refdev,int minor_version)1245 static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1246 {
1247 	mdp_super_t *sb;
1248 	int ret;
1249 	bool spare_disk = true;
1250 
1251 	/*
1252 	 * Calculate the position of the superblock (512byte sectors),
1253 	 * it's at the end of the disk.
1254 	 *
1255 	 * It also happens to be a multiple of 4Kb.
1256 	 */
1257 	rdev->sb_start = calc_dev_sboffset(rdev);
1258 
1259 	ret = read_disk_sb(rdev, MD_SB_BYTES);
1260 	if (ret)
1261 		return ret;
1262 
1263 	ret = -EINVAL;
1264 
1265 	sb = page_address(rdev->sb_page);
1266 
1267 	if (sb->md_magic != MD_SB_MAGIC) {
1268 		pr_warn("md: invalid raid superblock magic on %pg\n",
1269 			rdev->bdev);
1270 		goto abort;
1271 	}
1272 
1273 	if (sb->major_version != 0 ||
1274 	    sb->minor_version < 90 ||
1275 	    sb->minor_version > 91) {
1276 		pr_warn("Bad version number %d.%d on %pg\n",
1277 			sb->major_version, sb->minor_version, rdev->bdev);
1278 		goto abort;
1279 	}
1280 
1281 	if (sb->raid_disks <= 0)
1282 		goto abort;
1283 
1284 	if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
1285 		pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev);
1286 		goto abort;
1287 	}
1288 
1289 	rdev->preferred_minor = sb->md_minor;
1290 	rdev->data_offset = 0;
1291 	rdev->new_data_offset = 0;
1292 	rdev->sb_size = MD_SB_BYTES;
1293 	rdev->badblocks.shift = -1;
1294 
1295 	rdev->desc_nr = sb->this_disk.number;
1296 
1297 	/* not spare disk */
1298 	if (rdev->desc_nr >= 0 && rdev->desc_nr < MD_SB_DISKS &&
1299 	    sb->disks[rdev->desc_nr].state & ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1300 		spare_disk = false;
1301 
1302 	if (!refdev) {
1303 		if (!spare_disk)
1304 			ret = 1;
1305 		else
1306 			ret = 0;
1307 	} else {
1308 		__u64 ev1, ev2;
1309 		mdp_super_t *refsb = page_address(refdev->sb_page);
1310 		if (!md_uuid_equal(refsb, sb)) {
1311 			pr_warn("md: %pg has different UUID to %pg\n",
1312 				rdev->bdev, refdev->bdev);
1313 			goto abort;
1314 		}
1315 		if (!md_sb_equal(refsb, sb)) {
1316 			pr_warn("md: %pg has same UUID but different superblock to %pg\n",
1317 				rdev->bdev, refdev->bdev);
1318 			goto abort;
1319 		}
1320 		ev1 = md_event(sb);
1321 		ev2 = md_event(refsb);
1322 
1323 		if (!spare_disk && ev1 > ev2)
1324 			ret = 1;
1325 		else
1326 			ret = 0;
1327 	}
1328 	rdev->sectors = rdev->sb_start;
1329 	/* Limit to 4TB as metadata cannot record more than that.
1330 	 * (not needed for Linear and RAID0 as metadata doesn't
1331 	 * record this size)
1332 	 */
1333 	if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1)
1334 		rdev->sectors = (sector_t)(2ULL << 32) - 2;
1335 
1336 	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
1337 		/* "this cannot possibly happen" ... */
1338 		ret = -EINVAL;
1339 
1340  abort:
1341 	return ret;
1342 }
1343 
md_bitmap_events_cleared(struct mddev * mddev)1344 static u64 md_bitmap_events_cleared(struct mddev *mddev)
1345 {
1346 	struct md_bitmap_stats stats;
1347 	int err;
1348 
1349 	err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
1350 	if (err)
1351 		return 0;
1352 
1353 	return stats.events_cleared;
1354 }
1355 
1356 /*
1357  * validate_super for 0.90.0
1358  * note: we are not using "freshest" for 0.9 superblock
1359  */
super_90_validate(struct mddev * mddev,struct md_rdev * freshest,struct md_rdev * rdev)1360 static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
1361 {
1362 	mdp_disk_t *desc;
1363 	mdp_super_t *sb = page_address(rdev->sb_page);
1364 	__u64 ev1 = md_event(sb);
1365 
1366 	rdev->raid_disk = -1;
1367 	clear_bit(Faulty, &rdev->flags);
1368 	clear_bit(In_sync, &rdev->flags);
1369 	clear_bit(Bitmap_sync, &rdev->flags);
1370 	clear_bit(WriteMostly, &rdev->flags);
1371 
1372 	if (mddev->raid_disks == 0) {
1373 		mddev->major_version = 0;
1374 		mddev->minor_version = sb->minor_version;
1375 		mddev->patch_version = sb->patch_version;
1376 		mddev->external = 0;
1377 		mddev->chunk_sectors = sb->chunk_size >> 9;
1378 		mddev->ctime = sb->ctime;
1379 		mddev->utime = sb->utime;
1380 		mddev->level = sb->level;
1381 		mddev->clevel[0] = 0;
1382 		mddev->layout = sb->layout;
1383 		mddev->raid_disks = sb->raid_disks;
1384 		mddev->dev_sectors = ((sector_t)sb->size) * 2;
1385 		mddev->events = ev1;
1386 		mddev->bitmap_info.offset = 0;
1387 		mddev->bitmap_info.space = 0;
1388 		/* bitmap can use 60 K after the 4K superblocks */
1389 		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
1390 		mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
1391 		mddev->reshape_backwards = 0;
1392 
1393 		if (mddev->minor_version >= 91) {
1394 			mddev->reshape_position = sb->reshape_position;
1395 			mddev->delta_disks = sb->delta_disks;
1396 			mddev->new_level = sb->new_level;
1397 			mddev->new_layout = sb->new_layout;
1398 			mddev->new_chunk_sectors = sb->new_chunk >> 9;
1399 			if (mddev->delta_disks < 0)
1400 				mddev->reshape_backwards = 1;
1401 		} else {
1402 			mddev->reshape_position = MaxSector;
1403 			mddev->delta_disks = 0;
1404 			mddev->new_level = mddev->level;
1405 			mddev->new_layout = mddev->layout;
1406 			mddev->new_chunk_sectors = mddev->chunk_sectors;
1407 		}
1408 		if (mddev->level == 0)
1409 			mddev->layout = -1;
1410 
1411 		if (sb->state & (1<<MD_SB_CLEAN))
1412 			mddev->recovery_cp = MaxSector;
1413 		else {
1414 			if (sb->events_hi == sb->cp_events_hi &&
1415 				sb->events_lo == sb->cp_events_lo) {
1416 				mddev->recovery_cp = sb->recovery_cp;
1417 			} else
1418 				mddev->recovery_cp = 0;
1419 		}
1420 
1421 		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
1422 		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
1423 		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
1424 		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
1425 
1426 		mddev->max_disks = MD_SB_DISKS;
1427 
1428 		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
1429 		    mddev->bitmap_info.file == NULL) {
1430 			mddev->bitmap_info.offset =
1431 				mddev->bitmap_info.default_offset;
1432 			mddev->bitmap_info.space =
1433 				mddev->bitmap_info.default_space;
1434 		}
1435 
1436 	} else if (mddev->pers == NULL) {
1437 		/* Insist on good event counter while assembling, except
1438 		 * for spares (which don't need an event count) */
1439 		++ev1;
1440 		if (sb->disks[rdev->desc_nr].state & (
1441 			    (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1442 			if (ev1 < mddev->events)
1443 				return -EINVAL;
1444 	} else if (mddev->bitmap) {
1445 		/* if adding to array with a bitmap, then we can accept an
1446 		 * older device ... but not too old.
1447 		 */
1448 		if (ev1 < md_bitmap_events_cleared(mddev))
1449 			return 0;
1450 		if (ev1 < mddev->events)
1451 			set_bit(Bitmap_sync, &rdev->flags);
1452 	} else {
1453 		if (ev1 < mddev->events)
1454 			/* just a hot-add of a new device, leave raid_disk at -1 */
1455 			return 0;
1456 	}
1457 
1458 	desc = sb->disks + rdev->desc_nr;
1459 
1460 	if (desc->state & (1<<MD_DISK_FAULTY))
1461 		set_bit(Faulty, &rdev->flags);
1462 	else if (desc->state & (1<<MD_DISK_SYNC)) {
1463 		set_bit(In_sync, &rdev->flags);
1464 		rdev->raid_disk = desc->raid_disk;
1465 		rdev->saved_raid_disk = desc->raid_disk;
1466 	} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
1467 		/* active but not in sync implies recovery up to
1468 		 * reshape position.  We don't know exactly where
1469 		 * that is, so set to zero for now
1470 		 */
1471 		if (mddev->minor_version >= 91) {
1472 			rdev->recovery_offset = 0;
1473 			rdev->raid_disk = desc->raid_disk;
1474 		}
1475 	}
1476 	if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
1477 		set_bit(WriteMostly, &rdev->flags);
1478 	if (desc->state & (1<<MD_DISK_FAILFAST))
1479 		set_bit(FailFast, &rdev->flags);
1480 	return 0;
1481 }
1482 
1483 /*
1484  * sync_super for 0.90.0
1485  */
super_90_sync(struct mddev * mddev,struct md_rdev * rdev)1486 static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
1487 {
1488 	mdp_super_t *sb;
1489 	struct md_rdev *rdev2;
1490 	int next_spare = mddev->raid_disks;
1491 
1492 	/* make rdev->sb match mddev data..
1493 	 *
1494 	 * 1/ zero out disks
1495 	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
1496 	 * 3/ any empty disks < next_spare become removed
1497 	 *
1498 	 * disks[0] gets initialised to REMOVED because
1499 	 * we cannot be sure from other fields if it has
1500 	 * been initialised or not.
1501 	 */
1502 	int i;
1503 	int active=0, working=0,failed=0,spare=0,nr_disks=0;
1504 
1505 	rdev->sb_size = MD_SB_BYTES;
1506 
1507 	sb = page_address(rdev->sb_page);
1508 
1509 	memset(sb, 0, sizeof(*sb));
1510 
1511 	sb->md_magic = MD_SB_MAGIC;
1512 	sb->major_version = mddev->major_version;
1513 	sb->patch_version = mddev->patch_version;
1514 	sb->gvalid_words  = 0; /* ignored */
1515 	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
1516 	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
1517 	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
1518 	memcpy(&sb->set_uuid3, mddev->uuid+12,4);
1519 
1520 	sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
1521 	sb->level = mddev->level;
1522 	sb->size = mddev->dev_sectors / 2;
1523 	sb->raid_disks = mddev->raid_disks;
1524 	sb->md_minor = mddev->md_minor;
1525 	sb->not_persistent = 0;
1526 	sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
1527 	sb->state = 0;
1528 	sb->events_hi = (mddev->events>>32);
1529 	sb->events_lo = (u32)mddev->events;
1530 
1531 	if (mddev->reshape_position == MaxSector)
1532 		sb->minor_version = 90;
1533 	else {
1534 		sb->minor_version = 91;
1535 		sb->reshape_position = mddev->reshape_position;
1536 		sb->new_level = mddev->new_level;
1537 		sb->delta_disks = mddev->delta_disks;
1538 		sb->new_layout = mddev->new_layout;
1539 		sb->new_chunk = mddev->new_chunk_sectors << 9;
1540 	}
1541 	mddev->minor_version = sb->minor_version;
1542 	if (mddev->in_sync)
1543 	{
1544 		sb->recovery_cp = mddev->recovery_cp;
1545 		sb->cp_events_hi = (mddev->events>>32);
1546 		sb->cp_events_lo = (u32)mddev->events;
1547 		if (mddev->recovery_cp == MaxSector)
1548 			sb->state = (1<< MD_SB_CLEAN);
1549 	} else
1550 		sb->recovery_cp = 0;
1551 
1552 	sb->layout = mddev->layout;
1553 	sb->chunk_size = mddev->chunk_sectors << 9;
1554 
1555 	if (mddev->bitmap && mddev->bitmap_info.file == NULL)
1556 		sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1557 
1558 	sb->disks[0].state = (1<<MD_DISK_REMOVED);
1559 	rdev_for_each(rdev2, mddev) {
1560 		mdp_disk_t *d;
1561 		int desc_nr;
1562 		int is_active = test_bit(In_sync, &rdev2->flags);
1563 
1564 		if (rdev2->raid_disk >= 0 &&
1565 		    sb->minor_version >= 91)
1566 			/* we have nowhere to store the recovery_offset,
1567 			 * but if it is not below the reshape_position,
1568 			 * we can piggy-back on that.
1569 			 */
1570 			is_active = 1;
1571 		if (rdev2->raid_disk < 0 ||
1572 		    test_bit(Faulty, &rdev2->flags))
1573 			is_active = 0;
1574 		if (is_active)
1575 			desc_nr = rdev2->raid_disk;
1576 		else
1577 			desc_nr = next_spare++;
1578 		rdev2->desc_nr = desc_nr;
1579 		d = &sb->disks[rdev2->desc_nr];
1580 		nr_disks++;
1581 		d->number = rdev2->desc_nr;
1582 		d->major = MAJOR(rdev2->bdev->bd_dev);
1583 		d->minor = MINOR(rdev2->bdev->bd_dev);
1584 		if (is_active)
1585 			d->raid_disk = rdev2->raid_disk;
1586 		else
1587 			d->raid_disk = rdev2->desc_nr; /* compatibility */
1588 		if (test_bit(Faulty, &rdev2->flags))
1589 			d->state = (1<<MD_DISK_FAULTY);
1590 		else if (is_active) {
1591 			d->state = (1<<MD_DISK_ACTIVE);
1592 			if (test_bit(In_sync, &rdev2->flags))
1593 				d->state |= (1<<MD_DISK_SYNC);
1594 			active++;
1595 			working++;
1596 		} else {
1597 			d->state = 0;
1598 			spare++;
1599 			working++;
1600 		}
1601 		if (test_bit(WriteMostly, &rdev2->flags))
1602 			d->state |= (1<<MD_DISK_WRITEMOSTLY);
1603 		if (test_bit(FailFast, &rdev2->flags))
1604 			d->state |= (1<<MD_DISK_FAILFAST);
1605 	}
1606 	/* now set the "removed" and "faulty" bits on any missing devices */
1607 	for (i=0 ; i < mddev->raid_disks ; i++) {
1608 		mdp_disk_t *d = &sb->disks[i];
1609 		if (d->state == 0 && d->number == 0) {
1610 			d->number = i;
1611 			d->raid_disk = i;
1612 			d->state = (1<<MD_DISK_REMOVED);
1613 			d->state |= (1<<MD_DISK_FAULTY);
1614 			failed++;
1615 		}
1616 	}
1617 	sb->nr_disks = nr_disks;
1618 	sb->active_disks = active;
1619 	sb->working_disks = working;
1620 	sb->failed_disks = failed;
1621 	sb->spare_disks = spare;
1622 
1623 	sb->this_disk = sb->disks[rdev->desc_nr];
1624 	sb->sb_csum = calc_sb_csum(sb);
1625 }
1626 
1627 /*
1628  * rdev_size_change for 0.90.0
1629  */
1630 static unsigned long long
super_90_rdev_size_change(struct md_rdev * rdev,sector_t num_sectors)1631 super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1632 {
1633 	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1634 		return 0; /* component must fit device */
1635 	if (rdev->mddev->bitmap_info.offset)
1636 		return 0; /* can't move bitmap */
1637 	rdev->sb_start = calc_dev_sboffset(rdev);
1638 	if (!num_sectors || num_sectors > rdev->sb_start)
1639 		num_sectors = rdev->sb_start;
1640 	/* Limit to 4TB as metadata cannot record more than that.
1641 	 * 4TB == 2^32 KB, or 2*2^32 sectors.
1642 	 */
1643 	if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
1644 		num_sectors = (sector_t)(2ULL << 32) - 2;
1645 	do {
1646 		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1647 		       rdev->sb_page);
1648 	} while (md_super_wait(rdev->mddev) < 0);
1649 	return num_sectors;
1650 }
1651 
1652 static int
super_90_allow_new_offset(struct md_rdev * rdev,unsigned long long new_offset)1653 super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
1654 {
1655 	/* non-zero offset changes not possible with v0.90 */
1656 	return new_offset == 0;
1657 }
1658 
1659 /*
1660  * version 1 superblock
1661  */
1662 
calc_sb_1_csum(struct mdp_superblock_1 * sb)1663 static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
1664 {
1665 	__le32 disk_csum;
1666 	u32 csum;
1667 	unsigned long long newcsum;
1668 	int size = 256 + le32_to_cpu(sb->max_dev)*2;
1669 	__le32 *isuper = (__le32*)sb;
1670 
1671 	disk_csum = sb->sb_csum;
1672 	sb->sb_csum = 0;
1673 	newcsum = 0;
1674 	for (; size >= 4; size -= 4)
1675 		newcsum += le32_to_cpu(*isuper++);
1676 
1677 	if (size == 2)
1678 		newcsum += le16_to_cpu(*(__le16*) isuper);
1679 
1680 	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1681 	sb->sb_csum = disk_csum;
1682 	return cpu_to_le32(csum);
1683 }
1684 
super_1_load(struct md_rdev * rdev,struct md_rdev * refdev,int minor_version)1685 static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1686 {
1687 	struct mdp_superblock_1 *sb;
1688 	int ret;
1689 	sector_t sb_start;
1690 	sector_t sectors;
1691 	int bmask;
1692 	bool spare_disk = true;
1693 
1694 	/*
1695 	 * Calculate the position of the superblock in 512byte sectors.
1696 	 * It is always aligned to a 4K boundary and
1697 	 * depeding on minor_version, it can be:
1698 	 * 0: At least 8K, but less than 12K, from end of device
1699 	 * 1: At start of device
1700 	 * 2: 4K from start of device.
1701 	 */
1702 	switch(minor_version) {
1703 	case 0:
1704 		sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2;
1705 		sb_start &= ~(sector_t)(4*2-1);
1706 		break;
1707 	case 1:
1708 		sb_start = 0;
1709 		break;
1710 	case 2:
1711 		sb_start = 8;
1712 		break;
1713 	default:
1714 		return -EINVAL;
1715 	}
1716 	rdev->sb_start = sb_start;
1717 
1718 	/* superblock is rarely larger than 1K, but it can be larger,
1719 	 * and it is safe to read 4k, so we do that
1720 	 */
1721 	ret = read_disk_sb(rdev, 4096);
1722 	if (ret) return ret;
1723 
1724 	sb = page_address(rdev->sb_page);
1725 
1726 	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1727 	    sb->major_version != cpu_to_le32(1) ||
1728 	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1729 	    le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1730 	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1731 		return -EINVAL;
1732 
1733 	if (calc_sb_1_csum(sb) != sb->sb_csum) {
1734 		pr_warn("md: invalid superblock checksum on %pg\n",
1735 			rdev->bdev);
1736 		return -EINVAL;
1737 	}
1738 	if (le64_to_cpu(sb->data_size) < 10) {
1739 		pr_warn("md: data_size too small on %pg\n",
1740 			rdev->bdev);
1741 		return -EINVAL;
1742 	}
1743 	if (sb->pad0 ||
1744 	    sb->pad3[0] ||
1745 	    memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
1746 		/* Some padding is non-zero, might be a new feature */
1747 		return -EINVAL;
1748 
1749 	rdev->preferred_minor = 0xffff;
1750 	rdev->data_offset = le64_to_cpu(sb->data_offset);
1751 	rdev->new_data_offset = rdev->data_offset;
1752 	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
1753 	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
1754 		rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
1755 	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1756 
1757 	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1758 	bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1759 	if (rdev->sb_size & bmask)
1760 		rdev->sb_size = (rdev->sb_size | bmask) + 1;
1761 
1762 	if (minor_version
1763 	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
1764 		return -EINVAL;
1765 	if (minor_version
1766 	    && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
1767 		return -EINVAL;
1768 
1769 	rdev->desc_nr = le32_to_cpu(sb->dev_number);
1770 
1771 	if (!rdev->bb_page) {
1772 		rdev->bb_page = alloc_page(GFP_KERNEL);
1773 		if (!rdev->bb_page)
1774 			return -ENOMEM;
1775 	}
1776 	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
1777 	    rdev->badblocks.count == 0) {
1778 		/* need to load the bad block list.
1779 		 * Currently we limit it to one page.
1780 		 */
1781 		s32 offset;
1782 		sector_t bb_sector;
1783 		__le64 *bbp;
1784 		int i;
1785 		int sectors = le16_to_cpu(sb->bblog_size);
1786 		if (sectors > (PAGE_SIZE / 512))
1787 			return -EINVAL;
1788 		offset = le32_to_cpu(sb->bblog_offset);
1789 		if (offset == 0)
1790 			return -EINVAL;
1791 		bb_sector = (long long)offset;
1792 		if (!sync_page_io(rdev, bb_sector, sectors << 9,
1793 				  rdev->bb_page, REQ_OP_READ, true))
1794 			return -EIO;
1795 		bbp = (__le64 *)page_address(rdev->bb_page);
1796 		rdev->badblocks.shift = sb->bblog_shift;
1797 		for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
1798 			u64 bb = le64_to_cpu(*bbp);
1799 			int count = bb & (0x3ff);
1800 			u64 sector = bb >> 10;
1801 			sector <<= sb->bblog_shift;
1802 			count <<= sb->bblog_shift;
1803 			if (bb + 1 == 0)
1804 				break;
1805 			if (!badblocks_set(&rdev->badblocks, sector, count, 1))
1806 				return -EINVAL;
1807 		}
1808 	} else if (sb->bblog_offset != 0)
1809 		rdev->badblocks.shift = 0;
1810 
1811 	if ((le32_to_cpu(sb->feature_map) &
1812 	    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) {
1813 		rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
1814 		rdev->ppl.size = le16_to_cpu(sb->ppl.size);
1815 		rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
1816 	}
1817 
1818 	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) &&
1819 	    sb->level != 0)
1820 		return -EINVAL;
1821 
1822 	/* not spare disk */
1823 	if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1824 	    (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
1825 	     le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
1826 		spare_disk = false;
1827 
1828 	if (!refdev) {
1829 		if (!spare_disk)
1830 			ret = 1;
1831 		else
1832 			ret = 0;
1833 	} else {
1834 		__u64 ev1, ev2;
1835 		struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
1836 
1837 		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1838 		    sb->level != refsb->level ||
1839 		    sb->layout != refsb->layout ||
1840 		    sb->chunksize != refsb->chunksize) {
1841 			pr_warn("md: %pg has strangely different superblock to %pg\n",
1842 				rdev->bdev,
1843 				refdev->bdev);
1844 			return -EINVAL;
1845 		}
1846 		ev1 = le64_to_cpu(sb->events);
1847 		ev2 = le64_to_cpu(refsb->events);
1848 
1849 		if (!spare_disk && ev1 > ev2)
1850 			ret = 1;
1851 		else
1852 			ret = 0;
1853 	}
1854 	if (minor_version)
1855 		sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset;
1856 	else
1857 		sectors = rdev->sb_start;
1858 	if (sectors < le64_to_cpu(sb->data_size))
1859 		return -EINVAL;
1860 	rdev->sectors = le64_to_cpu(sb->data_size);
1861 	return ret;
1862 }
1863 
super_1_validate(struct mddev * mddev,struct md_rdev * freshest,struct md_rdev * rdev)1864 static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
1865 {
1866 	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1867 	__u64 ev1 = le64_to_cpu(sb->events);
1868 	int role;
1869 
1870 	rdev->raid_disk = -1;
1871 	clear_bit(Faulty, &rdev->flags);
1872 	clear_bit(In_sync, &rdev->flags);
1873 	clear_bit(Bitmap_sync, &rdev->flags);
1874 	clear_bit(WriteMostly, &rdev->flags);
1875 
1876 	if (mddev->raid_disks == 0) {
1877 		mddev->major_version = 1;
1878 		mddev->patch_version = 0;
1879 		mddev->external = 0;
1880 		mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1881 		mddev->ctime = le64_to_cpu(sb->ctime);
1882 		mddev->utime = le64_to_cpu(sb->utime);
1883 		mddev->level = le32_to_cpu(sb->level);
1884 		mddev->clevel[0] = 0;
1885 		mddev->layout = le32_to_cpu(sb->layout);
1886 		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1887 		mddev->dev_sectors = le64_to_cpu(sb->size);
1888 		mddev->events = ev1;
1889 		mddev->bitmap_info.offset = 0;
1890 		mddev->bitmap_info.space = 0;
1891 		/* Default location for bitmap is 1K after superblock
1892 		 * using 3K - total of 4K
1893 		 */
1894 		mddev->bitmap_info.default_offset = 1024 >> 9;
1895 		mddev->bitmap_info.default_space = (4096-1024) >> 9;
1896 		mddev->reshape_backwards = 0;
1897 
1898 		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1899 		memcpy(mddev->uuid, sb->set_uuid, 16);
1900 
1901 		mddev->max_disks =  (4096-256)/2;
1902 
1903 		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1904 		    mddev->bitmap_info.file == NULL) {
1905 			mddev->bitmap_info.offset =
1906 				(__s32)le32_to_cpu(sb->bitmap_offset);
1907 			/* Metadata doesn't record how much space is available.
1908 			 * For 1.0, we assume we can use up to the superblock
1909 			 * if before, else to 4K beyond superblock.
1910 			 * For others, assume no change is possible.
1911 			 */
1912 			if (mddev->minor_version > 0)
1913 				mddev->bitmap_info.space = 0;
1914 			else if (mddev->bitmap_info.offset > 0)
1915 				mddev->bitmap_info.space =
1916 					8 - mddev->bitmap_info.offset;
1917 			else
1918 				mddev->bitmap_info.space =
1919 					-mddev->bitmap_info.offset;
1920 		}
1921 
1922 		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1923 			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1924 			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1925 			mddev->new_level = le32_to_cpu(sb->new_level);
1926 			mddev->new_layout = le32_to_cpu(sb->new_layout);
1927 			mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1928 			if (mddev->delta_disks < 0 ||
1929 			    (mddev->delta_disks == 0 &&
1930 			     (le32_to_cpu(sb->feature_map)
1931 			      & MD_FEATURE_RESHAPE_BACKWARDS)))
1932 				mddev->reshape_backwards = 1;
1933 		} else {
1934 			mddev->reshape_position = MaxSector;
1935 			mddev->delta_disks = 0;
1936 			mddev->new_level = mddev->level;
1937 			mddev->new_layout = mddev->layout;
1938 			mddev->new_chunk_sectors = mddev->chunk_sectors;
1939 		}
1940 
1941 		if (mddev->level == 0 &&
1942 		    !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT))
1943 			mddev->layout = -1;
1944 
1945 		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
1946 			set_bit(MD_HAS_JOURNAL, &mddev->flags);
1947 
1948 		if (le32_to_cpu(sb->feature_map) &
1949 		    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) {
1950 			if (le32_to_cpu(sb->feature_map) &
1951 			    (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
1952 				return -EINVAL;
1953 			if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) &&
1954 			    (le32_to_cpu(sb->feature_map) &
1955 					    MD_FEATURE_MULTIPLE_PPLS))
1956 				return -EINVAL;
1957 			set_bit(MD_HAS_PPL, &mddev->flags);
1958 		}
1959 	} else if (mddev->pers == NULL) {
1960 		/* Insist of good event counter while assembling, except for
1961 		 * spares (which don't need an event count).
1962 		 * Similar to mdadm, we allow event counter difference of 1
1963 		 * from the freshest device.
1964 		 */
1965 		if (rdev->desc_nr >= 0 &&
1966 		    rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1967 		    (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
1968 		     le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
1969 			if (ev1 + 1 < mddev->events)
1970 				return -EINVAL;
1971 	} else if (mddev->bitmap) {
1972 		/* If adding to array with a bitmap, then we can accept an
1973 		 * older device, but not too old.
1974 		 */
1975 		if (ev1 < md_bitmap_events_cleared(mddev))
1976 			return 0;
1977 		if (ev1 < mddev->events)
1978 			set_bit(Bitmap_sync, &rdev->flags);
1979 	} else {
1980 		if (ev1 < mddev->events)
1981 			/* just a hot-add of a new device, leave raid_disk at -1 */
1982 			return 0;
1983 	}
1984 
1985 	if (rdev->desc_nr < 0 ||
1986 	    rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1987 		role = MD_DISK_ROLE_SPARE;
1988 		rdev->desc_nr = -1;
1989 	} else if (mddev->pers == NULL && freshest && ev1 < mddev->events) {
1990 		/*
1991 		 * If we are assembling, and our event counter is smaller than the
1992 		 * highest event counter, we cannot trust our superblock about the role.
1993 		 * It could happen that our rdev was marked as Faulty, and all other
1994 		 * superblocks were updated with +1 event counter.
1995 		 * Then, before the next superblock update, which typically happens when
1996 		 * remove_and_add_spares() removes the device from the array, there was
1997 		 * a crash or reboot.
1998 		 * If we allow current rdev without consulting the freshest superblock,
1999 		 * we could cause data corruption.
2000 		 * Note that in this case our event counter is smaller by 1 than the
2001 		 * highest, otherwise, this rdev would not be allowed into array;
2002 		 * both kernel and mdadm allow event counter difference of 1.
2003 		 */
2004 		struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page);
2005 		u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev);
2006 
2007 		if (rdev->desc_nr >= freshest_max_dev) {
2008 			/* this is unexpected, better not proceed */
2009 			pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n",
2010 				mdname(mddev), rdev->bdev, rdev->desc_nr,
2011 				freshest->bdev, freshest_max_dev);
2012 			return -EUCLEAN;
2013 		}
2014 
2015 		role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]);
2016 		pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n",
2017 			 mdname(mddev), rdev->bdev, role, role, freshest->bdev);
2018 	} else {
2019 		role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
2020 	}
2021 	switch (role) {
2022 	case MD_DISK_ROLE_SPARE: /* spare */
2023 		break;
2024 	case MD_DISK_ROLE_FAULTY: /* faulty */
2025 		set_bit(Faulty, &rdev->flags);
2026 		break;
2027 	case MD_DISK_ROLE_JOURNAL: /* journal device */
2028 		if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
2029 			/* journal device without journal feature */
2030 			pr_warn("md: journal device provided without journal feature, ignoring the device\n");
2031 			return -EINVAL;
2032 		}
2033 		set_bit(Journal, &rdev->flags);
2034 		rdev->journal_tail = le64_to_cpu(sb->journal_tail);
2035 		rdev->raid_disk = 0;
2036 		break;
2037 	default:
2038 		rdev->saved_raid_disk = role;
2039 		if ((le32_to_cpu(sb->feature_map) &
2040 		     MD_FEATURE_RECOVERY_OFFSET)) {
2041 			rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
2042 			if (!(le32_to_cpu(sb->feature_map) &
2043 			      MD_FEATURE_RECOVERY_BITMAP))
2044 				rdev->saved_raid_disk = -1;
2045 		} else {
2046 			/*
2047 			 * If the array is FROZEN, then the device can't
2048 			 * be in_sync with rest of array.
2049 			 */
2050 			if (!test_bit(MD_RECOVERY_FROZEN,
2051 				      &mddev->recovery))
2052 				set_bit(In_sync, &rdev->flags);
2053 		}
2054 		rdev->raid_disk = role;
2055 		break;
2056 	}
2057 	if (sb->devflags & WriteMostly1)
2058 		set_bit(WriteMostly, &rdev->flags);
2059 	if (sb->devflags & FailFast1)
2060 		set_bit(FailFast, &rdev->flags);
2061 	if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
2062 		set_bit(Replacement, &rdev->flags);
2063 
2064 	return 0;
2065 }
2066 
super_1_sync(struct mddev * mddev,struct md_rdev * rdev)2067 static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
2068 {
2069 	struct mdp_superblock_1 *sb;
2070 	struct md_rdev *rdev2;
2071 	int max_dev, i;
2072 	/* make rdev->sb match mddev and rdev data. */
2073 
2074 	sb = page_address(rdev->sb_page);
2075 
2076 	sb->feature_map = 0;
2077 	sb->pad0 = 0;
2078 	sb->recovery_offset = cpu_to_le64(0);
2079 	memset(sb->pad3, 0, sizeof(sb->pad3));
2080 
2081 	sb->utime = cpu_to_le64((__u64)mddev->utime);
2082 	sb->events = cpu_to_le64(mddev->events);
2083 	if (mddev->in_sync)
2084 		sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
2085 	else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
2086 		sb->resync_offset = cpu_to_le64(MaxSector);
2087 	else
2088 		sb->resync_offset = cpu_to_le64(0);
2089 
2090 	sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
2091 
2092 	sb->raid_disks = cpu_to_le32(mddev->raid_disks);
2093 	sb->size = cpu_to_le64(mddev->dev_sectors);
2094 	sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
2095 	sb->level = cpu_to_le32(mddev->level);
2096 	sb->layout = cpu_to_le32(mddev->layout);
2097 	if (test_bit(FailFast, &rdev->flags))
2098 		sb->devflags |= FailFast1;
2099 	else
2100 		sb->devflags &= ~FailFast1;
2101 
2102 	if (test_bit(WriteMostly, &rdev->flags))
2103 		sb->devflags |= WriteMostly1;
2104 	else
2105 		sb->devflags &= ~WriteMostly1;
2106 	sb->data_offset = cpu_to_le64(rdev->data_offset);
2107 	sb->data_size = cpu_to_le64(rdev->sectors);
2108 
2109 	if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
2110 		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
2111 		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
2112 	}
2113 
2114 	if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
2115 	    !test_bit(In_sync, &rdev->flags)) {
2116 		sb->feature_map |=
2117 			cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
2118 		sb->recovery_offset =
2119 			cpu_to_le64(rdev->recovery_offset);
2120 		if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
2121 			sb->feature_map |=
2122 				cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
2123 	}
2124 	/* Note: recovery_offset and journal_tail share space  */
2125 	if (test_bit(Journal, &rdev->flags))
2126 		sb->journal_tail = cpu_to_le64(rdev->journal_tail);
2127 	if (test_bit(Replacement, &rdev->flags))
2128 		sb->feature_map |=
2129 			cpu_to_le32(MD_FEATURE_REPLACEMENT);
2130 
2131 	if (mddev->reshape_position != MaxSector) {
2132 		sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
2133 		sb->reshape_position = cpu_to_le64(mddev->reshape_position);
2134 		sb->new_layout = cpu_to_le32(mddev->new_layout);
2135 		sb->delta_disks = cpu_to_le32(mddev->delta_disks);
2136 		sb->new_level = cpu_to_le32(mddev->new_level);
2137 		sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
2138 		if (mddev->delta_disks == 0 &&
2139 		    mddev->reshape_backwards)
2140 			sb->feature_map
2141 				|= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
2142 		if (rdev->new_data_offset != rdev->data_offset) {
2143 			sb->feature_map
2144 				|= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
2145 			sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
2146 							     - rdev->data_offset));
2147 		}
2148 	}
2149 
2150 	if (mddev_is_clustered(mddev))
2151 		sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);
2152 
2153 	if (rdev->badblocks.count == 0)
2154 		/* Nothing to do for bad blocks*/ ;
2155 	else if (sb->bblog_offset == 0)
2156 		/* Cannot record bad blocks on this device */
2157 		md_error(mddev, rdev);
2158 	else {
2159 		struct badblocks *bb = &rdev->badblocks;
2160 		__le64 *bbp = (__le64 *)page_address(rdev->bb_page);
2161 		u64 *p = bb->page;
2162 		sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
2163 		if (bb->changed) {
2164 			unsigned seq;
2165 
2166 retry:
2167 			seq = read_seqbegin(&bb->lock);
2168 
2169 			memset(bbp, 0xff, PAGE_SIZE);
2170 
2171 			for (i = 0 ; i < bb->count ; i++) {
2172 				u64 internal_bb = p[i];
2173 				u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
2174 						| BB_LEN(internal_bb));
2175 				bbp[i] = cpu_to_le64(store_bb);
2176 			}
2177 			bb->changed = 0;
2178 			if (read_seqretry(&bb->lock, seq))
2179 				goto retry;
2180 
2181 			bb->sector = (rdev->sb_start +
2182 				      (int)le32_to_cpu(sb->bblog_offset));
2183 			bb->size = le16_to_cpu(sb->bblog_size);
2184 		}
2185 	}
2186 
2187 	max_dev = 0;
2188 	rdev_for_each(rdev2, mddev)
2189 		if (rdev2->desc_nr+1 > max_dev)
2190 			max_dev = rdev2->desc_nr+1;
2191 
2192 	if (max_dev > le32_to_cpu(sb->max_dev)) {
2193 		int bmask;
2194 		sb->max_dev = cpu_to_le32(max_dev);
2195 		rdev->sb_size = max_dev * 2 + 256;
2196 		bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
2197 		if (rdev->sb_size & bmask)
2198 			rdev->sb_size = (rdev->sb_size | bmask) + 1;
2199 	} else
2200 		max_dev = le32_to_cpu(sb->max_dev);
2201 
2202 	for (i=0; i<max_dev;i++)
2203 		sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
2204 
2205 	if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
2206 		sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
2207 
2208 	if (test_bit(MD_HAS_PPL, &mddev->flags)) {
2209 		if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags))
2210 			sb->feature_map |=
2211 			    cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS);
2212 		else
2213 			sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
2214 		sb->ppl.offset = cpu_to_le16(rdev->ppl.offset);
2215 		sb->ppl.size = cpu_to_le16(rdev->ppl.size);
2216 	}
2217 
2218 	rdev_for_each(rdev2, mddev) {
2219 		i = rdev2->desc_nr;
2220 		if (test_bit(Faulty, &rdev2->flags))
2221 			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
2222 		else if (test_bit(In_sync, &rdev2->flags))
2223 			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
2224 		else if (test_bit(Journal, &rdev2->flags))
2225 			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
2226 		else if (rdev2->raid_disk >= 0)
2227 			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
2228 		else
2229 			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
2230 	}
2231 
2232 	sb->sb_csum = calc_sb_1_csum(sb);
2233 }
2234 
super_1_choose_bm_space(sector_t dev_size)2235 static sector_t super_1_choose_bm_space(sector_t dev_size)
2236 {
2237 	sector_t bm_space;
2238 
2239 	/* if the device is bigger than 8Gig, save 64k for bitmap
2240 	 * usage, if bigger than 200Gig, save 128k
2241 	 */
2242 	if (dev_size < 64*2)
2243 		bm_space = 0;
2244 	else if (dev_size - 64*2 >= 200*1024*1024*2)
2245 		bm_space = 128*2;
2246 	else if (dev_size - 4*2 > 8*1024*1024*2)
2247 		bm_space = 64*2;
2248 	else
2249 		bm_space = 4*2;
2250 	return bm_space;
2251 }
2252 
2253 static unsigned long long
super_1_rdev_size_change(struct md_rdev * rdev,sector_t num_sectors)2254 super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
2255 {
2256 	struct mdp_superblock_1 *sb;
2257 	sector_t max_sectors;
2258 	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
2259 		return 0; /* component must fit device */
2260 	if (rdev->data_offset != rdev->new_data_offset)
2261 		return 0; /* too confusing */
2262 	if (rdev->sb_start < rdev->data_offset) {
2263 		/* minor versions 1 and 2; superblock before data */
2264 		max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset;
2265 		if (!num_sectors || num_sectors > max_sectors)
2266 			num_sectors = max_sectors;
2267 	} else if (rdev->mddev->bitmap_info.offset) {
2268 		/* minor version 0 with bitmap we can't move */
2269 		return 0;
2270 	} else {
2271 		/* minor version 0; superblock after data */
2272 		sector_t sb_start, bm_space;
2273 		sector_t dev_size = bdev_nr_sectors(rdev->bdev);
2274 
2275 		/* 8K is for superblock */
2276 		sb_start = dev_size - 8*2;
2277 		sb_start &= ~(sector_t)(4*2 - 1);
2278 
2279 		bm_space = super_1_choose_bm_space(dev_size);
2280 
2281 		/* Space that can be used to store date needs to decrease
2282 		 * superblock bitmap space and bad block space(4K)
2283 		 */
2284 		max_sectors = sb_start - bm_space - 4*2;
2285 
2286 		if (!num_sectors || num_sectors > max_sectors)
2287 			num_sectors = max_sectors;
2288 		rdev->sb_start = sb_start;
2289 	}
2290 	sb = page_address(rdev->sb_page);
2291 	sb->data_size = cpu_to_le64(num_sectors);
2292 	sb->super_offset = cpu_to_le64(rdev->sb_start);
2293 	sb->sb_csum = calc_sb_1_csum(sb);
2294 	do {
2295 		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
2296 			       rdev->sb_page);
2297 	} while (md_super_wait(rdev->mddev) < 0);
2298 	return num_sectors;
2299 
2300 }
2301 
2302 static int
super_1_allow_new_offset(struct md_rdev * rdev,unsigned long long new_offset)2303 super_1_allow_new_offset(struct md_rdev *rdev,
2304 			 unsigned long long new_offset)
2305 {
2306 	/* All necessary checks on new >= old have been done */
2307 	if (new_offset >= rdev->data_offset)
2308 		return 1;
2309 
2310 	/* with 1.0 metadata, there is no metadata to tread on
2311 	 * so we can always move back */
2312 	if (rdev->mddev->minor_version == 0)
2313 		return 1;
2314 
2315 	/* otherwise we must be sure not to step on
2316 	 * any metadata, so stay:
2317 	 * 36K beyond start of superblock
2318 	 * beyond end of badblocks
2319 	 * beyond write-intent bitmap
2320 	 */
2321 	if (rdev->sb_start + (32+4)*2 > new_offset)
2322 		return 0;
2323 
2324 	if (!rdev->mddev->bitmap_info.file) {
2325 		struct mddev *mddev = rdev->mddev;
2326 		struct md_bitmap_stats stats;
2327 		int err;
2328 
2329 		err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
2330 		if (!err && rdev->sb_start + mddev->bitmap_info.offset +
2331 		    stats.file_pages * (PAGE_SIZE >> 9) > new_offset)
2332 			return 0;
2333 	}
2334 
2335 	if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
2336 		return 0;
2337 
2338 	return 1;
2339 }
2340 
2341 static struct super_type super_types[] = {
2342 	[0] = {
2343 		.name	= "0.90.0",
2344 		.owner	= THIS_MODULE,
2345 		.load_super	    = super_90_load,
2346 		.validate_super	    = super_90_validate,
2347 		.sync_super	    = super_90_sync,
2348 		.rdev_size_change   = super_90_rdev_size_change,
2349 		.allow_new_offset   = super_90_allow_new_offset,
2350 	},
2351 	[1] = {
2352 		.name	= "md-1",
2353 		.owner	= THIS_MODULE,
2354 		.load_super	    = super_1_load,
2355 		.validate_super	    = super_1_validate,
2356 		.sync_super	    = super_1_sync,
2357 		.rdev_size_change   = super_1_rdev_size_change,
2358 		.allow_new_offset   = super_1_allow_new_offset,
2359 	},
2360 };
2361 
sync_super(struct mddev * mddev,struct md_rdev * rdev)2362 static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
2363 {
2364 	if (mddev->sync_super) {
2365 		mddev->sync_super(mddev, rdev);
2366 		return;
2367 	}
2368 
2369 	BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));
2370 
2371 	super_types[mddev->major_version].sync_super(mddev, rdev);
2372 }
2373 
match_mddev_units(struct mddev * mddev1,struct mddev * mddev2)2374 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
2375 {
2376 	struct md_rdev *rdev, *rdev2;
2377 
2378 	rcu_read_lock();
2379 	rdev_for_each_rcu(rdev, mddev1) {
2380 		if (test_bit(Faulty, &rdev->flags) ||
2381 		    test_bit(Journal, &rdev->flags) ||
2382 		    rdev->raid_disk == -1)
2383 			continue;
2384 		rdev_for_each_rcu(rdev2, mddev2) {
2385 			if (test_bit(Faulty, &rdev2->flags) ||
2386 			    test_bit(Journal, &rdev2->flags) ||
2387 			    rdev2->raid_disk == -1)
2388 				continue;
2389 			if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) {
2390 				rcu_read_unlock();
2391 				return 1;
2392 			}
2393 		}
2394 	}
2395 	rcu_read_unlock();
2396 	return 0;
2397 }
2398 
2399 static LIST_HEAD(pending_raid_disks);
2400 
2401 /*
2402  * Try to register data integrity profile for an mddev
2403  *
2404  * This is called when an array is started and after a disk has been kicked
2405  * from the array. It only succeeds if all working and active component devices
2406  * are integrity capable with matching profiles.
2407  */
md_integrity_register(struct mddev * mddev)2408 int md_integrity_register(struct mddev *mddev)
2409 {
2410 	if (list_empty(&mddev->disks))
2411 		return 0; /* nothing to do */
2412 	if (mddev_is_dm(mddev) || !blk_get_integrity(mddev->gendisk))
2413 		return 0; /* shouldn't register */
2414 
2415 	pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
2416 	return 0;
2417 }
2418 EXPORT_SYMBOL(md_integrity_register);
2419 
rdev_read_only(struct md_rdev * rdev)2420 static bool rdev_read_only(struct md_rdev *rdev)
2421 {
2422 	return bdev_read_only(rdev->bdev) ||
2423 		(rdev->meta_bdev && bdev_read_only(rdev->meta_bdev));
2424 }
2425 
bind_rdev_to_array(struct md_rdev * rdev,struct mddev * mddev)2426 static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
2427 {
2428 	char b[BDEVNAME_SIZE];
2429 	int err;
2430 
2431 	/* prevent duplicates */
2432 	if (find_rdev(mddev, rdev->bdev->bd_dev))
2433 		return -EEXIST;
2434 
2435 	if (rdev_read_only(rdev) && mddev->pers)
2436 		return -EROFS;
2437 
2438 	/* make sure rdev->sectors exceeds mddev->dev_sectors */
2439 	if (!test_bit(Journal, &rdev->flags) &&
2440 	    rdev->sectors &&
2441 	    (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) {
2442 		if (mddev->pers) {
2443 			/* Cannot change size, so fail
2444 			 * If mddev->level <= 0, then we don't care
2445 			 * about aligning sizes (e.g. linear)
2446 			 */
2447 			if (mddev->level > 0)
2448 				return -ENOSPC;
2449 		} else
2450 			mddev->dev_sectors = rdev->sectors;
2451 	}
2452 
2453 	/* Verify rdev->desc_nr is unique.
2454 	 * If it is -1, assign a free number, else
2455 	 * check number is not in use
2456 	 */
2457 	rcu_read_lock();
2458 	if (rdev->desc_nr < 0) {
2459 		int choice = 0;
2460 		if (mddev->pers)
2461 			choice = mddev->raid_disks;
2462 		while (md_find_rdev_nr_rcu(mddev, choice))
2463 			choice++;
2464 		rdev->desc_nr = choice;
2465 	} else {
2466 		if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
2467 			rcu_read_unlock();
2468 			return -EBUSY;
2469 		}
2470 	}
2471 	rcu_read_unlock();
2472 	if (!test_bit(Journal, &rdev->flags) &&
2473 	    mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
2474 		pr_warn("md: %s: array is limited to %d devices\n",
2475 			mdname(mddev), mddev->max_disks);
2476 		return -EBUSY;
2477 	}
2478 	snprintf(b, sizeof(b), "%pg", rdev->bdev);
2479 	strreplace(b, '/', '!');
2480 
2481 	rdev->mddev = mddev;
2482 	pr_debug("md: bind<%s>\n", b);
2483 
2484 	if (mddev->raid_disks)
2485 		mddev_create_serial_pool(mddev, rdev);
2486 
2487 	if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
2488 		goto fail;
2489 
2490 	/* failure here is OK */
2491 	err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block");
2492 	rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
2493 	rdev->sysfs_unack_badblocks =
2494 		sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks");
2495 	rdev->sysfs_badblocks =
2496 		sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks");
2497 
2498 	list_add_rcu(&rdev->same_set, &mddev->disks);
2499 	bd_link_disk_holder(rdev->bdev, mddev->gendisk);
2500 
2501 	/* May as well allow recovery to be retried once */
2502 	mddev->recovery_disabled++;
2503 
2504 	return 0;
2505 
2506  fail:
2507 	pr_warn("md: failed to register dev-%s for %s\n",
2508 		b, mdname(mddev));
2509 	mddev_destroy_serial_pool(mddev, rdev);
2510 	return err;
2511 }
2512 
2513 void md_autodetect_dev(dev_t dev);
2514 
2515 /* just for claiming the bdev */
2516 static struct md_rdev claim_rdev;
2517 
export_rdev(struct md_rdev * rdev,struct mddev * mddev)2518 static void export_rdev(struct md_rdev *rdev, struct mddev *mddev)
2519 {
2520 	pr_debug("md: export_rdev(%pg)\n", rdev->bdev);
2521 	md_rdev_clear(rdev);
2522 #ifndef MODULE
2523 	if (test_bit(AutoDetected, &rdev->flags))
2524 		md_autodetect_dev(rdev->bdev->bd_dev);
2525 #endif
2526 	fput(rdev->bdev_file);
2527 	rdev->bdev = NULL;
2528 	kobject_put(&rdev->kobj);
2529 }
2530 
md_kick_rdev_from_array(struct md_rdev * rdev)2531 static void md_kick_rdev_from_array(struct md_rdev *rdev)
2532 {
2533 	struct mddev *mddev = rdev->mddev;
2534 
2535 	bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
2536 	list_del_rcu(&rdev->same_set);
2537 	pr_debug("md: unbind<%pg>\n", rdev->bdev);
2538 	mddev_destroy_serial_pool(rdev->mddev, rdev);
2539 	WRITE_ONCE(rdev->mddev, NULL);
2540 	sysfs_remove_link(&rdev->kobj, "block");
2541 	sysfs_put(rdev->sysfs_state);
2542 	sysfs_put(rdev->sysfs_unack_badblocks);
2543 	sysfs_put(rdev->sysfs_badblocks);
2544 	rdev->sysfs_state = NULL;
2545 	rdev->sysfs_unack_badblocks = NULL;
2546 	rdev->sysfs_badblocks = NULL;
2547 	rdev->badblocks.count = 0;
2548 
2549 	synchronize_rcu();
2550 
2551 	/*
2552 	 * kobject_del() will wait for all in progress writers to be done, where
2553 	 * reconfig_mutex is held, hence it can't be called under
2554 	 * reconfig_mutex and it's delayed to mddev_unlock().
2555 	 */
2556 	list_add(&rdev->same_set, &mddev->deleting);
2557 }
2558 
export_array(struct mddev * mddev)2559 static void export_array(struct mddev *mddev)
2560 {
2561 	struct md_rdev *rdev;
2562 
2563 	while (!list_empty(&mddev->disks)) {
2564 		rdev = list_first_entry(&mddev->disks, struct md_rdev,
2565 					same_set);
2566 		md_kick_rdev_from_array(rdev);
2567 	}
2568 	mddev->raid_disks = 0;
2569 	mddev->major_version = 0;
2570 }
2571 
set_in_sync(struct mddev * mddev)2572 static bool set_in_sync(struct mddev *mddev)
2573 {
2574 	lockdep_assert_held(&mddev->lock);
2575 	if (!mddev->in_sync) {
2576 		mddev->sync_checkers++;
2577 		spin_unlock(&mddev->lock);
2578 		percpu_ref_switch_to_atomic_sync(&mddev->writes_pending);
2579 		spin_lock(&mddev->lock);
2580 		if (!mddev->in_sync &&
2581 		    percpu_ref_is_zero(&mddev->writes_pending)) {
2582 			mddev->in_sync = 1;
2583 			/*
2584 			 * Ensure ->in_sync is visible before we clear
2585 			 * ->sync_checkers.
2586 			 */
2587 			smp_mb();
2588 			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
2589 			sysfs_notify_dirent_safe(mddev->sysfs_state);
2590 		}
2591 		if (--mddev->sync_checkers == 0)
2592 			percpu_ref_switch_to_percpu(&mddev->writes_pending);
2593 	}
2594 	if (mddev->safemode == 1)
2595 		mddev->safemode = 0;
2596 	return mddev->in_sync;
2597 }
2598 
sync_sbs(struct mddev * mddev,int nospares)2599 static void sync_sbs(struct mddev *mddev, int nospares)
2600 {
2601 	/* Update each superblock (in-memory image), but
2602 	 * if we are allowed to, skip spares which already
2603 	 * have the right event counter, or have one earlier
2604 	 * (which would mean they aren't being marked as dirty
2605 	 * with the rest of the array)
2606 	 */
2607 	struct md_rdev *rdev;
2608 	rdev_for_each(rdev, mddev) {
2609 		if (rdev->sb_events == mddev->events ||
2610 		    (nospares &&
2611 		     rdev->raid_disk < 0 &&
2612 		     rdev->sb_events+1 == mddev->events)) {
2613 			/* Don't update this superblock */
2614 			rdev->sb_loaded = 2;
2615 		} else {
2616 			sync_super(mddev, rdev);
2617 			rdev->sb_loaded = 1;
2618 		}
2619 	}
2620 }
2621 
does_sb_need_changing(struct mddev * mddev)2622 static bool does_sb_need_changing(struct mddev *mddev)
2623 {
2624 	struct md_rdev *rdev = NULL, *iter;
2625 	struct mdp_superblock_1 *sb;
2626 	int role;
2627 
2628 	/* Find a good rdev */
2629 	rdev_for_each(iter, mddev)
2630 		if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) {
2631 			rdev = iter;
2632 			break;
2633 		}
2634 
2635 	/* No good device found. */
2636 	if (!rdev)
2637 		return false;
2638 
2639 	sb = page_address(rdev->sb_page);
2640 	/* Check if a device has become faulty or a spare become active */
2641 	rdev_for_each(rdev, mddev) {
2642 		role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
2643 		/* Device activated? */
2644 		if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 &&
2645 		    !test_bit(Faulty, &rdev->flags))
2646 			return true;
2647 		/* Device turned faulty? */
2648 		if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX))
2649 			return true;
2650 	}
2651 
2652 	/* Check if any mddev parameters have changed */
2653 	if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
2654 	    (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
2655 	    (mddev->layout != le32_to_cpu(sb->layout)) ||
2656 	    (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
2657 	    (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
2658 		return true;
2659 
2660 	return false;
2661 }
2662 
md_update_sb(struct mddev * mddev,int force_change)2663 void md_update_sb(struct mddev *mddev, int force_change)
2664 {
2665 	struct md_rdev *rdev;
2666 	int sync_req;
2667 	int nospares = 0;
2668 	int any_badblocks_changed = 0;
2669 	int ret = -1;
2670 
2671 	if (!md_is_rdwr(mddev)) {
2672 		if (force_change)
2673 			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2674 		return;
2675 	}
2676 
2677 repeat:
2678 	if (mddev_is_clustered(mddev)) {
2679 		if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2680 			force_change = 1;
2681 		if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
2682 			nospares = 1;
2683 		ret = mddev->cluster_ops->metadata_update_start(mddev);
2684 		/* Has someone else has updated the sb */
2685 		if (!does_sb_need_changing(mddev)) {
2686 			if (ret == 0)
2687 				mddev->cluster_ops->metadata_update_cancel(mddev);
2688 			bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2689 							 BIT(MD_SB_CHANGE_DEVS) |
2690 							 BIT(MD_SB_CHANGE_CLEAN));
2691 			return;
2692 		}
2693 	}
2694 
2695 	/*
2696 	 * First make sure individual recovery_offsets are correct
2697 	 * curr_resync_completed can only be used during recovery.
2698 	 * During reshape/resync it might use array-addresses rather
2699 	 * that device addresses.
2700 	 */
2701 	rdev_for_each(rdev, mddev) {
2702 		if (rdev->raid_disk >= 0 &&
2703 		    mddev->delta_disks >= 0 &&
2704 		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
2705 		    test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
2706 		    !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
2707 		    !test_bit(Journal, &rdev->flags) &&
2708 		    !test_bit(In_sync, &rdev->flags) &&
2709 		    mddev->curr_resync_completed > rdev->recovery_offset)
2710 				rdev->recovery_offset = mddev->curr_resync_completed;
2711 
2712 	}
2713 	if (!mddev->persistent) {
2714 		clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
2715 		clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2716 		if (!mddev->external) {
2717 			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
2718 			rdev_for_each(rdev, mddev) {
2719 				if (rdev->badblocks.changed) {
2720 					rdev->badblocks.changed = 0;
2721 					ack_all_badblocks(&rdev->badblocks);
2722 					md_error(mddev, rdev);
2723 				}
2724 				clear_bit(Blocked, &rdev->flags);
2725 				clear_bit(BlockedBadBlocks, &rdev->flags);
2726 				wake_up(&rdev->blocked_wait);
2727 			}
2728 		}
2729 		wake_up(&mddev->sb_wait);
2730 		return;
2731 	}
2732 
2733 	spin_lock(&mddev->lock);
2734 
2735 	mddev->utime = ktime_get_real_seconds();
2736 
2737 	if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2738 		force_change = 1;
2739 	if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
2740 		/* just a clean<-> dirty transition, possibly leave spares alone,
2741 		 * though if events isn't the right even/odd, we will have to do
2742 		 * spares after all
2743 		 */
2744 		nospares = 1;
2745 	if (force_change)
2746 		nospares = 0;
2747 	if (mddev->degraded)
2748 		/* If the array is degraded, then skipping spares is both
2749 		 * dangerous and fairly pointless.
2750 		 * Dangerous because a device that was removed from the array
2751 		 * might have a event_count that still looks up-to-date,
2752 		 * so it can be re-added without a resync.
2753 		 * Pointless because if there are any spares to skip,
2754 		 * then a recovery will happen and soon that array won't
2755 		 * be degraded any more and the spare can go back to sleep then.
2756 		 */
2757 		nospares = 0;
2758 
2759 	sync_req = mddev->in_sync;
2760 
2761 	/* If this is just a dirty<->clean transition, and the array is clean
2762 	 * and 'events' is odd, we can roll back to the previous clean state */
2763 	if (nospares
2764 	    && (mddev->in_sync && mddev->recovery_cp == MaxSector)
2765 	    && mddev->can_decrease_events
2766 	    && mddev->events != 1) {
2767 		mddev->events--;
2768 		mddev->can_decrease_events = 0;
2769 	} else {
2770 		/* otherwise we have to go forward and ... */
2771 		mddev->events ++;
2772 		mddev->can_decrease_events = nospares;
2773 	}
2774 
2775 	/*
2776 	 * This 64-bit counter should never wrap.
2777 	 * Either we are in around ~1 trillion A.C., assuming
2778 	 * 1 reboot per second, or we have a bug...
2779 	 */
2780 	WARN_ON(mddev->events == 0);
2781 
2782 	rdev_for_each(rdev, mddev) {
2783 		if (rdev->badblocks.changed)
2784 			any_badblocks_changed++;
2785 		if (test_bit(Faulty, &rdev->flags))
2786 			set_bit(FaultRecorded, &rdev->flags);
2787 	}
2788 
2789 	sync_sbs(mddev, nospares);
2790 	spin_unlock(&mddev->lock);
2791 
2792 	pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
2793 		 mdname(mddev), mddev->in_sync);
2794 
2795 	mddev_add_trace_msg(mddev, "md md_update_sb");
2796 rewrite:
2797 	mddev->bitmap_ops->update_sb(mddev->bitmap);
2798 	rdev_for_each(rdev, mddev) {
2799 		if (rdev->sb_loaded != 1)
2800 			continue; /* no noise on spare devices */
2801 
2802 		if (!test_bit(Faulty, &rdev->flags)) {
2803 			md_super_write(mddev,rdev,
2804 				       rdev->sb_start, rdev->sb_size,
2805 				       rdev->sb_page);
2806 			pr_debug("md: (write) %pg's sb offset: %llu\n",
2807 				 rdev->bdev,
2808 				 (unsigned long long)rdev->sb_start);
2809 			rdev->sb_events = mddev->events;
2810 			if (rdev->badblocks.size) {
2811 				md_super_write(mddev, rdev,
2812 					       rdev->badblocks.sector,
2813 					       rdev->badblocks.size << 9,
2814 					       rdev->bb_page);
2815 				rdev->badblocks.size = 0;
2816 			}
2817 
2818 		} else
2819 			pr_debug("md: %pg (skipping faulty)\n",
2820 				 rdev->bdev);
2821 	}
2822 	if (md_super_wait(mddev) < 0)
2823 		goto rewrite;
2824 	/* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */
2825 
2826 	if (mddev_is_clustered(mddev) && ret == 0)
2827 		mddev->cluster_ops->metadata_update_finish(mddev);
2828 
2829 	if (mddev->in_sync != sync_req ||
2830 	    !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2831 			       BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)))
2832 		/* have to write it out again */
2833 		goto repeat;
2834 	wake_up(&mddev->sb_wait);
2835 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2836 		sysfs_notify_dirent_safe(mddev->sysfs_completed);
2837 
2838 	rdev_for_each(rdev, mddev) {
2839 		if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2840 			clear_bit(Blocked, &rdev->flags);
2841 
2842 		if (any_badblocks_changed)
2843 			ack_all_badblocks(&rdev->badblocks);
2844 		clear_bit(BlockedBadBlocks, &rdev->flags);
2845 		wake_up(&rdev->blocked_wait);
2846 	}
2847 }
2848 EXPORT_SYMBOL(md_update_sb);
2849 
add_bound_rdev(struct md_rdev * rdev)2850 static int add_bound_rdev(struct md_rdev *rdev)
2851 {
2852 	struct mddev *mddev = rdev->mddev;
2853 	int err = 0;
2854 	bool add_journal = test_bit(Journal, &rdev->flags);
2855 
2856 	if (!mddev->pers->hot_remove_disk || add_journal) {
2857 		/* If there is hot_add_disk but no hot_remove_disk
2858 		 * then added disks for geometry changes,
2859 		 * and should be added immediately.
2860 		 */
2861 		super_types[mddev->major_version].
2862 			validate_super(mddev, NULL/*freshest*/, rdev);
2863 		err = mddev->pers->hot_add_disk(mddev, rdev);
2864 		if (err) {
2865 			md_kick_rdev_from_array(rdev);
2866 			return err;
2867 		}
2868 	}
2869 	sysfs_notify_dirent_safe(rdev->sysfs_state);
2870 
2871 	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2872 	if (mddev->degraded)
2873 		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
2874 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2875 	md_new_event();
2876 	return 0;
2877 }
2878 
2879 /* words written to sysfs files may, or may not, be \n terminated.
2880  * We want to accept with case. For this we use cmd_match.
2881  */
cmd_match(const char * cmd,const char * str)2882 static int cmd_match(const char *cmd, const char *str)
2883 {
2884 	/* See if cmd, written into a sysfs file, matches
2885 	 * str.  They must either be the same, or cmd can
2886 	 * have a trailing newline
2887 	 */
2888 	while (*cmd && *str && *cmd == *str) {
2889 		cmd++;
2890 		str++;
2891 	}
2892 	if (*cmd == '\n')
2893 		cmd++;
2894 	if (*str || *cmd)
2895 		return 0;
2896 	return 1;
2897 }
2898 
2899 struct rdev_sysfs_entry {
2900 	struct attribute attr;
2901 	ssize_t (*show)(struct md_rdev *, char *);
2902 	ssize_t (*store)(struct md_rdev *, const char *, size_t);
2903 };
2904 
2905 static ssize_t
state_show(struct md_rdev * rdev,char * page)2906 state_show(struct md_rdev *rdev, char *page)
2907 {
2908 	char *sep = ",";
2909 	size_t len = 0;
2910 	unsigned long flags = READ_ONCE(rdev->flags);
2911 
2912 	if (test_bit(Faulty, &flags) ||
2913 	    (!test_bit(ExternalBbl, &flags) &&
2914 	    rdev->badblocks.unacked_exist))
2915 		len += sprintf(page+len, "faulty%s", sep);
2916 	if (test_bit(In_sync, &flags))
2917 		len += sprintf(page+len, "in_sync%s", sep);
2918 	if (test_bit(Journal, &flags))
2919 		len += sprintf(page+len, "journal%s", sep);
2920 	if (test_bit(WriteMostly, &flags))
2921 		len += sprintf(page+len, "write_mostly%s", sep);
2922 	if (test_bit(Blocked, &flags) ||
2923 	    (rdev->badblocks.unacked_exist
2924 	     && !test_bit(Faulty, &flags)))
2925 		len += sprintf(page+len, "blocked%s", sep);
2926 	if (!test_bit(Faulty, &flags) &&
2927 	    !test_bit(Journal, &flags) &&
2928 	    !test_bit(In_sync, &flags))
2929 		len += sprintf(page+len, "spare%s", sep);
2930 	if (test_bit(WriteErrorSeen, &flags))
2931 		len += sprintf(page+len, "write_error%s", sep);
2932 	if (test_bit(WantReplacement, &flags))
2933 		len += sprintf(page+len, "want_replacement%s", sep);
2934 	if (test_bit(Replacement, &flags))
2935 		len += sprintf(page+len, "replacement%s", sep);
2936 	if (test_bit(ExternalBbl, &flags))
2937 		len += sprintf(page+len, "external_bbl%s", sep);
2938 	if (test_bit(FailFast, &flags))
2939 		len += sprintf(page+len, "failfast%s", sep);
2940 
2941 	if (len)
2942 		len -= strlen(sep);
2943 
2944 	return len+sprintf(page+len, "\n");
2945 }
2946 
2947 static ssize_t
state_store(struct md_rdev * rdev,const char * buf,size_t len)2948 state_store(struct md_rdev *rdev, const char *buf, size_t len)
2949 {
2950 	/* can write
2951 	 *  faulty  - simulates an error
2952 	 *  remove  - disconnects the device
2953 	 *  writemostly - sets write_mostly
2954 	 *  -writemostly - clears write_mostly
2955 	 *  blocked - sets the Blocked flags
2956 	 *  -blocked - clears the Blocked and possibly simulates an error
2957 	 *  insync - sets Insync providing device isn't active
2958 	 *  -insync - clear Insync for a device with a slot assigned,
2959 	 *            so that it gets rebuilt based on bitmap
2960 	 *  write_error - sets WriteErrorSeen
2961 	 *  -write_error - clears WriteErrorSeen
2962 	 *  {,-}failfast - set/clear FailFast
2963 	 */
2964 
2965 	struct mddev *mddev = rdev->mddev;
2966 	int err = -EINVAL;
2967 	bool need_update_sb = false;
2968 
2969 	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
2970 		md_error(rdev->mddev, rdev);
2971 
2972 		if (test_bit(MD_BROKEN, &rdev->mddev->flags))
2973 			err = -EBUSY;
2974 		else
2975 			err = 0;
2976 	} else if (cmd_match(buf, "remove")) {
2977 		if (rdev->mddev->pers) {
2978 			clear_bit(Blocked, &rdev->flags);
2979 			remove_and_add_spares(rdev->mddev, rdev);
2980 		}
2981 		if (rdev->raid_disk >= 0)
2982 			err = -EBUSY;
2983 		else {
2984 			err = 0;
2985 			if (mddev_is_clustered(mddev))
2986 				err = mddev->cluster_ops->remove_disk(mddev, rdev);
2987 
2988 			if (err == 0) {
2989 				md_kick_rdev_from_array(rdev);
2990 				if (mddev->pers)
2991 					set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2992 				md_new_event();
2993 			}
2994 		}
2995 	} else if (cmd_match(buf, "writemostly")) {
2996 		set_bit(WriteMostly, &rdev->flags);
2997 		mddev_create_serial_pool(rdev->mddev, rdev);
2998 		need_update_sb = true;
2999 		err = 0;
3000 	} else if (cmd_match(buf, "-writemostly")) {
3001 		mddev_destroy_serial_pool(rdev->mddev, rdev);
3002 		clear_bit(WriteMostly, &rdev->flags);
3003 		need_update_sb = true;
3004 		err = 0;
3005 	} else if (cmd_match(buf, "blocked")) {
3006 		set_bit(Blocked, &rdev->flags);
3007 		err = 0;
3008 	} else if (cmd_match(buf, "-blocked")) {
3009 		if (!test_bit(Faulty, &rdev->flags) &&
3010 		    !test_bit(ExternalBbl, &rdev->flags) &&
3011 		    rdev->badblocks.unacked_exist) {
3012 			/* metadata handler doesn't understand badblocks,
3013 			 * so we need to fail the device
3014 			 */
3015 			md_error(rdev->mddev, rdev);
3016 		}
3017 		clear_bit(Blocked, &rdev->flags);
3018 		clear_bit(BlockedBadBlocks, &rdev->flags);
3019 		wake_up(&rdev->blocked_wait);
3020 		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
3021 
3022 		err = 0;
3023 	} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
3024 		set_bit(In_sync, &rdev->flags);
3025 		err = 0;
3026 	} else if (cmd_match(buf, "failfast")) {
3027 		set_bit(FailFast, &rdev->flags);
3028 		need_update_sb = true;
3029 		err = 0;
3030 	} else if (cmd_match(buf, "-failfast")) {
3031 		clear_bit(FailFast, &rdev->flags);
3032 		need_update_sb = true;
3033 		err = 0;
3034 	} else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
3035 		   !test_bit(Journal, &rdev->flags)) {
3036 		if (rdev->mddev->pers == NULL) {
3037 			clear_bit(In_sync, &rdev->flags);
3038 			rdev->saved_raid_disk = rdev->raid_disk;
3039 			rdev->raid_disk = -1;
3040 			err = 0;
3041 		}
3042 	} else if (cmd_match(buf, "write_error")) {
3043 		set_bit(WriteErrorSeen, &rdev->flags);
3044 		err = 0;
3045 	} else if (cmd_match(buf, "-write_error")) {
3046 		clear_bit(WriteErrorSeen, &rdev->flags);
3047 		err = 0;
3048 	} else if (cmd_match(buf, "want_replacement")) {
3049 		/* Any non-spare device that is not a replacement can
3050 		 * become want_replacement at any time, but we then need to
3051 		 * check if recovery is needed.
3052 		 */
3053 		if (rdev->raid_disk >= 0 &&
3054 		    !test_bit(Journal, &rdev->flags) &&
3055 		    !test_bit(Replacement, &rdev->flags))
3056 			set_bit(WantReplacement, &rdev->flags);
3057 		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
3058 		err = 0;
3059 	} else if (cmd_match(buf, "-want_replacement")) {
3060 		/* Clearing 'want_replacement' is always allowed.
3061 		 * Once replacements starts it is too late though.
3062 		 */
3063 		err = 0;
3064 		clear_bit(WantReplacement, &rdev->flags);
3065 	} else if (cmd_match(buf, "replacement")) {
3066 		/* Can only set a device as a replacement when array has not
3067 		 * yet been started.  Once running, replacement is automatic
3068 		 * from spares, or by assigning 'slot'.
3069 		 */
3070 		if (rdev->mddev->pers)
3071 			err = -EBUSY;
3072 		else {
3073 			set_bit(Replacement, &rdev->flags);
3074 			err = 0;
3075 		}
3076 	} else if (cmd_match(buf, "-replacement")) {
3077 		/* Similarly, can only clear Replacement before start */
3078 		if (rdev->mddev->pers)
3079 			err = -EBUSY;
3080 		else {
3081 			clear_bit(Replacement, &rdev->flags);
3082 			err = 0;
3083 		}
3084 	} else if (cmd_match(buf, "re-add")) {
3085 		if (!rdev->mddev->pers)
3086 			err = -EINVAL;
3087 		else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) &&
3088 				rdev->saved_raid_disk >= 0) {
3089 			/* clear_bit is performed _after_ all the devices
3090 			 * have their local Faulty bit cleared. If any writes
3091 			 * happen in the meantime in the local node, they
3092 			 * will land in the local bitmap, which will be synced
3093 			 * by this node eventually
3094 			 */
3095 			if (!mddev_is_clustered(rdev->mddev) ||
3096 			    (err = mddev->cluster_ops->gather_bitmaps(rdev)) == 0) {
3097 				clear_bit(Faulty, &rdev->flags);
3098 				err = add_bound_rdev(rdev);
3099 			}
3100 		} else
3101 			err = -EBUSY;
3102 	} else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) {
3103 		set_bit(ExternalBbl, &rdev->flags);
3104 		rdev->badblocks.shift = 0;
3105 		err = 0;
3106 	} else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) {
3107 		clear_bit(ExternalBbl, &rdev->flags);
3108 		err = 0;
3109 	}
3110 	if (need_update_sb)
3111 		md_update_sb(mddev, 1);
3112 	if (!err)
3113 		sysfs_notify_dirent_safe(rdev->sysfs_state);
3114 	return err ? err : len;
3115 }
3116 static struct rdev_sysfs_entry rdev_state =
3117 __ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store);
3118 
3119 static ssize_t
errors_show(struct md_rdev * rdev,char * page)3120 errors_show(struct md_rdev *rdev, char *page)
3121 {
3122 	return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
3123 }
3124 
3125 static ssize_t
errors_store(struct md_rdev * rdev,const char * buf,size_t len)3126 errors_store(struct md_rdev *rdev, const char *buf, size_t len)
3127 {
3128 	unsigned int n;
3129 	int rv;
3130 
3131 	rv = kstrtouint(buf, 10, &n);
3132 	if (rv < 0)
3133 		return rv;
3134 	atomic_set(&rdev->corrected_errors, n);
3135 	return len;
3136 }
3137 static struct rdev_sysfs_entry rdev_errors =
3138 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
3139 
3140 static ssize_t
slot_show(struct md_rdev * rdev,char * page)3141 slot_show(struct md_rdev *rdev, char *page)
3142 {
3143 	if (test_bit(Journal, &rdev->flags))
3144 		return sprintf(page, "journal\n");
3145 	else if (rdev->raid_disk < 0)
3146 		return sprintf(page, "none\n");
3147 	else
3148 		return sprintf(page, "%d\n", rdev->raid_disk);
3149 }
3150 
3151 static ssize_t
slot_store(struct md_rdev * rdev,const char * buf,size_t len)3152 slot_store(struct md_rdev *rdev, const char *buf, size_t len)
3153 {
3154 	int slot;
3155 	int err;
3156 
3157 	if (test_bit(Journal, &rdev->flags))
3158 		return -EBUSY;
3159 	if (strncmp(buf, "none", 4)==0)
3160 		slot = -1;
3161 	else {
3162 		err = kstrtouint(buf, 10, (unsigned int *)&slot);
3163 		if (err < 0)
3164 			return err;
3165 		if (slot < 0)
3166 			/* overflow */
3167 			return -ENOSPC;
3168 	}
3169 	if (rdev->mddev->pers && slot == -1) {
3170 		/* Setting 'slot' on an active array requires also
3171 		 * updating the 'rd%d' link, and communicating
3172 		 * with the personality with ->hot_*_disk.
3173 		 * For now we only support removing
3174 		 * failed/spare devices.  This normally happens automatically,
3175 		 * but not when the metadata is externally managed.
3176 		 */
3177 		if (rdev->raid_disk == -1)
3178 			return -EEXIST;
3179 		/* personality does all needed checks */
3180 		if (rdev->mddev->pers->hot_remove_disk == NULL)
3181 			return -EINVAL;
3182 		clear_bit(Blocked, &rdev->flags);
3183 		remove_and_add_spares(rdev->mddev, rdev);
3184 		if (rdev->raid_disk >= 0)
3185 			return -EBUSY;
3186 		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
3187 	} else if (rdev->mddev->pers) {
3188 		/* Activating a spare .. or possibly reactivating
3189 		 * if we ever get bitmaps working here.
3190 		 */
3191 		int err;
3192 
3193 		if (rdev->raid_disk != -1)
3194 			return -EBUSY;
3195 
3196 		if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
3197 			return -EBUSY;
3198 
3199 		if (rdev->mddev->pers->hot_add_disk == NULL)
3200 			return -EINVAL;
3201 
3202 		if (slot >= rdev->mddev->raid_disks &&
3203 		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
3204 			return -ENOSPC;
3205 
3206 		rdev->raid_disk = slot;
3207 		if (test_bit(In_sync, &rdev->flags))
3208 			rdev->saved_raid_disk = slot;
3209 		else
3210 			rdev->saved_raid_disk = -1;
3211 		clear_bit(In_sync, &rdev->flags);
3212 		clear_bit(Bitmap_sync, &rdev->flags);
3213 		err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev);
3214 		if (err) {
3215 			rdev->raid_disk = -1;
3216 			return err;
3217 		} else
3218 			sysfs_notify_dirent_safe(rdev->sysfs_state);
3219 		/* failure here is OK */;
3220 		sysfs_link_rdev(rdev->mddev, rdev);
3221 		/* don't wakeup anyone, leave that to userspace. */
3222 	} else {
3223 		if (slot >= rdev->mddev->raid_disks &&
3224 		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
3225 			return -ENOSPC;
3226 		rdev->raid_disk = slot;
3227 		/* assume it is working */
3228 		clear_bit(Faulty, &rdev->flags);
3229 		clear_bit(WriteMostly, &rdev->flags);
3230 		set_bit(In_sync, &rdev->flags);
3231 		sysfs_notify_dirent_safe(rdev->sysfs_state);
3232 	}
3233 	return len;
3234 }
3235 
3236 static struct rdev_sysfs_entry rdev_slot =
3237 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
3238 
3239 static ssize_t
offset_show(struct md_rdev * rdev,char * page)3240 offset_show(struct md_rdev *rdev, char *page)
3241 {
3242 	return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
3243 }
3244 
3245 static ssize_t
offset_store(struct md_rdev * rdev,const char * buf,size_t len)3246 offset_store(struct md_rdev *rdev, const char *buf, size_t len)
3247 {
3248 	unsigned long long offset;
3249 	if (kstrtoull(buf, 10, &offset) < 0)
3250 		return -EINVAL;
3251 	if (rdev->mddev->pers && rdev->raid_disk >= 0)
3252 		return -EBUSY;
3253 	if (rdev->sectors && rdev->mddev->external)
3254 		/* Must set offset before size, so overlap checks
3255 		 * can be sane */
3256 		return -EBUSY;
3257 	rdev->data_offset = offset;
3258 	rdev->new_data_offset = offset;
3259 	return len;
3260 }
3261 
3262 static struct rdev_sysfs_entry rdev_offset =
3263 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
3264 
new_offset_show(struct md_rdev * rdev,char * page)3265 static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
3266 {
3267 	return sprintf(page, "%llu\n",
3268 		       (unsigned long long)rdev->new_data_offset);
3269 }
3270 
new_offset_store(struct md_rdev * rdev,const char * buf,size_t len)3271 static ssize_t new_offset_store(struct md_rdev *rdev,
3272 				const char *buf, size_t len)
3273 {
3274 	unsigned long long new_offset;
3275 	struct mddev *mddev = rdev->mddev;
3276 
3277 	if (kstrtoull(buf, 10, &new_offset) < 0)
3278 		return -EINVAL;
3279 
3280 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3281 		return -EBUSY;
3282 	if (new_offset == rdev->data_offset)
3283 		/* reset is always permitted */
3284 		;
3285 	else if (new_offset > rdev->data_offset) {
3286 		/* must not push array size beyond rdev_sectors */
3287 		if (new_offset - rdev->data_offset
3288 		    + mddev->dev_sectors > rdev->sectors)
3289 				return -E2BIG;
3290 	}
3291 	/* Metadata worries about other space details. */
3292 
3293 	/* decreasing the offset is inconsistent with a backwards
3294 	 * reshape.
3295 	 */
3296 	if (new_offset < rdev->data_offset &&
3297 	    mddev->reshape_backwards)
3298 		return -EINVAL;
3299 	/* Increasing offset is inconsistent with forwards
3300 	 * reshape.  reshape_direction should be set to
3301 	 * 'backwards' first.
3302 	 */
3303 	if (new_offset > rdev->data_offset &&
3304 	    !mddev->reshape_backwards)
3305 		return -EINVAL;
3306 
3307 	if (mddev->pers && mddev->persistent &&
3308 	    !super_types[mddev->major_version]
3309 	    .allow_new_offset(rdev, new_offset))
3310 		return -E2BIG;
3311 	rdev->new_data_offset = new_offset;
3312 	if (new_offset > rdev->data_offset)
3313 		mddev->reshape_backwards = 1;
3314 	else if (new_offset < rdev->data_offset)
3315 		mddev->reshape_backwards = 0;
3316 
3317 	return len;
3318 }
3319 static struct rdev_sysfs_entry rdev_new_offset =
3320 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
3321 
3322 static ssize_t
rdev_size_show(struct md_rdev * rdev,char * page)3323 rdev_size_show(struct md_rdev *rdev, char *page)
3324 {
3325 	return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
3326 }
3327 
md_rdevs_overlap(struct md_rdev * a,struct md_rdev * b)3328 static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b)
3329 {
3330 	/* check if two start/length pairs overlap */
3331 	if (a->data_offset + a->sectors <= b->data_offset)
3332 		return false;
3333 	if (b->data_offset + b->sectors <= a->data_offset)
3334 		return false;
3335 	return true;
3336 }
3337 
md_rdev_overlaps(struct md_rdev * rdev)3338 static bool md_rdev_overlaps(struct md_rdev *rdev)
3339 {
3340 	struct mddev *mddev;
3341 	struct md_rdev *rdev2;
3342 
3343 	spin_lock(&all_mddevs_lock);
3344 	list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
3345 		if (test_bit(MD_DELETED, &mddev->flags))
3346 			continue;
3347 		rdev_for_each(rdev2, mddev) {
3348 			if (rdev != rdev2 && rdev->bdev == rdev2->bdev &&
3349 			    md_rdevs_overlap(rdev, rdev2)) {
3350 				spin_unlock(&all_mddevs_lock);
3351 				return true;
3352 			}
3353 		}
3354 	}
3355 	spin_unlock(&all_mddevs_lock);
3356 	return false;
3357 }
3358 
strict_blocks_to_sectors(const char * buf,sector_t * sectors)3359 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
3360 {
3361 	unsigned long long blocks;
3362 	sector_t new;
3363 
3364 	if (kstrtoull(buf, 10, &blocks) < 0)
3365 		return -EINVAL;
3366 
3367 	if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
3368 		return -EINVAL; /* sector conversion overflow */
3369 
3370 	new = blocks * 2;
3371 	if (new != blocks * 2)
3372 		return -EINVAL; /* unsigned long long to sector_t overflow */
3373 
3374 	*sectors = new;
3375 	return 0;
3376 }
3377 
3378 static ssize_t
rdev_size_store(struct md_rdev * rdev,const char * buf,size_t len)3379 rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3380 {
3381 	struct mddev *my_mddev = rdev->mddev;
3382 	sector_t oldsectors = rdev->sectors;
3383 	sector_t sectors;
3384 
3385 	if (test_bit(Journal, &rdev->flags))
3386 		return -EBUSY;
3387 	if (strict_blocks_to_sectors(buf, &sectors) < 0)
3388 		return -EINVAL;
3389 	if (rdev->data_offset != rdev->new_data_offset)
3390 		return -EINVAL; /* too confusing */
3391 	if (my_mddev->pers && rdev->raid_disk >= 0) {
3392 		if (my_mddev->persistent) {
3393 			sectors = super_types[my_mddev->major_version].
3394 				rdev_size_change(rdev, sectors);
3395 			if (!sectors)
3396 				return -EBUSY;
3397 		} else if (!sectors)
3398 			sectors = bdev_nr_sectors(rdev->bdev) -
3399 				rdev->data_offset;
3400 		if (!my_mddev->pers->resize)
3401 			/* Cannot change size for RAID0 or Linear etc */
3402 			return -EINVAL;
3403 	}
3404 	if (sectors < my_mddev->dev_sectors)
3405 		return -EINVAL; /* component must fit device */
3406 
3407 	rdev->sectors = sectors;
3408 
3409 	/*
3410 	 * Check that all other rdevs with the same bdev do not overlap.  This
3411 	 * check does not provide a hard guarantee, it just helps avoid
3412 	 * dangerous mistakes.
3413 	 */
3414 	if (sectors > oldsectors && my_mddev->external &&
3415 	    md_rdev_overlaps(rdev)) {
3416 		/*
3417 		 * Someone else could have slipped in a size change here, but
3418 		 * doing so is just silly.  We put oldsectors back because we
3419 		 * know it is safe, and trust userspace not to race with itself.
3420 		 */
3421 		rdev->sectors = oldsectors;
3422 		return -EBUSY;
3423 	}
3424 	return len;
3425 }
3426 
3427 static struct rdev_sysfs_entry rdev_size =
3428 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
3429 
recovery_start_show(struct md_rdev * rdev,char * page)3430 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page)
3431 {
3432 	unsigned long long recovery_start = rdev->recovery_offset;
3433 
3434 	if (test_bit(In_sync, &rdev->flags) ||
3435 	    recovery_start == MaxSector)
3436 		return sprintf(page, "none\n");
3437 
3438 	return sprintf(page, "%llu\n", recovery_start);
3439 }
3440 
recovery_start_store(struct md_rdev * rdev,const char * buf,size_t len)3441 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len)
3442 {
3443 	unsigned long long recovery_start;
3444 
3445 	if (cmd_match(buf, "none"))
3446 		recovery_start = MaxSector;
3447 	else if (kstrtoull(buf, 10, &recovery_start))
3448 		return -EINVAL;
3449 
3450 	if (rdev->mddev->pers &&
3451 	    rdev->raid_disk >= 0)
3452 		return -EBUSY;
3453 
3454 	rdev->recovery_offset = recovery_start;
3455 	if (recovery_start == MaxSector)
3456 		set_bit(In_sync, &rdev->flags);
3457 	else
3458 		clear_bit(In_sync, &rdev->flags);
3459 	return len;
3460 }
3461 
3462 static struct rdev_sysfs_entry rdev_recovery_start =
3463 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
3464 
3465 /* sysfs access to bad-blocks list.
3466  * We present two files.
3467  * 'bad-blocks' lists sector numbers and lengths of ranges that
3468  *    are recorded as bad.  The list is truncated to fit within
3469  *    the one-page limit of sysfs.
3470  *    Writing "sector length" to this file adds an acknowledged
3471  *    bad block list.
3472  * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
3473  *    been acknowledged.  Writing to this file adds bad blocks
3474  *    without acknowledging them.  This is largely for testing.
3475  */
bb_show(struct md_rdev * rdev,char * page)3476 static ssize_t bb_show(struct md_rdev *rdev, char *page)
3477 {
3478 	return badblocks_show(&rdev->badblocks, page, 0);
3479 }
bb_store(struct md_rdev * rdev,const char * page,size_t len)3480 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len)
3481 {
3482 	int rv = badblocks_store(&rdev->badblocks, page, len, 0);
3483 	/* Maybe that ack was all we needed */
3484 	if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
3485 		wake_up(&rdev->blocked_wait);
3486 	return rv;
3487 }
3488 static struct rdev_sysfs_entry rdev_bad_blocks =
3489 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
3490 
ubb_show(struct md_rdev * rdev,char * page)3491 static ssize_t ubb_show(struct md_rdev *rdev, char *page)
3492 {
3493 	return badblocks_show(&rdev->badblocks, page, 1);
3494 }
ubb_store(struct md_rdev * rdev,const char * page,size_t len)3495 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
3496 {
3497 	return badblocks_store(&rdev->badblocks, page, len, 1);
3498 }
3499 static struct rdev_sysfs_entry rdev_unack_bad_blocks =
3500 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
3501 
3502 static ssize_t
ppl_sector_show(struct md_rdev * rdev,char * page)3503 ppl_sector_show(struct md_rdev *rdev, char *page)
3504 {
3505 	return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector);
3506 }
3507 
3508 static ssize_t
ppl_sector_store(struct md_rdev * rdev,const char * buf,size_t len)3509 ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len)
3510 {
3511 	unsigned long long sector;
3512 
3513 	if (kstrtoull(buf, 10, &sector) < 0)
3514 		return -EINVAL;
3515 	if (sector != (sector_t)sector)
3516 		return -EINVAL;
3517 
3518 	if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3519 	    rdev->raid_disk >= 0)
3520 		return -EBUSY;
3521 
3522 	if (rdev->mddev->persistent) {
3523 		if (rdev->mddev->major_version == 0)
3524 			return -EINVAL;
3525 		if ((sector > rdev->sb_start &&
3526 		     sector - rdev->sb_start > S16_MAX) ||
3527 		    (sector < rdev->sb_start &&
3528 		     rdev->sb_start - sector > -S16_MIN))
3529 			return -EINVAL;
3530 		rdev->ppl.offset = sector - rdev->sb_start;
3531 	} else if (!rdev->mddev->external) {
3532 		return -EBUSY;
3533 	}
3534 	rdev->ppl.sector = sector;
3535 	return len;
3536 }
3537 
3538 static struct rdev_sysfs_entry rdev_ppl_sector =
3539 __ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store);
3540 
3541 static ssize_t
ppl_size_show(struct md_rdev * rdev,char * page)3542 ppl_size_show(struct md_rdev *rdev, char *page)
3543 {
3544 	return sprintf(page, "%u\n", rdev->ppl.size);
3545 }
3546 
3547 static ssize_t
ppl_size_store(struct md_rdev * rdev,const char * buf,size_t len)3548 ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3549 {
3550 	unsigned int size;
3551 
3552 	if (kstrtouint(buf, 10, &size) < 0)
3553 		return -EINVAL;
3554 
3555 	if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3556 	    rdev->raid_disk >= 0)
3557 		return -EBUSY;
3558 
3559 	if (rdev->mddev->persistent) {
3560 		if (rdev->mddev->major_version == 0)
3561 			return -EINVAL;
3562 		if (size > U16_MAX)
3563 			return -EINVAL;
3564 	} else if (!rdev->mddev->external) {
3565 		return -EBUSY;
3566 	}
3567 	rdev->ppl.size = size;
3568 	return len;
3569 }
3570 
3571 static struct rdev_sysfs_entry rdev_ppl_size =
3572 __ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store);
3573 
3574 static struct attribute *rdev_default_attrs[] = {
3575 	&rdev_state.attr,
3576 	&rdev_errors.attr,
3577 	&rdev_slot.attr,
3578 	&rdev_offset.attr,
3579 	&rdev_new_offset.attr,
3580 	&rdev_size.attr,
3581 	&rdev_recovery_start.attr,
3582 	&rdev_bad_blocks.attr,
3583 	&rdev_unack_bad_blocks.attr,
3584 	&rdev_ppl_sector.attr,
3585 	&rdev_ppl_size.attr,
3586 	NULL,
3587 };
3588 ATTRIBUTE_GROUPS(rdev_default);
3589 static ssize_t
rdev_attr_show(struct kobject * kobj,struct attribute * attr,char * page)3590 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3591 {
3592 	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3593 	struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3594 
3595 	if (!entry->show)
3596 		return -EIO;
3597 	if (!rdev->mddev)
3598 		return -ENODEV;
3599 	return entry->show(rdev, page);
3600 }
3601 
3602 static ssize_t
rdev_attr_store(struct kobject * kobj,struct attribute * attr,const char * page,size_t length)3603 rdev_attr_store(struct kobject *kobj, struct attribute *attr,
3604 	      const char *page, size_t length)
3605 {
3606 	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3607 	struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3608 	struct kernfs_node *kn = NULL;
3609 	bool suspend = false;
3610 	ssize_t rv;
3611 	struct mddev *mddev = READ_ONCE(rdev->mddev);
3612 
3613 	if (!entry->store)
3614 		return -EIO;
3615 	if (!capable(CAP_SYS_ADMIN))
3616 		return -EACCES;
3617 	if (!mddev)
3618 		return -ENODEV;
3619 
3620 	if (entry->store == state_store) {
3621 		if (cmd_match(page, "remove"))
3622 			kn = sysfs_break_active_protection(kobj, attr);
3623 		if (cmd_match(page, "remove") || cmd_match(page, "re-add") ||
3624 		    cmd_match(page, "writemostly") ||
3625 		    cmd_match(page, "-writemostly"))
3626 			suspend = true;
3627 	}
3628 
3629 	rv = suspend ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev);
3630 	if (!rv) {
3631 		if (rdev->mddev == NULL)
3632 			rv = -ENODEV;
3633 		else
3634 			rv = entry->store(rdev, page, length);
3635 		suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev);
3636 	}
3637 
3638 	if (kn)
3639 		sysfs_unbreak_active_protection(kn);
3640 
3641 	return rv;
3642 }
3643 
rdev_free(struct kobject * ko)3644 static void rdev_free(struct kobject *ko)
3645 {
3646 	struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj);
3647 	kfree(rdev);
3648 }
3649 static const struct sysfs_ops rdev_sysfs_ops = {
3650 	.show		= rdev_attr_show,
3651 	.store		= rdev_attr_store,
3652 };
3653 static const struct kobj_type rdev_ktype = {
3654 	.release	= rdev_free,
3655 	.sysfs_ops	= &rdev_sysfs_ops,
3656 	.default_groups	= rdev_default_groups,
3657 };
3658 
md_rdev_init(struct md_rdev * rdev)3659 int md_rdev_init(struct md_rdev *rdev)
3660 {
3661 	rdev->desc_nr = -1;
3662 	rdev->saved_raid_disk = -1;
3663 	rdev->raid_disk = -1;
3664 	rdev->flags = 0;
3665 	rdev->data_offset = 0;
3666 	rdev->new_data_offset = 0;
3667 	rdev->sb_events = 0;
3668 	rdev->last_read_error = 0;
3669 	rdev->sb_loaded = 0;
3670 	rdev->bb_page = NULL;
3671 	atomic_set(&rdev->nr_pending, 0);
3672 	atomic_set(&rdev->read_errors, 0);
3673 	atomic_set(&rdev->corrected_errors, 0);
3674 
3675 	INIT_LIST_HEAD(&rdev->same_set);
3676 	init_waitqueue_head(&rdev->blocked_wait);
3677 
3678 	/* Add space to store bad block list.
3679 	 * This reserves the space even on arrays where it cannot
3680 	 * be used - I wonder if that matters
3681 	 */
3682 	return badblocks_init(&rdev->badblocks, 0);
3683 }
3684 EXPORT_SYMBOL_GPL(md_rdev_init);
3685 
3686 /*
3687  * Import a device. If 'super_format' >= 0, then sanity check the superblock
3688  *
3689  * mark the device faulty if:
3690  *
3691  *   - the device is nonexistent (zero size)
3692  *   - the device has no valid superblock
3693  *
3694  * a faulty rdev _never_ has rdev->sb set.
3695  */
md_import_device(dev_t newdev,int super_format,int super_minor)3696 static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
3697 {
3698 	struct md_rdev *rdev;
3699 	sector_t size;
3700 	int err;
3701 
3702 	rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
3703 	if (!rdev)
3704 		return ERR_PTR(-ENOMEM);
3705 
3706 	err = md_rdev_init(rdev);
3707 	if (err)
3708 		goto out_free_rdev;
3709 	err = alloc_disk_sb(rdev);
3710 	if (err)
3711 		goto out_clear_rdev;
3712 
3713 	rdev->bdev_file = bdev_file_open_by_dev(newdev,
3714 			BLK_OPEN_READ | BLK_OPEN_WRITE,
3715 			super_format == -2 ? &claim_rdev : rdev, NULL);
3716 	if (IS_ERR(rdev->bdev_file)) {
3717 		pr_warn("md: could not open device unknown-block(%u,%u).\n",
3718 			MAJOR(newdev), MINOR(newdev));
3719 		err = PTR_ERR(rdev->bdev_file);
3720 		goto out_clear_rdev;
3721 	}
3722 	rdev->bdev = file_bdev(rdev->bdev_file);
3723 
3724 	kobject_init(&rdev->kobj, &rdev_ktype);
3725 
3726 	size = bdev_nr_bytes(rdev->bdev) >> BLOCK_SIZE_BITS;
3727 	if (!size) {
3728 		pr_warn("md: %pg has zero or unknown size, marking faulty!\n",
3729 			rdev->bdev);
3730 		err = -EINVAL;
3731 		goto out_blkdev_put;
3732 	}
3733 
3734 	if (super_format >= 0) {
3735 		err = super_types[super_format].
3736 			load_super(rdev, NULL, super_minor);
3737 		if (err == -EINVAL) {
3738 			pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n",
3739 				rdev->bdev,
3740 				super_format, super_minor);
3741 			goto out_blkdev_put;
3742 		}
3743 		if (err < 0) {
3744 			pr_warn("md: could not read %pg's sb, not importing!\n",
3745 				rdev->bdev);
3746 			goto out_blkdev_put;
3747 		}
3748 	}
3749 
3750 	return rdev;
3751 
3752 out_blkdev_put:
3753 	fput(rdev->bdev_file);
3754 out_clear_rdev:
3755 	md_rdev_clear(rdev);
3756 out_free_rdev:
3757 	kfree(rdev);
3758 	return ERR_PTR(err);
3759 }
3760 
3761 /*
3762  * Check a full RAID array for plausibility
3763  */
3764 
analyze_sbs(struct mddev * mddev)3765 static int analyze_sbs(struct mddev *mddev)
3766 {
3767 	int i;
3768 	struct md_rdev *rdev, *freshest, *tmp;
3769 
3770 	freshest = NULL;
3771 	rdev_for_each_safe(rdev, tmp, mddev)
3772 		switch (super_types[mddev->major_version].
3773 			load_super(rdev, freshest, mddev->minor_version)) {
3774 		case 1:
3775 			freshest = rdev;
3776 			break;
3777 		case 0:
3778 			break;
3779 		default:
3780 			pr_warn("md: fatal superblock inconsistency in %pg -- removing from array\n",
3781 				rdev->bdev);
3782 			md_kick_rdev_from_array(rdev);
3783 		}
3784 
3785 	/* Cannot find a valid fresh disk */
3786 	if (!freshest) {
3787 		pr_warn("md: cannot find a valid disk\n");
3788 		return -EINVAL;
3789 	}
3790 
3791 	super_types[mddev->major_version].
3792 		validate_super(mddev, NULL/*freshest*/, freshest);
3793 
3794 	i = 0;
3795 	rdev_for_each_safe(rdev, tmp, mddev) {
3796 		if (mddev->max_disks &&
3797 		    (rdev->desc_nr >= mddev->max_disks ||
3798 		     i > mddev->max_disks)) {
3799 			pr_warn("md: %s: %pg: only %d devices permitted\n",
3800 				mdname(mddev), rdev->bdev,
3801 				mddev->max_disks);
3802 			md_kick_rdev_from_array(rdev);
3803 			continue;
3804 		}
3805 		if (rdev != freshest) {
3806 			if (super_types[mddev->major_version].
3807 			    validate_super(mddev, freshest, rdev)) {
3808 				pr_warn("md: kicking non-fresh %pg from array!\n",
3809 					rdev->bdev);
3810 				md_kick_rdev_from_array(rdev);
3811 				continue;
3812 			}
3813 		}
3814 		if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks)) &&
3815 		    !test_bit(Journal, &rdev->flags)) {
3816 			rdev->raid_disk = -1;
3817 			clear_bit(In_sync, &rdev->flags);
3818 		}
3819 	}
3820 
3821 	return 0;
3822 }
3823 
3824 /* Read a fixed-point number.
3825  * Numbers in sysfs attributes should be in "standard" units where
3826  * possible, so time should be in seconds.
3827  * However we internally use a a much smaller unit such as
3828  * milliseconds or jiffies.
3829  * This function takes a decimal number with a possible fractional
3830  * component, and produces an integer which is the result of
3831  * multiplying that number by 10^'scale'.
3832  * all without any floating-point arithmetic.
3833  */
strict_strtoul_scaled(const char * cp,unsigned long * res,int scale)3834 int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
3835 {
3836 	unsigned long result = 0;
3837 	long decimals = -1;
3838 	while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
3839 		if (*cp == '.')
3840 			decimals = 0;
3841 		else if (decimals < scale) {
3842 			unsigned int value;
3843 			value = *cp - '0';
3844 			result = result * 10 + value;
3845 			if (decimals >= 0)
3846 				decimals++;
3847 		}
3848 		cp++;
3849 	}
3850 	if (*cp == '\n')
3851 		cp++;
3852 	if (*cp)
3853 		return -EINVAL;
3854 	if (decimals < 0)
3855 		decimals = 0;
3856 	*res = result * int_pow(10, scale - decimals);
3857 	return 0;
3858 }
3859 
3860 static ssize_t
safe_delay_show(struct mddev * mddev,char * page)3861 safe_delay_show(struct mddev *mddev, char *page)
3862 {
3863 	unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ;
3864 
3865 	return sprintf(page, "%u.%03u\n", msec/1000, msec%1000);
3866 }
3867 static ssize_t
safe_delay_store(struct mddev * mddev,const char * cbuf,size_t len)3868 safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
3869 {
3870 	unsigned long msec;
3871 
3872 	if (mddev_is_clustered(mddev)) {
3873 		pr_warn("md: Safemode is disabled for clustered mode\n");
3874 		return -EINVAL;
3875 	}
3876 
3877 	if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ)
3878 		return -EINVAL;
3879 	if (msec == 0)
3880 		mddev->safemode_delay = 0;
3881 	else {
3882 		unsigned long old_delay = mddev->safemode_delay;
3883 		unsigned long new_delay = (msec*HZ)/1000;
3884 
3885 		if (new_delay == 0)
3886 			new_delay = 1;
3887 		mddev->safemode_delay = new_delay;
3888 		if (new_delay < old_delay || old_delay == 0)
3889 			mod_timer(&mddev->safemode_timer, jiffies+1);
3890 	}
3891 	return len;
3892 }
3893 static struct md_sysfs_entry md_safe_delay =
3894 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
3895 
3896 static ssize_t
level_show(struct mddev * mddev,char * page)3897 level_show(struct mddev *mddev, char *page)
3898 {
3899 	struct md_personality *p;
3900 	int ret;
3901 	spin_lock(&mddev->lock);
3902 	p = mddev->pers;
3903 	if (p)
3904 		ret = sprintf(page, "%s\n", p->head.name);
3905 	else if (mddev->clevel[0])
3906 		ret = sprintf(page, "%s\n", mddev->clevel);
3907 	else if (mddev->level != LEVEL_NONE)
3908 		ret = sprintf(page, "%d\n", mddev->level);
3909 	else
3910 		ret = 0;
3911 	spin_unlock(&mddev->lock);
3912 	return ret;
3913 }
3914 
3915 static ssize_t
level_store(struct mddev * mddev,const char * buf,size_t len)3916 level_store(struct mddev *mddev, const char *buf, size_t len)
3917 {
3918 	char clevel[16];
3919 	ssize_t rv;
3920 	size_t slen = len;
3921 	struct md_personality *pers, *oldpers;
3922 	long level;
3923 	void *priv, *oldpriv;
3924 	struct md_rdev *rdev;
3925 
3926 	if (slen == 0 || slen >= sizeof(clevel))
3927 		return -EINVAL;
3928 
3929 	rv = mddev_suspend_and_lock(mddev);
3930 	if (rv)
3931 		return rv;
3932 
3933 	if (mddev->pers == NULL) {
3934 		memcpy(mddev->clevel, buf, slen);
3935 		if (mddev->clevel[slen-1] == '\n')
3936 			slen--;
3937 		mddev->clevel[slen] = 0;
3938 		mddev->level = LEVEL_NONE;
3939 		rv = len;
3940 		goto out_unlock;
3941 	}
3942 	rv = -EROFS;
3943 	if (!md_is_rdwr(mddev))
3944 		goto out_unlock;
3945 
3946 	/* request to change the personality.  Need to ensure:
3947 	 *  - array is not engaged in resync/recovery/reshape
3948 	 *  - old personality can be suspended
3949 	 *  - new personality will access other array.
3950 	 */
3951 
3952 	rv = -EBUSY;
3953 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3954 	    mddev->reshape_position != MaxSector ||
3955 	    mddev->sysfs_active)
3956 		goto out_unlock;
3957 
3958 	rv = -EINVAL;
3959 	if (!mddev->pers->quiesce) {
3960 		pr_warn("md: %s: %s does not support online personality change\n",
3961 			mdname(mddev), mddev->pers->head.name);
3962 		goto out_unlock;
3963 	}
3964 
3965 	/* Now find the new personality */
3966 	memcpy(clevel, buf, slen);
3967 	if (clevel[slen-1] == '\n')
3968 		slen--;
3969 	clevel[slen] = 0;
3970 	if (kstrtol(clevel, 10, &level))
3971 		level = LEVEL_NONE;
3972 
3973 	if (request_module("md-%s", clevel) != 0)
3974 		request_module("md-level-%s", clevel);
3975 	pers = get_pers(level, clevel);
3976 	if (!pers) {
3977 		rv = -EINVAL;
3978 		goto out_unlock;
3979 	}
3980 
3981 	if (pers == mddev->pers) {
3982 		/* Nothing to do! */
3983 		put_pers(pers);
3984 		rv = len;
3985 		goto out_unlock;
3986 	}
3987 	if (!pers->takeover) {
3988 		put_pers(pers);
3989 		pr_warn("md: %s: %s does not support personality takeover\n",
3990 			mdname(mddev), clevel);
3991 		rv = -EINVAL;
3992 		goto out_unlock;
3993 	}
3994 
3995 	rdev_for_each(rdev, mddev)
3996 		rdev->new_raid_disk = rdev->raid_disk;
3997 
3998 	/* ->takeover must set new_* and/or delta_disks
3999 	 * if it succeeds, and may set them when it fails.
4000 	 */
4001 	priv = pers->takeover(mddev);
4002 	if (IS_ERR(priv)) {
4003 		mddev->new_level = mddev->level;
4004 		mddev->new_layout = mddev->layout;
4005 		mddev->new_chunk_sectors = mddev->chunk_sectors;
4006 		mddev->raid_disks -= mddev->delta_disks;
4007 		mddev->delta_disks = 0;
4008 		mddev->reshape_backwards = 0;
4009 		put_pers(pers);
4010 		pr_warn("md: %s: %s would not accept array\n",
4011 			mdname(mddev), clevel);
4012 		rv = PTR_ERR(priv);
4013 		goto out_unlock;
4014 	}
4015 
4016 	/* Looks like we have a winner */
4017 	mddev_detach(mddev);
4018 
4019 	spin_lock(&mddev->lock);
4020 	oldpers = mddev->pers;
4021 	oldpriv = mddev->private;
4022 	mddev->pers = pers;
4023 	mddev->private = priv;
4024 	strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel));
4025 	mddev->level = mddev->new_level;
4026 	mddev->layout = mddev->new_layout;
4027 	mddev->chunk_sectors = mddev->new_chunk_sectors;
4028 	mddev->delta_disks = 0;
4029 	mddev->reshape_backwards = 0;
4030 	mddev->degraded = 0;
4031 	spin_unlock(&mddev->lock);
4032 
4033 	if (oldpers->sync_request == NULL &&
4034 	    mddev->external) {
4035 		/* We are converting from a no-redundancy array
4036 		 * to a redundancy array and metadata is managed
4037 		 * externally so we need to be sure that writes
4038 		 * won't block due to a need to transition
4039 		 *      clean->dirty
4040 		 * until external management is started.
4041 		 */
4042 		mddev->in_sync = 0;
4043 		mddev->safemode_delay = 0;
4044 		mddev->safemode = 0;
4045 	}
4046 
4047 	oldpers->free(mddev, oldpriv);
4048 
4049 	if (oldpers->sync_request == NULL &&
4050 	    pers->sync_request != NULL) {
4051 		/* need to add the md_redundancy_group */
4052 		if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
4053 			pr_warn("md: cannot register extra attributes for %s\n",
4054 				mdname(mddev));
4055 		mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
4056 		mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
4057 		mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
4058 	}
4059 	if (oldpers->sync_request != NULL &&
4060 	    pers->sync_request == NULL) {
4061 		/* need to remove the md_redundancy_group */
4062 		if (mddev->to_remove == NULL)
4063 			mddev->to_remove = &md_redundancy_group;
4064 	}
4065 
4066 	put_pers(oldpers);
4067 
4068 	rdev_for_each(rdev, mddev) {
4069 		if (rdev->raid_disk < 0)
4070 			continue;
4071 		if (rdev->new_raid_disk >= mddev->raid_disks)
4072 			rdev->new_raid_disk = -1;
4073 		if (rdev->new_raid_disk == rdev->raid_disk)
4074 			continue;
4075 		sysfs_unlink_rdev(mddev, rdev);
4076 	}
4077 	rdev_for_each(rdev, mddev) {
4078 		if (rdev->raid_disk < 0)
4079 			continue;
4080 		if (rdev->new_raid_disk == rdev->raid_disk)
4081 			continue;
4082 		rdev->raid_disk = rdev->new_raid_disk;
4083 		if (rdev->raid_disk < 0)
4084 			clear_bit(In_sync, &rdev->flags);
4085 		else {
4086 			if (sysfs_link_rdev(mddev, rdev))
4087 				pr_warn("md: cannot register rd%d for %s after level change\n",
4088 					rdev->raid_disk, mdname(mddev));
4089 		}
4090 	}
4091 
4092 	if (pers->sync_request == NULL) {
4093 		/* this is now an array without redundancy, so
4094 		 * it must always be in_sync
4095 		 */
4096 		mddev->in_sync = 1;
4097 		timer_delete_sync(&mddev->safemode_timer);
4098 	}
4099 	pers->run(mddev);
4100 	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
4101 	if (!mddev->thread)
4102 		md_update_sb(mddev, 1);
4103 	sysfs_notify_dirent_safe(mddev->sysfs_level);
4104 	md_new_event();
4105 	rv = len;
4106 out_unlock:
4107 	mddev_unlock_and_resume(mddev);
4108 	return rv;
4109 }
4110 
4111 static struct md_sysfs_entry md_level =
4112 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
4113 
4114 static ssize_t
new_level_show(struct mddev * mddev,char * page)4115 new_level_show(struct mddev *mddev, char *page)
4116 {
4117 	return sprintf(page, "%d\n", mddev->new_level);
4118 }
4119 
4120 static ssize_t
new_level_store(struct mddev * mddev,const char * buf,size_t len)4121 new_level_store(struct mddev *mddev, const char *buf, size_t len)
4122 {
4123 	unsigned int n;
4124 	int err;
4125 
4126 	err = kstrtouint(buf, 10, &n);
4127 	if (err < 0)
4128 		return err;
4129 	err = mddev_lock(mddev);
4130 	if (err)
4131 		return err;
4132 
4133 	mddev->new_level = n;
4134 	md_update_sb(mddev, 1);
4135 
4136 	mddev_unlock(mddev);
4137 	return len;
4138 }
4139 static struct md_sysfs_entry md_new_level =
4140 __ATTR(new_level, 0664, new_level_show, new_level_store);
4141 
4142 static ssize_t
layout_show(struct mddev * mddev,char * page)4143 layout_show(struct mddev *mddev, char *page)
4144 {
4145 	/* just a number, not meaningful for all levels */
4146 	if (mddev->reshape_position != MaxSector &&
4147 	    mddev->layout != mddev->new_layout)
4148 		return sprintf(page, "%d (%d)\n",
4149 			       mddev->new_layout, mddev->layout);
4150 	return sprintf(page, "%d\n", mddev->layout);
4151 }
4152 
4153 static ssize_t
layout_store(struct mddev * mddev,const char * buf,size_t len)4154 layout_store(struct mddev *mddev, const char *buf, size_t len)
4155 {
4156 	unsigned int n;
4157 	int err;
4158 
4159 	err = kstrtouint(buf, 10, &n);
4160 	if (err < 0)
4161 		return err;
4162 	err = mddev_lock(mddev);
4163 	if (err)
4164 		return err;
4165 
4166 	if (mddev->pers) {
4167 		if (mddev->pers->check_reshape == NULL)
4168 			err = -EBUSY;
4169 		else if (!md_is_rdwr(mddev))
4170 			err = -EROFS;
4171 		else {
4172 			mddev->new_layout = n;
4173 			err = mddev->pers->check_reshape(mddev);
4174 			if (err)
4175 				mddev->new_layout = mddev->layout;
4176 		}
4177 	} else {
4178 		mddev->new_layout = n;
4179 		if (mddev->reshape_position == MaxSector)
4180 			mddev->layout = n;
4181 	}
4182 	mddev_unlock(mddev);
4183 	return err ?: len;
4184 }
4185 static struct md_sysfs_entry md_layout =
4186 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
4187 
4188 static ssize_t
raid_disks_show(struct mddev * mddev,char * page)4189 raid_disks_show(struct mddev *mddev, char *page)
4190 {
4191 	if (mddev->raid_disks == 0)
4192 		return 0;
4193 	if (mddev->reshape_position != MaxSector &&
4194 	    mddev->delta_disks != 0)
4195 		return sprintf(page, "%d (%d)\n", mddev->raid_disks,
4196 			       mddev->raid_disks - mddev->delta_disks);
4197 	return sprintf(page, "%d\n", mddev->raid_disks);
4198 }
4199 
4200 static int update_raid_disks(struct mddev *mddev, int raid_disks);
4201 
4202 static ssize_t
raid_disks_store(struct mddev * mddev,const char * buf,size_t len)4203 raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
4204 {
4205 	unsigned int n;
4206 	int err;
4207 
4208 	err = kstrtouint(buf, 10, &n);
4209 	if (err < 0)
4210 		return err;
4211 
4212 	err = mddev_lock(mddev);
4213 	if (err)
4214 		return err;
4215 	if (mddev->pers)
4216 		err = update_raid_disks(mddev, n);
4217 	else if (mddev->reshape_position != MaxSector) {
4218 		struct md_rdev *rdev;
4219 		int olddisks = mddev->raid_disks - mddev->delta_disks;
4220 
4221 		err = -EINVAL;
4222 		rdev_for_each(rdev, mddev) {
4223 			if (olddisks < n &&
4224 			    rdev->data_offset < rdev->new_data_offset)
4225 				goto out_unlock;
4226 			if (olddisks > n &&
4227 			    rdev->data_offset > rdev->new_data_offset)
4228 				goto out_unlock;
4229 		}
4230 		err = 0;
4231 		mddev->delta_disks = n - olddisks;
4232 		mddev->raid_disks = n;
4233 		mddev->reshape_backwards = (mddev->delta_disks < 0);
4234 	} else
4235 		mddev->raid_disks = n;
4236 out_unlock:
4237 	mddev_unlock(mddev);
4238 	return err ? err : len;
4239 }
4240 static struct md_sysfs_entry md_raid_disks =
4241 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
4242 
4243 static ssize_t
uuid_show(struct mddev * mddev,char * page)4244 uuid_show(struct mddev *mddev, char *page)
4245 {
4246 	return sprintf(page, "%pU\n", mddev->uuid);
4247 }
4248 static struct md_sysfs_entry md_uuid =
4249 __ATTR(uuid, S_IRUGO, uuid_show, NULL);
4250 
4251 static ssize_t
chunk_size_show(struct mddev * mddev,char * page)4252 chunk_size_show(struct mddev *mddev, char *page)
4253 {
4254 	if (mddev->reshape_position != MaxSector &&
4255 	    mddev->chunk_sectors != mddev->new_chunk_sectors)
4256 		return sprintf(page, "%d (%d)\n",
4257 			       mddev->new_chunk_sectors << 9,
4258 			       mddev->chunk_sectors << 9);
4259 	return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
4260 }
4261 
4262 static ssize_t
chunk_size_store(struct mddev * mddev,const char * buf,size_t len)4263 chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
4264 {
4265 	unsigned long n;
4266 	int err;
4267 
4268 	err = kstrtoul(buf, 10, &n);
4269 	if (err < 0)
4270 		return err;
4271 
4272 	err = mddev_lock(mddev);
4273 	if (err)
4274 		return err;
4275 	if (mddev->pers) {
4276 		if (mddev->pers->check_reshape == NULL)
4277 			err = -EBUSY;
4278 		else if (!md_is_rdwr(mddev))
4279 			err = -EROFS;
4280 		else {
4281 			mddev->new_chunk_sectors = n >> 9;
4282 			err = mddev->pers->check_reshape(mddev);
4283 			if (err)
4284 				mddev->new_chunk_sectors = mddev->chunk_sectors;
4285 		}
4286 	} else {
4287 		mddev->new_chunk_sectors = n >> 9;
4288 		if (mddev->reshape_position == MaxSector)
4289 			mddev->chunk_sectors = n >> 9;
4290 	}
4291 	mddev_unlock(mddev);
4292 	return err ?: len;
4293 }
4294 static struct md_sysfs_entry md_chunk_size =
4295 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
4296 
4297 static ssize_t
resync_start_show(struct mddev * mddev,char * page)4298 resync_start_show(struct mddev *mddev, char *page)
4299 {
4300 	if (mddev->recovery_cp == MaxSector)
4301 		return sprintf(page, "none\n");
4302 	return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
4303 }
4304 
4305 static ssize_t
resync_start_store(struct mddev * mddev,const char * buf,size_t len)4306 resync_start_store(struct mddev *mddev, const char *buf, size_t len)
4307 {
4308 	unsigned long long n;
4309 	int err;
4310 
4311 	if (cmd_match(buf, "none"))
4312 		n = MaxSector;
4313 	else {
4314 		err = kstrtoull(buf, 10, &n);
4315 		if (err < 0)
4316 			return err;
4317 		if (n != (sector_t)n)
4318 			return -EINVAL;
4319 	}
4320 
4321 	err = mddev_lock(mddev);
4322 	if (err)
4323 		return err;
4324 	if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
4325 		err = -EBUSY;
4326 
4327 	if (!err) {
4328 		mddev->recovery_cp = n;
4329 		if (mddev->pers)
4330 			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
4331 	}
4332 	mddev_unlock(mddev);
4333 	return err ?: len;
4334 }
4335 static struct md_sysfs_entry md_resync_start =
4336 __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR,
4337 		resync_start_show, resync_start_store);
4338 
4339 /*
4340  * The array state can be:
4341  *
4342  * clear
4343  *     No devices, no size, no level
4344  *     Equivalent to STOP_ARRAY ioctl
4345  * inactive
4346  *     May have some settings, but array is not active
4347  *        all IO results in error
4348  *     When written, doesn't tear down array, but just stops it
4349  * suspended (not supported yet)
4350  *     All IO requests will block. The array can be reconfigured.
4351  *     Writing this, if accepted, will block until array is quiescent
4352  * readonly
4353  *     no resync can happen.  no superblocks get written.
4354  *     write requests fail
4355  * read-auto
4356  *     like readonly, but behaves like 'clean' on a write request.
4357  *
4358  * clean - no pending writes, but otherwise active.
4359  *     When written to inactive array, starts without resync
4360  *     If a write request arrives then
4361  *       if metadata is known, mark 'dirty' and switch to 'active'.
4362  *       if not known, block and switch to write-pending
4363  *     If written to an active array that has pending writes, then fails.
4364  * active
4365  *     fully active: IO and resync can be happening.
4366  *     When written to inactive array, starts with resync
4367  *
4368  * write-pending
4369  *     clean, but writes are blocked waiting for 'active' to be written.
4370  *
4371  * active-idle
4372  *     like active, but no writes have been seen for a while (100msec).
4373  *
4374  * broken
4375 *     Array is failed. It's useful because mounted-arrays aren't stopped
4376 *     when array is failed, so this state will at least alert the user that
4377 *     something is wrong.
4378  */
4379 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
4380 		   write_pending, active_idle, broken, bad_word};
4381 static char *array_states[] = {
4382 	"clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
4383 	"write-pending", "active-idle", "broken", NULL };
4384 
match_word(const char * word,char ** list)4385 static int match_word(const char *word, char **list)
4386 {
4387 	int n;
4388 	for (n=0; list[n]; n++)
4389 		if (cmd_match(word, list[n]))
4390 			break;
4391 	return n;
4392 }
4393 
4394 static ssize_t
array_state_show(struct mddev * mddev,char * page)4395 array_state_show(struct mddev *mddev, char *page)
4396 {
4397 	enum array_state st = inactive;
4398 
4399 	if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) {
4400 		switch(mddev->ro) {
4401 		case MD_RDONLY:
4402 			st = readonly;
4403 			break;
4404 		case MD_AUTO_READ:
4405 			st = read_auto;
4406 			break;
4407 		case MD_RDWR:
4408 			spin_lock(&mddev->lock);
4409 			if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
4410 				st = write_pending;
4411 			else if (mddev->in_sync)
4412 				st = clean;
4413 			else if (mddev->safemode)
4414 				st = active_idle;
4415 			else
4416 				st = active;
4417 			spin_unlock(&mddev->lock);
4418 		}
4419 
4420 		if (test_bit(MD_BROKEN, &mddev->flags) && st == clean)
4421 			st = broken;
4422 	} else {
4423 		if (list_empty(&mddev->disks) &&
4424 		    mddev->raid_disks == 0 &&
4425 		    mddev->dev_sectors == 0)
4426 			st = clear;
4427 		else
4428 			st = inactive;
4429 	}
4430 	return sprintf(page, "%s\n", array_states[st]);
4431 }
4432 
4433 static int do_md_stop(struct mddev *mddev, int ro);
4434 static int md_set_readonly(struct mddev *mddev);
4435 static int restart_array(struct mddev *mddev);
4436 
4437 static ssize_t
array_state_store(struct mddev * mddev,const char * buf,size_t len)4438 array_state_store(struct mddev *mddev, const char *buf, size_t len)
4439 {
4440 	int err = 0;
4441 	enum array_state st = match_word(buf, array_states);
4442 
4443 	/* No lock dependent actions */
4444 	switch (st) {
4445 	case suspended:		/* not supported yet */
4446 	case write_pending:	/* cannot be set */
4447 	case active_idle:	/* cannot be set */
4448 	case broken:		/* cannot be set */
4449 	case bad_word:
4450 		return -EINVAL;
4451 	case clear:
4452 	case readonly:
4453 	case inactive:
4454 	case read_auto:
4455 		if (!mddev->pers || !md_is_rdwr(mddev))
4456 			break;
4457 		/* write sysfs will not open mddev and opener should be 0 */
4458 		err = mddev_set_closing_and_sync_blockdev(mddev, 0);
4459 		if (err)
4460 			return err;
4461 		break;
4462 	default:
4463 		break;
4464 	}
4465 
4466 	if (mddev->pers && (st == active || st == clean) &&
4467 	    mddev->ro != MD_RDONLY) {
4468 		/* don't take reconfig_mutex when toggling between
4469 		 * clean and active
4470 		 */
4471 		spin_lock(&mddev->lock);
4472 		if (st == active) {
4473 			restart_array(mddev);
4474 			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4475 			md_wakeup_thread(mddev->thread);
4476 			wake_up(&mddev->sb_wait);
4477 		} else /* st == clean */ {
4478 			restart_array(mddev);
4479 			if (!set_in_sync(mddev))
4480 				err = -EBUSY;
4481 		}
4482 		if (!err)
4483 			sysfs_notify_dirent_safe(mddev->sysfs_state);
4484 		spin_unlock(&mddev->lock);
4485 		return err ?: len;
4486 	}
4487 	err = mddev_lock(mddev);
4488 	if (err)
4489 		return err;
4490 
4491 	switch (st) {
4492 	case inactive:
4493 		/* stop an active array, return 0 otherwise */
4494 		if (mddev->pers)
4495 			err = do_md_stop(mddev, 2);
4496 		break;
4497 	case clear:
4498 		err = do_md_stop(mddev, 0);
4499 		break;
4500 	case readonly:
4501 		if (mddev->pers)
4502 			err = md_set_readonly(mddev);
4503 		else {
4504 			mddev->ro = MD_RDONLY;
4505 			set_disk_ro(mddev->gendisk, 1);
4506 			err = do_md_run(mddev);
4507 		}
4508 		break;
4509 	case read_auto:
4510 		if (mddev->pers) {
4511 			if (md_is_rdwr(mddev))
4512 				err = md_set_readonly(mddev);
4513 			else if (mddev->ro == MD_RDONLY)
4514 				err = restart_array(mddev);
4515 			if (err == 0) {
4516 				mddev->ro = MD_AUTO_READ;
4517 				set_disk_ro(mddev->gendisk, 0);
4518 			}
4519 		} else {
4520 			mddev->ro = MD_AUTO_READ;
4521 			err = do_md_run(mddev);
4522 		}
4523 		break;
4524 	case clean:
4525 		if (mddev->pers) {
4526 			err = restart_array(mddev);
4527 			if (err)
4528 				break;
4529 			spin_lock(&mddev->lock);
4530 			if (!set_in_sync(mddev))
4531 				err = -EBUSY;
4532 			spin_unlock(&mddev->lock);
4533 		} else
4534 			err = -EINVAL;
4535 		break;
4536 	case active:
4537 		if (mddev->pers) {
4538 			err = restart_array(mddev);
4539 			if (err)
4540 				break;
4541 			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4542 			wake_up(&mddev->sb_wait);
4543 			err = 0;
4544 		} else {
4545 			mddev->ro = MD_RDWR;
4546 			set_disk_ro(mddev->gendisk, 0);
4547 			err = do_md_run(mddev);
4548 		}
4549 		break;
4550 	default:
4551 		err = -EINVAL;
4552 		break;
4553 	}
4554 
4555 	if (!err) {
4556 		if (mddev->hold_active == UNTIL_IOCTL)
4557 			mddev->hold_active = 0;
4558 		sysfs_notify_dirent_safe(mddev->sysfs_state);
4559 	}
4560 	mddev_unlock(mddev);
4561 
4562 	if (st == readonly || st == read_auto || st == inactive ||
4563 	    (err && st == clear))
4564 		clear_bit(MD_CLOSING, &mddev->flags);
4565 
4566 	return err ?: len;
4567 }
4568 static struct md_sysfs_entry md_array_state =
4569 __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
4570 
4571 static ssize_t
max_corrected_read_errors_show(struct mddev * mddev,char * page)4572 max_corrected_read_errors_show(struct mddev *mddev, char *page) {
4573 	return sprintf(page, "%d\n",
4574 		       atomic_read(&mddev->max_corr_read_errors));
4575 }
4576 
4577 static ssize_t
max_corrected_read_errors_store(struct mddev * mddev,const char * buf,size_t len)4578 max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
4579 {
4580 	unsigned int n;
4581 	int rv;
4582 
4583 	rv = kstrtouint(buf, 10, &n);
4584 	if (rv < 0)
4585 		return rv;
4586 	if (n > INT_MAX)
4587 		return -EINVAL;
4588 	atomic_set(&mddev->max_corr_read_errors, n);
4589 	return len;
4590 }
4591 
4592 static struct md_sysfs_entry max_corr_read_errors =
4593 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
4594 	max_corrected_read_errors_store);
4595 
4596 static ssize_t
null_show(struct mddev * mddev,char * page)4597 null_show(struct mddev *mddev, char *page)
4598 {
4599 	return -EINVAL;
4600 }
4601 
4602 static ssize_t
new_dev_store(struct mddev * mddev,const char * buf,size_t len)4603 new_dev_store(struct mddev *mddev, const char *buf, size_t len)
4604 {
4605 	/* buf must be %d:%d\n? giving major and minor numbers */
4606 	/* The new device is added to the array.
4607 	 * If the array has a persistent superblock, we read the
4608 	 * superblock to initialise info and check validity.
4609 	 * Otherwise, only checking done is that in bind_rdev_to_array,
4610 	 * which mainly checks size.
4611 	 */
4612 	char *e;
4613 	int major = simple_strtoul(buf, &e, 10);
4614 	int minor;
4615 	dev_t dev;
4616 	struct md_rdev *rdev;
4617 	int err;
4618 
4619 	if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
4620 		return -EINVAL;
4621 	minor = simple_strtoul(e+1, &e, 10);
4622 	if (*e && *e != '\n')
4623 		return -EINVAL;
4624 	dev = MKDEV(major, minor);
4625 	if (major != MAJOR(dev) ||
4626 	    minor != MINOR(dev))
4627 		return -EOVERFLOW;
4628 
4629 	err = mddev_suspend_and_lock(mddev);
4630 	if (err)
4631 		return err;
4632 	if (mddev->persistent) {
4633 		rdev = md_import_device(dev, mddev->major_version,
4634 					mddev->minor_version);
4635 		if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
4636 			struct md_rdev *rdev0
4637 				= list_entry(mddev->disks.next,
4638 					     struct md_rdev, same_set);
4639 			err = super_types[mddev->major_version]
4640 				.load_super(rdev, rdev0, mddev->minor_version);
4641 			if (err < 0)
4642 				goto out;
4643 		}
4644 	} else if (mddev->external)
4645 		rdev = md_import_device(dev, -2, -1);
4646 	else
4647 		rdev = md_import_device(dev, -1, -1);
4648 
4649 	if (IS_ERR(rdev)) {
4650 		mddev_unlock_and_resume(mddev);
4651 		return PTR_ERR(rdev);
4652 	}
4653 	err = bind_rdev_to_array(rdev, mddev);
4654  out:
4655 	if (err)
4656 		export_rdev(rdev, mddev);
4657 	mddev_unlock_and_resume(mddev);
4658 	if (!err)
4659 		md_new_event();
4660 	return err ? err : len;
4661 }
4662 
4663 static struct md_sysfs_entry md_new_device =
4664 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
4665 
4666 static ssize_t
bitmap_store(struct mddev * mddev,const char * buf,size_t len)4667 bitmap_store(struct mddev *mddev, const char *buf, size_t len)
4668 {
4669 	char *end;
4670 	unsigned long chunk, end_chunk;
4671 	int err;
4672 
4673 	err = mddev_lock(mddev);
4674 	if (err)
4675 		return err;
4676 	if (!mddev->bitmap)
4677 		goto out;
4678 	/* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
4679 	while (*buf) {
4680 		chunk = end_chunk = simple_strtoul(buf, &end, 0);
4681 		if (buf == end)
4682 			break;
4683 
4684 		if (*end == '-') { /* range */
4685 			buf = end + 1;
4686 			end_chunk = simple_strtoul(buf, &end, 0);
4687 			if (buf == end)
4688 				break;
4689 		}
4690 
4691 		if (*end && !isspace(*end))
4692 			break;
4693 
4694 		mddev->bitmap_ops->dirty_bits(mddev, chunk, end_chunk);
4695 		buf = skip_spaces(end);
4696 	}
4697 	mddev->bitmap_ops->unplug(mddev, true); /* flush the bits to disk */
4698 out:
4699 	mddev_unlock(mddev);
4700 	return len;
4701 }
4702 
4703 static struct md_sysfs_entry md_bitmap =
4704 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
4705 
4706 static ssize_t
size_show(struct mddev * mddev,char * page)4707 size_show(struct mddev *mddev, char *page)
4708 {
4709 	return sprintf(page, "%llu\n",
4710 		(unsigned long long)mddev->dev_sectors / 2);
4711 }
4712 
4713 static int update_size(struct mddev *mddev, sector_t num_sectors);
4714 
4715 static ssize_t
size_store(struct mddev * mddev,const char * buf,size_t len)4716 size_store(struct mddev *mddev, const char *buf, size_t len)
4717 {
4718 	/* If array is inactive, we can reduce the component size, but
4719 	 * not increase it (except from 0).
4720 	 * If array is active, we can try an on-line resize
4721 	 */
4722 	sector_t sectors;
4723 	int err = strict_blocks_to_sectors(buf, &sectors);
4724 
4725 	if (err < 0)
4726 		return err;
4727 	err = mddev_lock(mddev);
4728 	if (err)
4729 		return err;
4730 	if (mddev->pers) {
4731 		err = update_size(mddev, sectors);
4732 		if (err == 0)
4733 			md_update_sb(mddev, 1);
4734 	} else {
4735 		if (mddev->dev_sectors == 0 ||
4736 		    mddev->dev_sectors > sectors)
4737 			mddev->dev_sectors = sectors;
4738 		else
4739 			err = -ENOSPC;
4740 	}
4741 	mddev_unlock(mddev);
4742 	return err ? err : len;
4743 }
4744 
4745 static struct md_sysfs_entry md_size =
4746 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
4747 
4748 /* Metadata version.
4749  * This is one of
4750  *   'none' for arrays with no metadata (good luck...)
4751  *   'external' for arrays with externally managed metadata,
4752  * or N.M for internally known formats
4753  */
4754 static ssize_t
metadata_show(struct mddev * mddev,char * page)4755 metadata_show(struct mddev *mddev, char *page)
4756 {
4757 	if (mddev->persistent)
4758 		return sprintf(page, "%d.%d\n",
4759 			       mddev->major_version, mddev->minor_version);
4760 	else if (mddev->external)
4761 		return sprintf(page, "external:%s\n", mddev->metadata_type);
4762 	else
4763 		return sprintf(page, "none\n");
4764 }
4765 
4766 static ssize_t
metadata_store(struct mddev * mddev,const char * buf,size_t len)4767 metadata_store(struct mddev *mddev, const char *buf, size_t len)
4768 {
4769 	int major, minor;
4770 	char *e;
4771 	int err;
4772 	/* Changing the details of 'external' metadata is
4773 	 * always permitted.  Otherwise there must be
4774 	 * no devices attached to the array.
4775 	 */
4776 
4777 	err = mddev_lock(mddev);
4778 	if (err)
4779 		return err;
4780 	err = -EBUSY;
4781 	if (mddev->external && strncmp(buf, "external:", 9) == 0)
4782 		;
4783 	else if (!list_empty(&mddev->disks))
4784 		goto out_unlock;
4785 
4786 	err = 0;
4787 	if (cmd_match(buf, "none")) {
4788 		mddev->persistent = 0;
4789 		mddev->external = 0;
4790 		mddev->major_version = 0;
4791 		mddev->minor_version = 90;
4792 		goto out_unlock;
4793 	}
4794 	if (strncmp(buf, "external:", 9) == 0) {
4795 		size_t namelen = len-9;
4796 		if (namelen >= sizeof(mddev->metadata_type))
4797 			namelen = sizeof(mddev->metadata_type)-1;
4798 		memcpy(mddev->metadata_type, buf+9, namelen);
4799 		mddev->metadata_type[namelen] = 0;
4800 		if (namelen && mddev->metadata_type[namelen-1] == '\n')
4801 			mddev->metadata_type[--namelen] = 0;
4802 		mddev->persistent = 0;
4803 		mddev->external = 1;
4804 		mddev->major_version = 0;
4805 		mddev->minor_version = 90;
4806 		goto out_unlock;
4807 	}
4808 	major = simple_strtoul(buf, &e, 10);
4809 	err = -EINVAL;
4810 	if (e==buf || *e != '.')
4811 		goto out_unlock;
4812 	buf = e+1;
4813 	minor = simple_strtoul(buf, &e, 10);
4814 	if (e==buf || (*e && *e != '\n') )
4815 		goto out_unlock;
4816 	err = -ENOENT;
4817 	if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
4818 		goto out_unlock;
4819 	mddev->major_version = major;
4820 	mddev->minor_version = minor;
4821 	mddev->persistent = 1;
4822 	mddev->external = 0;
4823 	err = 0;
4824 out_unlock:
4825 	mddev_unlock(mddev);
4826 	return err ?: len;
4827 }
4828 
4829 static struct md_sysfs_entry md_metadata =
4830 __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
4831 
md_sync_action(struct mddev * mddev)4832 enum sync_action md_sync_action(struct mddev *mddev)
4833 {
4834 	unsigned long recovery = mddev->recovery;
4835 
4836 	/*
4837 	 * frozen has the highest priority, means running sync_thread will be
4838 	 * stopped immediately, and no new sync_thread can start.
4839 	 */
4840 	if (test_bit(MD_RECOVERY_FROZEN, &recovery))
4841 		return ACTION_FROZEN;
4842 
4843 	/*
4844 	 * read-only array can't register sync_thread, and it can only
4845 	 * add/remove spares.
4846 	 */
4847 	if (!md_is_rdwr(mddev))
4848 		return ACTION_IDLE;
4849 
4850 	/*
4851 	 * idle means no sync_thread is running, and no new sync_thread is
4852 	 * requested.
4853 	 */
4854 	if (!test_bit(MD_RECOVERY_RUNNING, &recovery) &&
4855 	    !test_bit(MD_RECOVERY_NEEDED, &recovery))
4856 		return ACTION_IDLE;
4857 
4858 	if (test_bit(MD_RECOVERY_RESHAPE, &recovery) ||
4859 	    mddev->reshape_position != MaxSector)
4860 		return ACTION_RESHAPE;
4861 
4862 	if (test_bit(MD_RECOVERY_RECOVER, &recovery))
4863 		return ACTION_RECOVER;
4864 
4865 	if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
4866 		/*
4867 		 * MD_RECOVERY_CHECK must be paired with
4868 		 * MD_RECOVERY_REQUESTED.
4869 		 */
4870 		if (test_bit(MD_RECOVERY_CHECK, &recovery))
4871 			return ACTION_CHECK;
4872 		if (test_bit(MD_RECOVERY_REQUESTED, &recovery))
4873 			return ACTION_REPAIR;
4874 		return ACTION_RESYNC;
4875 	}
4876 
4877 	/*
4878 	 * MD_RECOVERY_NEEDED or MD_RECOVERY_RUNNING is set, however, no
4879 	 * sync_action is specified.
4880 	 */
4881 	return ACTION_IDLE;
4882 }
4883 
md_sync_action_by_name(const char * page)4884 enum sync_action md_sync_action_by_name(const char *page)
4885 {
4886 	enum sync_action action;
4887 
4888 	for (action = 0; action < NR_SYNC_ACTIONS; ++action) {
4889 		if (cmd_match(page, action_name[action]))
4890 			return action;
4891 	}
4892 
4893 	return NR_SYNC_ACTIONS;
4894 }
4895 
md_sync_action_name(enum sync_action action)4896 const char *md_sync_action_name(enum sync_action action)
4897 {
4898 	return action_name[action];
4899 }
4900 
4901 static ssize_t
action_show(struct mddev * mddev,char * page)4902 action_show(struct mddev *mddev, char *page)
4903 {
4904 	enum sync_action action = md_sync_action(mddev);
4905 
4906 	return sprintf(page, "%s\n", md_sync_action_name(action));
4907 }
4908 
4909 /**
4910  * stop_sync_thread() - wait for sync_thread to stop if it's running.
4911  * @mddev:	the array.
4912  * @locked:	if set, reconfig_mutex will still be held after this function
4913  *		return; if not set, reconfig_mutex will be released after this
4914  *		function return.
4915  */
stop_sync_thread(struct mddev * mddev,bool locked)4916 static void stop_sync_thread(struct mddev *mddev, bool locked)
4917 {
4918 	int sync_seq = atomic_read(&mddev->sync_seq);
4919 
4920 	if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
4921 		if (!locked)
4922 			mddev_unlock(mddev);
4923 		return;
4924 	}
4925 
4926 	mddev_unlock(mddev);
4927 
4928 	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4929 	/*
4930 	 * Thread might be blocked waiting for metadata update which will now
4931 	 * never happen
4932 	 */
4933 	md_wakeup_thread_directly(mddev->sync_thread);
4934 	if (work_pending(&mddev->sync_work))
4935 		flush_work(&mddev->sync_work);
4936 
4937 	wait_event(resync_wait,
4938 		   !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
4939 		   (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery) &&
4940 		    sync_seq != atomic_read(&mddev->sync_seq)));
4941 
4942 	if (locked)
4943 		mddev_lock_nointr(mddev);
4944 }
4945 
md_idle_sync_thread(struct mddev * mddev)4946 void md_idle_sync_thread(struct mddev *mddev)
4947 {
4948 	lockdep_assert_held(&mddev->reconfig_mutex);
4949 
4950 	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4951 	stop_sync_thread(mddev, true);
4952 }
4953 EXPORT_SYMBOL_GPL(md_idle_sync_thread);
4954 
md_frozen_sync_thread(struct mddev * mddev)4955 void md_frozen_sync_thread(struct mddev *mddev)
4956 {
4957 	lockdep_assert_held(&mddev->reconfig_mutex);
4958 
4959 	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4960 	stop_sync_thread(mddev, true);
4961 }
4962 EXPORT_SYMBOL_GPL(md_frozen_sync_thread);
4963 
md_unfrozen_sync_thread(struct mddev * mddev)4964 void md_unfrozen_sync_thread(struct mddev *mddev)
4965 {
4966 	lockdep_assert_held(&mddev->reconfig_mutex);
4967 
4968 	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4969 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4970 	md_wakeup_thread(mddev->thread);
4971 	sysfs_notify_dirent_safe(mddev->sysfs_action);
4972 }
4973 EXPORT_SYMBOL_GPL(md_unfrozen_sync_thread);
4974 
mddev_start_reshape(struct mddev * mddev)4975 static int mddev_start_reshape(struct mddev *mddev)
4976 {
4977 	int ret;
4978 
4979 	if (mddev->pers->start_reshape == NULL)
4980 		return -EINVAL;
4981 
4982 	if (mddev->reshape_position == MaxSector ||
4983 	    mddev->pers->check_reshape == NULL ||
4984 	    mddev->pers->check_reshape(mddev)) {
4985 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4986 		ret = mddev->pers->start_reshape(mddev);
4987 		if (ret)
4988 			return ret;
4989 	} else {
4990 		/*
4991 		 * If reshape is still in progress, and md_check_recovery() can
4992 		 * continue to reshape, don't restart reshape because data can
4993 		 * be corrupted for raid456.
4994 		 */
4995 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4996 	}
4997 
4998 	sysfs_notify_dirent_safe(mddev->sysfs_degraded);
4999 	return 0;
5000 }
5001 
5002 static ssize_t
action_store(struct mddev * mddev,const char * page,size_t len)5003 action_store(struct mddev *mddev, const char *page, size_t len)
5004 {
5005 	int ret;
5006 	enum sync_action action;
5007 
5008 	if (!mddev->pers || !mddev->pers->sync_request)
5009 		return -EINVAL;
5010 
5011 retry:
5012 	if (work_busy(&mddev->sync_work))
5013 		flush_work(&mddev->sync_work);
5014 
5015 	ret = mddev_lock(mddev);
5016 	if (ret)
5017 		return ret;
5018 
5019 	if (work_busy(&mddev->sync_work)) {
5020 		mddev_unlock(mddev);
5021 		goto retry;
5022 	}
5023 
5024 	action = md_sync_action_by_name(page);
5025 
5026 	/* TODO: mdadm rely on "idle" to start sync_thread. */
5027 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
5028 		switch (action) {
5029 		case ACTION_FROZEN:
5030 			md_frozen_sync_thread(mddev);
5031 			ret = len;
5032 			goto out;
5033 		case ACTION_IDLE:
5034 			md_idle_sync_thread(mddev);
5035 			break;
5036 		case ACTION_RESHAPE:
5037 		case ACTION_RECOVER:
5038 		case ACTION_CHECK:
5039 		case ACTION_REPAIR:
5040 		case ACTION_RESYNC:
5041 			ret = -EBUSY;
5042 			goto out;
5043 		default:
5044 			ret = -EINVAL;
5045 			goto out;
5046 		}
5047 	} else {
5048 		switch (action) {
5049 		case ACTION_FROZEN:
5050 			set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5051 			ret = len;
5052 			goto out;
5053 		case ACTION_RESHAPE:
5054 			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5055 			ret = mddev_start_reshape(mddev);
5056 			if (ret)
5057 				goto out;
5058 			break;
5059 		case ACTION_RECOVER:
5060 			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5061 			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5062 			break;
5063 		case ACTION_CHECK:
5064 			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
5065 			fallthrough;
5066 		case ACTION_REPAIR:
5067 			set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
5068 			set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5069 			fallthrough;
5070 		case ACTION_RESYNC:
5071 		case ACTION_IDLE:
5072 			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5073 			break;
5074 		default:
5075 			ret = -EINVAL;
5076 			goto out;
5077 		}
5078 	}
5079 
5080 	if (mddev->ro == MD_AUTO_READ) {
5081 		/* A write to sync_action is enough to justify
5082 		 * canceling read-auto mode
5083 		 */
5084 		mddev->ro = MD_RDWR;
5085 		md_wakeup_thread(mddev->sync_thread);
5086 	}
5087 
5088 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5089 	md_wakeup_thread(mddev->thread);
5090 	sysfs_notify_dirent_safe(mddev->sysfs_action);
5091 	ret = len;
5092 
5093 out:
5094 	mddev_unlock(mddev);
5095 	return ret;
5096 }
5097 
5098 static struct md_sysfs_entry md_scan_mode =
5099 __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
5100 
5101 static ssize_t
last_sync_action_show(struct mddev * mddev,char * page)5102 last_sync_action_show(struct mddev *mddev, char *page)
5103 {
5104 	return sprintf(page, "%s\n",
5105 		       md_sync_action_name(mddev->last_sync_action));
5106 }
5107 
5108 static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action);
5109 
5110 static ssize_t
mismatch_cnt_show(struct mddev * mddev,char * page)5111 mismatch_cnt_show(struct mddev *mddev, char *page)
5112 {
5113 	return sprintf(page, "%llu\n",
5114 		       (unsigned long long)
5115 		       atomic64_read(&mddev->resync_mismatches));
5116 }
5117 
5118 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
5119 
5120 static ssize_t
sync_min_show(struct mddev * mddev,char * page)5121 sync_min_show(struct mddev *mddev, char *page)
5122 {
5123 	return sprintf(page, "%d (%s)\n", speed_min(mddev),
5124 		       mddev->sync_speed_min ? "local" : "system");
5125 }
5126 
5127 static ssize_t
sync_min_store(struct mddev * mddev,const char * buf,size_t len)5128 sync_min_store(struct mddev *mddev, const char *buf, size_t len)
5129 {
5130 	unsigned int min;
5131 	int rv;
5132 
5133 	if (strncmp(buf, "system", 6) == 0) {
5134 		min = 0;
5135 	} else {
5136 		rv = kstrtouint(buf, 10, &min);
5137 		if (rv < 0)
5138 			return rv;
5139 		if (min == 0)
5140 			return -EINVAL;
5141 	}
5142 	mddev->sync_speed_min = min;
5143 	return len;
5144 }
5145 
5146 static struct md_sysfs_entry md_sync_min =
5147 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
5148 
5149 static ssize_t
sync_max_show(struct mddev * mddev,char * page)5150 sync_max_show(struct mddev *mddev, char *page)
5151 {
5152 	return sprintf(page, "%d (%s)\n", speed_max(mddev),
5153 		       mddev->sync_speed_max ? "local" : "system");
5154 }
5155 
5156 static ssize_t
sync_max_store(struct mddev * mddev,const char * buf,size_t len)5157 sync_max_store(struct mddev *mddev, const char *buf, size_t len)
5158 {
5159 	unsigned int max;
5160 	int rv;
5161 
5162 	if (strncmp(buf, "system", 6) == 0) {
5163 		max = 0;
5164 	} else {
5165 		rv = kstrtouint(buf, 10, &max);
5166 		if (rv < 0)
5167 			return rv;
5168 		if (max == 0)
5169 			return -EINVAL;
5170 	}
5171 	mddev->sync_speed_max = max;
5172 	return len;
5173 }
5174 
5175 static struct md_sysfs_entry md_sync_max =
5176 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
5177 
5178 static ssize_t
sync_io_depth_show(struct mddev * mddev,char * page)5179 sync_io_depth_show(struct mddev *mddev, char *page)
5180 {
5181 	return sprintf(page, "%d (%s)\n", sync_io_depth(mddev),
5182 		       mddev->sync_io_depth ? "local" : "system");
5183 }
5184 
5185 static ssize_t
sync_io_depth_store(struct mddev * mddev,const char * buf,size_t len)5186 sync_io_depth_store(struct mddev *mddev, const char *buf, size_t len)
5187 {
5188 	unsigned int max;
5189 	int rv;
5190 
5191 	if (strncmp(buf, "system", 6) == 0) {
5192 		max = 0;
5193 	} else {
5194 		rv = kstrtouint(buf, 10, &max);
5195 		if (rv < 0)
5196 			return rv;
5197 		if (max == 0)
5198 			return -EINVAL;
5199 	}
5200 	mddev->sync_io_depth = max;
5201 	return len;
5202 }
5203 
5204 static struct md_sysfs_entry md_sync_io_depth =
5205 __ATTR_RW(sync_io_depth);
5206 
5207 static ssize_t
degraded_show(struct mddev * mddev,char * page)5208 degraded_show(struct mddev *mddev, char *page)
5209 {
5210 	return sprintf(page, "%d\n", mddev->degraded);
5211 }
5212 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
5213 
5214 static ssize_t
sync_force_parallel_show(struct mddev * mddev,char * page)5215 sync_force_parallel_show(struct mddev *mddev, char *page)
5216 {
5217 	return sprintf(page, "%d\n", mddev->parallel_resync);
5218 }
5219 
5220 static ssize_t
sync_force_parallel_store(struct mddev * mddev,const char * buf,size_t len)5221 sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
5222 {
5223 	long n;
5224 
5225 	if (kstrtol(buf, 10, &n))
5226 		return -EINVAL;
5227 
5228 	if (n != 0 && n != 1)
5229 		return -EINVAL;
5230 
5231 	mddev->parallel_resync = n;
5232 
5233 	if (mddev->sync_thread)
5234 		wake_up(&resync_wait);
5235 
5236 	return len;
5237 }
5238 
5239 /* force parallel resync, even with shared block devices */
5240 static struct md_sysfs_entry md_sync_force_parallel =
5241 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
5242        sync_force_parallel_show, sync_force_parallel_store);
5243 
5244 static ssize_t
sync_speed_show(struct mddev * mddev,char * page)5245 sync_speed_show(struct mddev *mddev, char *page)
5246 {
5247 	unsigned long resync, dt, db;
5248 	if (mddev->curr_resync == MD_RESYNC_NONE)
5249 		return sprintf(page, "none\n");
5250 	resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
5251 	dt = (jiffies - mddev->resync_mark) / HZ;
5252 	if (!dt) dt++;
5253 	db = resync - mddev->resync_mark_cnt;
5254 	return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
5255 }
5256 
5257 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
5258 
5259 static ssize_t
sync_completed_show(struct mddev * mddev,char * page)5260 sync_completed_show(struct mddev *mddev, char *page)
5261 {
5262 	unsigned long long max_sectors, resync;
5263 
5264 	if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5265 		return sprintf(page, "none\n");
5266 
5267 	if (mddev->curr_resync == MD_RESYNC_YIELDED ||
5268 	    mddev->curr_resync == MD_RESYNC_DELAYED)
5269 		return sprintf(page, "delayed\n");
5270 
5271 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
5272 	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
5273 		max_sectors = mddev->resync_max_sectors;
5274 	else
5275 		max_sectors = mddev->dev_sectors;
5276 
5277 	resync = mddev->curr_resync_completed;
5278 	return sprintf(page, "%llu / %llu\n", resync, max_sectors);
5279 }
5280 
5281 static struct md_sysfs_entry md_sync_completed =
5282 	__ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL);
5283 
5284 static ssize_t
min_sync_show(struct mddev * mddev,char * page)5285 min_sync_show(struct mddev *mddev, char *page)
5286 {
5287 	return sprintf(page, "%llu\n",
5288 		       (unsigned long long)mddev->resync_min);
5289 }
5290 static ssize_t
min_sync_store(struct mddev * mddev,const char * buf,size_t len)5291 min_sync_store(struct mddev *mddev, const char *buf, size_t len)
5292 {
5293 	unsigned long long min;
5294 	int err;
5295 
5296 	if (kstrtoull(buf, 10, &min))
5297 		return -EINVAL;
5298 
5299 	spin_lock(&mddev->lock);
5300 	err = -EINVAL;
5301 	if (min > mddev->resync_max)
5302 		goto out_unlock;
5303 
5304 	err = -EBUSY;
5305 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5306 		goto out_unlock;
5307 
5308 	/* Round down to multiple of 4K for safety */
5309 	mddev->resync_min = round_down(min, 8);
5310 	err = 0;
5311 
5312 out_unlock:
5313 	spin_unlock(&mddev->lock);
5314 	return err ?: len;
5315 }
5316 
5317 static struct md_sysfs_entry md_min_sync =
5318 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
5319 
5320 static ssize_t
max_sync_show(struct mddev * mddev,char * page)5321 max_sync_show(struct mddev *mddev, char *page)
5322 {
5323 	if (mddev->resync_max == MaxSector)
5324 		return sprintf(page, "max\n");
5325 	else
5326 		return sprintf(page, "%llu\n",
5327 			       (unsigned long long)mddev->resync_max);
5328 }
5329 static ssize_t
max_sync_store(struct mddev * mddev,const char * buf,size_t len)5330 max_sync_store(struct mddev *mddev, const char *buf, size_t len)
5331 {
5332 	int err;
5333 	spin_lock(&mddev->lock);
5334 	if (strncmp(buf, "max", 3) == 0)
5335 		mddev->resync_max = MaxSector;
5336 	else {
5337 		unsigned long long max;
5338 		int chunk;
5339 
5340 		err = -EINVAL;
5341 		if (kstrtoull(buf, 10, &max))
5342 			goto out_unlock;
5343 		if (max < mddev->resync_min)
5344 			goto out_unlock;
5345 
5346 		err = -EBUSY;
5347 		if (max < mddev->resync_max && md_is_rdwr(mddev) &&
5348 		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5349 			goto out_unlock;
5350 
5351 		/* Must be a multiple of chunk_size */
5352 		chunk = mddev->chunk_sectors;
5353 		if (chunk) {
5354 			sector_t temp = max;
5355 
5356 			err = -EINVAL;
5357 			if (sector_div(temp, chunk))
5358 				goto out_unlock;
5359 		}
5360 		mddev->resync_max = max;
5361 	}
5362 	wake_up(&mddev->recovery_wait);
5363 	err = 0;
5364 out_unlock:
5365 	spin_unlock(&mddev->lock);
5366 	return err ?: len;
5367 }
5368 
5369 static struct md_sysfs_entry md_max_sync =
5370 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
5371 
5372 static ssize_t
suspend_lo_show(struct mddev * mddev,char * page)5373 suspend_lo_show(struct mddev *mddev, char *page)
5374 {
5375 	return sprintf(page, "%llu\n",
5376 		       (unsigned long long)READ_ONCE(mddev->suspend_lo));
5377 }
5378 
5379 static ssize_t
suspend_lo_store(struct mddev * mddev,const char * buf,size_t len)5380 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
5381 {
5382 	unsigned long long new;
5383 	int err;
5384 
5385 	err = kstrtoull(buf, 10, &new);
5386 	if (err < 0)
5387 		return err;
5388 	if (new != (sector_t)new)
5389 		return -EINVAL;
5390 
5391 	err = mddev_suspend(mddev, true);
5392 	if (err)
5393 		return err;
5394 
5395 	WRITE_ONCE(mddev->suspend_lo, new);
5396 	mddev_resume(mddev);
5397 
5398 	return len;
5399 }
5400 static struct md_sysfs_entry md_suspend_lo =
5401 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
5402 
5403 static ssize_t
suspend_hi_show(struct mddev * mddev,char * page)5404 suspend_hi_show(struct mddev *mddev, char *page)
5405 {
5406 	return sprintf(page, "%llu\n",
5407 		       (unsigned long long)READ_ONCE(mddev->suspend_hi));
5408 }
5409 
5410 static ssize_t
suspend_hi_store(struct mddev * mddev,const char * buf,size_t len)5411 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
5412 {
5413 	unsigned long long new;
5414 	int err;
5415 
5416 	err = kstrtoull(buf, 10, &new);
5417 	if (err < 0)
5418 		return err;
5419 	if (new != (sector_t)new)
5420 		return -EINVAL;
5421 
5422 	err = mddev_suspend(mddev, true);
5423 	if (err)
5424 		return err;
5425 
5426 	WRITE_ONCE(mddev->suspend_hi, new);
5427 	mddev_resume(mddev);
5428 
5429 	return len;
5430 }
5431 static struct md_sysfs_entry md_suspend_hi =
5432 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
5433 
5434 static ssize_t
reshape_position_show(struct mddev * mddev,char * page)5435 reshape_position_show(struct mddev *mddev, char *page)
5436 {
5437 	if (mddev->reshape_position != MaxSector)
5438 		return sprintf(page, "%llu\n",
5439 			       (unsigned long long)mddev->reshape_position);
5440 	strcpy(page, "none\n");
5441 	return 5;
5442 }
5443 
5444 static ssize_t
reshape_position_store(struct mddev * mddev,const char * buf,size_t len)5445 reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
5446 {
5447 	struct md_rdev *rdev;
5448 	unsigned long long new;
5449 	int err;
5450 
5451 	err = kstrtoull(buf, 10, &new);
5452 	if (err < 0)
5453 		return err;
5454 	if (new != (sector_t)new)
5455 		return -EINVAL;
5456 	err = mddev_lock(mddev);
5457 	if (err)
5458 		return err;
5459 	err = -EBUSY;
5460 	if (mddev->pers)
5461 		goto unlock;
5462 	mddev->reshape_position = new;
5463 	mddev->delta_disks = 0;
5464 	mddev->reshape_backwards = 0;
5465 	mddev->new_level = mddev->level;
5466 	mddev->new_layout = mddev->layout;
5467 	mddev->new_chunk_sectors = mddev->chunk_sectors;
5468 	rdev_for_each(rdev, mddev)
5469 		rdev->new_data_offset = rdev->data_offset;
5470 	err = 0;
5471 unlock:
5472 	mddev_unlock(mddev);
5473 	return err ?: len;
5474 }
5475 
5476 static struct md_sysfs_entry md_reshape_position =
5477 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
5478        reshape_position_store);
5479 
5480 static ssize_t
reshape_direction_show(struct mddev * mddev,char * page)5481 reshape_direction_show(struct mddev *mddev, char *page)
5482 {
5483 	return sprintf(page, "%s\n",
5484 		       mddev->reshape_backwards ? "backwards" : "forwards");
5485 }
5486 
5487 static ssize_t
reshape_direction_store(struct mddev * mddev,const char * buf,size_t len)5488 reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
5489 {
5490 	int backwards = 0;
5491 	int err;
5492 
5493 	if (cmd_match(buf, "forwards"))
5494 		backwards = 0;
5495 	else if (cmd_match(buf, "backwards"))
5496 		backwards = 1;
5497 	else
5498 		return -EINVAL;
5499 	if (mddev->reshape_backwards == backwards)
5500 		return len;
5501 
5502 	err = mddev_lock(mddev);
5503 	if (err)
5504 		return err;
5505 	/* check if we are allowed to change */
5506 	if (mddev->delta_disks)
5507 		err = -EBUSY;
5508 	else if (mddev->persistent &&
5509 	    mddev->major_version == 0)
5510 		err =  -EINVAL;
5511 	else
5512 		mddev->reshape_backwards = backwards;
5513 	mddev_unlock(mddev);
5514 	return err ?: len;
5515 }
5516 
5517 static struct md_sysfs_entry md_reshape_direction =
5518 __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
5519        reshape_direction_store);
5520 
5521 static ssize_t
array_size_show(struct mddev * mddev,char * page)5522 array_size_show(struct mddev *mddev, char *page)
5523 {
5524 	if (mddev->external_size)
5525 		return sprintf(page, "%llu\n",
5526 			       (unsigned long long)mddev->array_sectors/2);
5527 	else
5528 		return sprintf(page, "default\n");
5529 }
5530 
5531 static ssize_t
array_size_store(struct mddev * mddev,const char * buf,size_t len)5532 array_size_store(struct mddev *mddev, const char *buf, size_t len)
5533 {
5534 	sector_t sectors;
5535 	int err;
5536 
5537 	err = mddev_lock(mddev);
5538 	if (err)
5539 		return err;
5540 
5541 	/* cluster raid doesn't support change array_sectors */
5542 	if (mddev_is_clustered(mddev)) {
5543 		mddev_unlock(mddev);
5544 		return -EINVAL;
5545 	}
5546 
5547 	if (strncmp(buf, "default", 7) == 0) {
5548 		if (mddev->pers)
5549 			sectors = mddev->pers->size(mddev, 0, 0);
5550 		else
5551 			sectors = mddev->array_sectors;
5552 
5553 		mddev->external_size = 0;
5554 	} else {
5555 		if (strict_blocks_to_sectors(buf, &sectors) < 0)
5556 			err = -EINVAL;
5557 		else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
5558 			err = -E2BIG;
5559 		else
5560 			mddev->external_size = 1;
5561 	}
5562 
5563 	if (!err) {
5564 		mddev->array_sectors = sectors;
5565 		if (mddev->pers)
5566 			set_capacity_and_notify(mddev->gendisk,
5567 						mddev->array_sectors);
5568 	}
5569 	mddev_unlock(mddev);
5570 	return err ?: len;
5571 }
5572 
5573 static struct md_sysfs_entry md_array_size =
5574 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
5575        array_size_store);
5576 
5577 static ssize_t
consistency_policy_show(struct mddev * mddev,char * page)5578 consistency_policy_show(struct mddev *mddev, char *page)
5579 {
5580 	int ret;
5581 
5582 	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
5583 		ret = sprintf(page, "journal\n");
5584 	} else if (test_bit(MD_HAS_PPL, &mddev->flags)) {
5585 		ret = sprintf(page, "ppl\n");
5586 	} else if (mddev->bitmap) {
5587 		ret = sprintf(page, "bitmap\n");
5588 	} else if (mddev->pers) {
5589 		if (mddev->pers->sync_request)
5590 			ret = sprintf(page, "resync\n");
5591 		else
5592 			ret = sprintf(page, "none\n");
5593 	} else {
5594 		ret = sprintf(page, "unknown\n");
5595 	}
5596 
5597 	return ret;
5598 }
5599 
5600 static ssize_t
consistency_policy_store(struct mddev * mddev,const char * buf,size_t len)5601 consistency_policy_store(struct mddev *mddev, const char *buf, size_t len)
5602 {
5603 	int err = 0;
5604 
5605 	if (mddev->pers) {
5606 		if (mddev->pers->change_consistency_policy)
5607 			err = mddev->pers->change_consistency_policy(mddev, buf);
5608 		else
5609 			err = -EBUSY;
5610 	} else if (mddev->external && strncmp(buf, "ppl", 3) == 0) {
5611 		set_bit(MD_HAS_PPL, &mddev->flags);
5612 	} else {
5613 		err = -EINVAL;
5614 	}
5615 
5616 	return err ? err : len;
5617 }
5618 
5619 static struct md_sysfs_entry md_consistency_policy =
5620 __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
5621        consistency_policy_store);
5622 
fail_last_dev_show(struct mddev * mddev,char * page)5623 static ssize_t fail_last_dev_show(struct mddev *mddev, char *page)
5624 {
5625 	return sprintf(page, "%d\n", mddev->fail_last_dev);
5626 }
5627 
5628 /*
5629  * Setting fail_last_dev to true to allow last device to be forcibly removed
5630  * from RAID1/RAID10.
5631  */
5632 static ssize_t
fail_last_dev_store(struct mddev * mddev,const char * buf,size_t len)5633 fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len)
5634 {
5635 	int ret;
5636 	bool value;
5637 
5638 	ret = kstrtobool(buf, &value);
5639 	if (ret)
5640 		return ret;
5641 
5642 	if (value != mddev->fail_last_dev)
5643 		mddev->fail_last_dev = value;
5644 
5645 	return len;
5646 }
5647 static struct md_sysfs_entry md_fail_last_dev =
5648 __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show,
5649        fail_last_dev_store);
5650 
serialize_policy_show(struct mddev * mddev,char * page)5651 static ssize_t serialize_policy_show(struct mddev *mddev, char *page)
5652 {
5653 	if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1))
5654 		return sprintf(page, "n/a\n");
5655 	else
5656 		return sprintf(page, "%d\n", mddev->serialize_policy);
5657 }
5658 
5659 /*
5660  * Setting serialize_policy to true to enforce write IO is not reordered
5661  * for raid1.
5662  */
5663 static ssize_t
serialize_policy_store(struct mddev * mddev,const char * buf,size_t len)5664 serialize_policy_store(struct mddev *mddev, const char *buf, size_t len)
5665 {
5666 	int err;
5667 	bool value;
5668 
5669 	err = kstrtobool(buf, &value);
5670 	if (err)
5671 		return err;
5672 
5673 	if (value == mddev->serialize_policy)
5674 		return len;
5675 
5676 	err = mddev_suspend_and_lock(mddev);
5677 	if (err)
5678 		return err;
5679 	if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) {
5680 		pr_err("md: serialize_policy is only effective for raid1\n");
5681 		err = -EINVAL;
5682 		goto unlock;
5683 	}
5684 
5685 	if (value)
5686 		mddev_create_serial_pool(mddev, NULL);
5687 	else
5688 		mddev_destroy_serial_pool(mddev, NULL);
5689 	mddev->serialize_policy = value;
5690 unlock:
5691 	mddev_unlock_and_resume(mddev);
5692 	return err ?: len;
5693 }
5694 
5695 static struct md_sysfs_entry md_serialize_policy =
5696 __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show,
5697        serialize_policy_store);
5698 
5699 
5700 static struct attribute *md_default_attrs[] = {
5701 	&md_level.attr,
5702 	&md_new_level.attr,
5703 	&md_layout.attr,
5704 	&md_raid_disks.attr,
5705 	&md_uuid.attr,
5706 	&md_chunk_size.attr,
5707 	&md_size.attr,
5708 	&md_resync_start.attr,
5709 	&md_metadata.attr,
5710 	&md_new_device.attr,
5711 	&md_safe_delay.attr,
5712 	&md_array_state.attr,
5713 	&md_reshape_position.attr,
5714 	&md_reshape_direction.attr,
5715 	&md_array_size.attr,
5716 	&max_corr_read_errors.attr,
5717 	&md_consistency_policy.attr,
5718 	&md_fail_last_dev.attr,
5719 	&md_serialize_policy.attr,
5720 	NULL,
5721 };
5722 
5723 static const struct attribute_group md_default_group = {
5724 	.attrs = md_default_attrs,
5725 };
5726 
5727 static struct attribute *md_redundancy_attrs[] = {
5728 	&md_scan_mode.attr,
5729 	&md_last_scan_mode.attr,
5730 	&md_mismatches.attr,
5731 	&md_sync_min.attr,
5732 	&md_sync_max.attr,
5733 	&md_sync_io_depth.attr,
5734 	&md_sync_speed.attr,
5735 	&md_sync_force_parallel.attr,
5736 	&md_sync_completed.attr,
5737 	&md_min_sync.attr,
5738 	&md_max_sync.attr,
5739 	&md_suspend_lo.attr,
5740 	&md_suspend_hi.attr,
5741 	&md_bitmap.attr,
5742 	&md_degraded.attr,
5743 	NULL,
5744 };
5745 static const struct attribute_group md_redundancy_group = {
5746 	.name = NULL,
5747 	.attrs = md_redundancy_attrs,
5748 };
5749 
5750 static const struct attribute_group *md_attr_groups[] = {
5751 	&md_default_group,
5752 	&md_bitmap_group,
5753 	NULL,
5754 };
5755 
5756 static ssize_t
md_attr_show(struct kobject * kobj,struct attribute * attr,char * page)5757 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
5758 {
5759 	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
5760 	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
5761 	ssize_t rv;
5762 
5763 	if (!entry->show)
5764 		return -EIO;
5765 	spin_lock(&all_mddevs_lock);
5766 	if (!mddev_get(mddev)) {
5767 		spin_unlock(&all_mddevs_lock);
5768 		return -EBUSY;
5769 	}
5770 	spin_unlock(&all_mddevs_lock);
5771 
5772 	rv = entry->show(mddev, page);
5773 	mddev_put(mddev);
5774 	return rv;
5775 }
5776 
5777 static ssize_t
md_attr_store(struct kobject * kobj,struct attribute * attr,const char * page,size_t length)5778 md_attr_store(struct kobject *kobj, struct attribute *attr,
5779 	      const char *page, size_t length)
5780 {
5781 	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
5782 	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
5783 	ssize_t rv;
5784 	struct kernfs_node *kn = NULL;
5785 
5786 	if (!entry->store)
5787 		return -EIO;
5788 	if (!capable(CAP_SYS_ADMIN))
5789 		return -EACCES;
5790 
5791 	if (entry->store == array_state_store && cmd_match(page, "clear"))
5792 		kn = sysfs_break_active_protection(kobj, attr);
5793 
5794 	spin_lock(&all_mddevs_lock);
5795 	if (!mddev_get(mddev)) {
5796 		spin_unlock(&all_mddevs_lock);
5797 		if (kn)
5798 			sysfs_unbreak_active_protection(kn);
5799 		return -EBUSY;
5800 	}
5801 	spin_unlock(&all_mddevs_lock);
5802 	rv = entry->store(mddev, page, length);
5803 	mddev_put(mddev);
5804 
5805 	if (kn)
5806 		sysfs_unbreak_active_protection(kn);
5807 
5808 	return rv;
5809 }
5810 
md_kobj_release(struct kobject * ko)5811 static void md_kobj_release(struct kobject *ko)
5812 {
5813 	struct mddev *mddev = container_of(ko, struct mddev, kobj);
5814 
5815 	put_disk(mddev->gendisk);
5816 }
5817 
5818 static const struct sysfs_ops md_sysfs_ops = {
5819 	.show	= md_attr_show,
5820 	.store	= md_attr_store,
5821 };
5822 static const struct kobj_type md_ktype = {
5823 	.release	= md_kobj_release,
5824 	.sysfs_ops	= &md_sysfs_ops,
5825 	.default_groups	= md_attr_groups,
5826 };
5827 
5828 int mdp_major = 0;
5829 
5830 /* stack the limit for all rdevs into lim */
mddev_stack_rdev_limits(struct mddev * mddev,struct queue_limits * lim,unsigned int flags)5831 int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim,
5832 		unsigned int flags)
5833 {
5834 	struct md_rdev *rdev;
5835 
5836 	rdev_for_each(rdev, mddev) {
5837 		queue_limits_stack_bdev(lim, rdev->bdev, rdev->data_offset,
5838 					mddev->gendisk->disk_name);
5839 		if ((flags & MDDEV_STACK_INTEGRITY) &&
5840 		    !queue_limits_stack_integrity_bdev(lim, rdev->bdev))
5841 			return -EINVAL;
5842 	}
5843 
5844 	return 0;
5845 }
5846 EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits);
5847 
5848 /* apply the extra stacking limits from a new rdev into mddev */
mddev_stack_new_rdev(struct mddev * mddev,struct md_rdev * rdev)5849 int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev)
5850 {
5851 	struct queue_limits lim;
5852 
5853 	if (mddev_is_dm(mddev))
5854 		return 0;
5855 
5856 	lim = queue_limits_start_update(mddev->gendisk->queue);
5857 	queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset,
5858 				mddev->gendisk->disk_name);
5859 
5860 	if (!queue_limits_stack_integrity_bdev(&lim, rdev->bdev)) {
5861 		pr_err("%s: incompatible integrity profile for %pg\n",
5862 		       mdname(mddev), rdev->bdev);
5863 		queue_limits_cancel_update(mddev->gendisk->queue);
5864 		return -ENXIO;
5865 	}
5866 
5867 	return queue_limits_commit_update(mddev->gendisk->queue, &lim);
5868 }
5869 EXPORT_SYMBOL_GPL(mddev_stack_new_rdev);
5870 
5871 /* update the optimal I/O size after a reshape */
mddev_update_io_opt(struct mddev * mddev,unsigned int nr_stripes)5872 void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes)
5873 {
5874 	struct queue_limits lim;
5875 
5876 	if (mddev_is_dm(mddev))
5877 		return;
5878 
5879 	/* don't bother updating io_opt if we can't suspend the array */
5880 	if (mddev_suspend(mddev, false) < 0)
5881 		return;
5882 	lim = queue_limits_start_update(mddev->gendisk->queue);
5883 	lim.io_opt = lim.io_min * nr_stripes;
5884 	queue_limits_commit_update(mddev->gendisk->queue, &lim);
5885 	mddev_resume(mddev);
5886 }
5887 EXPORT_SYMBOL_GPL(mddev_update_io_opt);
5888 
mddev_delayed_delete(struct work_struct * ws)5889 static void mddev_delayed_delete(struct work_struct *ws)
5890 {
5891 	struct mddev *mddev = container_of(ws, struct mddev, del_work);
5892 
5893 	kobject_put(&mddev->kobj);
5894 }
5895 
md_init_stacking_limits(struct queue_limits * lim)5896 void md_init_stacking_limits(struct queue_limits *lim)
5897 {
5898 	blk_set_stacking_limits(lim);
5899 	lim->features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA |
5900 			BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT;
5901 }
5902 EXPORT_SYMBOL_GPL(md_init_stacking_limits);
5903 
md_alloc(dev_t dev,char * name)5904 struct mddev *md_alloc(dev_t dev, char *name)
5905 {
5906 	/*
5907 	 * If dev is zero, name is the name of a device to allocate with
5908 	 * an arbitrary minor number.  It will be "md_???"
5909 	 * If dev is non-zero it must be a device number with a MAJOR of
5910 	 * MD_MAJOR or mdp_major.  In this case, if "name" is NULL, then
5911 	 * the device is being created by opening a node in /dev.
5912 	 * If "name" is not NULL, the device is being created by
5913 	 * writing to /sys/module/md_mod/parameters/new_array.
5914 	 */
5915 	static DEFINE_MUTEX(disks_mutex);
5916 	struct mddev *mddev;
5917 	struct gendisk *disk;
5918 	int partitioned;
5919 	int shift;
5920 	int unit;
5921 	int error;
5922 
5923 	/*
5924 	 * Wait for any previous instance of this device to be completely
5925 	 * removed (mddev_delayed_delete).
5926 	 */
5927 	flush_workqueue(md_misc_wq);
5928 
5929 	mutex_lock(&disks_mutex);
5930 	mddev = mddev_alloc(dev);
5931 	if (IS_ERR(mddev)) {
5932 		error = PTR_ERR(mddev);
5933 		goto out_unlock;
5934 	}
5935 
5936 	partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
5937 	shift = partitioned ? MdpMinorShift : 0;
5938 	unit = MINOR(mddev->unit) >> shift;
5939 
5940 	if (name && !dev) {
5941 		/* Need to ensure that 'name' is not a duplicate.
5942 		 */
5943 		struct mddev *mddev2;
5944 		spin_lock(&all_mddevs_lock);
5945 
5946 		list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
5947 			if (mddev2->gendisk &&
5948 			    strcmp(mddev2->gendisk->disk_name, name) == 0) {
5949 				spin_unlock(&all_mddevs_lock);
5950 				error = -EEXIST;
5951 				goto out_free_mddev;
5952 			}
5953 		spin_unlock(&all_mddevs_lock);
5954 	}
5955 	if (name && dev)
5956 		/*
5957 		 * Creating /dev/mdNNN via "newarray", so adjust hold_active.
5958 		 */
5959 		mddev->hold_active = UNTIL_STOP;
5960 
5961 	disk = blk_alloc_disk(NULL, NUMA_NO_NODE);
5962 	if (IS_ERR(disk)) {
5963 		error = PTR_ERR(disk);
5964 		goto out_free_mddev;
5965 	}
5966 
5967 	disk->major = MAJOR(mddev->unit);
5968 	disk->first_minor = unit << shift;
5969 	disk->minors = 1 << shift;
5970 	if (name)
5971 		strcpy(disk->disk_name, name);
5972 	else if (partitioned)
5973 		sprintf(disk->disk_name, "md_d%d", unit);
5974 	else
5975 		sprintf(disk->disk_name, "md%d", unit);
5976 	disk->fops = &md_fops;
5977 	disk->private_data = mddev;
5978 
5979 	disk->events |= DISK_EVENT_MEDIA_CHANGE;
5980 	mddev->gendisk = disk;
5981 	error = add_disk(disk);
5982 	if (error)
5983 		goto out_put_disk;
5984 
5985 	kobject_init(&mddev->kobj, &md_ktype);
5986 	error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md");
5987 	if (error) {
5988 		/*
5989 		 * The disk is already live at this point.  Clear the hold flag
5990 		 * and let mddev_put take care of the deletion, as it isn't any
5991 		 * different from a normal close on last release now.
5992 		 */
5993 		mddev->hold_active = 0;
5994 		mutex_unlock(&disks_mutex);
5995 		mddev_put(mddev);
5996 		return ERR_PTR(error);
5997 	}
5998 
5999 	kobject_uevent(&mddev->kobj, KOBJ_ADD);
6000 	mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
6001 	mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level");
6002 	mutex_unlock(&disks_mutex);
6003 	return mddev;
6004 
6005 out_put_disk:
6006 	put_disk(disk);
6007 out_free_mddev:
6008 	mddev_free(mddev);
6009 out_unlock:
6010 	mutex_unlock(&disks_mutex);
6011 	return ERR_PTR(error);
6012 }
6013 
md_alloc_and_put(dev_t dev,char * name)6014 static int md_alloc_and_put(dev_t dev, char *name)
6015 {
6016 	struct mddev *mddev = md_alloc(dev, name);
6017 
6018 	if (IS_ERR(mddev))
6019 		return PTR_ERR(mddev);
6020 	mddev_put(mddev);
6021 	return 0;
6022 }
6023 
md_probe(dev_t dev)6024 static void md_probe(dev_t dev)
6025 {
6026 	if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512)
6027 		return;
6028 	if (create_on_open)
6029 		md_alloc_and_put(dev, NULL);
6030 }
6031 
add_named_array(const char * val,const struct kernel_param * kp)6032 static int add_named_array(const char *val, const struct kernel_param *kp)
6033 {
6034 	/*
6035 	 * val must be "md_*" or "mdNNN".
6036 	 * For "md_*" we allocate an array with a large free minor number, and
6037 	 * set the name to val.  val must not already be an active name.
6038 	 * For "mdNNN" we allocate an array with the minor number NNN
6039 	 * which must not already be in use.
6040 	 */
6041 	int len = strlen(val);
6042 	char buf[DISK_NAME_LEN];
6043 	unsigned long devnum;
6044 
6045 	while (len && val[len-1] == '\n')
6046 		len--;
6047 	if (len >= DISK_NAME_LEN)
6048 		return -E2BIG;
6049 	strscpy(buf, val, len+1);
6050 	if (strncmp(buf, "md_", 3) == 0)
6051 		return md_alloc_and_put(0, buf);
6052 	if (strncmp(buf, "md", 2) == 0 &&
6053 	    isdigit(buf[2]) &&
6054 	    kstrtoul(buf+2, 10, &devnum) == 0 &&
6055 	    devnum <= MINORMASK)
6056 		return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL);
6057 
6058 	return -EINVAL;
6059 }
6060 
md_safemode_timeout(struct timer_list * t)6061 static void md_safemode_timeout(struct timer_list *t)
6062 {
6063 	struct mddev *mddev = timer_container_of(mddev, t, safemode_timer);
6064 
6065 	mddev->safemode = 1;
6066 	if (mddev->external)
6067 		sysfs_notify_dirent_safe(mddev->sysfs_state);
6068 
6069 	md_wakeup_thread(mddev->thread);
6070 }
6071 
6072 static int start_dirty_degraded;
6073 
md_run(struct mddev * mddev)6074 int md_run(struct mddev *mddev)
6075 {
6076 	int err;
6077 	struct md_rdev *rdev;
6078 	struct md_personality *pers;
6079 	bool nowait = true;
6080 
6081 	if (list_empty(&mddev->disks))
6082 		/* cannot run an array with no devices.. */
6083 		return -EINVAL;
6084 
6085 	if (mddev->pers)
6086 		return -EBUSY;
6087 	/* Cannot run until previous stop completes properly */
6088 	if (mddev->sysfs_active)
6089 		return -EBUSY;
6090 
6091 	/*
6092 	 * Analyze all RAID superblock(s)
6093 	 */
6094 	if (!mddev->raid_disks) {
6095 		if (!mddev->persistent)
6096 			return -EINVAL;
6097 		err = analyze_sbs(mddev);
6098 		if (err)
6099 			return -EINVAL;
6100 	}
6101 
6102 	if (mddev->level != LEVEL_NONE)
6103 		request_module("md-level-%d", mddev->level);
6104 	else if (mddev->clevel[0])
6105 		request_module("md-%s", mddev->clevel);
6106 
6107 	/*
6108 	 * Drop all container device buffers, from now on
6109 	 * the only valid external interface is through the md
6110 	 * device.
6111 	 */
6112 	mddev->has_superblocks = false;
6113 	rdev_for_each(rdev, mddev) {
6114 		if (test_bit(Faulty, &rdev->flags))
6115 			continue;
6116 		sync_blockdev(rdev->bdev);
6117 		invalidate_bdev(rdev->bdev);
6118 		if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) {
6119 			mddev->ro = MD_RDONLY;
6120 			if (!mddev_is_dm(mddev))
6121 				set_disk_ro(mddev->gendisk, 1);
6122 		}
6123 
6124 		if (rdev->sb_page)
6125 			mddev->has_superblocks = true;
6126 
6127 		/* perform some consistency tests on the device.
6128 		 * We don't want the data to overlap the metadata,
6129 		 * Internal Bitmap issues have been handled elsewhere.
6130 		 */
6131 		if (rdev->meta_bdev) {
6132 			/* Nothing to check */;
6133 		} else if (rdev->data_offset < rdev->sb_start) {
6134 			if (mddev->dev_sectors &&
6135 			    rdev->data_offset + mddev->dev_sectors
6136 			    > rdev->sb_start) {
6137 				pr_warn("md: %s: data overlaps metadata\n",
6138 					mdname(mddev));
6139 				return -EINVAL;
6140 			}
6141 		} else {
6142 			if (rdev->sb_start + rdev->sb_size/512
6143 			    > rdev->data_offset) {
6144 				pr_warn("md: %s: metadata overlaps data\n",
6145 					mdname(mddev));
6146 				return -EINVAL;
6147 			}
6148 		}
6149 		sysfs_notify_dirent_safe(rdev->sysfs_state);
6150 		nowait = nowait && bdev_nowait(rdev->bdev);
6151 	}
6152 
6153 	if (!bioset_initialized(&mddev->bio_set)) {
6154 		err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
6155 		if (err)
6156 			return err;
6157 	}
6158 	if (!bioset_initialized(&mddev->sync_set)) {
6159 		err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
6160 		if (err)
6161 			goto exit_bio_set;
6162 	}
6163 
6164 	if (!bioset_initialized(&mddev->io_clone_set)) {
6165 		err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE,
6166 				  offsetof(struct md_io_clone, bio_clone), 0);
6167 		if (err)
6168 			goto exit_sync_set;
6169 	}
6170 
6171 	pers = get_pers(mddev->level, mddev->clevel);
6172 	if (!pers) {
6173 		err = -EINVAL;
6174 		goto abort;
6175 	}
6176 	if (mddev->level != pers->head.id) {
6177 		mddev->level = pers->head.id;
6178 		mddev->new_level = pers->head.id;
6179 	}
6180 	strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel));
6181 
6182 	if (mddev->reshape_position != MaxSector &&
6183 	    pers->start_reshape == NULL) {
6184 		/* This personality cannot handle reshaping... */
6185 		put_pers(pers);
6186 		err = -EINVAL;
6187 		goto abort;
6188 	}
6189 
6190 	if (pers->sync_request) {
6191 		/* Warn if this is a potentially silly
6192 		 * configuration.
6193 		 */
6194 		struct md_rdev *rdev2;
6195 		int warned = 0;
6196 
6197 		rdev_for_each(rdev, mddev)
6198 			rdev_for_each(rdev2, mddev) {
6199 				if (rdev < rdev2 &&
6200 				    rdev->bdev->bd_disk ==
6201 				    rdev2->bdev->bd_disk) {
6202 					pr_warn("%s: WARNING: %pg appears to be on the same physical disk as %pg.\n",
6203 						mdname(mddev),
6204 						rdev->bdev,
6205 						rdev2->bdev);
6206 					warned = 1;
6207 				}
6208 			}
6209 
6210 		if (warned)
6211 			pr_warn("True protection against single-disk failure might be compromised.\n");
6212 	}
6213 
6214 	/* dm-raid expect sync_thread to be frozen until resume */
6215 	if (mddev->gendisk)
6216 		mddev->recovery = 0;
6217 
6218 	/* may be over-ridden by personality */
6219 	mddev->resync_max_sectors = mddev->dev_sectors;
6220 
6221 	mddev->ok_start_degraded = start_dirty_degraded;
6222 
6223 	if (start_readonly && md_is_rdwr(mddev))
6224 		mddev->ro = MD_AUTO_READ; /* read-only, but switch on first write */
6225 
6226 	err = pers->run(mddev);
6227 	if (err)
6228 		pr_warn("md: pers->run() failed ...\n");
6229 	else if (pers->size(mddev, 0, 0) < mddev->array_sectors) {
6230 		WARN_ONCE(!mddev->external_size,
6231 			  "%s: default size too small, but 'external_size' not in effect?\n",
6232 			  __func__);
6233 		pr_warn("md: invalid array_size %llu > default size %llu\n",
6234 			(unsigned long long)mddev->array_sectors / 2,
6235 			(unsigned long long)pers->size(mddev, 0, 0) / 2);
6236 		err = -EINVAL;
6237 	}
6238 	if (err == 0 && pers->sync_request &&
6239 	    (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
6240 		err = mddev->bitmap_ops->create(mddev);
6241 		if (err)
6242 			pr_warn("%s: failed to create bitmap (%d)\n",
6243 				mdname(mddev), err);
6244 	}
6245 	if (err)
6246 		goto bitmap_abort;
6247 
6248 	if (mddev->bitmap_info.max_write_behind > 0) {
6249 		bool create_pool = false;
6250 
6251 		rdev_for_each(rdev, mddev) {
6252 			if (test_bit(WriteMostly, &rdev->flags) &&
6253 			    rdev_init_serial(rdev))
6254 				create_pool = true;
6255 		}
6256 		if (create_pool && mddev->serial_info_pool == NULL) {
6257 			mddev->serial_info_pool =
6258 				mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
6259 						    sizeof(struct serial_info));
6260 			if (!mddev->serial_info_pool) {
6261 				err = -ENOMEM;
6262 				goto bitmap_abort;
6263 			}
6264 		}
6265 	}
6266 
6267 	if (pers->sync_request) {
6268 		if (mddev->kobj.sd &&
6269 		    sysfs_create_group(&mddev->kobj, &md_redundancy_group))
6270 			pr_warn("md: cannot register extra attributes for %s\n",
6271 				mdname(mddev));
6272 		mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
6273 		mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
6274 		mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
6275 	} else if (mddev->ro == MD_AUTO_READ)
6276 		mddev->ro = MD_RDWR;
6277 
6278 	atomic_set(&mddev->max_corr_read_errors,
6279 		   MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
6280 	mddev->safemode = 0;
6281 	if (mddev_is_clustered(mddev))
6282 		mddev->safemode_delay = 0;
6283 	else
6284 		mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
6285 	mddev->in_sync = 1;
6286 	smp_wmb();
6287 	spin_lock(&mddev->lock);
6288 	mddev->pers = pers;
6289 	spin_unlock(&mddev->lock);
6290 	rdev_for_each(rdev, mddev)
6291 		if (rdev->raid_disk >= 0)
6292 			sysfs_link_rdev(mddev, rdev); /* failure here is OK */
6293 
6294 	if (mddev->degraded && md_is_rdwr(mddev))
6295 		/* This ensures that recovering status is reported immediately
6296 		 * via sysfs - until a lack of spares is confirmed.
6297 		 */
6298 		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6299 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6300 
6301 	if (mddev->sb_flags)
6302 		md_update_sb(mddev, 0);
6303 
6304 	md_new_event();
6305 	return 0;
6306 
6307 bitmap_abort:
6308 	mddev_detach(mddev);
6309 	if (mddev->private)
6310 		pers->free(mddev, mddev->private);
6311 	mddev->private = NULL;
6312 	put_pers(pers);
6313 	mddev->bitmap_ops->destroy(mddev);
6314 abort:
6315 	bioset_exit(&mddev->io_clone_set);
6316 exit_sync_set:
6317 	bioset_exit(&mddev->sync_set);
6318 exit_bio_set:
6319 	bioset_exit(&mddev->bio_set);
6320 	return err;
6321 }
6322 EXPORT_SYMBOL_GPL(md_run);
6323 
do_md_run(struct mddev * mddev)6324 int do_md_run(struct mddev *mddev)
6325 {
6326 	int err;
6327 
6328 	set_bit(MD_NOT_READY, &mddev->flags);
6329 	err = md_run(mddev);
6330 	if (err)
6331 		goto out;
6332 
6333 	err = mddev->bitmap_ops->load(mddev);
6334 	if (err) {
6335 		mddev->bitmap_ops->destroy(mddev);
6336 		goto out;
6337 	}
6338 
6339 	if (mddev_is_clustered(mddev))
6340 		md_allow_write(mddev);
6341 
6342 	/* run start up tasks that require md_thread */
6343 	md_start(mddev);
6344 
6345 	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
6346 
6347 	set_capacity_and_notify(mddev->gendisk, mddev->array_sectors);
6348 	clear_bit(MD_NOT_READY, &mddev->flags);
6349 	mddev->changed = 1;
6350 	kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
6351 	sysfs_notify_dirent_safe(mddev->sysfs_state);
6352 	sysfs_notify_dirent_safe(mddev->sysfs_action);
6353 	sysfs_notify_dirent_safe(mddev->sysfs_degraded);
6354 out:
6355 	clear_bit(MD_NOT_READY, &mddev->flags);
6356 	return err;
6357 }
6358 
md_start(struct mddev * mddev)6359 int md_start(struct mddev *mddev)
6360 {
6361 	int ret = 0;
6362 
6363 	if (mddev->pers->start) {
6364 		set_bit(MD_RECOVERY_WAIT, &mddev->recovery);
6365 		ret = mddev->pers->start(mddev);
6366 		clear_bit(MD_RECOVERY_WAIT, &mddev->recovery);
6367 		md_wakeup_thread(mddev->sync_thread);
6368 	}
6369 	return ret;
6370 }
6371 EXPORT_SYMBOL_GPL(md_start);
6372 
restart_array(struct mddev * mddev)6373 static int restart_array(struct mddev *mddev)
6374 {
6375 	struct gendisk *disk = mddev->gendisk;
6376 	struct md_rdev *rdev;
6377 	bool has_journal = false;
6378 	bool has_readonly = false;
6379 
6380 	/* Complain if it has no devices */
6381 	if (list_empty(&mddev->disks))
6382 		return -ENXIO;
6383 	if (!mddev->pers)
6384 		return -EINVAL;
6385 	if (md_is_rdwr(mddev))
6386 		return -EBUSY;
6387 
6388 	rcu_read_lock();
6389 	rdev_for_each_rcu(rdev, mddev) {
6390 		if (test_bit(Journal, &rdev->flags) &&
6391 		    !test_bit(Faulty, &rdev->flags))
6392 			has_journal = true;
6393 		if (rdev_read_only(rdev))
6394 			has_readonly = true;
6395 	}
6396 	rcu_read_unlock();
6397 	if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal)
6398 		/* Don't restart rw with journal missing/faulty */
6399 			return -EINVAL;
6400 	if (has_readonly)
6401 		return -EROFS;
6402 
6403 	mddev->safemode = 0;
6404 	mddev->ro = MD_RDWR;
6405 	set_disk_ro(disk, 0);
6406 	pr_debug("md: %s switched to read-write mode.\n", mdname(mddev));
6407 	/* Kick recovery or resync if necessary */
6408 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6409 	md_wakeup_thread(mddev->sync_thread);
6410 	sysfs_notify_dirent_safe(mddev->sysfs_state);
6411 	return 0;
6412 }
6413 
md_clean(struct mddev * mddev)6414 static void md_clean(struct mddev *mddev)
6415 {
6416 	mddev->array_sectors = 0;
6417 	mddev->external_size = 0;
6418 	mddev->dev_sectors = 0;
6419 	mddev->raid_disks = 0;
6420 	mddev->recovery_cp = 0;
6421 	mddev->resync_min = 0;
6422 	mddev->resync_max = MaxSector;
6423 	mddev->reshape_position = MaxSector;
6424 	/* we still need mddev->external in export_rdev, do not clear it yet */
6425 	mddev->persistent = 0;
6426 	mddev->level = LEVEL_NONE;
6427 	mddev->clevel[0] = 0;
6428 	/* if UNTIL_STOP is set, it's cleared here */
6429 	mddev->hold_active = 0;
6430 	/* Don't clear MD_CLOSING, or mddev can be opened again. */
6431 	mddev->flags &= BIT_ULL_MASK(MD_CLOSING);
6432 	mddev->sb_flags = 0;
6433 	mddev->ro = MD_RDWR;
6434 	mddev->metadata_type[0] = 0;
6435 	mddev->chunk_sectors = 0;
6436 	mddev->ctime = mddev->utime = 0;
6437 	mddev->layout = 0;
6438 	mddev->max_disks = 0;
6439 	mddev->events = 0;
6440 	mddev->can_decrease_events = 0;
6441 	mddev->delta_disks = 0;
6442 	mddev->reshape_backwards = 0;
6443 	mddev->new_level = LEVEL_NONE;
6444 	mddev->new_layout = 0;
6445 	mddev->new_chunk_sectors = 0;
6446 	mddev->curr_resync = MD_RESYNC_NONE;
6447 	atomic64_set(&mddev->resync_mismatches, 0);
6448 	mddev->suspend_lo = mddev->suspend_hi = 0;
6449 	mddev->sync_speed_min = mddev->sync_speed_max = 0;
6450 	mddev->recovery = 0;
6451 	mddev->in_sync = 0;
6452 	mddev->changed = 0;
6453 	mddev->degraded = 0;
6454 	mddev->safemode = 0;
6455 	mddev->private = NULL;
6456 	mddev->cluster_info = NULL;
6457 	mddev->bitmap_info.offset = 0;
6458 	mddev->bitmap_info.default_offset = 0;
6459 	mddev->bitmap_info.default_space = 0;
6460 	mddev->bitmap_info.chunksize = 0;
6461 	mddev->bitmap_info.daemon_sleep = 0;
6462 	mddev->bitmap_info.max_write_behind = 0;
6463 	mddev->bitmap_info.nodes = 0;
6464 }
6465 
__md_stop_writes(struct mddev * mddev)6466 static void __md_stop_writes(struct mddev *mddev)
6467 {
6468 	timer_delete_sync(&mddev->safemode_timer);
6469 
6470 	if (mddev->pers && mddev->pers->quiesce) {
6471 		mddev->pers->quiesce(mddev, 1);
6472 		mddev->pers->quiesce(mddev, 0);
6473 	}
6474 
6475 	mddev->bitmap_ops->flush(mddev);
6476 
6477 	if (md_is_rdwr(mddev) &&
6478 	    ((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
6479 	     mddev->sb_flags)) {
6480 		/* mark array as shutdown cleanly */
6481 		if (!mddev_is_clustered(mddev))
6482 			mddev->in_sync = 1;
6483 		md_update_sb(mddev, 1);
6484 	}
6485 	/* disable policy to guarantee rdevs free resources for serialization */
6486 	mddev->serialize_policy = 0;
6487 	mddev_destroy_serial_pool(mddev, NULL);
6488 }
6489 
md_stop_writes(struct mddev * mddev)6490 void md_stop_writes(struct mddev *mddev)
6491 {
6492 	mddev_lock_nointr(mddev);
6493 	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6494 	stop_sync_thread(mddev, true);
6495 	__md_stop_writes(mddev);
6496 	mddev_unlock(mddev);
6497 }
6498 EXPORT_SYMBOL_GPL(md_stop_writes);
6499 
mddev_detach(struct mddev * mddev)6500 static void mddev_detach(struct mddev *mddev)
6501 {
6502 	mddev->bitmap_ops->wait_behind_writes(mddev);
6503 	if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) {
6504 		mddev->pers->quiesce(mddev, 1);
6505 		mddev->pers->quiesce(mddev, 0);
6506 	}
6507 	md_unregister_thread(mddev, &mddev->thread);
6508 
6509 	/* the unplug fn references 'conf' */
6510 	if (!mddev_is_dm(mddev))
6511 		blk_sync_queue(mddev->gendisk->queue);
6512 }
6513 
__md_stop(struct mddev * mddev)6514 static void __md_stop(struct mddev *mddev)
6515 {
6516 	struct md_personality *pers = mddev->pers;
6517 
6518 	mddev->bitmap_ops->destroy(mddev);
6519 	mddev_detach(mddev);
6520 	spin_lock(&mddev->lock);
6521 	mddev->pers = NULL;
6522 	spin_unlock(&mddev->lock);
6523 	if (mddev->private)
6524 		pers->free(mddev, mddev->private);
6525 	mddev->private = NULL;
6526 	put_pers(pers);
6527 	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6528 
6529 	bioset_exit(&mddev->bio_set);
6530 	bioset_exit(&mddev->sync_set);
6531 	bioset_exit(&mddev->io_clone_set);
6532 }
6533 
md_stop(struct mddev * mddev)6534 void md_stop(struct mddev *mddev)
6535 {
6536 	lockdep_assert_held(&mddev->reconfig_mutex);
6537 
6538 	/* stop the array and free an attached data structures.
6539 	 * This is called from dm-raid
6540 	 */
6541 	__md_stop_writes(mddev);
6542 	__md_stop(mddev);
6543 }
6544 
6545 EXPORT_SYMBOL_GPL(md_stop);
6546 
6547 /* ensure 'mddev->pers' exist before calling md_set_readonly() */
md_set_readonly(struct mddev * mddev)6548 static int md_set_readonly(struct mddev *mddev)
6549 {
6550 	int err = 0;
6551 	int did_freeze = 0;
6552 
6553 	if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
6554 		return -EBUSY;
6555 
6556 	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
6557 		did_freeze = 1;
6558 		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6559 	}
6560 
6561 	stop_sync_thread(mddev, false);
6562 	wait_event(mddev->sb_wait,
6563 		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
6564 	mddev_lock_nointr(mddev);
6565 
6566 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
6567 		pr_warn("md: %s still in use.\n",mdname(mddev));
6568 		err = -EBUSY;
6569 		goto out;
6570 	}
6571 
6572 	__md_stop_writes(mddev);
6573 
6574 	if (mddev->ro == MD_RDONLY) {
6575 		err  = -ENXIO;
6576 		goto out;
6577 	}
6578 
6579 	mddev->ro = MD_RDONLY;
6580 	set_disk_ro(mddev->gendisk, 1);
6581 
6582 out:
6583 	if (!err || did_freeze) {
6584 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6585 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6586 		sysfs_notify_dirent_safe(mddev->sysfs_state);
6587 	}
6588 
6589 	return err;
6590 }
6591 
6592 /* mode:
6593  *   0 - completely stop and dis-assemble array
6594  *   2 - stop but do not disassemble array
6595  */
do_md_stop(struct mddev * mddev,int mode)6596 static int do_md_stop(struct mddev *mddev, int mode)
6597 {
6598 	struct gendisk *disk = mddev->gendisk;
6599 	struct md_rdev *rdev;
6600 	int did_freeze = 0;
6601 
6602 	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
6603 		did_freeze = 1;
6604 		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6605 	}
6606 
6607 	stop_sync_thread(mddev, true);
6608 
6609 	if (mddev->sysfs_active ||
6610 	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
6611 		pr_warn("md: %s still in use.\n",mdname(mddev));
6612 		if (did_freeze) {
6613 			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6614 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6615 		}
6616 		return -EBUSY;
6617 	}
6618 	if (mddev->pers) {
6619 		if (!md_is_rdwr(mddev))
6620 			set_disk_ro(disk, 0);
6621 
6622 		__md_stop_writes(mddev);
6623 		__md_stop(mddev);
6624 
6625 		/* tell userspace to handle 'inactive' */
6626 		sysfs_notify_dirent_safe(mddev->sysfs_state);
6627 
6628 		rdev_for_each(rdev, mddev)
6629 			if (rdev->raid_disk >= 0)
6630 				sysfs_unlink_rdev(mddev, rdev);
6631 
6632 		set_capacity_and_notify(disk, 0);
6633 		mddev->changed = 1;
6634 
6635 		if (!md_is_rdwr(mddev))
6636 			mddev->ro = MD_RDWR;
6637 	}
6638 	/*
6639 	 * Free resources if final stop
6640 	 */
6641 	if (mode == 0) {
6642 		pr_info("md: %s stopped.\n", mdname(mddev));
6643 
6644 		if (mddev->bitmap_info.file) {
6645 			struct file *f = mddev->bitmap_info.file;
6646 			spin_lock(&mddev->lock);
6647 			mddev->bitmap_info.file = NULL;
6648 			spin_unlock(&mddev->lock);
6649 			fput(f);
6650 		}
6651 		mddev->bitmap_info.offset = 0;
6652 
6653 		export_array(mddev);
6654 		md_clean(mddev);
6655 		set_bit(MD_DELETED, &mddev->flags);
6656 	}
6657 	md_new_event();
6658 	sysfs_notify_dirent_safe(mddev->sysfs_state);
6659 	return 0;
6660 }
6661 
6662 #ifndef MODULE
autorun_array(struct mddev * mddev)6663 static void autorun_array(struct mddev *mddev)
6664 {
6665 	struct md_rdev *rdev;
6666 	int err;
6667 
6668 	if (list_empty(&mddev->disks))
6669 		return;
6670 
6671 	pr_info("md: running: ");
6672 
6673 	rdev_for_each(rdev, mddev) {
6674 		pr_cont("<%pg>", rdev->bdev);
6675 	}
6676 	pr_cont("\n");
6677 
6678 	err = do_md_run(mddev);
6679 	if (err) {
6680 		pr_warn("md: do_md_run() returned %d\n", err);
6681 		do_md_stop(mddev, 0);
6682 	}
6683 }
6684 
6685 /*
6686  * lets try to run arrays based on all disks that have arrived
6687  * until now. (those are in pending_raid_disks)
6688  *
6689  * the method: pick the first pending disk, collect all disks with
6690  * the same UUID, remove all from the pending list and put them into
6691  * the 'same_array' list. Then order this list based on superblock
6692  * update time (freshest comes first), kick out 'old' disks and
6693  * compare superblocks. If everything's fine then run it.
6694  *
6695  * If "unit" is allocated, then bump its reference count
6696  */
autorun_devices(int part)6697 static void autorun_devices(int part)
6698 {
6699 	struct md_rdev *rdev0, *rdev, *tmp;
6700 	struct mddev *mddev;
6701 
6702 	pr_info("md: autorun ...\n");
6703 	while (!list_empty(&pending_raid_disks)) {
6704 		int unit;
6705 		dev_t dev;
6706 		LIST_HEAD(candidates);
6707 		rdev0 = list_entry(pending_raid_disks.next,
6708 					 struct md_rdev, same_set);
6709 
6710 		pr_debug("md: considering %pg ...\n", rdev0->bdev);
6711 		INIT_LIST_HEAD(&candidates);
6712 		rdev_for_each_list(rdev, tmp, &pending_raid_disks)
6713 			if (super_90_load(rdev, rdev0, 0) >= 0) {
6714 				pr_debug("md:  adding %pg ...\n",
6715 					 rdev->bdev);
6716 				list_move(&rdev->same_set, &candidates);
6717 			}
6718 		/*
6719 		 * now we have a set of devices, with all of them having
6720 		 * mostly sane superblocks. It's time to allocate the
6721 		 * mddev.
6722 		 */
6723 		if (part) {
6724 			dev = MKDEV(mdp_major,
6725 				    rdev0->preferred_minor << MdpMinorShift);
6726 			unit = MINOR(dev) >> MdpMinorShift;
6727 		} else {
6728 			dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
6729 			unit = MINOR(dev);
6730 		}
6731 		if (rdev0->preferred_minor != unit) {
6732 			pr_warn("md: unit number in %pg is bad: %d\n",
6733 				rdev0->bdev, rdev0->preferred_minor);
6734 			break;
6735 		}
6736 
6737 		mddev = md_alloc(dev, NULL);
6738 		if (IS_ERR(mddev))
6739 			break;
6740 
6741 		if (mddev_suspend_and_lock(mddev))
6742 			pr_warn("md: %s locked, cannot run\n", mdname(mddev));
6743 		else if (mddev->raid_disks || mddev->major_version
6744 			 || !list_empty(&mddev->disks)) {
6745 			pr_warn("md: %s already running, cannot run %pg\n",
6746 				mdname(mddev), rdev0->bdev);
6747 			mddev_unlock_and_resume(mddev);
6748 		} else {
6749 			pr_debug("md: created %s\n", mdname(mddev));
6750 			mddev->persistent = 1;
6751 			rdev_for_each_list(rdev, tmp, &candidates) {
6752 				list_del_init(&rdev->same_set);
6753 				if (bind_rdev_to_array(rdev, mddev))
6754 					export_rdev(rdev, mddev);
6755 			}
6756 			autorun_array(mddev);
6757 			mddev_unlock_and_resume(mddev);
6758 		}
6759 		/* on success, candidates will be empty, on error
6760 		 * it won't...
6761 		 */
6762 		rdev_for_each_list(rdev, tmp, &candidates) {
6763 			list_del_init(&rdev->same_set);
6764 			export_rdev(rdev, mddev);
6765 		}
6766 		mddev_put(mddev);
6767 	}
6768 	pr_info("md: ... autorun DONE.\n");
6769 }
6770 #endif /* !MODULE */
6771 
get_version(void __user * arg)6772 static int get_version(void __user *arg)
6773 {
6774 	mdu_version_t ver;
6775 
6776 	ver.major = MD_MAJOR_VERSION;
6777 	ver.minor = MD_MINOR_VERSION;
6778 	ver.patchlevel = MD_PATCHLEVEL_VERSION;
6779 
6780 	if (copy_to_user(arg, &ver, sizeof(ver)))
6781 		return -EFAULT;
6782 
6783 	return 0;
6784 }
6785 
get_array_info(struct mddev * mddev,void __user * arg)6786 static int get_array_info(struct mddev *mddev, void __user *arg)
6787 {
6788 	mdu_array_info_t info;
6789 	int nr,working,insync,failed,spare;
6790 	struct md_rdev *rdev;
6791 
6792 	nr = working = insync = failed = spare = 0;
6793 	rcu_read_lock();
6794 	rdev_for_each_rcu(rdev, mddev) {
6795 		nr++;
6796 		if (test_bit(Faulty, &rdev->flags))
6797 			failed++;
6798 		else {
6799 			working++;
6800 			if (test_bit(In_sync, &rdev->flags))
6801 				insync++;
6802 			else if (test_bit(Journal, &rdev->flags))
6803 				/* TODO: add journal count to md_u.h */
6804 				;
6805 			else
6806 				spare++;
6807 		}
6808 	}
6809 	rcu_read_unlock();
6810 
6811 	info.major_version = mddev->major_version;
6812 	info.minor_version = mddev->minor_version;
6813 	info.patch_version = MD_PATCHLEVEL_VERSION;
6814 	info.ctime         = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
6815 	info.level         = mddev->level;
6816 	info.size          = mddev->dev_sectors / 2;
6817 	if (info.size != mddev->dev_sectors / 2) /* overflow */
6818 		info.size = -1;
6819 	info.nr_disks      = nr;
6820 	info.raid_disks    = mddev->raid_disks;
6821 	info.md_minor      = mddev->md_minor;
6822 	info.not_persistent= !mddev->persistent;
6823 
6824 	info.utime         = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
6825 	info.state         = 0;
6826 	if (mddev->in_sync)
6827 		info.state = (1<<MD_SB_CLEAN);
6828 	if (mddev->bitmap && mddev->bitmap_info.offset)
6829 		info.state |= (1<<MD_SB_BITMAP_PRESENT);
6830 	if (mddev_is_clustered(mddev))
6831 		info.state |= (1<<MD_SB_CLUSTERED);
6832 	info.active_disks  = insync;
6833 	info.working_disks = working;
6834 	info.failed_disks  = failed;
6835 	info.spare_disks   = spare;
6836 
6837 	info.layout        = mddev->layout;
6838 	info.chunk_size    = mddev->chunk_sectors << 9;
6839 
6840 	if (copy_to_user(arg, &info, sizeof(info)))
6841 		return -EFAULT;
6842 
6843 	return 0;
6844 }
6845 
get_bitmap_file(struct mddev * mddev,void __user * arg)6846 static int get_bitmap_file(struct mddev *mddev, void __user * arg)
6847 {
6848 	mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
6849 	char *ptr;
6850 	int err;
6851 
6852 	file = kzalloc(sizeof(*file), GFP_NOIO);
6853 	if (!file)
6854 		return -ENOMEM;
6855 
6856 	err = 0;
6857 	spin_lock(&mddev->lock);
6858 	/* bitmap enabled */
6859 	if (mddev->bitmap_info.file) {
6860 		ptr = file_path(mddev->bitmap_info.file, file->pathname,
6861 				sizeof(file->pathname));
6862 		if (IS_ERR(ptr))
6863 			err = PTR_ERR(ptr);
6864 		else
6865 			memmove(file->pathname, ptr,
6866 				sizeof(file->pathname)-(ptr-file->pathname));
6867 	}
6868 	spin_unlock(&mddev->lock);
6869 
6870 	if (err == 0 &&
6871 	    copy_to_user(arg, file, sizeof(*file)))
6872 		err = -EFAULT;
6873 
6874 	kfree(file);
6875 	return err;
6876 }
6877 
get_disk_info(struct mddev * mddev,void __user * arg)6878 static int get_disk_info(struct mddev *mddev, void __user * arg)
6879 {
6880 	mdu_disk_info_t info;
6881 	struct md_rdev *rdev;
6882 
6883 	if (copy_from_user(&info, arg, sizeof(info)))
6884 		return -EFAULT;
6885 
6886 	rcu_read_lock();
6887 	rdev = md_find_rdev_nr_rcu(mddev, info.number);
6888 	if (rdev) {
6889 		info.major = MAJOR(rdev->bdev->bd_dev);
6890 		info.minor = MINOR(rdev->bdev->bd_dev);
6891 		info.raid_disk = rdev->raid_disk;
6892 		info.state = 0;
6893 		if (test_bit(Faulty, &rdev->flags))
6894 			info.state |= (1<<MD_DISK_FAULTY);
6895 		else if (test_bit(In_sync, &rdev->flags)) {
6896 			info.state |= (1<<MD_DISK_ACTIVE);
6897 			info.state |= (1<<MD_DISK_SYNC);
6898 		}
6899 		if (test_bit(Journal, &rdev->flags))
6900 			info.state |= (1<<MD_DISK_JOURNAL);
6901 		if (test_bit(WriteMostly, &rdev->flags))
6902 			info.state |= (1<<MD_DISK_WRITEMOSTLY);
6903 		if (test_bit(FailFast, &rdev->flags))
6904 			info.state |= (1<<MD_DISK_FAILFAST);
6905 	} else {
6906 		info.major = info.minor = 0;
6907 		info.raid_disk = -1;
6908 		info.state = (1<<MD_DISK_REMOVED);
6909 	}
6910 	rcu_read_unlock();
6911 
6912 	if (copy_to_user(arg, &info, sizeof(info)))
6913 		return -EFAULT;
6914 
6915 	return 0;
6916 }
6917 
md_add_new_disk(struct mddev * mddev,struct mdu_disk_info_s * info)6918 int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
6919 {
6920 	struct md_rdev *rdev;
6921 	dev_t dev = MKDEV(info->major,info->minor);
6922 
6923 	if (mddev_is_clustered(mddev) &&
6924 		!(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) {
6925 		pr_warn("%s: Cannot add to clustered mddev.\n",
6926 			mdname(mddev));
6927 		return -EINVAL;
6928 	}
6929 
6930 	if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
6931 		return -EOVERFLOW;
6932 
6933 	if (!mddev->raid_disks) {
6934 		int err;
6935 		/* expecting a device which has a superblock */
6936 		rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
6937 		if (IS_ERR(rdev)) {
6938 			pr_warn("md: md_import_device returned %ld\n",
6939 				PTR_ERR(rdev));
6940 			return PTR_ERR(rdev);
6941 		}
6942 		if (!list_empty(&mddev->disks)) {
6943 			struct md_rdev *rdev0
6944 				= list_entry(mddev->disks.next,
6945 					     struct md_rdev, same_set);
6946 			err = super_types[mddev->major_version]
6947 				.load_super(rdev, rdev0, mddev->minor_version);
6948 			if (err < 0) {
6949 				pr_warn("md: %pg has different UUID to %pg\n",
6950 					rdev->bdev,
6951 					rdev0->bdev);
6952 				export_rdev(rdev, mddev);
6953 				return -EINVAL;
6954 			}
6955 		}
6956 		err = bind_rdev_to_array(rdev, mddev);
6957 		if (err)
6958 			export_rdev(rdev, mddev);
6959 		return err;
6960 	}
6961 
6962 	/*
6963 	 * md_add_new_disk can be used once the array is assembled
6964 	 * to add "hot spares".  They must already have a superblock
6965 	 * written
6966 	 */
6967 	if (mddev->pers) {
6968 		int err;
6969 		if (!mddev->pers->hot_add_disk) {
6970 			pr_warn("%s: personality does not support diskops!\n",
6971 				mdname(mddev));
6972 			return -EINVAL;
6973 		}
6974 		if (mddev->persistent)
6975 			rdev = md_import_device(dev, mddev->major_version,
6976 						mddev->minor_version);
6977 		else
6978 			rdev = md_import_device(dev, -1, -1);
6979 		if (IS_ERR(rdev)) {
6980 			pr_warn("md: md_import_device returned %ld\n",
6981 				PTR_ERR(rdev));
6982 			return PTR_ERR(rdev);
6983 		}
6984 		/* set saved_raid_disk if appropriate */
6985 		if (!mddev->persistent) {
6986 			if (info->state & (1<<MD_DISK_SYNC)  &&
6987 			    info->raid_disk < mddev->raid_disks) {
6988 				rdev->raid_disk = info->raid_disk;
6989 				clear_bit(Bitmap_sync, &rdev->flags);
6990 			} else
6991 				rdev->raid_disk = -1;
6992 			rdev->saved_raid_disk = rdev->raid_disk;
6993 		} else
6994 			super_types[mddev->major_version].
6995 				validate_super(mddev, NULL/*freshest*/, rdev);
6996 		if ((info->state & (1<<MD_DISK_SYNC)) &&
6997 		     rdev->raid_disk != info->raid_disk) {
6998 			/* This was a hot-add request, but events doesn't
6999 			 * match, so reject it.
7000 			 */
7001 			export_rdev(rdev, mddev);
7002 			return -EINVAL;
7003 		}
7004 
7005 		clear_bit(In_sync, &rdev->flags); /* just to be sure */
7006 		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
7007 			set_bit(WriteMostly, &rdev->flags);
7008 		else
7009 			clear_bit(WriteMostly, &rdev->flags);
7010 		if (info->state & (1<<MD_DISK_FAILFAST))
7011 			set_bit(FailFast, &rdev->flags);
7012 		else
7013 			clear_bit(FailFast, &rdev->flags);
7014 
7015 		if (info->state & (1<<MD_DISK_JOURNAL)) {
7016 			struct md_rdev *rdev2;
7017 			bool has_journal = false;
7018 
7019 			/* make sure no existing journal disk */
7020 			rdev_for_each(rdev2, mddev) {
7021 				if (test_bit(Journal, &rdev2->flags)) {
7022 					has_journal = true;
7023 					break;
7024 				}
7025 			}
7026 			if (has_journal || mddev->bitmap) {
7027 				export_rdev(rdev, mddev);
7028 				return -EBUSY;
7029 			}
7030 			set_bit(Journal, &rdev->flags);
7031 		}
7032 		/*
7033 		 * check whether the device shows up in other nodes
7034 		 */
7035 		if (mddev_is_clustered(mddev)) {
7036 			if (info->state & (1 << MD_DISK_CANDIDATE))
7037 				set_bit(Candidate, &rdev->flags);
7038 			else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
7039 				/* --add initiated by this node */
7040 				err = mddev->cluster_ops->add_new_disk(mddev, rdev);
7041 				if (err) {
7042 					export_rdev(rdev, mddev);
7043 					return err;
7044 				}
7045 			}
7046 		}
7047 
7048 		rdev->raid_disk = -1;
7049 		err = bind_rdev_to_array(rdev, mddev);
7050 
7051 		if (err)
7052 			export_rdev(rdev, mddev);
7053 
7054 		if (mddev_is_clustered(mddev)) {
7055 			if (info->state & (1 << MD_DISK_CANDIDATE)) {
7056 				if (!err) {
7057 					err = mddev->cluster_ops->new_disk_ack(
7058 							mddev, err == 0);
7059 					if (err)
7060 						md_kick_rdev_from_array(rdev);
7061 				}
7062 			} else {
7063 				if (err)
7064 					mddev->cluster_ops->add_new_disk_cancel(mddev);
7065 				else
7066 					err = add_bound_rdev(rdev);
7067 			}
7068 
7069 		} else if (!err)
7070 			err = add_bound_rdev(rdev);
7071 
7072 		return err;
7073 	}
7074 
7075 	/* otherwise, md_add_new_disk is only allowed
7076 	 * for major_version==0 superblocks
7077 	 */
7078 	if (mddev->major_version != 0) {
7079 		pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev));
7080 		return -EINVAL;
7081 	}
7082 
7083 	if (!(info->state & (1<<MD_DISK_FAULTY))) {
7084 		int err;
7085 		rdev = md_import_device(dev, -1, 0);
7086 		if (IS_ERR(rdev)) {
7087 			pr_warn("md: error, md_import_device() returned %ld\n",
7088 				PTR_ERR(rdev));
7089 			return PTR_ERR(rdev);
7090 		}
7091 		rdev->desc_nr = info->number;
7092 		if (info->raid_disk < mddev->raid_disks)
7093 			rdev->raid_disk = info->raid_disk;
7094 		else
7095 			rdev->raid_disk = -1;
7096 
7097 		if (rdev->raid_disk < mddev->raid_disks)
7098 			if (info->state & (1<<MD_DISK_SYNC))
7099 				set_bit(In_sync, &rdev->flags);
7100 
7101 		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
7102 			set_bit(WriteMostly, &rdev->flags);
7103 		if (info->state & (1<<MD_DISK_FAILFAST))
7104 			set_bit(FailFast, &rdev->flags);
7105 
7106 		if (!mddev->persistent) {
7107 			pr_debug("md: nonpersistent superblock ...\n");
7108 			rdev->sb_start = bdev_nr_sectors(rdev->bdev);
7109 		} else
7110 			rdev->sb_start = calc_dev_sboffset(rdev);
7111 		rdev->sectors = rdev->sb_start;
7112 
7113 		err = bind_rdev_to_array(rdev, mddev);
7114 		if (err) {
7115 			export_rdev(rdev, mddev);
7116 			return err;
7117 		}
7118 	}
7119 
7120 	return 0;
7121 }
7122 
hot_remove_disk(struct mddev * mddev,dev_t dev)7123 static int hot_remove_disk(struct mddev *mddev, dev_t dev)
7124 {
7125 	struct md_rdev *rdev;
7126 
7127 	if (!mddev->pers)
7128 		return -ENODEV;
7129 
7130 	rdev = find_rdev(mddev, dev);
7131 	if (!rdev)
7132 		return -ENXIO;
7133 
7134 	if (rdev->raid_disk < 0)
7135 		goto kick_rdev;
7136 
7137 	clear_bit(Blocked, &rdev->flags);
7138 	remove_and_add_spares(mddev, rdev);
7139 
7140 	if (rdev->raid_disk >= 0)
7141 		goto busy;
7142 
7143 kick_rdev:
7144 	if (mddev_is_clustered(mddev) &&
7145 	    mddev->cluster_ops->remove_disk(mddev, rdev))
7146 		goto busy;
7147 
7148 	md_kick_rdev_from_array(rdev);
7149 	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
7150 	if (!mddev->thread)
7151 		md_update_sb(mddev, 1);
7152 	md_new_event();
7153 
7154 	return 0;
7155 busy:
7156 	pr_debug("md: cannot remove active disk %pg from %s ...\n",
7157 		 rdev->bdev, mdname(mddev));
7158 	return -EBUSY;
7159 }
7160 
hot_add_disk(struct mddev * mddev,dev_t dev)7161 static int hot_add_disk(struct mddev *mddev, dev_t dev)
7162 {
7163 	int err;
7164 	struct md_rdev *rdev;
7165 
7166 	if (!mddev->pers)
7167 		return -ENODEV;
7168 
7169 	if (mddev->major_version != 0) {
7170 		pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n",
7171 			mdname(mddev));
7172 		return -EINVAL;
7173 	}
7174 	if (!mddev->pers->hot_add_disk) {
7175 		pr_warn("%s: personality does not support diskops!\n",
7176 			mdname(mddev));
7177 		return -EINVAL;
7178 	}
7179 
7180 	rdev = md_import_device(dev, -1, 0);
7181 	if (IS_ERR(rdev)) {
7182 		pr_warn("md: error, md_import_device() returned %ld\n",
7183 			PTR_ERR(rdev));
7184 		return -EINVAL;
7185 	}
7186 
7187 	if (mddev->persistent)
7188 		rdev->sb_start = calc_dev_sboffset(rdev);
7189 	else
7190 		rdev->sb_start = bdev_nr_sectors(rdev->bdev);
7191 
7192 	rdev->sectors = rdev->sb_start;
7193 
7194 	if (test_bit(Faulty, &rdev->flags)) {
7195 		pr_warn("md: can not hot-add faulty %pg disk to %s!\n",
7196 			rdev->bdev, mdname(mddev));
7197 		err = -EINVAL;
7198 		goto abort_export;
7199 	}
7200 
7201 	clear_bit(In_sync, &rdev->flags);
7202 	rdev->desc_nr = -1;
7203 	rdev->saved_raid_disk = -1;
7204 	err = bind_rdev_to_array(rdev, mddev);
7205 	if (err)
7206 		goto abort_export;
7207 
7208 	/*
7209 	 * The rest should better be atomic, we can have disk failures
7210 	 * noticed in interrupt contexts ...
7211 	 */
7212 
7213 	rdev->raid_disk = -1;
7214 
7215 	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
7216 	if (!mddev->thread)
7217 		md_update_sb(mddev, 1);
7218 	/*
7219 	 * Kick recovery, maybe this spare has to be added to the
7220 	 * array immediately.
7221 	 */
7222 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7223 	md_new_event();
7224 	return 0;
7225 
7226 abort_export:
7227 	export_rdev(rdev, mddev);
7228 	return err;
7229 }
7230 
set_bitmap_file(struct mddev * mddev,int fd)7231 static int set_bitmap_file(struct mddev *mddev, int fd)
7232 {
7233 	int err = 0;
7234 
7235 	if (mddev->pers) {
7236 		if (!mddev->pers->quiesce || !mddev->thread)
7237 			return -EBUSY;
7238 		if (mddev->recovery || mddev->sync_thread)
7239 			return -EBUSY;
7240 		/* we should be able to change the bitmap.. */
7241 	}
7242 
7243 	if (fd >= 0) {
7244 		struct inode *inode;
7245 		struct file *f;
7246 
7247 		if (mddev->bitmap || mddev->bitmap_info.file)
7248 			return -EEXIST; /* cannot add when bitmap is present */
7249 
7250 		if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) {
7251 			pr_warn("%s: bitmap files not supported by this kernel\n",
7252 				mdname(mddev));
7253 			return -EINVAL;
7254 		}
7255 		pr_warn("%s: using deprecated bitmap file support\n",
7256 			mdname(mddev));
7257 
7258 		f = fget(fd);
7259 
7260 		if (f == NULL) {
7261 			pr_warn("%s: error: failed to get bitmap file\n",
7262 				mdname(mddev));
7263 			return -EBADF;
7264 		}
7265 
7266 		inode = f->f_mapping->host;
7267 		if (!S_ISREG(inode->i_mode)) {
7268 			pr_warn("%s: error: bitmap file must be a regular file\n",
7269 				mdname(mddev));
7270 			err = -EBADF;
7271 		} else if (!(f->f_mode & FMODE_WRITE)) {
7272 			pr_warn("%s: error: bitmap file must open for write\n",
7273 				mdname(mddev));
7274 			err = -EBADF;
7275 		} else if (atomic_read(&inode->i_writecount) != 1) {
7276 			pr_warn("%s: error: bitmap file is already in use\n",
7277 				mdname(mddev));
7278 			err = -EBUSY;
7279 		}
7280 		if (err) {
7281 			fput(f);
7282 			return err;
7283 		}
7284 		mddev->bitmap_info.file = f;
7285 		mddev->bitmap_info.offset = 0; /* file overrides offset */
7286 	} else if (mddev->bitmap == NULL)
7287 		return -ENOENT; /* cannot remove what isn't there */
7288 	err = 0;
7289 	if (mddev->pers) {
7290 		if (fd >= 0) {
7291 			err = mddev->bitmap_ops->create(mddev);
7292 			if (!err)
7293 				err = mddev->bitmap_ops->load(mddev);
7294 
7295 			if (err) {
7296 				mddev->bitmap_ops->destroy(mddev);
7297 				fd = -1;
7298 			}
7299 		} else if (fd < 0) {
7300 			mddev->bitmap_ops->destroy(mddev);
7301 		}
7302 	}
7303 
7304 	if (fd < 0) {
7305 		struct file *f = mddev->bitmap_info.file;
7306 		if (f) {
7307 			spin_lock(&mddev->lock);
7308 			mddev->bitmap_info.file = NULL;
7309 			spin_unlock(&mddev->lock);
7310 			fput(f);
7311 		}
7312 	}
7313 
7314 	return err;
7315 }
7316 
7317 /*
7318  * md_set_array_info is used two different ways
7319  * The original usage is when creating a new array.
7320  * In this usage, raid_disks is > 0 and it together with
7321  *  level, size, not_persistent,layout,chunksize determine the
7322  *  shape of the array.
7323  *  This will always create an array with a type-0.90.0 superblock.
7324  * The newer usage is when assembling an array.
7325  *  In this case raid_disks will be 0, and the major_version field is
7326  *  use to determine which style super-blocks are to be found on the devices.
7327  *  The minor and patch _version numbers are also kept incase the
7328  *  super_block handler wishes to interpret them.
7329  */
md_set_array_info(struct mddev * mddev,struct mdu_array_info_s * info)7330 int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info)
7331 {
7332 	if (info->raid_disks == 0) {
7333 		/* just setting version number for superblock loading */
7334 		if (info->major_version < 0 ||
7335 		    info->major_version >= ARRAY_SIZE(super_types) ||
7336 		    super_types[info->major_version].name == NULL) {
7337 			/* maybe try to auto-load a module? */
7338 			pr_warn("md: superblock version %d not known\n",
7339 				info->major_version);
7340 			return -EINVAL;
7341 		}
7342 		mddev->major_version = info->major_version;
7343 		mddev->minor_version = info->minor_version;
7344 		mddev->patch_version = info->patch_version;
7345 		mddev->persistent = !info->not_persistent;
7346 		/* ensure mddev_put doesn't delete this now that there
7347 		 * is some minimal configuration.
7348 		 */
7349 		mddev->ctime         = ktime_get_real_seconds();
7350 		return 0;
7351 	}
7352 	mddev->major_version = MD_MAJOR_VERSION;
7353 	mddev->minor_version = MD_MINOR_VERSION;
7354 	mddev->patch_version = MD_PATCHLEVEL_VERSION;
7355 	mddev->ctime         = ktime_get_real_seconds();
7356 
7357 	mddev->level         = info->level;
7358 	mddev->clevel[0]     = 0;
7359 	mddev->dev_sectors   = 2 * (sector_t)info->size;
7360 	mddev->raid_disks    = info->raid_disks;
7361 	/* don't set md_minor, it is determined by which /dev/md* was
7362 	 * openned
7363 	 */
7364 	if (info->state & (1<<MD_SB_CLEAN))
7365 		mddev->recovery_cp = MaxSector;
7366 	else
7367 		mddev->recovery_cp = 0;
7368 	mddev->persistent    = ! info->not_persistent;
7369 	mddev->external	     = 0;
7370 
7371 	mddev->layout        = info->layout;
7372 	if (mddev->level == 0)
7373 		/* Cannot trust RAID0 layout info here */
7374 		mddev->layout = -1;
7375 	mddev->chunk_sectors = info->chunk_size >> 9;
7376 
7377 	if (mddev->persistent) {
7378 		mddev->max_disks = MD_SB_DISKS;
7379 		mddev->flags = 0;
7380 		mddev->sb_flags = 0;
7381 	}
7382 	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
7383 
7384 	mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
7385 	mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
7386 	mddev->bitmap_info.offset = 0;
7387 
7388 	mddev->reshape_position = MaxSector;
7389 
7390 	/*
7391 	 * Generate a 128 bit UUID
7392 	 */
7393 	get_random_bytes(mddev->uuid, 16);
7394 
7395 	mddev->new_level = mddev->level;
7396 	mddev->new_chunk_sectors = mddev->chunk_sectors;
7397 	mddev->new_layout = mddev->layout;
7398 	mddev->delta_disks = 0;
7399 	mddev->reshape_backwards = 0;
7400 
7401 	return 0;
7402 }
7403 
md_set_array_sectors(struct mddev * mddev,sector_t array_sectors)7404 void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
7405 {
7406 	lockdep_assert_held(&mddev->reconfig_mutex);
7407 
7408 	if (mddev->external_size)
7409 		return;
7410 
7411 	mddev->array_sectors = array_sectors;
7412 }
7413 EXPORT_SYMBOL(md_set_array_sectors);
7414 
update_size(struct mddev * mddev,sector_t num_sectors)7415 static int update_size(struct mddev *mddev, sector_t num_sectors)
7416 {
7417 	struct md_rdev *rdev;
7418 	int rv;
7419 	int fit = (num_sectors == 0);
7420 	sector_t old_dev_sectors = mddev->dev_sectors;
7421 
7422 	if (mddev->pers->resize == NULL)
7423 		return -EINVAL;
7424 	/* The "num_sectors" is the number of sectors of each device that
7425 	 * is used.  This can only make sense for arrays with redundancy.
7426 	 * linear and raid0 always use whatever space is available. We can only
7427 	 * consider changing this number if no resync or reconstruction is
7428 	 * happening, and if the new size is acceptable. It must fit before the
7429 	 * sb_start or, if that is <data_offset, it must fit before the size
7430 	 * of each device.  If num_sectors is zero, we find the largest size
7431 	 * that fits.
7432 	 */
7433 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
7434 		return -EBUSY;
7435 	if (!md_is_rdwr(mddev))
7436 		return -EROFS;
7437 
7438 	rdev_for_each(rdev, mddev) {
7439 		sector_t avail = rdev->sectors;
7440 
7441 		if (fit && (num_sectors == 0 || num_sectors > avail))
7442 			num_sectors = avail;
7443 		if (avail < num_sectors)
7444 			return -ENOSPC;
7445 	}
7446 	rv = mddev->pers->resize(mddev, num_sectors);
7447 	if (!rv) {
7448 		if (mddev_is_clustered(mddev))
7449 			mddev->cluster_ops->update_size(mddev, old_dev_sectors);
7450 		else if (!mddev_is_dm(mddev))
7451 			set_capacity_and_notify(mddev->gendisk,
7452 						mddev->array_sectors);
7453 	}
7454 	return rv;
7455 }
7456 
update_raid_disks(struct mddev * mddev,int raid_disks)7457 static int update_raid_disks(struct mddev *mddev, int raid_disks)
7458 {
7459 	int rv;
7460 	struct md_rdev *rdev;
7461 	/* change the number of raid disks */
7462 	if (mddev->pers->check_reshape == NULL)
7463 		return -EINVAL;
7464 	if (!md_is_rdwr(mddev))
7465 		return -EROFS;
7466 	if (raid_disks <= 0 ||
7467 	    (mddev->max_disks && raid_disks >= mddev->max_disks))
7468 		return -EINVAL;
7469 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
7470 	    test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) ||
7471 	    mddev->reshape_position != MaxSector)
7472 		return -EBUSY;
7473 
7474 	rdev_for_each(rdev, mddev) {
7475 		if (mddev->raid_disks < raid_disks &&
7476 		    rdev->data_offset < rdev->new_data_offset)
7477 			return -EINVAL;
7478 		if (mddev->raid_disks > raid_disks &&
7479 		    rdev->data_offset > rdev->new_data_offset)
7480 			return -EINVAL;
7481 	}
7482 
7483 	mddev->delta_disks = raid_disks - mddev->raid_disks;
7484 	if (mddev->delta_disks < 0)
7485 		mddev->reshape_backwards = 1;
7486 	else if (mddev->delta_disks > 0)
7487 		mddev->reshape_backwards = 0;
7488 
7489 	rv = mddev->pers->check_reshape(mddev);
7490 	if (rv < 0) {
7491 		mddev->delta_disks = 0;
7492 		mddev->reshape_backwards = 0;
7493 	}
7494 	return rv;
7495 }
7496 
get_cluster_ops(struct mddev * mddev)7497 static int get_cluster_ops(struct mddev *mddev)
7498 {
7499 	xa_lock(&md_submodule);
7500 	mddev->cluster_ops = xa_load(&md_submodule, ID_CLUSTER);
7501 	if (mddev->cluster_ops &&
7502 	    !try_module_get(mddev->cluster_ops->head.owner))
7503 		mddev->cluster_ops = NULL;
7504 	xa_unlock(&md_submodule);
7505 
7506 	return mddev->cluster_ops == NULL ? -ENOENT : 0;
7507 }
7508 
put_cluster_ops(struct mddev * mddev)7509 static void put_cluster_ops(struct mddev *mddev)
7510 {
7511 	if (!mddev->cluster_ops)
7512 		return;
7513 
7514 	mddev->cluster_ops->leave(mddev);
7515 	module_put(mddev->cluster_ops->head.owner);
7516 	mddev->cluster_ops = NULL;
7517 }
7518 
7519 /*
7520  * update_array_info is used to change the configuration of an
7521  * on-line array.
7522  * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size
7523  * fields in the info are checked against the array.
7524  * Any differences that cannot be handled will cause an error.
7525  * Normally, only one change can be managed at a time.
7526  */
update_array_info(struct mddev * mddev,mdu_array_info_t * info)7527 static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
7528 {
7529 	int rv = 0;
7530 	int cnt = 0;
7531 	int state = 0;
7532 
7533 	/* calculate expected state,ignoring low bits */
7534 	if (mddev->bitmap && mddev->bitmap_info.offset)
7535 		state |= (1 << MD_SB_BITMAP_PRESENT);
7536 
7537 	if (mddev->major_version != info->major_version ||
7538 	    mddev->minor_version != info->minor_version ||
7539 /*	    mddev->patch_version != info->patch_version || */
7540 	    mddev->ctime         != info->ctime         ||
7541 	    mddev->level         != info->level         ||
7542 /*	    mddev->layout        != info->layout        || */
7543 	    mddev->persistent	 != !info->not_persistent ||
7544 	    mddev->chunk_sectors != info->chunk_size >> 9 ||
7545 	    /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
7546 	    ((state^info->state) & 0xfffffe00)
7547 		)
7548 		return -EINVAL;
7549 	/* Check there is only one change */
7550 	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
7551 		cnt++;
7552 	if (mddev->raid_disks != info->raid_disks)
7553 		cnt++;
7554 	if (mddev->layout != info->layout)
7555 		cnt++;
7556 	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
7557 		cnt++;
7558 	if (cnt == 0)
7559 		return 0;
7560 	if (cnt > 1)
7561 		return -EINVAL;
7562 
7563 	if (mddev->layout != info->layout) {
7564 		/* Change layout
7565 		 * we don't need to do anything at the md level, the
7566 		 * personality will take care of it all.
7567 		 */
7568 		if (mddev->pers->check_reshape == NULL)
7569 			return -EINVAL;
7570 		else {
7571 			mddev->new_layout = info->layout;
7572 			rv = mddev->pers->check_reshape(mddev);
7573 			if (rv)
7574 				mddev->new_layout = mddev->layout;
7575 			return rv;
7576 		}
7577 	}
7578 	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
7579 		rv = update_size(mddev, (sector_t)info->size * 2);
7580 
7581 	if (mddev->raid_disks    != info->raid_disks)
7582 		rv = update_raid_disks(mddev, info->raid_disks);
7583 
7584 	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
7585 		if (mddev->pers->quiesce == NULL || mddev->thread == NULL) {
7586 			rv = -EINVAL;
7587 			goto err;
7588 		}
7589 		if (mddev->recovery || mddev->sync_thread) {
7590 			rv = -EBUSY;
7591 			goto err;
7592 		}
7593 		if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
7594 			/* add the bitmap */
7595 			if (mddev->bitmap) {
7596 				rv = -EEXIST;
7597 				goto err;
7598 			}
7599 			if (mddev->bitmap_info.default_offset == 0) {
7600 				rv = -EINVAL;
7601 				goto err;
7602 			}
7603 			mddev->bitmap_info.offset =
7604 				mddev->bitmap_info.default_offset;
7605 			mddev->bitmap_info.space =
7606 				mddev->bitmap_info.default_space;
7607 			rv = mddev->bitmap_ops->create(mddev);
7608 			if (!rv)
7609 				rv = mddev->bitmap_ops->load(mddev);
7610 
7611 			if (rv)
7612 				mddev->bitmap_ops->destroy(mddev);
7613 		} else {
7614 			struct md_bitmap_stats stats;
7615 
7616 			rv = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
7617 			if (rv)
7618 				goto err;
7619 
7620 			if (stats.file) {
7621 				rv = -EINVAL;
7622 				goto err;
7623 			}
7624 
7625 			if (mddev->bitmap_info.nodes) {
7626 				/* hold PW on all the bitmap lock */
7627 				if (mddev->cluster_ops->lock_all_bitmaps(mddev) <= 0) {
7628 					pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n");
7629 					rv = -EPERM;
7630 					mddev->cluster_ops->unlock_all_bitmaps(mddev);
7631 					goto err;
7632 				}
7633 
7634 				mddev->bitmap_info.nodes = 0;
7635 				put_cluster_ops(mddev);
7636 				mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
7637 			}
7638 			mddev->bitmap_ops->destroy(mddev);
7639 			mddev->bitmap_info.offset = 0;
7640 		}
7641 	}
7642 	md_update_sb(mddev, 1);
7643 	return rv;
7644 err:
7645 	return rv;
7646 }
7647 
set_disk_faulty(struct mddev * mddev,dev_t dev)7648 static int set_disk_faulty(struct mddev *mddev, dev_t dev)
7649 {
7650 	struct md_rdev *rdev;
7651 	int err = 0;
7652 
7653 	if (mddev->pers == NULL)
7654 		return -ENODEV;
7655 
7656 	rcu_read_lock();
7657 	rdev = md_find_rdev_rcu(mddev, dev);
7658 	if (!rdev)
7659 		err =  -ENODEV;
7660 	else {
7661 		md_error(mddev, rdev);
7662 		if (test_bit(MD_BROKEN, &mddev->flags))
7663 			err = -EBUSY;
7664 	}
7665 	rcu_read_unlock();
7666 	return err;
7667 }
7668 
7669 /*
7670  * We have a problem here : there is no easy way to give a CHS
7671  * virtual geometry. We currently pretend that we have a 2 heads
7672  * 4 sectors (with a BIG number of cylinders...). This drives
7673  * dosfs just mad... ;-)
7674  */
md_getgeo(struct block_device * bdev,struct hd_geometry * geo)7675 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
7676 {
7677 	struct mddev *mddev = bdev->bd_disk->private_data;
7678 
7679 	geo->heads = 2;
7680 	geo->sectors = 4;
7681 	geo->cylinders = mddev->array_sectors / 8;
7682 	return 0;
7683 }
7684 
md_ioctl_valid(unsigned int cmd)7685 static inline int md_ioctl_valid(unsigned int cmd)
7686 {
7687 	switch (cmd) {
7688 	case GET_ARRAY_INFO:
7689 	case GET_DISK_INFO:
7690 	case RAID_VERSION:
7691 		return 0;
7692 	case ADD_NEW_DISK:
7693 	case GET_BITMAP_FILE:
7694 	case HOT_ADD_DISK:
7695 	case HOT_REMOVE_DISK:
7696 	case RESTART_ARRAY_RW:
7697 	case RUN_ARRAY:
7698 	case SET_ARRAY_INFO:
7699 	case SET_BITMAP_FILE:
7700 	case SET_DISK_FAULTY:
7701 	case STOP_ARRAY:
7702 	case STOP_ARRAY_RO:
7703 	case CLUSTERED_DISK_NACK:
7704 		if (!capable(CAP_SYS_ADMIN))
7705 			return -EACCES;
7706 		return 0;
7707 	default:
7708 		return -ENOTTY;
7709 	}
7710 }
7711 
md_ioctl_need_suspend(unsigned int cmd)7712 static bool md_ioctl_need_suspend(unsigned int cmd)
7713 {
7714 	switch (cmd) {
7715 	case ADD_NEW_DISK:
7716 	case HOT_ADD_DISK:
7717 	case HOT_REMOVE_DISK:
7718 	case SET_BITMAP_FILE:
7719 	case SET_ARRAY_INFO:
7720 		return true;
7721 	default:
7722 		return false;
7723 	}
7724 }
7725 
__md_set_array_info(struct mddev * mddev,void __user * argp)7726 static int __md_set_array_info(struct mddev *mddev, void __user *argp)
7727 {
7728 	mdu_array_info_t info;
7729 	int err;
7730 
7731 	if (!argp)
7732 		memset(&info, 0, sizeof(info));
7733 	else if (copy_from_user(&info, argp, sizeof(info)))
7734 		return -EFAULT;
7735 
7736 	if (mddev->pers) {
7737 		err = update_array_info(mddev, &info);
7738 		if (err)
7739 			pr_warn("md: couldn't update array info. %d\n", err);
7740 		return err;
7741 	}
7742 
7743 	if (!list_empty(&mddev->disks)) {
7744 		pr_warn("md: array %s already has disks!\n", mdname(mddev));
7745 		return -EBUSY;
7746 	}
7747 
7748 	if (mddev->raid_disks) {
7749 		pr_warn("md: array %s already initialised!\n", mdname(mddev));
7750 		return -EBUSY;
7751 	}
7752 
7753 	err = md_set_array_info(mddev, &info);
7754 	if (err)
7755 		pr_warn("md: couldn't set array info. %d\n", err);
7756 
7757 	return err;
7758 }
7759 
md_ioctl(struct block_device * bdev,blk_mode_t mode,unsigned int cmd,unsigned long arg)7760 static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
7761 			unsigned int cmd, unsigned long arg)
7762 {
7763 	int err = 0;
7764 	void __user *argp = (void __user *)arg;
7765 	struct mddev *mddev = NULL;
7766 
7767 	err = md_ioctl_valid(cmd);
7768 	if (err)
7769 		return err;
7770 
7771 	/*
7772 	 * Commands dealing with the RAID driver but not any
7773 	 * particular array:
7774 	 */
7775 	if (cmd == RAID_VERSION)
7776 		return get_version(argp);
7777 
7778 	/*
7779 	 * Commands creating/starting a new array:
7780 	 */
7781 
7782 	mddev = bdev->bd_disk->private_data;
7783 
7784 	/* Some actions do not requires the mutex */
7785 	switch (cmd) {
7786 	case GET_ARRAY_INFO:
7787 		if (!mddev->raid_disks && !mddev->external)
7788 			return -ENODEV;
7789 		return get_array_info(mddev, argp);
7790 
7791 	case GET_DISK_INFO:
7792 		if (!mddev->raid_disks && !mddev->external)
7793 			return -ENODEV;
7794 		return get_disk_info(mddev, argp);
7795 
7796 	case SET_DISK_FAULTY:
7797 		return set_disk_faulty(mddev, new_decode_dev(arg));
7798 
7799 	case GET_BITMAP_FILE:
7800 		return get_bitmap_file(mddev, argp);
7801 	}
7802 
7803 	if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
7804 		/* Need to flush page cache, and ensure no-one else opens
7805 		 * and writes
7806 		 */
7807 		err = mddev_set_closing_and_sync_blockdev(mddev, 1);
7808 		if (err)
7809 			return err;
7810 	}
7811 
7812 	if (!md_is_rdwr(mddev))
7813 		flush_work(&mddev->sync_work);
7814 
7815 	err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev) :
7816 					   mddev_lock(mddev);
7817 	if (err) {
7818 		pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n",
7819 			 err, cmd);
7820 		goto out;
7821 	}
7822 
7823 	if (cmd == SET_ARRAY_INFO) {
7824 		err = __md_set_array_info(mddev, argp);
7825 		goto unlock;
7826 	}
7827 
7828 	/*
7829 	 * Commands querying/configuring an existing array:
7830 	 */
7831 	/* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
7832 	 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
7833 	if ((!mddev->raid_disks && !mddev->external)
7834 	    && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
7835 	    && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
7836 	    && cmd != GET_BITMAP_FILE) {
7837 		err = -ENODEV;
7838 		goto unlock;
7839 	}
7840 
7841 	/*
7842 	 * Commands even a read-only array can execute:
7843 	 */
7844 	switch (cmd) {
7845 	case RESTART_ARRAY_RW:
7846 		err = restart_array(mddev);
7847 		goto unlock;
7848 
7849 	case STOP_ARRAY:
7850 		err = do_md_stop(mddev, 0);
7851 		goto unlock;
7852 
7853 	case STOP_ARRAY_RO:
7854 		if (mddev->pers)
7855 			err = md_set_readonly(mddev);
7856 		goto unlock;
7857 
7858 	case HOT_REMOVE_DISK:
7859 		err = hot_remove_disk(mddev, new_decode_dev(arg));
7860 		goto unlock;
7861 
7862 	case ADD_NEW_DISK:
7863 		/* We can support ADD_NEW_DISK on read-only arrays
7864 		 * only if we are re-adding a preexisting device.
7865 		 * So require mddev->pers and MD_DISK_SYNC.
7866 		 */
7867 		if (mddev->pers) {
7868 			mdu_disk_info_t info;
7869 			if (copy_from_user(&info, argp, sizeof(info)))
7870 				err = -EFAULT;
7871 			else if (!(info.state & (1<<MD_DISK_SYNC)))
7872 				/* Need to clear read-only for this */
7873 				break;
7874 			else
7875 				err = md_add_new_disk(mddev, &info);
7876 			goto unlock;
7877 		}
7878 		break;
7879 	}
7880 
7881 	/*
7882 	 * The remaining ioctls are changing the state of the
7883 	 * superblock, so we do not allow them on read-only arrays.
7884 	 */
7885 	if (!md_is_rdwr(mddev) && mddev->pers) {
7886 		if (mddev->ro != MD_AUTO_READ) {
7887 			err = -EROFS;
7888 			goto unlock;
7889 		}
7890 		mddev->ro = MD_RDWR;
7891 		sysfs_notify_dirent_safe(mddev->sysfs_state);
7892 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7893 		/* mddev_unlock will wake thread */
7894 		/* If a device failed while we were read-only, we
7895 		 * need to make sure the metadata is updated now.
7896 		 */
7897 		if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) {
7898 			mddev_unlock(mddev);
7899 			wait_event(mddev->sb_wait,
7900 				   !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) &&
7901 				   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
7902 			mddev_lock_nointr(mddev);
7903 		}
7904 	}
7905 
7906 	switch (cmd) {
7907 	case ADD_NEW_DISK:
7908 	{
7909 		mdu_disk_info_t info;
7910 		if (copy_from_user(&info, argp, sizeof(info)))
7911 			err = -EFAULT;
7912 		else
7913 			err = md_add_new_disk(mddev, &info);
7914 		goto unlock;
7915 	}
7916 
7917 	case CLUSTERED_DISK_NACK:
7918 		if (mddev_is_clustered(mddev))
7919 			mddev->cluster_ops->new_disk_ack(mddev, false);
7920 		else
7921 			err = -EINVAL;
7922 		goto unlock;
7923 
7924 	case HOT_ADD_DISK:
7925 		err = hot_add_disk(mddev, new_decode_dev(arg));
7926 		goto unlock;
7927 
7928 	case RUN_ARRAY:
7929 		err = do_md_run(mddev);
7930 		goto unlock;
7931 
7932 	case SET_BITMAP_FILE:
7933 		err = set_bitmap_file(mddev, (int)arg);
7934 		goto unlock;
7935 
7936 	default:
7937 		err = -EINVAL;
7938 		goto unlock;
7939 	}
7940 
7941 unlock:
7942 	if (mddev->hold_active == UNTIL_IOCTL &&
7943 	    err != -EINVAL)
7944 		mddev->hold_active = 0;
7945 
7946 	md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) :
7947 				     mddev_unlock(mddev);
7948 
7949 out:
7950 	if (cmd == STOP_ARRAY_RO || (err && cmd == STOP_ARRAY))
7951 		clear_bit(MD_CLOSING, &mddev->flags);
7952 	return err;
7953 }
7954 #ifdef CONFIG_COMPAT
md_compat_ioctl(struct block_device * bdev,blk_mode_t mode,unsigned int cmd,unsigned long arg)7955 static int md_compat_ioctl(struct block_device *bdev, blk_mode_t mode,
7956 		    unsigned int cmd, unsigned long arg)
7957 {
7958 	switch (cmd) {
7959 	case HOT_REMOVE_DISK:
7960 	case HOT_ADD_DISK:
7961 	case SET_DISK_FAULTY:
7962 	case SET_BITMAP_FILE:
7963 		/* These take in integer arg, do not convert */
7964 		break;
7965 	default:
7966 		arg = (unsigned long)compat_ptr(arg);
7967 		break;
7968 	}
7969 
7970 	return md_ioctl(bdev, mode, cmd, arg);
7971 }
7972 #endif /* CONFIG_COMPAT */
7973 
md_set_read_only(struct block_device * bdev,bool ro)7974 static int md_set_read_only(struct block_device *bdev, bool ro)
7975 {
7976 	struct mddev *mddev = bdev->bd_disk->private_data;
7977 	int err;
7978 
7979 	err = mddev_lock(mddev);
7980 	if (err)
7981 		return err;
7982 
7983 	if (!mddev->raid_disks && !mddev->external) {
7984 		err = -ENODEV;
7985 		goto out_unlock;
7986 	}
7987 
7988 	/*
7989 	 * Transitioning to read-auto need only happen for arrays that call
7990 	 * md_write_start and which are not ready for writes yet.
7991 	 */
7992 	if (!ro && mddev->ro == MD_RDONLY && mddev->pers) {
7993 		err = restart_array(mddev);
7994 		if (err)
7995 			goto out_unlock;
7996 		mddev->ro = MD_AUTO_READ;
7997 	}
7998 
7999 out_unlock:
8000 	mddev_unlock(mddev);
8001 	return err;
8002 }
8003 
md_open(struct gendisk * disk,blk_mode_t mode)8004 static int md_open(struct gendisk *disk, blk_mode_t mode)
8005 {
8006 	struct mddev *mddev;
8007 	int err;
8008 
8009 	spin_lock(&all_mddevs_lock);
8010 	mddev = mddev_get(disk->private_data);
8011 	spin_unlock(&all_mddevs_lock);
8012 	if (!mddev)
8013 		return -ENODEV;
8014 
8015 	err = mutex_lock_interruptible(&mddev->open_mutex);
8016 	if (err)
8017 		goto out;
8018 
8019 	err = -ENODEV;
8020 	if (test_bit(MD_CLOSING, &mddev->flags))
8021 		goto out_unlock;
8022 
8023 	atomic_inc(&mddev->openers);
8024 	mutex_unlock(&mddev->open_mutex);
8025 
8026 	disk_check_media_change(disk);
8027 	return 0;
8028 
8029 out_unlock:
8030 	mutex_unlock(&mddev->open_mutex);
8031 out:
8032 	mddev_put(mddev);
8033 	return err;
8034 }
8035 
md_release(struct gendisk * disk)8036 static void md_release(struct gendisk *disk)
8037 {
8038 	struct mddev *mddev = disk->private_data;
8039 
8040 	BUG_ON(!mddev);
8041 	atomic_dec(&mddev->openers);
8042 	mddev_put(mddev);
8043 }
8044 
md_check_events(struct gendisk * disk,unsigned int clearing)8045 static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing)
8046 {
8047 	struct mddev *mddev = disk->private_data;
8048 	unsigned int ret = 0;
8049 
8050 	if (mddev->changed)
8051 		ret = DISK_EVENT_MEDIA_CHANGE;
8052 	mddev->changed = 0;
8053 	return ret;
8054 }
8055 
md_free_disk(struct gendisk * disk)8056 static void md_free_disk(struct gendisk *disk)
8057 {
8058 	struct mddev *mddev = disk->private_data;
8059 
8060 	mddev_free(mddev);
8061 }
8062 
8063 const struct block_device_operations md_fops =
8064 {
8065 	.owner		= THIS_MODULE,
8066 	.submit_bio	= md_submit_bio,
8067 	.open		= md_open,
8068 	.release	= md_release,
8069 	.ioctl		= md_ioctl,
8070 #ifdef CONFIG_COMPAT
8071 	.compat_ioctl	= md_compat_ioctl,
8072 #endif
8073 	.getgeo		= md_getgeo,
8074 	.check_events	= md_check_events,
8075 	.set_read_only	= md_set_read_only,
8076 	.free_disk	= md_free_disk,
8077 };
8078 
md_thread(void * arg)8079 static int md_thread(void *arg)
8080 {
8081 	struct md_thread *thread = arg;
8082 
8083 	/*
8084 	 * md_thread is a 'system-thread', it's priority should be very
8085 	 * high. We avoid resource deadlocks individually in each
8086 	 * raid personality. (RAID5 does preallocation) We also use RR and
8087 	 * the very same RT priority as kswapd, thus we will never get
8088 	 * into a priority inversion deadlock.
8089 	 *
8090 	 * we definitely have to have equal or higher priority than
8091 	 * bdflush, otherwise bdflush will deadlock if there are too
8092 	 * many dirty RAID5 blocks.
8093 	 */
8094 
8095 	allow_signal(SIGKILL);
8096 	while (!kthread_should_stop()) {
8097 
8098 		/* We need to wait INTERRUPTIBLE so that
8099 		 * we don't add to the load-average.
8100 		 * That means we need to be sure no signals are
8101 		 * pending
8102 		 */
8103 		if (signal_pending(current))
8104 			flush_signals(current);
8105 
8106 		wait_event_interruptible_timeout
8107 			(thread->wqueue,
8108 			 test_bit(THREAD_WAKEUP, &thread->flags)
8109 			 || kthread_should_stop() || kthread_should_park(),
8110 			 thread->timeout);
8111 
8112 		clear_bit(THREAD_WAKEUP, &thread->flags);
8113 		if (kthread_should_park())
8114 			kthread_parkme();
8115 		if (!kthread_should_stop())
8116 			thread->run(thread);
8117 	}
8118 
8119 	return 0;
8120 }
8121 
md_wakeup_thread_directly(struct md_thread __rcu * thread)8122 static void md_wakeup_thread_directly(struct md_thread __rcu *thread)
8123 {
8124 	struct md_thread *t;
8125 
8126 	rcu_read_lock();
8127 	t = rcu_dereference(thread);
8128 	if (t)
8129 		wake_up_process(t->tsk);
8130 	rcu_read_unlock();
8131 }
8132 
md_wakeup_thread(struct md_thread __rcu * thread)8133 void md_wakeup_thread(struct md_thread __rcu *thread)
8134 {
8135 	struct md_thread *t;
8136 
8137 	rcu_read_lock();
8138 	t = rcu_dereference(thread);
8139 	if (t) {
8140 		pr_debug("md: waking up MD thread %s.\n", t->tsk->comm);
8141 		set_bit(THREAD_WAKEUP, &t->flags);
8142 		if (wq_has_sleeper(&t->wqueue))
8143 			wake_up(&t->wqueue);
8144 	}
8145 	rcu_read_unlock();
8146 }
8147 EXPORT_SYMBOL(md_wakeup_thread);
8148 
md_register_thread(void (* run)(struct md_thread *),struct mddev * mddev,const char * name)8149 struct md_thread *md_register_thread(void (*run) (struct md_thread *),
8150 		struct mddev *mddev, const char *name)
8151 {
8152 	struct md_thread *thread;
8153 
8154 	thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL);
8155 	if (!thread)
8156 		return NULL;
8157 
8158 	init_waitqueue_head(&thread->wqueue);
8159 
8160 	thread->run = run;
8161 	thread->mddev = mddev;
8162 	thread->timeout = MAX_SCHEDULE_TIMEOUT;
8163 	thread->tsk = kthread_run(md_thread, thread,
8164 				  "%s_%s",
8165 				  mdname(thread->mddev),
8166 				  name);
8167 	if (IS_ERR(thread->tsk)) {
8168 		kfree(thread);
8169 		return NULL;
8170 	}
8171 	return thread;
8172 }
8173 EXPORT_SYMBOL(md_register_thread);
8174 
md_unregister_thread(struct mddev * mddev,struct md_thread __rcu ** threadp)8175 void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp)
8176 {
8177 	struct md_thread *thread = rcu_dereference_protected(*threadp,
8178 					lockdep_is_held(&mddev->reconfig_mutex));
8179 
8180 	if (!thread)
8181 		return;
8182 
8183 	rcu_assign_pointer(*threadp, NULL);
8184 	synchronize_rcu();
8185 
8186 	pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
8187 	kthread_stop(thread->tsk);
8188 	kfree(thread);
8189 }
8190 EXPORT_SYMBOL(md_unregister_thread);
8191 
md_error(struct mddev * mddev,struct md_rdev * rdev)8192 void md_error(struct mddev *mddev, struct md_rdev *rdev)
8193 {
8194 	if (!rdev || test_bit(Faulty, &rdev->flags))
8195 		return;
8196 
8197 	if (!mddev->pers || !mddev->pers->error_handler)
8198 		return;
8199 	mddev->pers->error_handler(mddev, rdev);
8200 
8201 	if (mddev->pers->head.id == ID_RAID0 ||
8202 	    mddev->pers->head.id == ID_LINEAR)
8203 		return;
8204 
8205 	if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags))
8206 		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
8207 	sysfs_notify_dirent_safe(rdev->sysfs_state);
8208 	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8209 	if (!test_bit(MD_BROKEN, &mddev->flags)) {
8210 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8211 		md_wakeup_thread(mddev->thread);
8212 	}
8213 	if (mddev->event_work.func)
8214 		queue_work(md_misc_wq, &mddev->event_work);
8215 	md_new_event();
8216 }
8217 EXPORT_SYMBOL(md_error);
8218 
8219 /* seq_file implementation /proc/mdstat */
8220 
status_unused(struct seq_file * seq)8221 static void status_unused(struct seq_file *seq)
8222 {
8223 	int i = 0;
8224 	struct md_rdev *rdev;
8225 
8226 	seq_printf(seq, "unused devices: ");
8227 
8228 	list_for_each_entry(rdev, &pending_raid_disks, same_set) {
8229 		i++;
8230 		seq_printf(seq, "%pg ", rdev->bdev);
8231 	}
8232 	if (!i)
8233 		seq_printf(seq, "<none>");
8234 
8235 	seq_printf(seq, "\n");
8236 }
8237 
status_personalities(struct seq_file * seq)8238 static void status_personalities(struct seq_file *seq)
8239 {
8240 	struct md_submodule_head *head;
8241 	unsigned long i;
8242 
8243 	seq_puts(seq, "Personalities : ");
8244 
8245 	xa_lock(&md_submodule);
8246 	xa_for_each(&md_submodule, i, head)
8247 		if (head->type == MD_PERSONALITY)
8248 			seq_printf(seq, "[%s] ", head->name);
8249 	xa_unlock(&md_submodule);
8250 
8251 	seq_puts(seq, "\n");
8252 }
8253 
status_resync(struct seq_file * seq,struct mddev * mddev)8254 static int status_resync(struct seq_file *seq, struct mddev *mddev)
8255 {
8256 	sector_t max_sectors, resync, res;
8257 	unsigned long dt, db = 0;
8258 	sector_t rt, curr_mark_cnt, resync_mark_cnt;
8259 	int scale, recovery_active;
8260 	unsigned int per_milli;
8261 
8262 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
8263 	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
8264 		max_sectors = mddev->resync_max_sectors;
8265 	else
8266 		max_sectors = mddev->dev_sectors;
8267 
8268 	resync = mddev->curr_resync;
8269 	if (resync < MD_RESYNC_ACTIVE) {
8270 		if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
8271 			/* Still cleaning up */
8272 			resync = max_sectors;
8273 	} else if (resync > max_sectors) {
8274 		resync = max_sectors;
8275 	} else {
8276 		res = atomic_read(&mddev->recovery_active);
8277 		/*
8278 		 * Resync has started, but the subtraction has overflowed or
8279 		 * yielded one of the special values. Force it to active to
8280 		 * ensure the status reports an active resync.
8281 		 */
8282 		if (resync < res || resync - res < MD_RESYNC_ACTIVE)
8283 			resync = MD_RESYNC_ACTIVE;
8284 		else
8285 			resync -= res;
8286 	}
8287 
8288 	if (resync == MD_RESYNC_NONE) {
8289 		if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) {
8290 			struct md_rdev *rdev;
8291 
8292 			rdev_for_each(rdev, mddev)
8293 				if (rdev->raid_disk >= 0 &&
8294 				    !test_bit(Faulty, &rdev->flags) &&
8295 				    rdev->recovery_offset != MaxSector &&
8296 				    rdev->recovery_offset) {
8297 					seq_printf(seq, "\trecover=REMOTE");
8298 					return 1;
8299 				}
8300 			if (mddev->reshape_position != MaxSector)
8301 				seq_printf(seq, "\treshape=REMOTE");
8302 			else
8303 				seq_printf(seq, "\tresync=REMOTE");
8304 			return 1;
8305 		}
8306 		if (mddev->recovery_cp < MaxSector) {
8307 			seq_printf(seq, "\tresync=PENDING");
8308 			return 1;
8309 		}
8310 		return 0;
8311 	}
8312 	if (resync < MD_RESYNC_ACTIVE) {
8313 		seq_printf(seq, "\tresync=DELAYED");
8314 		return 1;
8315 	}
8316 
8317 	WARN_ON(max_sectors == 0);
8318 	/* Pick 'scale' such that (resync>>scale)*1000 will fit
8319 	 * in a sector_t, and (max_sectors>>scale) will fit in a
8320 	 * u32, as those are the requirements for sector_div.
8321 	 * Thus 'scale' must be at least 10
8322 	 */
8323 	scale = 10;
8324 	if (sizeof(sector_t) > sizeof(unsigned long)) {
8325 		while ( max_sectors/2 > (1ULL<<(scale+32)))
8326 			scale++;
8327 	}
8328 	res = (resync>>scale)*1000;
8329 	sector_div(res, (u32)((max_sectors>>scale)+1));
8330 
8331 	per_milli = res;
8332 	{
8333 		int i, x = per_milli/50, y = 20-x;
8334 		seq_printf(seq, "[");
8335 		for (i = 0; i < x; i++)
8336 			seq_printf(seq, "=");
8337 		seq_printf(seq, ">");
8338 		for (i = 0; i < y; i++)
8339 			seq_printf(seq, ".");
8340 		seq_printf(seq, "] ");
8341 	}
8342 	seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
8343 		   (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
8344 		    "reshape" :
8345 		    (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
8346 		     "check" :
8347 		     (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
8348 		      "resync" : "recovery"))),
8349 		   per_milli/10, per_milli % 10,
8350 		   (unsigned long long) resync/2,
8351 		   (unsigned long long) max_sectors/2);
8352 
8353 	/*
8354 	 * dt: time from mark until now
8355 	 * db: blocks written from mark until now
8356 	 * rt: remaining time
8357 	 *
8358 	 * rt is a sector_t, which is always 64bit now. We are keeping
8359 	 * the original algorithm, but it is not really necessary.
8360 	 *
8361 	 * Original algorithm:
8362 	 *   So we divide before multiply in case it is 32bit and close
8363 	 *   to the limit.
8364 	 *   We scale the divisor (db) by 32 to avoid losing precision
8365 	 *   near the end of resync when the number of remaining sectors
8366 	 *   is close to 'db'.
8367 	 *   We then divide rt by 32 after multiplying by db to compensate.
8368 	 *   The '+1' avoids division by zero if db is very small.
8369 	 */
8370 	dt = ((jiffies - mddev->resync_mark) / HZ);
8371 	if (!dt) dt++;
8372 
8373 	curr_mark_cnt = mddev->curr_mark_cnt;
8374 	recovery_active = atomic_read(&mddev->recovery_active);
8375 	resync_mark_cnt = mddev->resync_mark_cnt;
8376 
8377 	if (curr_mark_cnt >= (recovery_active + resync_mark_cnt))
8378 		db = curr_mark_cnt - (recovery_active + resync_mark_cnt);
8379 
8380 	rt = max_sectors - resync;    /* number of remaining sectors */
8381 	rt = div64_u64(rt, db/32+1);
8382 	rt *= dt;
8383 	rt >>= 5;
8384 
8385 	seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
8386 		   ((unsigned long)rt % 60)/6);
8387 
8388 	seq_printf(seq, " speed=%ldK/sec", db/2/dt);
8389 	return 1;
8390 }
8391 
md_seq_start(struct seq_file * seq,loff_t * pos)8392 static void *md_seq_start(struct seq_file *seq, loff_t *pos)
8393 	__acquires(&all_mddevs_lock)
8394 {
8395 	seq->poll_event = atomic_read(&md_event_count);
8396 	spin_lock(&all_mddevs_lock);
8397 
8398 	return seq_list_start_head(&all_mddevs, *pos);
8399 }
8400 
md_seq_next(struct seq_file * seq,void * v,loff_t * pos)8401 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
8402 {
8403 	return seq_list_next(v, &all_mddevs, pos);
8404 }
8405 
md_seq_stop(struct seq_file * seq,void * v)8406 static void md_seq_stop(struct seq_file *seq, void *v)
8407 	__releases(&all_mddevs_lock)
8408 {
8409 	spin_unlock(&all_mddevs_lock);
8410 }
8411 
md_bitmap_status(struct seq_file * seq,struct mddev * mddev)8412 static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev)
8413 {
8414 	struct md_bitmap_stats stats;
8415 	unsigned long used_pages;
8416 	unsigned long chunk_kb;
8417 	int err;
8418 
8419 	err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
8420 	if (err)
8421 		return;
8422 
8423 	chunk_kb = mddev->bitmap_info.chunksize >> 10;
8424 	used_pages = stats.pages - stats.missing_pages;
8425 
8426 	seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], %lu%s chunk",
8427 		   used_pages, stats.pages, used_pages << (PAGE_SHIFT - 10),
8428 		   chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize,
8429 		   chunk_kb ? "KB" : "B");
8430 
8431 	if (stats.file) {
8432 		seq_puts(seq, ", file: ");
8433 		seq_file_path(seq, stats.file, " \t\n");
8434 	}
8435 
8436 	seq_putc(seq, '\n');
8437 }
8438 
md_seq_show(struct seq_file * seq,void * v)8439 static int md_seq_show(struct seq_file *seq, void *v)
8440 {
8441 	struct mddev *mddev;
8442 	sector_t sectors;
8443 	struct md_rdev *rdev;
8444 
8445 	if (v == &all_mddevs) {
8446 		status_personalities(seq);
8447 		if (list_empty(&all_mddevs))
8448 			status_unused(seq);
8449 		return 0;
8450 	}
8451 
8452 	mddev = list_entry(v, struct mddev, all_mddevs);
8453 	if (!mddev_get(mddev))
8454 		return 0;
8455 
8456 	spin_unlock(&all_mddevs_lock);
8457 
8458 	/* prevent bitmap to be freed after checking */
8459 	mutex_lock(&mddev->bitmap_info.mutex);
8460 
8461 	spin_lock(&mddev->lock);
8462 	if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
8463 		seq_printf(seq, "%s : ", mdname(mddev));
8464 		if (mddev->pers) {
8465 			if (test_bit(MD_BROKEN, &mddev->flags))
8466 				seq_printf(seq, "broken");
8467 			else
8468 				seq_printf(seq, "active");
8469 			if (mddev->ro == MD_RDONLY)
8470 				seq_printf(seq, " (read-only)");
8471 			if (mddev->ro == MD_AUTO_READ)
8472 				seq_printf(seq, " (auto-read-only)");
8473 			seq_printf(seq, " %s", mddev->pers->head.name);
8474 		} else {
8475 			seq_printf(seq, "inactive");
8476 		}
8477 
8478 		sectors = 0;
8479 		rcu_read_lock();
8480 		rdev_for_each_rcu(rdev, mddev) {
8481 			seq_printf(seq, " %pg[%d]", rdev->bdev, rdev->desc_nr);
8482 
8483 			if (test_bit(WriteMostly, &rdev->flags))
8484 				seq_printf(seq, "(W)");
8485 			if (test_bit(Journal, &rdev->flags))
8486 				seq_printf(seq, "(J)");
8487 			if (test_bit(Faulty, &rdev->flags)) {
8488 				seq_printf(seq, "(F)");
8489 				continue;
8490 			}
8491 			if (rdev->raid_disk < 0)
8492 				seq_printf(seq, "(S)"); /* spare */
8493 			if (test_bit(Replacement, &rdev->flags))
8494 				seq_printf(seq, "(R)");
8495 			sectors += rdev->sectors;
8496 		}
8497 		rcu_read_unlock();
8498 
8499 		if (!list_empty(&mddev->disks)) {
8500 			if (mddev->pers)
8501 				seq_printf(seq, "\n      %llu blocks",
8502 					   (unsigned long long)
8503 					   mddev->array_sectors / 2);
8504 			else
8505 				seq_printf(seq, "\n      %llu blocks",
8506 					   (unsigned long long)sectors / 2);
8507 		}
8508 		if (mddev->persistent) {
8509 			if (mddev->major_version != 0 ||
8510 			    mddev->minor_version != 90) {
8511 				seq_printf(seq," super %d.%d",
8512 					   mddev->major_version,
8513 					   mddev->minor_version);
8514 			}
8515 		} else if (mddev->external)
8516 			seq_printf(seq, " super external:%s",
8517 				   mddev->metadata_type);
8518 		else
8519 			seq_printf(seq, " super non-persistent");
8520 
8521 		if (mddev->pers) {
8522 			mddev->pers->status(seq, mddev);
8523 			seq_printf(seq, "\n      ");
8524 			if (mddev->pers->sync_request) {
8525 				if (status_resync(seq, mddev))
8526 					seq_printf(seq, "\n      ");
8527 			}
8528 		} else
8529 			seq_printf(seq, "\n       ");
8530 
8531 		md_bitmap_status(seq, mddev);
8532 
8533 		seq_printf(seq, "\n");
8534 	}
8535 	spin_unlock(&mddev->lock);
8536 	mutex_unlock(&mddev->bitmap_info.mutex);
8537 	spin_lock(&all_mddevs_lock);
8538 
8539 	if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs))
8540 		status_unused(seq);
8541 
8542 	mddev_put_locked(mddev);
8543 	return 0;
8544 }
8545 
8546 static const struct seq_operations md_seq_ops = {
8547 	.start  = md_seq_start,
8548 	.next   = md_seq_next,
8549 	.stop   = md_seq_stop,
8550 	.show   = md_seq_show,
8551 };
8552 
md_seq_open(struct inode * inode,struct file * file)8553 static int md_seq_open(struct inode *inode, struct file *file)
8554 {
8555 	struct seq_file *seq;
8556 	int error;
8557 
8558 	error = seq_open(file, &md_seq_ops);
8559 	if (error)
8560 		return error;
8561 
8562 	seq = file->private_data;
8563 	seq->poll_event = atomic_read(&md_event_count);
8564 	return error;
8565 }
8566 
8567 static int md_unloading;
mdstat_poll(struct file * filp,poll_table * wait)8568 static __poll_t mdstat_poll(struct file *filp, poll_table *wait)
8569 {
8570 	struct seq_file *seq = filp->private_data;
8571 	__poll_t mask;
8572 
8573 	if (md_unloading)
8574 		return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
8575 	poll_wait(filp, &md_event_waiters, wait);
8576 
8577 	/* always allow read */
8578 	mask = EPOLLIN | EPOLLRDNORM;
8579 
8580 	if (seq->poll_event != atomic_read(&md_event_count))
8581 		mask |= EPOLLERR | EPOLLPRI;
8582 	return mask;
8583 }
8584 
8585 static const struct proc_ops mdstat_proc_ops = {
8586 	.proc_open	= md_seq_open,
8587 	.proc_read	= seq_read,
8588 	.proc_lseek	= seq_lseek,
8589 	.proc_release	= seq_release,
8590 	.proc_poll	= mdstat_poll,
8591 };
8592 
register_md_submodule(struct md_submodule_head * msh)8593 int register_md_submodule(struct md_submodule_head *msh)
8594 {
8595 	return xa_insert(&md_submodule, msh->id, msh, GFP_KERNEL);
8596 }
8597 EXPORT_SYMBOL_GPL(register_md_submodule);
8598 
unregister_md_submodule(struct md_submodule_head * msh)8599 void unregister_md_submodule(struct md_submodule_head *msh)
8600 {
8601 	xa_erase(&md_submodule, msh->id);
8602 }
8603 EXPORT_SYMBOL_GPL(unregister_md_submodule);
8604 
md_setup_cluster(struct mddev * mddev,int nodes)8605 int md_setup_cluster(struct mddev *mddev, int nodes)
8606 {
8607 	int ret = get_cluster_ops(mddev);
8608 
8609 	if (ret) {
8610 		request_module("md-cluster");
8611 		ret = get_cluster_ops(mddev);
8612 	}
8613 
8614 	/* ensure module won't be unloaded */
8615 	if (ret) {
8616 		pr_warn("can't find md-cluster module or get its reference.\n");
8617 		return ret;
8618 	}
8619 
8620 	ret = mddev->cluster_ops->join(mddev, nodes);
8621 	if (!ret)
8622 		mddev->safemode_delay = 0;
8623 	return ret;
8624 }
8625 
md_cluster_stop(struct mddev * mddev)8626 void md_cluster_stop(struct mddev *mddev)
8627 {
8628 	put_cluster_ops(mddev);
8629 }
8630 
is_rdev_holder_idle(struct md_rdev * rdev,bool init)8631 static bool is_rdev_holder_idle(struct md_rdev *rdev, bool init)
8632 {
8633 	unsigned long last_events = rdev->last_events;
8634 
8635 	if (!bdev_is_partition(rdev->bdev))
8636 		return true;
8637 
8638 	/*
8639 	 * If rdev is partition, and user doesn't issue IO to the array, the
8640 	 * array is still not idle if user issues IO to other partitions.
8641 	 */
8642 	rdev->last_events = part_stat_read_accum(rdev->bdev->bd_disk->part0,
8643 						 sectors) -
8644 			    part_stat_read_accum(rdev->bdev, sectors);
8645 
8646 	return init || rdev->last_events <= last_events;
8647 }
8648 
8649 /*
8650  * mddev is idle if following conditions are matched since last check:
8651  * 1) mddev doesn't have normal IO completed;
8652  * 2) mddev doesn't have inflight normal IO;
8653  * 3) if any member disk is partition, and other partitions don't have IO
8654  *    completed;
8655  *
8656  * Noted this checking rely on IO accounting is enabled.
8657  */
is_mddev_idle(struct mddev * mddev,int init)8658 static bool is_mddev_idle(struct mddev *mddev, int init)
8659 {
8660 	unsigned long last_events = mddev->normal_io_events;
8661 	struct gendisk *disk;
8662 	struct md_rdev *rdev;
8663 	bool idle = true;
8664 
8665 	disk = mddev_is_dm(mddev) ? mddev->dm_gendisk : mddev->gendisk;
8666 	if (!disk)
8667 		return true;
8668 
8669 	mddev->normal_io_events = part_stat_read_accum(disk->part0, sectors);
8670 	if (!init && (mddev->normal_io_events > last_events ||
8671 		      bdev_count_inflight(disk->part0)))
8672 		idle = false;
8673 
8674 	rcu_read_lock();
8675 	rdev_for_each_rcu(rdev, mddev)
8676 		if (!is_rdev_holder_idle(rdev, init))
8677 			idle = false;
8678 	rcu_read_unlock();
8679 
8680 	return idle;
8681 }
8682 
md_done_sync(struct mddev * mddev,int blocks,int ok)8683 void md_done_sync(struct mddev *mddev, int blocks, int ok)
8684 {
8685 	/* another "blocks" (512byte) blocks have been synced */
8686 	atomic_sub(blocks, &mddev->recovery_active);
8687 	wake_up(&mddev->recovery_wait);
8688 	if (!ok) {
8689 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8690 		set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
8691 		md_wakeup_thread(mddev->thread);
8692 		// stop recovery, signal do_sync ....
8693 	}
8694 }
8695 EXPORT_SYMBOL(md_done_sync);
8696 
8697 /* md_write_start(mddev, bi)
8698  * If we need to update some array metadata (e.g. 'active' flag
8699  * in superblock) before writing, schedule a superblock update
8700  * and wait for it to complete.
8701  * A return value of 'false' means that the write wasn't recorded
8702  * and cannot proceed as the array is being suspend.
8703  */
md_write_start(struct mddev * mddev,struct bio * bi)8704 void md_write_start(struct mddev *mddev, struct bio *bi)
8705 {
8706 	int did_change = 0;
8707 
8708 	if (bio_data_dir(bi) != WRITE)
8709 		return;
8710 
8711 	BUG_ON(mddev->ro == MD_RDONLY);
8712 	if (mddev->ro == MD_AUTO_READ) {
8713 		/* need to switch to read/write */
8714 		mddev->ro = MD_RDWR;
8715 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8716 		md_wakeup_thread(mddev->thread);
8717 		md_wakeup_thread(mddev->sync_thread);
8718 		did_change = 1;
8719 	}
8720 	rcu_read_lock();
8721 	percpu_ref_get(&mddev->writes_pending);
8722 	smp_mb(); /* Match smp_mb in set_in_sync() */
8723 	if (mddev->safemode == 1)
8724 		mddev->safemode = 0;
8725 	/* sync_checkers is always 0 when writes_pending is in per-cpu mode */
8726 	if (mddev->in_sync || mddev->sync_checkers) {
8727 		spin_lock(&mddev->lock);
8728 		if (mddev->in_sync) {
8729 			mddev->in_sync = 0;
8730 			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8731 			set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8732 			md_wakeup_thread(mddev->thread);
8733 			did_change = 1;
8734 		}
8735 		spin_unlock(&mddev->lock);
8736 	}
8737 	rcu_read_unlock();
8738 	if (did_change)
8739 		sysfs_notify_dirent_safe(mddev->sysfs_state);
8740 	if (!mddev->has_superblocks)
8741 		return;
8742 	wait_event(mddev->sb_wait,
8743 		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
8744 }
8745 EXPORT_SYMBOL(md_write_start);
8746 
8747 /* md_write_inc can only be called when md_write_start() has
8748  * already been called at least once of the current request.
8749  * It increments the counter and is useful when a single request
8750  * is split into several parts.  Each part causes an increment and
8751  * so needs a matching md_write_end().
8752  * Unlike md_write_start(), it is safe to call md_write_inc() inside
8753  * a spinlocked region.
8754  */
md_write_inc(struct mddev * mddev,struct bio * bi)8755 void md_write_inc(struct mddev *mddev, struct bio *bi)
8756 {
8757 	if (bio_data_dir(bi) != WRITE)
8758 		return;
8759 	WARN_ON_ONCE(mddev->in_sync || !md_is_rdwr(mddev));
8760 	percpu_ref_get(&mddev->writes_pending);
8761 }
8762 EXPORT_SYMBOL(md_write_inc);
8763 
md_write_end(struct mddev * mddev)8764 void md_write_end(struct mddev *mddev)
8765 {
8766 	percpu_ref_put(&mddev->writes_pending);
8767 
8768 	if (mddev->safemode == 2)
8769 		md_wakeup_thread(mddev->thread);
8770 	else if (mddev->safemode_delay)
8771 		/* The roundup() ensures this only performs locking once
8772 		 * every ->safemode_delay jiffies
8773 		 */
8774 		mod_timer(&mddev->safemode_timer,
8775 			  roundup(jiffies, mddev->safemode_delay) +
8776 			  mddev->safemode_delay);
8777 }
8778 
8779 EXPORT_SYMBOL(md_write_end);
8780 
8781 /* This is used by raid0 and raid10 */
md_submit_discard_bio(struct mddev * mddev,struct md_rdev * rdev,struct bio * bio,sector_t start,sector_t size)8782 void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
8783 			struct bio *bio, sector_t start, sector_t size)
8784 {
8785 	struct bio *discard_bio = NULL;
8786 
8787 	if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO,
8788 			&discard_bio) || !discard_bio)
8789 		return;
8790 
8791 	bio_chain(discard_bio, bio);
8792 	bio_clone_blkg_association(discard_bio, bio);
8793 	mddev_trace_remap(mddev, discard_bio, bio->bi_iter.bi_sector);
8794 	submit_bio_noacct(discard_bio);
8795 }
8796 EXPORT_SYMBOL_GPL(md_submit_discard_bio);
8797 
md_bitmap_start(struct mddev * mddev,struct md_io_clone * md_io_clone)8798 static void md_bitmap_start(struct mddev *mddev,
8799 			    struct md_io_clone *md_io_clone)
8800 {
8801 	if (mddev->pers->bitmap_sector)
8802 		mddev->pers->bitmap_sector(mddev, &md_io_clone->offset,
8803 					   &md_io_clone->sectors);
8804 
8805 	mddev->bitmap_ops->start_write(mddev, md_io_clone->offset,
8806 				       md_io_clone->sectors);
8807 }
8808 
md_bitmap_end(struct mddev * mddev,struct md_io_clone * md_io_clone)8809 static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone)
8810 {
8811 	mddev->bitmap_ops->end_write(mddev, md_io_clone->offset,
8812 				     md_io_clone->sectors);
8813 }
8814 
md_end_clone_io(struct bio * bio)8815 static void md_end_clone_io(struct bio *bio)
8816 {
8817 	struct md_io_clone *md_io_clone = bio->bi_private;
8818 	struct bio *orig_bio = md_io_clone->orig_bio;
8819 	struct mddev *mddev = md_io_clone->mddev;
8820 
8821 	if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap)
8822 		md_bitmap_end(mddev, md_io_clone);
8823 
8824 	if (bio->bi_status && !orig_bio->bi_status)
8825 		orig_bio->bi_status = bio->bi_status;
8826 
8827 	if (md_io_clone->start_time)
8828 		bio_end_io_acct(orig_bio, md_io_clone->start_time);
8829 
8830 	bio_put(bio);
8831 	bio_endio(orig_bio);
8832 	percpu_ref_put(&mddev->active_io);
8833 }
8834 
md_clone_bio(struct mddev * mddev,struct bio ** bio)8835 static void md_clone_bio(struct mddev *mddev, struct bio **bio)
8836 {
8837 	struct block_device *bdev = (*bio)->bi_bdev;
8838 	struct md_io_clone *md_io_clone;
8839 	struct bio *clone =
8840 		bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set);
8841 
8842 	md_io_clone = container_of(clone, struct md_io_clone, bio_clone);
8843 	md_io_clone->orig_bio = *bio;
8844 	md_io_clone->mddev = mddev;
8845 	if (blk_queue_io_stat(bdev->bd_disk->queue))
8846 		md_io_clone->start_time = bio_start_io_acct(*bio);
8847 
8848 	if (bio_data_dir(*bio) == WRITE && mddev->bitmap) {
8849 		md_io_clone->offset = (*bio)->bi_iter.bi_sector;
8850 		md_io_clone->sectors = bio_sectors(*bio);
8851 		md_bitmap_start(mddev, md_io_clone);
8852 	}
8853 
8854 	clone->bi_end_io = md_end_clone_io;
8855 	clone->bi_private = md_io_clone;
8856 	*bio = clone;
8857 }
8858 
md_account_bio(struct mddev * mddev,struct bio ** bio)8859 void md_account_bio(struct mddev *mddev, struct bio **bio)
8860 {
8861 	percpu_ref_get(&mddev->active_io);
8862 	md_clone_bio(mddev, bio);
8863 }
8864 EXPORT_SYMBOL_GPL(md_account_bio);
8865 
md_free_cloned_bio(struct bio * bio)8866 void md_free_cloned_bio(struct bio *bio)
8867 {
8868 	struct md_io_clone *md_io_clone = bio->bi_private;
8869 	struct bio *orig_bio = md_io_clone->orig_bio;
8870 	struct mddev *mddev = md_io_clone->mddev;
8871 
8872 	if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap)
8873 		md_bitmap_end(mddev, md_io_clone);
8874 
8875 	if (bio->bi_status && !orig_bio->bi_status)
8876 		orig_bio->bi_status = bio->bi_status;
8877 
8878 	if (md_io_clone->start_time)
8879 		bio_end_io_acct(orig_bio, md_io_clone->start_time);
8880 
8881 	bio_put(bio);
8882 	percpu_ref_put(&mddev->active_io);
8883 }
8884 EXPORT_SYMBOL_GPL(md_free_cloned_bio);
8885 
8886 /* md_allow_write(mddev)
8887  * Calling this ensures that the array is marked 'active' so that writes
8888  * may proceed without blocking.  It is important to call this before
8889  * attempting a GFP_KERNEL allocation while holding the mddev lock.
8890  * Must be called with mddev_lock held.
8891  */
md_allow_write(struct mddev * mddev)8892 void md_allow_write(struct mddev *mddev)
8893 {
8894 	if (!mddev->pers)
8895 		return;
8896 	if (!md_is_rdwr(mddev))
8897 		return;
8898 	if (!mddev->pers->sync_request)
8899 		return;
8900 
8901 	spin_lock(&mddev->lock);
8902 	if (mddev->in_sync) {
8903 		mddev->in_sync = 0;
8904 		set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8905 		set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8906 		if (mddev->safemode_delay &&
8907 		    mddev->safemode == 0)
8908 			mddev->safemode = 1;
8909 		spin_unlock(&mddev->lock);
8910 		md_update_sb(mddev, 0);
8911 		sysfs_notify_dirent_safe(mddev->sysfs_state);
8912 		/* wait for the dirty state to be recorded in the metadata */
8913 		wait_event(mddev->sb_wait,
8914 			   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
8915 	} else
8916 		spin_unlock(&mddev->lock);
8917 }
8918 EXPORT_SYMBOL_GPL(md_allow_write);
8919 
md_sync_max_sectors(struct mddev * mddev,enum sync_action action)8920 static sector_t md_sync_max_sectors(struct mddev *mddev,
8921 				    enum sync_action action)
8922 {
8923 	switch (action) {
8924 	case ACTION_RESYNC:
8925 	case ACTION_CHECK:
8926 	case ACTION_REPAIR:
8927 		atomic64_set(&mddev->resync_mismatches, 0);
8928 		fallthrough;
8929 	case ACTION_RESHAPE:
8930 		return mddev->resync_max_sectors;
8931 	case ACTION_RECOVER:
8932 		return mddev->dev_sectors;
8933 	default:
8934 		return 0;
8935 	}
8936 }
8937 
md_sync_position(struct mddev * mddev,enum sync_action action)8938 static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
8939 {
8940 	sector_t start = 0;
8941 	struct md_rdev *rdev;
8942 
8943 	switch (action) {
8944 	case ACTION_CHECK:
8945 	case ACTION_REPAIR:
8946 		return mddev->resync_min;
8947 	case ACTION_RESYNC:
8948 		if (!mddev->bitmap)
8949 			return mddev->recovery_cp;
8950 		return 0;
8951 	case ACTION_RESHAPE:
8952 		/*
8953 		 * If the original node aborts reshaping then we continue the
8954 		 * reshaping, so set again to avoid restart reshape from the
8955 		 * first beginning
8956 		 */
8957 		if (mddev_is_clustered(mddev) &&
8958 		    mddev->reshape_position != MaxSector)
8959 			return mddev->reshape_position;
8960 		return 0;
8961 	case ACTION_RECOVER:
8962 		start = MaxSector;
8963 		rcu_read_lock();
8964 		rdev_for_each_rcu(rdev, mddev)
8965 			if (rdev->raid_disk >= 0 &&
8966 			    !test_bit(Journal, &rdev->flags) &&
8967 			    !test_bit(Faulty, &rdev->flags) &&
8968 			    !test_bit(In_sync, &rdev->flags) &&
8969 			    rdev->recovery_offset < start)
8970 				start = rdev->recovery_offset;
8971 		rcu_read_unlock();
8972 
8973 		/* If there is a bitmap, we need to make sure all
8974 		 * writes that started before we added a spare
8975 		 * complete before we start doing a recovery.
8976 		 * Otherwise the write might complete and (via
8977 		 * bitmap_endwrite) set a bit in the bitmap after the
8978 		 * recovery has checked that bit and skipped that
8979 		 * region.
8980 		 */
8981 		if (mddev->bitmap) {
8982 			mddev->pers->quiesce(mddev, 1);
8983 			mddev->pers->quiesce(mddev, 0);
8984 		}
8985 		return start;
8986 	default:
8987 		return MaxSector;
8988 	}
8989 }
8990 
sync_io_within_limit(struct mddev * mddev)8991 static bool sync_io_within_limit(struct mddev *mddev)
8992 {
8993 	int io_sectors;
8994 
8995 	/*
8996 	 * For raid456, sync IO is stripe(4k) per IO, for other levels, it's
8997 	 * RESYNC_PAGES(64k) per IO.
8998 	 */
8999 	if (mddev->level == 4 || mddev->level == 5 || mddev->level == 6)
9000 		io_sectors = 8;
9001 	else
9002 		io_sectors = 128;
9003 
9004 	return atomic_read(&mddev->recovery_active) <
9005 		io_sectors * sync_io_depth(mddev);
9006 }
9007 
9008 #define SYNC_MARKS	10
9009 #define	SYNC_MARK_STEP	(3*HZ)
9010 #define UPDATE_FREQUENCY (5*60*HZ)
md_do_sync(struct md_thread * thread)9011 void md_do_sync(struct md_thread *thread)
9012 {
9013 	struct mddev *mddev = thread->mddev;
9014 	struct mddev *mddev2;
9015 	unsigned int currspeed = 0, window;
9016 	sector_t max_sectors,j, io_sectors, recovery_done;
9017 	unsigned long mark[SYNC_MARKS];
9018 	unsigned long update_time;
9019 	sector_t mark_cnt[SYNC_MARKS];
9020 	int last_mark,m;
9021 	sector_t last_check;
9022 	int skipped = 0;
9023 	struct md_rdev *rdev;
9024 	enum sync_action action;
9025 	const char *desc;
9026 	struct blk_plug plug;
9027 	int ret;
9028 
9029 	/* just incase thread restarts... */
9030 	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
9031 		return;
9032 
9033 	if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9034 		goto skip;
9035 
9036 	if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) ||
9037 	    !md_is_rdwr(mddev)) {/* never try to sync a read-only array */
9038 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
9039 		goto skip;
9040 	}
9041 
9042 	if (mddev_is_clustered(mddev)) {
9043 		ret = mddev->cluster_ops->resync_start(mddev);
9044 		if (ret)
9045 			goto skip;
9046 
9047 		set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags);
9048 		if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
9049 			test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ||
9050 			test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
9051 		     && ((unsigned long long)mddev->curr_resync_completed
9052 			 < (unsigned long long)mddev->resync_max_sectors))
9053 			goto skip;
9054 	}
9055 
9056 	action = md_sync_action(mddev);
9057 	desc = md_sync_action_name(action);
9058 	mddev->last_sync_action = action;
9059 
9060 	/*
9061 	 * Before starting a resync we must have set curr_resync to
9062 	 * 2, and then checked that every "conflicting" array has curr_resync
9063 	 * less than ours.  When we find one that is the same or higher
9064 	 * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
9065 	 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
9066 	 * This will mean we have to start checking from the beginning again.
9067 	 *
9068 	 */
9069 	if (mddev_is_clustered(mddev))
9070 		mddev->cluster_ops->resync_start_notify(mddev);
9071 	do {
9072 		int mddev2_minor = -1;
9073 		mddev->curr_resync = MD_RESYNC_DELAYED;
9074 
9075 	try_again:
9076 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9077 			goto skip;
9078 		spin_lock(&all_mddevs_lock);
9079 		list_for_each_entry(mddev2, &all_mddevs, all_mddevs) {
9080 			if (test_bit(MD_DELETED, &mddev2->flags))
9081 				continue;
9082 			if (mddev2 == mddev)
9083 				continue;
9084 			if (!mddev->parallel_resync
9085 			&&  mddev2->curr_resync
9086 			&&  match_mddev_units(mddev, mddev2)) {
9087 				DEFINE_WAIT(wq);
9088 				if (mddev < mddev2 &&
9089 				    mddev->curr_resync == MD_RESYNC_DELAYED) {
9090 					/* arbitrarily yield */
9091 					mddev->curr_resync = MD_RESYNC_YIELDED;
9092 					wake_up(&resync_wait);
9093 				}
9094 				if (mddev > mddev2 &&
9095 				    mddev->curr_resync == MD_RESYNC_YIELDED)
9096 					/* no need to wait here, we can wait the next
9097 					 * time 'round when curr_resync == 2
9098 					 */
9099 					continue;
9100 				/* We need to wait 'interruptible' so as not to
9101 				 * contribute to the load average, and not to
9102 				 * be caught by 'softlockup'
9103 				 */
9104 				prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
9105 				if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
9106 				    mddev2->curr_resync >= mddev->curr_resync) {
9107 					if (mddev2_minor != mddev2->md_minor) {
9108 						mddev2_minor = mddev2->md_minor;
9109 						pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n",
9110 							desc, mdname(mddev),
9111 							mdname(mddev2));
9112 					}
9113 					spin_unlock(&all_mddevs_lock);
9114 
9115 					if (signal_pending(current))
9116 						flush_signals(current);
9117 					schedule();
9118 					finish_wait(&resync_wait, &wq);
9119 					goto try_again;
9120 				}
9121 				finish_wait(&resync_wait, &wq);
9122 			}
9123 		}
9124 		spin_unlock(&all_mddevs_lock);
9125 	} while (mddev->curr_resync < MD_RESYNC_DELAYED);
9126 
9127 	max_sectors = md_sync_max_sectors(mddev, action);
9128 	j = md_sync_position(mddev, action);
9129 
9130 	pr_info("md: %s of RAID array %s\n", desc, mdname(mddev));
9131 	pr_debug("md: minimum _guaranteed_  speed: %d KB/sec/disk.\n", speed_min(mddev));
9132 	pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n",
9133 		 speed_max(mddev), desc);
9134 
9135 	is_mddev_idle(mddev, 1); /* this initializes IO event counters */
9136 
9137 	io_sectors = 0;
9138 	for (m = 0; m < SYNC_MARKS; m++) {
9139 		mark[m] = jiffies;
9140 		mark_cnt[m] = io_sectors;
9141 	}
9142 	last_mark = 0;
9143 	mddev->resync_mark = mark[last_mark];
9144 	mddev->resync_mark_cnt = mark_cnt[last_mark];
9145 
9146 	/*
9147 	 * Tune reconstruction:
9148 	 */
9149 	window = 32 * (PAGE_SIZE / 512);
9150 	pr_debug("md: using %dk window, over a total of %lluk.\n",
9151 		 window/2, (unsigned long long)max_sectors/2);
9152 
9153 	atomic_set(&mddev->recovery_active, 0);
9154 	last_check = 0;
9155 
9156 	if (j >= MD_RESYNC_ACTIVE) {
9157 		pr_debug("md: resuming %s of %s from checkpoint.\n",
9158 			 desc, mdname(mddev));
9159 		mddev->curr_resync = j;
9160 	} else
9161 		mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */
9162 	mddev->curr_resync_completed = j;
9163 	sysfs_notify_dirent_safe(mddev->sysfs_completed);
9164 	md_new_event();
9165 	update_time = jiffies;
9166 
9167 	blk_start_plug(&plug);
9168 	while (j < max_sectors) {
9169 		sector_t sectors;
9170 
9171 		skipped = 0;
9172 
9173 		if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9174 		    ((mddev->curr_resync > mddev->curr_resync_completed &&
9175 		      (mddev->curr_resync - mddev->curr_resync_completed)
9176 		      > (max_sectors >> 4)) ||
9177 		     time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
9178 		     (j - mddev->curr_resync_completed)*2
9179 		     >= mddev->resync_max - mddev->curr_resync_completed ||
9180 		     mddev->curr_resync_completed > mddev->resync_max
9181 			    )) {
9182 			/* time to update curr_resync_completed */
9183 			wait_event(mddev->recovery_wait,
9184 				   atomic_read(&mddev->recovery_active) == 0);
9185 			mddev->curr_resync_completed = j;
9186 			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
9187 			    j > mddev->recovery_cp)
9188 				mddev->recovery_cp = j;
9189 			update_time = jiffies;
9190 			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
9191 			sysfs_notify_dirent_safe(mddev->sysfs_completed);
9192 		}
9193 
9194 		while (j >= mddev->resync_max &&
9195 		       !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
9196 			/* As this condition is controlled by user-space,
9197 			 * we can block indefinitely, so use '_interruptible'
9198 			 * to avoid triggering warnings.
9199 			 */
9200 			flush_signals(current); /* just in case */
9201 			wait_event_interruptible(mddev->recovery_wait,
9202 						 mddev->resync_max > j
9203 						 || test_bit(MD_RECOVERY_INTR,
9204 							     &mddev->recovery));
9205 		}
9206 
9207 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9208 			break;
9209 
9210 		sectors = mddev->pers->sync_request(mddev, j, max_sectors,
9211 						    &skipped);
9212 		if (sectors == 0) {
9213 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
9214 			break;
9215 		}
9216 
9217 		if (!skipped) { /* actual IO requested */
9218 			io_sectors += sectors;
9219 			atomic_add(sectors, &mddev->recovery_active);
9220 		}
9221 
9222 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9223 			break;
9224 
9225 		j += sectors;
9226 		if (j > max_sectors)
9227 			/* when skipping, extra large numbers can be returned. */
9228 			j = max_sectors;
9229 		if (j >= MD_RESYNC_ACTIVE)
9230 			mddev->curr_resync = j;
9231 		mddev->curr_mark_cnt = io_sectors;
9232 		if (last_check == 0)
9233 			/* this is the earliest that rebuild will be
9234 			 * visible in /proc/mdstat
9235 			 */
9236 			md_new_event();
9237 
9238 		if (last_check + window > io_sectors || j == max_sectors)
9239 			continue;
9240 
9241 		last_check = io_sectors;
9242 	repeat:
9243 		if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
9244 			/* step marks */
9245 			int next = (last_mark+1) % SYNC_MARKS;
9246 
9247 			mddev->resync_mark = mark[next];
9248 			mddev->resync_mark_cnt = mark_cnt[next];
9249 			mark[next] = jiffies;
9250 			mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
9251 			last_mark = next;
9252 		}
9253 
9254 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9255 			break;
9256 
9257 		/*
9258 		 * this loop exits only if either when we are slower than
9259 		 * the 'hard' speed limit, or the system was IO-idle for
9260 		 * a jiffy.
9261 		 * the system might be non-idle CPU-wise, but we only care
9262 		 * about not overloading the IO subsystem. (things like an
9263 		 * e2fsck being done on the RAID array should execute fast)
9264 		 */
9265 		cond_resched();
9266 
9267 		recovery_done = io_sectors - atomic_read(&mddev->recovery_active);
9268 		currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2
9269 			/((jiffies-mddev->resync_mark)/HZ +1) +1;
9270 
9271 		if (currspeed > speed_min(mddev)) {
9272 			if (currspeed > speed_max(mddev)) {
9273 				msleep(500);
9274 				goto repeat;
9275 			}
9276 			if (!sync_io_within_limit(mddev) &&
9277 			    !is_mddev_idle(mddev, 0)) {
9278 				/*
9279 				 * Give other IO more of a chance.
9280 				 * The faster the devices, the less we wait.
9281 				 */
9282 				wait_event(mddev->recovery_wait,
9283 					   !atomic_read(&mddev->recovery_active));
9284 			}
9285 		}
9286 	}
9287 	pr_info("md: %s: %s %s.\n",mdname(mddev), desc,
9288 		test_bit(MD_RECOVERY_INTR, &mddev->recovery)
9289 		? "interrupted" : "done");
9290 	/*
9291 	 * this also signals 'finished resyncing' to md_stop
9292 	 */
9293 	blk_finish_plug(&plug);
9294 	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
9295 
9296 	if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9297 	    !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
9298 	    mddev->curr_resync >= MD_RESYNC_ACTIVE) {
9299 		mddev->curr_resync_completed = mddev->curr_resync;
9300 		sysfs_notify_dirent_safe(mddev->sysfs_completed);
9301 	}
9302 	mddev->pers->sync_request(mddev, max_sectors, max_sectors, &skipped);
9303 
9304 	if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
9305 	    mddev->curr_resync > MD_RESYNC_ACTIVE) {
9306 		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
9307 			if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
9308 				if (mddev->curr_resync >= mddev->recovery_cp) {
9309 					pr_debug("md: checkpointing %s of %s.\n",
9310 						 desc, mdname(mddev));
9311 					if (test_bit(MD_RECOVERY_ERROR,
9312 						&mddev->recovery))
9313 						mddev->recovery_cp =
9314 							mddev->curr_resync_completed;
9315 					else
9316 						mddev->recovery_cp =
9317 							mddev->curr_resync;
9318 				}
9319 			} else
9320 				mddev->recovery_cp = MaxSector;
9321 		} else {
9322 			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9323 				mddev->curr_resync = MaxSector;
9324 			if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9325 			    test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
9326 				rcu_read_lock();
9327 				rdev_for_each_rcu(rdev, mddev)
9328 					if (rdev->raid_disk >= 0 &&
9329 					    mddev->delta_disks >= 0 &&
9330 					    !test_bit(Journal, &rdev->flags) &&
9331 					    !test_bit(Faulty, &rdev->flags) &&
9332 					    !test_bit(In_sync, &rdev->flags) &&
9333 					    rdev->recovery_offset < mddev->curr_resync)
9334 						rdev->recovery_offset = mddev->curr_resync;
9335 				rcu_read_unlock();
9336 			}
9337 		}
9338 	}
9339  skip:
9340 	/* set CHANGE_PENDING here since maybe another update is needed,
9341 	 * so other nodes are informed. It should be harmless for normal
9342 	 * raid */
9343 	set_mask_bits(&mddev->sb_flags, 0,
9344 		      BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));
9345 
9346 	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9347 			!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
9348 			mddev->delta_disks > 0 &&
9349 			mddev->pers->finish_reshape &&
9350 			mddev->pers->size &&
9351 			!mddev_is_dm(mddev)) {
9352 		mddev_lock_nointr(mddev);
9353 		md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
9354 		mddev_unlock(mddev);
9355 		if (!mddev_is_clustered(mddev))
9356 			set_capacity_and_notify(mddev->gendisk,
9357 						mddev->array_sectors);
9358 	}
9359 
9360 	spin_lock(&mddev->lock);
9361 	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
9362 		/* We completed so min/max setting can be forgotten if used. */
9363 		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
9364 			mddev->resync_min = 0;
9365 		mddev->resync_max = MaxSector;
9366 	} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
9367 		mddev->resync_min = mddev->curr_resync_completed;
9368 	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
9369 	mddev->curr_resync = MD_RESYNC_NONE;
9370 	spin_unlock(&mddev->lock);
9371 
9372 	wake_up(&resync_wait);
9373 	md_wakeup_thread(mddev->thread);
9374 	return;
9375 }
9376 EXPORT_SYMBOL_GPL(md_do_sync);
9377 
rdev_removeable(struct md_rdev * rdev)9378 static bool rdev_removeable(struct md_rdev *rdev)
9379 {
9380 	/* rdev is not used. */
9381 	if (rdev->raid_disk < 0)
9382 		return false;
9383 
9384 	/* There are still inflight io, don't remove this rdev. */
9385 	if (atomic_read(&rdev->nr_pending))
9386 		return false;
9387 
9388 	/*
9389 	 * An error occurred but has not yet been acknowledged by the metadata
9390 	 * handler, don't remove this rdev.
9391 	 */
9392 	if (test_bit(Blocked, &rdev->flags))
9393 		return false;
9394 
9395 	/* Fautly rdev is not used, it's safe to remove it. */
9396 	if (test_bit(Faulty, &rdev->flags))
9397 		return true;
9398 
9399 	/* Journal disk can only be removed if it's faulty. */
9400 	if (test_bit(Journal, &rdev->flags))
9401 		return false;
9402 
9403 	/*
9404 	 * 'In_sync' is cleared while 'raid_disk' is valid, which means
9405 	 * replacement has just become active from pers->spare_active(), and
9406 	 * then pers->hot_remove_disk() will replace this rdev with replacement.
9407 	 */
9408 	if (!test_bit(In_sync, &rdev->flags))
9409 		return true;
9410 
9411 	return false;
9412 }
9413 
rdev_is_spare(struct md_rdev * rdev)9414 static bool rdev_is_spare(struct md_rdev *rdev)
9415 {
9416 	return !test_bit(Candidate, &rdev->flags) && rdev->raid_disk >= 0 &&
9417 	       !test_bit(In_sync, &rdev->flags) &&
9418 	       !test_bit(Journal, &rdev->flags) &&
9419 	       !test_bit(Faulty, &rdev->flags);
9420 }
9421 
rdev_addable(struct md_rdev * rdev)9422 static bool rdev_addable(struct md_rdev *rdev)
9423 {
9424 	/* rdev is already used, don't add it again. */
9425 	if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 ||
9426 	    test_bit(Faulty, &rdev->flags))
9427 		return false;
9428 
9429 	/* Allow to add journal disk. */
9430 	if (test_bit(Journal, &rdev->flags))
9431 		return true;
9432 
9433 	/* Allow to add if array is read-write. */
9434 	if (md_is_rdwr(rdev->mddev))
9435 		return true;
9436 
9437 	/*
9438 	 * For read-only array, only allow to readd a rdev. And if bitmap is
9439 	 * used, don't allow to readd a rdev that is too old.
9440 	 */
9441 	if (rdev->saved_raid_disk >= 0 && !test_bit(Bitmap_sync, &rdev->flags))
9442 		return true;
9443 
9444 	return false;
9445 }
9446 
md_spares_need_change(struct mddev * mddev)9447 static bool md_spares_need_change(struct mddev *mddev)
9448 {
9449 	struct md_rdev *rdev;
9450 
9451 	rcu_read_lock();
9452 	rdev_for_each_rcu(rdev, mddev) {
9453 		if (rdev_removeable(rdev) || rdev_addable(rdev)) {
9454 			rcu_read_unlock();
9455 			return true;
9456 		}
9457 	}
9458 	rcu_read_unlock();
9459 	return false;
9460 }
9461 
remove_spares(struct mddev * mddev,struct md_rdev * this)9462 static int remove_spares(struct mddev *mddev, struct md_rdev *this)
9463 {
9464 	struct md_rdev *rdev;
9465 	int removed = 0;
9466 
9467 	rdev_for_each(rdev, mddev) {
9468 		if ((this == NULL || rdev == this) && rdev_removeable(rdev) &&
9469 		    !mddev->pers->hot_remove_disk(mddev, rdev)) {
9470 			sysfs_unlink_rdev(mddev, rdev);
9471 			rdev->saved_raid_disk = rdev->raid_disk;
9472 			rdev->raid_disk = -1;
9473 			removed++;
9474 		}
9475 	}
9476 
9477 	if (removed && mddev->kobj.sd)
9478 		sysfs_notify_dirent_safe(mddev->sysfs_degraded);
9479 
9480 	return removed;
9481 }
9482 
remove_and_add_spares(struct mddev * mddev,struct md_rdev * this)9483 static int remove_and_add_spares(struct mddev *mddev,
9484 				 struct md_rdev *this)
9485 {
9486 	struct md_rdev *rdev;
9487 	int spares = 0;
9488 	int removed = 0;
9489 
9490 	if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
9491 		/* Mustn't remove devices when resync thread is running */
9492 		return 0;
9493 
9494 	removed = remove_spares(mddev, this);
9495 	if (this && removed)
9496 		goto no_add;
9497 
9498 	rdev_for_each(rdev, mddev) {
9499 		if (this && this != rdev)
9500 			continue;
9501 		if (rdev_is_spare(rdev))
9502 			spares++;
9503 		if (!rdev_addable(rdev))
9504 			continue;
9505 		if (!test_bit(Journal, &rdev->flags))
9506 			rdev->recovery_offset = 0;
9507 		if (mddev->pers->hot_add_disk(mddev, rdev) == 0) {
9508 			/* failure here is OK */
9509 			sysfs_link_rdev(mddev, rdev);
9510 			if (!test_bit(Journal, &rdev->flags))
9511 				spares++;
9512 			md_new_event();
9513 			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
9514 		}
9515 	}
9516 no_add:
9517 	if (removed)
9518 		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
9519 	return spares;
9520 }
9521 
md_choose_sync_action(struct mddev * mddev,int * spares)9522 static bool md_choose_sync_action(struct mddev *mddev, int *spares)
9523 {
9524 	/* Check if reshape is in progress first. */
9525 	if (mddev->reshape_position != MaxSector) {
9526 		if (mddev->pers->check_reshape == NULL ||
9527 		    mddev->pers->check_reshape(mddev) != 0)
9528 			return false;
9529 
9530 		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
9531 		clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9532 		return true;
9533 	}
9534 
9535 	/* Check if resync is in progress. */
9536 	if (mddev->recovery_cp < MaxSector) {
9537 		remove_spares(mddev, NULL);
9538 		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9539 		clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9540 		return true;
9541 	}
9542 
9543 	/*
9544 	 * Remove any failed drives, then add spares if possible. Spares are
9545 	 * also removed and re-added, to allow the personality to fail the
9546 	 * re-add.
9547 	 */
9548 	*spares = remove_and_add_spares(mddev, NULL);
9549 	if (*spares) {
9550 		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9551 		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
9552 		clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
9553 
9554 		/* Start new recovery. */
9555 		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9556 		return true;
9557 	}
9558 
9559 	/* Delay to choose resync/check/repair in md_do_sync(). */
9560 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
9561 		return true;
9562 
9563 	/* Nothing to be done */
9564 	return false;
9565 }
9566 
md_start_sync(struct work_struct * ws)9567 static void md_start_sync(struct work_struct *ws)
9568 {
9569 	struct mddev *mddev = container_of(ws, struct mddev, sync_work);
9570 	int spares = 0;
9571 	bool suspend = false;
9572 	char *name;
9573 
9574 	/*
9575 	 * If reshape is still in progress, spares won't be added or removed
9576 	 * from conf until reshape is done.
9577 	 */
9578 	if (mddev->reshape_position == MaxSector &&
9579 	    md_spares_need_change(mddev)) {
9580 		suspend = true;
9581 		mddev_suspend(mddev, false);
9582 	}
9583 
9584 	mddev_lock_nointr(mddev);
9585 	if (!md_is_rdwr(mddev)) {
9586 		/*
9587 		 * On a read-only array we can:
9588 		 * - remove failed devices
9589 		 * - add already-in_sync devices if the array itself is in-sync.
9590 		 * As we only add devices that are already in-sync, we can
9591 		 * activate the spares immediately.
9592 		 */
9593 		remove_and_add_spares(mddev, NULL);
9594 		goto not_running;
9595 	}
9596 
9597 	if (!md_choose_sync_action(mddev, &spares))
9598 		goto not_running;
9599 
9600 	if (!mddev->pers->sync_request)
9601 		goto not_running;
9602 
9603 	/*
9604 	 * We are adding a device or devices to an array which has the bitmap
9605 	 * stored on all devices. So make sure all bitmap pages get written.
9606 	 */
9607 	if (spares)
9608 		mddev->bitmap_ops->write_all(mddev);
9609 
9610 	name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ?
9611 			"reshape" : "resync";
9612 	rcu_assign_pointer(mddev->sync_thread,
9613 			   md_register_thread(md_do_sync, mddev, name));
9614 	if (!mddev->sync_thread) {
9615 		pr_warn("%s: could not start resync thread...\n",
9616 			mdname(mddev));
9617 		/* leave the spares where they are, it shouldn't hurt */
9618 		goto not_running;
9619 	}
9620 
9621 	mddev_unlock(mddev);
9622 	/*
9623 	 * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should
9624 	 * not set it again. Otherwise, we may cause issue like this one:
9625 	 *     https://bugzilla.kernel.org/show_bug.cgi?id=218200
9626 	 * Therefore, use __mddev_resume(mddev, false).
9627 	 */
9628 	if (suspend)
9629 		__mddev_resume(mddev, false);
9630 	md_wakeup_thread(mddev->sync_thread);
9631 	sysfs_notify_dirent_safe(mddev->sysfs_action);
9632 	md_new_event();
9633 	return;
9634 
9635 not_running:
9636 	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9637 	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
9638 	clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
9639 	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
9640 	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9641 	mddev_unlock(mddev);
9642 	/*
9643 	 * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should
9644 	 * not set it again. Otherwise, we may cause issue like this one:
9645 	 *     https://bugzilla.kernel.org/show_bug.cgi?id=218200
9646 	 * Therefore, use __mddev_resume(mddev, false).
9647 	 */
9648 	if (suspend)
9649 		__mddev_resume(mddev, false);
9650 
9651 	wake_up(&resync_wait);
9652 	if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
9653 	    mddev->sysfs_action)
9654 		sysfs_notify_dirent_safe(mddev->sysfs_action);
9655 }
9656 
unregister_sync_thread(struct mddev * mddev)9657 static void unregister_sync_thread(struct mddev *mddev)
9658 {
9659 	if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
9660 		/* resync/recovery still happening */
9661 		clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9662 		return;
9663 	}
9664 
9665 	if (WARN_ON_ONCE(!mddev->sync_thread))
9666 		return;
9667 
9668 	md_reap_sync_thread(mddev);
9669 }
9670 
9671 /*
9672  * This routine is regularly called by all per-raid-array threads to
9673  * deal with generic issues like resync and super-block update.
9674  * Raid personalities that don't have a thread (linear/raid0) do not
9675  * need this as they never do any recovery or update the superblock.
9676  *
9677  * It does not do any resync itself, but rather "forks" off other threads
9678  * to do that as needed.
9679  * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
9680  * "->recovery" and create a thread at ->sync_thread.
9681  * When the thread finishes it sets MD_RECOVERY_DONE
9682  * and wakeups up this thread which will reap the thread and finish up.
9683  * This thread also removes any faulty devices (with nr_pending == 0).
9684  *
9685  * The overall approach is:
9686  *  1/ if the superblock needs updating, update it.
9687  *  2/ If a recovery thread is running, don't do anything else.
9688  *  3/ If recovery has finished, clean up, possibly marking spares active.
9689  *  4/ If there are any faulty devices, remove them.
9690  *  5/ If array is degraded, try to add spares devices
9691  *  6/ If array has spares or is not in-sync, start a resync thread.
9692  */
md_check_recovery(struct mddev * mddev)9693 void md_check_recovery(struct mddev *mddev)
9694 {
9695 	if (mddev->bitmap)
9696 		mddev->bitmap_ops->daemon_work(mddev);
9697 
9698 	if (signal_pending(current)) {
9699 		if (mddev->pers->sync_request && !mddev->external) {
9700 			pr_debug("md: %s in immediate safe mode\n",
9701 				 mdname(mddev));
9702 			mddev->safemode = 2;
9703 		}
9704 		flush_signals(current);
9705 	}
9706 
9707 	if (!md_is_rdwr(mddev) &&
9708 	    !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) &&
9709 	    !test_bit(MD_RECOVERY_DONE, &mddev->recovery))
9710 		return;
9711 	if ( ! (
9712 		(mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
9713 		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
9714 		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
9715 		(mddev->external == 0 && mddev->safemode == 1) ||
9716 		(mddev->safemode == 2
9717 		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
9718 		))
9719 		return;
9720 
9721 	if (mddev_trylock(mddev)) {
9722 		bool try_set_sync = mddev->safemode != 0;
9723 
9724 		if (!mddev->external && mddev->safemode == 1)
9725 			mddev->safemode = 0;
9726 
9727 		if (!md_is_rdwr(mddev)) {
9728 			struct md_rdev *rdev;
9729 
9730 			if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
9731 				unregister_sync_thread(mddev);
9732 				goto unlock;
9733 			}
9734 
9735 			if (!mddev->external && mddev->in_sync)
9736 				/*
9737 				 * 'Blocked' flag not needed as failed devices
9738 				 * will be recorded if array switched to read/write.
9739 				 * Leaving it set will prevent the device
9740 				 * from being removed.
9741 				 */
9742 				rdev_for_each(rdev, mddev)
9743 					clear_bit(Blocked, &rdev->flags);
9744 
9745 			/*
9746 			 * There is no thread, but we need to call
9747 			 * ->spare_active and clear saved_raid_disk
9748 			 */
9749 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
9750 			md_reap_sync_thread(mddev);
9751 
9752 			/*
9753 			 * Let md_start_sync() to remove and add rdevs to the
9754 			 * array.
9755 			 */
9756 			if (md_spares_need_change(mddev)) {
9757 				set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9758 				queue_work(md_misc_wq, &mddev->sync_work);
9759 			}
9760 
9761 			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9762 			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9763 			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
9764 
9765 			goto unlock;
9766 		}
9767 
9768 		if (mddev_is_clustered(mddev)) {
9769 			struct md_rdev *rdev, *tmp;
9770 			/* kick the device if another node issued a
9771 			 * remove disk.
9772 			 */
9773 			rdev_for_each_safe(rdev, tmp, mddev) {
9774 				if (test_and_clear_bit(ClusterRemove, &rdev->flags) &&
9775 						rdev->raid_disk < 0)
9776 					md_kick_rdev_from_array(rdev);
9777 			}
9778 		}
9779 
9780 		if (try_set_sync && !mddev->external && !mddev->in_sync) {
9781 			spin_lock(&mddev->lock);
9782 			set_in_sync(mddev);
9783 			spin_unlock(&mddev->lock);
9784 		}
9785 
9786 		if (mddev->sb_flags)
9787 			md_update_sb(mddev, 0);
9788 
9789 		/*
9790 		 * Never start a new sync thread if MD_RECOVERY_RUNNING is
9791 		 * still set.
9792 		 */
9793 		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
9794 			unregister_sync_thread(mddev);
9795 			goto unlock;
9796 		}
9797 
9798 		/* Set RUNNING before clearing NEEDED to avoid
9799 		 * any transients in the value of "sync_action".
9800 		 */
9801 		mddev->curr_resync_completed = 0;
9802 		spin_lock(&mddev->lock);
9803 		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9804 		spin_unlock(&mddev->lock);
9805 		/* Clear some bits that don't mean anything, but
9806 		 * might be left set
9807 		 */
9808 		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
9809 		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
9810 
9811 		if (test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) &&
9812 		    !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
9813 			queue_work(md_misc_wq, &mddev->sync_work);
9814 		} else {
9815 			clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9816 			wake_up(&resync_wait);
9817 		}
9818 
9819 	unlock:
9820 		wake_up(&mddev->sb_wait);
9821 		mddev_unlock(mddev);
9822 	}
9823 }
9824 EXPORT_SYMBOL(md_check_recovery);
9825 
md_reap_sync_thread(struct mddev * mddev)9826 void md_reap_sync_thread(struct mddev *mddev)
9827 {
9828 	struct md_rdev *rdev;
9829 	sector_t old_dev_sectors = mddev->dev_sectors;
9830 	bool is_reshaped = false;
9831 
9832 	/* resync has finished, collect result */
9833 	md_unregister_thread(mddev, &mddev->sync_thread);
9834 	atomic_inc(&mddev->sync_seq);
9835 
9836 	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
9837 	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
9838 	    mddev->degraded != mddev->raid_disks) {
9839 		/* success...*/
9840 		/* activate any spares */
9841 		if (mddev->pers->spare_active(mddev)) {
9842 			sysfs_notify_dirent_safe(mddev->sysfs_degraded);
9843 			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
9844 		}
9845 	}
9846 	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9847 	    mddev->pers->finish_reshape) {
9848 		mddev->pers->finish_reshape(mddev);
9849 		if (mddev_is_clustered(mddev))
9850 			is_reshaped = true;
9851 	}
9852 
9853 	/* If array is no-longer degraded, then any saved_raid_disk
9854 	 * information must be scrapped.
9855 	 */
9856 	if (!mddev->degraded)
9857 		rdev_for_each(rdev, mddev)
9858 			rdev->saved_raid_disk = -1;
9859 
9860 	md_update_sb(mddev, 1);
9861 	/* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can
9862 	 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by
9863 	 * clustered raid */
9864 	if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
9865 		mddev->cluster_ops->resync_finish(mddev);
9866 	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9867 	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
9868 	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9869 	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
9870 	clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
9871 	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
9872 	/*
9873 	 * We call mddev->cluster_ops->update_size here because sync_size could
9874 	 * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared,
9875 	 * so it is time to update size across cluster.
9876 	 */
9877 	if (mddev_is_clustered(mddev) && is_reshaped
9878 				      && !test_bit(MD_CLOSING, &mddev->flags))
9879 		mddev->cluster_ops->update_size(mddev, old_dev_sectors);
9880 	/* flag recovery needed just to double check */
9881 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9882 	sysfs_notify_dirent_safe(mddev->sysfs_completed);
9883 	sysfs_notify_dirent_safe(mddev->sysfs_action);
9884 	md_new_event();
9885 	if (mddev->event_work.func)
9886 		queue_work(md_misc_wq, &mddev->event_work);
9887 	wake_up(&resync_wait);
9888 }
9889 EXPORT_SYMBOL(md_reap_sync_thread);
9890 
md_wait_for_blocked_rdev(struct md_rdev * rdev,struct mddev * mddev)9891 void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
9892 {
9893 	sysfs_notify_dirent_safe(rdev->sysfs_state);
9894 	wait_event_timeout(rdev->blocked_wait, !rdev_blocked(rdev),
9895 			   msecs_to_jiffies(5000));
9896 	rdev_dec_pending(rdev, mddev);
9897 }
9898 EXPORT_SYMBOL(md_wait_for_blocked_rdev);
9899 
md_finish_reshape(struct mddev * mddev)9900 void md_finish_reshape(struct mddev *mddev)
9901 {
9902 	/* called be personality module when reshape completes. */
9903 	struct md_rdev *rdev;
9904 
9905 	rdev_for_each(rdev, mddev) {
9906 		if (rdev->data_offset > rdev->new_data_offset)
9907 			rdev->sectors += rdev->data_offset - rdev->new_data_offset;
9908 		else
9909 			rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
9910 		rdev->data_offset = rdev->new_data_offset;
9911 	}
9912 }
9913 EXPORT_SYMBOL(md_finish_reshape);
9914 
9915 /* Bad block management */
9916 
9917 /* Returns true on success, false on failure */
rdev_set_badblocks(struct md_rdev * rdev,sector_t s,int sectors,int is_new)9918 bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
9919 			int is_new)
9920 {
9921 	struct mddev *mddev = rdev->mddev;
9922 
9923 	/*
9924 	 * Recording new badblocks for faulty rdev will force unnecessary
9925 	 * super block updating. This is fragile for external management because
9926 	 * userspace daemon may trying to remove this device and deadlock may
9927 	 * occur. This will be probably solved in the mdadm, but it is safer to
9928 	 * avoid it.
9929 	 */
9930 	if (test_bit(Faulty, &rdev->flags))
9931 		return true;
9932 
9933 	if (is_new)
9934 		s += rdev->new_data_offset;
9935 	else
9936 		s += rdev->data_offset;
9937 
9938 	if (!badblocks_set(&rdev->badblocks, s, sectors, 0))
9939 		return false;
9940 
9941 	/* Make sure they get written out promptly */
9942 	if (test_bit(ExternalBbl, &rdev->flags))
9943 		sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks);
9944 	sysfs_notify_dirent_safe(rdev->sysfs_state);
9945 	set_mask_bits(&mddev->sb_flags, 0,
9946 		      BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
9947 	md_wakeup_thread(rdev->mddev->thread);
9948 	return true;
9949 }
9950 EXPORT_SYMBOL_GPL(rdev_set_badblocks);
9951 
rdev_clear_badblocks(struct md_rdev * rdev,sector_t s,int sectors,int is_new)9952 void rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
9953 			  int is_new)
9954 {
9955 	if (is_new)
9956 		s += rdev->new_data_offset;
9957 	else
9958 		s += rdev->data_offset;
9959 
9960 	if (!badblocks_clear(&rdev->badblocks, s, sectors))
9961 		return;
9962 
9963 	if (test_bit(ExternalBbl, &rdev->flags))
9964 		sysfs_notify_dirent_safe(rdev->sysfs_badblocks);
9965 }
9966 EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
9967 
md_notify_reboot(struct notifier_block * this,unsigned long code,void * x)9968 static int md_notify_reboot(struct notifier_block *this,
9969 			    unsigned long code, void *x)
9970 {
9971 	struct mddev *mddev;
9972 	int need_delay = 0;
9973 
9974 	spin_lock(&all_mddevs_lock);
9975 	list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
9976 		if (!mddev_get(mddev))
9977 			continue;
9978 		spin_unlock(&all_mddevs_lock);
9979 		if (mddev_trylock(mddev)) {
9980 			if (mddev->pers)
9981 				__md_stop_writes(mddev);
9982 			if (mddev->persistent)
9983 				mddev->safemode = 2;
9984 			mddev_unlock(mddev);
9985 		}
9986 		need_delay = 1;
9987 		spin_lock(&all_mddevs_lock);
9988 		mddev_put_locked(mddev);
9989 	}
9990 	spin_unlock(&all_mddevs_lock);
9991 
9992 	/*
9993 	 * certain more exotic SCSI devices are known to be
9994 	 * volatile wrt too early system reboots. While the
9995 	 * right place to handle this issue is the given
9996 	 * driver, we do want to have a safe RAID driver ...
9997 	 */
9998 	if (need_delay)
9999 		msleep(1000);
10000 
10001 	return NOTIFY_DONE;
10002 }
10003 
10004 static struct notifier_block md_notifier = {
10005 	.notifier_call	= md_notify_reboot,
10006 	.next		= NULL,
10007 	.priority	= INT_MAX, /* before any real devices */
10008 };
10009 
md_geninit(void)10010 static void md_geninit(void)
10011 {
10012 	pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
10013 
10014 	proc_create("mdstat", S_IRUGO, NULL, &mdstat_proc_ops);
10015 }
10016 
md_init(void)10017 static int __init md_init(void)
10018 {
10019 	int ret = -ENOMEM;
10020 
10021 	md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
10022 	if (!md_wq)
10023 		goto err_wq;
10024 
10025 	md_misc_wq = alloc_workqueue("md_misc", 0, 0);
10026 	if (!md_misc_wq)
10027 		goto err_misc_wq;
10028 
10029 	md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND,
10030 				       0);
10031 	if (!md_bitmap_wq)
10032 		goto err_bitmap_wq;
10033 
10034 	ret = __register_blkdev(MD_MAJOR, "md", md_probe);
10035 	if (ret < 0)
10036 		goto err_md;
10037 
10038 	ret = __register_blkdev(0, "mdp", md_probe);
10039 	if (ret < 0)
10040 		goto err_mdp;
10041 	mdp_major = ret;
10042 
10043 	register_reboot_notifier(&md_notifier);
10044 	raid_table_header = register_sysctl("dev/raid", raid_table);
10045 
10046 	md_geninit();
10047 	return 0;
10048 
10049 err_mdp:
10050 	unregister_blkdev(MD_MAJOR, "md");
10051 err_md:
10052 	destroy_workqueue(md_bitmap_wq);
10053 err_bitmap_wq:
10054 	destroy_workqueue(md_misc_wq);
10055 err_misc_wq:
10056 	destroy_workqueue(md_wq);
10057 err_wq:
10058 	return ret;
10059 }
10060 
check_sb_changes(struct mddev * mddev,struct md_rdev * rdev)10061 static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
10062 {
10063 	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
10064 	struct md_rdev *rdev2, *tmp;
10065 	int role, ret;
10066 
10067 	/*
10068 	 * If size is changed in another node then we need to
10069 	 * do resize as well.
10070 	 */
10071 	if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
10072 		ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
10073 		if (ret)
10074 			pr_info("md-cluster: resize failed\n");
10075 		else
10076 			mddev->bitmap_ops->update_sb(mddev->bitmap);
10077 	}
10078 
10079 	/* Check for change of roles in the active devices */
10080 	rdev_for_each_safe(rdev2, tmp, mddev) {
10081 		if (test_bit(Faulty, &rdev2->flags))
10082 			continue;
10083 
10084 		/* Check if the roles changed */
10085 		role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);
10086 
10087 		if (test_bit(Candidate, &rdev2->flags)) {
10088 			if (role == MD_DISK_ROLE_FAULTY) {
10089 				pr_info("md: Removing Candidate device %pg because add failed\n",
10090 					rdev2->bdev);
10091 				md_kick_rdev_from_array(rdev2);
10092 				continue;
10093 			}
10094 			else
10095 				clear_bit(Candidate, &rdev2->flags);
10096 		}
10097 
10098 		if (role != rdev2->raid_disk) {
10099 			/*
10100 			 * got activated except reshape is happening.
10101 			 */
10102 			if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE &&
10103 			    !(le32_to_cpu(sb->feature_map) &
10104 			      MD_FEATURE_RESHAPE_ACTIVE) &&
10105 			    !mddev->cluster_ops->resync_status_get(mddev)) {
10106 				/*
10107 				 * -1 to make raid1_add_disk() set conf->fullsync
10108 				 * to 1. This could avoid skipping sync when the
10109 				 * remote node is down during resyncing.
10110 				 */
10111 				if ((le32_to_cpu(sb->feature_map)
10112 				    & MD_FEATURE_RECOVERY_OFFSET))
10113 					rdev2->saved_raid_disk = -1;
10114 				else
10115 					rdev2->saved_raid_disk = role;
10116 				ret = remove_and_add_spares(mddev, rdev2);
10117 				pr_info("Activated spare: %pg\n",
10118 					rdev2->bdev);
10119 				/* wakeup mddev->thread here, so array could
10120 				 * perform resync with the new activated disk */
10121 				set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
10122 				md_wakeup_thread(mddev->thread);
10123 			}
10124 			/* device faulty
10125 			 * We just want to do the minimum to mark the disk
10126 			 * as faulty. The recovery is performed by the
10127 			 * one who initiated the error.
10128 			 */
10129 			if (role == MD_DISK_ROLE_FAULTY ||
10130 			    role == MD_DISK_ROLE_JOURNAL) {
10131 				md_error(mddev, rdev2);
10132 				clear_bit(Blocked, &rdev2->flags);
10133 			}
10134 		}
10135 	}
10136 
10137 	if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) {
10138 		ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
10139 		if (ret)
10140 			pr_warn("md: updating array disks failed. %d\n", ret);
10141 	}
10142 
10143 	/*
10144 	 * Since mddev->delta_disks has already updated in update_raid_disks,
10145 	 * so it is time to check reshape.
10146 	 */
10147 	if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
10148 	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
10149 		/*
10150 		 * reshape is happening in the remote node, we need to
10151 		 * update reshape_position and call start_reshape.
10152 		 */
10153 		mddev->reshape_position = le64_to_cpu(sb->reshape_position);
10154 		if (mddev->pers->update_reshape_pos)
10155 			mddev->pers->update_reshape_pos(mddev);
10156 		if (mddev->pers->start_reshape)
10157 			mddev->pers->start_reshape(mddev);
10158 	} else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
10159 		   mddev->reshape_position != MaxSector &&
10160 		   !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
10161 		/* reshape is just done in another node. */
10162 		mddev->reshape_position = MaxSector;
10163 		if (mddev->pers->update_reshape_pos)
10164 			mddev->pers->update_reshape_pos(mddev);
10165 	}
10166 
10167 	/* Finally set the event to be up to date */
10168 	mddev->events = le64_to_cpu(sb->events);
10169 }
10170 
read_rdev(struct mddev * mddev,struct md_rdev * rdev)10171 static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
10172 {
10173 	int err;
10174 	struct page *swapout = rdev->sb_page;
10175 	struct mdp_superblock_1 *sb;
10176 
10177 	/* Store the sb page of the rdev in the swapout temporary
10178 	 * variable in case we err in the future
10179 	 */
10180 	rdev->sb_page = NULL;
10181 	err = alloc_disk_sb(rdev);
10182 	if (err == 0) {
10183 		ClearPageUptodate(rdev->sb_page);
10184 		rdev->sb_loaded = 0;
10185 		err = super_types[mddev->major_version].
10186 			load_super(rdev, NULL, mddev->minor_version);
10187 	}
10188 	if (err < 0) {
10189 		pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
10190 				__func__, __LINE__, rdev->desc_nr, err);
10191 		if (rdev->sb_page)
10192 			put_page(rdev->sb_page);
10193 		rdev->sb_page = swapout;
10194 		rdev->sb_loaded = 1;
10195 		return err;
10196 	}
10197 
10198 	sb = page_address(rdev->sb_page);
10199 	/* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET
10200 	 * is not set
10201 	 */
10202 
10203 	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
10204 		rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
10205 
10206 	/* The other node finished recovery, call spare_active to set
10207 	 * device In_sync and mddev->degraded
10208 	 */
10209 	if (rdev->recovery_offset == MaxSector &&
10210 	    !test_bit(In_sync, &rdev->flags) &&
10211 	    mddev->pers->spare_active(mddev))
10212 		sysfs_notify_dirent_safe(mddev->sysfs_degraded);
10213 
10214 	put_page(swapout);
10215 	return 0;
10216 }
10217 
md_reload_sb(struct mddev * mddev,int nr)10218 void md_reload_sb(struct mddev *mddev, int nr)
10219 {
10220 	struct md_rdev *rdev = NULL, *iter;
10221 	int err;
10222 
10223 	/* Find the rdev */
10224 	rdev_for_each_rcu(iter, mddev) {
10225 		if (iter->desc_nr == nr) {
10226 			rdev = iter;
10227 			break;
10228 		}
10229 	}
10230 
10231 	if (!rdev) {
10232 		pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
10233 		return;
10234 	}
10235 
10236 	err = read_rdev(mddev, rdev);
10237 	if (err < 0)
10238 		return;
10239 
10240 	check_sb_changes(mddev, rdev);
10241 
10242 	/* Read all rdev's to update recovery_offset */
10243 	rdev_for_each_rcu(rdev, mddev) {
10244 		if (!test_bit(Faulty, &rdev->flags))
10245 			read_rdev(mddev, rdev);
10246 	}
10247 }
10248 EXPORT_SYMBOL(md_reload_sb);
10249 
10250 #ifndef MODULE
10251 
10252 /*
10253  * Searches all registered partitions for autorun RAID arrays
10254  * at boot time.
10255  */
10256 
10257 static DEFINE_MUTEX(detected_devices_mutex);
10258 static LIST_HEAD(all_detected_devices);
10259 struct detected_devices_node {
10260 	struct list_head list;
10261 	dev_t dev;
10262 };
10263 
md_autodetect_dev(dev_t dev)10264 void md_autodetect_dev(dev_t dev)
10265 {
10266 	struct detected_devices_node *node_detected_dev;
10267 
10268 	node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
10269 	if (node_detected_dev) {
10270 		node_detected_dev->dev = dev;
10271 		mutex_lock(&detected_devices_mutex);
10272 		list_add_tail(&node_detected_dev->list, &all_detected_devices);
10273 		mutex_unlock(&detected_devices_mutex);
10274 	}
10275 }
10276 
md_autostart_arrays(int part)10277 void md_autostart_arrays(int part)
10278 {
10279 	struct md_rdev *rdev;
10280 	struct detected_devices_node *node_detected_dev;
10281 	dev_t dev;
10282 	int i_scanned, i_passed;
10283 
10284 	i_scanned = 0;
10285 	i_passed = 0;
10286 
10287 	pr_info("md: Autodetecting RAID arrays.\n");
10288 
10289 	mutex_lock(&detected_devices_mutex);
10290 	while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
10291 		i_scanned++;
10292 		node_detected_dev = list_entry(all_detected_devices.next,
10293 					struct detected_devices_node, list);
10294 		list_del(&node_detected_dev->list);
10295 		dev = node_detected_dev->dev;
10296 		kfree(node_detected_dev);
10297 		mutex_unlock(&detected_devices_mutex);
10298 		rdev = md_import_device(dev,0, 90);
10299 		mutex_lock(&detected_devices_mutex);
10300 		if (IS_ERR(rdev))
10301 			continue;
10302 
10303 		if (test_bit(Faulty, &rdev->flags))
10304 			continue;
10305 
10306 		set_bit(AutoDetected, &rdev->flags);
10307 		list_add(&rdev->same_set, &pending_raid_disks);
10308 		i_passed++;
10309 	}
10310 	mutex_unlock(&detected_devices_mutex);
10311 
10312 	pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed);
10313 
10314 	autorun_devices(part);
10315 }
10316 
10317 #endif /* !MODULE */
10318 
md_exit(void)10319 static __exit void md_exit(void)
10320 {
10321 	struct mddev *mddev;
10322 	int delay = 1;
10323 
10324 	unregister_blkdev(MD_MAJOR,"md");
10325 	unregister_blkdev(mdp_major, "mdp");
10326 	unregister_reboot_notifier(&md_notifier);
10327 	unregister_sysctl_table(raid_table_header);
10328 
10329 	/* We cannot unload the modules while some process is
10330 	 * waiting for us in select() or poll() - wake them up
10331 	 */
10332 	md_unloading = 1;
10333 	while (waitqueue_active(&md_event_waiters)) {
10334 		/* not safe to leave yet */
10335 		wake_up(&md_event_waiters);
10336 		msleep(delay);
10337 		delay += delay;
10338 	}
10339 	remove_proc_entry("mdstat", NULL);
10340 
10341 	spin_lock(&all_mddevs_lock);
10342 	list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
10343 		if (!mddev_get(mddev))
10344 			continue;
10345 		spin_unlock(&all_mddevs_lock);
10346 		export_array(mddev);
10347 		mddev->ctime = 0;
10348 		mddev->hold_active = 0;
10349 		/*
10350 		 * As the mddev is now fully clear, mddev_put will schedule
10351 		 * the mddev for destruction by a workqueue, and the
10352 		 * destroy_workqueue() below will wait for that to complete.
10353 		 */
10354 		spin_lock(&all_mddevs_lock);
10355 		mddev_put_locked(mddev);
10356 	}
10357 	spin_unlock(&all_mddevs_lock);
10358 
10359 	destroy_workqueue(md_misc_wq);
10360 	destroy_workqueue(md_bitmap_wq);
10361 	destroy_workqueue(md_wq);
10362 }
10363 
10364 subsys_initcall(md_init);
module_exit(md_exit)10365 module_exit(md_exit)
10366 
10367 static int get_ro(char *buffer, const struct kernel_param *kp)
10368 {
10369 	return sprintf(buffer, "%d\n", start_readonly);
10370 }
set_ro(const char * val,const struct kernel_param * kp)10371 static int set_ro(const char *val, const struct kernel_param *kp)
10372 {
10373 	return kstrtouint(val, 10, (unsigned int *)&start_readonly);
10374 }
10375 
10376 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
10377 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
10378 module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
10379 module_param(create_on_open, bool, S_IRUSR|S_IWUSR);
10380 
10381 MODULE_LICENSE("GPL");
10382 MODULE_DESCRIPTION("MD RAID framework");
10383 MODULE_ALIAS("md");
10384 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
10385