xref: /linux/drivers/md/md.c (revision aa23aa55166c2865ac430168c4b9d405cf8c6980)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3    md.c : Multiple Devices driver for Linux
4      Copyright (C) 1998, 1999, 2000 Ingo Molnar
5 
6      completely rewritten, based on the MD driver code from Marc Zyngier
7 
8    Changes:
9 
10    - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
11    - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
12    - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
13    - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
14    - kmod support by: Cyrus Durgin
15    - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
16    - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
17 
18    - lots of fixes and improvements to the RAID1/RAID5 and generic
19      RAID code (such as request based resynchronization):
20 
21      Neil Brown <neilb@cse.unsw.edu.au>.
22 
23    - persistent bitmap code
24      Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
25 
26 
27    Errors, Warnings, etc.
28    Please use:
29      pr_crit() for error conditions that risk data loss
30      pr_err() for error conditions that are unexpected, like an IO error
31          or internal inconsistency
     pr_warn() for error conditions that could have been predicted, like
         adding a device to an array when it has incompatible metadata
     pr_info() for very interesting, very rare events, like an array starting
         or stopping, or resync starting or stopping
36      pr_debug() for everything else.
37 
38 */
39 
40 #include <linux/sched/signal.h>
41 #include <linux/kthread.h>
42 #include <linux/blkdev.h>
43 #include <linux/badblocks.h>
44 #include <linux/sysctl.h>
45 #include <linux/seq_file.h>
46 #include <linux/fs.h>
47 #include <linux/poll.h>
48 #include <linux/ctype.h>
49 #include <linux/string.h>
50 #include <linux/hdreg.h>
51 #include <linux/proc_fs.h>
52 #include <linux/random.h>
53 #include <linux/module.h>
54 #include <linux/reboot.h>
55 #include <linux/file.h>
56 #include <linux/compat.h>
57 #include <linux/delay.h>
58 #include <linux/raid/md_p.h>
59 #include <linux/raid/md_u.h>
60 #include <linux/slab.h>
61 #include <linux/percpu-refcount.h>
62 
63 #include <trace/events/block.h>
64 #include "md.h"
65 #include "md-bitmap.h"
66 #include "md-cluster.h"
67 
68 #ifndef MODULE
69 static void autostart_arrays(int part);
70 #endif
71 
72 /* pers_list is a list of registered personalities protected
73  * by pers_lock.
74  * pers_lock does extra service to protect accesses to
75  * mddev->thread when the mutex cannot be held.
76  */
77 static LIST_HEAD(pers_list);
78 static DEFINE_SPINLOCK(pers_lock);
79 
/* kobject type handed to kobject_init() for every mddev (see mddev_init()). */
static struct kobj_type md_ktype;

/* Cluster operations pointer, exported for use outside this file. */
struct md_cluster_operations *md_cluster_ops;
EXPORT_SYMBOL(md_cluster_ops);
static struct module *md_cluster_mod;

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
/* Workqueue on which mddev->flush_work is queued by the flush handling. */
static struct workqueue_struct *md_wq;
/* Workqueue on which mddev->del_work is queued by mddev_put(). */
static struct workqueue_struct *md_misc_wq;
89 
90 static int remove_and_add_spares(struct mddev *mddev,
91 				 struct md_rdev *this);
92 static void mddev_detach(struct mddev *mddev);
93 
94 /*
95  * Default number of read corrections we'll attempt on an rdev
96  * before ejecting it from the array. We divide the read error
97  * count by 2 for every hour elapsed between read errors.
98  */
99 #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
100 /*
101  * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
102  * is 1000 KB/sec, so the extra system load does not show up that much.
103  * Increase it if you want to have more _guaranteed_ speed. Note that
104  * the RAID driver will use the maximum available bandwidth if the IO
105  * subsystem is idle. There is also an 'absolute maximum' reconstruction
106  * speed limit - in case reconstruction slows down your system despite
107  * idle IO detection.
108  *
109  * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
110  * or /sys/block/mdX/md/sync_speed_{min,max}
111  */
112 
/* System-wide defaults; speed_min()/speed_max() fall back to these when the
 * array's own sync_speed_min/sync_speed_max is zero.
 */
static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
115 static inline int speed_min(struct mddev *mddev)
116 {
117 	return mddev->sync_speed_min ?
118 		mddev->sync_speed_min : sysctl_speed_limit_min;
119 }
120 
121 static inline int speed_max(struct mddev *mddev)
122 {
123 	return mddev->sync_speed_max ?
124 		mddev->sync_speed_max : sysctl_speed_limit_max;
125 }
126 
127 static struct ctl_table_header *raid_table_header;
128 
129 static struct ctl_table raid_table[] = {
130 	{
131 		.procname	= "speed_limit_min",
132 		.data		= &sysctl_speed_limit_min,
133 		.maxlen		= sizeof(int),
134 		.mode		= S_IRUGO|S_IWUSR,
135 		.proc_handler	= proc_dointvec,
136 	},
137 	{
138 		.procname	= "speed_limit_max",
139 		.data		= &sysctl_speed_limit_max,
140 		.maxlen		= sizeof(int),
141 		.mode		= S_IRUGO|S_IWUSR,
142 		.proc_handler	= proc_dointvec,
143 	},
144 	{ }
145 };
146 
147 static struct ctl_table raid_dir_table[] = {
148 	{
149 		.procname	= "raid",
150 		.maxlen		= 0,
151 		.mode		= S_IRUGO|S_IXUGO,
152 		.child		= raid_table,
153 	},
154 	{ }
155 };
156 
/* Top-level "dev" sysctl directory; its only child is raid_dir_table. */
static struct ctl_table raid_root_table[] = {
	{
		.procname	= "dev",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= raid_dir_table,
	},
	{  }
};
166 
167 static const struct block_device_operations md_fops;
168 
169 static int start_readonly;
170 
171 /*
172  * The original mechanism for creating an md device is to create
173  * a device node in /dev and to open it.  This causes races with device-close.
174  * The preferred method is to write to the "new_array" module parameter.
175  * This can avoid races.
176  * Setting create_on_open to false disables the original mechanism
177  * so all the races disappear.
178  */
179 static bool create_on_open = true;
180 
181 struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
182 			    struct mddev *mddev)
183 {
184 	if (!mddev || !bioset_initialized(&mddev->bio_set))
185 		return bio_alloc(gfp_mask, nr_iovecs);
186 
187 	return bio_alloc_bioset(gfp_mask, nr_iovecs, &mddev->bio_set);
188 }
189 EXPORT_SYMBOL_GPL(bio_alloc_mddev);
190 
191 static struct bio *md_bio_alloc_sync(struct mddev *mddev)
192 {
193 	if (!mddev || !bioset_initialized(&mddev->sync_set))
194 		return bio_alloc(GFP_NOIO, 1);
195 
196 	return bio_alloc_bioset(GFP_NOIO, 1, &mddev->sync_set);
197 }
198 
199 /*
200  * We have a system wide 'event count' that is incremented
201  * on any 'interesting' event, and readers of /proc/mdstat
202  * can use 'poll' or 'select' to find out when the event
203  * count increases.
204  *
205  * Events are:
206  *  start array, stop array, error, add device, remove device,
207  *  start build, activate spare
208  */
209 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
210 static atomic_t md_event_count;
/*
 * Record an 'interesting' event: bump the system-wide event count and wake
 * anyone sleeping on md_event_waiters.  @mddev is not used by this
 * implementation.
 */
void md_new_event(struct mddev *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}
216 EXPORT_SYMBOL_GPL(md_new_event);
217 
218 /*
219  * Enables to iterate over all existing md arrays
220  * all_mddevs_lock protects this list.
221  */
222 static LIST_HEAD(all_mddevs);
223 static DEFINE_SPINLOCK(all_mddevs_lock);
224 
/*
 * Iterate over all mddevs on the all_mddevs list.
 *
 * all_mddevs_lock is held only while stepping from one list entry to the
 * next; the loop body runs with the lock dropped and with a reference
 * (mddev_get) held on the current mddev.  The reference on the previous
 * mddev is released (mddev_put) once the next one has been pinned.
 *
 * Any code which breaks out of this loop still owns a reference to the
 * current mddev and must mddev_put() it.
 */
#define for_each_mddev(_mddev,_tmp)					\
									\
	for (({ spin_lock(&all_mddevs_lock);				\
		_tmp = all_mddevs.next;					\
		_mddev = NULL;});					\
	     ({ if (_tmp != &all_mddevs)				\
			mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
		spin_unlock(&all_mddevs_lock);				\
		if (_mddev) mddev_put(_mddev);				\
		_mddev = list_entry(_tmp, struct mddev, all_mddevs);	\
		_tmp != &all_mddevs;});					\
	     ({ spin_lock(&all_mddevs_lock);				\
		_tmp = _tmp->next;})					\
		)
246 
247 /* Rather than calling directly into the personality make_request function,
248  * IO requests come here first so that we can check if the device is
249  * being suspended pending a reconfiguration.
250  * We hold a refcount over the call to ->make_request.  By the time that
251  * call has finished, the bio has been linked into some internal structure
252  * and so is visible to ->quiesce(), so we don't need the refcount any more.
253  */
254 static bool is_suspended(struct mddev *mddev, struct bio *bio)
255 {
256 	if (mddev->suspended)
257 		return true;
258 	if (bio_data_dir(bio) != WRITE)
259 		return false;
260 	if (mddev->suspend_lo >= mddev->suspend_hi)
261 		return false;
262 	if (bio->bi_iter.bi_sector >= mddev->suspend_hi)
263 		return false;
264 	if (bio_end_sector(bio) < mddev->suspend_lo)
265 		return false;
266 	return true;
267 }
268 
/*
 * Hand @bio to the personality's ->make_request(), first sleeping on
 * sb_wait for as long as is_suspended() says the request must be held back.
 * ->active_io is raised across the ->make_request() call.  If the
 * personality returns false, the count is dropped, waiters are woken and
 * the whole suspend check is repeated before the bio is submitted again.
 */
void md_handle_request(struct mddev *mddev, struct bio *bio)
{
check_suspended:
	rcu_read_lock();
	if (is_suspended(mddev, bio)) {
		DEFINE_WAIT(__wait);
		for (;;) {
			prepare_to_wait(&mddev->sb_wait, &__wait,
					TASK_UNINTERRUPTIBLE);
			if (!is_suspended(mddev, bio))
				break;
			/* drop the RCU read lock before sleeping */
			rcu_read_unlock();
			schedule();
			rcu_read_lock();
		}
		finish_wait(&mddev->sb_wait, &__wait);
	}
	/* counted before leaving the RCU section the suspend check ran in */
	atomic_inc(&mddev->active_io);
	rcu_read_unlock();

	if (!mddev->pers->make_request(mddev, bio)) {
		atomic_dec(&mddev->active_io);
		wake_up(&mddev->sb_wait);
		goto check_suspended;
	}

	/* last active request gone while suspended: wake whoever waits on sb_wait */
	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
		wake_up(&mddev->sb_wait);
}
298 EXPORT_SYMBOL(md_handle_request);
299 
/*
 * Request-queue entry point for an md device.
 *
 * Splits the bio as required by the queue, fails it when the array has no
 * personality, rejects writes when mddev->ro == 1, and otherwise passes it
 * to md_handle_request() and accounts it in the disk statistics.
 * Always returns BLK_QC_T_NONE.
 */
static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
{
	const int rw = bio_data_dir(bio);
	const int sgrp = op_stat_group(bio_op(bio));
	struct mddev *mddev = q->queuedata;
	unsigned int sectors;

	blk_queue_split(q, &bio);

	if (mddev == NULL || mddev->pers == NULL) {
		bio_io_error(bio);
		return BLK_QC_T_NONE;
	}
	if (mddev->ro == 1 && unlikely(rw == WRITE)) {
		/* a zero-length write is completed without an error status */
		if (bio_sectors(bio) != 0)
			bio->bi_status = BLK_STS_IOERR;
		bio_endio(bio);
		return BLK_QC_T_NONE;
	}

	/*
	 * save the sectors now since our bio can
	 * go away inside make_request
	 */
	sectors = bio_sectors(bio);
	/* bio could be mergeable after passing to underlayer */
	bio->bi_opf &= ~REQ_NOMERGE;

	md_handle_request(mddev, bio);

	/* account the request using the values captured before submission */
	part_stat_lock();
	part_stat_inc(&mddev->gendisk->part0, ios[sgrp]);
	part_stat_add(&mddev->gendisk->part0, sectors[sgrp], sectors);
	part_stat_unlock();

	return BLK_QC_T_NONE;
}
337 
338 /* mddev_suspend makes sure no new requests are submitted
339  * to the device, and that any requests that have been submitted
340  * are completely handled.
341  * Once mddev_detach() is called and completes, the module will be
342  * completely unused.
343  */
/*
 * Suspend @mddev.  Calls nest: ->suspended is a counter and only the first
 * caller performs the quiesce below.  The caller must hold reconfig_mutex.
 */
void mddev_suspend(struct mddev *mddev)
{
	/* warn if called from the array's own md thread */
	WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk);
	lockdep_assert_held(&mddev->reconfig_mutex);
	if (mddev->suspended++)
		return;
	/*
	 * md_handle_request() tests ->suspended and raises ->active_io inside
	 * one RCU read-side section; presumably this grace period guarantees
	 * every such section now sees the new ->suspended value.
	 */
	synchronize_rcu();
	wake_up(&mddev->sb_wait);
	set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags);
	smp_mb__after_atomic();
	/* wait for all requests already inside ->make_request() to drain */
	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
	mddev->pers->quiesce(mddev, 1);
	clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
	wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));

	del_timer_sync(&mddev->safemode_timer);
}
361 EXPORT_SYMBOL_GPL(mddev_suspend);
362 
/*
 * Undo one mddev_suspend().  Only when the nesting count drops to zero are
 * waiters on sb_wait woken, the personality un-quiesced and the md threads
 * kicked.  The caller must hold reconfig_mutex.
 */
void mddev_resume(struct mddev *mddev)
{
	lockdep_assert_held(&mddev->reconfig_mutex);
	if (--mddev->suspended)
		return;
	wake_up(&mddev->sb_wait);
	mddev->pers->quiesce(mddev, 0);

	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
}
375 EXPORT_SYMBOL_GPL(mddev_resume);
376 
377 int mddev_congested(struct mddev *mddev, int bits)
378 {
379 	struct md_personality *pers = mddev->pers;
380 	int ret = 0;
381 
382 	rcu_read_lock();
383 	if (mddev->suspended)
384 		ret = 1;
385 	else if (pers && pers->congested)
386 		ret = pers->congested(mddev, bits);
387 	rcu_read_unlock();
388 	return ret;
389 }
390 EXPORT_SYMBOL_GPL(mddev_congested);
/* Callback form of mddev_congested(); @data is the struct mddev. */
static int md_congested(void *data, int bits)
{
	return mddev_congested((struct mddev *)data, bits);
}
396 
397 /*
398  * Generic flush handling for md
399  */
400 
/*
 * Completion handler for the per-rdev flush bios built in submit_flushes().
 * Drops the rdev reference taken for the bio and, when this was the last
 * outstanding flush, queues mddev->flush_work on md_wq.
 */
static void md_end_flush(struct bio *bio)
{
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	rdev_dec_pending(rdev, mddev);

	if (atomic_dec_and_test(&mddev->flush_pending)) {
		/* The pre-request flush has finished */
		queue_work(md_wq, &mddev->flush_work);
	}
	bio_put(bio);
}
414 
415 static void md_submit_flush_data(struct work_struct *ws);
416 
/*
 * Work function: send an empty REQ_PREFLUSH write to every rdev that has a
 * raid_disk slot and is not Faulty.
 *
 * flush_work is re-armed to run md_submit_flush_data() next.  flush_pending
 * starts at 1 so that bio completions cannot bring it to zero before all
 * bios are submitted; the final decrement at the bottom (or the last
 * md_end_flush()) queues that follow-up work.
 */
static void submit_flushes(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct md_rdev *rdev;

	/* remember when this flush began; copied to last_flush on completion */
	mddev->start_flush = ktime_get_boottime();
	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
	atomic_set(&mddev->flush_pending, 1);
	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev)
		if (rdev->raid_disk >= 0 &&
		    !test_bit(Faulty, &rdev->flags)) {
			/* Take two references, one is dropped
			 * when request finishes, one after
			 * we reclaim rcu_read_lock
			 */
			struct bio *bi;
			atomic_inc(&rdev->nr_pending);
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
			bi->bi_end_io = md_end_flush;
			bi->bi_private = rdev;
			bio_set_dev(bi, rdev->bdev);
			bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
			atomic_inc(&mddev->flush_pending);
			submit_bio(bi);
			rcu_read_lock();
			rdev_dec_pending(rdev, mddev);
		}
	rcu_read_unlock();
	/* drop the initial bias taken by atomic_set() above */
	if (atomic_dec_and_test(&mddev->flush_pending))
		queue_work(md_wq, &mddev->flush_work);
}
451 
/*
 * Work function run once all per-rdev flushes have completed.  Publishes
 * the flush start time as last_flush, releases the flush_bio slot, and then
 * either completes the original bio (if it carried no data) or submits its
 * data with REQ_PREFLUSH cleared.
 */
static void md_submit_flush_data(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct bio *bio = mddev->flush_bio;

	/*
	 * must reset flush_bio before calling into md_handle_request to avoid a
	 * deadlock, because other bios passed md_handle_request suspend check
	 * could wait for this and below md_handle_request could wait for those
	 * bios because of suspend check
	 */
	mddev->last_flush = mddev->start_flush;
	mddev->flush_bio = NULL;
	wake_up(&mddev->sb_wait);

	if (bio->bi_iter.bi_size == 0) {
		/* an empty barrier - all done */
		bio_endio(bio);
	} else {
		bio->bi_opf &= ~REQ_PREFLUSH;
		md_handle_request(mddev, bio);
	}
}
475 
/*
 * Handle a flush request @bio for @mddev.
 *
 * Waits until no other flush is in flight (flush_bio == NULL) or until a
 * flush that started after this call has completed.  In the first case this
 * bio becomes flush_bio and submit_flushes() is queued.  In the second case
 * no new flush is issued: an empty bio is completed at once, and a bio with
 * data is passed to the personality with REQ_PREFLUSH cleared.
 */
void md_flush_request(struct mddev *mddev, struct bio *bio)
{
	ktime_t start = ktime_get_boottime();
	spin_lock_irq(&mddev->lock);
	wait_event_lock_irq(mddev->sb_wait,
			    !mddev->flush_bio ||
			    ktime_after(mddev->last_flush, start),
			    mddev->lock);
	if (!ktime_after(mddev->last_flush, start)) {
		WARN_ON(mddev->flush_bio);
		mddev->flush_bio = bio;
		/* NULL marks "we own the flush" for the branch below */
		bio = NULL;
	}
	spin_unlock_irq(&mddev->lock);

	if (!bio) {
		INIT_WORK(&mddev->flush_work, submit_flushes);
		queue_work(md_wq, &mddev->flush_work);
	} else {
		/* flush was performed for some other bio while we waited. */
		if (bio->bi_iter.bi_size == 0)
			/* an empty barrier - all done */
			bio_endio(bio);
		else {
			bio->bi_opf &= ~REQ_PREFLUSH;
			mddev->pers->make_request(mddev, bio);
		}
	}
}
505 EXPORT_SYMBOL(md_flush_request);
506 
/* Take a reference on @mddev (->active) and return it for convenience. */
static inline struct mddev *mddev_get(struct mddev *mddev)
{
	atomic_inc(&mddev->active);
	return mddev;
}
512 
513 static void mddev_delayed_delete(struct work_struct *ws);
514 
/*
 * Drop a reference on @mddev.  When the last reference goes away and the
 * array is completely unconfigured (no raid_disks, no member devices, zero
 * ctime) and not held active, it is unlinked from all_mddevs and its
 * deletion is deferred to md_misc_wq.
 */
static void mddev_put(struct mddev *mddev)
{
	/* returns with all_mddevs_lock held only if the count reached zero */
	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
		return;
	if (!mddev->raid_disks && list_empty(&mddev->disks) &&
	    mddev->ctime == 0 && !mddev->hold_active) {
		/* Array is not configured at all, and not held active,
		 * so destroy it */
		list_del_init(&mddev->all_mddevs);

		/*
		 * Call queue_work inside the spinlock so that
		 * flush_workqueue() after mddev_find will succeed in waiting
		 * for the work to be done.
		 */
		INIT_WORK(&mddev->del_work, mddev_delayed_delete);
		queue_work(md_misc_wq, &mddev->del_work);
	}
	spin_unlock(&all_mddevs_lock);
}
535 
536 static void md_safemode_timeout(struct timer_list *t);
537 
/*
 * Initialise the locks, lists, counters, wait queues, timer and default
 * field values of a freshly allocated @mddev.  The object starts with one
 * reference (->active == 1).
 */
void mddev_init(struct mddev *mddev)
{
	kobject_init(&mddev->kobj, &md_ktype);
	mutex_init(&mddev->open_mutex);
	mutex_init(&mddev->reconfig_mutex);
	mutex_init(&mddev->bitmap_info.mutex);
	INIT_LIST_HEAD(&mddev->disks);
	INIT_LIST_HEAD(&mddev->all_mddevs);
	timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
	atomic_set(&mddev->active, 1);
	atomic_set(&mddev->openers, 0);
	atomic_set(&mddev->active_io, 0);
	spin_lock_init(&mddev->lock);
	atomic_set(&mddev->flush_pending, 0);
	init_waitqueue_head(&mddev->sb_wait);
	init_waitqueue_head(&mddev->recovery_wait);
	mddev->reshape_position = MaxSector;
	mddev->reshape_backwards = 0;
	mddev->last_sync_action = "none";
	mddev->resync_min = 0;
	mddev->resync_max = MaxSector;
	mddev->level = LEVEL_NONE;
}
561 EXPORT_SYMBOL_GPL(mddev_init);
562 
/*
 * Look up the mddev for device number @unit, creating it if necessary.
 *
 * With a non-zero @unit: return the existing mddev (with an extra
 * reference) or a newly allocated one held active UNTIL_IOCTL.
 * With @unit == 0: allocate a new mddev and give it the first free minor
 * under MD_MAJOR, held active UNTIL_STOP.
 *
 * The allocation is done with the lock dropped, so the function runs twice:
 * the first pass finds nothing and allocates, then "retry" repeats the
 * search under the lock in case another caller inserted the unit meanwhile.
 *
 * Returns NULL on allocation failure or when no minor is free.
 */
static struct mddev *mddev_find(dev_t unit)
{
	struct mddev *mddev, *new = NULL;

	/* for majors other than MD_MAJOR, clear the low MdpMinorShift minor bits */
	if (unit && MAJOR(unit) != MD_MAJOR)
		unit &= ~((1<<MdpMinorShift)-1);

 retry:
	spin_lock(&all_mddevs_lock);

	if (unit) {
		list_for_each_entry(mddev, &all_mddevs, all_mddevs)
			if (mddev->unit == unit) {
				mddev_get(mddev);
				spin_unlock(&all_mddevs_lock);
				/* discard our speculative allocation, if any */
				kfree(new);
				return mddev;
			}

		if (new) {
			list_add(&new->all_mddevs, &all_mddevs);
			spin_unlock(&all_mddevs_lock);
			new->hold_active = UNTIL_IOCTL;
			return new;
		}
	} else if (new) {
		/* find an unused unit number */
		static int next_minor = 512;
		int start = next_minor;
		int is_free = 0;
		int dev = 0;
		while (!is_free) {
			dev = MKDEV(MD_MAJOR, next_minor);
			next_minor++;
			if (next_minor > MINORMASK)
				next_minor = 0;
			if (next_minor == start) {
				/* Oh dear, all in use. */
				spin_unlock(&all_mddevs_lock);
				kfree(new);
				return NULL;
			}

			is_free = 1;
			list_for_each_entry(mddev, &all_mddevs, all_mddevs)
				if (mddev->unit == dev) {
					is_free = 0;
					break;
				}
		}
		new->unit = dev;
		new->md_minor = MINOR(dev);
		new->hold_active = UNTIL_STOP;
		list_add(&new->all_mddevs, &all_mddevs);
		spin_unlock(&all_mddevs_lock);
		return new;
	}
	spin_unlock(&all_mddevs_lock);

	/* first pass only: allocate outside the spinlock, then search again */
	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return NULL;

	new->unit = unit;
	if (MAJOR(unit) == MD_MAJOR)
		new->md_minor = MINOR(unit);
	else
		new->md_minor = MINOR(unit) >> MdpMinorShift;

	mddev_init(new);

	goto retry;
}
636 
637 static struct attribute_group md_redundancy_group;
638 
/*
 * Release reconfig_mutex.  If a sysfs attribute group was scheduled for
 * removal (->to_remove), remove it after the mutex has been dropped.
 * Finally wake the array's md thread and anyone waiting on sb_wait.
 */
void mddev_unlock(struct mddev *mddev)
{
	if (mddev->to_remove) {
		/* These cannot be removed under reconfig_mutex as
		 * an access to the files will try to take reconfig_mutex
		 * while holding the file unremovable, which leads to
		 * a deadlock.
		 * So set sysfs_active while the removal is happening,
		 * and anything else which might set ->to_remove or
		 * otherwise change the sysfs namespace will fail with
		 * -EBUSY if sysfs_active is still set.
		 * We set sysfs_active under reconfig_mutex and elsewhere
		 * test it under the same mutex to ensure its correct value
		 * is seen.
		 */
		struct attribute_group *to_remove = mddev->to_remove;
		mddev->to_remove = NULL;
		mddev->sysfs_active = 1;
		mutex_unlock(&mddev->reconfig_mutex);

		if (mddev->kobj.sd) {
			if (to_remove != &md_redundancy_group)
				sysfs_remove_group(&mddev->kobj, to_remove);
			/* the redundancy group goes only when there is no
			 * personality, or one without ->sync_request
			 */
			if (mddev->pers == NULL ||
			    mddev->pers->sync_request == NULL) {
				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
				if (mddev->sysfs_action)
					sysfs_put(mddev->sysfs_action);
				mddev->sysfs_action = NULL;
			}
		}
		mddev->sysfs_active = 0;
	} else
		mutex_unlock(&mddev->reconfig_mutex);

	/* As we've dropped the mutex we need a spinlock to
	 * make sure the thread doesn't disappear
	 */
	spin_lock(&pers_lock);
	md_wakeup_thread(mddev->thread);
	wake_up(&mddev->sb_wait);
	spin_unlock(&pers_lock);
}
682 EXPORT_SYMBOL_GPL(mddev_unlock);
683 
684 struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
685 {
686 	struct md_rdev *rdev;
687 
688 	rdev_for_each_rcu(rdev, mddev)
689 		if (rdev->desc_nr == nr)
690 			return rdev;
691 
692 	return NULL;
693 }
694 EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);
695 
696 static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
697 {
698 	struct md_rdev *rdev;
699 
700 	rdev_for_each(rdev, mddev)
701 		if (rdev->bdev->bd_dev == dev)
702 			return rdev;
703 
704 	return NULL;
705 }
706 
707 struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev)
708 {
709 	struct md_rdev *rdev;
710 
711 	rdev_for_each_rcu(rdev, mddev)
712 		if (rdev->bdev->bd_dev == dev)
713 			return rdev;
714 
715 	return NULL;
716 }
717 EXPORT_SYMBOL_GPL(md_find_rdev_rcu);
718 
719 static struct md_personality *find_pers(int level, char *clevel)
720 {
721 	struct md_personality *pers;
722 	list_for_each_entry(pers, &pers_list, list) {
723 		if (level != LEVEL_NONE && pers->level == level)
724 			return pers;
725 		if (strcmp(pers->name, clevel)==0)
726 			return pers;
727 	}
728 	return NULL;
729 }
730 
731 /* return the offset of the super block in 512byte sectors */
732 static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
733 {
734 	sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
735 	return MD_NEW_SIZE_SECTORS(num_sectors);
736 }
737 
738 static int alloc_disk_sb(struct md_rdev *rdev)
739 {
740 	rdev->sb_page = alloc_page(GFP_KERNEL);
741 	if (!rdev->sb_page)
742 		return -ENOMEM;
743 	return 0;
744 }
745 
746 void md_rdev_clear(struct md_rdev *rdev)
747 {
748 	if (rdev->sb_page) {
749 		put_page(rdev->sb_page);
750 		rdev->sb_loaded = 0;
751 		rdev->sb_page = NULL;
752 		rdev->sb_start = 0;
753 		rdev->sectors = 0;
754 	}
755 	if (rdev->bb_page) {
756 		put_page(rdev->bb_page);
757 		rdev->bb_page = NULL;
758 	}
759 	badblocks_exit(&rdev->badblocks);
760 }
761 EXPORT_SYMBOL_GPL(md_rdev_clear);
762 
/*
 * Completion handler for superblock writes issued by md_super_write().
 *
 * On error the device is reported via md_error().  If it was a failfast
 * write and the device did not become Faulty, a rewrite is requested
 * (MD_SB_NEED_REWRITE) and LastDev is set on the rdev.
 * On success LastDev is cleared.  In all cases pending_writes is
 * decremented, waking sb_wait when it reaches zero.
 */
static void super_written(struct bio *bio)
{
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	if (bio->bi_status) {
		pr_err("md: super_written gets error=%d\n", bio->bi_status);
		md_error(mddev, rdev);
		if (!test_bit(Faulty, &rdev->flags)
		    && (bio->bi_opf & MD_FAILFAST)) {
			set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
			set_bit(LastDev, &rdev->flags);
		}
	} else
		clear_bit(LastDev, &rdev->flags);

	if (atomic_dec_and_test(&mddev->pending_writes))
		wake_up(&mddev->sb_wait);
	/* drop the nr_pending reference taken in md_super_write() */
	rdev_dec_pending(rdev, mddev);
	bio_put(bio);
}
784 
/*
 * Asynchronously write a superblock page to a member device.
 * Silently does nothing when @page is NULL or @rdev is Faulty.
 * Use md_super_wait() to wait for completion.
 */
void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
		   sector_t sector, int size, struct page *page)
{
	/* write first size bytes of page to sector of rdev
	 * Increment mddev->pending_writes before returning
	 * and decrement it on completion, waking up sb_wait
	 * if zero is reached.
	 * If an error occurred, call md_error
	 */
	struct bio *bio;
	int ff = 0;

	if (!page)
		return;

	if (test_bit(Faulty, &rdev->flags))
		return;

	bio = md_bio_alloc_sync(mddev);

	/* reference is dropped again in super_written() */
	atomic_inc(&rdev->nr_pending);

	/* a separate metadata device, when present, takes the write */
	bio_set_dev(bio, rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev);
	bio->bi_iter.bi_sector = sector;
	bio_add_page(bio, page, size, 0);
	bio->bi_private = rdev;
	bio->bi_end_io = super_written;

	/* failfast only if the array and rdev allow it and LastDev is clear */
	if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
	    test_bit(FailFast, &rdev->flags) &&
	    !test_bit(LastDev, &rdev->flags))
		ff = MD_FAILFAST;
	bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA | ff;

	atomic_inc(&mddev->pending_writes);
	submit_bio(bio);
}
822 
823 int md_super_wait(struct mddev *mddev)
824 {
825 	/* wait for all superblock writes that were scheduled to complete */
826 	wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
827 	if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags))
828 		return -EAGAIN;
829 	return 0;
830 }
831 
/*
 * Synchronously transfer @size bytes between @page and @rdev.
 *
 * @sector is relative to the superblock start for metadata operations, and
 * otherwise to the device's data offset (the new data offset is used while
 * a reshape is in progress and @sector lies on the already-reshaped side).
 * Metadata operations go to rdev->meta_bdev when one is present.
 *
 * Returns 1 on success and 0 on I/O error.
 */
int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
		 struct page *page, int op, int op_flags, bool metadata_op)
{
	struct bio *bio = md_bio_alloc_sync(rdev->mddev);
	int ret;

	if (metadata_op && rdev->meta_bdev)
		bio_set_dev(bio, rdev->meta_bdev);
	else
		bio_set_dev(bio, rdev->bdev);
	bio_set_op_attrs(bio, op, op_flags);
	if (metadata_op)
		bio->bi_iter.bi_sector = sector + rdev->sb_start;
	else if (rdev->mddev->reshape_position != MaxSector &&
		 (rdev->mddev->reshape_backwards ==
		  (sector >= rdev->mddev->reshape_position)))
		bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
	else
		bio->bi_iter.bi_sector = sector + rdev->data_offset;
	bio_add_page(bio, page, size, 0);

	submit_bio_wait(bio);

	/* bi_status is zero on success, so invert it into a boolean */
	ret = !bio->bi_status;
	bio_put(bio);
	return ret;
}
859 EXPORT_SYMBOL_GPL(sync_page_io);
860 
861 static int read_disk_sb(struct md_rdev *rdev, int size)
862 {
863 	char b[BDEVNAME_SIZE];
864 
865 	if (rdev->sb_loaded)
866 		return 0;
867 
868 	if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true))
869 		goto fail;
870 	rdev->sb_loaded = 1;
871 	return 0;
872 
873 fail:
874 	pr_err("md: disabled device %s, could not read superblock.\n",
875 	       bdevname(rdev->bdev,b));
876 	return -EINVAL;
877 }
878 
879 static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
880 {
881 	return	sb1->set_uuid0 == sb2->set_uuid0 &&
882 		sb1->set_uuid1 == sb2->set_uuid1 &&
883 		sb1->set_uuid2 == sb2->set_uuid2 &&
884 		sb1->set_uuid3 == sb2->set_uuid3;
885 }
886 
887 static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
888 {
889 	int ret;
890 	mdp_super_t *tmp1, *tmp2;
891 
892 	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
893 	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
894 
895 	if (!tmp1 || !tmp2) {
896 		ret = 0;
897 		goto abort;
898 	}
899 
900 	*tmp1 = *sb1;
901 	*tmp2 = *sb2;
902 
903 	/*
904 	 * nr_disks is not constant
905 	 */
906 	tmp1->nr_disks = 0;
907 	tmp2->nr_disks = 0;
908 
909 	ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
910 abort:
911 	kfree(tmp1);
912 	kfree(tmp2);
913 	return ret;
914 }
915 
916 static u32 md_csum_fold(u32 csum)
917 {
918 	csum = (csum & 0xffff) + (csum >> 16);
919 	return (csum & 0xffff) + (csum >> 16);
920 }
921 
922 static unsigned int calc_sb_csum(mdp_super_t *sb)
923 {
924 	u64 newcsum = 0;
925 	u32 *sb32 = (u32*)sb;
926 	int i;
927 	unsigned int disk_csum, csum;
928 
929 	disk_csum = sb->sb_csum;
930 	sb->sb_csum = 0;
931 
932 	for (i = 0; i < MD_SB_BYTES/4 ; i++)
933 		newcsum += sb32[i];
934 	csum = (newcsum & 0xffffffff) + (newcsum>>32);
935 
936 #ifdef CONFIG_ALPHA
937 	/* This used to use csum_partial, which was wrong for several
938 	 * reasons including that different results are returned on
939 	 * different architectures.  It isn't critical that we get exactly
940 	 * the same return value as before (we always csum_fold before
941 	 * testing, and that removes any differences).  However as we
942 	 * know that csum_partial always returned a 16bit value on
943 	 * alphas, do a fold to maximise conformity to previous behaviour.
944 	 */
945 	sb->sb_csum = md_csum_fold(disk_csum);
946 #else
947 	sb->sb_csum = disk_csum;
948 #endif
949 	return csum;
950 }
951 
952 /*
953  * Handle superblock details.
954  * We want to be able to handle multiple superblock formats
955  * so we have a common interface to them all, and an array of
956  * different handlers.
957  * We rely on user-space to write the initial superblock, and support
958  * reading and updating of superblocks.
959  * Interface methods are:
960  *   int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
961  *      loads and validates a superblock on dev.
962  *      if refdev != NULL, compare superblocks on both devices
963  *    Return:
964  *      0 - dev has a superblock that is compatible with refdev
965  *      1 - dev has a superblock that is compatible and newer than refdev
966  *          so dev should be used as the refdev in future
967  *     -EINVAL superblock incompatible or invalid
968  *     -othererror e.g. -EIO
969  *
970  *   int validate_super(struct mddev *mddev, struct md_rdev *dev)
971  *      Verify that dev is acceptable into mddev.
972  *       The first time, mddev->raid_disks will be 0, and data from
973  *       dev should be merged in.  Subsequent calls check that dev
974  *       is new enough.  Return 0 or -EINVAL
975  *
976  *   void sync_super(struct mddev *mddev, struct md_rdev *dev)
977  *     Update the superblock for rdev with data in mddev
978  *     This does not write to disc.
979  *
980  */
981 
/* Method table for one superblock format. */
struct super_type  {
	char		    *name;
	struct module	    *owner;
	/* load and validate the superblock on rdev, optionally comparing
	 * it against refdev; returns 0, 1 or a negative errno
	 */
	int		    (*load_super)(struct md_rdev *rdev,
					  struct md_rdev *refdev,
					  int minor_version);
	/* verify that rdev is acceptable into mddev; 0 or -EINVAL */
	int		    (*validate_super)(struct mddev *mddev,
					      struct md_rdev *rdev);
	/* update rdev's in-memory superblock from mddev; no disk write */
	void		    (*sync_super)(struct mddev *mddev,
					  struct md_rdev *rdev);
	/* NOTE(review): presumably handles a change of rdev size to
	 * num_sectors - confirm against the implementations
	 */
	unsigned long long  (*rdev_size_change)(struct md_rdev *rdev,
						sector_t num_sectors);
	/* NOTE(review): presumably reports whether new_offset may be used
	 * as the data offset of rdev - confirm against the implementations
	 */
	int		    (*allow_new_offset)(struct md_rdev *rdev,
						unsigned long long new_offset);
};
997 
998 /*
999  * Check that the given mddev has no bitmap.
1000  *
1001  * This function is called from the run method of all personalities that do not
1002  * support bitmaps. It prints an error message and returns non-zero if mddev
1003  * has a bitmap. Otherwise, it returns 0.
1004  *
1005  */
1006 int md_check_no_bitmap(struct mddev *mddev)
1007 {
1008 	if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
1009 		return 0;
1010 	pr_warn("%s: bitmaps are not supported for %s\n",
1011 		mdname(mddev), mddev->pers->name);
1012 	return 1;
1013 }
1014 EXPORT_SYMBOL(md_check_no_bitmap);
1015 
1016 /*
1017  * load_super for 0.90.0
1018  */
1019 static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1020 {
1021 	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1022 	mdp_super_t *sb;
1023 	int ret;
1024 
1025 	/*
1026 	 * Calculate the position of the superblock (512byte sectors),
1027 	 * it's at the end of the disk.
1028 	 *
1029 	 * It also happens to be a multiple of 4Kb.
1030 	 */
1031 	rdev->sb_start = calc_dev_sboffset(rdev);
1032 
1033 	ret = read_disk_sb(rdev, MD_SB_BYTES);
1034 	if (ret)
1035 		return ret;
1036 
1037 	ret = -EINVAL;
1038 
1039 	bdevname(rdev->bdev, b);
1040 	sb = page_address(rdev->sb_page);
1041 
1042 	if (sb->md_magic != MD_SB_MAGIC) {
1043 		pr_warn("md: invalid raid superblock magic on %s\n", b);
1044 		goto abort;
1045 	}
1046 
1047 	if (sb->major_version != 0 ||
1048 	    sb->minor_version < 90 ||
1049 	    sb->minor_version > 91) {
1050 		pr_warn("Bad version number %d.%d on %s\n",
1051 			sb->major_version, sb->minor_version, b);
1052 		goto abort;
1053 	}
1054 
1055 	if (sb->raid_disks <= 0)
1056 		goto abort;
1057 
1058 	if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
1059 		pr_warn("md: invalid superblock checksum on %s\n", b);
1060 		goto abort;
1061 	}
1062 
1063 	rdev->preferred_minor = sb->md_minor;
1064 	rdev->data_offset = 0;
1065 	rdev->new_data_offset = 0;
1066 	rdev->sb_size = MD_SB_BYTES;
1067 	rdev->badblocks.shift = -1;
1068 
1069 	if (sb->level == LEVEL_MULTIPATH)
1070 		rdev->desc_nr = -1;
1071 	else
1072 		rdev->desc_nr = sb->this_disk.number;
1073 
1074 	if (!refdev) {
1075 		ret = 1;
1076 	} else {
1077 		__u64 ev1, ev2;
1078 		mdp_super_t *refsb = page_address(refdev->sb_page);
1079 		if (!md_uuid_equal(refsb, sb)) {
1080 			pr_warn("md: %s has different UUID to %s\n",
1081 				b, bdevname(refdev->bdev,b2));
1082 			goto abort;
1083 		}
1084 		if (!md_sb_equal(refsb, sb)) {
1085 			pr_warn("md: %s has same UUID but different superblock to %s\n",
1086 				b, bdevname(refdev->bdev, b2));
1087 			goto abort;
1088 		}
1089 		ev1 = md_event(sb);
1090 		ev2 = md_event(refsb);
1091 		if (ev1 > ev2)
1092 			ret = 1;
1093 		else
1094 			ret = 0;
1095 	}
1096 	rdev->sectors = rdev->sb_start;
1097 	/* Limit to 4TB as metadata cannot record more than that.
1098 	 * (not needed for Linear and RAID0 as metadata doesn't
1099 	 * record this size)
1100 	 */
1101 	if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1)
1102 		rdev->sectors = (sector_t)(2ULL << 32) - 2;
1103 
1104 	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
1105 		/* "this cannot possibly happen" ... */
1106 		ret = -EINVAL;
1107 
1108  abort:
1109 	return ret;
1110 }
1111 
1112 /*
1113  * validate_super for 0.90.0
1114  */
1115 static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1116 {
1117 	mdp_disk_t *desc;
1118 	mdp_super_t *sb = page_address(rdev->sb_page);
1119 	__u64 ev1 = md_event(sb);
1120 
1121 	rdev->raid_disk = -1;
1122 	clear_bit(Faulty, &rdev->flags);
1123 	clear_bit(In_sync, &rdev->flags);
1124 	clear_bit(Bitmap_sync, &rdev->flags);
1125 	clear_bit(WriteMostly, &rdev->flags);
1126 
1127 	if (mddev->raid_disks == 0) {
1128 		mddev->major_version = 0;
1129 		mddev->minor_version = sb->minor_version;
1130 		mddev->patch_version = sb->patch_version;
1131 		mddev->external = 0;
1132 		mddev->chunk_sectors = sb->chunk_size >> 9;
1133 		mddev->ctime = sb->ctime;
1134 		mddev->utime = sb->utime;
1135 		mddev->level = sb->level;
1136 		mddev->clevel[0] = 0;
1137 		mddev->layout = sb->layout;
1138 		mddev->raid_disks = sb->raid_disks;
1139 		mddev->dev_sectors = ((sector_t)sb->size) * 2;
1140 		mddev->events = ev1;
1141 		mddev->bitmap_info.offset = 0;
1142 		mddev->bitmap_info.space = 0;
1143 		/* bitmap can use 60 K after the 4K superblocks */
1144 		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
1145 		mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
1146 		mddev->reshape_backwards = 0;
1147 
1148 		if (mddev->minor_version >= 91) {
1149 			mddev->reshape_position = sb->reshape_position;
1150 			mddev->delta_disks = sb->delta_disks;
1151 			mddev->new_level = sb->new_level;
1152 			mddev->new_layout = sb->new_layout;
1153 			mddev->new_chunk_sectors = sb->new_chunk >> 9;
1154 			if (mddev->delta_disks < 0)
1155 				mddev->reshape_backwards = 1;
1156 		} else {
1157 			mddev->reshape_position = MaxSector;
1158 			mddev->delta_disks = 0;
1159 			mddev->new_level = mddev->level;
1160 			mddev->new_layout = mddev->layout;
1161 			mddev->new_chunk_sectors = mddev->chunk_sectors;
1162 		}
1163 
1164 		if (sb->state & (1<<MD_SB_CLEAN))
1165 			mddev->recovery_cp = MaxSector;
1166 		else {
1167 			if (sb->events_hi == sb->cp_events_hi &&
1168 				sb->events_lo == sb->cp_events_lo) {
1169 				mddev->recovery_cp = sb->recovery_cp;
1170 			} else
1171 				mddev->recovery_cp = 0;
1172 		}
1173 
1174 		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
1175 		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
1176 		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
1177 		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
1178 
1179 		mddev->max_disks = MD_SB_DISKS;
1180 
1181 		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
1182 		    mddev->bitmap_info.file == NULL) {
1183 			mddev->bitmap_info.offset =
1184 				mddev->bitmap_info.default_offset;
1185 			mddev->bitmap_info.space =
1186 				mddev->bitmap_info.default_space;
1187 		}
1188 
1189 	} else if (mddev->pers == NULL) {
1190 		/* Insist on good event counter while assembling, except
1191 		 * for spares (which don't need an event count) */
1192 		++ev1;
1193 		if (sb->disks[rdev->desc_nr].state & (
1194 			    (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1195 			if (ev1 < mddev->events)
1196 				return -EINVAL;
1197 	} else if (mddev->bitmap) {
1198 		/* if adding to array with a bitmap, then we can accept an
1199 		 * older device ... but not too old.
1200 		 */
1201 		if (ev1 < mddev->bitmap->events_cleared)
1202 			return 0;
1203 		if (ev1 < mddev->events)
1204 			set_bit(Bitmap_sync, &rdev->flags);
1205 	} else {
1206 		if (ev1 < mddev->events)
1207 			/* just a hot-add of a new device, leave raid_disk at -1 */
1208 			return 0;
1209 	}
1210 
1211 	if (mddev->level != LEVEL_MULTIPATH) {
1212 		desc = sb->disks + rdev->desc_nr;
1213 
1214 		if (desc->state & (1<<MD_DISK_FAULTY))
1215 			set_bit(Faulty, &rdev->flags);
1216 		else if (desc->state & (1<<MD_DISK_SYNC) /* &&
1217 			    desc->raid_disk < mddev->raid_disks */) {
1218 			set_bit(In_sync, &rdev->flags);
1219 			rdev->raid_disk = desc->raid_disk;
1220 			rdev->saved_raid_disk = desc->raid_disk;
1221 		} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
1222 			/* active but not in sync implies recovery up to
1223 			 * reshape position.  We don't know exactly where
1224 			 * that is, so set to zero for now */
1225 			if (mddev->minor_version >= 91) {
1226 				rdev->recovery_offset = 0;
1227 				rdev->raid_disk = desc->raid_disk;
1228 			}
1229 		}
1230 		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
1231 			set_bit(WriteMostly, &rdev->flags);
1232 		if (desc->state & (1<<MD_DISK_FAILFAST))
1233 			set_bit(FailFast, &rdev->flags);
1234 	} else /* MULTIPATH are always insync */
1235 		set_bit(In_sync, &rdev->flags);
1236 	return 0;
1237 }
1238 
1239 /*
1240  * sync_super for 0.90.0
1241  */
1242 static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
1243 {
1244 	mdp_super_t *sb;
1245 	struct md_rdev *rdev2;
1246 	int next_spare = mddev->raid_disks;
1247 
1248 	/* make rdev->sb match mddev data..
1249 	 *
1250 	 * 1/ zero out disks
1251 	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
1252 	 * 3/ any empty disks < next_spare become removed
1253 	 *
1254 	 * disks[0] gets initialised to REMOVED because
1255 	 * we cannot be sure from other fields if it has
1256 	 * been initialised or not.
1257 	 */
1258 	int i;
1259 	int active=0, working=0,failed=0,spare=0,nr_disks=0;
1260 
1261 	rdev->sb_size = MD_SB_BYTES;
1262 
1263 	sb = page_address(rdev->sb_page);
1264 
1265 	memset(sb, 0, sizeof(*sb));
1266 
1267 	sb->md_magic = MD_SB_MAGIC;
1268 	sb->major_version = mddev->major_version;
1269 	sb->patch_version = mddev->patch_version;
1270 	sb->gvalid_words  = 0; /* ignored */
1271 	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
1272 	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
1273 	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
1274 	memcpy(&sb->set_uuid3, mddev->uuid+12,4);
1275 
1276 	sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
1277 	sb->level = mddev->level;
1278 	sb->size = mddev->dev_sectors / 2;
1279 	sb->raid_disks = mddev->raid_disks;
1280 	sb->md_minor = mddev->md_minor;
1281 	sb->not_persistent = 0;
1282 	sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
1283 	sb->state = 0;
1284 	sb->events_hi = (mddev->events>>32);
1285 	sb->events_lo = (u32)mddev->events;
1286 
1287 	if (mddev->reshape_position == MaxSector)
1288 		sb->minor_version = 90;
1289 	else {
1290 		sb->minor_version = 91;
1291 		sb->reshape_position = mddev->reshape_position;
1292 		sb->new_level = mddev->new_level;
1293 		sb->delta_disks = mddev->delta_disks;
1294 		sb->new_layout = mddev->new_layout;
1295 		sb->new_chunk = mddev->new_chunk_sectors << 9;
1296 	}
1297 	mddev->minor_version = sb->minor_version;
1298 	if (mddev->in_sync)
1299 	{
1300 		sb->recovery_cp = mddev->recovery_cp;
1301 		sb->cp_events_hi = (mddev->events>>32);
1302 		sb->cp_events_lo = (u32)mddev->events;
1303 		if (mddev->recovery_cp == MaxSector)
1304 			sb->state = (1<< MD_SB_CLEAN);
1305 	} else
1306 		sb->recovery_cp = 0;
1307 
1308 	sb->layout = mddev->layout;
1309 	sb->chunk_size = mddev->chunk_sectors << 9;
1310 
1311 	if (mddev->bitmap && mddev->bitmap_info.file == NULL)
1312 		sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1313 
1314 	sb->disks[0].state = (1<<MD_DISK_REMOVED);
1315 	rdev_for_each(rdev2, mddev) {
1316 		mdp_disk_t *d;
1317 		int desc_nr;
1318 		int is_active = test_bit(In_sync, &rdev2->flags);
1319 
1320 		if (rdev2->raid_disk >= 0 &&
1321 		    sb->minor_version >= 91)
1322 			/* we have nowhere to store the recovery_offset,
1323 			 * but if it is not below the reshape_position,
1324 			 * we can piggy-back on that.
1325 			 */
1326 			is_active = 1;
1327 		if (rdev2->raid_disk < 0 ||
1328 		    test_bit(Faulty, &rdev2->flags))
1329 			is_active = 0;
1330 		if (is_active)
1331 			desc_nr = rdev2->raid_disk;
1332 		else
1333 			desc_nr = next_spare++;
1334 		rdev2->desc_nr = desc_nr;
1335 		d = &sb->disks[rdev2->desc_nr];
1336 		nr_disks++;
1337 		d->number = rdev2->desc_nr;
1338 		d->major = MAJOR(rdev2->bdev->bd_dev);
1339 		d->minor = MINOR(rdev2->bdev->bd_dev);
1340 		if (is_active)
1341 			d->raid_disk = rdev2->raid_disk;
1342 		else
1343 			d->raid_disk = rdev2->desc_nr; /* compatibility */
1344 		if (test_bit(Faulty, &rdev2->flags))
1345 			d->state = (1<<MD_DISK_FAULTY);
1346 		else if (is_active) {
1347 			d->state = (1<<MD_DISK_ACTIVE);
1348 			if (test_bit(In_sync, &rdev2->flags))
1349 				d->state |= (1<<MD_DISK_SYNC);
1350 			active++;
1351 			working++;
1352 		} else {
1353 			d->state = 0;
1354 			spare++;
1355 			working++;
1356 		}
1357 		if (test_bit(WriteMostly, &rdev2->flags))
1358 			d->state |= (1<<MD_DISK_WRITEMOSTLY);
1359 		if (test_bit(FailFast, &rdev2->flags))
1360 			d->state |= (1<<MD_DISK_FAILFAST);
1361 	}
1362 	/* now set the "removed" and "faulty" bits on any missing devices */
1363 	for (i=0 ; i < mddev->raid_disks ; i++) {
1364 		mdp_disk_t *d = &sb->disks[i];
1365 		if (d->state == 0 && d->number == 0) {
1366 			d->number = i;
1367 			d->raid_disk = i;
1368 			d->state = (1<<MD_DISK_REMOVED);
1369 			d->state |= (1<<MD_DISK_FAULTY);
1370 			failed++;
1371 		}
1372 	}
1373 	sb->nr_disks = nr_disks;
1374 	sb->active_disks = active;
1375 	sb->working_disks = working;
1376 	sb->failed_disks = failed;
1377 	sb->spare_disks = spare;
1378 
1379 	sb->this_disk = sb->disks[rdev->desc_nr];
1380 	sb->sb_csum = calc_sb_csum(sb);
1381 }
1382 
1383 /*
1384  * rdev_size_change for 0.90.0
1385  */
1386 static unsigned long long
1387 super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1388 {
1389 	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1390 		return 0; /* component must fit device */
1391 	if (rdev->mddev->bitmap_info.offset)
1392 		return 0; /* can't move bitmap */
1393 	rdev->sb_start = calc_dev_sboffset(rdev);
1394 	if (!num_sectors || num_sectors > rdev->sb_start)
1395 		num_sectors = rdev->sb_start;
1396 	/* Limit to 4TB as metadata cannot record more than that.
1397 	 * 4TB == 2^32 KB, or 2*2^32 sectors.
1398 	 */
1399 	if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
1400 		num_sectors = (sector_t)(2ULL << 32) - 2;
1401 	do {
1402 		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1403 		       rdev->sb_page);
1404 	} while (md_super_wait(rdev->mddev) < 0);
1405 	return num_sectors;
1406 }
1407 
/*
 * allow_new_offset for 0.90.0
 *
 * Returns 1 if @new_offset is acceptable and 0 otherwise.  @rdev is not
 * looked at: with v0.90 a non-zero data offset is never possible.
 */
static int
super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
{
	if (new_offset != 0)
		return 0;
	return 1;
}
1414 
1415 /*
1416  * version 1 superblock
1417  */
1418 
1419 static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
1420 {
1421 	__le32 disk_csum;
1422 	u32 csum;
1423 	unsigned long long newcsum;
1424 	int size = 256 + le32_to_cpu(sb->max_dev)*2;
1425 	__le32 *isuper = (__le32*)sb;
1426 
1427 	disk_csum = sb->sb_csum;
1428 	sb->sb_csum = 0;
1429 	newcsum = 0;
1430 	for (; size >= 4; size -= 4)
1431 		newcsum += le32_to_cpu(*isuper++);
1432 
1433 	if (size == 2)
1434 		newcsum += le16_to_cpu(*(__le16*) isuper);
1435 
1436 	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1437 	sb->sb_csum = disk_csum;
1438 	return cpu_to_le32(csum);
1439 }
1440 
1441 static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1442 {
1443 	struct mdp_superblock_1 *sb;
1444 	int ret;
1445 	sector_t sb_start;
1446 	sector_t sectors;
1447 	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1448 	int bmask;
1449 
1450 	/*
1451 	 * Calculate the position of the superblock in 512byte sectors.
1452 	 * It is always aligned to a 4K boundary and
1453 	 * depeding on minor_version, it can be:
1454 	 * 0: At least 8K, but less than 12K, from end of device
1455 	 * 1: At start of device
1456 	 * 2: 4K from start of device.
1457 	 */
1458 	switch(minor_version) {
1459 	case 0:
1460 		sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
1461 		sb_start -= 8*2;
1462 		sb_start &= ~(sector_t)(4*2-1);
1463 		break;
1464 	case 1:
1465 		sb_start = 0;
1466 		break;
1467 	case 2:
1468 		sb_start = 8;
1469 		break;
1470 	default:
1471 		return -EINVAL;
1472 	}
1473 	rdev->sb_start = sb_start;
1474 
1475 	/* superblock is rarely larger than 1K, but it can be larger,
1476 	 * and it is safe to read 4k, so we do that
1477 	 */
1478 	ret = read_disk_sb(rdev, 4096);
1479 	if (ret) return ret;
1480 
1481 	sb = page_address(rdev->sb_page);
1482 
1483 	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1484 	    sb->major_version != cpu_to_le32(1) ||
1485 	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1486 	    le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1487 	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1488 		return -EINVAL;
1489 
1490 	if (calc_sb_1_csum(sb) != sb->sb_csum) {
1491 		pr_warn("md: invalid superblock checksum on %s\n",
1492 			bdevname(rdev->bdev,b));
1493 		return -EINVAL;
1494 	}
1495 	if (le64_to_cpu(sb->data_size) < 10) {
1496 		pr_warn("md: data_size too small on %s\n",
1497 			bdevname(rdev->bdev,b));
1498 		return -EINVAL;
1499 	}
1500 	if (sb->pad0 ||
1501 	    sb->pad3[0] ||
1502 	    memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
1503 		/* Some padding is non-zero, might be a new feature */
1504 		return -EINVAL;
1505 
1506 	rdev->preferred_minor = 0xffff;
1507 	rdev->data_offset = le64_to_cpu(sb->data_offset);
1508 	rdev->new_data_offset = rdev->data_offset;
1509 	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
1510 	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
1511 		rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
1512 	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1513 
1514 	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1515 	bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1516 	if (rdev->sb_size & bmask)
1517 		rdev->sb_size = (rdev->sb_size | bmask) + 1;
1518 
1519 	if (minor_version
1520 	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
1521 		return -EINVAL;
1522 	if (minor_version
1523 	    && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
1524 		return -EINVAL;
1525 
1526 	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1527 		rdev->desc_nr = -1;
1528 	else
1529 		rdev->desc_nr = le32_to_cpu(sb->dev_number);
1530 
1531 	if (!rdev->bb_page) {
1532 		rdev->bb_page = alloc_page(GFP_KERNEL);
1533 		if (!rdev->bb_page)
1534 			return -ENOMEM;
1535 	}
1536 	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
1537 	    rdev->badblocks.count == 0) {
1538 		/* need to load the bad block list.
1539 		 * Currently we limit it to one page.
1540 		 */
1541 		s32 offset;
1542 		sector_t bb_sector;
1543 		__le64 *bbp;
1544 		int i;
1545 		int sectors = le16_to_cpu(sb->bblog_size);
1546 		if (sectors > (PAGE_SIZE / 512))
1547 			return -EINVAL;
1548 		offset = le32_to_cpu(sb->bblog_offset);
1549 		if (offset == 0)
1550 			return -EINVAL;
1551 		bb_sector = (long long)offset;
1552 		if (!sync_page_io(rdev, bb_sector, sectors << 9,
1553 				  rdev->bb_page, REQ_OP_READ, 0, true))
1554 			return -EIO;
1555 		bbp = (__le64 *)page_address(rdev->bb_page);
1556 		rdev->badblocks.shift = sb->bblog_shift;
1557 		for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
1558 			u64 bb = le64_to_cpu(*bbp);
1559 			int count = bb & (0x3ff);
1560 			u64 sector = bb >> 10;
1561 			sector <<= sb->bblog_shift;
1562 			count <<= sb->bblog_shift;
1563 			if (bb + 1 == 0)
1564 				break;
1565 			if (badblocks_set(&rdev->badblocks, sector, count, 1))
1566 				return -EINVAL;
1567 		}
1568 	} else if (sb->bblog_offset != 0)
1569 		rdev->badblocks.shift = 0;
1570 
1571 	if ((le32_to_cpu(sb->feature_map) &
1572 	    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) {
1573 		rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
1574 		rdev->ppl.size = le16_to_cpu(sb->ppl.size);
1575 		rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
1576 	}
1577 
1578 	if (!refdev) {
1579 		ret = 1;
1580 	} else {
1581 		__u64 ev1, ev2;
1582 		struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
1583 
1584 		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1585 		    sb->level != refsb->level ||
1586 		    sb->layout != refsb->layout ||
1587 		    sb->chunksize != refsb->chunksize) {
1588 			pr_warn("md: %s has strangely different superblock to %s\n",
1589 				bdevname(rdev->bdev,b),
1590 				bdevname(refdev->bdev,b2));
1591 			return -EINVAL;
1592 		}
1593 		ev1 = le64_to_cpu(sb->events);
1594 		ev2 = le64_to_cpu(refsb->events);
1595 
1596 		if (ev1 > ev2)
1597 			ret = 1;
1598 		else
1599 			ret = 0;
1600 	}
1601 	if (minor_version) {
1602 		sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
1603 		sectors -= rdev->data_offset;
1604 	} else
1605 		sectors = rdev->sb_start;
1606 	if (sectors < le64_to_cpu(sb->data_size))
1607 		return -EINVAL;
1608 	rdev->sectors = le64_to_cpu(sb->data_size);
1609 	return ret;
1610 }
1611 
1612 static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1613 {
1614 	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1615 	__u64 ev1 = le64_to_cpu(sb->events);
1616 
1617 	rdev->raid_disk = -1;
1618 	clear_bit(Faulty, &rdev->flags);
1619 	clear_bit(In_sync, &rdev->flags);
1620 	clear_bit(Bitmap_sync, &rdev->flags);
1621 	clear_bit(WriteMostly, &rdev->flags);
1622 
1623 	if (mddev->raid_disks == 0) {
1624 		mddev->major_version = 1;
1625 		mddev->patch_version = 0;
1626 		mddev->external = 0;
1627 		mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1628 		mddev->ctime = le64_to_cpu(sb->ctime);
1629 		mddev->utime = le64_to_cpu(sb->utime);
1630 		mddev->level = le32_to_cpu(sb->level);
1631 		mddev->clevel[0] = 0;
1632 		mddev->layout = le32_to_cpu(sb->layout);
1633 		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1634 		mddev->dev_sectors = le64_to_cpu(sb->size);
1635 		mddev->events = ev1;
1636 		mddev->bitmap_info.offset = 0;
1637 		mddev->bitmap_info.space = 0;
1638 		/* Default location for bitmap is 1K after superblock
1639 		 * using 3K - total of 4K
1640 		 */
1641 		mddev->bitmap_info.default_offset = 1024 >> 9;
1642 		mddev->bitmap_info.default_space = (4096-1024) >> 9;
1643 		mddev->reshape_backwards = 0;
1644 
1645 		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1646 		memcpy(mddev->uuid, sb->set_uuid, 16);
1647 
1648 		mddev->max_disks =  (4096-256)/2;
1649 
1650 		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1651 		    mddev->bitmap_info.file == NULL) {
1652 			mddev->bitmap_info.offset =
1653 				(__s32)le32_to_cpu(sb->bitmap_offset);
1654 			/* Metadata doesn't record how much space is available.
1655 			 * For 1.0, we assume we can use up to the superblock
1656 			 * if before, else to 4K beyond superblock.
1657 			 * For others, assume no change is possible.
1658 			 */
1659 			if (mddev->minor_version > 0)
1660 				mddev->bitmap_info.space = 0;
1661 			else if (mddev->bitmap_info.offset > 0)
1662 				mddev->bitmap_info.space =
1663 					8 - mddev->bitmap_info.offset;
1664 			else
1665 				mddev->bitmap_info.space =
1666 					-mddev->bitmap_info.offset;
1667 		}
1668 
1669 		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1670 			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1671 			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1672 			mddev->new_level = le32_to_cpu(sb->new_level);
1673 			mddev->new_layout = le32_to_cpu(sb->new_layout);
1674 			mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1675 			if (mddev->delta_disks < 0 ||
1676 			    (mddev->delta_disks == 0 &&
1677 			     (le32_to_cpu(sb->feature_map)
1678 			      & MD_FEATURE_RESHAPE_BACKWARDS)))
1679 				mddev->reshape_backwards = 1;
1680 		} else {
1681 			mddev->reshape_position = MaxSector;
1682 			mddev->delta_disks = 0;
1683 			mddev->new_level = mddev->level;
1684 			mddev->new_layout = mddev->layout;
1685 			mddev->new_chunk_sectors = mddev->chunk_sectors;
1686 		}
1687 
1688 		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
1689 			set_bit(MD_HAS_JOURNAL, &mddev->flags);
1690 
1691 		if (le32_to_cpu(sb->feature_map) &
1692 		    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) {
1693 			if (le32_to_cpu(sb->feature_map) &
1694 			    (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
1695 				return -EINVAL;
1696 			if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) &&
1697 			    (le32_to_cpu(sb->feature_map) &
1698 					    MD_FEATURE_MULTIPLE_PPLS))
1699 				return -EINVAL;
1700 			set_bit(MD_HAS_PPL, &mddev->flags);
1701 		}
1702 	} else if (mddev->pers == NULL) {
1703 		/* Insist of good event counter while assembling, except for
1704 		 * spares (which don't need an event count) */
1705 		++ev1;
1706 		if (rdev->desc_nr >= 0 &&
1707 		    rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1708 		    (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
1709 		     le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
1710 			if (ev1 < mddev->events)
1711 				return -EINVAL;
1712 	} else if (mddev->bitmap) {
1713 		/* If adding to array with a bitmap, then we can accept an
1714 		 * older device, but not too old.
1715 		 */
1716 		if (ev1 < mddev->bitmap->events_cleared)
1717 			return 0;
1718 		if (ev1 < mddev->events)
1719 			set_bit(Bitmap_sync, &rdev->flags);
1720 	} else {
1721 		if (ev1 < mddev->events)
1722 			/* just a hot-add of a new device, leave raid_disk at -1 */
1723 			return 0;
1724 	}
1725 	if (mddev->level != LEVEL_MULTIPATH) {
1726 		int role;
1727 		if (rdev->desc_nr < 0 ||
1728 		    rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1729 			role = MD_DISK_ROLE_SPARE;
1730 			rdev->desc_nr = -1;
1731 		} else
1732 			role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1733 		switch(role) {
1734 		case MD_DISK_ROLE_SPARE: /* spare */
1735 			break;
1736 		case MD_DISK_ROLE_FAULTY: /* faulty */
1737 			set_bit(Faulty, &rdev->flags);
1738 			break;
1739 		case MD_DISK_ROLE_JOURNAL: /* journal device */
1740 			if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
1741 				/* journal device without journal feature */
1742 				pr_warn("md: journal device provided without journal feature, ignoring the device\n");
1743 				return -EINVAL;
1744 			}
1745 			set_bit(Journal, &rdev->flags);
1746 			rdev->journal_tail = le64_to_cpu(sb->journal_tail);
1747 			rdev->raid_disk = 0;
1748 			break;
1749 		default:
1750 			rdev->saved_raid_disk = role;
1751 			if ((le32_to_cpu(sb->feature_map) &
1752 			     MD_FEATURE_RECOVERY_OFFSET)) {
1753 				rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1754 				if (!(le32_to_cpu(sb->feature_map) &
1755 				      MD_FEATURE_RECOVERY_BITMAP))
1756 					rdev->saved_raid_disk = -1;
1757 			} else
1758 				set_bit(In_sync, &rdev->flags);
1759 			rdev->raid_disk = role;
1760 			break;
1761 		}
1762 		if (sb->devflags & WriteMostly1)
1763 			set_bit(WriteMostly, &rdev->flags);
1764 		if (sb->devflags & FailFast1)
1765 			set_bit(FailFast, &rdev->flags);
1766 		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
1767 			set_bit(Replacement, &rdev->flags);
1768 	} else /* MULTIPATH are always insync */
1769 		set_bit(In_sync, &rdev->flags);
1770 
1771 	return 0;
1772 }
1773 
1774 static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1775 {
1776 	struct mdp_superblock_1 *sb;
1777 	struct md_rdev *rdev2;
1778 	int max_dev, i;
1779 	/* make rdev->sb match mddev and rdev data. */
1780 
1781 	sb = page_address(rdev->sb_page);
1782 
1783 	sb->feature_map = 0;
1784 	sb->pad0 = 0;
1785 	sb->recovery_offset = cpu_to_le64(0);
1786 	memset(sb->pad3, 0, sizeof(sb->pad3));
1787 
1788 	sb->utime = cpu_to_le64((__u64)mddev->utime);
1789 	sb->events = cpu_to_le64(mddev->events);
1790 	if (mddev->in_sync)
1791 		sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1792 	else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
1793 		sb->resync_offset = cpu_to_le64(MaxSector);
1794 	else
1795 		sb->resync_offset = cpu_to_le64(0);
1796 
1797 	sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
1798 
1799 	sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1800 	sb->size = cpu_to_le64(mddev->dev_sectors);
1801 	sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
1802 	sb->level = cpu_to_le32(mddev->level);
1803 	sb->layout = cpu_to_le32(mddev->layout);
1804 	if (test_bit(FailFast, &rdev->flags))
1805 		sb->devflags |= FailFast1;
1806 	else
1807 		sb->devflags &= ~FailFast1;
1808 
1809 	if (test_bit(WriteMostly, &rdev->flags))
1810 		sb->devflags |= WriteMostly1;
1811 	else
1812 		sb->devflags &= ~WriteMostly1;
1813 	sb->data_offset = cpu_to_le64(rdev->data_offset);
1814 	sb->data_size = cpu_to_le64(rdev->sectors);
1815 
1816 	if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
1817 		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
1818 		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1819 	}
1820 
1821 	if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
1822 	    !test_bit(In_sync, &rdev->flags)) {
1823 		sb->feature_map |=
1824 			cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1825 		sb->recovery_offset =
1826 			cpu_to_le64(rdev->recovery_offset);
1827 		if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
1828 			sb->feature_map |=
1829 				cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
1830 	}
1831 	/* Note: recovery_offset and journal_tail share space  */
1832 	if (test_bit(Journal, &rdev->flags))
1833 		sb->journal_tail = cpu_to_le64(rdev->journal_tail);
1834 	if (test_bit(Replacement, &rdev->flags))
1835 		sb->feature_map |=
1836 			cpu_to_le32(MD_FEATURE_REPLACEMENT);
1837 
1838 	if (mddev->reshape_position != MaxSector) {
1839 		sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1840 		sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1841 		sb->new_layout = cpu_to_le32(mddev->new_layout);
1842 		sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1843 		sb->new_level = cpu_to_le32(mddev->new_level);
1844 		sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
1845 		if (mddev->delta_disks == 0 &&
1846 		    mddev->reshape_backwards)
1847 			sb->feature_map
1848 				|= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
1849 		if (rdev->new_data_offset != rdev->data_offset) {
1850 			sb->feature_map
1851 				|= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
1852 			sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
1853 							     - rdev->data_offset));
1854 		}
1855 	}
1856 
1857 	if (mddev_is_clustered(mddev))
1858 		sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);
1859 
1860 	if (rdev->badblocks.count == 0)
1861 		/* Nothing to do for bad blocks*/ ;
1862 	else if (sb->bblog_offset == 0)
1863 		/* Cannot record bad blocks on this device */
1864 		md_error(mddev, rdev);
1865 	else {
1866 		struct badblocks *bb = &rdev->badblocks;
1867 		__le64 *bbp = (__le64 *)page_address(rdev->bb_page);
1868 		u64 *p = bb->page;
1869 		sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
1870 		if (bb->changed) {
1871 			unsigned seq;
1872 
1873 retry:
1874 			seq = read_seqbegin(&bb->lock);
1875 
1876 			memset(bbp, 0xff, PAGE_SIZE);
1877 
1878 			for (i = 0 ; i < bb->count ; i++) {
1879 				u64 internal_bb = p[i];
1880 				u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
1881 						| BB_LEN(internal_bb));
1882 				bbp[i] = cpu_to_le64(store_bb);
1883 			}
1884 			bb->changed = 0;
1885 			if (read_seqretry(&bb->lock, seq))
1886 				goto retry;
1887 
1888 			bb->sector = (rdev->sb_start +
1889 				      (int)le32_to_cpu(sb->bblog_offset));
1890 			bb->size = le16_to_cpu(sb->bblog_size);
1891 		}
1892 	}
1893 
1894 	max_dev = 0;
1895 	rdev_for_each(rdev2, mddev)
1896 		if (rdev2->desc_nr+1 > max_dev)
1897 			max_dev = rdev2->desc_nr+1;
1898 
1899 	if (max_dev > le32_to_cpu(sb->max_dev)) {
1900 		int bmask;
1901 		sb->max_dev = cpu_to_le32(max_dev);
1902 		rdev->sb_size = max_dev * 2 + 256;
1903 		bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1904 		if (rdev->sb_size & bmask)
1905 			rdev->sb_size = (rdev->sb_size | bmask) + 1;
1906 	} else
1907 		max_dev = le32_to_cpu(sb->max_dev);
1908 
1909 	for (i=0; i<max_dev;i++)
1910 		sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
1911 
1912 	if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
1913 		sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
1914 
1915 	if (test_bit(MD_HAS_PPL, &mddev->flags)) {
1916 		if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags))
1917 			sb->feature_map |=
1918 			    cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS);
1919 		else
1920 			sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
1921 		sb->ppl.offset = cpu_to_le16(rdev->ppl.offset);
1922 		sb->ppl.size = cpu_to_le16(rdev->ppl.size);
1923 	}
1924 
1925 	rdev_for_each(rdev2, mddev) {
1926 		i = rdev2->desc_nr;
1927 		if (test_bit(Faulty, &rdev2->flags))
1928 			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
1929 		else if (test_bit(In_sync, &rdev2->flags))
1930 			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1931 		else if (test_bit(Journal, &rdev2->flags))
1932 			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
1933 		else if (rdev2->raid_disk >= 0)
1934 			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1935 		else
1936 			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
1937 	}
1938 
1939 	sb->sb_csum = calc_sb_1_csum(sb);
1940 }
1941 
1942 static unsigned long long
1943 super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1944 {
1945 	struct mdp_superblock_1 *sb;
1946 	sector_t max_sectors;
1947 	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1948 		return 0; /* component must fit device */
1949 	if (rdev->data_offset != rdev->new_data_offset)
1950 		return 0; /* too confusing */
1951 	if (rdev->sb_start < rdev->data_offset) {
1952 		/* minor versions 1 and 2; superblock before data */
1953 		max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
1954 		max_sectors -= rdev->data_offset;
1955 		if (!num_sectors || num_sectors > max_sectors)
1956 			num_sectors = max_sectors;
1957 	} else if (rdev->mddev->bitmap_info.offset) {
1958 		/* minor version 0 with bitmap we can't move */
1959 		return 0;
1960 	} else {
1961 		/* minor version 0; superblock after data */
1962 		sector_t sb_start;
1963 		sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
1964 		sb_start &= ~(sector_t)(4*2 - 1);
1965 		max_sectors = rdev->sectors + sb_start - rdev->sb_start;
1966 		if (!num_sectors || num_sectors > max_sectors)
1967 			num_sectors = max_sectors;
1968 		rdev->sb_start = sb_start;
1969 	}
1970 	sb = page_address(rdev->sb_page);
1971 	sb->data_size = cpu_to_le64(num_sectors);
1972 	sb->super_offset = cpu_to_le64(rdev->sb_start);
1973 	sb->sb_csum = calc_sb_1_csum(sb);
1974 	do {
1975 		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1976 			       rdev->sb_page);
1977 	} while (md_super_wait(rdev->mddev) < 0);
1978 	return num_sectors;
1979 
1980 }
1981 
1982 static int
1983 super_1_allow_new_offset(struct md_rdev *rdev,
1984 			 unsigned long long new_offset)
1985 {
1986 	/* All necessary checks on new >= old have been done */
1987 	struct bitmap *bitmap;
1988 	if (new_offset >= rdev->data_offset)
1989 		return 1;
1990 
1991 	/* with 1.0 metadata, there is no metadata to tread on
1992 	 * so we can always move back */
1993 	if (rdev->mddev->minor_version == 0)
1994 		return 1;
1995 
1996 	/* otherwise we must be sure not to step on
1997 	 * any metadata, so stay:
1998 	 * 36K beyond start of superblock
1999 	 * beyond end of badblocks
2000 	 * beyond write-intent bitmap
2001 	 */
2002 	if (rdev->sb_start + (32+4)*2 > new_offset)
2003 		return 0;
2004 	bitmap = rdev->mddev->bitmap;
2005 	if (bitmap && !rdev->mddev->bitmap_info.file &&
2006 	    rdev->sb_start + rdev->mddev->bitmap_info.offset +
2007 	    bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
2008 		return 0;
2009 	if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
2010 		return 0;
2011 
2012 	return 1;
2013 }
2014 
/*
 * Table of superblock-format handlers.  sync_super() and add_bound_rdev()
 * index it with mddev->major_version, so entry [0] serves the 0.90
 * format and entry [1] the version-1 format.  Each entry supplies the
 * load/validate/sync callbacks plus the resize and data-offset checks
 * for that format.
 */
static struct super_type super_types[] = {
	[0] = {
		.name	= "0.90.0",
		.owner	= THIS_MODULE,
		.load_super	    = super_90_load,
		.validate_super	    = super_90_validate,
		.sync_super	    = super_90_sync,
		.rdev_size_change   = super_90_rdev_size_change,
		.allow_new_offset   = super_90_allow_new_offset,
	},
	[1] = {
		.name	= "md-1",
		.owner	= THIS_MODULE,
		.load_super	    = super_1_load,
		.validate_super	    = super_1_validate,
		.sync_super	    = super_1_sync,
		.rdev_size_change   = super_1_rdev_size_change,
		.allow_new_offset   = super_1_allow_new_offset,
	},
};
2035 
2036 static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
2037 {
2038 	if (mddev->sync_super) {
2039 		mddev->sync_super(mddev, rdev);
2040 		return;
2041 	}
2042 
2043 	BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));
2044 
2045 	super_types[mddev->major_version].sync_super(mddev, rdev);
2046 }
2047 
2048 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
2049 {
2050 	struct md_rdev *rdev, *rdev2;
2051 
2052 	rcu_read_lock();
2053 	rdev_for_each_rcu(rdev, mddev1) {
2054 		if (test_bit(Faulty, &rdev->flags) ||
2055 		    test_bit(Journal, &rdev->flags) ||
2056 		    rdev->raid_disk == -1)
2057 			continue;
2058 		rdev_for_each_rcu(rdev2, mddev2) {
2059 			if (test_bit(Faulty, &rdev2->flags) ||
2060 			    test_bit(Journal, &rdev2->flags) ||
2061 			    rdev2->raid_disk == -1)
2062 				continue;
2063 			if (rdev->bdev->bd_contains ==
2064 			    rdev2->bdev->bd_contains) {
2065 				rcu_read_unlock();
2066 				return 1;
2067 			}
2068 		}
2069 	}
2070 	rcu_read_unlock();
2071 	return 0;
2072 }
2073 
/*
 * File-local list head.
 * NOTE(review): the code that adds to and drains this list is outside
 * this part of the file; judging by the name it presumably holds devices
 * waiting to be assembled into arrays - confirm against its users.
 */
static LIST_HEAD(pending_raid_disks);
2075 
2076 /*
2077  * Try to register data integrity profile for an mddev
2078  *
2079  * This is called when an array is started and after a disk has been kicked
2080  * from the array. It only succeeds if all working and active component devices
2081  * are integrity capable with matching profiles.
2082  */
2083 int md_integrity_register(struct mddev *mddev)
2084 {
2085 	struct md_rdev *rdev, *reference = NULL;
2086 
2087 	if (list_empty(&mddev->disks))
2088 		return 0; /* nothing to do */
2089 	if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
2090 		return 0; /* shouldn't register, or already is */
2091 	rdev_for_each(rdev, mddev) {
2092 		/* skip spares and non-functional disks */
2093 		if (test_bit(Faulty, &rdev->flags))
2094 			continue;
2095 		if (rdev->raid_disk < 0)
2096 			continue;
2097 		if (!reference) {
2098 			/* Use the first rdev as the reference */
2099 			reference = rdev;
2100 			continue;
2101 		}
2102 		/* does this rdev's profile match the reference profile? */
2103 		if (blk_integrity_compare(reference->bdev->bd_disk,
2104 				rdev->bdev->bd_disk) < 0)
2105 			return -EINVAL;
2106 	}
2107 	if (!reference || !bdev_get_integrity(reference->bdev))
2108 		return 0;
2109 	/*
2110 	 * All component devices are integrity capable and have matching
2111 	 * profiles, register the common profile for the md device.
2112 	 */
2113 	blk_integrity_register(mddev->gendisk,
2114 			       bdev_get_integrity(reference->bdev));
2115 
2116 	pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
2117 	if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE)) {
2118 		pr_err("md: failed to create integrity pool for %s\n",
2119 		       mdname(mddev));
2120 		return -EINVAL;
2121 	}
2122 	return 0;
2123 }
2124 EXPORT_SYMBOL(md_integrity_register);
2125 
2126 /*
2127  * Attempt to add an rdev, but only if it is consistent with the current
2128  * integrity profile
2129  */
2130 int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
2131 {
2132 	struct blk_integrity *bi_mddev;
2133 	char name[BDEVNAME_SIZE];
2134 
2135 	if (!mddev->gendisk)
2136 		return 0;
2137 
2138 	bi_mddev = blk_get_integrity(mddev->gendisk);
2139 
2140 	if (!bi_mddev) /* nothing to do */
2141 		return 0;
2142 
2143 	if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) {
2144 		pr_err("%s: incompatible integrity profile for %s\n",
2145 		       mdname(mddev), bdevname(rdev->bdev, name));
2146 		return -ENXIO;
2147 	}
2148 
2149 	return 0;
2150 }
2151 EXPORT_SYMBOL(md_integrity_add_rdev);
2152 
/*
 * Attach component device @rdev to array @mddev.
 *
 * Checks the device against the array (duplicate, read-only, size,
 * descriptor number, device-count limit), registers its kobject as
 * "dev-<name>" below the array's kobject and links it into mddev->disks.
 *
 * Returns 0 on success, or:
 *   -EEXIST  the block device is already part of the array
 *   -EROFS   the array is running and the device (or its meta device)
 *            is read-only
 *   -ENOSPC  the array is running at a level > 0 and the device is
 *            smaller than dev_sectors
 *   -EBUSY   the descriptor number is taken or beyond max_disks
 *   the error from kobject_add()
 *
 * Note that mddev->dev_sectors, rdev->desc_nr and rdev->mddev may
 * already have been modified when a later check fails; they are not
 * restored here.
 */
static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
{
	char b[BDEVNAME_SIZE];
	struct kobject *ko;
	int err;

	/* prevent duplicates */
	if (find_rdev(mddev, rdev->bdev->bd_dev))
		return -EEXIST;

	/* only a running array (->pers set) rejects read-only devices */
	if ((bdev_read_only(rdev->bdev) || bdev_read_only(rdev->meta_bdev)) &&
	    mddev->pers)
		return -EROFS;

	/* make sure rdev->sectors exceeds mddev->dev_sectors */
	if (!test_bit(Journal, &rdev->flags) &&
	    rdev->sectors &&
	    (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) {
		if (mddev->pers) {
			/* Cannot change size, so fail
			 * If mddev->level <= 0, then we don't care
			 * about aligning sizes (e.g. linear)
			 */
			if (mddev->level > 0)
				return -ENOSPC;
		} else
			/* array not running yet: shrink it to fit this device */
			mddev->dev_sectors = rdev->sectors;
	}

	/* Verify rdev->desc_nr is unique.
	 * If it is -1, assign a free number, else
	 * check number is not in use
	 */
	rcu_read_lock();
	if (rdev->desc_nr < 0) {
		int choice = 0;
		/* on a running array start the search at raid_disks */
		if (mddev->pers)
			choice = mddev->raid_disks;
		while (md_find_rdev_nr_rcu(mddev, choice))
			choice++;
		rdev->desc_nr = choice;
	} else {
		if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
			rcu_read_unlock();
			return -EBUSY;
		}
	}
	rcu_read_unlock();
	/* journal devices are exempt from the max_disks limit */
	if (!test_bit(Journal, &rdev->flags) &&
	    mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
		pr_warn("md: %s: array is limited to %d devices\n",
			mdname(mddev), mddev->max_disks);
		return -EBUSY;
	}
	/* the device name becomes part of a sysfs name, so replace any '/' */
	bdevname(rdev->bdev,b);
	strreplace(b, '/', '!');

	rdev->mddev = mddev;
	pr_debug("md: bind<%s>\n", b);

	if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
		goto fail;

	/* "block" symlink from the rdev directory to the underlying partition */
	ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
	if (sysfs_create_link(&rdev->kobj, ko, "block"))
		/* failure here is OK */;
	rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");

	list_add_rcu(&rdev->same_set, &mddev->disks);
	bd_link_disk_holder(rdev->bdev, mddev->gendisk);

	/* May as well allow recovery to be retried once */
	mddev->recovery_disabled++;

	return 0;

 fail:
	pr_warn("md: failed to register dev-%s for %s\n",
		b, mdname(mddev));
	return err;
}
2234 
/*
 * Work handler queued by unbind_rdev_from_array(): removes the rdev's
 * kobject and drops the reference that was taken when the work was
 * queued.
 */
static void md_delayed_delete(struct work_struct *ws)
{
	struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
	kobject_del(&rdev->kobj);
	kobject_put(&rdev->kobj);
}
2241 
/*
 * Detach @rdev from the array it belongs to: drop the disk-holder link,
 * unlink it from the array's device list, clear rdev->mddev, remove the
 * "block" symlink, release the cached "state" sysfs entry and reset the
 * bad-block count.  Removal of the kobject itself is deferred to
 * md_delayed_delete() via md_misc_wq.
 */
static void unbind_rdev_from_array(struct md_rdev *rdev)
{
	char b[BDEVNAME_SIZE];

	bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
	list_del_rcu(&rdev->same_set);
	pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b));
	rdev->mddev = NULL;
	sysfs_remove_link(&rdev->kobj, "block");
	sysfs_put(rdev->sysfs_state);
	rdev->sysfs_state = NULL;
	rdev->badblocks.count = 0;
	/* We need to delay this, otherwise we can deadlock when
	 * writing to 'remove' to "dev/state".  We also need
	 * to delay it due to rcu usage.
	 */
	synchronize_rcu();
	INIT_WORK(&rdev->del_work, md_delayed_delete);
	/* this reference is dropped by md_delayed_delete() */
	kobject_get(&rdev->kobj);
	queue_work(md_misc_wq, &rdev->del_work);
}
2263 
2264 /*
2265  * prevent the device from being mounted, repartitioned or
2266  * otherwise reused by a RAID array (or any other kernel
2267  * subsystem), by bd_claiming the device.
2268  */
2269 static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared)
2270 {
2271 	int err = 0;
2272 	struct block_device *bdev;
2273 	char b[BDEVNAME_SIZE];
2274 
2275 	bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
2276 				 shared ? (struct md_rdev *)lock_rdev : rdev);
2277 	if (IS_ERR(bdev)) {
2278 		pr_warn("md: could not open %s.\n", __bdevname(dev, b));
2279 		return PTR_ERR(bdev);
2280 	}
2281 	rdev->bdev = bdev;
2282 	return err;
2283 }
2284 
/*
 * Undo lock_rdev(): forget the block device in @rdev and release it with
 * the same open mode (read/write, exclusive) it was opened with.
 */
static void unlock_rdev(struct md_rdev *rdev)
{
	struct block_device *bdev = rdev->bdev;
	/* clear the pointer first; only the local copy is used for the put */
	rdev->bdev = NULL;
	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
}
2291 
/* Defined outside this part of the file; called below only when !MODULE. */
void md_autodetect_dev(dev_t dev);

/*
 * Let go of an rdev that is no longer bound to an array: clear it with
 * md_rdev_clear(), pass an AutoDetected device to md_autodetect_dev()
 * (non-modular builds only), release the block device and drop a
 * reference on the rdev's kobject.
 */
static void export_rdev(struct md_rdev *rdev)
{
	char b[BDEVNAME_SIZE];

	pr_debug("md: export_rdev(%s)\n", bdevname(rdev->bdev,b));
	md_rdev_clear(rdev);
#ifndef MODULE
	if (test_bit(AutoDetected, &rdev->flags))
		md_autodetect_dev(rdev->bdev->bd_dev);
#endif
	/* rdev->bdev is still read above, so release it only now */
	unlock_rdev(rdev);
	kobject_put(&rdev->kobj);
}
2307 
/*
 * Remove @rdev from its array completely: first unbind it from the
 * array, then export (release) the device.
 */
void md_kick_rdev_from_array(struct md_rdev *rdev)
{
	unbind_rdev_from_array(rdev);
	export_rdev(rdev);
}
EXPORT_SYMBOL_GPL(md_kick_rdev_from_array);
2314 
2315 static void export_array(struct mddev *mddev)
2316 {
2317 	struct md_rdev *rdev;
2318 
2319 	while (!list_empty(&mddev->disks)) {
2320 		rdev = list_first_entry(&mddev->disks, struct md_rdev,
2321 					same_set);
2322 		md_kick_rdev_from_array(rdev);
2323 	}
2324 	mddev->raid_disks = 0;
2325 	mddev->major_version = 0;
2326 }
2327 
/*
 * Try to mark the array as "in sync" (no writes pending).
 *
 * Must be called with mddev->lock held.  The lock is dropped and
 * re-taken while writes_pending is switched to atomic mode, so state
 * protected by it may change across this call.  If the array is still
 * not in sync afterwards and writes_pending has reached zero, in_sync
 * is set, MD_SB_CHANGE_CLEAN is flagged and the sysfs state entry is
 * notified.  A safemode value of 1 is reset to 0 in every case.
 *
 * Returns the resulting value of mddev->in_sync.
 */
static bool set_in_sync(struct mddev *mddev)
{
	lockdep_assert_held(&mddev->lock);
	if (!mddev->in_sync) {
		/* count ourselves as a checker while the ref is in atomic mode */
		mddev->sync_checkers++;
		spin_unlock(&mddev->lock);
		percpu_ref_switch_to_atomic_sync(&mddev->writes_pending);
		spin_lock(&mddev->lock);
		/* re-check in_sync: the lock was dropped above */
		if (!mddev->in_sync &&
		    percpu_ref_is_zero(&mddev->writes_pending)) {
			mddev->in_sync = 1;
			/*
			 * Ensure ->in_sync is visible before we clear
			 * ->sync_checkers.
			 */
			smp_mb();
			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
			sysfs_notify_dirent_safe(mddev->sysfs_state);
		}
		/* the last checker to leave switches the ref back to percpu mode */
		if (--mddev->sync_checkers == 0)
			percpu_ref_switch_to_percpu(&mddev->writes_pending);
	}
	if (mddev->safemode == 1)
		mddev->safemode = 0;
	return mddev->in_sync;
}
2354 
2355 static void sync_sbs(struct mddev *mddev, int nospares)
2356 {
2357 	/* Update each superblock (in-memory image), but
2358 	 * if we are allowed to, skip spares which already
2359 	 * have the right event counter, or have one earlier
2360 	 * (which would mean they aren't being marked as dirty
2361 	 * with the rest of the array)
2362 	 */
2363 	struct md_rdev *rdev;
2364 	rdev_for_each(rdev, mddev) {
2365 		if (rdev->sb_events == mddev->events ||
2366 		    (nospares &&
2367 		     rdev->raid_disk < 0 &&
2368 		     rdev->sb_events+1 == mddev->events)) {
2369 			/* Don't update this superblock */
2370 			rdev->sb_loaded = 2;
2371 		} else {
2372 			sync_super(mddev, rdev);
2373 			rdev->sb_loaded = 1;
2374 		}
2375 	}
2376 }
2377 
2378 static bool does_sb_need_changing(struct mddev *mddev)
2379 {
2380 	struct md_rdev *rdev;
2381 	struct mdp_superblock_1 *sb;
2382 	int role;
2383 
2384 	/* Find a good rdev */
2385 	rdev_for_each(rdev, mddev)
2386 		if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags))
2387 			break;
2388 
2389 	/* No good device found. */
2390 	if (!rdev)
2391 		return false;
2392 
2393 	sb = page_address(rdev->sb_page);
2394 	/* Check if a device has become faulty or a spare become active */
2395 	rdev_for_each(rdev, mddev) {
2396 		role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
2397 		/* Device activated? */
2398 		if (role == 0xffff && rdev->raid_disk >=0 &&
2399 		    !test_bit(Faulty, &rdev->flags))
2400 			return true;
2401 		/* Device turned faulty? */
2402 		if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd))
2403 			return true;
2404 	}
2405 
2406 	/* Check if any mddev parameters have changed */
2407 	if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
2408 	    (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
2409 	    (mddev->layout != le32_to_cpu(sb->layout)) ||
2410 	    (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
2411 	    (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
2412 		return true;
2413 
2414 	return false;
2415 }
2416 
/*
 * md_update_sb() - bring the stored superblocks in line with the array.
 * @mddev:        array whose metadata is to be updated
 * @force_change: non-zero to treat the update as a device-level change,
 *                in which case spares are never skipped
 *
 * Outline:
 *  - A read-only array is not written; a forced change is only
 *    remembered in MD_SB_CHANGE_DEVS.
 *  - A clustered array first starts a cluster metadata update and
 *    returns early if does_sb_need_changing() says nothing differs.
 *  - Without persistent metadata only the change flags are cleared
 *    (and, when metadata is not external either, devices with changed
 *    bad blocks are failed and blocked devices released).
 *  - Otherwise the event counter is moved, each in-memory superblock is
 *    refreshed with sync_sbs() and written out, and the whole sequence
 *    repeats while further changes were flagged in the meantime.
 */
void md_update_sb(struct mddev *mddev, int force_change)
{
	struct md_rdev *rdev;
	int sync_req;
	int nospares = 0;
	int any_badblocks_changed = 0;
	/* result of the cluster metadata_update_start(); stays -1 if not clustered */
	int ret = -1;

	if (mddev->ro) {
		if (force_change)
			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		return;
	}

repeat:
	if (mddev_is_clustered(mddev)) {
		if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
			force_change = 1;
		if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
			nospares = 1;
		ret = md_cluster_ops->metadata_update_start(mddev);
		/* Has someone else updated the sb? */
		if (!does_sb_need_changing(mddev)) {
			if (ret == 0)
				md_cluster_ops->metadata_update_cancel(mddev);
			bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
							 BIT(MD_SB_CHANGE_DEVS) |
							 BIT(MD_SB_CHANGE_CLEAN));
			return;
		}
	}

	/*
	 * First make sure individual recovery_offsets are correct
	 * curr_resync_completed can only be used during recovery.
	 * During reshape/resync it might use array-addresses rather
	 * than device addresses.
	 */
	rdev_for_each(rdev, mddev) {
		if (rdev->raid_disk >= 0 &&
		    mddev->delta_disks >= 0 &&
		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
		    test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
		    !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
		    !test_bit(Journal, &rdev->flags) &&
		    !test_bit(In_sync, &rdev->flags) &&
		    mddev->curr_resync_completed > rdev->recovery_offset)
				rdev->recovery_offset = mddev->curr_resync_completed;

	}
	if (!mddev->persistent) {
		/* nothing is written: just drop the change flags */
		clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
		clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		if (!mddev->external) {
			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
			rdev_for_each(rdev, mddev) {
				/* changed bad blocks cannot be recorded, so fail the device */
				if (rdev->badblocks.changed) {
					rdev->badblocks.changed = 0;
					ack_all_badblocks(&rdev->badblocks);
					md_error(mddev, rdev);
				}
				clear_bit(Blocked, &rdev->flags);
				clear_bit(BlockedBadBlocks, &rdev->flags);
				wake_up(&rdev->blocked_wait);
			}
		}
		wake_up(&mddev->sb_wait);
		return;
	}

	spin_lock(&mddev->lock);

	mddev->utime = ktime_get_real_seconds();

	if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
		force_change = 1;
	if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
		/* just a clean<-> dirty transition, possibly leave spares alone,
		 * though if events isn't the right even/odd, we will have to do
		 * spares after all
		 */
		nospares = 1;
	if (force_change)
		nospares = 0;
	if (mddev->degraded)
		/* If the array is degraded, then skipping spares is both
		 * dangerous and fairly pointless.
		 * Dangerous because a device that was removed from the array
		 * might have an event_count that still looks up-to-date,
		 * so it can be re-added without a resync.
		 * Pointless because if there are any spares to skip,
		 * then a recovery will happen and soon that array won't
		 * be degraded any more and the spare can go back to sleep then.
		 */
		nospares = 0;

	/* remembered so that a change of in_sync during the write is noticed */
	sync_req = mddev->in_sync;

	/* If this is just a dirty<->clean transition, and the array is clean
	 * and 'events' is odd, we can roll back to the previous clean state */
	if (nospares
	    && (mddev->in_sync && mddev->recovery_cp == MaxSector)
	    && mddev->can_decrease_events
	    && mddev->events != 1) {
		mddev->events--;
		mddev->can_decrease_events = 0;
	} else {
		/* otherwise we have to go forward and ... */
		mddev->events ++;
		mddev->can_decrease_events = nospares;
	}

	/*
	 * This 64-bit counter should never wrap.
	 * Either we are in around ~1 trillion A.C., assuming
	 * 1 reboot per second, or we have a bug...
	 */
	WARN_ON(mddev->events == 0);

	rdev_for_each(rdev, mddev) {
		if (rdev->badblocks.changed)
			any_badblocks_changed++;
		/* FaultRecorded is consumed after the write to clear Blocked */
		if (test_bit(Faulty, &rdev->flags))
			set_bit(FaultRecorded, &rdev->flags);
	}

	sync_sbs(mddev, nospares);
	spin_unlock(&mddev->lock);

	pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
		 mdname(mddev), mddev->in_sync);

	if (mddev->queue)
		blk_add_trace_msg(mddev->queue, "md md_update_sb");
rewrite:
	md_bitmap_update_sb(mddev->bitmap);
	rdev_for_each(rdev, mddev) {
		char b[BDEVNAME_SIZE];

		/* sync_sbs() marked the devices to write with sb_loaded == 1 */
		if (rdev->sb_loaded != 1)
			continue; /* no noise on spare devices */

		if (!test_bit(Faulty, &rdev->flags)) {
			md_super_write(mddev,rdev,
				       rdev->sb_start, rdev->sb_size,
				       rdev->sb_page);
			pr_debug("md: (write) %s's sb offset: %llu\n",
				 bdevname(rdev->bdev, b),
				 (unsigned long long)rdev->sb_start);
			rdev->sb_events = mddev->events;
			/* a non-zero size means there is a bad-block log to write as well */
			if (rdev->badblocks.size) {
				md_super_write(mddev, rdev,
					       rdev->badblocks.sector,
					       rdev->badblocks.size << 9,
					       rdev->bb_page);
				rdev->badblocks.size = 0;
			}

		} else
			pr_debug("md: %s (skipping faulty)\n",
				 bdevname(rdev->bdev, b));

		if (mddev->level == LEVEL_MULTIPATH)
			/* only need to write one superblock... */
			break;
	}
	if (md_super_wait(mddev) < 0)
		goto rewrite;
	/* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */

	/* only finish a cluster update that was successfully started */
	if (mddev_is_clustered(mddev) && ret == 0)
		md_cluster_ops->metadata_update_finish(mddev);

	if (mddev->in_sync != sync_req ||
	    !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
			       BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)))
		/* have to write it out again */
		goto repeat;
	wake_up(&mddev->sb_wait);
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		sysfs_notify(&mddev->kobj, NULL, "sync_completed");

	rdev_for_each(rdev, mddev) {
		/* the fault is on disk now, so the device need not stay blocked */
		if (test_and_clear_bit(FaultRecorded, &rdev->flags))
			clear_bit(Blocked, &rdev->flags);

		if (any_badblocks_changed)
			ack_all_badblocks(&rdev->badblocks);
		clear_bit(BlockedBadBlocks, &rdev->flags);
		wake_up(&rdev->blocked_wait);
	}
}
EXPORT_SYMBOL(md_update_sb);
2610 
2611 static int add_bound_rdev(struct md_rdev *rdev)
2612 {
2613 	struct mddev *mddev = rdev->mddev;
2614 	int err = 0;
2615 	bool add_journal = test_bit(Journal, &rdev->flags);
2616 
2617 	if (!mddev->pers->hot_remove_disk || add_journal) {
2618 		/* If there is hot_add_disk but no hot_remove_disk
2619 		 * then added disks for geometry changes,
2620 		 * and should be added immediately.
2621 		 */
2622 		super_types[mddev->major_version].
2623 			validate_super(mddev, rdev);
2624 		if (add_journal)
2625 			mddev_suspend(mddev);
2626 		err = mddev->pers->hot_add_disk(mddev, rdev);
2627 		if (add_journal)
2628 			mddev_resume(mddev);
2629 		if (err) {
2630 			md_kick_rdev_from_array(rdev);
2631 			return err;
2632 		}
2633 	}
2634 	sysfs_notify_dirent_safe(rdev->sysfs_state);
2635 
2636 	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2637 	if (mddev->degraded)
2638 		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
2639 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2640 	md_new_event(mddev);
2641 	md_wakeup_thread(mddev->thread);
2642 	return 0;
2643 }
2644 
/* Words written to sysfs files may, or may not, be \n terminated.
 * We want to accept either case.  For this we use cmd_match.
 *
 * Returns 1 when cmd equals str, optionally followed by exactly one
 * trailing newline, and 0 otherwise.
 */
static int cmd_match(const char *cmd, const char *str)
{
	size_t i = 0;

	/* walk the common prefix */
	while (str[i] != '\0' && cmd[i] == str[i])
		i++;

	/* str must have been consumed completely */
	if (str[i] != '\0')
		return 0;

	/* tolerate a single newline after the word ... */
	if (cmd[i] == '\n')
		i++;

	/* ... but nothing else */
	return cmd[i] == '\0';
}
2664 
/*
 * One sysfs attribute of a component device.
 */
struct rdev_sysfs_entry {
	struct attribute attr;
	/* format the attribute value for @rdev into the page buffer; returns the length */
	ssize_t (*show)(struct md_rdev *, char *);
	/* parse the written buffer of the given length; returns the length or a negative errno */
	ssize_t (*store)(struct md_rdev *, const char *, size_t);
};
2670 
2671 static ssize_t
2672 state_show(struct md_rdev *rdev, char *page)
2673 {
2674 	char *sep = ",";
2675 	size_t len = 0;
2676 	unsigned long flags = READ_ONCE(rdev->flags);
2677 
2678 	if (test_bit(Faulty, &flags) ||
2679 	    (!test_bit(ExternalBbl, &flags) &&
2680 	    rdev->badblocks.unacked_exist))
2681 		len += sprintf(page+len, "faulty%s", sep);
2682 	if (test_bit(In_sync, &flags))
2683 		len += sprintf(page+len, "in_sync%s", sep);
2684 	if (test_bit(Journal, &flags))
2685 		len += sprintf(page+len, "journal%s", sep);
2686 	if (test_bit(WriteMostly, &flags))
2687 		len += sprintf(page+len, "write_mostly%s", sep);
2688 	if (test_bit(Blocked, &flags) ||
2689 	    (rdev->badblocks.unacked_exist
2690 	     && !test_bit(Faulty, &flags)))
2691 		len += sprintf(page+len, "blocked%s", sep);
2692 	if (!test_bit(Faulty, &flags) &&
2693 	    !test_bit(Journal, &flags) &&
2694 	    !test_bit(In_sync, &flags))
2695 		len += sprintf(page+len, "spare%s", sep);
2696 	if (test_bit(WriteErrorSeen, &flags))
2697 		len += sprintf(page+len, "write_error%s", sep);
2698 	if (test_bit(WantReplacement, &flags))
2699 		len += sprintf(page+len, "want_replacement%s", sep);
2700 	if (test_bit(Replacement, &flags))
2701 		len += sprintf(page+len, "replacement%s", sep);
2702 	if (test_bit(ExternalBbl, &flags))
2703 		len += sprintf(page+len, "external_bbl%s", sep);
2704 	if (test_bit(FailFast, &flags))
2705 		len += sprintf(page+len, "failfast%s", sep);
2706 
2707 	if (len)
2708 		len -= strlen(sep);
2709 
2710 	return len+sprintf(page+len, "\n");
2711 }
2712 
/*
 * Handle a write to a component device's "state" attribute.
 *
 * @buf holds one command word, optionally newline terminated (see
 * cmd_match()).  Returns @len on success.  An unrecognised word, or a
 * recognised word whose extra precondition in the if-chain is not met,
 * yields -EINVAL; individual commands may return other errors as noted
 * below.  On success the device's sysfs state entry is notified.
 */
static ssize_t
state_store(struct md_rdev *rdev, const char *buf, size_t len)
{
	/* can write
	 *  faulty  - simulates an error
	 *  remove  - disconnects the device
	 *  writemostly - sets write_mostly
	 *  -writemostly - clears write_mostly
	 *  blocked - sets the Blocked flags
	 *  -blocked - clears the Blocked and possibly simulates an error
	 *  insync - sets Insync providing device isn't active
	 *  -insync - clear Insync for a device with a slot assigned,
	 *            so that it gets rebuilt based on bitmap
	 *  write_error - sets WriteErrorSeen
	 *  -write_error - clears WriteErrorSeen
	 *  {,-}failfast - set/clear FailFast
	 *  {,-}want_replacement - set/clear WantReplacement
	 *  {,-}replacement - set/clear Replacement (array not running only)
	 *  re-add - re-add a faulty device that lost its slot
	 *  {,-}external_bbl - set/clear ExternalBbl (external metadata only)
	 */
	int err = -EINVAL;
	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
		md_error(rdev->mddev, rdev);
		/* md_error() may leave the device working; report that as busy */
		if (test_bit(Faulty, &rdev->flags))
			err = 0;
		else
			err = -EBUSY;
	} else if (cmd_match(buf, "remove")) {
		if (rdev->mddev->pers) {
			clear_bit(Blocked, &rdev->flags);
			remove_and_add_spares(rdev->mddev, rdev);
		}
		/* still holding a slot: the device cannot be removed */
		if (rdev->raid_disk >= 0)
			err = -EBUSY;
		else {
			/* keep the array pointer: kicking the rdev clears rdev->mddev */
			struct mddev *mddev = rdev->mddev;
			err = 0;
			if (mddev_is_clustered(mddev))
				err = md_cluster_ops->remove_disk(mddev, rdev);

			if (err == 0) {
				md_kick_rdev_from_array(rdev);
				if (mddev->pers) {
					set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
					md_wakeup_thread(mddev->thread);
				}
				md_new_event(mddev);
			}
		}
	} else if (cmd_match(buf, "writemostly")) {
		set_bit(WriteMostly, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "-writemostly")) {
		clear_bit(WriteMostly, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "blocked")) {
		set_bit(Blocked, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "-blocked")) {
		if (!test_bit(Faulty, &rdev->flags) &&
		    !test_bit(ExternalBbl, &rdev->flags) &&
		    rdev->badblocks.unacked_exist) {
			/* metadata handler doesn't understand badblocks,
			 * so we need to fail the device
			 */
			md_error(rdev->mddev, rdev);
		}
		clear_bit(Blocked, &rdev->flags);
		clear_bit(BlockedBadBlocks, &rdev->flags);
		wake_up(&rdev->blocked_wait);
		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
		md_wakeup_thread(rdev->mddev->thread);

		err = 0;
	} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
		set_bit(In_sync, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "failfast")) {
		set_bit(FailFast, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "-failfast")) {
		clear_bit(FailFast, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
		   !test_bit(Journal, &rdev->flags)) {
		/* only honoured while the array is not running; otherwise -EINVAL */
		if (rdev->mddev->pers == NULL) {
			clear_bit(In_sync, &rdev->flags);
			rdev->saved_raid_disk = rdev->raid_disk;
			rdev->raid_disk = -1;
			err = 0;
		}
	} else if (cmd_match(buf, "write_error")) {
		set_bit(WriteErrorSeen, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "-write_error")) {
		clear_bit(WriteErrorSeen, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "want_replacement")) {
		/* Any non-spare device that is not a replacement can
		 * become want_replacement at any time, but we then need to
		 * check if recovery is needed.
		 */
		if (rdev->raid_disk >= 0 &&
		    !test_bit(Journal, &rdev->flags) &&
		    !test_bit(Replacement, &rdev->flags))
			set_bit(WantReplacement, &rdev->flags);
		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
		md_wakeup_thread(rdev->mddev->thread);
		err = 0;
	} else if (cmd_match(buf, "-want_replacement")) {
		/* Clearing 'want_replacement' is always allowed.
		 * Once replacements starts it is too late though.
		 */
		err = 0;
		clear_bit(WantReplacement, &rdev->flags);
	} else if (cmd_match(buf, "replacement")) {
		/* Can only set a device as a replacement when array has not
		 * yet been started.  Once running, replacement is automatic
		 * from spares, or by assigning 'slot'.
		 */
		if (rdev->mddev->pers)
			err = -EBUSY;
		else {
			set_bit(Replacement, &rdev->flags);
			err = 0;
		}
	} else if (cmd_match(buf, "-replacement")) {
		/* Similarly, can only clear Replacement before start */
		if (rdev->mddev->pers)
			err = -EBUSY;
		else {
			clear_bit(Replacement, &rdev->flags);
			err = 0;
		}
	} else if (cmd_match(buf, "re-add")) {
		if (!rdev->mddev->pers)
			err = -EINVAL;
		else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) &&
				rdev->saved_raid_disk >= 0) {
			/* clear_bit is performed _after_ all the devices
			 * have their local Faulty bit cleared. If any writes
			 * happen in the meantime in the local node, they
			 * will land in the local bitmap, which will be synced
			 * by this node eventually
			 */
			if (!mddev_is_clustered(rdev->mddev) ||
			    (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) {
				clear_bit(Faulty, &rdev->flags);
				err = add_bound_rdev(rdev);
			}
		} else
			err = -EBUSY;
	} else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) {
		set_bit(ExternalBbl, &rdev->flags);
		rdev->badblocks.shift = 0;
		err = 0;
	} else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) {
		clear_bit(ExternalBbl, &rdev->flags);
		err = 0;
	}
	if (!err)
		sysfs_notify_dirent_safe(rdev->sysfs_state);
	return err ? err : len;
}
/* the "state" attribute: readable by everyone, writable by the owner */
static struct rdev_sysfs_entry rdev_state =
__ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store);
2876 
2877 static ssize_t
2878 errors_show(struct md_rdev *rdev, char *page)
2879 {
2880 	return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
2881 }
2882 
2883 static ssize_t
2884 errors_store(struct md_rdev *rdev, const char *buf, size_t len)
2885 {
2886 	unsigned int n;
2887 	int rv;
2888 
2889 	rv = kstrtouint(buf, 10, &n);
2890 	if (rv < 0)
2891 		return rv;
2892 	atomic_set(&rdev->corrected_errors, n);
2893 	return len;
2894 }
2895 static struct rdev_sysfs_entry rdev_errors =
2896 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
2897 
2898 static ssize_t
2899 slot_show(struct md_rdev *rdev, char *page)
2900 {
2901 	if (test_bit(Journal, &rdev->flags))
2902 		return sprintf(page, "journal\n");
2903 	else if (rdev->raid_disk < 0)
2904 		return sprintf(page, "none\n");
2905 	else
2906 		return sprintf(page, "%d\n", rdev->raid_disk);
2907 }
2908 
2909 static ssize_t
2910 slot_store(struct md_rdev *rdev, const char *buf, size_t len)
2911 {
2912 	int slot;
2913 	int err;
2914 
2915 	if (test_bit(Journal, &rdev->flags))
2916 		return -EBUSY;
2917 	if (strncmp(buf, "none", 4)==0)
2918 		slot = -1;
2919 	else {
2920 		err = kstrtouint(buf, 10, (unsigned int *)&slot);
2921 		if (err < 0)
2922 			return err;
2923 	}
2924 	if (rdev->mddev->pers && slot == -1) {
2925 		/* Setting 'slot' on an active array requires also
2926 		 * updating the 'rd%d' link, and communicating
2927 		 * with the personality with ->hot_*_disk.
2928 		 * For now we only support removing
2929 		 * failed/spare devices.  This normally happens automatically,
2930 		 * but not when the metadata is externally managed.
2931 		 */
2932 		if (rdev->raid_disk == -1)
2933 			return -EEXIST;
2934 		/* personality does all needed checks */
2935 		if (rdev->mddev->pers->hot_remove_disk == NULL)
2936 			return -EINVAL;
2937 		clear_bit(Blocked, &rdev->flags);
2938 		remove_and_add_spares(rdev->mddev, rdev);
2939 		if (rdev->raid_disk >= 0)
2940 			return -EBUSY;
2941 		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2942 		md_wakeup_thread(rdev->mddev->thread);
2943 	} else if (rdev->mddev->pers) {
2944 		/* Activating a spare .. or possibly reactivating
2945 		 * if we ever get bitmaps working here.
2946 		 */
2947 		int err;
2948 
2949 		if (rdev->raid_disk != -1)
2950 			return -EBUSY;
2951 
2952 		if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
2953 			return -EBUSY;
2954 
2955 		if (rdev->mddev->pers->hot_add_disk == NULL)
2956 			return -EINVAL;
2957 
2958 		if (slot >= rdev->mddev->raid_disks &&
2959 		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
2960 			return -ENOSPC;
2961 
2962 		rdev->raid_disk = slot;
2963 		if (test_bit(In_sync, &rdev->flags))
2964 			rdev->saved_raid_disk = slot;
2965 		else
2966 			rdev->saved_raid_disk = -1;
2967 		clear_bit(In_sync, &rdev->flags);
2968 		clear_bit(Bitmap_sync, &rdev->flags);
2969 		err = rdev->mddev->pers->
2970 			hot_add_disk(rdev->mddev, rdev);
2971 		if (err) {
2972 			rdev->raid_disk = -1;
2973 			return err;
2974 		} else
2975 			sysfs_notify_dirent_safe(rdev->sysfs_state);
2976 		if (sysfs_link_rdev(rdev->mddev, rdev))
2977 			/* failure here is OK */;
2978 		/* don't wakeup anyone, leave that to userspace. */
2979 	} else {
2980 		if (slot >= rdev->mddev->raid_disks &&
2981 		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
2982 			return -ENOSPC;
2983 		rdev->raid_disk = slot;
2984 		/* assume it is working */
2985 		clear_bit(Faulty, &rdev->flags);
2986 		clear_bit(WriteMostly, &rdev->flags);
2987 		set_bit(In_sync, &rdev->flags);
2988 		sysfs_notify_dirent_safe(rdev->sysfs_state);
2989 	}
2990 	return len;
2991 }
2992 
2993 static struct rdev_sysfs_entry rdev_slot =
2994 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
2995 
2996 static ssize_t
2997 offset_show(struct md_rdev *rdev, char *page)
2998 {
2999 	return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
3000 }
3001 
3002 static ssize_t
3003 offset_store(struct md_rdev *rdev, const char *buf, size_t len)
3004 {
3005 	unsigned long long offset;
3006 	if (kstrtoull(buf, 10, &offset) < 0)
3007 		return -EINVAL;
3008 	if (rdev->mddev->pers && rdev->raid_disk >= 0)
3009 		return -EBUSY;
3010 	if (rdev->sectors && rdev->mddev->external)
3011 		/* Must set offset before size, so overlap checks
3012 		 * can be sane */
3013 		return -EBUSY;
3014 	rdev->data_offset = offset;
3015 	rdev->new_data_offset = offset;
3016 	return len;
3017 }
3018 
3019 static struct rdev_sysfs_entry rdev_offset =
3020 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
3021 
3022 static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
3023 {
3024 	return sprintf(page, "%llu\n",
3025 		       (unsigned long long)rdev->new_data_offset);
3026 }
3027 
3028 static ssize_t new_offset_store(struct md_rdev *rdev,
3029 				const char *buf, size_t len)
3030 {
3031 	unsigned long long new_offset;
3032 	struct mddev *mddev = rdev->mddev;
3033 
3034 	if (kstrtoull(buf, 10, &new_offset) < 0)
3035 		return -EINVAL;
3036 
3037 	if (mddev->sync_thread ||
3038 	    test_bit(MD_RECOVERY_RUNNING,&mddev->recovery))
3039 		return -EBUSY;
3040 	if (new_offset == rdev->data_offset)
3041 		/* reset is always permitted */
3042 		;
3043 	else if (new_offset > rdev->data_offset) {
3044 		/* must not push array size beyond rdev_sectors */
3045 		if (new_offset - rdev->data_offset
3046 		    + mddev->dev_sectors > rdev->sectors)
3047 				return -E2BIG;
3048 	}
3049 	/* Metadata worries about other space details. */
3050 
3051 	/* decreasing the offset is inconsistent with a backwards
3052 	 * reshape.
3053 	 */
3054 	if (new_offset < rdev->data_offset &&
3055 	    mddev->reshape_backwards)
3056 		return -EINVAL;
3057 	/* Increasing offset is inconsistent with forwards
3058 	 * reshape.  reshape_direction should be set to
3059 	 * 'backwards' first.
3060 	 */
3061 	if (new_offset > rdev->data_offset &&
3062 	    !mddev->reshape_backwards)
3063 		return -EINVAL;
3064 
3065 	if (mddev->pers && mddev->persistent &&
3066 	    !super_types[mddev->major_version]
3067 	    .allow_new_offset(rdev, new_offset))
3068 		return -E2BIG;
3069 	rdev->new_data_offset = new_offset;
3070 	if (new_offset > rdev->data_offset)
3071 		mddev->reshape_backwards = 1;
3072 	else if (new_offset < rdev->data_offset)
3073 		mddev->reshape_backwards = 0;
3074 
3075 	return len;
3076 }
3077 static struct rdev_sysfs_entry rdev_new_offset =
3078 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
3079 
3080 static ssize_t
3081 rdev_size_show(struct md_rdev *rdev, char *page)
3082 {
3083 	return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
3084 }
3085 
3086 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
3087 {
3088 	/* check if two start/length pairs overlap */
3089 	if (s1+l1 <= s2)
3090 		return 0;
3091 	if (s2+l2 <= s1)
3092 		return 0;
3093 	return 1;
3094 }
3095 
3096 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
3097 {
3098 	unsigned long long blocks;
3099 	sector_t new;
3100 
3101 	if (kstrtoull(buf, 10, &blocks) < 0)
3102 		return -EINVAL;
3103 
3104 	if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
3105 		return -EINVAL; /* sector conversion overflow */
3106 
3107 	new = blocks * 2;
3108 	if (new != blocks * 2)
3109 		return -EINVAL; /* unsigned long long to sector_t overflow */
3110 
3111 	*sectors = new;
3112 	return 0;
3113 }
3114 
3115 static ssize_t
3116 rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3117 {
3118 	struct mddev *my_mddev = rdev->mddev;
3119 	sector_t oldsectors = rdev->sectors;
3120 	sector_t sectors;
3121 
3122 	if (test_bit(Journal, &rdev->flags))
3123 		return -EBUSY;
3124 	if (strict_blocks_to_sectors(buf, &sectors) < 0)
3125 		return -EINVAL;
3126 	if (rdev->data_offset != rdev->new_data_offset)
3127 		return -EINVAL; /* too confusing */
3128 	if (my_mddev->pers && rdev->raid_disk >= 0) {
3129 		if (my_mddev->persistent) {
3130 			sectors = super_types[my_mddev->major_version].
3131 				rdev_size_change(rdev, sectors);
3132 			if (!sectors)
3133 				return -EBUSY;
3134 		} else if (!sectors)
3135 			sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
3136 				rdev->data_offset;
3137 		if (!my_mddev->pers->resize)
3138 			/* Cannot change size for RAID0 or Linear etc */
3139 			return -EINVAL;
3140 	}
3141 	if (sectors < my_mddev->dev_sectors)
3142 		return -EINVAL; /* component must fit device */
3143 
3144 	rdev->sectors = sectors;
3145 	if (sectors > oldsectors && my_mddev->external) {
3146 		/* Need to check that all other rdevs with the same
3147 		 * ->bdev do not overlap.  'rcu' is sufficient to walk
3148 		 * the rdev lists safely.
3149 		 * This check does not provide a hard guarantee, it
3150 		 * just helps avoid dangerous mistakes.
3151 		 */
3152 		struct mddev *mddev;
3153 		int overlap = 0;
3154 		struct list_head *tmp;
3155 
3156 		rcu_read_lock();
3157 		for_each_mddev(mddev, tmp) {
3158 			struct md_rdev *rdev2;
3159 
3160 			rdev_for_each(rdev2, mddev)
3161 				if (rdev->bdev == rdev2->bdev &&
3162 				    rdev != rdev2 &&
3163 				    overlaps(rdev->data_offset, rdev->sectors,
3164 					     rdev2->data_offset,
3165 					     rdev2->sectors)) {
3166 					overlap = 1;
3167 					break;
3168 				}
3169 			if (overlap) {
3170 				mddev_put(mddev);
3171 				break;
3172 			}
3173 		}
3174 		rcu_read_unlock();
3175 		if (overlap) {
3176 			/* Someone else could have slipped in a size
3177 			 * change here, but doing so is just silly.
3178 			 * We put oldsectors back because we *know* it is
3179 			 * safe, and trust userspace not to race with
3180 			 * itself
3181 			 */
3182 			rdev->sectors = oldsectors;
3183 			return -EBUSY;
3184 		}
3185 	}
3186 	return len;
3187 }
3188 
3189 static struct rdev_sysfs_entry rdev_size =
3190 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
3191 
3192 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page)
3193 {
3194 	unsigned long long recovery_start = rdev->recovery_offset;
3195 
3196 	if (test_bit(In_sync, &rdev->flags) ||
3197 	    recovery_start == MaxSector)
3198 		return sprintf(page, "none\n");
3199 
3200 	return sprintf(page, "%llu\n", recovery_start);
3201 }
3202 
3203 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len)
3204 {
3205 	unsigned long long recovery_start;
3206 
3207 	if (cmd_match(buf, "none"))
3208 		recovery_start = MaxSector;
3209 	else if (kstrtoull(buf, 10, &recovery_start))
3210 		return -EINVAL;
3211 
3212 	if (rdev->mddev->pers &&
3213 	    rdev->raid_disk >= 0)
3214 		return -EBUSY;
3215 
3216 	rdev->recovery_offset = recovery_start;
3217 	if (recovery_start == MaxSector)
3218 		set_bit(In_sync, &rdev->flags);
3219 	else
3220 		clear_bit(In_sync, &rdev->flags);
3221 	return len;
3222 }
3223 
3224 static struct rdev_sysfs_entry rdev_recovery_start =
3225 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
3226 
3227 /* sysfs access to bad-blocks list.
3228  * We present two files.
3229  * 'bad-blocks' lists sector numbers and lengths of ranges that
3230  *    are recorded as bad.  The list is truncated to fit within
3231  *    the one-page limit of sysfs.
3232  *    Writing "sector length" to this file adds an acknowledged
3233  *    bad block list.
3234  * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
3235  *    been acknowledged.  Writing to this file adds bad blocks
3236  *    without acknowledging them.  This is largely for testing.
3237  */
3238 static ssize_t bb_show(struct md_rdev *rdev, char *page)
3239 {
3240 	return badblocks_show(&rdev->badblocks, page, 0);
3241 }
3242 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len)
3243 {
3244 	int rv = badblocks_store(&rdev->badblocks, page, len, 0);
3245 	/* Maybe that ack was all we needed */
3246 	if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
3247 		wake_up(&rdev->blocked_wait);
3248 	return rv;
3249 }
3250 static struct rdev_sysfs_entry rdev_bad_blocks =
3251 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
3252 
3253 static ssize_t ubb_show(struct md_rdev *rdev, char *page)
3254 {
3255 	return badblocks_show(&rdev->badblocks, page, 1);
3256 }
3257 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
3258 {
3259 	return badblocks_store(&rdev->badblocks, page, len, 1);
3260 }
3261 static struct rdev_sysfs_entry rdev_unack_bad_blocks =
3262 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
3263 
3264 static ssize_t
3265 ppl_sector_show(struct md_rdev *rdev, char *page)
3266 {
3267 	return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector);
3268 }
3269 
3270 static ssize_t
3271 ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len)
3272 {
3273 	unsigned long long sector;
3274 
3275 	if (kstrtoull(buf, 10, &sector) < 0)
3276 		return -EINVAL;
3277 	if (sector != (sector_t)sector)
3278 		return -EINVAL;
3279 
3280 	if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3281 	    rdev->raid_disk >= 0)
3282 		return -EBUSY;
3283 
3284 	if (rdev->mddev->persistent) {
3285 		if (rdev->mddev->major_version == 0)
3286 			return -EINVAL;
3287 		if ((sector > rdev->sb_start &&
3288 		     sector - rdev->sb_start > S16_MAX) ||
3289 		    (sector < rdev->sb_start &&
3290 		     rdev->sb_start - sector > -S16_MIN))
3291 			return -EINVAL;
3292 		rdev->ppl.offset = sector - rdev->sb_start;
3293 	} else if (!rdev->mddev->external) {
3294 		return -EBUSY;
3295 	}
3296 	rdev->ppl.sector = sector;
3297 	return len;
3298 }
3299 
3300 static struct rdev_sysfs_entry rdev_ppl_sector =
3301 __ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store);
3302 
3303 static ssize_t
3304 ppl_size_show(struct md_rdev *rdev, char *page)
3305 {
3306 	return sprintf(page, "%u\n", rdev->ppl.size);
3307 }
3308 
3309 static ssize_t
3310 ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3311 {
3312 	unsigned int size;
3313 
3314 	if (kstrtouint(buf, 10, &size) < 0)
3315 		return -EINVAL;
3316 
3317 	if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3318 	    rdev->raid_disk >= 0)
3319 		return -EBUSY;
3320 
3321 	if (rdev->mddev->persistent) {
3322 		if (rdev->mddev->major_version == 0)
3323 			return -EINVAL;
3324 		if (size > U16_MAX)
3325 			return -EINVAL;
3326 	} else if (!rdev->mddev->external) {
3327 		return -EBUSY;
3328 	}
3329 	rdev->ppl.size = size;
3330 	return len;
3331 }
3332 
3333 static struct rdev_sysfs_entry rdev_ppl_size =
3334 __ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store);
3335 
3336 static struct attribute *rdev_default_attrs[] = {
3337 	&rdev_state.attr,
3338 	&rdev_errors.attr,
3339 	&rdev_slot.attr,
3340 	&rdev_offset.attr,
3341 	&rdev_new_offset.attr,
3342 	&rdev_size.attr,
3343 	&rdev_recovery_start.attr,
3344 	&rdev_bad_blocks.attr,
3345 	&rdev_unack_bad_blocks.attr,
3346 	&rdev_ppl_sector.attr,
3347 	&rdev_ppl_size.attr,
3348 	NULL,
3349 };
3350 static ssize_t
3351 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3352 {
3353 	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3354 	struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3355 
3356 	if (!entry->show)
3357 		return -EIO;
3358 	if (!rdev->mddev)
3359 		return -EBUSY;
3360 	return entry->show(rdev, page);
3361 }
3362 
3363 static ssize_t
3364 rdev_attr_store(struct kobject *kobj, struct attribute *attr,
3365 	      const char *page, size_t length)
3366 {
3367 	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3368 	struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3369 	ssize_t rv;
3370 	struct mddev *mddev = rdev->mddev;
3371 
3372 	if (!entry->store)
3373 		return -EIO;
3374 	if (!capable(CAP_SYS_ADMIN))
3375 		return -EACCES;
3376 	rv = mddev ? mddev_lock(mddev) : -ENODEV;
3377 	if (!rv) {
3378 		if (rdev->mddev == NULL)
3379 			rv = -ENODEV;
3380 		else
3381 			rv = entry->store(rdev, page, length);
3382 		mddev_unlock(mddev);
3383 	}
3384 	return rv;
3385 }
3386 
3387 static void rdev_free(struct kobject *ko)
3388 {
3389 	struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj);
3390 	kfree(rdev);
3391 }
3392 static const struct sysfs_ops rdev_sysfs_ops = {
3393 	.show		= rdev_attr_show,
3394 	.store		= rdev_attr_store,
3395 };
3396 static struct kobj_type rdev_ktype = {
3397 	.release	= rdev_free,
3398 	.sysfs_ops	= &rdev_sysfs_ops,
3399 	.default_attrs	= rdev_default_attrs,
3400 };
3401 
3402 int md_rdev_init(struct md_rdev *rdev)
3403 {
3404 	rdev->desc_nr = -1;
3405 	rdev->saved_raid_disk = -1;
3406 	rdev->raid_disk = -1;
3407 	rdev->flags = 0;
3408 	rdev->data_offset = 0;
3409 	rdev->new_data_offset = 0;
3410 	rdev->sb_events = 0;
3411 	rdev->last_read_error = 0;
3412 	rdev->sb_loaded = 0;
3413 	rdev->bb_page = NULL;
3414 	atomic_set(&rdev->nr_pending, 0);
3415 	atomic_set(&rdev->read_errors, 0);
3416 	atomic_set(&rdev->corrected_errors, 0);
3417 
3418 	INIT_LIST_HEAD(&rdev->same_set);
3419 	init_waitqueue_head(&rdev->blocked_wait);
3420 
3421 	/* Add space to store bad block list.
3422 	 * This reserves the space even on arrays where it cannot
3423 	 * be used - I wonder if that matters
3424 	 */
3425 	return badblocks_init(&rdev->badblocks, 0);
3426 }
3427 EXPORT_SYMBOL_GPL(md_rdev_init);
3428 /*
3429  * Import a device. If 'super_format' >= 0, then sanity check the superblock
3430  *
3431  * mark the device faulty if:
3432  *
3433  *   - the device is nonexistent (zero size)
3434  *   - the device has no valid superblock
3435  *
3436  * a faulty rdev _never_ has rdev->sb set.
3437  */
3438 static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
3439 {
3440 	char b[BDEVNAME_SIZE];
3441 	int err;
3442 	struct md_rdev *rdev;
3443 	sector_t size;
3444 
3445 	rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
3446 	if (!rdev)
3447 		return ERR_PTR(-ENOMEM);
3448 
3449 	err = md_rdev_init(rdev);
3450 	if (err)
3451 		goto abort_free;
3452 	err = alloc_disk_sb(rdev);
3453 	if (err)
3454 		goto abort_free;
3455 
3456 	err = lock_rdev(rdev, newdev, super_format == -2);
3457 	if (err)
3458 		goto abort_free;
3459 
3460 	kobject_init(&rdev->kobj, &rdev_ktype);
3461 
3462 	size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS;
3463 	if (!size) {
3464 		pr_warn("md: %s has zero or unknown size, marking faulty!\n",
3465 			bdevname(rdev->bdev,b));
3466 		err = -EINVAL;
3467 		goto abort_free;
3468 	}
3469 
3470 	if (super_format >= 0) {
3471 		err = super_types[super_format].
3472 			load_super(rdev, NULL, super_minor);
3473 		if (err == -EINVAL) {
3474 			pr_warn("md: %s does not have a valid v%d.%d superblock, not importing!\n",
3475 				bdevname(rdev->bdev,b),
3476 				super_format, super_minor);
3477 			goto abort_free;
3478 		}
3479 		if (err < 0) {
3480 			pr_warn("md: could not read %s's sb, not importing!\n",
3481 				bdevname(rdev->bdev,b));
3482 			goto abort_free;
3483 		}
3484 	}
3485 
3486 	return rdev;
3487 
3488 abort_free:
3489 	if (rdev->bdev)
3490 		unlock_rdev(rdev);
3491 	md_rdev_clear(rdev);
3492 	kfree(rdev);
3493 	return ERR_PTR(err);
3494 }
3495 
3496 /*
3497  * Check a full RAID array for plausibility
3498  */
3499 
3500 static void analyze_sbs(struct mddev *mddev)
3501 {
3502 	int i;
3503 	struct md_rdev *rdev, *freshest, *tmp;
3504 	char b[BDEVNAME_SIZE];
3505 
3506 	freshest = NULL;
3507 	rdev_for_each_safe(rdev, tmp, mddev)
3508 		switch (super_types[mddev->major_version].
3509 			load_super(rdev, freshest, mddev->minor_version)) {
3510 		case 1:
3511 			freshest = rdev;
3512 			break;
3513 		case 0:
3514 			break;
3515 		default:
3516 			pr_warn("md: fatal superblock inconsistency in %s -- removing from array\n",
3517 				bdevname(rdev->bdev,b));
3518 			md_kick_rdev_from_array(rdev);
3519 		}
3520 
3521 	super_types[mddev->major_version].
3522 		validate_super(mddev, freshest);
3523 
3524 	i = 0;
3525 	rdev_for_each_safe(rdev, tmp, mddev) {
3526 		if (mddev->max_disks &&
3527 		    (rdev->desc_nr >= mddev->max_disks ||
3528 		     i > mddev->max_disks)) {
3529 			pr_warn("md: %s: %s: only %d devices permitted\n",
3530 				mdname(mddev), bdevname(rdev->bdev, b),
3531 				mddev->max_disks);
3532 			md_kick_rdev_from_array(rdev);
3533 			continue;
3534 		}
3535 		if (rdev != freshest) {
3536 			if (super_types[mddev->major_version].
3537 			    validate_super(mddev, rdev)) {
3538 				pr_warn("md: kicking non-fresh %s from array!\n",
3539 					bdevname(rdev->bdev,b));
3540 				md_kick_rdev_from_array(rdev);
3541 				continue;
3542 			}
3543 		}
3544 		if (mddev->level == LEVEL_MULTIPATH) {
3545 			rdev->desc_nr = i++;
3546 			rdev->raid_disk = rdev->desc_nr;
3547 			set_bit(In_sync, &rdev->flags);
3548 		} else if (rdev->raid_disk >=
3549 			    (mddev->raid_disks - min(0, mddev->delta_disks)) &&
3550 			   !test_bit(Journal, &rdev->flags)) {
3551 			rdev->raid_disk = -1;
3552 			clear_bit(In_sync, &rdev->flags);
3553 		}
3554 	}
3555 }
3556 
/* Read a fixed-point number.
 * Numbers in sysfs attributes should be in "standard" units where
 * possible, so time should be in seconds.
 * However we internally use a much smaller unit such as
 * milliseconds or jiffies.
 * This function takes a decimal number with a possible fractional
 * component, and produces an integer which is the result of
 * multiplying that number by 10^'scale'.
 * all without any floating-point arithmetic.
 *
 * Fractional digits beyond 'scale' are dropped.  One trailing newline
 * is accepted; any other trailing character yields -EINVAL.
 * Returns 0 on success with the scaled value stored in *res.
 */
int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
{
	unsigned long acc = 0;
	long frac_digits = -1;	/* -1 until a decimal point is seen */

	for (; isdigit(*cp) || (*cp == '.' && frac_digits < 0); cp++) {
		if (*cp == '.') {
			frac_digits = 0;
			continue;
		}
		/* already have all the precision asked for: drop the digit */
		if (frac_digits >= scale)
			continue;

		acc = acc * 10 + (unsigned int)(*cp - '0');
		if (frac_digits >= 0)
			frac_digits++;
	}

	if (*cp == '\n')
		cp++;
	if (*cp != '\0')
		return -EINVAL;

	if (frac_digits < 0)
		frac_digits = 0;
	/* pad with zeros up to the requested number of decimals */
	for (; frac_digits < scale; frac_digits++)
		acc *= 10;

	*res = acc;
	return 0;
}
3596 
3597 static ssize_t
3598 safe_delay_show(struct mddev *mddev, char *page)
3599 {
3600 	int msec = (mddev->safemode_delay*1000)/HZ;
3601 	return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
3602 }
3603 static ssize_t
3604 safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
3605 {
3606 	unsigned long msec;
3607 
3608 	if (mddev_is_clustered(mddev)) {
3609 		pr_warn("md: Safemode is disabled for clustered mode\n");
3610 		return -EINVAL;
3611 	}
3612 
3613 	if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
3614 		return -EINVAL;
3615 	if (msec == 0)
3616 		mddev->safemode_delay = 0;
3617 	else {
3618 		unsigned long old_delay = mddev->safemode_delay;
3619 		unsigned long new_delay = (msec*HZ)/1000;
3620 
3621 		if (new_delay == 0)
3622 			new_delay = 1;
3623 		mddev->safemode_delay = new_delay;
3624 		if (new_delay < old_delay || old_delay == 0)
3625 			mod_timer(&mddev->safemode_timer, jiffies+1);
3626 	}
3627 	return len;
3628 }
3629 static struct md_sysfs_entry md_safe_delay =
3630 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
3631 
3632 static ssize_t
3633 level_show(struct mddev *mddev, char *page)
3634 {
3635 	struct md_personality *p;
3636 	int ret;
3637 	spin_lock(&mddev->lock);
3638 	p = mddev->pers;
3639 	if (p)
3640 		ret = sprintf(page, "%s\n", p->name);
3641 	else if (mddev->clevel[0])
3642 		ret = sprintf(page, "%s\n", mddev->clevel);
3643 	else if (mddev->level != LEVEL_NONE)
3644 		ret = sprintf(page, "%d\n", mddev->level);
3645 	else
3646 		ret = 0;
3647 	spin_unlock(&mddev->lock);
3648 	return ret;
3649 }
3650 
3651 static ssize_t
3652 level_store(struct mddev *mddev, const char *buf, size_t len)
3653 {
3654 	char clevel[16];
3655 	ssize_t rv;
3656 	size_t slen = len;
3657 	struct md_personality *pers, *oldpers;
3658 	long level;
3659 	void *priv, *oldpriv;
3660 	struct md_rdev *rdev;
3661 
3662 	if (slen == 0 || slen >= sizeof(clevel))
3663 		return -EINVAL;
3664 
3665 	rv = mddev_lock(mddev);
3666 	if (rv)
3667 		return rv;
3668 
3669 	if (mddev->pers == NULL) {
3670 		strncpy(mddev->clevel, buf, slen);
3671 		if (mddev->clevel[slen-1] == '\n')
3672 			slen--;
3673 		mddev->clevel[slen] = 0;
3674 		mddev->level = LEVEL_NONE;
3675 		rv = len;
3676 		goto out_unlock;
3677 	}
3678 	rv = -EROFS;
3679 	if (mddev->ro)
3680 		goto out_unlock;
3681 
3682 	/* request to change the personality.  Need to ensure:
3683 	 *  - array is not engaged in resync/recovery/reshape
3684 	 *  - old personality can be suspended
3685 	 *  - new personality will access other array.
3686 	 */
3687 
3688 	rv = -EBUSY;
3689 	if (mddev->sync_thread ||
3690 	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3691 	    mddev->reshape_position != MaxSector ||
3692 	    mddev->sysfs_active)
3693 		goto out_unlock;
3694 
3695 	rv = -EINVAL;
3696 	if (!mddev->pers->quiesce) {
3697 		pr_warn("md: %s: %s does not support online personality change\n",
3698 			mdname(mddev), mddev->pers->name);
3699 		goto out_unlock;
3700 	}
3701 
3702 	/* Now find the new personality */
3703 	strncpy(clevel, buf, slen);
3704 	if (clevel[slen-1] == '\n')
3705 		slen--;
3706 	clevel[slen] = 0;
3707 	if (kstrtol(clevel, 10, &level))
3708 		level = LEVEL_NONE;
3709 
3710 	if (request_module("md-%s", clevel) != 0)
3711 		request_module("md-level-%s", clevel);
3712 	spin_lock(&pers_lock);
3713 	pers = find_pers(level, clevel);
3714 	if (!pers || !try_module_get(pers->owner)) {
3715 		spin_unlock(&pers_lock);
3716 		pr_warn("md: personality %s not loaded\n", clevel);
3717 		rv = -EINVAL;
3718 		goto out_unlock;
3719 	}
3720 	spin_unlock(&pers_lock);
3721 
3722 	if (pers == mddev->pers) {
3723 		/* Nothing to do! */
3724 		module_put(pers->owner);
3725 		rv = len;
3726 		goto out_unlock;
3727 	}
3728 	if (!pers->takeover) {
3729 		module_put(pers->owner);
3730 		pr_warn("md: %s: %s does not support personality takeover\n",
3731 			mdname(mddev), clevel);
3732 		rv = -EINVAL;
3733 		goto out_unlock;
3734 	}
3735 
3736 	rdev_for_each(rdev, mddev)
3737 		rdev->new_raid_disk = rdev->raid_disk;
3738 
3739 	/* ->takeover must set new_* and/or delta_disks
3740 	 * if it succeeds, and may set them when it fails.
3741 	 */
3742 	priv = pers->takeover(mddev);
3743 	if (IS_ERR(priv)) {
3744 		mddev->new_level = mddev->level;
3745 		mddev->new_layout = mddev->layout;
3746 		mddev->new_chunk_sectors = mddev->chunk_sectors;
3747 		mddev->raid_disks -= mddev->delta_disks;
3748 		mddev->delta_disks = 0;
3749 		mddev->reshape_backwards = 0;
3750 		module_put(pers->owner);
3751 		pr_warn("md: %s: %s would not accept array\n",
3752 			mdname(mddev), clevel);
3753 		rv = PTR_ERR(priv);
3754 		goto out_unlock;
3755 	}
3756 
3757 	/* Looks like we have a winner */
3758 	mddev_suspend(mddev);
3759 	mddev_detach(mddev);
3760 
3761 	spin_lock(&mddev->lock);
3762 	oldpers = mddev->pers;
3763 	oldpriv = mddev->private;
3764 	mddev->pers = pers;
3765 	mddev->private = priv;
3766 	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
3767 	mddev->level = mddev->new_level;
3768 	mddev->layout = mddev->new_layout;
3769 	mddev->chunk_sectors = mddev->new_chunk_sectors;
3770 	mddev->delta_disks = 0;
3771 	mddev->reshape_backwards = 0;
3772 	mddev->degraded = 0;
3773 	spin_unlock(&mddev->lock);
3774 
3775 	if (oldpers->sync_request == NULL &&
3776 	    mddev->external) {
3777 		/* We are converting from a no-redundancy array
3778 		 * to a redundancy array and metadata is managed
3779 		 * externally so we need to be sure that writes
3780 		 * won't block due to a need to transition
3781 		 *      clean->dirty
3782 		 * until external management is started.
3783 		 */
3784 		mddev->in_sync = 0;
3785 		mddev->safemode_delay = 0;
3786 		mddev->safemode = 0;
3787 	}
3788 
3789 	oldpers->free(mddev, oldpriv);
3790 
3791 	if (oldpers->sync_request == NULL &&
3792 	    pers->sync_request != NULL) {
3793 		/* need to add the md_redundancy_group */
3794 		if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
3795 			pr_warn("md: cannot register extra attributes for %s\n",
3796 				mdname(mddev));
3797 		mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
3798 	}
3799 	if (oldpers->sync_request != NULL &&
3800 	    pers->sync_request == NULL) {
3801 		/* need to remove the md_redundancy_group */
3802 		if (mddev->to_remove == NULL)
3803 			mddev->to_remove = &md_redundancy_group;
3804 	}
3805 
3806 	module_put(oldpers->owner);
3807 
3808 	rdev_for_each(rdev, mddev) {
3809 		if (rdev->raid_disk < 0)
3810 			continue;
3811 		if (rdev->new_raid_disk >= mddev->raid_disks)
3812 			rdev->new_raid_disk = -1;
3813 		if (rdev->new_raid_disk == rdev->raid_disk)
3814 			continue;
3815 		sysfs_unlink_rdev(mddev, rdev);
3816 	}
3817 	rdev_for_each(rdev, mddev) {
3818 		if (rdev->raid_disk < 0)
3819 			continue;
3820 		if (rdev->new_raid_disk == rdev->raid_disk)
3821 			continue;
3822 		rdev->raid_disk = rdev->new_raid_disk;
3823 		if (rdev->raid_disk < 0)
3824 			clear_bit(In_sync, &rdev->flags);
3825 		else {
3826 			if (sysfs_link_rdev(mddev, rdev))
3827 				pr_warn("md: cannot register rd%d for %s after level change\n",
3828 					rdev->raid_disk, mdname(mddev));
3829 		}
3830 	}
3831 
3832 	if (pers->sync_request == NULL) {
3833 		/* this is now an array without redundancy, so
3834 		 * it must always be in_sync
3835 		 */
3836 		mddev->in_sync = 1;
3837 		del_timer_sync(&mddev->safemode_timer);
3838 	}
3839 	blk_set_stacking_limits(&mddev->queue->limits);
3840 	pers->run(mddev);
3841 	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
3842 	mddev_resume(mddev);
3843 	if (!mddev->thread)
3844 		md_update_sb(mddev, 1);
3845 	sysfs_notify(&mddev->kobj, NULL, "level");
3846 	md_new_event(mddev);
3847 	rv = len;
3848 out_unlock:
3849 	mddev_unlock(mddev);
3850 	return rv;
3851 }
3852 
3853 static struct md_sysfs_entry md_level =
3854 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
3855 
3856 static ssize_t
3857 layout_show(struct mddev *mddev, char *page)
3858 {
3859 	/* just a number, not meaningful for all levels */
3860 	if (mddev->reshape_position != MaxSector &&
3861 	    mddev->layout != mddev->new_layout)
3862 		return sprintf(page, "%d (%d)\n",
3863 			       mddev->new_layout, mddev->layout);
3864 	return sprintf(page, "%d\n", mddev->layout);
3865 }
3866 
3867 static ssize_t
3868 layout_store(struct mddev *mddev, const char *buf, size_t len)
3869 {
3870 	unsigned int n;
3871 	int err;
3872 
3873 	err = kstrtouint(buf, 10, &n);
3874 	if (err < 0)
3875 		return err;
3876 	err = mddev_lock(mddev);
3877 	if (err)
3878 		return err;
3879 
3880 	if (mddev->pers) {
3881 		if (mddev->pers->check_reshape == NULL)
3882 			err = -EBUSY;
3883 		else if (mddev->ro)
3884 			err = -EROFS;
3885 		else {
3886 			mddev->new_layout = n;
3887 			err = mddev->pers->check_reshape(mddev);
3888 			if (err)
3889 				mddev->new_layout = mddev->layout;
3890 		}
3891 	} else {
3892 		mddev->new_layout = n;
3893 		if (mddev->reshape_position == MaxSector)
3894 			mddev->layout = n;
3895 	}
3896 	mddev_unlock(mddev);
3897 	return err ?: len;
3898 }
3899 static struct md_sysfs_entry md_layout =
3900 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
3901 
3902 static ssize_t
3903 raid_disks_show(struct mddev *mddev, char *page)
3904 {
3905 	if (mddev->raid_disks == 0)
3906 		return 0;
3907 	if (mddev->reshape_position != MaxSector &&
3908 	    mddev->delta_disks != 0)
3909 		return sprintf(page, "%d (%d)\n", mddev->raid_disks,
3910 			       mddev->raid_disks - mddev->delta_disks);
3911 	return sprintf(page, "%d\n", mddev->raid_disks);
3912 }
3913 
3914 static int update_raid_disks(struct mddev *mddev, int raid_disks);
3915 
3916 static ssize_t
3917 raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
3918 {
3919 	unsigned int n;
3920 	int err;
3921 
3922 	err = kstrtouint(buf, 10, &n);
3923 	if (err < 0)
3924 		return err;
3925 
3926 	err = mddev_lock(mddev);
3927 	if (err)
3928 		return err;
3929 	if (mddev->pers)
3930 		err = update_raid_disks(mddev, n);
3931 	else if (mddev->reshape_position != MaxSector) {
3932 		struct md_rdev *rdev;
3933 		int olddisks = mddev->raid_disks - mddev->delta_disks;
3934 
3935 		err = -EINVAL;
3936 		rdev_for_each(rdev, mddev) {
3937 			if (olddisks < n &&
3938 			    rdev->data_offset < rdev->new_data_offset)
3939 				goto out_unlock;
3940 			if (olddisks > n &&
3941 			    rdev->data_offset > rdev->new_data_offset)
3942 				goto out_unlock;
3943 		}
3944 		err = 0;
3945 		mddev->delta_disks = n - olddisks;
3946 		mddev->raid_disks = n;
3947 		mddev->reshape_backwards = (mddev->delta_disks < 0);
3948 	} else
3949 		mddev->raid_disks = n;
3950 out_unlock:
3951 	mddev_unlock(mddev);
3952 	return err ? err : len;
3953 }
3954 static struct md_sysfs_entry md_raid_disks =
3955 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
3956 
3957 static ssize_t
3958 chunk_size_show(struct mddev *mddev, char *page)
3959 {
3960 	if (mddev->reshape_position != MaxSector &&
3961 	    mddev->chunk_sectors != mddev->new_chunk_sectors)
3962 		return sprintf(page, "%d (%d)\n",
3963 			       mddev->new_chunk_sectors << 9,
3964 			       mddev->chunk_sectors << 9);
3965 	return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
3966 }
3967 
3968 static ssize_t
3969 chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
3970 {
3971 	unsigned long n;
3972 	int err;
3973 
3974 	err = kstrtoul(buf, 10, &n);
3975 	if (err < 0)
3976 		return err;
3977 
3978 	err = mddev_lock(mddev);
3979 	if (err)
3980 		return err;
3981 	if (mddev->pers) {
3982 		if (mddev->pers->check_reshape == NULL)
3983 			err = -EBUSY;
3984 		else if (mddev->ro)
3985 			err = -EROFS;
3986 		else {
3987 			mddev->new_chunk_sectors = n >> 9;
3988 			err = mddev->pers->check_reshape(mddev);
3989 			if (err)
3990 				mddev->new_chunk_sectors = mddev->chunk_sectors;
3991 		}
3992 	} else {
3993 		mddev->new_chunk_sectors = n >> 9;
3994 		if (mddev->reshape_position == MaxSector)
3995 			mddev->chunk_sectors = n >> 9;
3996 	}
3997 	mddev_unlock(mddev);
3998 	return err ?: len;
3999 }
4000 static struct md_sysfs_entry md_chunk_size =
4001 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
4002 
4003 static ssize_t
4004 resync_start_show(struct mddev *mddev, char *page)
4005 {
4006 	if (mddev->recovery_cp == MaxSector)
4007 		return sprintf(page, "none\n");
4008 	return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
4009 }
4010 
4011 static ssize_t
4012 resync_start_store(struct mddev *mddev, const char *buf, size_t len)
4013 {
4014 	unsigned long long n;
4015 	int err;
4016 
4017 	if (cmd_match(buf, "none"))
4018 		n = MaxSector;
4019 	else {
4020 		err = kstrtoull(buf, 10, &n);
4021 		if (err < 0)
4022 			return err;
4023 		if (n != (sector_t)n)
4024 			return -EINVAL;
4025 	}
4026 
4027 	err = mddev_lock(mddev);
4028 	if (err)
4029 		return err;
4030 	if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
4031 		err = -EBUSY;
4032 
4033 	if (!err) {
4034 		mddev->recovery_cp = n;
4035 		if (mddev->pers)
4036 			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
4037 	}
4038 	mddev_unlock(mddev);
4039 	return err ?: len;
4040 }
4041 static struct md_sysfs_entry md_resync_start =
4042 __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR,
4043 		resync_start_show, resync_start_store);
4044 
4045 /*
4046  * The array state can be:
4047  *
4048  * clear
4049  *     No devices, no size, no level
4050  *     Equivalent to STOP_ARRAY ioctl
4051  * inactive
4052  *     May have some settings, but array is not active
4053  *        all IO results in error
4054  *     When written, doesn't tear down array, but just stops it
4055  * suspended (not supported yet)
4056  *     All IO requests will block. The array can be reconfigured.
4057  *     Writing this, if accepted, will block until array is quiescent
4058  * readonly
4059  *     no resync can happen.  no superblocks get written.
4060  *     write requests fail
4061  * read-auto
4062  *     like readonly, but behaves like 'clean' on a write request.
4063  *
4064  * clean - no pending writes, but otherwise active.
4065  *     When written to inactive array, starts without resync
4066  *     If a write request arrives then
4067  *       if metadata is known, mark 'dirty' and switch to 'active'.
4068  *       if not known, block and switch to write-pending
4069  *     If written to an active array that has pending writes, then fails.
4070  * active
4071  *     fully active: IO and resync can be happening.
4072  *     When written to inactive array, starts with resync
4073  *
4074  * write-pending
4075  *     clean, but writes are blocked waiting for 'active' to be written.
4076  *
4077  * active-idle
4078  *     like active, but no writes have been seen for a while (100msec).
4079  *
4080  */
/*
 * The enumerators and the strings below are kept in the same order so
 * that an enumerator can index array_states[].  bad_word lines up with
 * the terminating NULL entry.
 */
enum array_state {
	clear,
	inactive,
	suspended,
	readonly,
	read_auto,
	clean,
	active,
	write_pending,
	active_idle,
	bad_word
};
static char *array_states[] = {
	"clear",
	"inactive",
	"suspended",
	"readonly",
	"read-auto",
	"clean",
	"active",
	"write-pending",
	"active-idle",
	NULL
};
4086 
/*
 * Return the index of the first entry in the NULL-terminated @list for
 * which cmd_match(@word, entry) succeeds, or the index of the NULL
 * terminator if there is none.
 */
static int match_word(const char *word, char **list)
{
	int idx = 0;

	while (list[idx] && !cmd_match(word, list[idx]))
		idx++;

	return idx;
}
4095 
4096 static ssize_t
4097 array_state_show(struct mddev *mddev, char *page)
4098 {
4099 	enum array_state st = inactive;
4100 
4101 	if (mddev->pers)
4102 		switch(mddev->ro) {
4103 		case 1:
4104 			st = readonly;
4105 			break;
4106 		case 2:
4107 			st = read_auto;
4108 			break;
4109 		case 0:
4110 			spin_lock(&mddev->lock);
4111 			if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
4112 				st = write_pending;
4113 			else if (mddev->in_sync)
4114 				st = clean;
4115 			else if (mddev->safemode)
4116 				st = active_idle;
4117 			else
4118 				st = active;
4119 			spin_unlock(&mddev->lock);
4120 		}
4121 	else {
4122 		if (list_empty(&mddev->disks) &&
4123 		    mddev->raid_disks == 0 &&
4124 		    mddev->dev_sectors == 0)
4125 			st = clear;
4126 		else
4127 			st = inactive;
4128 	}
4129 	return sprintf(page, "%s\n", array_states[st]);
4130 }
4131 
4132 static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev);
4133 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev);
4134 static int do_md_run(struct mddev *mddev);
4135 static int restart_array(struct mddev *mddev);
4136 
4137 static ssize_t
4138 array_state_store(struct mddev *mddev, const char *buf, size_t len)
4139 {
4140 	int err = 0;
4141 	enum array_state st = match_word(buf, array_states);
4142 
4143 	if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) {
4144 		/* don't take reconfig_mutex when toggling between
4145 		 * clean and active
4146 		 */
4147 		spin_lock(&mddev->lock);
4148 		if (st == active) {
4149 			restart_array(mddev);
4150 			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4151 			md_wakeup_thread(mddev->thread);
4152 			wake_up(&mddev->sb_wait);
4153 		} else /* st == clean */ {
4154 			restart_array(mddev);
4155 			if (!set_in_sync(mddev))
4156 				err = -EBUSY;
4157 		}
4158 		if (!err)
4159 			sysfs_notify_dirent_safe(mddev->sysfs_state);
4160 		spin_unlock(&mddev->lock);
4161 		return err ?: len;
4162 	}
4163 	err = mddev_lock(mddev);
4164 	if (err)
4165 		return err;
4166 	err = -EINVAL;
4167 	switch(st) {
4168 	case bad_word:
4169 		break;
4170 	case clear:
4171 		/* stopping an active array */
4172 		err = do_md_stop(mddev, 0, NULL);
4173 		break;
4174 	case inactive:
4175 		/* stopping an active array */
4176 		if (mddev->pers)
4177 			err = do_md_stop(mddev, 2, NULL);
4178 		else
4179 			err = 0; /* already inactive */
4180 		break;
4181 	case suspended:
4182 		break; /* not supported yet */
4183 	case readonly:
4184 		if (mddev->pers)
4185 			err = md_set_readonly(mddev, NULL);
4186 		else {
4187 			mddev->ro = 1;
4188 			set_disk_ro(mddev->gendisk, 1);
4189 			err = do_md_run(mddev);
4190 		}
4191 		break;
4192 	case read_auto:
4193 		if (mddev->pers) {
4194 			if (mddev->ro == 0)
4195 				err = md_set_readonly(mddev, NULL);
4196 			else if (mddev->ro == 1)
4197 				err = restart_array(mddev);
4198 			if (err == 0) {
4199 				mddev->ro = 2;
4200 				set_disk_ro(mddev->gendisk, 0);
4201 			}
4202 		} else {
4203 			mddev->ro = 2;
4204 			err = do_md_run(mddev);
4205 		}
4206 		break;
4207 	case clean:
4208 		if (mddev->pers) {
4209 			err = restart_array(mddev);
4210 			if (err)
4211 				break;
4212 			spin_lock(&mddev->lock);
4213 			if (!set_in_sync(mddev))
4214 				err = -EBUSY;
4215 			spin_unlock(&mddev->lock);
4216 		} else
4217 			err = -EINVAL;
4218 		break;
4219 	case active:
4220 		if (mddev->pers) {
4221 			err = restart_array(mddev);
4222 			if (err)
4223 				break;
4224 			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4225 			wake_up(&mddev->sb_wait);
4226 			err = 0;
4227 		} else {
4228 			mddev->ro = 0;
4229 			set_disk_ro(mddev->gendisk, 0);
4230 			err = do_md_run(mddev);
4231 		}
4232 		break;
4233 	case write_pending:
4234 	case active_idle:
4235 		/* these cannot be set */
4236 		break;
4237 	}
4238 
4239 	if (!err) {
4240 		if (mddev->hold_active == UNTIL_IOCTL)
4241 			mddev->hold_active = 0;
4242 		sysfs_notify_dirent_safe(mddev->sysfs_state);
4243 	}
4244 	mddev_unlock(mddev);
4245 	return err ?: len;
4246 }
4247 static struct md_sysfs_entry md_array_state =
4248 __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
4249 
4250 static ssize_t
4251 max_corrected_read_errors_show(struct mddev *mddev, char *page) {
4252 	return sprintf(page, "%d\n",
4253 		       atomic_read(&mddev->max_corr_read_errors));
4254 }
4255 
4256 static ssize_t
4257 max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
4258 {
4259 	unsigned int n;
4260 	int rv;
4261 
4262 	rv = kstrtouint(buf, 10, &n);
4263 	if (rv < 0)
4264 		return rv;
4265 	atomic_set(&mddev->max_corr_read_errors, n);
4266 	return len;
4267 }
4268 
4269 static struct md_sysfs_entry max_corr_read_errors =
4270 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
4271 	max_corrected_read_errors_store);
4272 
/*
 * Show routine that always fails with -EINVAL.  Both arguments are
 * ignored.
 */
static ssize_t null_show(struct mddev *mddev, char *page)
{
	return -EINVAL;
}
4278 
4279 static ssize_t
4280 new_dev_store(struct mddev *mddev, const char *buf, size_t len)
4281 {
4282 	/* buf must be %d:%d\n? giving major and minor numbers */
4283 	/* The new device is added to the array.
4284 	 * If the array has a persistent superblock, we read the
4285 	 * superblock to initialise info and check validity.
4286 	 * Otherwise, only checking done is that in bind_rdev_to_array,
4287 	 * which mainly checks size.
4288 	 */
4289 	char *e;
4290 	int major = simple_strtoul(buf, &e, 10);
4291 	int minor;
4292 	dev_t dev;
4293 	struct md_rdev *rdev;
4294 	int err;
4295 
4296 	if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
4297 		return -EINVAL;
4298 	minor = simple_strtoul(e+1, &e, 10);
4299 	if (*e && *e != '\n')
4300 		return -EINVAL;
4301 	dev = MKDEV(major, minor);
4302 	if (major != MAJOR(dev) ||
4303 	    minor != MINOR(dev))
4304 		return -EOVERFLOW;
4305 
4306 	flush_workqueue(md_misc_wq);
4307 
4308 	err = mddev_lock(mddev);
4309 	if (err)
4310 		return err;
4311 	if (mddev->persistent) {
4312 		rdev = md_import_device(dev, mddev->major_version,
4313 					mddev->minor_version);
4314 		if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
4315 			struct md_rdev *rdev0
4316 				= list_entry(mddev->disks.next,
4317 					     struct md_rdev, same_set);
4318 			err = super_types[mddev->major_version]
4319 				.load_super(rdev, rdev0, mddev->minor_version);
4320 			if (err < 0)
4321 				goto out;
4322 		}
4323 	} else if (mddev->external)
4324 		rdev = md_import_device(dev, -2, -1);
4325 	else
4326 		rdev = md_import_device(dev, -1, -1);
4327 
4328 	if (IS_ERR(rdev)) {
4329 		mddev_unlock(mddev);
4330 		return PTR_ERR(rdev);
4331 	}
4332 	err = bind_rdev_to_array(rdev, mddev);
4333  out:
4334 	if (err)
4335 		export_rdev(rdev);
4336 	mddev_unlock(mddev);
4337 	if (!err)
4338 		md_new_event(mddev);
4339 	return err ? err : len;
4340 }
4341 
4342 static struct md_sysfs_entry md_new_device =
4343 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
4344 
4345 static ssize_t
4346 bitmap_store(struct mddev *mddev, const char *buf, size_t len)
4347 {
4348 	char *end;
4349 	unsigned long chunk, end_chunk;
4350 	int err;
4351 
4352 	err = mddev_lock(mddev);
4353 	if (err)
4354 		return err;
4355 	if (!mddev->bitmap)
4356 		goto out;
4357 	/* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
4358 	while (*buf) {
4359 		chunk = end_chunk = simple_strtoul(buf, &end, 0);
4360 		if (buf == end) break;
4361 		if (*end == '-') { /* range */
4362 			buf = end + 1;
4363 			end_chunk = simple_strtoul(buf, &end, 0);
4364 			if (buf == end) break;
4365 		}
4366 		if (*end && !isspace(*end)) break;
4367 		md_bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
4368 		buf = skip_spaces(end);
4369 	}
4370 	md_bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
4371 out:
4372 	mddev_unlock(mddev);
4373 	return len;
4374 }
4375 
4376 static struct md_sysfs_entry md_bitmap =
4377 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
4378 
4379 static ssize_t
4380 size_show(struct mddev *mddev, char *page)
4381 {
4382 	return sprintf(page, "%llu\n",
4383 		(unsigned long long)mddev->dev_sectors / 2);
4384 }
4385 
4386 static int update_size(struct mddev *mddev, sector_t num_sectors);
4387 
4388 static ssize_t
4389 size_store(struct mddev *mddev, const char *buf, size_t len)
4390 {
4391 	/* If array is inactive, we can reduce the component size, but
4392 	 * not increase it (except from 0).
4393 	 * If array is active, we can try an on-line resize
4394 	 */
4395 	sector_t sectors;
4396 	int err = strict_blocks_to_sectors(buf, &sectors);
4397 
4398 	if (err < 0)
4399 		return err;
4400 	err = mddev_lock(mddev);
4401 	if (err)
4402 		return err;
4403 	if (mddev->pers) {
4404 		err = update_size(mddev, sectors);
4405 		if (err == 0)
4406 			md_update_sb(mddev, 1);
4407 	} else {
4408 		if (mddev->dev_sectors == 0 ||
4409 		    mddev->dev_sectors > sectors)
4410 			mddev->dev_sectors = sectors;
4411 		else
4412 			err = -ENOSPC;
4413 	}
4414 	mddev_unlock(mddev);
4415 	return err ? err : len;
4416 }
4417 
4418 static struct md_sysfs_entry md_size =
4419 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
4420 
4421 /* Metadata version.
4422  * This is one of
4423  *   'none' for arrays with no metadata (good luck...)
4424  *   'external' for arrays with externally managed metadata,
4425  * or N.M for internally known formats
4426  */
4427 static ssize_t
4428 metadata_show(struct mddev *mddev, char *page)
4429 {
4430 	if (mddev->persistent)
4431 		return sprintf(page, "%d.%d\n",
4432 			       mddev->major_version, mddev->minor_version);
4433 	else if (mddev->external)
4434 		return sprintf(page, "external:%s\n", mddev->metadata_type);
4435 	else
4436 		return sprintf(page, "none\n");
4437 }
4438 
4439 static ssize_t
4440 metadata_store(struct mddev *mddev, const char *buf, size_t len)
4441 {
4442 	int major, minor;
4443 	char *e;
4444 	int err;
4445 	/* Changing the details of 'external' metadata is
4446 	 * always permitted.  Otherwise there must be
4447 	 * no devices attached to the array.
4448 	 */
4449 
4450 	err = mddev_lock(mddev);
4451 	if (err)
4452 		return err;
4453 	err = -EBUSY;
4454 	if (mddev->external && strncmp(buf, "external:", 9) == 0)
4455 		;
4456 	else if (!list_empty(&mddev->disks))
4457 		goto out_unlock;
4458 
4459 	err = 0;
4460 	if (cmd_match(buf, "none")) {
4461 		mddev->persistent = 0;
4462 		mddev->external = 0;
4463 		mddev->major_version = 0;
4464 		mddev->minor_version = 90;
4465 		goto out_unlock;
4466 	}
4467 	if (strncmp(buf, "external:", 9) == 0) {
4468 		size_t namelen = len-9;
4469 		if (namelen >= sizeof(mddev->metadata_type))
4470 			namelen = sizeof(mddev->metadata_type)-1;
4471 		strncpy(mddev->metadata_type, buf+9, namelen);
4472 		mddev->metadata_type[namelen] = 0;
4473 		if (namelen && mddev->metadata_type[namelen-1] == '\n')
4474 			mddev->metadata_type[--namelen] = 0;
4475 		mddev->persistent = 0;
4476 		mddev->external = 1;
4477 		mddev->major_version = 0;
4478 		mddev->minor_version = 90;
4479 		goto out_unlock;
4480 	}
4481 	major = simple_strtoul(buf, &e, 10);
4482 	err = -EINVAL;
4483 	if (e==buf || *e != '.')
4484 		goto out_unlock;
4485 	buf = e+1;
4486 	minor = simple_strtoul(buf, &e, 10);
4487 	if (e==buf || (*e && *e != '\n') )
4488 		goto out_unlock;
4489 	err = -ENOENT;
4490 	if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
4491 		goto out_unlock;
4492 	mddev->major_version = major;
4493 	mddev->minor_version = minor;
4494 	mddev->persistent = 1;
4495 	mddev->external = 0;
4496 	err = 0;
4497 out_unlock:
4498 	mddev_unlock(mddev);
4499 	return err ?: len;
4500 }
4501 
4502 static struct md_sysfs_entry md_metadata =
4503 __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
4504 
4505 static ssize_t
4506 action_show(struct mddev *mddev, char *page)
4507 {
4508 	char *type = "idle";
4509 	unsigned long recovery = mddev->recovery;
4510 	if (test_bit(MD_RECOVERY_FROZEN, &recovery))
4511 		type = "frozen";
4512 	else if (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
4513 	    (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) {
4514 		if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
4515 			type = "reshape";
4516 		else if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
4517 			if (!test_bit(MD_RECOVERY_REQUESTED, &recovery))
4518 				type = "resync";
4519 			else if (test_bit(MD_RECOVERY_CHECK, &recovery))
4520 				type = "check";
4521 			else
4522 				type = "repair";
4523 		} else if (test_bit(MD_RECOVERY_RECOVER, &recovery))
4524 			type = "recover";
4525 		else if (mddev->reshape_position != MaxSector)
4526 			type = "reshape";
4527 	}
4528 	return sprintf(page, "%s\n", type);
4529 }
4530 
4531 static ssize_t
4532 action_store(struct mddev *mddev, const char *page, size_t len)
4533 {
4534 	if (!mddev->pers || !mddev->pers->sync_request)
4535 		return -EINVAL;
4536 
4537 
4538 	if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
4539 		if (cmd_match(page, "frozen"))
4540 			set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4541 		else
4542 			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4543 		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
4544 		    mddev_lock(mddev) == 0) {
4545 			flush_workqueue(md_misc_wq);
4546 			if (mddev->sync_thread) {
4547 				set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4548 				md_reap_sync_thread(mddev);
4549 			}
4550 			mddev_unlock(mddev);
4551 		}
4552 	} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4553 		return -EBUSY;
4554 	else if (cmd_match(page, "resync"))
4555 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4556 	else if (cmd_match(page, "recover")) {
4557 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4558 		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
4559 	} else if (cmd_match(page, "reshape")) {
4560 		int err;
4561 		if (mddev->pers->start_reshape == NULL)
4562 			return -EINVAL;
4563 		err = mddev_lock(mddev);
4564 		if (!err) {
4565 			if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4566 				err =  -EBUSY;
4567 			else {
4568 				clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4569 				err = mddev->pers->start_reshape(mddev);
4570 			}
4571 			mddev_unlock(mddev);
4572 		}
4573 		if (err)
4574 			return err;
4575 		sysfs_notify(&mddev->kobj, NULL, "degraded");
4576 	} else {
4577 		if (cmd_match(page, "check"))
4578 			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4579 		else if (!cmd_match(page, "repair"))
4580 			return -EINVAL;
4581 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4582 		set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
4583 		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4584 	}
4585 	if (mddev->ro == 2) {
4586 		/* A write to sync_action is enough to justify
4587 		 * canceling read-auto mode
4588 		 */
4589 		mddev->ro = 0;
4590 		md_wakeup_thread(mddev->sync_thread);
4591 	}
4592 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4593 	md_wakeup_thread(mddev->thread);
4594 	sysfs_notify_dirent_safe(mddev->sysfs_action);
4595 	return len;
4596 }
4597 
4598 static struct md_sysfs_entry md_scan_mode =
4599 __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
4600 
4601 static ssize_t
4602 last_sync_action_show(struct mddev *mddev, char *page)
4603 {
4604 	return sprintf(page, "%s\n", mddev->last_sync_action);
4605 }
4606 
4607 static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action);
4608 
4609 static ssize_t
4610 mismatch_cnt_show(struct mddev *mddev, char *page)
4611 {
4612 	return sprintf(page, "%llu\n",
4613 		       (unsigned long long)
4614 		       atomic64_read(&mddev->resync_mismatches));
4615 }
4616 
4617 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
4618 
4619 static ssize_t
4620 sync_min_show(struct mddev *mddev, char *page)
4621 {
4622 	return sprintf(page, "%d (%s)\n", speed_min(mddev),
4623 		       mddev->sync_speed_min ? "local": "system");
4624 }
4625 
4626 static ssize_t
4627 sync_min_store(struct mddev *mddev, const char *buf, size_t len)
4628 {
4629 	unsigned int min;
4630 	int rv;
4631 
4632 	if (strncmp(buf, "system", 6)==0) {
4633 		min = 0;
4634 	} else {
4635 		rv = kstrtouint(buf, 10, &min);
4636 		if (rv < 0)
4637 			return rv;
4638 		if (min == 0)
4639 			return -EINVAL;
4640 	}
4641 	mddev->sync_speed_min = min;
4642 	return len;
4643 }
4644 
4645 static struct md_sysfs_entry md_sync_min =
4646 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
4647 
4648 static ssize_t
4649 sync_max_show(struct mddev *mddev, char *page)
4650 {
4651 	return sprintf(page, "%d (%s)\n", speed_max(mddev),
4652 		       mddev->sync_speed_max ? "local": "system");
4653 }
4654 
4655 static ssize_t
4656 sync_max_store(struct mddev *mddev, const char *buf, size_t len)
4657 {
4658 	unsigned int max;
4659 	int rv;
4660 
4661 	if (strncmp(buf, "system", 6)==0) {
4662 		max = 0;
4663 	} else {
4664 		rv = kstrtouint(buf, 10, &max);
4665 		if (rv < 0)
4666 			return rv;
4667 		if (max == 0)
4668 			return -EINVAL;
4669 	}
4670 	mddev->sync_speed_max = max;
4671 	return len;
4672 }
4673 
4674 static struct md_sysfs_entry md_sync_max =
4675 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
4676 
4677 static ssize_t
4678 degraded_show(struct mddev *mddev, char *page)
4679 {
4680 	return sprintf(page, "%d\n", mddev->degraded);
4681 }
4682 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
4683 
4684 static ssize_t
4685 sync_force_parallel_show(struct mddev *mddev, char *page)
4686 {
4687 	return sprintf(page, "%d\n", mddev->parallel_resync);
4688 }
4689 
4690 static ssize_t
4691 sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
4692 {
4693 	long n;
4694 
4695 	if (kstrtol(buf, 10, &n))
4696 		return -EINVAL;
4697 
4698 	if (n != 0 && n != 1)
4699 		return -EINVAL;
4700 
4701 	mddev->parallel_resync = n;
4702 
4703 	if (mddev->sync_thread)
4704 		wake_up(&resync_wait);
4705 
4706 	return len;
4707 }
4708 
4709 /* force parallel resync, even with shared block devices */
4710 static struct md_sysfs_entry md_sync_force_parallel =
4711 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
4712        sync_force_parallel_show, sync_force_parallel_store);
4713 
4714 static ssize_t
4715 sync_speed_show(struct mddev *mddev, char *page)
4716 {
4717 	unsigned long resync, dt, db;
4718 	if (mddev->curr_resync == 0)
4719 		return sprintf(page, "none\n");
4720 	resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
4721 	dt = (jiffies - mddev->resync_mark) / HZ;
4722 	if (!dt) dt++;
4723 	db = resync - mddev->resync_mark_cnt;
4724 	return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
4725 }
4726 
4727 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
4728 
4729 static ssize_t
4730 sync_completed_show(struct mddev *mddev, char *page)
4731 {
4732 	unsigned long long max_sectors, resync;
4733 
4734 	if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4735 		return sprintf(page, "none\n");
4736 
4737 	if (mddev->curr_resync == 1 ||
4738 	    mddev->curr_resync == 2)
4739 		return sprintf(page, "delayed\n");
4740 
4741 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
4742 	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4743 		max_sectors = mddev->resync_max_sectors;
4744 	else
4745 		max_sectors = mddev->dev_sectors;
4746 
4747 	resync = mddev->curr_resync_completed;
4748 	return sprintf(page, "%llu / %llu\n", resync, max_sectors);
4749 }
4750 
4751 static struct md_sysfs_entry md_sync_completed =
4752 	__ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL);
4753 
4754 static ssize_t
4755 min_sync_show(struct mddev *mddev, char *page)
4756 {
4757 	return sprintf(page, "%llu\n",
4758 		       (unsigned long long)mddev->resync_min);
4759 }
4760 static ssize_t
4761 min_sync_store(struct mddev *mddev, const char *buf, size_t len)
4762 {
4763 	unsigned long long min;
4764 	int err;
4765 
4766 	if (kstrtoull(buf, 10, &min))
4767 		return -EINVAL;
4768 
4769 	spin_lock(&mddev->lock);
4770 	err = -EINVAL;
4771 	if (min > mddev->resync_max)
4772 		goto out_unlock;
4773 
4774 	err = -EBUSY;
4775 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4776 		goto out_unlock;
4777 
4778 	/* Round down to multiple of 4K for safety */
4779 	mddev->resync_min = round_down(min, 8);
4780 	err = 0;
4781 
4782 out_unlock:
4783 	spin_unlock(&mddev->lock);
4784 	return err ?: len;
4785 }
4786 
4787 static struct md_sysfs_entry md_min_sync =
4788 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
4789 
4790 static ssize_t
4791 max_sync_show(struct mddev *mddev, char *page)
4792 {
4793 	if (mddev->resync_max == MaxSector)
4794 		return sprintf(page, "max\n");
4795 	else
4796 		return sprintf(page, "%llu\n",
4797 			       (unsigned long long)mddev->resync_max);
4798 }
4799 static ssize_t
4800 max_sync_store(struct mddev *mddev, const char *buf, size_t len)
4801 {
4802 	int err;
4803 	spin_lock(&mddev->lock);
4804 	if (strncmp(buf, "max", 3) == 0)
4805 		mddev->resync_max = MaxSector;
4806 	else {
4807 		unsigned long long max;
4808 		int chunk;
4809 
4810 		err = -EINVAL;
4811 		if (kstrtoull(buf, 10, &max))
4812 			goto out_unlock;
4813 		if (max < mddev->resync_min)
4814 			goto out_unlock;
4815 
4816 		err = -EBUSY;
4817 		if (max < mddev->resync_max &&
4818 		    mddev->ro == 0 &&
4819 		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4820 			goto out_unlock;
4821 
4822 		/* Must be a multiple of chunk_size */
4823 		chunk = mddev->chunk_sectors;
4824 		if (chunk) {
4825 			sector_t temp = max;
4826 
4827 			err = -EINVAL;
4828 			if (sector_div(temp, chunk))
4829 				goto out_unlock;
4830 		}
4831 		mddev->resync_max = max;
4832 	}
4833 	wake_up(&mddev->recovery_wait);
4834 	err = 0;
4835 out_unlock:
4836 	spin_unlock(&mddev->lock);
4837 	return err ?: len;
4838 }
4839 
4840 static struct md_sysfs_entry md_max_sync =
4841 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
4842 
4843 static ssize_t
4844 suspend_lo_show(struct mddev *mddev, char *page)
4845 {
4846 	return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
4847 }
4848 
4849 static ssize_t
4850 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
4851 {
4852 	unsigned long long new;
4853 	int err;
4854 
4855 	err = kstrtoull(buf, 10, &new);
4856 	if (err < 0)
4857 		return err;
4858 	if (new != (sector_t)new)
4859 		return -EINVAL;
4860 
4861 	err = mddev_lock(mddev);
4862 	if (err)
4863 		return err;
4864 	err = -EINVAL;
4865 	if (mddev->pers == NULL ||
4866 	    mddev->pers->quiesce == NULL)
4867 		goto unlock;
4868 	mddev_suspend(mddev);
4869 	mddev->suspend_lo = new;
4870 	mddev_resume(mddev);
4871 
4872 	err = 0;
4873 unlock:
4874 	mddev_unlock(mddev);
4875 	return err ?: len;
4876 }
4877 static struct md_sysfs_entry md_suspend_lo =
4878 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
4879 
4880 static ssize_t
4881 suspend_hi_show(struct mddev *mddev, char *page)
4882 {
4883 	return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
4884 }
4885 
4886 static ssize_t
4887 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
4888 {
4889 	unsigned long long new;
4890 	int err;
4891 
4892 	err = kstrtoull(buf, 10, &new);
4893 	if (err < 0)
4894 		return err;
4895 	if (new != (sector_t)new)
4896 		return -EINVAL;
4897 
4898 	err = mddev_lock(mddev);
4899 	if (err)
4900 		return err;
4901 	err = -EINVAL;
4902 	if (mddev->pers == NULL)
4903 		goto unlock;
4904 
4905 	mddev_suspend(mddev);
4906 	mddev->suspend_hi = new;
4907 	mddev_resume(mddev);
4908 
4909 	err = 0;
4910 unlock:
4911 	mddev_unlock(mddev);
4912 	return err ?: len;
4913 }
4914 static struct md_sysfs_entry md_suspend_hi =
4915 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
4916 
4917 static ssize_t
4918 reshape_position_show(struct mddev *mddev, char *page)
4919 {
4920 	if (mddev->reshape_position != MaxSector)
4921 		return sprintf(page, "%llu\n",
4922 			       (unsigned long long)mddev->reshape_position);
4923 	strcpy(page, "none\n");
4924 	return 5;
4925 }
4926 
4927 static ssize_t
4928 reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
4929 {
4930 	struct md_rdev *rdev;
4931 	unsigned long long new;
4932 	int err;
4933 
4934 	err = kstrtoull(buf, 10, &new);
4935 	if (err < 0)
4936 		return err;
4937 	if (new != (sector_t)new)
4938 		return -EINVAL;
4939 	err = mddev_lock(mddev);
4940 	if (err)
4941 		return err;
4942 	err = -EBUSY;
4943 	if (mddev->pers)
4944 		goto unlock;
4945 	mddev->reshape_position = new;
4946 	mddev->delta_disks = 0;
4947 	mddev->reshape_backwards = 0;
4948 	mddev->new_level = mddev->level;
4949 	mddev->new_layout = mddev->layout;
4950 	mddev->new_chunk_sectors = mddev->chunk_sectors;
4951 	rdev_for_each(rdev, mddev)
4952 		rdev->new_data_offset = rdev->data_offset;
4953 	err = 0;
4954 unlock:
4955 	mddev_unlock(mddev);
4956 	return err ?: len;
4957 }
4958 
4959 static struct md_sysfs_entry md_reshape_position =
4960 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
4961        reshape_position_store);
4962 
4963 static ssize_t
4964 reshape_direction_show(struct mddev *mddev, char *page)
4965 {
4966 	return sprintf(page, "%s\n",
4967 		       mddev->reshape_backwards ? "backwards" : "forwards");
4968 }
4969 
4970 static ssize_t
4971 reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
4972 {
4973 	int backwards = 0;
4974 	int err;
4975 
4976 	if (cmd_match(buf, "forwards"))
4977 		backwards = 0;
4978 	else if (cmd_match(buf, "backwards"))
4979 		backwards = 1;
4980 	else
4981 		return -EINVAL;
4982 	if (mddev->reshape_backwards == backwards)
4983 		return len;
4984 
4985 	err = mddev_lock(mddev);
4986 	if (err)
4987 		return err;
4988 	/* check if we are allowed to change */
4989 	if (mddev->delta_disks)
4990 		err = -EBUSY;
4991 	else if (mddev->persistent &&
4992 	    mddev->major_version == 0)
4993 		err =  -EINVAL;
4994 	else
4995 		mddev->reshape_backwards = backwards;
4996 	mddev_unlock(mddev);
4997 	return err ?: len;
4998 }
4999 
5000 static struct md_sysfs_entry md_reshape_direction =
5001 __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
5002        reshape_direction_store);
5003 
5004 static ssize_t
5005 array_size_show(struct mddev *mddev, char *page)
5006 {
5007 	if (mddev->external_size)
5008 		return sprintf(page, "%llu\n",
5009 			       (unsigned long long)mddev->array_sectors/2);
5010 	else
5011 		return sprintf(page, "default\n");
5012 }
5013 
5014 static ssize_t
5015 array_size_store(struct mddev *mddev, const char *buf, size_t len)
5016 {
5017 	sector_t sectors;
5018 	int err;
5019 
5020 	err = mddev_lock(mddev);
5021 	if (err)
5022 		return err;
5023 
5024 	/* cluster raid doesn't support change array_sectors */
5025 	if (mddev_is_clustered(mddev)) {
5026 		mddev_unlock(mddev);
5027 		return -EINVAL;
5028 	}
5029 
5030 	if (strncmp(buf, "default", 7) == 0) {
5031 		if (mddev->pers)
5032 			sectors = mddev->pers->size(mddev, 0, 0);
5033 		else
5034 			sectors = mddev->array_sectors;
5035 
5036 		mddev->external_size = 0;
5037 	} else {
5038 		if (strict_blocks_to_sectors(buf, &sectors) < 0)
5039 			err = -EINVAL;
5040 		else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
5041 			err = -E2BIG;
5042 		else
5043 			mddev->external_size = 1;
5044 	}
5045 
5046 	if (!err) {
5047 		mddev->array_sectors = sectors;
5048 		if (mddev->pers) {
5049 			set_capacity(mddev->gendisk, mddev->array_sectors);
5050 			revalidate_disk(mddev->gendisk);
5051 		}
5052 	}
5053 	mddev_unlock(mddev);
5054 	return err ?: len;
5055 }
5056 
/*
 * sysfs attribute "array_size": readable by everyone, writable by the
 * owner, backed by array_size_show()/array_size_store().
 */
static struct md_sysfs_entry md_array_size =
__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
       array_size_store);
5060 
5061 static ssize_t
5062 consistency_policy_show(struct mddev *mddev, char *page)
5063 {
5064 	int ret;
5065 
5066 	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
5067 		ret = sprintf(page, "journal\n");
5068 	} else if (test_bit(MD_HAS_PPL, &mddev->flags)) {
5069 		ret = sprintf(page, "ppl\n");
5070 	} else if (mddev->bitmap) {
5071 		ret = sprintf(page, "bitmap\n");
5072 	} else if (mddev->pers) {
5073 		if (mddev->pers->sync_request)
5074 			ret = sprintf(page, "resync\n");
5075 		else
5076 			ret = sprintf(page, "none\n");
5077 	} else {
5078 		ret = sprintf(page, "unknown\n");
5079 	}
5080 
5081 	return ret;
5082 }
5083 
5084 static ssize_t
5085 consistency_policy_store(struct mddev *mddev, const char *buf, size_t len)
5086 {
5087 	int err = 0;
5088 
5089 	if (mddev->pers) {
5090 		if (mddev->pers->change_consistency_policy)
5091 			err = mddev->pers->change_consistency_policy(mddev, buf);
5092 		else
5093 			err = -EBUSY;
5094 	} else if (mddev->external && strncmp(buf, "ppl", 3) == 0) {
5095 		set_bit(MD_HAS_PPL, &mddev->flags);
5096 	} else {
5097 		err = -EINVAL;
5098 	}
5099 
5100 	return err ? err : len;
5101 }
5102 
/*
 * sysfs attribute "consistency_policy": readable by everyone, writable by
 * the owner, backed by consistency_policy_show()/consistency_policy_store().
 */
static struct md_sysfs_entry md_consistency_policy =
__ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
       consistency_policy_store);
5106 
/*
 * Attributes installed as the default set of md_ktype (see .default_attrs
 * there).  The list is NULL-terminated.
 */
static struct attribute *md_default_attrs[] = {
	&md_level.attr,
	&md_layout.attr,
	&md_raid_disks.attr,
	&md_chunk_size.attr,
	&md_size.attr,
	&md_resync_start.attr,
	&md_metadata.attr,
	&md_new_device.attr,
	&md_safe_delay.attr,
	&md_array_state.attr,
	&md_reshape_position.attr,
	&md_reshape_direction.attr,
	&md_array_size.attr,
	&max_corr_read_errors.attr,
	&md_consistency_policy.attr,
	NULL,
};
5125 
/*
 * Extra attributes exposed through md_redundancy_group.  md_run() creates
 * that group only when the personality provides a sync_request method.
 * The list is NULL-terminated.
 */
static struct attribute *md_redundancy_attrs[] = {
	&md_scan_mode.attr,
	&md_last_scan_mode.attr,
	&md_mismatches.attr,
	&md_sync_min.attr,
	&md_sync_max.attr,
	&md_sync_speed.attr,
	&md_sync_force_parallel.attr,
	&md_sync_completed.attr,
	&md_min_sync.attr,
	&md_max_sync.attr,
	&md_suspend_lo.attr,
	&md_suspend_hi.attr,
	&md_bitmap.attr,
	&md_degraded.attr,
	NULL,
};
/*
 * Attribute group wrapping md_redundancy_attrs.  With .name == NULL the
 * group has no name of its own.  Created in md_run() and queued for removal
 * (via mddev->to_remove) in __md_stop().
 */
static struct attribute_group md_redundancy_group = {
	.name = NULL,
	.attrs = md_redundancy_attrs,
};
5147 
5148 static ssize_t
5149 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
5150 {
5151 	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
5152 	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
5153 	ssize_t rv;
5154 
5155 	if (!entry->show)
5156 		return -EIO;
5157 	spin_lock(&all_mddevs_lock);
5158 	if (list_empty(&mddev->all_mddevs)) {
5159 		spin_unlock(&all_mddevs_lock);
5160 		return -EBUSY;
5161 	}
5162 	mddev_get(mddev);
5163 	spin_unlock(&all_mddevs_lock);
5164 
5165 	rv = entry->show(mddev, page);
5166 	mddev_put(mddev);
5167 	return rv;
5168 }
5169 
5170 static ssize_t
5171 md_attr_store(struct kobject *kobj, struct attribute *attr,
5172 	      const char *page, size_t length)
5173 {
5174 	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
5175 	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
5176 	ssize_t rv;
5177 
5178 	if (!entry->store)
5179 		return -EIO;
5180 	if (!capable(CAP_SYS_ADMIN))
5181 		return -EACCES;
5182 	spin_lock(&all_mddevs_lock);
5183 	if (list_empty(&mddev->all_mddevs)) {
5184 		spin_unlock(&all_mddevs_lock);
5185 		return -EBUSY;
5186 	}
5187 	mddev_get(mddev);
5188 	spin_unlock(&all_mddevs_lock);
5189 	rv = entry->store(mddev, page, length);
5190 	mddev_put(mddev);
5191 	return rv;
5192 }
5193 
5194 static void md_free(struct kobject *ko)
5195 {
5196 	struct mddev *mddev = container_of(ko, struct mddev, kobj);
5197 
5198 	if (mddev->sysfs_state)
5199 		sysfs_put(mddev->sysfs_state);
5200 
5201 	if (mddev->gendisk)
5202 		del_gendisk(mddev->gendisk);
5203 	if (mddev->queue)
5204 		blk_cleanup_queue(mddev->queue);
5205 	if (mddev->gendisk)
5206 		put_disk(mddev->gendisk);
5207 	percpu_ref_exit(&mddev->writes_pending);
5208 
5209 	bioset_exit(&mddev->bio_set);
5210 	bioset_exit(&mddev->sync_set);
5211 	kfree(mddev);
5212 }
5213 
/* sysfs operations for md kobjects: route reads/writes through the generic dispatchers. */
static const struct sysfs_ops md_sysfs_ops = {
	.show	= md_attr_show,
	.store	= md_attr_store,
};
/*
 * kobject type for an mddev: md_free() releases it, md_sysfs_ops dispatch
 * attribute I/O, and md_default_attrs are its default attributes.
 */
static struct kobj_type md_ktype = {
	.release	= md_free,
	.sysfs_ops	= &md_sysfs_ops,
	.default_attrs	= md_default_attrs,
};
5223 
/*
 * Major number used for partitionable md devices (see the MKDEV(mdp_major, ...)
 * use in autorun_devices()).  Starts at 0; presumably assigned during
 * initialisation elsewhere in this file -- TODO confirm.
 */
int mdp_major = 0;
5225 
5226 static void mddev_delayed_delete(struct work_struct *ws)
5227 {
5228 	struct mddev *mddev = container_of(ws, struct mddev, del_work);
5229 
5230 	sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
5231 	kobject_del(&mddev->kobj);
5232 	kobject_put(&mddev->kobj);
5233 }
5234 
/* Empty percpu_ref callback; passed to percpu_ref_init() for writes_pending. */
static void no_op(struct percpu_ref *r) {}
5236 
5237 int mddev_init_writes_pending(struct mddev *mddev)
5238 {
5239 	if (mddev->writes_pending.percpu_count_ptr)
5240 		return 0;
5241 	if (percpu_ref_init(&mddev->writes_pending, no_op, 0, GFP_KERNEL) < 0)
5242 		return -ENOMEM;
5243 	/* We want to start with the refcount at zero */
5244 	percpu_ref_put(&mddev->writes_pending);
5245 	return 0;
5246 }
5247 EXPORT_SYMBOL_GPL(mddev_init_writes_pending);
5248 
/*
 * Allocate the block-layer side of an md device: request queue, gendisk and
 * the "md" kobject below the disk's device.
 *
 * @dev:  device number, or 0 to pick an arbitrary minor (see below).
 * @name: disk name to use, or NULL to derive "md%d"/"md_d%d" from the unit.
 *
 * Returns 0 on success, -ENODEV if mddev_find() yields nothing, -EEXIST if
 * the mddev already has a gendisk or @name is already used by another
 * array, -ENOMEM if the queue or disk cannot be allocated.
 * The reference obtained from mddev_find() is dropped before returning.
 */
static int md_alloc(dev_t dev, char *name)
{
	/*
	 * If dev is zero, name is the name of a device to allocate with
	 * an arbitrary minor number.  It will be "md_???"
	 * If dev is non-zero it must be a device number with a MAJOR of
	 * MD_MAJOR or mdp_major.  In this case, if "name" is NULL, then
	 * the device is being created by opening a node in /dev.
	 * If "name" is not NULL, the device is being created by
	 * writing to /sys/module/md_mod/parameters/new_array.
	 */
	static DEFINE_MUTEX(disks_mutex);
	struct mddev *mddev = mddev_find(dev);
	struct gendisk *disk;
	int partitioned;
	int shift;
	int unit;
	int error;

	if (!mddev)
		return -ENODEV;

	/* Any major other than MD_MAJOR is treated as the partitionable flavour. */
	partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
	shift = partitioned ? MdpMinorShift : 0;
	unit = MINOR(mddev->unit) >> shift;

	/* wait for any previous instance of this device to be
	 * completely removed (mddev_delayed_delete).
	 */
	flush_workqueue(md_misc_wq);

	/* disks_mutex serialises the gendisk check and creation below. */
	mutex_lock(&disks_mutex);
	error = -EEXIST;
	if (mddev->gendisk)
		goto abort;

	if (name && !dev) {
		/* Need to ensure that 'name' is not a duplicate.
		 */
		struct mddev *mddev2;
		spin_lock(&all_mddevs_lock);

		list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
			if (mddev2->gendisk &&
			    strcmp(mddev2->gendisk->disk_name, name) == 0) {
				spin_unlock(&all_mddevs_lock);
				goto abort;
			}
		spin_unlock(&all_mddevs_lock);
	}
	if (name && dev)
		/*
		 * Creating /dev/mdNNN via "newarray", so adjust hold_active.
		 */
		mddev->hold_active = UNTIL_STOP;

	error = -ENOMEM;
	mddev->queue = blk_alloc_queue(GFP_KERNEL);
	if (!mddev->queue)
		goto abort;
	mddev->queue->queuedata = mddev;

	blk_queue_make_request(mddev->queue, md_make_request);
	blk_set_stacking_limits(&mddev->queue->limits);

	disk = alloc_disk(1 << shift);
	if (!disk) {
		/* Undo the queue allocation so mddev is left as we found it. */
		blk_cleanup_queue(mddev->queue);
		mddev->queue = NULL;
		goto abort;
	}
	disk->major = MAJOR(mddev->unit);
	disk->first_minor = unit << shift;
	/*
	 * No bound is applied to the copy here; add_named_array() rejects
	 * names of DISK_NAME_LEN or more before calling in.
	 */
	if (name)
		strcpy(disk->disk_name, name);
	else if (partitioned)
		sprintf(disk->disk_name, "md_d%d", unit);
	else
		sprintf(disk->disk_name, "md%d", unit);
	disk->fops = &md_fops;
	disk->private_data = mddev;
	disk->queue = mddev->queue;
	blk_queue_write_cache(mddev->queue, true, true);
	/* Allow extended partitions.  This makes the
	 * 'mdp' device redundant, but we can't really
	 * remove it now.
	 */
	disk->flags |= GENHD_FL_EXT_DEVT;
	mddev->gendisk = disk;
	/* As soon as we call add_disk(), another thread could get
	 * through to md_open, so make sure it doesn't get too far
	 */
	mutex_lock(&mddev->open_mutex);
	add_disk(disk);

	error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md");
	if (error) {
		/* This isn't expected to happen, but the result of
		 * kobject_add() has to be consumed; the failure is only
		 * logged and deliberately not propagated to the caller.
		 */
		pr_debug("md: cannot register %s/md - name in use\n",
			 disk->disk_name);
		error = 0;
	}
	/* kobj.sd is only set when the kobject actually made it into sysfs. */
	if (mddev->kobj.sd &&
	    sysfs_create_group(&mddev->kobj, &md_bitmap_group))
		pr_debug("pointless warning\n");
	mutex_unlock(&mddev->open_mutex);
 abort:
	mutex_unlock(&disks_mutex);
	if (!error && mddev->kobj.sd) {
		kobject_uevent(&mddev->kobj, KOBJ_ADD);
		mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
	}
	mddev_put(mddev);
	return error;
}
5366 
5367 static struct kobject *md_probe(dev_t dev, int *part, void *data)
5368 {
5369 	if (create_on_open)
5370 		md_alloc(dev, NULL);
5371 	return NULL;
5372 }
5373 
5374 static int add_named_array(const char *val, const struct kernel_param *kp)
5375 {
5376 	/*
5377 	 * val must be "md_*" or "mdNNN".
5378 	 * For "md_*" we allocate an array with a large free minor number, and
5379 	 * set the name to val.  val must not already be an active name.
5380 	 * For "mdNNN" we allocate an array with the minor number NNN
5381 	 * which must not already be in use.
5382 	 */
5383 	int len = strlen(val);
5384 	char buf[DISK_NAME_LEN];
5385 	unsigned long devnum;
5386 
5387 	while (len && val[len-1] == '\n')
5388 		len--;
5389 	if (len >= DISK_NAME_LEN)
5390 		return -E2BIG;
5391 	strlcpy(buf, val, len+1);
5392 	if (strncmp(buf, "md_", 3) == 0)
5393 		return md_alloc(0, buf);
5394 	if (strncmp(buf, "md", 2) == 0 &&
5395 	    isdigit(buf[2]) &&
5396 	    kstrtoul(buf+2, 10, &devnum) == 0 &&
5397 	    devnum <= MINORMASK)
5398 		return md_alloc(MKDEV(MD_MAJOR, devnum), NULL);
5399 
5400 	return -EINVAL;
5401 }
5402 
/*
 * Timer callback for mddev->safemode_timer: flag safemode, notify the
 * array_state sysfs node for externally managed arrays, and wake the
 * array's management thread.
 */
static void md_safemode_timeout(struct timer_list *t)
{
	struct mddev *mddev = from_timer(mddev, t, safemode_timer);

	mddev->safemode = 1;
	if (mddev->external)
		sysfs_notify_dirent_safe(mddev->sysfs_state);

	md_wakeup_thread(mddev->thread);
}
5413 
/* Copied into mddev->ok_start_degraded by md_run() each time an array is started. */
static int start_dirty_degraded;
5415 
5416 int md_run(struct mddev *mddev)
5417 {
5418 	int err;
5419 	struct md_rdev *rdev;
5420 	struct md_personality *pers;
5421 
5422 	if (list_empty(&mddev->disks))
5423 		/* cannot run an array with no devices.. */
5424 		return -EINVAL;
5425 
5426 	if (mddev->pers)
5427 		return -EBUSY;
5428 	/* Cannot run until previous stop completes properly */
5429 	if (mddev->sysfs_active)
5430 		return -EBUSY;
5431 
5432 	/*
5433 	 * Analyze all RAID superblock(s)
5434 	 */
5435 	if (!mddev->raid_disks) {
5436 		if (!mddev->persistent)
5437 			return -EINVAL;
5438 		analyze_sbs(mddev);
5439 	}
5440 
5441 	if (mddev->level != LEVEL_NONE)
5442 		request_module("md-level-%d", mddev->level);
5443 	else if (mddev->clevel[0])
5444 		request_module("md-%s", mddev->clevel);
5445 
5446 	/*
5447 	 * Drop all container device buffers, from now on
5448 	 * the only valid external interface is through the md
5449 	 * device.
5450 	 */
5451 	mddev->has_superblocks = false;
5452 	rdev_for_each(rdev, mddev) {
5453 		if (test_bit(Faulty, &rdev->flags))
5454 			continue;
5455 		sync_blockdev(rdev->bdev);
5456 		invalidate_bdev(rdev->bdev);
5457 		if (mddev->ro != 1 &&
5458 		    (bdev_read_only(rdev->bdev) ||
5459 		     bdev_read_only(rdev->meta_bdev))) {
5460 			mddev->ro = 1;
5461 			if (mddev->gendisk)
5462 				set_disk_ro(mddev->gendisk, 1);
5463 		}
5464 
5465 		if (rdev->sb_page)
5466 			mddev->has_superblocks = true;
5467 
5468 		/* perform some consistency tests on the device.
5469 		 * We don't want the data to overlap the metadata,
5470 		 * Internal Bitmap issues have been handled elsewhere.
5471 		 */
5472 		if (rdev->meta_bdev) {
5473 			/* Nothing to check */;
5474 		} else if (rdev->data_offset < rdev->sb_start) {
5475 			if (mddev->dev_sectors &&
5476 			    rdev->data_offset + mddev->dev_sectors
5477 			    > rdev->sb_start) {
5478 				pr_warn("md: %s: data overlaps metadata\n",
5479 					mdname(mddev));
5480 				return -EINVAL;
5481 			}
5482 		} else {
5483 			if (rdev->sb_start + rdev->sb_size/512
5484 			    > rdev->data_offset) {
5485 				pr_warn("md: %s: metadata overlaps data\n",
5486 					mdname(mddev));
5487 				return -EINVAL;
5488 			}
5489 		}
5490 		sysfs_notify_dirent_safe(rdev->sysfs_state);
5491 	}
5492 
5493 	if (!bioset_initialized(&mddev->bio_set)) {
5494 		err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
5495 		if (err)
5496 			return err;
5497 	}
5498 	if (!bioset_initialized(&mddev->sync_set)) {
5499 		err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
5500 		if (err)
5501 			return err;
5502 	}
5503 
5504 	spin_lock(&pers_lock);
5505 	pers = find_pers(mddev->level, mddev->clevel);
5506 	if (!pers || !try_module_get(pers->owner)) {
5507 		spin_unlock(&pers_lock);
5508 		if (mddev->level != LEVEL_NONE)
5509 			pr_warn("md: personality for level %d is not loaded!\n",
5510 				mddev->level);
5511 		else
5512 			pr_warn("md: personality for level %s is not loaded!\n",
5513 				mddev->clevel);
5514 		err = -EINVAL;
5515 		goto abort;
5516 	}
5517 	spin_unlock(&pers_lock);
5518 	if (mddev->level != pers->level) {
5519 		mddev->level = pers->level;
5520 		mddev->new_level = pers->level;
5521 	}
5522 	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
5523 
5524 	if (mddev->reshape_position != MaxSector &&
5525 	    pers->start_reshape == NULL) {
5526 		/* This personality cannot handle reshaping... */
5527 		module_put(pers->owner);
5528 		err = -EINVAL;
5529 		goto abort;
5530 	}
5531 
5532 	if (pers->sync_request) {
5533 		/* Warn if this is a potentially silly
5534 		 * configuration.
5535 		 */
5536 		char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
5537 		struct md_rdev *rdev2;
5538 		int warned = 0;
5539 
5540 		rdev_for_each(rdev, mddev)
5541 			rdev_for_each(rdev2, mddev) {
5542 				if (rdev < rdev2 &&
5543 				    rdev->bdev->bd_contains ==
5544 				    rdev2->bdev->bd_contains) {
5545 					pr_warn("%s: WARNING: %s appears to be on the same physical disk as %s.\n",
5546 						mdname(mddev),
5547 						bdevname(rdev->bdev,b),
5548 						bdevname(rdev2->bdev,b2));
5549 					warned = 1;
5550 				}
5551 			}
5552 
5553 		if (warned)
5554 			pr_warn("True protection against single-disk failure might be compromised.\n");
5555 	}
5556 
5557 	mddev->recovery = 0;
5558 	/* may be over-ridden by personality */
5559 	mddev->resync_max_sectors = mddev->dev_sectors;
5560 
5561 	mddev->ok_start_degraded = start_dirty_degraded;
5562 
5563 	if (start_readonly && mddev->ro == 0)
5564 		mddev->ro = 2; /* read-only, but switch on first write */
5565 
5566 	err = pers->run(mddev);
5567 	if (err)
5568 		pr_warn("md: pers->run() failed ...\n");
5569 	else if (pers->size(mddev, 0, 0) < mddev->array_sectors) {
5570 		WARN_ONCE(!mddev->external_size,
5571 			  "%s: default size too small, but 'external_size' not in effect?\n",
5572 			  __func__);
5573 		pr_warn("md: invalid array_size %llu > default size %llu\n",
5574 			(unsigned long long)mddev->array_sectors / 2,
5575 			(unsigned long long)pers->size(mddev, 0, 0) / 2);
5576 		err = -EINVAL;
5577 	}
5578 	if (err == 0 && pers->sync_request &&
5579 	    (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
5580 		struct bitmap *bitmap;
5581 
5582 		bitmap = md_bitmap_create(mddev, -1);
5583 		if (IS_ERR(bitmap)) {
5584 			err = PTR_ERR(bitmap);
5585 			pr_warn("%s: failed to create bitmap (%d)\n",
5586 				mdname(mddev), err);
5587 		} else
5588 			mddev->bitmap = bitmap;
5589 
5590 	}
5591 	if (err) {
5592 		mddev_detach(mddev);
5593 		if (mddev->private)
5594 			pers->free(mddev, mddev->private);
5595 		mddev->private = NULL;
5596 		module_put(pers->owner);
5597 		md_bitmap_destroy(mddev);
5598 		goto abort;
5599 	}
5600 	if (mddev->queue) {
5601 		bool nonrot = true;
5602 
5603 		rdev_for_each(rdev, mddev) {
5604 			if (rdev->raid_disk >= 0 &&
5605 			    !blk_queue_nonrot(bdev_get_queue(rdev->bdev))) {
5606 				nonrot = false;
5607 				break;
5608 			}
5609 		}
5610 		if (mddev->degraded)
5611 			nonrot = false;
5612 		if (nonrot)
5613 			blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue);
5614 		else
5615 			blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue);
5616 		mddev->queue->backing_dev_info->congested_data = mddev;
5617 		mddev->queue->backing_dev_info->congested_fn = md_congested;
5618 	}
5619 	if (pers->sync_request) {
5620 		if (mddev->kobj.sd &&
5621 		    sysfs_create_group(&mddev->kobj, &md_redundancy_group))
5622 			pr_warn("md: cannot register extra attributes for %s\n",
5623 				mdname(mddev));
5624 		mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
5625 	} else if (mddev->ro == 2) /* auto-readonly not meaningful */
5626 		mddev->ro = 0;
5627 
5628 	atomic_set(&mddev->max_corr_read_errors,
5629 		   MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
5630 	mddev->safemode = 0;
5631 	if (mddev_is_clustered(mddev))
5632 		mddev->safemode_delay = 0;
5633 	else
5634 		mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
5635 	mddev->in_sync = 1;
5636 	smp_wmb();
5637 	spin_lock(&mddev->lock);
5638 	mddev->pers = pers;
5639 	spin_unlock(&mddev->lock);
5640 	rdev_for_each(rdev, mddev)
5641 		if (rdev->raid_disk >= 0)
5642 			if (sysfs_link_rdev(mddev, rdev))
5643 				/* failure here is OK */;
5644 
5645 	if (mddev->degraded && !mddev->ro)
5646 		/* This ensures that recovering status is reported immediately
5647 		 * via sysfs - until a lack of spares is confirmed.
5648 		 */
5649 		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5650 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5651 
5652 	if (mddev->sb_flags)
5653 		md_update_sb(mddev, 0);
5654 
5655 	md_new_event(mddev);
5656 	sysfs_notify_dirent_safe(mddev->sysfs_state);
5657 	sysfs_notify_dirent_safe(mddev->sysfs_action);
5658 	sysfs_notify(&mddev->kobj, NULL, "degraded");
5659 	return 0;
5660 
5661 abort:
5662 	bioset_exit(&mddev->bio_set);
5663 	bioset_exit(&mddev->sync_set);
5664 	return err;
5665 }
5666 EXPORT_SYMBOL_GPL(md_run);
5667 
5668 static int do_md_run(struct mddev *mddev)
5669 {
5670 	int err;
5671 
5672 	err = md_run(mddev);
5673 	if (err)
5674 		goto out;
5675 	err = md_bitmap_load(mddev);
5676 	if (err) {
5677 		md_bitmap_destroy(mddev);
5678 		goto out;
5679 	}
5680 
5681 	if (mddev_is_clustered(mddev))
5682 		md_allow_write(mddev);
5683 
5684 	/* run start up tasks that require md_thread */
5685 	md_start(mddev);
5686 
5687 	md_wakeup_thread(mddev->thread);
5688 	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
5689 
5690 	set_capacity(mddev->gendisk, mddev->array_sectors);
5691 	revalidate_disk(mddev->gendisk);
5692 	mddev->changed = 1;
5693 	kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
5694 out:
5695 	return err;
5696 }
5697 
5698 int md_start(struct mddev *mddev)
5699 {
5700 	int ret = 0;
5701 
5702 	if (mddev->pers->start) {
5703 		set_bit(MD_RECOVERY_WAIT, &mddev->recovery);
5704 		md_wakeup_thread(mddev->thread);
5705 		ret = mddev->pers->start(mddev);
5706 		clear_bit(MD_RECOVERY_WAIT, &mddev->recovery);
5707 		md_wakeup_thread(mddev->sync_thread);
5708 	}
5709 	return ret;
5710 }
5711 EXPORT_SYMBOL_GPL(md_start);
5712 
5713 static int restart_array(struct mddev *mddev)
5714 {
5715 	struct gendisk *disk = mddev->gendisk;
5716 	struct md_rdev *rdev;
5717 	bool has_journal = false;
5718 	bool has_readonly = false;
5719 
5720 	/* Complain if it has no devices */
5721 	if (list_empty(&mddev->disks))
5722 		return -ENXIO;
5723 	if (!mddev->pers)
5724 		return -EINVAL;
5725 	if (!mddev->ro)
5726 		return -EBUSY;
5727 
5728 	rcu_read_lock();
5729 	rdev_for_each_rcu(rdev, mddev) {
5730 		if (test_bit(Journal, &rdev->flags) &&
5731 		    !test_bit(Faulty, &rdev->flags))
5732 			has_journal = true;
5733 		if (bdev_read_only(rdev->bdev))
5734 			has_readonly = true;
5735 	}
5736 	rcu_read_unlock();
5737 	if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal)
5738 		/* Don't restart rw with journal missing/faulty */
5739 			return -EINVAL;
5740 	if (has_readonly)
5741 		return -EROFS;
5742 
5743 	mddev->safemode = 0;
5744 	mddev->ro = 0;
5745 	set_disk_ro(disk, 0);
5746 	pr_debug("md: %s switched to read-write mode.\n", mdname(mddev));
5747 	/* Kick recovery or resync if necessary */
5748 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5749 	md_wakeup_thread(mddev->thread);
5750 	md_wakeup_thread(mddev->sync_thread);
5751 	sysfs_notify_dirent_safe(mddev->sysfs_state);
5752 	return 0;
5753 }
5754 
5755 static void md_clean(struct mddev *mddev)
5756 {
5757 	mddev->array_sectors = 0;
5758 	mddev->external_size = 0;
5759 	mddev->dev_sectors = 0;
5760 	mddev->raid_disks = 0;
5761 	mddev->recovery_cp = 0;
5762 	mddev->resync_min = 0;
5763 	mddev->resync_max = MaxSector;
5764 	mddev->reshape_position = MaxSector;
5765 	mddev->external = 0;
5766 	mddev->persistent = 0;
5767 	mddev->level = LEVEL_NONE;
5768 	mddev->clevel[0] = 0;
5769 	mddev->flags = 0;
5770 	mddev->sb_flags = 0;
5771 	mddev->ro = 0;
5772 	mddev->metadata_type[0] = 0;
5773 	mddev->chunk_sectors = 0;
5774 	mddev->ctime = mddev->utime = 0;
5775 	mddev->layout = 0;
5776 	mddev->max_disks = 0;
5777 	mddev->events = 0;
5778 	mddev->can_decrease_events = 0;
5779 	mddev->delta_disks = 0;
5780 	mddev->reshape_backwards = 0;
5781 	mddev->new_level = LEVEL_NONE;
5782 	mddev->new_layout = 0;
5783 	mddev->new_chunk_sectors = 0;
5784 	mddev->curr_resync = 0;
5785 	atomic64_set(&mddev->resync_mismatches, 0);
5786 	mddev->suspend_lo = mddev->suspend_hi = 0;
5787 	mddev->sync_speed_min = mddev->sync_speed_max = 0;
5788 	mddev->recovery = 0;
5789 	mddev->in_sync = 0;
5790 	mddev->changed = 0;
5791 	mddev->degraded = 0;
5792 	mddev->safemode = 0;
5793 	mddev->private = NULL;
5794 	mddev->cluster_info = NULL;
5795 	mddev->bitmap_info.offset = 0;
5796 	mddev->bitmap_info.default_offset = 0;
5797 	mddev->bitmap_info.default_space = 0;
5798 	mddev->bitmap_info.chunksize = 0;
5799 	mddev->bitmap_info.daemon_sleep = 0;
5800 	mddev->bitmap_info.max_write_behind = 0;
5801 	mddev->bitmap_info.nodes = 0;
5802 }
5803 
5804 static void __md_stop_writes(struct mddev *mddev)
5805 {
5806 	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5807 	flush_workqueue(md_misc_wq);
5808 	if (mddev->sync_thread) {
5809 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5810 		md_reap_sync_thread(mddev);
5811 	}
5812 
5813 	del_timer_sync(&mddev->safemode_timer);
5814 
5815 	if (mddev->pers && mddev->pers->quiesce) {
5816 		mddev->pers->quiesce(mddev, 1);
5817 		mddev->pers->quiesce(mddev, 0);
5818 	}
5819 	md_bitmap_flush(mddev);
5820 
5821 	if (mddev->ro == 0 &&
5822 	    ((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
5823 	     mddev->sb_flags)) {
5824 		/* mark array as shutdown cleanly */
5825 		if (!mddev_is_clustered(mddev))
5826 			mddev->in_sync = 1;
5827 		md_update_sb(mddev, 1);
5828 	}
5829 }
5830 
/*
 * Exported wrapper around __md_stop_writes() that takes the mddev lock
 * (non-interruptibly) for the duration of the call.
 */
void md_stop_writes(struct mddev *mddev)
{
	mddev_lock_nointr(mddev);
	__md_stop_writes(mddev);
	mddev_unlock(mddev);
}
EXPORT_SYMBOL_GPL(md_stop_writes);
5838 
5839 static void mddev_detach(struct mddev *mddev)
5840 {
5841 	md_bitmap_wait_behind_writes(mddev);
5842 	if (mddev->pers && mddev->pers->quiesce) {
5843 		mddev->pers->quiesce(mddev, 1);
5844 		mddev->pers->quiesce(mddev, 0);
5845 	}
5846 	md_unregister_thread(&mddev->thread);
5847 	if (mddev->queue)
5848 		blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
5849 }
5850 
/*
 * Tear down a running array's personality: destroy the bitmap, detach the
 * I/O machinery, unpublish mddev->pers and free the personality's private
 * data.  Dereferences mddev->pers unconditionally, so it must only be
 * called while a personality is set.
 */
static void __md_stop(struct mddev *mddev)
{
	/* Keep our own pointer: mddev->pers is cleared below but still needed. */
	struct md_personality *pers = mddev->pers;
	md_bitmap_destroy(mddev);
	mddev_detach(mddev);
	/* Ensure ->event_work is done */
	flush_workqueue(md_misc_wq);
	spin_lock(&mddev->lock);
	mddev->pers = NULL;
	spin_unlock(&mddev->lock);
	pers->free(mddev, mddev->private);
	mddev->private = NULL;
	/* Queue the redundancy attribute group for removal unless something else is already queued. */
	if (pers->sync_request && mddev->to_remove == NULL)
		mddev->to_remove = &md_redundancy_group;
	module_put(pers->owner);
	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
}
5868 
/*
 * Exported stop entry point: __md_stop() plus release of both bio sets.
 */
void md_stop(struct mddev *mddev)
{
	/* stop the array and free any attached data structures.
	 * This is called from dm-raid
	 */
	__md_stop(mddev);
	bioset_exit(&mddev->bio_set);
	bioset_exit(&mddev->sync_set);
}

EXPORT_SYMBOL_GPL(md_stop);
5880 
5881 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
5882 {
5883 	int err = 0;
5884 	int did_freeze = 0;
5885 
5886 	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
5887 		did_freeze = 1;
5888 		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5889 		md_wakeup_thread(mddev->thread);
5890 	}
5891 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5892 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5893 	if (mddev->sync_thread)
5894 		/* Thread might be blocked waiting for metadata update
5895 		 * which will now never happen */
5896 		wake_up_process(mddev->sync_thread->tsk);
5897 
5898 	if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
5899 		return -EBUSY;
5900 	mddev_unlock(mddev);
5901 	wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING,
5902 					  &mddev->recovery));
5903 	wait_event(mddev->sb_wait,
5904 		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
5905 	mddev_lock_nointr(mddev);
5906 
5907 	mutex_lock(&mddev->open_mutex);
5908 	if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
5909 	    mddev->sync_thread ||
5910 	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
5911 		pr_warn("md: %s still in use.\n",mdname(mddev));
5912 		if (did_freeze) {
5913 			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5914 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5915 			md_wakeup_thread(mddev->thread);
5916 		}
5917 		err = -EBUSY;
5918 		goto out;
5919 	}
5920 	if (mddev->pers) {
5921 		__md_stop_writes(mddev);
5922 
5923 		err  = -ENXIO;
5924 		if (mddev->ro==1)
5925 			goto out;
5926 		mddev->ro = 1;
5927 		set_disk_ro(mddev->gendisk, 1);
5928 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5929 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5930 		md_wakeup_thread(mddev->thread);
5931 		sysfs_notify_dirent_safe(mddev->sysfs_state);
5932 		err = 0;
5933 	}
5934 out:
5935 	mutex_unlock(&mddev->open_mutex);
5936 	return err;
5937 }
5938 
/*
 * Stop an array.
 *
 * mode:
 *   0 - completely stop and dis-assemble array
 *   2 - stop but do not disassemble array
 *
 * bdev: when non-NULL, one opener of the array is tolerated (the check is
 *       "openers > !!bdev").
 *
 * The mddev lock is dropped and retaken internally while waiting for the
 * sync thread to finish, so the caller is expected to hold it on entry.
 *
 * Returns 0 on success or -EBUSY if the array is still in use.
 */
static int do_md_stop(struct mddev *mddev, int mode,
		      struct block_device *bdev)
{
	struct gendisk *disk = mddev->gendisk;
	struct md_rdev *rdev;
	int did_freeze = 0;

	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
		did_freeze = 1;
		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	}
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
	if (mddev->sync_thread)
		/* Thread might be blocked waiting for metadata update
		 * which will now never happen */
		wake_up_process(mddev->sync_thread->tsk);

	/* Wait, without the lock, until the sync thread is gone. */
	mddev_unlock(mddev);
	wait_event(resync_wait, (mddev->sync_thread == NULL &&
				 !test_bit(MD_RECOVERY_RUNNING,
					   &mddev->recovery)));
	mddev_lock_nointr(mddev);

	mutex_lock(&mddev->open_mutex);
	if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
	    mddev->sysfs_active ||
	    mddev->sync_thread ||
	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
		pr_warn("md: %s still in use.\n",mdname(mddev));
		mutex_unlock(&mddev->open_mutex);
		/* Only undo the freeze if this call was the one that set it. */
		if (did_freeze) {
			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			md_wakeup_thread(mddev->thread);
		}
		return -EBUSY;
	}
	if (mddev->pers) {
		if (mddev->ro)
			set_disk_ro(disk, 0);

		__md_stop_writes(mddev);
		__md_stop(mddev);
		mddev->queue->backing_dev_info->congested_fn = NULL;

		/* tell userspace to handle 'inactive' */
		sysfs_notify_dirent_safe(mddev->sysfs_state);

		rdev_for_each(rdev, mddev)
			if (rdev->raid_disk >= 0)
				sysfs_unlink_rdev(mddev, rdev);

		set_capacity(disk, 0);
		mutex_unlock(&mddev->open_mutex);
		mddev->changed = 1;
		revalidate_disk(disk);

		if (mddev->ro)
			mddev->ro = 0;
	} else
		mutex_unlock(&mddev->open_mutex);
	/*
	 * Free resources if final stop
	 */
	if (mode == 0) {
		pr_info("md: %s stopped.\n", mdname(mddev));

		if (mddev->bitmap_info.file) {
			struct file *f = mddev->bitmap_info.file;
			/* Clear the pointer under the lock, drop the file reference outside it. */
			spin_lock(&mddev->lock);
			mddev->bitmap_info.file = NULL;
			spin_unlock(&mddev->lock);
			fput(f);
		}
		mddev->bitmap_info.offset = 0;

		export_array(mddev);

		md_clean(mddev);
		if (mddev->hold_active == UNTIL_STOP)
			mddev->hold_active = 0;
	}
	md_new_event(mddev);
	sysfs_notify_dirent_safe(mddev->sysfs_state);
	return 0;
}
6031 
6032 #ifndef MODULE
6033 static void autorun_array(struct mddev *mddev)
6034 {
6035 	struct md_rdev *rdev;
6036 	int err;
6037 
6038 	if (list_empty(&mddev->disks))
6039 		return;
6040 
6041 	pr_info("md: running: ");
6042 
6043 	rdev_for_each(rdev, mddev) {
6044 		char b[BDEVNAME_SIZE];
6045 		pr_cont("<%s>", bdevname(rdev->bdev,b));
6046 	}
6047 	pr_cont("\n");
6048 
6049 	err = do_md_run(mddev);
6050 	if (err) {
6051 		pr_warn("md: do_md_run() returned %d\n", err);
6052 		do_md_stop(mddev, 0, NULL);
6053 	}
6054 }
6055 
6056 /*
6057  * lets try to run arrays based on all disks that have arrived
6058  * until now. (those are in pending_raid_disks)
6059  *
6060  * the method: pick the first pending disk, collect all disks with
6061  * the same UUID, remove all from the pending list and put them into
6062  * the 'same_array' list. Then order this list based on superblock
6063  * update time (freshest comes first), kick out 'old' disks and
6064  * compare superblocks. If everything's fine then run it.
6065  *
6066  * If "unit" is allocated, then bump its reference count
6067  */
6068 static void autorun_devices(int part)
6069 {
6070 	struct md_rdev *rdev0, *rdev, *tmp;
6071 	struct mddev *mddev;
6072 	char b[BDEVNAME_SIZE];
6073 
6074 	pr_info("md: autorun ...\n");
6075 	while (!list_empty(&pending_raid_disks)) {
6076 		int unit;
6077 		dev_t dev;
6078 		LIST_HEAD(candidates);
6079 		rdev0 = list_entry(pending_raid_disks.next,
6080 					 struct md_rdev, same_set);
6081 
6082 		pr_debug("md: considering %s ...\n", bdevname(rdev0->bdev,b));
6083 		INIT_LIST_HEAD(&candidates);
6084 		rdev_for_each_list(rdev, tmp, &pending_raid_disks)
6085 			if (super_90_load(rdev, rdev0, 0) >= 0) {
6086 				pr_debug("md:  adding %s ...\n",
6087 					 bdevname(rdev->bdev,b));
6088 				list_move(&rdev->same_set, &candidates);
6089 			}
6090 		/*
6091 		 * now we have a set of devices, with all of them having
6092 		 * mostly sane superblocks. It's time to allocate the
6093 		 * mddev.
6094 		 */
6095 		if (part) {
6096 			dev = MKDEV(mdp_major,
6097 				    rdev0->preferred_minor << MdpMinorShift);
6098 			unit = MINOR(dev) >> MdpMinorShift;
6099 		} else {
6100 			dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
6101 			unit = MINOR(dev);
6102 		}
6103 		if (rdev0->preferred_minor != unit) {
6104 			pr_warn("md: unit number in %s is bad: %d\n",
6105 				bdevname(rdev0->bdev, b), rdev0->preferred_minor);
6106 			break;
6107 		}
6108 
6109 		md_probe(dev, NULL, NULL);
6110 		mddev = mddev_find(dev);
6111 		if (!mddev || !mddev->gendisk) {
6112 			if (mddev)
6113 				mddev_put(mddev);
6114 			break;
6115 		}
6116 		if (mddev_lock(mddev))
6117 			pr_warn("md: %s locked, cannot run\n", mdname(mddev));
6118 		else if (mddev->raid_disks || mddev->major_version
6119 			 || !list_empty(&mddev->disks)) {
6120 			pr_warn("md: %s already running, cannot run %s\n",
6121 				mdname(mddev), bdevname(rdev0->bdev,b));
6122 			mddev_unlock(mddev);
6123 		} else {
6124 			pr_debug("md: created %s\n", mdname(mddev));
6125 			mddev->persistent = 1;
6126 			rdev_for_each_list(rdev, tmp, &candidates) {
6127 				list_del_init(&rdev->same_set);
6128 				if (bind_rdev_to_array(rdev, mddev))
6129 					export_rdev(rdev);
6130 			}
6131 			autorun_array(mddev);
6132 			mddev_unlock(mddev);
6133 		}
6134 		/* on success, candidates will be empty, on error
6135 		 * it won't...
6136 		 */
6137 		rdev_for_each_list(rdev, tmp, &candidates) {
6138 			list_del_init(&rdev->same_set);
6139 			export_rdev(rdev);
6140 		}
6141 		mddev_put(mddev);
6142 	}
6143 	pr_info("md: ... autorun DONE.\n");
6144 }
6145 #endif /* !MODULE */
6146 
6147 static int get_version(void __user *arg)
6148 {
6149 	mdu_version_t ver;
6150 
6151 	ver.major = MD_MAJOR_VERSION;
6152 	ver.minor = MD_MINOR_VERSION;
6153 	ver.patchlevel = MD_PATCHLEVEL_VERSION;
6154 
6155 	if (copy_to_user(arg, &ver, sizeof(ver)))
6156 		return -EFAULT;
6157 
6158 	return 0;
6159 }
6160 
6161 static int get_array_info(struct mddev *mddev, void __user *arg)
6162 {
6163 	mdu_array_info_t info;
6164 	int nr,working,insync,failed,spare;
6165 	struct md_rdev *rdev;
6166 
6167 	nr = working = insync = failed = spare = 0;
6168 	rcu_read_lock();
6169 	rdev_for_each_rcu(rdev, mddev) {
6170 		nr++;
6171 		if (test_bit(Faulty, &rdev->flags))
6172 			failed++;
6173 		else {
6174 			working++;
6175 			if (test_bit(In_sync, &rdev->flags))
6176 				insync++;
6177 			else if (test_bit(Journal, &rdev->flags))
6178 				/* TODO: add journal count to md_u.h */
6179 				;
6180 			else
6181 				spare++;
6182 		}
6183 	}
6184 	rcu_read_unlock();
6185 
6186 	info.major_version = mddev->major_version;
6187 	info.minor_version = mddev->minor_version;
6188 	info.patch_version = MD_PATCHLEVEL_VERSION;
6189 	info.ctime         = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
6190 	info.level         = mddev->level;
6191 	info.size          = mddev->dev_sectors / 2;
6192 	if (info.size != mddev->dev_sectors / 2) /* overflow */
6193 		info.size = -1;
6194 	info.nr_disks      = nr;
6195 	info.raid_disks    = mddev->raid_disks;
6196 	info.md_minor      = mddev->md_minor;
6197 	info.not_persistent= !mddev->persistent;
6198 
6199 	info.utime         = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
6200 	info.state         = 0;
6201 	if (mddev->in_sync)
6202 		info.state = (1<<MD_SB_CLEAN);
6203 	if (mddev->bitmap && mddev->bitmap_info.offset)
6204 		info.state |= (1<<MD_SB_BITMAP_PRESENT);
6205 	if (mddev_is_clustered(mddev))
6206 		info.state |= (1<<MD_SB_CLUSTERED);
6207 	info.active_disks  = insync;
6208 	info.working_disks = working;
6209 	info.failed_disks  = failed;
6210 	info.spare_disks   = spare;
6211 
6212 	info.layout        = mddev->layout;
6213 	info.chunk_size    = mddev->chunk_sectors << 9;
6214 
6215 	if (copy_to_user(arg, &info, sizeof(info)))
6216 		return -EFAULT;
6217 
6218 	return 0;
6219 }
6220 
6221 static int get_bitmap_file(struct mddev *mddev, void __user * arg)
6222 {
6223 	mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
6224 	char *ptr;
6225 	int err;
6226 
6227 	file = kzalloc(sizeof(*file), GFP_NOIO);
6228 	if (!file)
6229 		return -ENOMEM;
6230 
6231 	err = 0;
6232 	spin_lock(&mddev->lock);
6233 	/* bitmap enabled */
6234 	if (mddev->bitmap_info.file) {
6235 		ptr = file_path(mddev->bitmap_info.file, file->pathname,
6236 				sizeof(file->pathname));
6237 		if (IS_ERR(ptr))
6238 			err = PTR_ERR(ptr);
6239 		else
6240 			memmove(file->pathname, ptr,
6241 				sizeof(file->pathname)-(ptr-file->pathname));
6242 	}
6243 	spin_unlock(&mddev->lock);
6244 
6245 	if (err == 0 &&
6246 	    copy_to_user(arg, file, sizeof(*file)))
6247 		err = -EFAULT;
6248 
6249 	kfree(file);
6250 	return err;
6251 }
6252 
6253 static int get_disk_info(struct mddev *mddev, void __user * arg)
6254 {
6255 	mdu_disk_info_t info;
6256 	struct md_rdev *rdev;
6257 
6258 	if (copy_from_user(&info, arg, sizeof(info)))
6259 		return -EFAULT;
6260 
6261 	rcu_read_lock();
6262 	rdev = md_find_rdev_nr_rcu(mddev, info.number);
6263 	if (rdev) {
6264 		info.major = MAJOR(rdev->bdev->bd_dev);
6265 		info.minor = MINOR(rdev->bdev->bd_dev);
6266 		info.raid_disk = rdev->raid_disk;
6267 		info.state = 0;
6268 		if (test_bit(Faulty, &rdev->flags))
6269 			info.state |= (1<<MD_DISK_FAULTY);
6270 		else if (test_bit(In_sync, &rdev->flags)) {
6271 			info.state |= (1<<MD_DISK_ACTIVE);
6272 			info.state |= (1<<MD_DISK_SYNC);
6273 		}
6274 		if (test_bit(Journal, &rdev->flags))
6275 			info.state |= (1<<MD_DISK_JOURNAL);
6276 		if (test_bit(WriteMostly, &rdev->flags))
6277 			info.state |= (1<<MD_DISK_WRITEMOSTLY);
6278 		if (test_bit(FailFast, &rdev->flags))
6279 			info.state |= (1<<MD_DISK_FAILFAST);
6280 	} else {
6281 		info.major = info.minor = 0;
6282 		info.raid_disk = -1;
6283 		info.state = (1<<MD_DISK_REMOVED);
6284 	}
6285 	rcu_read_unlock();
6286 
6287 	if (copy_to_user(arg, &info, sizeof(info)))
6288 		return -EFAULT;
6289 
6290 	return 0;
6291 }
6292 
/*
 * add_new_disk - attach the device described by @info to @mddev.
 *
 * Three situations are handled in turn:
 *  1. mddev->raid_disks is 0: the device is imported using the array's
 *     superblock version and, if other devices are already attached, its
 *     superblock is checked against the first one via ->load_super().
 *  2. mddev->pers is set (a personality is active): the device is imported,
 *     its flags are set up from info->state, and it is bound with
 *     raid_disk == -1 before being handed to add_bound_rdev() or to the
 *     md-cluster callbacks.
 *  3. Otherwise only major_version == 0 is accepted: the rdev is set up
 *     from the fields of @info and bound, unless MD_DISK_FAULTY is set in
 *     info->state, in which case nothing is done.
 *
 * Returns 0 on success or a negative errno.
 */
static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
{
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	struct md_rdev *rdev;
	dev_t dev = MKDEV(info->major,info->minor);

	/* A clustered array only accepts CLUSTER_ADD or CANDIDATE requests. */
	if (mddev_is_clustered(mddev) &&
		!(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) {
		pr_warn("%s: Cannot add to clustered mddev.\n",
			mdname(mddev));
		return -EINVAL;
	}

	/* major/minor must survive the round trip through dev_t */
	if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
		return -EOVERFLOW;

	if (!mddev->raid_disks) {
		int err;
		/* expecting a device which has a superblock */
		rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
		if (IS_ERR(rdev)) {
			pr_warn("md: md_import_device returned %ld\n",
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		if (!list_empty(&mddev->disks)) {
			/* compare against the first device already attached */
			struct md_rdev *rdev0
				= list_entry(mddev->disks.next,
					     struct md_rdev, same_set);
			err = super_types[mddev->major_version]
				.load_super(rdev, rdev0, mddev->minor_version);
			if (err < 0) {
				pr_warn("md: %s has different UUID to %s\n",
					bdevname(rdev->bdev,b),
					bdevname(rdev0->bdev,b2));
				export_rdev(rdev);
				return -EINVAL;
			}
		}
		err = bind_rdev_to_array(rdev, mddev);
		if (err)
			export_rdev(rdev);
		return err;
	}

	/*
	 * add_new_disk can be used once the array is assembled
	 * to add "hot spares".  They must already have a superblock
	 * written
	 */
	if (mddev->pers) {
		int err;
		if (!mddev->pers->hot_add_disk) {
			pr_warn("%s: personality does not support diskops!\n",
				mdname(mddev));
			return -EINVAL;
		}
		/* without persistent metadata, import with version -1/-1 */
		if (mddev->persistent)
			rdev = md_import_device(dev, mddev->major_version,
						mddev->minor_version);
		else
			rdev = md_import_device(dev, -1, -1);
		if (IS_ERR(rdev)) {
			pr_warn("md: md_import_device returned %ld\n",
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		/* set saved_raid_disk if appropriate */
		if (!mddev->persistent) {
			if (info->state & (1<<MD_DISK_SYNC)  &&
			    info->raid_disk < mddev->raid_disks) {
				rdev->raid_disk = info->raid_disk;
				set_bit(In_sync, &rdev->flags);
				clear_bit(Bitmap_sync, &rdev->flags);
			} else
				rdev->raid_disk = -1;
			rdev->saved_raid_disk = rdev->raid_disk;
		} else
			super_types[mddev->major_version].
				validate_super(mddev, rdev);
		if ((info->state & (1<<MD_DISK_SYNC)) &&
		     rdev->raid_disk != info->raid_disk) {
			/* This was a hot-add request, but events doesn't
			 * match, so reject it.
			 */
			export_rdev(rdev);
			return -EINVAL;
		}

		clear_bit(In_sync, &rdev->flags); /* just to be sure */
		/* WriteMostly and FailFast mirror the request's state bits */
		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
		else
			clear_bit(WriteMostly, &rdev->flags);
		if (info->state & (1<<MD_DISK_FAILFAST))
			set_bit(FailFast, &rdev->flags);
		else
			clear_bit(FailFast, &rdev->flags);

		if (info->state & (1<<MD_DISK_JOURNAL)) {
			struct md_rdev *rdev2;
			bool has_journal = false;

			/* make sure no existing journal disk */
			rdev_for_each(rdev2, mddev) {
				if (test_bit(Journal, &rdev2->flags)) {
					has_journal = true;
					break;
				}
			}
			/* a journal is also refused while a bitmap is present */
			if (has_journal || mddev->bitmap) {
				export_rdev(rdev);
				return -EBUSY;
			}
			set_bit(Journal, &rdev->flags);
		}
		/*
		 * check whether the device shows up in other nodes
		 */
		if (mddev_is_clustered(mddev)) {
			if (info->state & (1 << MD_DISK_CANDIDATE))
				set_bit(Candidate, &rdev->flags);
			else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
				/* --add initiated by this node */
				err = md_cluster_ops->add_new_disk(mddev, rdev);
				if (err) {
					export_rdev(rdev);
					return err;
				}
			}
		}

		/* always bound without a slot, whatever was set above */
		rdev->raid_disk = -1;
		err = bind_rdev_to_array(rdev, mddev);

		if (err)
			export_rdev(rdev);

		/*
		 * NOTE: when err is set here rdev has already been exported
		 * and is not touched again below.
		 */
		if (mddev_is_clustered(mddev)) {
			if (info->state & (1 << MD_DISK_CANDIDATE)) {
				if (!err) {
					err = md_cluster_ops->new_disk_ack(mddev,
						err == 0);
					if (err)
						md_kick_rdev_from_array(rdev);
				}
			} else {
				if (err)
					md_cluster_ops->add_new_disk_cancel(mddev);
				else
					err = add_bound_rdev(rdev);
			}

		} else if (!err)
			err = add_bound_rdev(rdev);

		return err;
	}

	/* otherwise, add_new_disk is only allowed
	 * for major_version==0 superblocks
	 */
	if (mddev->major_version != 0) {
		pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev));
		return -EINVAL;
	}

	/* a request flagged MD_DISK_FAULTY falls through and returns 0 */
	if (!(info->state & (1<<MD_DISK_FAULTY))) {
		int err;
		rdev = md_import_device(dev, -1, 0);
		if (IS_ERR(rdev)) {
			pr_warn("md: error, md_import_device() returned %ld\n",
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		rdev->desc_nr = info->number;
		if (info->raid_disk < mddev->raid_disks)
			rdev->raid_disk = info->raid_disk;
		else
			rdev->raid_disk = -1;

		if (rdev->raid_disk < mddev->raid_disks)
			if (info->state & (1<<MD_DISK_SYNC))
				set_bit(In_sync, &rdev->flags);

		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
		if (info->state & (1<<MD_DISK_FAILFAST))
			set_bit(FailFast, &rdev->flags);

		/*
		 * sb_start: device size in 512-byte units when there is no
		 * persistent superblock, calc_dev_sboffset() otherwise.
		 */
		if (!mddev->persistent) {
			pr_debug("md: nonpersistent superblock ...\n");
			rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
		} else
			rdev->sb_start = calc_dev_sboffset(rdev);
		rdev->sectors = rdev->sb_start;

		err = bind_rdev_to_array(rdev, mddev);
		if (err) {
			export_rdev(rdev);
			return err;
		}
	}

	return 0;
}
6499 
6500 static int hot_remove_disk(struct mddev *mddev, dev_t dev)
6501 {
6502 	char b[BDEVNAME_SIZE];
6503 	struct md_rdev *rdev;
6504 
6505 	if (!mddev->pers)
6506 		return -ENODEV;
6507 
6508 	rdev = find_rdev(mddev, dev);
6509 	if (!rdev)
6510 		return -ENXIO;
6511 
6512 	if (rdev->raid_disk < 0)
6513 		goto kick_rdev;
6514 
6515 	clear_bit(Blocked, &rdev->flags);
6516 	remove_and_add_spares(mddev, rdev);
6517 
6518 	if (rdev->raid_disk >= 0)
6519 		goto busy;
6520 
6521 kick_rdev:
6522 	if (mddev_is_clustered(mddev))
6523 		md_cluster_ops->remove_disk(mddev, rdev);
6524 
6525 	md_kick_rdev_from_array(rdev);
6526 	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6527 	if (mddev->thread)
6528 		md_wakeup_thread(mddev->thread);
6529 	else
6530 		md_update_sb(mddev, 1);
6531 	md_new_event(mddev);
6532 
6533 	return 0;
6534 busy:
6535 	pr_debug("md: cannot remove active disk %s from %s ...\n",
6536 		 bdevname(rdev->bdev,b), mdname(mddev));
6537 	return -EBUSY;
6538 }
6539 
6540 static int hot_add_disk(struct mddev *mddev, dev_t dev)
6541 {
6542 	char b[BDEVNAME_SIZE];
6543 	int err;
6544 	struct md_rdev *rdev;
6545 
6546 	if (!mddev->pers)
6547 		return -ENODEV;
6548 
6549 	if (mddev->major_version != 0) {
6550 		pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n",
6551 			mdname(mddev));
6552 		return -EINVAL;
6553 	}
6554 	if (!mddev->pers->hot_add_disk) {
6555 		pr_warn("%s: personality does not support diskops!\n",
6556 			mdname(mddev));
6557 		return -EINVAL;
6558 	}
6559 
6560 	rdev = md_import_device(dev, -1, 0);
6561 	if (IS_ERR(rdev)) {
6562 		pr_warn("md: error, md_import_device() returned %ld\n",
6563 			PTR_ERR(rdev));
6564 		return -EINVAL;
6565 	}
6566 
6567 	if (mddev->persistent)
6568 		rdev->sb_start = calc_dev_sboffset(rdev);
6569 	else
6570 		rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
6571 
6572 	rdev->sectors = rdev->sb_start;
6573 
6574 	if (test_bit(Faulty, &rdev->flags)) {
6575 		pr_warn("md: can not hot-add faulty %s disk to %s!\n",
6576 			bdevname(rdev->bdev,b), mdname(mddev));
6577 		err = -EINVAL;
6578 		goto abort_export;
6579 	}
6580 
6581 	clear_bit(In_sync, &rdev->flags);
6582 	rdev->desc_nr = -1;
6583 	rdev->saved_raid_disk = -1;
6584 	err = bind_rdev_to_array(rdev, mddev);
6585 	if (err)
6586 		goto abort_export;
6587 
6588 	/*
6589 	 * The rest should better be atomic, we can have disk failures
6590 	 * noticed in interrupt contexts ...
6591 	 */
6592 
6593 	rdev->raid_disk = -1;
6594 
6595 	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6596 	if (!mddev->thread)
6597 		md_update_sb(mddev, 1);
6598 	/*
6599 	 * Kick recovery, maybe this spare has to be added to the
6600 	 * array immediately.
6601 	 */
6602 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6603 	md_wakeup_thread(mddev->thread);
6604 	md_new_event(mddev);
6605 	return 0;
6606 
6607 abort_export:
6608 	export_rdev(rdev);
6609 	return err;
6610 }
6611 
6612 static int set_bitmap_file(struct mddev *mddev, int fd)
6613 {
6614 	int err = 0;
6615 
6616 	if (mddev->pers) {
6617 		if (!mddev->pers->quiesce || !mddev->thread)
6618 			return -EBUSY;
6619 		if (mddev->recovery || mddev->sync_thread)
6620 			return -EBUSY;
6621 		/* we should be able to change the bitmap.. */
6622 	}
6623 
6624 	if (fd >= 0) {
6625 		struct inode *inode;
6626 		struct file *f;
6627 
6628 		if (mddev->bitmap || mddev->bitmap_info.file)
6629 			return -EEXIST; /* cannot add when bitmap is present */
6630 		f = fget(fd);
6631 
6632 		if (f == NULL) {
6633 			pr_warn("%s: error: failed to get bitmap file\n",
6634 				mdname(mddev));
6635 			return -EBADF;
6636 		}
6637 
6638 		inode = f->f_mapping->host;
6639 		if (!S_ISREG(inode->i_mode)) {
6640 			pr_warn("%s: error: bitmap file must be a regular file\n",
6641 				mdname(mddev));
6642 			err = -EBADF;
6643 		} else if (!(f->f_mode & FMODE_WRITE)) {
6644 			pr_warn("%s: error: bitmap file must open for write\n",
6645 				mdname(mddev));
6646 			err = -EBADF;
6647 		} else if (atomic_read(&inode->i_writecount) != 1) {
6648 			pr_warn("%s: error: bitmap file is already in use\n",
6649 				mdname(mddev));
6650 			err = -EBUSY;
6651 		}
6652 		if (err) {
6653 			fput(f);
6654 			return err;
6655 		}
6656 		mddev->bitmap_info.file = f;
6657 		mddev->bitmap_info.offset = 0; /* file overrides offset */
6658 	} else if (mddev->bitmap == NULL)
6659 		return -ENOENT; /* cannot remove what isn't there */
6660 	err = 0;
6661 	if (mddev->pers) {
6662 		if (fd >= 0) {
6663 			struct bitmap *bitmap;
6664 
6665 			bitmap = md_bitmap_create(mddev, -1);
6666 			mddev_suspend(mddev);
6667 			if (!IS_ERR(bitmap)) {
6668 				mddev->bitmap = bitmap;
6669 				err = md_bitmap_load(mddev);
6670 			} else
6671 				err = PTR_ERR(bitmap);
6672 			if (err) {
6673 				md_bitmap_destroy(mddev);
6674 				fd = -1;
6675 			}
6676 			mddev_resume(mddev);
6677 		} else if (fd < 0) {
6678 			mddev_suspend(mddev);
6679 			md_bitmap_destroy(mddev);
6680 			mddev_resume(mddev);
6681 		}
6682 	}
6683 	if (fd < 0) {
6684 		struct file *f = mddev->bitmap_info.file;
6685 		if (f) {
6686 			spin_lock(&mddev->lock);
6687 			mddev->bitmap_info.file = NULL;
6688 			spin_unlock(&mddev->lock);
6689 			fput(f);
6690 		}
6691 	}
6692 
6693 	return err;
6694 }
6695 
6696 /*
6697  * set_array_info is used two different ways
6698  * The original usage is when creating a new array.
6699  * In this usage, raid_disks is > 0 and it together with
6700  *  level, size, not_persistent,layout,chunksize determine the
6701  *  shape of the array.
6702  *  This will always create an array with a type-0.90.0 superblock.
6703  * The newer usage is when assembling an array.
6704  *  In this case raid_disks will be 0, and the major_version field is
6705  *  use to determine which style super-blocks are to be found on the devices.
6706  *  The minor and patch _version numbers are also kept incase the
6707  *  super_block handler wishes to interpret them.
6708  */
6709 static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
6710 {
6711 
6712 	if (info->raid_disks == 0) {
6713 		/* just setting version number for superblock loading */
6714 		if (info->major_version < 0 ||
6715 		    info->major_version >= ARRAY_SIZE(super_types) ||
6716 		    super_types[info->major_version].name == NULL) {
6717 			/* maybe try to auto-load a module? */
6718 			pr_warn("md: superblock version %d not known\n",
6719 				info->major_version);
6720 			return -EINVAL;
6721 		}
6722 		mddev->major_version = info->major_version;
6723 		mddev->minor_version = info->minor_version;
6724 		mddev->patch_version = info->patch_version;
6725 		mddev->persistent = !info->not_persistent;
6726 		/* ensure mddev_put doesn't delete this now that there
6727 		 * is some minimal configuration.
6728 		 */
6729 		mddev->ctime         = ktime_get_real_seconds();
6730 		return 0;
6731 	}
6732 	mddev->major_version = MD_MAJOR_VERSION;
6733 	mddev->minor_version = MD_MINOR_VERSION;
6734 	mddev->patch_version = MD_PATCHLEVEL_VERSION;
6735 	mddev->ctime         = ktime_get_real_seconds();
6736 
6737 	mddev->level         = info->level;
6738 	mddev->clevel[0]     = 0;
6739 	mddev->dev_sectors   = 2 * (sector_t)info->size;
6740 	mddev->raid_disks    = info->raid_disks;
6741 	/* don't set md_minor, it is determined by which /dev/md* was
6742 	 * openned
6743 	 */
6744 	if (info->state & (1<<MD_SB_CLEAN))
6745 		mddev->recovery_cp = MaxSector;
6746 	else
6747 		mddev->recovery_cp = 0;
6748 	mddev->persistent    = ! info->not_persistent;
6749 	mddev->external	     = 0;
6750 
6751 	mddev->layout        = info->layout;
6752 	mddev->chunk_sectors = info->chunk_size >> 9;
6753 
6754 	if (mddev->persistent) {
6755 		mddev->max_disks = MD_SB_DISKS;
6756 		mddev->flags = 0;
6757 		mddev->sb_flags = 0;
6758 	}
6759 	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6760 
6761 	mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
6762 	mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
6763 	mddev->bitmap_info.offset = 0;
6764 
6765 	mddev->reshape_position = MaxSector;
6766 
6767 	/*
6768 	 * Generate a 128 bit UUID
6769 	 */
6770 	get_random_bytes(mddev->uuid, 16);
6771 
6772 	mddev->new_level = mddev->level;
6773 	mddev->new_chunk_sectors = mddev->chunk_sectors;
6774 	mddev->new_layout = mddev->layout;
6775 	mddev->delta_disks = 0;
6776 	mddev->reshape_backwards = 0;
6777 
6778 	return 0;
6779 }
6780 
6781 void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
6782 {
6783 	lockdep_assert_held(&mddev->reconfig_mutex);
6784 
6785 	if (mddev->external_size)
6786 		return;
6787 
6788 	mddev->array_sectors = array_sectors;
6789 }
6790 EXPORT_SYMBOL(md_set_array_sectors);
6791 
6792 static int update_size(struct mddev *mddev, sector_t num_sectors)
6793 {
6794 	struct md_rdev *rdev;
6795 	int rv;
6796 	int fit = (num_sectors == 0);
6797 	sector_t old_dev_sectors = mddev->dev_sectors;
6798 
6799 	if (mddev->pers->resize == NULL)
6800 		return -EINVAL;
6801 	/* The "num_sectors" is the number of sectors of each device that
6802 	 * is used.  This can only make sense for arrays with redundancy.
6803 	 * linear and raid0 always use whatever space is available. We can only
6804 	 * consider changing this number if no resync or reconstruction is
6805 	 * happening, and if the new size is acceptable. It must fit before the
6806 	 * sb_start or, if that is <data_offset, it must fit before the size
6807 	 * of each device.  If num_sectors is zero, we find the largest size
6808 	 * that fits.
6809 	 */
6810 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
6811 	    mddev->sync_thread)
6812 		return -EBUSY;
6813 	if (mddev->ro)
6814 		return -EROFS;
6815 
6816 	rdev_for_each(rdev, mddev) {
6817 		sector_t avail = rdev->sectors;
6818 
6819 		if (fit && (num_sectors == 0 || num_sectors > avail))
6820 			num_sectors = avail;
6821 		if (avail < num_sectors)
6822 			return -ENOSPC;
6823 	}
6824 	rv = mddev->pers->resize(mddev, num_sectors);
6825 	if (!rv) {
6826 		if (mddev_is_clustered(mddev))
6827 			md_cluster_ops->update_size(mddev, old_dev_sectors);
6828 		else if (mddev->queue) {
6829 			set_capacity(mddev->gendisk, mddev->array_sectors);
6830 			revalidate_disk(mddev->gendisk);
6831 		}
6832 	}
6833 	return rv;
6834 }
6835 
6836 static int update_raid_disks(struct mddev *mddev, int raid_disks)
6837 {
6838 	int rv;
6839 	struct md_rdev *rdev;
6840 	/* change the number of raid disks */
6841 	if (mddev->pers->check_reshape == NULL)
6842 		return -EINVAL;
6843 	if (mddev->ro)
6844 		return -EROFS;
6845 	if (raid_disks <= 0 ||
6846 	    (mddev->max_disks && raid_disks >= mddev->max_disks))
6847 		return -EINVAL;
6848 	if (mddev->sync_thread ||
6849 	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
6850 	    mddev->reshape_position != MaxSector)
6851 		return -EBUSY;
6852 
6853 	rdev_for_each(rdev, mddev) {
6854 		if (mddev->raid_disks < raid_disks &&
6855 		    rdev->data_offset < rdev->new_data_offset)
6856 			return -EINVAL;
6857 		if (mddev->raid_disks > raid_disks &&
6858 		    rdev->data_offset > rdev->new_data_offset)
6859 			return -EINVAL;
6860 	}
6861 
6862 	mddev->delta_disks = raid_disks - mddev->raid_disks;
6863 	if (mddev->delta_disks < 0)
6864 		mddev->reshape_backwards = 1;
6865 	else if (mddev->delta_disks > 0)
6866 		mddev->reshape_backwards = 0;
6867 
6868 	rv = mddev->pers->check_reshape(mddev);
6869 	if (rv < 0) {
6870 		mddev->delta_disks = 0;
6871 		mddev->reshape_backwards = 0;
6872 	}
6873 	return rv;
6874 }
6875 
/*
 * update_array_info is used to change the configuration of an
 * on-line array.
 * The version, ctime, level, size, raid_disks, not_persistent, layout and
 * chunk_size fields in the info are checked against the array.
 * Any differences that cannot be handled will cause an error.
 * Only one change can be managed at a time: a request that differs from the
 * array in more than one of size, raid_disks, layout and the
 * MD_SB_BITMAP_PRESENT state bit is rejected with -EINVAL, and a request
 * that differs in none of them returns 0 without doing anything.
 *
 * Returns 0 on success or a negative errno.
 */
static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
{
	int rv = 0;
	int cnt = 0;	/* number of requested changes */
	int state = 0;

	/* calculate expected state,ignoring low bits */
	if (mddev->bitmap && mddev->bitmap_info.offset)
		state |= (1 << MD_SB_BITMAP_PRESENT);

	/* These fields must match the array exactly. */
	if (mddev->major_version != info->major_version ||
	    mddev->minor_version != info->minor_version ||
/*	    mddev->patch_version != info->patch_version || */
	    mddev->ctime         != info->ctime         ||
	    mddev->level         != info->level         ||
/*	    mddev->layout        != info->layout        || */
	    mddev->persistent	 != !info->not_persistent ||
	    mddev->chunk_sectors != info->chunk_size >> 9 ||
	    /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
	    ((state^info->state) & 0xfffffe00)
		)
		return -EINVAL;
	/* Check there is only one change */
	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
		cnt++;
	if (mddev->raid_disks != info->raid_disks)
		cnt++;
	if (mddev->layout != info->layout)
		cnt++;
	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
		cnt++;
	if (cnt == 0)
		return 0;
	if (cnt > 1)
		return -EINVAL;

	if (mddev->layout != info->layout) {
		/* Change layout
		 * we don't need to do anything at the md level, the
		 * personality will take care of it all.
		 */
		if (mddev->pers->check_reshape == NULL)
			return -EINVAL;
		else {
			mddev->new_layout = info->layout;
			rv = mddev->pers->check_reshape(mddev);
			/* on failure, fall back to the current layout */
			if (rv)
				mddev->new_layout = mddev->layout;
			return rv;
		}
	}
	/* info->size is halved sectors; update_size() takes sectors */
	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
		rv = update_size(mddev, (sector_t)info->size * 2);

	if (mddev->raid_disks    != info->raid_disks)
		rv = update_raid_disks(mddev, info->raid_disks);

	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
		/* same preconditions as set_bitmap_file(), but -EINVAL here */
		if (mddev->pers->quiesce == NULL || mddev->thread == NULL) {
			rv = -EINVAL;
			goto err;
		}
		if (mddev->recovery || mddev->sync_thread) {
			rv = -EBUSY;
			goto err;
		}
		if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
			struct bitmap *bitmap;
			/* add the bitmap */
			if (mddev->bitmap) {
				rv = -EEXIST;
				goto err;
			}
			if (mddev->bitmap_info.default_offset == 0) {
				rv = -EINVAL;
				goto err;
			}
			mddev->bitmap_info.offset =
				mddev->bitmap_info.default_offset;
			mddev->bitmap_info.space =
				mddev->bitmap_info.default_space;
			/* the bitmap is created before the array is suspended */
			bitmap = md_bitmap_create(mddev, -1);
			mddev_suspend(mddev);
			if (!IS_ERR(bitmap)) {
				mddev->bitmap = bitmap;
				rv = md_bitmap_load(mddev);
			} else
				rv = PTR_ERR(bitmap);
			if (rv)
				md_bitmap_destroy(mddev);
			mddev_resume(mddev);
		} else {
			/* remove the bitmap */
			if (!mddev->bitmap) {
				rv = -ENOENT;
				goto err;
			}
			/* a file-backed bitmap is not removed through this path */
			if (mddev->bitmap->storage.file) {
				rv = -EINVAL;
				goto err;
			}
			if (mddev->bitmap_info.nodes) {
				/* hold PW on all the bitmap lock */
				if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) {
					pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n");
					rv = -EPERM;
					md_cluster_ops->unlock_all_bitmaps(mddev);
					goto err;
				}

				mddev->bitmap_info.nodes = 0;
				md_cluster_ops->leave(mddev);
			}
			mddev_suspend(mddev);
			md_bitmap_destroy(mddev);
			mddev_resume(mddev);
			mddev->bitmap_info.offset = 0;
		}
	}
	/*
	 * NOTE(review): reached even when update_size() or
	 * update_raid_disks() above returned an error in rv; the superblock
	 * is only skipped on the 'goto err' paths.
	 */
	md_update_sb(mddev, 1);
	return rv;
err:
	return rv;
}
7008 
7009 static int set_disk_faulty(struct mddev *mddev, dev_t dev)
7010 {
7011 	struct md_rdev *rdev;
7012 	int err = 0;
7013 
7014 	if (mddev->pers == NULL)
7015 		return -ENODEV;
7016 
7017 	rcu_read_lock();
7018 	rdev = md_find_rdev_rcu(mddev, dev);
7019 	if (!rdev)
7020 		err =  -ENODEV;
7021 	else {
7022 		md_error(mddev, rdev);
7023 		if (!test_bit(Faulty, &rdev->flags))
7024 			err = -EBUSY;
7025 	}
7026 	rcu_read_unlock();
7027 	return err;
7028 }
7029 
7030 /*
7031  * We have a problem here : there is no easy way to give a CHS
7032  * virtual geometry. We currently pretend that we have a 2 heads
7033  * 4 sectors (with a BIG number of cylinders...). This drives
7034  * dosfs just mad... ;-)
7035  */
7036 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
7037 {
7038 	struct mddev *mddev = bdev->bd_disk->private_data;
7039 
7040 	geo->heads = 2;
7041 	geo->sectors = 4;
7042 	geo->cylinders = mddev->array_sectors / 8;
7043 	return 0;
7044 }
7045 
7046 static inline bool md_ioctl_valid(unsigned int cmd)
7047 {
7048 	switch (cmd) {
7049 	case ADD_NEW_DISK:
7050 	case BLKROSET:
7051 	case GET_ARRAY_INFO:
7052 	case GET_BITMAP_FILE:
7053 	case GET_DISK_INFO:
7054 	case HOT_ADD_DISK:
7055 	case HOT_REMOVE_DISK:
7056 	case RAID_AUTORUN:
7057 	case RAID_VERSION:
7058 	case RESTART_ARRAY_RW:
7059 	case RUN_ARRAY:
7060 	case SET_ARRAY_INFO:
7061 	case SET_BITMAP_FILE:
7062 	case SET_DISK_FAULTY:
7063 	case STOP_ARRAY:
7064 	case STOP_ARRAY_RO:
7065 	case CLUSTERED_DISK_NACK:
7066 		return true;
7067 	default:
7068 		return false;
7069 	}
7070 }
7071 
7072 static int md_ioctl(struct block_device *bdev, fmode_t mode,
7073 			unsigned int cmd, unsigned long arg)
7074 {
7075 	int err = 0;
7076 	void __user *argp = (void __user *)arg;
7077 	struct mddev *mddev = NULL;
7078 	int ro;
7079 	bool did_set_md_closing = false;
7080 
7081 	if (!md_ioctl_valid(cmd))
7082 		return -ENOTTY;
7083 
7084 	switch (cmd) {
7085 	case RAID_VERSION:
7086 	case GET_ARRAY_INFO:
7087 	case GET_DISK_INFO:
7088 		break;
7089 	default:
7090 		if (!capable(CAP_SYS_ADMIN))
7091 			return -EACCES;
7092 	}
7093 
7094 	/*
7095 	 * Commands dealing with the RAID driver but not any
7096 	 * particular array:
7097 	 */
7098 	switch (cmd) {
7099 	case RAID_VERSION:
7100 		err = get_version(argp);
7101 		goto out;
7102 
7103 #ifndef MODULE
7104 	case RAID_AUTORUN:
7105 		err = 0;
7106 		autostart_arrays(arg);
7107 		goto out;
7108 #endif
7109 	default:;
7110 	}
7111 
7112 	/*
7113 	 * Commands creating/starting a new array:
7114 	 */
7115 
7116 	mddev = bdev->bd_disk->private_data;
7117 
7118 	if (!mddev) {
7119 		BUG();
7120 		goto out;
7121 	}
7122 
	/* Some actions do not require the mutex */
7124 	switch (cmd) {
7125 	case GET_ARRAY_INFO:
7126 		if (!mddev->raid_disks && !mddev->external)
7127 			err = -ENODEV;
7128 		else
7129 			err = get_array_info(mddev, argp);
7130 		goto out;
7131 
7132 	case GET_DISK_INFO:
7133 		if (!mddev->raid_disks && !mddev->external)
7134 			err = -ENODEV;
7135 		else
7136 			err = get_disk_info(mddev, argp);
7137 		goto out;
7138 
7139 	case SET_DISK_FAULTY:
7140 		err = set_disk_faulty(mddev, new_decode_dev(arg));
7141 		goto out;
7142 
7143 	case GET_BITMAP_FILE:
7144 		err = get_bitmap_file(mddev, argp);
7145 		goto out;
7146 
7147 	}
7148 
7149 	if (cmd == ADD_NEW_DISK)
7150 		/* need to ensure md_delayed_delete() has completed */
7151 		flush_workqueue(md_misc_wq);
7152 
7153 	if (cmd == HOT_REMOVE_DISK)
7154 		/* need to ensure recovery thread has run */
7155 		wait_event_interruptible_timeout(mddev->sb_wait,
7156 						 !test_bit(MD_RECOVERY_NEEDED,
7157 							   &mddev->recovery),
7158 						 msecs_to_jiffies(5000));
7159 	if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
7160 		/* Need to flush page cache, and ensure no-one else opens
7161 		 * and writes
7162 		 */
7163 		mutex_lock(&mddev->open_mutex);
7164 		if (mddev->pers && atomic_read(&mddev->openers) > 1) {
7165 			mutex_unlock(&mddev->open_mutex);
7166 			err = -EBUSY;
7167 			goto out;
7168 		}
7169 		WARN_ON_ONCE(test_bit(MD_CLOSING, &mddev->flags));
7170 		set_bit(MD_CLOSING, &mddev->flags);
7171 		did_set_md_closing = true;
7172 		mutex_unlock(&mddev->open_mutex);
7173 		sync_blockdev(bdev);
7174 	}
7175 	err = mddev_lock(mddev);
7176 	if (err) {
7177 		pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n",
7178 			 err, cmd);
7179 		goto out;
7180 	}
7181 
7182 	if (cmd == SET_ARRAY_INFO) {
7183 		mdu_array_info_t info;
7184 		if (!arg)
7185 			memset(&info, 0, sizeof(info));
7186 		else if (copy_from_user(&info, argp, sizeof(info))) {
7187 			err = -EFAULT;
7188 			goto unlock;
7189 		}
7190 		if (mddev->pers) {
7191 			err = update_array_info(mddev, &info);
7192 			if (err) {
7193 				pr_warn("md: couldn't update array info. %d\n", err);
7194 				goto unlock;
7195 			}
7196 			goto unlock;
7197 		}
7198 		if (!list_empty(&mddev->disks)) {
7199 			pr_warn("md: array %s already has disks!\n", mdname(mddev));
7200 			err = -EBUSY;
7201 			goto unlock;
7202 		}
7203 		if (mddev->raid_disks) {
7204 			pr_warn("md: array %s already initialised!\n", mdname(mddev));
7205 			err = -EBUSY;
7206 			goto unlock;
7207 		}
7208 		err = set_array_info(mddev, &info);
7209 		if (err) {
7210 			pr_warn("md: couldn't set array info. %d\n", err);
7211 			goto unlock;
7212 		}
7213 		goto unlock;
7214 	}
7215 
7216 	/*
7217 	 * Commands querying/configuring an existing array:
7218 	 */
7219 	/* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
7220 	 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
7221 	if ((!mddev->raid_disks && !mddev->external)
7222 	    && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
7223 	    && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
7224 	    && cmd != GET_BITMAP_FILE) {
7225 		err = -ENODEV;
7226 		goto unlock;
7227 	}
7228 
7229 	/*
7230 	 * Commands even a read-only array can execute:
7231 	 */
7232 	switch (cmd) {
7233 	case RESTART_ARRAY_RW:
7234 		err = restart_array(mddev);
7235 		goto unlock;
7236 
7237 	case STOP_ARRAY:
7238 		err = do_md_stop(mddev, 0, bdev);
7239 		goto unlock;
7240 
7241 	case STOP_ARRAY_RO:
7242 		err = md_set_readonly(mddev, bdev);
7243 		goto unlock;
7244 
7245 	case HOT_REMOVE_DISK:
7246 		err = hot_remove_disk(mddev, new_decode_dev(arg));
7247 		goto unlock;
7248 
7249 	case ADD_NEW_DISK:
7250 		/* We can support ADD_NEW_DISK on read-only arrays
7251 		 * only if we are re-adding a preexisting device.
7252 		 * So require mddev->pers and MD_DISK_SYNC.
7253 		 */
7254 		if (mddev->pers) {
7255 			mdu_disk_info_t info;
7256 			if (copy_from_user(&info, argp, sizeof(info)))
7257 				err = -EFAULT;
7258 			else if (!(info.state & (1<<MD_DISK_SYNC)))
7259 				/* Need to clear read-only for this */
7260 				break;
7261 			else
7262 				err = add_new_disk(mddev, &info);
7263 			goto unlock;
7264 		}
7265 		break;
7266 
7267 	case BLKROSET:
7268 		if (get_user(ro, (int __user *)(arg))) {
7269 			err = -EFAULT;
7270 			goto unlock;
7271 		}
7272 		err = -EINVAL;
7273 
7274 		/* if the bdev is going readonly the value of mddev->ro
7275 		 * does not matter, no writes are coming
7276 		 */
7277 		if (ro)
7278 			goto unlock;
7279 
7280 		/* are we are already prepared for writes? */
7281 		if (mddev->ro != 1)
7282 			goto unlock;
7283 
7284 		/* transitioning to readauto need only happen for
7285 		 * arrays that call md_write_start
7286 		 */
7287 		if (mddev->pers) {
7288 			err = restart_array(mddev);
7289 			if (err == 0) {
7290 				mddev->ro = 2;
7291 				set_disk_ro(mddev->gendisk, 0);
7292 			}
7293 		}
7294 		goto unlock;
7295 	}
7296 
7297 	/*
7298 	 * The remaining ioctls are changing the state of the
7299 	 * superblock, so we do not allow them on read-only arrays.
7300 	 */
7301 	if (mddev->ro && mddev->pers) {
7302 		if (mddev->ro == 2) {
7303 			mddev->ro = 0;
7304 			sysfs_notify_dirent_safe(mddev->sysfs_state);
7305 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7306 			/* mddev_unlock will wake thread */
7307 			/* If a device failed while we were read-only, we
7308 			 * need to make sure the metadata is updated now.
7309 			 */
7310 			if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) {
7311 				mddev_unlock(mddev);
7312 				wait_event(mddev->sb_wait,
7313 					   !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) &&
7314 					   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
7315 				mddev_lock_nointr(mddev);
7316 			}
7317 		} else {
7318 			err = -EROFS;
7319 			goto unlock;
7320 		}
7321 	}
7322 
7323 	switch (cmd) {
7324 	case ADD_NEW_DISK:
7325 	{
7326 		mdu_disk_info_t info;
7327 		if (copy_from_user(&info, argp, sizeof(info)))
7328 			err = -EFAULT;
7329 		else
7330 			err = add_new_disk(mddev, &info);
7331 		goto unlock;
7332 	}
7333 
7334 	case CLUSTERED_DISK_NACK:
7335 		if (mddev_is_clustered(mddev))
7336 			md_cluster_ops->new_disk_ack(mddev, false);
7337 		else
7338 			err = -EINVAL;
7339 		goto unlock;
7340 
7341 	case HOT_ADD_DISK:
7342 		err = hot_add_disk(mddev, new_decode_dev(arg));
7343 		goto unlock;
7344 
7345 	case RUN_ARRAY:
7346 		err = do_md_run(mddev);
7347 		goto unlock;
7348 
7349 	case SET_BITMAP_FILE:
7350 		err = set_bitmap_file(mddev, (int)arg);
7351 		goto unlock;
7352 
7353 	default:
7354 		err = -EINVAL;
7355 		goto unlock;
7356 	}
7357 
7358 unlock:
7359 	if (mddev->hold_active == UNTIL_IOCTL &&
7360 	    err != -EINVAL)
7361 		mddev->hold_active = 0;
7362 	mddev_unlock(mddev);
7363 out:
7364 	if(did_set_md_closing)
7365 		clear_bit(MD_CLOSING, &mddev->flags);
7366 	return err;
7367 }
7368 #ifdef CONFIG_COMPAT
7369 static int md_compat_ioctl(struct block_device *bdev, fmode_t mode,
7370 		    unsigned int cmd, unsigned long arg)
7371 {
7372 	switch (cmd) {
7373 	case HOT_REMOVE_DISK:
7374 	case HOT_ADD_DISK:
7375 	case SET_DISK_FAULTY:
7376 	case SET_BITMAP_FILE:
7377 		/* These take in integer arg, do not convert */
7378 		break;
7379 	default:
7380 		arg = (unsigned long)compat_ptr(arg);
7381 		break;
7382 	}
7383 
7384 	return md_ioctl(bdev, mode, cmd, arg);
7385 }
7386 #endif /* CONFIG_COMPAT */
7387 
7388 static int md_open(struct block_device *bdev, fmode_t mode)
7389 {
7390 	/*
7391 	 * Succeed if we can lock the mddev, which confirms that
7392 	 * it isn't being stopped right now.
7393 	 */
7394 	struct mddev *mddev = mddev_find(bdev->bd_dev);
7395 	int err;
7396 
7397 	if (!mddev)
7398 		return -ENODEV;
7399 
7400 	if (mddev->gendisk != bdev->bd_disk) {
7401 		/* we are racing with mddev_put which is discarding this
7402 		 * bd_disk.
7403 		 */
7404 		mddev_put(mddev);
7405 		/* Wait until bdev->bd_disk is definitely gone */
7406 		flush_workqueue(md_misc_wq);
7407 		/* Then retry the open from the top */
7408 		return -ERESTARTSYS;
7409 	}
7410 	BUG_ON(mddev != bdev->bd_disk->private_data);
7411 
7412 	if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
7413 		goto out;
7414 
7415 	if (test_bit(MD_CLOSING, &mddev->flags)) {
7416 		mutex_unlock(&mddev->open_mutex);
7417 		err = -ENODEV;
7418 		goto out;
7419 	}
7420 
7421 	err = 0;
7422 	atomic_inc(&mddev->openers);
7423 	mutex_unlock(&mddev->open_mutex);
7424 
7425 	check_disk_change(bdev);
7426  out:
7427 	if (err)
7428 		mddev_put(mddev);
7429 	return err;
7430 }
7431 
7432 static void md_release(struct gendisk *disk, fmode_t mode)
7433 {
7434 	struct mddev *mddev = disk->private_data;
7435 
7436 	BUG_ON(!mddev);
7437 	atomic_dec(&mddev->openers);
7438 	mddev_put(mddev);
7439 }
7440 
7441 static int md_media_changed(struct gendisk *disk)
7442 {
7443 	struct mddev *mddev = disk->private_data;
7444 
7445 	return mddev->changed;
7446 }
7447 
7448 static int md_revalidate(struct gendisk *disk)
7449 {
7450 	struct mddev *mddev = disk->private_data;
7451 
7452 	mddev->changed = 0;
7453 	return 0;
7454 }
7455 static const struct block_device_operations md_fops =
7456 {
7457 	.owner		= THIS_MODULE,
7458 	.open		= md_open,
7459 	.release	= md_release,
7460 	.ioctl		= md_ioctl,
7461 #ifdef CONFIG_COMPAT
7462 	.compat_ioctl	= md_compat_ioctl,
7463 #endif
7464 	.getgeo		= md_getgeo,
7465 	.media_changed  = md_media_changed,
7466 	.revalidate_disk= md_revalidate,
7467 };
7468 
7469 static int md_thread(void *arg)
7470 {
7471 	struct md_thread *thread = arg;
7472 
7473 	/*
7474 	 * md_thread is a 'system-thread', it's priority should be very
7475 	 * high. We avoid resource deadlocks individually in each
7476 	 * raid personality. (RAID5 does preallocation) We also use RR and
7477 	 * the very same RT priority as kswapd, thus we will never get
7478 	 * into a priority inversion deadlock.
7479 	 *
7480 	 * we definitely have to have equal or higher priority than
7481 	 * bdflush, otherwise bdflush will deadlock if there are too
7482 	 * many dirty RAID5 blocks.
7483 	 */
7484 
7485 	allow_signal(SIGKILL);
7486 	while (!kthread_should_stop()) {
7487 
7488 		/* We need to wait INTERRUPTIBLE so that
7489 		 * we don't add to the load-average.
7490 		 * That means we need to be sure no signals are
7491 		 * pending
7492 		 */
7493 		if (signal_pending(current))
7494 			flush_signals(current);
7495 
7496 		wait_event_interruptible_timeout
7497 			(thread->wqueue,
7498 			 test_bit(THREAD_WAKEUP, &thread->flags)
7499 			 || kthread_should_stop() || kthread_should_park(),
7500 			 thread->timeout);
7501 
7502 		clear_bit(THREAD_WAKEUP, &thread->flags);
7503 		if (kthread_should_park())
7504 			kthread_parkme();
7505 		if (!kthread_should_stop())
7506 			thread->run(thread);
7507 	}
7508 
7509 	return 0;
7510 }
7511 
7512 void md_wakeup_thread(struct md_thread *thread)
7513 {
7514 	if (thread) {
7515 		pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
7516 		set_bit(THREAD_WAKEUP, &thread->flags);
7517 		wake_up(&thread->wqueue);
7518 	}
7519 }
7520 EXPORT_SYMBOL(md_wakeup_thread);
7521 
7522 struct md_thread *md_register_thread(void (*run) (struct md_thread *),
7523 		struct mddev *mddev, const char *name)
7524 {
7525 	struct md_thread *thread;
7526 
7527 	thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL);
7528 	if (!thread)
7529 		return NULL;
7530 
7531 	init_waitqueue_head(&thread->wqueue);
7532 
7533 	thread->run = run;
7534 	thread->mddev = mddev;
7535 	thread->timeout = MAX_SCHEDULE_TIMEOUT;
7536 	thread->tsk = kthread_run(md_thread, thread,
7537 				  "%s_%s",
7538 				  mdname(thread->mddev),
7539 				  name);
7540 	if (IS_ERR(thread->tsk)) {
7541 		kfree(thread);
7542 		return NULL;
7543 	}
7544 	return thread;
7545 }
7546 EXPORT_SYMBOL(md_register_thread);
7547 
7548 void md_unregister_thread(struct md_thread **threadp)
7549 {
7550 	struct md_thread *thread = *threadp;
7551 	if (!thread)
7552 		return;
7553 	pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
7554 	/* Locking ensures that mddev_unlock does not wake_up a
7555 	 * non-existent thread
7556 	 */
7557 	spin_lock(&pers_lock);
7558 	*threadp = NULL;
7559 	spin_unlock(&pers_lock);
7560 
7561 	kthread_stop(thread->tsk);
7562 	kfree(thread);
7563 }
7564 EXPORT_SYMBOL(md_unregister_thread);
7565 
7566 void md_error(struct mddev *mddev, struct md_rdev *rdev)
7567 {
7568 	if (!rdev || test_bit(Faulty, &rdev->flags))
7569 		return;
7570 
7571 	if (!mddev->pers || !mddev->pers->error_handler)
7572 		return;
7573 	mddev->pers->error_handler(mddev,rdev);
7574 	if (mddev->degraded)
7575 		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
7576 	sysfs_notify_dirent_safe(rdev->sysfs_state);
7577 	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7578 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7579 	md_wakeup_thread(mddev->thread);
7580 	if (mddev->event_work.func)
7581 		queue_work(md_misc_wq, &mddev->event_work);
7582 	md_new_event(mddev);
7583 }
7584 EXPORT_SYMBOL(md_error);
7585 
/* seq_file implementation of /proc/mdstat */
7587 
7588 static void status_unused(struct seq_file *seq)
7589 {
7590 	int i = 0;
7591 	struct md_rdev *rdev;
7592 
7593 	seq_printf(seq, "unused devices: ");
7594 
7595 	list_for_each_entry(rdev, &pending_raid_disks, same_set) {
7596 		char b[BDEVNAME_SIZE];
7597 		i++;
7598 		seq_printf(seq, "%s ",
7599 			      bdevname(rdev->bdev,b));
7600 	}
7601 	if (!i)
7602 		seq_printf(seq, "<none>");
7603 
7604 	seq_printf(seq, "\n");
7605 }
7606 
7607 static int status_resync(struct seq_file *seq, struct mddev *mddev)
7608 {
7609 	sector_t max_sectors, resync, res;
7610 	unsigned long dt, db = 0;
7611 	sector_t rt, curr_mark_cnt, resync_mark_cnt;
7612 	int scale, recovery_active;
7613 	unsigned int per_milli;
7614 
7615 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
7616 	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
7617 		max_sectors = mddev->resync_max_sectors;
7618 	else
7619 		max_sectors = mddev->dev_sectors;
7620 
7621 	resync = mddev->curr_resync;
7622 	if (resync <= 3) {
7623 		if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
7624 			/* Still cleaning up */
7625 			resync = max_sectors;
7626 	} else if (resync > max_sectors)
7627 		resync = max_sectors;
7628 	else
7629 		resync -= atomic_read(&mddev->recovery_active);
7630 
7631 	if (resync == 0) {
7632 		if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) {
7633 			struct md_rdev *rdev;
7634 
7635 			rdev_for_each(rdev, mddev)
7636 				if (rdev->raid_disk >= 0 &&
7637 				    !test_bit(Faulty, &rdev->flags) &&
7638 				    rdev->recovery_offset != MaxSector &&
7639 				    rdev->recovery_offset) {
7640 					seq_printf(seq, "\trecover=REMOTE");
7641 					return 1;
7642 				}
7643 			if (mddev->reshape_position != MaxSector)
7644 				seq_printf(seq, "\treshape=REMOTE");
7645 			else
7646 				seq_printf(seq, "\tresync=REMOTE");
7647 			return 1;
7648 		}
7649 		if (mddev->recovery_cp < MaxSector) {
7650 			seq_printf(seq, "\tresync=PENDING");
7651 			return 1;
7652 		}
7653 		return 0;
7654 	}
7655 	if (resync < 3) {
7656 		seq_printf(seq, "\tresync=DELAYED");
7657 		return 1;
7658 	}
7659 
7660 	WARN_ON(max_sectors == 0);
7661 	/* Pick 'scale' such that (resync>>scale)*1000 will fit
7662 	 * in a sector_t, and (max_sectors>>scale) will fit in a
7663 	 * u32, as those are the requirements for sector_div.
7664 	 * Thus 'scale' must be at least 10
7665 	 */
7666 	scale = 10;
7667 	if (sizeof(sector_t) > sizeof(unsigned long)) {
7668 		while ( max_sectors/2 > (1ULL<<(scale+32)))
7669 			scale++;
7670 	}
7671 	res = (resync>>scale)*1000;
7672 	sector_div(res, (u32)((max_sectors>>scale)+1));
7673 
7674 	per_milli = res;
7675 	{
7676 		int i, x = per_milli/50, y = 20-x;
7677 		seq_printf(seq, "[");
7678 		for (i = 0; i < x; i++)
7679 			seq_printf(seq, "=");
7680 		seq_printf(seq, ">");
7681 		for (i = 0; i < y; i++)
7682 			seq_printf(seq, ".");
7683 		seq_printf(seq, "] ");
7684 	}
7685 	seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
7686 		   (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
7687 		    "reshape" :
7688 		    (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
7689 		     "check" :
7690 		     (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
7691 		      "resync" : "recovery"))),
7692 		   per_milli/10, per_milli % 10,
7693 		   (unsigned long long) resync/2,
7694 		   (unsigned long long) max_sectors/2);
7695 
7696 	/*
7697 	 * dt: time from mark until now
7698 	 * db: blocks written from mark until now
7699 	 * rt: remaining time
7700 	 *
7701 	 * rt is a sector_t, which is always 64bit now. We are keeping
7702 	 * the original algorithm, but it is not really necessary.
7703 	 *
7704 	 * Original algorithm:
7705 	 *   So we divide before multiply in case it is 32bit and close
7706 	 *   to the limit.
7707 	 *   We scale the divisor (db) by 32 to avoid losing precision
7708 	 *   near the end of resync when the number of remaining sectors
7709 	 *   is close to 'db'.
7710 	 *   We then divide rt by 32 after multiplying by db to compensate.
7711 	 *   The '+1' avoids division by zero if db is very small.
7712 	 */
7713 	dt = ((jiffies - mddev->resync_mark) / HZ);
7714 	if (!dt) dt++;
7715 
7716 	curr_mark_cnt = mddev->curr_mark_cnt;
7717 	recovery_active = atomic_read(&mddev->recovery_active);
7718 	resync_mark_cnt = mddev->resync_mark_cnt;
7719 
7720 	if (curr_mark_cnt >= (recovery_active + resync_mark_cnt))
7721 		db = curr_mark_cnt - (recovery_active + resync_mark_cnt);
7722 
7723 	rt = max_sectors - resync;    /* number of remaining sectors */
7724 	rt = div64_u64(rt, db/32+1);
7725 	rt *= dt;
7726 	rt >>= 5;
7727 
7728 	seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
7729 		   ((unsigned long)rt % 60)/6);
7730 
7731 	seq_printf(seq, " speed=%ldK/sec", db/2/dt);
7732 	return 1;
7733 }
7734 
7735 static void *md_seq_start(struct seq_file *seq, loff_t *pos)
7736 {
7737 	struct list_head *tmp;
7738 	loff_t l = *pos;
7739 	struct mddev *mddev;
7740 
7741 	if (l >= 0x10000)
7742 		return NULL;
7743 	if (!l--)
7744 		/* header */
7745 		return (void*)1;
7746 
7747 	spin_lock(&all_mddevs_lock);
7748 	list_for_each(tmp,&all_mddevs)
7749 		if (!l--) {
7750 			mddev = list_entry(tmp, struct mddev, all_mddevs);
7751 			mddev_get(mddev);
7752 			spin_unlock(&all_mddevs_lock);
7753 			return mddev;
7754 		}
7755 	spin_unlock(&all_mddevs_lock);
7756 	if (!l--)
7757 		return (void*)2;/* tail */
7758 	return NULL;
7759 }
7760 
7761 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
7762 {
7763 	struct list_head *tmp;
7764 	struct mddev *next_mddev, *mddev = v;
7765 
7766 	++*pos;
7767 	if (v == (void*)2)
7768 		return NULL;
7769 
7770 	spin_lock(&all_mddevs_lock);
7771 	if (v == (void*)1)
7772 		tmp = all_mddevs.next;
7773 	else
7774 		tmp = mddev->all_mddevs.next;
7775 	if (tmp != &all_mddevs)
7776 		next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs));
7777 	else {
7778 		next_mddev = (void*)2;
7779 		*pos = 0x10000;
7780 	}
7781 	spin_unlock(&all_mddevs_lock);
7782 
7783 	if (v != (void*)1)
7784 		mddev_put(mddev);
7785 	return next_mddev;
7786 
7787 }
7788 
/*
 * seq_file ->stop for /proc/mdstat: drop the reference held on the
 * current mddev.  NULL and the header/tail markers carry no reference.
 */
static void md_seq_stop(struct seq_file *seq, void *v)
{
	struct mddev *mddev = v;

	if (!mddev || v == (void*)1 || v == (void*)2)
		return;

	mddev_put(mddev);
}
7796 
7797 static int md_seq_show(struct seq_file *seq, void *v)
7798 {
7799 	struct mddev *mddev = v;
7800 	sector_t sectors;
7801 	struct md_rdev *rdev;
7802 
7803 	if (v == (void*)1) {
7804 		struct md_personality *pers;
7805 		seq_printf(seq, "Personalities : ");
7806 		spin_lock(&pers_lock);
7807 		list_for_each_entry(pers, &pers_list, list)
7808 			seq_printf(seq, "[%s] ", pers->name);
7809 
7810 		spin_unlock(&pers_lock);
7811 		seq_printf(seq, "\n");
7812 		seq->poll_event = atomic_read(&md_event_count);
7813 		return 0;
7814 	}
7815 	if (v == (void*)2) {
7816 		status_unused(seq);
7817 		return 0;
7818 	}
7819 
7820 	spin_lock(&mddev->lock);
7821 	if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
7822 		seq_printf(seq, "%s : %sactive", mdname(mddev),
7823 						mddev->pers ? "" : "in");
7824 		if (mddev->pers) {
7825 			if (mddev->ro==1)
7826 				seq_printf(seq, " (read-only)");
7827 			if (mddev->ro==2)
7828 				seq_printf(seq, " (auto-read-only)");
7829 			seq_printf(seq, " %s", mddev->pers->name);
7830 		}
7831 
7832 		sectors = 0;
7833 		rcu_read_lock();
7834 		rdev_for_each_rcu(rdev, mddev) {
7835 			char b[BDEVNAME_SIZE];
7836 			seq_printf(seq, " %s[%d]",
7837 				bdevname(rdev->bdev,b), rdev->desc_nr);
7838 			if (test_bit(WriteMostly, &rdev->flags))
7839 				seq_printf(seq, "(W)");
7840 			if (test_bit(Journal, &rdev->flags))
7841 				seq_printf(seq, "(J)");
7842 			if (test_bit(Faulty, &rdev->flags)) {
7843 				seq_printf(seq, "(F)");
7844 				continue;
7845 			}
7846 			if (rdev->raid_disk < 0)
7847 				seq_printf(seq, "(S)"); /* spare */
7848 			if (test_bit(Replacement, &rdev->flags))
7849 				seq_printf(seq, "(R)");
7850 			sectors += rdev->sectors;
7851 		}
7852 		rcu_read_unlock();
7853 
7854 		if (!list_empty(&mddev->disks)) {
7855 			if (mddev->pers)
7856 				seq_printf(seq, "\n      %llu blocks",
7857 					   (unsigned long long)
7858 					   mddev->array_sectors / 2);
7859 			else
7860 				seq_printf(seq, "\n      %llu blocks",
7861 					   (unsigned long long)sectors / 2);
7862 		}
7863 		if (mddev->persistent) {
7864 			if (mddev->major_version != 0 ||
7865 			    mddev->minor_version != 90) {
7866 				seq_printf(seq," super %d.%d",
7867 					   mddev->major_version,
7868 					   mddev->minor_version);
7869 			}
7870 		} else if (mddev->external)
7871 			seq_printf(seq, " super external:%s",
7872 				   mddev->metadata_type);
7873 		else
7874 			seq_printf(seq, " super non-persistent");
7875 
7876 		if (mddev->pers) {
7877 			mddev->pers->status(seq, mddev);
7878 			seq_printf(seq, "\n      ");
7879 			if (mddev->pers->sync_request) {
7880 				if (status_resync(seq, mddev))
7881 					seq_printf(seq, "\n      ");
7882 			}
7883 		} else
7884 			seq_printf(seq, "\n       ");
7885 
7886 		md_bitmap_status(seq, mddev->bitmap);
7887 
7888 		seq_printf(seq, "\n");
7889 	}
7890 	spin_unlock(&mddev->lock);
7891 
7892 	return 0;
7893 }
7894 
7895 static const struct seq_operations md_seq_ops = {
7896 	.start  = md_seq_start,
7897 	.next   = md_seq_next,
7898 	.stop   = md_seq_stop,
7899 	.show   = md_seq_show,
7900 };
7901 
7902 static int md_seq_open(struct inode *inode, struct file *file)
7903 {
7904 	struct seq_file *seq;
7905 	int error;
7906 
7907 	error = seq_open(file, &md_seq_ops);
7908 	if (error)
7909 		return error;
7910 
7911 	seq = file->private_data;
7912 	seq->poll_event = atomic_read(&md_event_count);
7913 	return error;
7914 }
7915 
7916 static int md_unloading;
7917 static __poll_t mdstat_poll(struct file *filp, poll_table *wait)
7918 {
7919 	struct seq_file *seq = filp->private_data;
7920 	__poll_t mask;
7921 
7922 	if (md_unloading)
7923 		return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
7924 	poll_wait(filp, &md_event_waiters, wait);
7925 
7926 	/* always allow read */
7927 	mask = EPOLLIN | EPOLLRDNORM;
7928 
7929 	if (seq->poll_event != atomic_read(&md_event_count))
7930 		mask |= EPOLLERR | EPOLLPRI;
7931 	return mask;
7932 }
7933 
7934 static const struct file_operations md_seq_fops = {
7935 	.owner		= THIS_MODULE,
7936 	.open           = md_seq_open,
7937 	.read           = seq_read,
7938 	.llseek         = seq_lseek,
7939 	.release	= seq_release,
7940 	.poll		= mdstat_poll,
7941 };
7942 
7943 int register_md_personality(struct md_personality *p)
7944 {
7945 	pr_debug("md: %s personality registered for level %d\n",
7946 		 p->name, p->level);
7947 	spin_lock(&pers_lock);
7948 	list_add_tail(&p->list, &pers_list);
7949 	spin_unlock(&pers_lock);
7950 	return 0;
7951 }
7952 EXPORT_SYMBOL(register_md_personality);
7953 
7954 int unregister_md_personality(struct md_personality *p)
7955 {
7956 	pr_debug("md: %s personality unregistered\n", p->name);
7957 	spin_lock(&pers_lock);
7958 	list_del_init(&p->list);
7959 	spin_unlock(&pers_lock);
7960 	return 0;
7961 }
7962 EXPORT_SYMBOL(unregister_md_personality);
7963 
7964 int register_md_cluster_operations(struct md_cluster_operations *ops,
7965 				   struct module *module)
7966 {
7967 	int ret = 0;
7968 	spin_lock(&pers_lock);
7969 	if (md_cluster_ops != NULL)
7970 		ret = -EALREADY;
7971 	else {
7972 		md_cluster_ops = ops;
7973 		md_cluster_mod = module;
7974 	}
7975 	spin_unlock(&pers_lock);
7976 	return ret;
7977 }
7978 EXPORT_SYMBOL(register_md_cluster_operations);
7979 
7980 int unregister_md_cluster_operations(void)
7981 {
7982 	spin_lock(&pers_lock);
7983 	md_cluster_ops = NULL;
7984 	spin_unlock(&pers_lock);
7985 	return 0;
7986 }
7987 EXPORT_SYMBOL(unregister_md_cluster_operations);
7988 
7989 int md_setup_cluster(struct mddev *mddev, int nodes)
7990 {
7991 	if (!md_cluster_ops)
7992 		request_module("md-cluster");
7993 	spin_lock(&pers_lock);
7994 	/* ensure module won't be unloaded */
7995 	if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
7996 		pr_warn("can't find md-cluster module or get it's reference.\n");
7997 		spin_unlock(&pers_lock);
7998 		return -ENOENT;
7999 	}
8000 	spin_unlock(&pers_lock);
8001 
8002 	return md_cluster_ops->join(mddev, nodes);
8003 }
8004 
8005 void md_cluster_stop(struct mddev *mddev)
8006 {
8007 	if (!md_cluster_ops)
8008 		return;
8009 	md_cluster_ops->leave(mddev);
8010 	module_put(md_cluster_mod);
8011 }
8012 
8013 static int is_mddev_idle(struct mddev *mddev, int init)
8014 {
8015 	struct md_rdev *rdev;
8016 	int idle;
8017 	int curr_events;
8018 
8019 	idle = 1;
8020 	rcu_read_lock();
8021 	rdev_for_each_rcu(rdev, mddev) {
8022 		struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
8023 		curr_events = (int)part_stat_read_accum(&disk->part0, sectors) -
8024 			      atomic_read(&disk->sync_io);
8025 		/* sync IO will cause sync_io to increase before the disk_stats
8026 		 * as sync_io is counted when a request starts, and
8027 		 * disk_stats is counted when it completes.
8028 		 * So resync activity will cause curr_events to be smaller than
8029 		 * when there was no such activity.
8030 		 * non-sync IO will cause disk_stat to increase without
8031 		 * increasing sync_io so curr_events will (eventually)
8032 		 * be larger than it was before.  Once it becomes
8033 		 * substantially larger, the test below will cause
8034 		 * the array to appear non-idle, and resync will slow
8035 		 * down.
8036 		 * If there is a lot of outstanding resync activity when
8037 		 * we set last_event to curr_events, then all that activity
8038 		 * completing might cause the array to appear non-idle
8039 		 * and resync will be slowed down even though there might
8040 		 * not have been non-resync activity.  This will only
8041 		 * happen once though.  'last_events' will soon reflect
8042 		 * the state where there is little or no outstanding
8043 		 * resync requests, and further resync activity will
8044 		 * always make curr_events less than last_events.
8045 		 *
8046 		 */
8047 		if (init || curr_events - rdev->last_events > 64) {
8048 			rdev->last_events = curr_events;
8049 			idle = 0;
8050 		}
8051 	}
8052 	rcu_read_unlock();
8053 	return idle;
8054 }
8055 
8056 void md_done_sync(struct mddev *mddev, int blocks, int ok)
8057 {
8058 	/* another "blocks" (512byte) blocks have been synced */
8059 	atomic_sub(blocks, &mddev->recovery_active);
8060 	wake_up(&mddev->recovery_wait);
8061 	if (!ok) {
8062 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8063 		set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
8064 		md_wakeup_thread(mddev->thread);
8065 		// stop recovery, signal do_sync ....
8066 	}
8067 }
8068 EXPORT_SYMBOL(md_done_sync);
8069 
/* md_write_start(mddev, bi)
 * If we need to update some array metadata (e.g. 'active' flag
 * in superblock) before writing, schedule a superblock update
 * and wait for it to complete.
 * Non-write bios return true immediately without any accounting.
 * On a 'true' return for a write, a reference on mddev->writes_pending
 * is held.
 * A return value of 'false' means that the write wasn't recorded
 * and cannot proceed as the array is being suspended; in that case the
 * writes_pending reference has already been dropped again.
 */
bool md_write_start(struct mddev *mddev, struct bio *bi)
{
	/* set when ro or in_sync changed, so sysfs is notified below */
	int did_change = 0;

	if (bio_data_dir(bi) != WRITE)
		return true;

	/* writes must never reach an array with ro == 1 */
	BUG_ON(mddev->ro == 1);
	if (mddev->ro == 2) {
		/* need to switch to read/write */
		mddev->ro = 0;
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
		md_wakeup_thread(mddev->sync_thread);
		did_change = 1;
	}
	rcu_read_lock();
	percpu_ref_get(&mddev->writes_pending);
	smp_mb(); /* Match smp_mb in set_in_sync() */
	if (mddev->safemode == 1)
		mddev->safemode = 0;
	/* sync_checkers is always 0 when writes_pending is in per-cpu mode */
	if (mddev->in_sync || mddev->sync_checkers) {
		spin_lock(&mddev->lock);
		/* re-check under the lock before changing state */
		if (mddev->in_sync) {
			mddev->in_sync = 0;
			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
			set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
			md_wakeup_thread(mddev->thread);
			did_change = 1;
		}
		spin_unlock(&mddev->lock);
	}
	rcu_read_unlock();
	if (did_change)
		sysfs_notify_dirent_safe(mddev->sysfs_state);
	/* nothing to wait for when the array has no superblocks */
	if (!mddev->has_superblocks)
		return true;
	/* wait until the pending superblock change is gone, or a suspend */
	wait_event(mddev->sb_wait,
		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
		   mddev->suspended);
	if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
		/* woken by a suspend: undo the accounting and refuse */
		percpu_ref_put(&mddev->writes_pending);
		return false;
	}
	return true;
}
8124 EXPORT_SYMBOL(md_write_start);
8125 
8126 /* md_write_inc can only be called when md_write_start() has
8127  * already been called at least once of the current request.
8128  * It increments the counter and is useful when a single request
8129  * is split into several parts.  Each part causes an increment and
8130  * so needs a matching md_write_end().
8131  * Unlike md_write_start(), it is safe to call md_write_inc() inside
8132  * a spinlocked region.
8133  */
8134 void md_write_inc(struct mddev *mddev, struct bio *bi)
8135 {
8136 	if (bio_data_dir(bi) != WRITE)
8137 		return;
8138 	WARN_ON_ONCE(mddev->in_sync || mddev->ro);
8139 	percpu_ref_get(&mddev->writes_pending);
8140 }
8141 EXPORT_SYMBOL(md_write_inc);
8142 
8143 void md_write_end(struct mddev *mddev)
8144 {
8145 	percpu_ref_put(&mddev->writes_pending);
8146 
8147 	if (mddev->safemode == 2)
8148 		md_wakeup_thread(mddev->thread);
8149 	else if (mddev->safemode_delay)
8150 		/* The roundup() ensures this only performs locking once
8151 		 * every ->safemode_delay jiffies
8152 		 */
8153 		mod_timer(&mddev->safemode_timer,
8154 			  roundup(jiffies, mddev->safemode_delay) +
8155 			  mddev->safemode_delay);
8156 }
8157 
8158 EXPORT_SYMBOL(md_write_end);
8159 
8160 /* md_allow_write(mddev)
8161  * Calling this ensures that the array is marked 'active' so that writes
8162  * may proceed without blocking.  It is important to call this before
8163  * attempting a GFP_KERNEL allocation while holding the mddev lock.
8164  * Must be called with mddev_lock held.
8165  */
8166 void md_allow_write(struct mddev *mddev)
8167 {
8168 	if (!mddev->pers)
8169 		return;
8170 	if (mddev->ro)
8171 		return;
8172 	if (!mddev->pers->sync_request)
8173 		return;
8174 
8175 	spin_lock(&mddev->lock);
8176 	if (mddev->in_sync) {
8177 		mddev->in_sync = 0;
8178 		set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8179 		set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8180 		if (mddev->safemode_delay &&
8181 		    mddev->safemode == 0)
8182 			mddev->safemode = 1;
8183 		spin_unlock(&mddev->lock);
8184 		md_update_sb(mddev, 0);
8185 		sysfs_notify_dirent_safe(mddev->sysfs_state);
8186 		/* wait for the dirty state to be recorded in the metadata */
8187 		wait_event(mddev->sb_wait,
8188 			   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
8189 	} else
8190 		spin_unlock(&mddev->lock);
8191 }
8192 EXPORT_SYMBOL_GPL(md_allow_write);
8193 
/* Number of (timestamp, sector count) samples kept in the mark[] ring below */
#define SYNC_MARKS	10
/* Minimum age (in jiffies, i.e. 3 seconds) of the newest mark before stepping */
#define	SYNC_MARK_STEP	(3*HZ)
/* Upper bound (5 minutes) between updates of curr_resync_completed */
#define UPDATE_FREQUENCY (5*60*HZ)
/*
 * md_do_sync() - thread function that carries out one resync, check,
 * repair, reshape or recovery pass over thread->mddev.
 *
 * The kind of pass is selected by the MD_RECOVERY_* bits in mddev->recovery.
 * The function first waits for any conflicting array (one sharing physical
 * units) to finish, chooses a starting sector 'j', and then repeatedly calls
 * mddev->pers->sync_request() until 'j' reaches max_sectors or
 * MD_RECOVERY_INTR is set, throttling itself against speed_min()/speed_max().
 * On the way out it records checkpoints (recovery_cp / recovery_offset),
 * sets MD_RECOVERY_DONE, resets curr_resync to 0 and wakes both resync_wait
 * and mddev->thread.
 */
void md_do_sync(struct md_thread *thread)
{
	struct mddev *mddev = thread->mddev;
	struct mddev *mddev2;
	unsigned int currspeed = 0,
		 window;
	sector_t max_sectors,j, io_sectors, recovery_done;
	unsigned long mark[SYNC_MARKS];
	unsigned long update_time;
	sector_t mark_cnt[SYNC_MARKS];
	int last_mark,m;
	struct list_head *tmp;
	sector_t last_check;
	int skipped = 0;
	struct md_rdev *rdev;
	char *desc, *action = NULL;
	struct blk_plug plug;
	int ret;

	/* just in case thread restarts... */
	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
	    test_bit(MD_RECOVERY_WAIT, &mddev->recovery))
		return;
	if (mddev->ro) {/* never try to sync a read-only array */
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
		return;
	}

	if (mddev_is_clustered(mddev)) {
		ret = md_cluster_ops->resync_start(mddev);
		if (ret)
			goto skip;

		set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags);
		/*
		 * None of SYNC/RESHAPE/RECOVER is requested and the previous
		 * pass has not covered resync_max_sectors: nothing to run here.
		 */
		if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
			test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ||
			test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
		     && ((unsigned long long)mddev->curr_resync_completed
			 < (unsigned long long)mddev->resync_max_sectors))
			goto skip;
	}

	/*
	 * 'desc' is the name used in log messages; 'action', when set,
	 * overrides it as the value stored in last_sync_action.
	 */
	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
			desc = "data-check";
			action = "check";
		} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
			desc = "requested-resync";
			action = "repair";
		} else
			desc = "resync";
	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
		desc = "reshape";
	else
		desc = "recovery";

	mddev->last_sync_action = action ?: desc;

	/* we overload curr_resync somewhat here.
	 * 0 == not engaged in resync at all
	 * 2 == checking that there is no conflict with another sync
	 * 1 == like 2, but have yielded to allow conflicting resync to
	 *		commence
	 * other == active in resync - this many blocks
	 *
	 * Before starting a resync we must have set curr_resync to
	 * 2, and then checked that every "conflicting" array has curr_resync
	 * less than ours.  When we find one that is the same or higher
	 * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
	 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
	 * This will mean we have to start checking from the beginning again.
	 *
	 */

	do {
		/* minor of the last array we logged a "delaying" message for */
		int mddev2_minor = -1;
		mddev->curr_resync = 2;

	try_again:
		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			goto skip;
		for_each_mddev(mddev2, tmp) {
			if (mddev2 == mddev)
				continue;
			if (!mddev->parallel_resync
			&&  mddev2->curr_resync
			&&  match_mddev_units(mddev, mddev2)) {
				DEFINE_WAIT(wq);
				if (mddev < mddev2 && mddev->curr_resync == 2) {
					/* arbitrarily yield */
					mddev->curr_resync = 1;
					wake_up(&resync_wait);
				}
				if (mddev > mddev2 && mddev->curr_resync == 1)
					/* no need to wait here, we can wait the next
					 * time 'round when curr_resync == 2
					 */
					continue;
				/* We need to wait 'interruptible' so as not to
				 * contribute to the load average, and not to
				 * be caught by 'softlockup'
				 */
				prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
				if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
				    mddev2->curr_resync >= mddev->curr_resync) {
					if (mddev2_minor != mddev2->md_minor) {
						mddev2_minor = mddev2->md_minor;
						pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n",
							desc, mdname(mddev),
							mdname(mddev2));
					}
					/*
					 * We leave the for_each_mddev() walk early via
					 * goto, so drop mddev2 here ourselves.
					 * NOTE(review): presumably this balances a
					 * reference taken by the iterator - confirm.
					 */
					mddev_put(mddev2);
					if (signal_pending(current))
						flush_signals(current);
					schedule();
					finish_wait(&resync_wait, &wq);
					goto try_again;
				}
				finish_wait(&resync_wait, &wq);
			}
		}
	} while (mddev->curr_resync < 2);

	/* Choose the pass length (max_sectors) and the starting sector (j) */
	j = 0;
	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
		/* resync follows the size requested by the personality,
		 * which defaults to physical size, but can be virtual size
		 */
		max_sectors = mddev->resync_max_sectors;
		atomic64_set(&mddev->resync_mismatches, 0);
		/* we don't use the checkpoint if there's a bitmap */
		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
			j = mddev->resync_min;
		else if (!mddev->bitmap)
			j = mddev->recovery_cp;

	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
		max_sectors = mddev->resync_max_sectors;
		/*
		 * If the original node aborts reshaping then we continue the
		 * reshaping, so set j again to avoid restart reshape from the
		 * first beginning
		 */
		if (mddev_is_clustered(mddev) &&
		    mddev->reshape_position != MaxSector)
			j = mddev->reshape_position;
	} else {
		/* recovery follows the physical size of devices */
		max_sectors = mddev->dev_sectors;
		/* start from the lowest recovery_offset of any device being rebuilt */
		j = MaxSector;
		rcu_read_lock();
		rdev_for_each_rcu(rdev, mddev)
			if (rdev->raid_disk >= 0 &&
			    !test_bit(Journal, &rdev->flags) &&
			    !test_bit(Faulty, &rdev->flags) &&
			    !test_bit(In_sync, &rdev->flags) &&
			    rdev->recovery_offset < j)
				j = rdev->recovery_offset;
		rcu_read_unlock();

		/* If there is a bitmap, we need to make sure all
		 * writes that started before we added a spare
		 * complete before we start doing a recovery.
		 * Otherwise the write might complete and (via
		 * bitmap_endwrite) set a bit in the bitmap after the
		 * recovery has checked that bit and skipped that
		 * region.
		 */
		if (mddev->bitmap) {
			mddev->pers->quiesce(mddev, 1);
			mddev->pers->quiesce(mddev, 0);
		}
	}

	pr_info("md: %s of RAID array %s\n", desc, mdname(mddev));
	pr_debug("md: minimum _guaranteed_  speed: %d KB/sec/disk.\n", speed_min(mddev));
	pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n",
		 speed_max(mddev), desc);

	is_mddev_idle(mddev, 1); /* this initializes IO event counters */

	/* Seed every slot of the speed-measurement ring with "now, 0 sectors" */
	io_sectors = 0;
	for (m = 0; m < SYNC_MARKS; m++) {
		mark[m] = jiffies;
		mark_cnt[m] = io_sectors;
	}
	last_mark = 0;
	mddev->resync_mark = mark[last_mark];
	mddev->resync_mark_cnt = mark_cnt[last_mark];

	/*
	 * Tune reconstruction:
	 */
	/* window: sectors of submitted IO between two runs of the throttling code */
	window = 32*(PAGE_SIZE/512);
	pr_debug("md: using %dk window, over a total of %lluk.\n",
		 window/2, (unsigned long long)max_sectors/2);

	atomic_set(&mddev->recovery_active, 0);
	last_check = 0;

	/* values 0..2 of curr_resync are reserved (see above), so use at least 3 */
	if (j>2) {
		pr_debug("md: resuming %s of %s from checkpoint.\n",
			 desc, mdname(mddev));
		mddev->curr_resync = j;
	} else
		mddev->curr_resync = 3; /* no longer delayed */
	mddev->curr_resync_completed = j;
	sysfs_notify(&mddev->kobj, NULL, "sync_completed");
	md_new_event(mddev);
	update_time = jiffies;

	blk_start_plug(&plug);
	while (j < max_sectors) {
		sector_t sectors;

		skipped = 0;

		/*
		 * Outside of a reshape, refresh curr_resync_completed when
		 * progress since the last update exceeds 1/16 of the total,
		 * when UPDATE_FREQUENCY has elapsed, or when we are at least
		 * half way from the last update to resync_max (or past it).
		 */
		if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
		    ((mddev->curr_resync > mddev->curr_resync_completed &&
		      (mddev->curr_resync - mddev->curr_resync_completed)
		      > (max_sectors >> 4)) ||
		     time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
		     (j - mddev->curr_resync_completed)*2
		     >= mddev->resync_max - mddev->curr_resync_completed ||
		     mddev->curr_resync_completed > mddev->resync_max
			    )) {
			/* time to update curr_resync_completed */
			wait_event(mddev->recovery_wait,
				   atomic_read(&mddev->recovery_active) == 0);
			mddev->curr_resync_completed = j;
			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
			    j > mddev->recovery_cp)
				mddev->recovery_cp = j;
			update_time = jiffies;
			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
		}

		while (j >= mddev->resync_max &&
		       !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
			/* As this condition is controlled by user-space,
			 * we can block indefinitely, so use '_interruptible'
			 * to avoid triggering warnings.
			 */
			flush_signals(current); /* just in case */
			wait_event_interruptible(mddev->recovery_wait,
						 mddev->resync_max > j
						 || test_bit(MD_RECOVERY_INTR,
							     &mddev->recovery));
		}

		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			break;

		/* a return of 0 sectors is treated as failure and aborts the pass */
		sectors = mddev->pers->sync_request(mddev, j, &skipped);
		if (sectors == 0) {
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			break;
		}

		if (!skipped) { /* actual IO requested */
			io_sectors += sectors;
			atomic_add(sectors, &mddev->recovery_active);
		}

		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			break;

		j += sectors;
		if (j > max_sectors)
			/* when skipping, extra large numbers can be returned. */
			j = max_sectors;
		if (j > 2)
			mddev->curr_resync = j;
		mddev->curr_mark_cnt = io_sectors;
		if (last_check == 0)
			/* this is the earliest that rebuild will be
			 * visible in /proc/mdstat
			 */
			md_new_event(mddev);

		/* only run the throttling code once per 'window' sectors of real IO */
		if (last_check + window > io_sectors || j == max_sectors)
			continue;

		last_check = io_sectors;
	repeat:
		if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
			/* step marks */
			int next = (last_mark+1) % SYNC_MARKS;

			/* publish the oldest sample, then overwrite it with a new one */
			mddev->resync_mark = mark[next];
			mddev->resync_mark_cnt = mark_cnt[next];
			mark[next] = jiffies;
			mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
			last_mark = next;
		}

		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			break;

		/*
		 * this loop exits only if either when we are slower than
		 * the 'hard' speed limit, or the system was IO-idle for
		 * a jiffy.
		 * the system might be non-idle CPU-wise, but we only care
		 * about not overloading the IO subsystem. (things like an
		 * e2fsck being done on the RAID array should execute fast)
		 */
		cond_resched();

		/*
		 * currspeed: completed sectors since resync_mark, halved to
		 * give KB, divided by the elapsed whole seconds + 1; the
		 * trailing +1 keeps the result non-zero.
		 */
		recovery_done = io_sectors - atomic_read(&mddev->recovery_active);
		currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2
			/((jiffies-mddev->resync_mark)/HZ +1) +1;

		if (currspeed > speed_min(mddev)) {
			if (currspeed > speed_max(mddev)) {
				msleep(500);
				goto repeat;
			}
			if (!is_mddev_idle(mddev, 0)) {
				/*
				 * Give other IO more of a chance.
				 * The faster the devices, the less we wait.
				 */
				wait_event(mddev->recovery_wait,
					   !atomic_read(&mddev->recovery_active));
			}
		}
	}
	pr_info("md: %s: %s %s.\n",mdname(mddev), desc,
		test_bit(MD_RECOVERY_INTR, &mddev->recovery)
		? "interrupted" : "done");
	/*
	 * this also signals 'finished resyncing' to md_stop
	 */
	blk_finish_plug(&plug);
	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));

	if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
	    mddev->curr_resync > 3) {
		mddev->curr_resync_completed = mddev->curr_resync;
		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
	}
	/*
	 * Final call with sector_nr == max_sectors.
	 * NOTE(review): presumably lets the personality close off the pass;
	 * confirm against the ->sync_request implementations.
	 */
	mddev->pers->sync_request(mddev, max_sectors, &skipped);

	/* Record how far we got, unless this was a 'check' or never really started */
	if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
	    mddev->curr_resync > 3) {
		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
			if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
				if (mddev->curr_resync >= mddev->recovery_cp) {
					pr_debug("md: checkpointing %s of %s.\n",
						 desc, mdname(mddev));
					if (test_bit(MD_RECOVERY_ERROR,
						&mddev->recovery))
						mddev->recovery_cp =
							mddev->curr_resync_completed;
					else
						mddev->recovery_cp =
							mddev->curr_resync;
				}
			} else
				mddev->recovery_cp = MaxSector;
		} else {
			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
				mddev->curr_resync = MaxSector;
			if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
			    test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
				/* advance recovery_offset of every device still being rebuilt */
				rcu_read_lock();
				rdev_for_each_rcu(rdev, mddev)
					if (rdev->raid_disk >= 0 &&
					    mddev->delta_disks >= 0 &&
					    !test_bit(Journal, &rdev->flags) &&
					    !test_bit(Faulty, &rdev->flags) &&
					    !test_bit(In_sync, &rdev->flags) &&
					    rdev->recovery_offset < mddev->curr_resync)
						rdev->recovery_offset = mddev->curr_resync;
				rcu_read_unlock();
			}
		}
	}
 skip:
	/* set CHANGE_PENDING here since maybe another update is needed,
	 * so other nodes are informed. It should be harmless for normal
	 * raid */
	set_mask_bits(&mddev->sb_flags, 0,
		      BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));

	/* A completed reshape that added disks: recompute and publish the array size */
	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
			!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
			mddev->delta_disks > 0 &&
			mddev->pers->finish_reshape &&
			mddev->pers->size &&
			mddev->queue) {
		mddev_lock_nointr(mddev);
		md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
		mddev_unlock(mddev);
		if (!mddev_is_clustered(mddev)) {
			set_capacity(mddev->gendisk, mddev->array_sectors);
			revalidate_disk(mddev->gendisk);
		}
	}

	spin_lock(&mddev->lock);
	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
		/* We completed so min/max setting can be forgotten if used. */
		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
			mddev->resync_min = 0;
		mddev->resync_max = MaxSector;
	} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
		/* interrupted requested pass: remember where to resume from */
		mddev->resync_min = mddev->curr_resync_completed;
	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
	mddev->curr_resync = 0;
	spin_unlock(&mddev->lock);

	/* let arrays delayed behind us proceed, and kick the array's main thread */
	wake_up(&resync_wait);
	md_wakeup_thread(mddev->thread);
	return;
}
EXPORT_SYMBOL_GPL(md_do_sync);
8617 
8618 static int remove_and_add_spares(struct mddev *mddev,
8619 				 struct md_rdev *this)
8620 {
8621 	struct md_rdev *rdev;
8622 	int spares = 0;
8623 	int removed = 0;
8624 	bool remove_some = false;
8625 
8626 	if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
8627 		/* Mustn't remove devices when resync thread is running */
8628 		return 0;
8629 
8630 	rdev_for_each(rdev, mddev) {
8631 		if ((this == NULL || rdev == this) &&
8632 		    rdev->raid_disk >= 0 &&
8633 		    !test_bit(Blocked, &rdev->flags) &&
8634 		    test_bit(Faulty, &rdev->flags) &&
8635 		    atomic_read(&rdev->nr_pending)==0) {
8636 			/* Faulty non-Blocked devices with nr_pending == 0
8637 			 * never get nr_pending incremented,
8638 			 * never get Faulty cleared, and never get Blocked set.
8639 			 * So we can synchronize_rcu now rather than once per device
8640 			 */
8641 			remove_some = true;
8642 			set_bit(RemoveSynchronized, &rdev->flags);
8643 		}
8644 	}
8645 
8646 	if (remove_some)
8647 		synchronize_rcu();
8648 	rdev_for_each(rdev, mddev) {
8649 		if ((this == NULL || rdev == this) &&
8650 		    rdev->raid_disk >= 0 &&
8651 		    !test_bit(Blocked, &rdev->flags) &&
8652 		    ((test_bit(RemoveSynchronized, &rdev->flags) ||
8653 		     (!test_bit(In_sync, &rdev->flags) &&
8654 		      !test_bit(Journal, &rdev->flags))) &&
8655 		    atomic_read(&rdev->nr_pending)==0)) {
8656 			if (mddev->pers->hot_remove_disk(
8657 				    mddev, rdev) == 0) {
8658 				sysfs_unlink_rdev(mddev, rdev);
8659 				rdev->saved_raid_disk = rdev->raid_disk;
8660 				rdev->raid_disk = -1;
8661 				removed++;
8662 			}
8663 		}
8664 		if (remove_some && test_bit(RemoveSynchronized, &rdev->flags))
8665 			clear_bit(RemoveSynchronized, &rdev->flags);
8666 	}
8667 
8668 	if (removed && mddev->kobj.sd)
8669 		sysfs_notify(&mddev->kobj, NULL, "degraded");
8670 
8671 	if (this && removed)
8672 		goto no_add;
8673 
8674 	rdev_for_each(rdev, mddev) {
8675 		if (this && this != rdev)
8676 			continue;
8677 		if (test_bit(Candidate, &rdev->flags))
8678 			continue;
8679 		if (rdev->raid_disk >= 0 &&
8680 		    !test_bit(In_sync, &rdev->flags) &&
8681 		    !test_bit(Journal, &rdev->flags) &&
8682 		    !test_bit(Faulty, &rdev->flags))
8683 			spares++;
8684 		if (rdev->raid_disk >= 0)
8685 			continue;
8686 		if (test_bit(Faulty, &rdev->flags))
8687 			continue;
8688 		if (!test_bit(Journal, &rdev->flags)) {
8689 			if (mddev->ro &&
8690 			    ! (rdev->saved_raid_disk >= 0 &&
8691 			       !test_bit(Bitmap_sync, &rdev->flags)))
8692 				continue;
8693 
8694 			rdev->recovery_offset = 0;
8695 		}
8696 		if (mddev->pers->
8697 		    hot_add_disk(mddev, rdev) == 0) {
8698 			if (sysfs_link_rdev(mddev, rdev))
8699 				/* failure here is OK */;
8700 			if (!test_bit(Journal, &rdev->flags))
8701 				spares++;
8702 			md_new_event(mddev);
8703 			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8704 		}
8705 	}
8706 no_add:
8707 	if (removed)
8708 		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8709 	return spares;
8710 }
8711 
8712 static void md_start_sync(struct work_struct *ws)
8713 {
8714 	struct mddev *mddev = container_of(ws, struct mddev, del_work);
8715 
8716 	mddev->sync_thread = md_register_thread(md_do_sync,
8717 						mddev,
8718 						"resync");
8719 	if (!mddev->sync_thread) {
8720 		pr_warn("%s: could not start resync thread...\n",
8721 			mdname(mddev));
8722 		/* leave the spares where they are, it shouldn't hurt */
8723 		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8724 		clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
8725 		clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
8726 		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
8727 		clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8728 		wake_up(&resync_wait);
8729 		if (test_and_clear_bit(MD_RECOVERY_RECOVER,
8730 				       &mddev->recovery))
8731 			if (mddev->sysfs_action)
8732 				sysfs_notify_dirent_safe(mddev->sysfs_action);
8733 	} else
8734 		md_wakeup_thread(mddev->sync_thread);
8735 	sysfs_notify_dirent_safe(mddev->sysfs_action);
8736 	md_new_event(mddev);
8737 }
8738 
/*
 * This routine is regularly called by all per-raid-array threads to
 * deal with generic issues like resync and super-block update.
 * Raid personalities that don't have a thread (linear/raid0) do not
 * need this as they never do any recovery or update the superblock.
 *
 * It does not do any resync itself, but rather "forks" off other threads
 * to do that as needed.
 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
 * "->recovery" and create a thread at ->sync_thread.
 * When the thread finishes it sets MD_RECOVERY_DONE
 * and wakes up this thread which will reap the thread and finish up.
 * This thread also removes any faulty devices (with nr_pending == 0).
 *
 * The overall approach is:
 *  1/ if the superblock needs updating, update it.
 *  2/ If a recovery thread is running, don't do anything else.
 *  3/ If recovery has finished, clean up, possibly marking spares active.
 *  4/ If there are any faulty devices, remove them.
 *  5/ If array is degraded, try to add spares devices
 *  6/ If array has spares or is not in-sync, start a resync thread.
 *
 * All of the state-changing work is done under mddev_trylock(); if the
 * lock cannot be taken the function returns without doing it.
 */
void md_check_recovery(struct mddev *mddev)
{
	if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) {
		/* Write superblock - thread that called mddev_suspend()
		 * holds reconfig_mutex for us.
		 */
		set_bit(MD_UPDATING_SB, &mddev->flags);
		smp_mb__after_atomic();
		/* re-test after the barrier, now that MD_UPDATING_SB is visible */
		if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
			md_update_sb(mddev, 0);
		clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
		wake_up(&mddev->sb_wait);
	}

	if (mddev->suspended)
		return;

	if (mddev->bitmap)
		md_bitmap_daemon_work(mddev);

	/* a pending signal requests "immediate safe mode" (safemode = 2) */
	if (signal_pending(current)) {
		if (mddev->pers->sync_request && !mddev->external) {
			pr_debug("md: %s in immediate safe mode\n",
				 mdname(mddev));
			mddev->safemode = 2;
		}
		flush_signals(current);
	}

	if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
		return;
	/*
	 * Cheap unlocked test: bail out unless a superblock flag other than
	 * CHANGE_PENDING is set, recovery is NEEDED or DONE, or a safemode
	 * transition is due.
	 */
	if ( ! (
		(mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
		(mddev->external == 0 && mddev->safemode == 1) ||
		(mddev->safemode == 2
		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
		))
		return;

	if (mddev_trylock(mddev)) {
		/* number of spares reported by remove_and_add_spares(), if called */
		int spares = 0;

		if (!mddev->external && mddev->safemode == 1)
			mddev->safemode = 0;

		if (mddev->ro) {
			struct md_rdev *rdev;
			if (!mddev->external && mddev->in_sync)
				/* 'Blocked' flag not needed as failed devices
				 * will be recorded if array switched to read/write.
				 * Leaving it set will prevent the device
				 * from being removed.
				 */
				rdev_for_each(rdev, mddev)
					clear_bit(Blocked, &rdev->flags);
			/* On a read-only array we can:
			 * - remove failed devices
			 * - add already-in_sync devices if the array itself
			 *   is in-sync.
			 * As we only add devices that are already in-sync,
			 * we can activate the spares immediately.
			 */
			remove_and_add_spares(mddev, NULL);
			/* There is no thread, but we need to call
			 * ->spare_active and clear saved_raid_disk
			 */
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			md_reap_sync_thread(mddev);
			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
			goto unlock;
		}

		if (mddev_is_clustered(mddev)) {
			struct md_rdev *rdev;
			/* kick the device if another node issued a
			 * remove disk.
			 */
			rdev_for_each(rdev, mddev) {
				if (test_and_clear_bit(ClusterRemove, &rdev->flags) &&
						rdev->raid_disk < 0)
					md_kick_rdev_from_array(rdev);
			}
		}

		/* set_in_sync() is called with mddev->lock held */
		if (!mddev->external && !mddev->in_sync) {
			spin_lock(&mddev->lock);
			set_in_sync(mddev);
			spin_unlock(&mddev->lock);
		}

		if (mddev->sb_flags)
			md_update_sb(mddev, 0);

		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
			/* resync/recovery still happening */
			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			goto unlock;
		}
		/* a sync thread exists and is not "still running": collect it */
		if (mddev->sync_thread) {
			md_reap_sync_thread(mddev);
			goto unlock;
		}
		/* Set RUNNING before clearing NEEDED to avoid
		 * any transients in the value of "sync_action".
		 */
		mddev->curr_resync_completed = 0;
		spin_lock(&mddev->lock);
		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
		spin_unlock(&mddev->lock);
		/* Clear some bits that don't mean anything, but
		 * might be left set
		 */
		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);

		if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
		    test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
			goto not_running;
		/* no recovery is running.
		 * remove any failed drives, then
		 * add spares if possible.
		 * Spares are also removed and re-added, to allow
		 * the personality to fail the re-add.
		 */

		/*
		 * Pick the kind of pass, in priority order: unfinished
		 * reshape, recovery onto spares, resync from recovery_cp,
		 * or a sync already requested via MD_RECOVERY_SYNC.
		 */
		if (mddev->reshape_position != MaxSector) {
			if (mddev->pers->check_reshape == NULL ||
			    mddev->pers->check_reshape(mddev) != 0)
				/* Cannot proceed */
				goto not_running;
			set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
		} else if ((spares = remove_and_add_spares(mddev, NULL))) {
			clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
			clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
			clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
		} else if (mddev->recovery_cp < MaxSector) {
			set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
		} else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
			/* nothing to be done ... */
			goto not_running;

		if (mddev->pers->sync_request) {
			if (spares) {
				/* We are adding a device or devices to an array
				 * which has the bitmap stored on all devices.
				 * So make sure all bitmap pages get written
				 */
				md_bitmap_write_all(mddev->bitmap);
			}
			/* thread creation is deferred to md_start_sync() on md_misc_wq */
			INIT_WORK(&mddev->del_work, md_start_sync);
			queue_work(md_misc_wq, &mddev->del_work);
			goto unlock;
		}
	not_running:
		/* nothing was started: undo MD_RECOVERY_RUNNING and tell waiters */
		if (!mddev->sync_thread) {
			clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
			wake_up(&resync_wait);
			if (test_and_clear_bit(MD_RECOVERY_RECOVER,
					       &mddev->recovery))
				if (mddev->sysfs_action)
					sysfs_notify_dirent_safe(mddev->sysfs_action);
		}
	unlock:
		wake_up(&mddev->sb_wait);
		mddev_unlock(mddev);
	}
}
EXPORT_SYMBOL(md_check_recovery);
8937 
8938 void md_reap_sync_thread(struct mddev *mddev)
8939 {
8940 	struct md_rdev *rdev;
8941 	sector_t old_dev_sectors = mddev->dev_sectors;
8942 	bool is_reshaped = false;
8943 
8944 	/* resync has finished, collect result */
8945 	md_unregister_thread(&mddev->sync_thread);
8946 	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
8947 	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
8948 		/* success...*/
8949 		/* activate any spares */
8950 		if (mddev->pers->spare_active(mddev)) {
8951 			sysfs_notify(&mddev->kobj, NULL,
8952 				     "degraded");
8953 			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8954 		}
8955 	}
8956 	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
8957 	    mddev->pers->finish_reshape) {
8958 		mddev->pers->finish_reshape(mddev);
8959 		if (mddev_is_clustered(mddev))
8960 			is_reshaped = true;
8961 	}
8962 
8963 	/* If array is no-longer degraded, then any saved_raid_disk
8964 	 * information must be scrapped.
8965 	 */
8966 	if (!mddev->degraded)
8967 		rdev_for_each(rdev, mddev)
8968 			rdev->saved_raid_disk = -1;
8969 
8970 	md_update_sb(mddev, 1);
8971 	/* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can
8972 	 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by
8973 	 * clustered raid */
8974 	if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
8975 		md_cluster_ops->resync_finish(mddev);
8976 	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8977 	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
8978 	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8979 	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
8980 	clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
8981 	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
8982 	/*
8983 	 * We call md_cluster_ops->update_size here because sync_size could
8984 	 * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared,
8985 	 * so it is time to update size across cluster.
8986 	 */
8987 	if (mddev_is_clustered(mddev) && is_reshaped
8988 				      && !test_bit(MD_CLOSING, &mddev->flags))
8989 		md_cluster_ops->update_size(mddev, old_dev_sectors);
8990 	wake_up(&resync_wait);
8991 	/* flag recovery needed just to double check */
8992 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8993 	sysfs_notify_dirent_safe(mddev->sysfs_action);
8994 	md_new_event(mddev);
8995 	if (mddev->event_work.func)
8996 		queue_work(md_misc_wq, &mddev->event_work);
8997 }
8998 EXPORT_SYMBOL(md_reap_sync_thread);
8999 
9000 void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
9001 {
9002 	sysfs_notify_dirent_safe(rdev->sysfs_state);
9003 	wait_event_timeout(rdev->blocked_wait,
9004 			   !test_bit(Blocked, &rdev->flags) &&
9005 			   !test_bit(BlockedBadBlocks, &rdev->flags),
9006 			   msecs_to_jiffies(5000));
9007 	rdev_dec_pending(rdev, mddev);
9008 }
9009 EXPORT_SYMBOL(md_wait_for_blocked_rdev);
9010 
9011 void md_finish_reshape(struct mddev *mddev)
9012 {
9013 	/* called be personality module when reshape completes. */
9014 	struct md_rdev *rdev;
9015 
9016 	rdev_for_each(rdev, mddev) {
9017 		if (rdev->data_offset > rdev->new_data_offset)
9018 			rdev->sectors += rdev->data_offset - rdev->new_data_offset;
9019 		else
9020 			rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
9021 		rdev->data_offset = rdev->new_data_offset;
9022 	}
9023 }
9024 EXPORT_SYMBOL(md_finish_reshape);
9025 
9026 /* Bad block management */
9027 
9028 /* Returns 1 on success, 0 on failure */
9029 int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
9030 		       int is_new)
9031 {
9032 	struct mddev *mddev = rdev->mddev;
9033 	int rv;
9034 	if (is_new)
9035 		s += rdev->new_data_offset;
9036 	else
9037 		s += rdev->data_offset;
9038 	rv = badblocks_set(&rdev->badblocks, s, sectors, 0);
9039 	if (rv == 0) {
9040 		/* Make sure they get written out promptly */
9041 		if (test_bit(ExternalBbl, &rdev->flags))
9042 			sysfs_notify(&rdev->kobj, NULL,
9043 				     "unacknowledged_bad_blocks");
9044 		sysfs_notify_dirent_safe(rdev->sysfs_state);
9045 		set_mask_bits(&mddev->sb_flags, 0,
9046 			      BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
9047 		md_wakeup_thread(rdev->mddev->thread);
9048 		return 1;
9049 	} else
9050 		return 0;
9051 }
9052 EXPORT_SYMBOL_GPL(rdev_set_badblocks);
9053 
9054 int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
9055 			 int is_new)
9056 {
9057 	int rv;
9058 	if (is_new)
9059 		s += rdev->new_data_offset;
9060 	else
9061 		s += rdev->data_offset;
9062 	rv = badblocks_clear(&rdev->badblocks, s, sectors);
9063 	if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags))
9064 		sysfs_notify(&rdev->kobj, NULL, "bad_blocks");
9065 	return rv;
9066 }
9067 EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
9068 
9069 static int md_notify_reboot(struct notifier_block *this,
9070 			    unsigned long code, void *x)
9071 {
9072 	struct list_head *tmp;
9073 	struct mddev *mddev;
9074 	int need_delay = 0;
9075 
9076 	for_each_mddev(mddev, tmp) {
9077 		if (mddev_trylock(mddev)) {
9078 			if (mddev->pers)
9079 				__md_stop_writes(mddev);
9080 			if (mddev->persistent)
9081 				mddev->safemode = 2;
9082 			mddev_unlock(mddev);
9083 		}
9084 		need_delay = 1;
9085 	}
9086 	/*
9087 	 * certain more exotic SCSI devices are known to be
9088 	 * volatile wrt too early system reboots. While the
9089 	 * right place to handle this issue is the given
9090 	 * driver, we do want to have a safe RAID driver ...
9091 	 */
9092 	if (need_delay)
9093 		mdelay(1000*1);
9094 
9095 	return NOTIFY_DONE;
9096 }
9097 
9098 static struct notifier_block md_notifier = {
9099 	.notifier_call	= md_notify_reboot,
9100 	.next		= NULL,
9101 	.priority	= INT_MAX, /* before any real devices */
9102 };
9103 
9104 static void md_geninit(void)
9105 {
9106 	pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
9107 
9108 	proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
9109 }
9110 
9111 static int __init md_init(void)
9112 {
9113 	int ret = -ENOMEM;
9114 
9115 	md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
9116 	if (!md_wq)
9117 		goto err_wq;
9118 
9119 	md_misc_wq = alloc_workqueue("md_misc", 0, 0);
9120 	if (!md_misc_wq)
9121 		goto err_misc_wq;
9122 
9123 	if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
9124 		goto err_md;
9125 
9126 	if ((ret = register_blkdev(0, "mdp")) < 0)
9127 		goto err_mdp;
9128 	mdp_major = ret;
9129 
9130 	blk_register_region(MKDEV(MD_MAJOR, 0), 512, THIS_MODULE,
9131 			    md_probe, NULL, NULL);
9132 	blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
9133 			    md_probe, NULL, NULL);
9134 
9135 	register_reboot_notifier(&md_notifier);
9136 	raid_table_header = register_sysctl_table(raid_root_table);
9137 
9138 	md_geninit();
9139 	return 0;
9140 
9141 err_mdp:
9142 	unregister_blkdev(MD_MAJOR, "md");
9143 err_md:
9144 	destroy_workqueue(md_misc_wq);
9145 err_misc_wq:
9146 	destroy_workqueue(md_wq);
9147 err_wq:
9148 	return ret;
9149 }
9150 
9151 static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
9152 {
9153 	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
9154 	struct md_rdev *rdev2;
9155 	int role, ret;
9156 	char b[BDEVNAME_SIZE];
9157 
9158 	/*
9159 	 * If size is changed in another node then we need to
9160 	 * do resize as well.
9161 	 */
9162 	if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
9163 		ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
9164 		if (ret)
9165 			pr_info("md-cluster: resize failed\n");
9166 		else
9167 			md_bitmap_update_sb(mddev->bitmap);
9168 	}
9169 
9170 	/* Check for change of roles in the active devices */
9171 	rdev_for_each(rdev2, mddev) {
9172 		if (test_bit(Faulty, &rdev2->flags))
9173 			continue;
9174 
9175 		/* Check if the roles changed */
9176 		role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);
9177 
9178 		if (test_bit(Candidate, &rdev2->flags)) {
9179 			if (role == 0xfffe) {
9180 				pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b));
9181 				md_kick_rdev_from_array(rdev2);
9182 				continue;
9183 			}
9184 			else
9185 				clear_bit(Candidate, &rdev2->flags);
9186 		}
9187 
9188 		if (role != rdev2->raid_disk) {
9189 			/*
9190 			 * got activated except reshape is happening.
9191 			 */
9192 			if (rdev2->raid_disk == -1 && role != 0xffff &&
9193 			    !(le32_to_cpu(sb->feature_map) &
9194 			      MD_FEATURE_RESHAPE_ACTIVE)) {
9195 				rdev2->saved_raid_disk = role;
9196 				ret = remove_and_add_spares(mddev, rdev2);
9197 				pr_info("Activated spare: %s\n",
9198 					bdevname(rdev2->bdev,b));
9199 				/* wakeup mddev->thread here, so array could
9200 				 * perform resync with the new activated disk */
9201 				set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9202 				md_wakeup_thread(mddev->thread);
9203 
9204 			}
9205 			/* device faulty
9206 			 * We just want to do the minimum to mark the disk
9207 			 * as faulty. The recovery is performed by the
9208 			 * one who initiated the error.
9209 			 */
9210 			if ((role == 0xfffe) || (role == 0xfffd)) {
9211 				md_error(mddev, rdev2);
9212 				clear_bit(Blocked, &rdev2->flags);
9213 			}
9214 		}
9215 	}
9216 
9217 	if (mddev->raid_disks != le32_to_cpu(sb->raid_disks))
9218 		update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
9219 
9220 	/*
9221 	 * Since mddev->delta_disks has already updated in update_raid_disks,
9222 	 * so it is time to check reshape.
9223 	 */
9224 	if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
9225 	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
9226 		/*
9227 		 * reshape is happening in the remote node, we need to
9228 		 * update reshape_position and call start_reshape.
9229 		 */
9230 		mddev->reshape_position = le64_to_cpu(sb->reshape_position);
9231 		if (mddev->pers->update_reshape_pos)
9232 			mddev->pers->update_reshape_pos(mddev);
9233 		if (mddev->pers->start_reshape)
9234 			mddev->pers->start_reshape(mddev);
9235 	} else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
9236 		   mddev->reshape_position != MaxSector &&
9237 		   !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
9238 		/* reshape is just done in another node. */
9239 		mddev->reshape_position = MaxSector;
9240 		if (mddev->pers->update_reshape_pos)
9241 			mddev->pers->update_reshape_pos(mddev);
9242 	}
9243 
9244 	/* Finally set the event to be up to date */
9245 	mddev->events = le64_to_cpu(sb->events);
9246 }
9247 
9248 static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
9249 {
9250 	int err;
9251 	struct page *swapout = rdev->sb_page;
9252 	struct mdp_superblock_1 *sb;
9253 
9254 	/* Store the sb page of the rdev in the swapout temporary
9255 	 * variable in case we err in the future
9256 	 */
9257 	rdev->sb_page = NULL;
9258 	err = alloc_disk_sb(rdev);
9259 	if (err == 0) {
9260 		ClearPageUptodate(rdev->sb_page);
9261 		rdev->sb_loaded = 0;
9262 		err = super_types[mddev->major_version].
9263 			load_super(rdev, NULL, mddev->minor_version);
9264 	}
9265 	if (err < 0) {
9266 		pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
9267 				__func__, __LINE__, rdev->desc_nr, err);
9268 		if (rdev->sb_page)
9269 			put_page(rdev->sb_page);
9270 		rdev->sb_page = swapout;
9271 		rdev->sb_loaded = 1;
9272 		return err;
9273 	}
9274 
9275 	sb = page_address(rdev->sb_page);
9276 	/* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET
9277 	 * is not set
9278 	 */
9279 
9280 	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
9281 		rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
9282 
9283 	/* The other node finished recovery, call spare_active to set
9284 	 * device In_sync and mddev->degraded
9285 	 */
9286 	if (rdev->recovery_offset == MaxSector &&
9287 	    !test_bit(In_sync, &rdev->flags) &&
9288 	    mddev->pers->spare_active(mddev))
9289 		sysfs_notify(&mddev->kobj, NULL, "degraded");
9290 
9291 	put_page(swapout);
9292 	return 0;
9293 }
9294 
9295 void md_reload_sb(struct mddev *mddev, int nr)
9296 {
9297 	struct md_rdev *rdev;
9298 	int err;
9299 
9300 	/* Find the rdev */
9301 	rdev_for_each_rcu(rdev, mddev) {
9302 		if (rdev->desc_nr == nr)
9303 			break;
9304 	}
9305 
9306 	if (!rdev || rdev->desc_nr != nr) {
9307 		pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
9308 		return;
9309 	}
9310 
9311 	err = read_rdev(mddev, rdev);
9312 	if (err < 0)
9313 		return;
9314 
9315 	check_sb_changes(mddev, rdev);
9316 
9317 	/* Read all rdev's to update recovery_offset */
9318 	rdev_for_each_rcu(rdev, mddev) {
9319 		if (!test_bit(Faulty, &rdev->flags))
9320 			read_rdev(mddev, rdev);
9321 	}
9322 }
9323 EXPORT_SYMBOL(md_reload_sb);
9324 
9325 #ifndef MODULE
9326 
9327 /*
9328  * Searches all registered partitions for autorun RAID arrays
9329  * at boot time.
9330  */
9331 
9332 static DEFINE_MUTEX(detected_devices_mutex);
9333 static LIST_HEAD(all_detected_devices);
9334 struct detected_devices_node {
9335 	struct list_head list;
9336 	dev_t dev;
9337 };
9338 
9339 void md_autodetect_dev(dev_t dev)
9340 {
9341 	struct detected_devices_node *node_detected_dev;
9342 
9343 	node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
9344 	if (node_detected_dev) {
9345 		node_detected_dev->dev = dev;
9346 		mutex_lock(&detected_devices_mutex);
9347 		list_add_tail(&node_detected_dev->list, &all_detected_devices);
9348 		mutex_unlock(&detected_devices_mutex);
9349 	}
9350 }
9351 
9352 static void autostart_arrays(int part)
9353 {
9354 	struct md_rdev *rdev;
9355 	struct detected_devices_node *node_detected_dev;
9356 	dev_t dev;
9357 	int i_scanned, i_passed;
9358 
9359 	i_scanned = 0;
9360 	i_passed = 0;
9361 
9362 	pr_info("md: Autodetecting RAID arrays.\n");
9363 
9364 	mutex_lock(&detected_devices_mutex);
9365 	while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
9366 		i_scanned++;
9367 		node_detected_dev = list_entry(all_detected_devices.next,
9368 					struct detected_devices_node, list);
9369 		list_del(&node_detected_dev->list);
9370 		dev = node_detected_dev->dev;
9371 		kfree(node_detected_dev);
9372 		mutex_unlock(&detected_devices_mutex);
9373 		rdev = md_import_device(dev,0, 90);
9374 		mutex_lock(&detected_devices_mutex);
9375 		if (IS_ERR(rdev))
9376 			continue;
9377 
9378 		if (test_bit(Faulty, &rdev->flags))
9379 			continue;
9380 
9381 		set_bit(AutoDetected, &rdev->flags);
9382 		list_add(&rdev->same_set, &pending_raid_disks);
9383 		i_passed++;
9384 	}
9385 	mutex_unlock(&detected_devices_mutex);
9386 
9387 	pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed);
9388 
9389 	autorun_devices(part);
9390 }
9391 
9392 #endif /* !MODULE */
9393 
9394 static __exit void md_exit(void)
9395 {
9396 	struct mddev *mddev;
9397 	struct list_head *tmp;
9398 	int delay = 1;
9399 
9400 	blk_unregister_region(MKDEV(MD_MAJOR,0), 512);
9401 	blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
9402 
9403 	unregister_blkdev(MD_MAJOR,"md");
9404 	unregister_blkdev(mdp_major, "mdp");
9405 	unregister_reboot_notifier(&md_notifier);
9406 	unregister_sysctl_table(raid_table_header);
9407 
9408 	/* We cannot unload the modules while some process is
9409 	 * waiting for us in select() or poll() - wake them up
9410 	 */
9411 	md_unloading = 1;
9412 	while (waitqueue_active(&md_event_waiters)) {
9413 		/* not safe to leave yet */
9414 		wake_up(&md_event_waiters);
9415 		msleep(delay);
9416 		delay += delay;
9417 	}
9418 	remove_proc_entry("mdstat", NULL);
9419 
9420 	for_each_mddev(mddev, tmp) {
9421 		export_array(mddev);
9422 		mddev->ctime = 0;
9423 		mddev->hold_active = 0;
9424 		/*
9425 		 * for_each_mddev() will call mddev_put() at the end of each
9426 		 * iteration.  As the mddev is now fully clear, this will
9427 		 * schedule the mddev for destruction by a workqueue, and the
9428 		 * destroy_workqueue() below will wait for that to complete.
9429 		 */
9430 	}
9431 	destroy_workqueue(md_misc_wq);
9432 	destroy_workqueue(md_wq);
9433 }
9434 
9435 subsys_initcall(md_init);
9436 module_exit(md_exit)
9437 
9438 static int get_ro(char *buffer, const struct kernel_param *kp)
9439 {
9440 	return sprintf(buffer, "%d", start_readonly);
9441 }
9442 static int set_ro(const char *val, const struct kernel_param *kp)
9443 {
9444 	return kstrtouint(val, 10, (unsigned int *)&start_readonly);
9445 }
9446 
9447 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
9448 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
9449 module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
9450 module_param(create_on_open, bool, S_IRUSR|S_IWUSR);
9451 
9452 MODULE_LICENSE("GPL");
9453 MODULE_DESCRIPTION("MD RAID framework");
9454 MODULE_ALIAS("md");
9455 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
9456