xref: /linux/drivers/md/md.c (revision 2c97b5ae83dca56718774e7b4bf9640f05d11867)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3    md.c : Multiple Devices driver for Linux
4      Copyright (C) 1998, 1999, 2000 Ingo Molnar
5 
6      completely rewritten, based on the MD driver code from Marc Zyngier
7 
8    Changes:
9 
10    - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
11    - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
12    - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
13    - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
14    - kmod support by: Cyrus Durgin
15    - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
16    - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
17 
18    - lots of fixes and improvements to the RAID1/RAID5 and generic
19      RAID code (such as request based resynchronization):
20 
21      Neil Brown <neilb@cse.unsw.edu.au>.
22 
23    - persistent bitmap code
24      Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
25 
26 
   Errors, Warnings, etc.
   Please use:
     pr_crit() for error conditions that risk data loss
     pr_err() for error conditions that are unexpected, like an IO error
         or internal inconsistency
     pr_warn() for error conditions that could have been predicted, like
         adding a device to an array when it has incompatible metadata
     pr_info() for interesting, very rare events, like an array starting
         or stopping, or resync starting or stopping
     pr_debug() for everything else.
37 
38 */
39 
40 #include <linux/sched/mm.h>
41 #include <linux/sched/signal.h>
42 #include <linux/kthread.h>
43 #include <linux/blkdev.h>
44 #include <linux/badblocks.h>
45 #include <linux/sysctl.h>
46 #include <linux/seq_file.h>
47 #include <linux/fs.h>
48 #include <linux/poll.h>
49 #include <linux/ctype.h>
50 #include <linux/string.h>
51 #include <linux/hdreg.h>
52 #include <linux/proc_fs.h>
53 #include <linux/random.h>
54 #include <linux/module.h>
55 #include <linux/reboot.h>
56 #include <linux/file.h>
57 #include <linux/compat.h>
58 #include <linux/delay.h>
59 #include <linux/raid/md_p.h>
60 #include <linux/raid/md_u.h>
61 #include <linux/slab.h>
62 #include <linux/percpu-refcount.h>
63 
64 #include <trace/events/block.h>
65 #include "md.h"
66 #include "md-bitmap.h"
67 #include "md-cluster.h"
68 
#ifndef MODULE
/* Forward declaration; only present when MODULE is not defined. */
static void autostart_arrays(int part);
#endif
72 
/* pers_list is a list of registered personalities protected
 * by pers_lock.
 * pers_lock does extra service to protect accesses to
 * mddev->thread when the mutex cannot be held.
 */
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

/* kobj_type handed to kobject_init() for every mddev, see mddev_init() */
static struct kobj_type md_ktype;

struct md_cluster_operations *md_cluster_ops;
EXPORT_SYMBOL(md_cluster_ops);
static struct module *md_cluster_mod;

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
/*
 * md_wq is where the flush work items (mddev->flush_work) are queued;
 * md_misc_wq is where mddev_put() queues the deferred deletion work.
 */
static struct workqueue_struct *md_wq;
static struct workqueue_struct *md_misc_wq;

/* Forward declarations. */
static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);
94 
/*
 * Default number of read corrections we'll attempt on an rdev
 * before ejecting it from the array. We divide the read error
 * count by 2 for every hour elapsed between read errors.
 */
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * You can change it via /proc/sys/dev/raid/speed_limit_min and _max,
 * or /sys/block/mdX/md/sync_speed_{min,max}.
 *
 * The two variables below are the system-wide values; speed_min() and
 * speed_max() fall back to them when the per-array value is zero.
 */

static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
116 static inline int speed_min(struct mddev *mddev)
117 {
118 	return mddev->sync_speed_min ?
119 		mddev->sync_speed_min : sysctl_speed_limit_min;
120 }
121 
122 static inline int speed_max(struct mddev *mddev)
123 {
124 	return mddev->sync_speed_max ?
125 		mddev->sync_speed_max : sysctl_speed_limit_max;
126 }
127 
128 static int rdev_init_wb(struct md_rdev *rdev)
129 {
130 	if (rdev->bdev->bd_queue->nr_hw_queues == 1)
131 		return 0;
132 
133 	spin_lock_init(&rdev->wb_list_lock);
134 	INIT_LIST_HEAD(&rdev->wb_list);
135 	init_waitqueue_head(&rdev->wb_io_wait);
136 	set_bit(WBCollisionCheck, &rdev->flags);
137 
138 	return 1;
139 }
140 
141 /*
142  * Create wb_info_pool if rdev is the first multi-queue device flaged
143  * with writemostly, also write-behind mode is enabled.
144  */
145 void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev,
146 			  bool is_suspend)
147 {
148 	if (mddev->bitmap_info.max_write_behind == 0)
149 		return;
150 
151 	if (!test_bit(WriteMostly, &rdev->flags) || !rdev_init_wb(rdev))
152 		return;
153 
154 	if (mddev->wb_info_pool == NULL) {
155 		unsigned int noio_flag;
156 
157 		if (!is_suspend)
158 			mddev_suspend(mddev);
159 		noio_flag = memalloc_noio_save();
160 		mddev->wb_info_pool = mempool_create_kmalloc_pool(NR_WB_INFOS,
161 							sizeof(struct wb_info));
162 		memalloc_noio_restore(noio_flag);
163 		if (!mddev->wb_info_pool)
164 			pr_err("can't alloc memory pool for writemostly\n");
165 		if (!is_suspend)
166 			mddev_resume(mddev);
167 	}
168 }
169 EXPORT_SYMBOL_GPL(mddev_create_wb_pool);
170 
171 /*
172  * destroy wb_info_pool if rdev is the last device flaged with WBCollisionCheck.
173  */
174 static void mddev_destroy_wb_pool(struct mddev *mddev, struct md_rdev *rdev)
175 {
176 	if (!test_and_clear_bit(WBCollisionCheck, &rdev->flags))
177 		return;
178 
179 	if (mddev->wb_info_pool) {
180 		struct md_rdev *temp;
181 		int num = 0;
182 
183 		/*
184 		 * Check if other rdevs need wb_info_pool.
185 		 */
186 		rdev_for_each(temp, mddev)
187 			if (temp != rdev &&
188 			    test_bit(WBCollisionCheck, &temp->flags))
189 				num++;
190 		if (!num) {
191 			mddev_suspend(rdev->mddev);
192 			mempool_destroy(mddev->wb_info_pool);
193 			mddev->wb_info_pool = NULL;
194 			mddev_resume(rdev->mddev);
195 		}
196 	}
197 }
198 
199 static struct ctl_table_header *raid_table_header;
200 
201 static struct ctl_table raid_table[] = {
202 	{
203 		.procname	= "speed_limit_min",
204 		.data		= &sysctl_speed_limit_min,
205 		.maxlen		= sizeof(int),
206 		.mode		= S_IRUGO|S_IWUSR,
207 		.proc_handler	= proc_dointvec,
208 	},
209 	{
210 		.procname	= "speed_limit_max",
211 		.data		= &sysctl_speed_limit_max,
212 		.maxlen		= sizeof(int),
213 		.mode		= S_IRUGO|S_IWUSR,
214 		.proc_handler	= proc_dointvec,
215 	},
216 	{ }
217 };
218 
219 static struct ctl_table raid_dir_table[] = {
220 	{
221 		.procname	= "raid",
222 		.maxlen		= 0,
223 		.mode		= S_IRUGO|S_IXUGO,
224 		.child		= raid_table,
225 	},
226 	{ }
227 };
228 
/* The "dev" directory, whose only child here is raid_dir_table ("raid"). */
static struct ctl_table raid_root_table[] = {
	{
		.procname	= "dev",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= raid_dir_table,
	},
	{  }
};
238 
/* Forward declaration of the block device operations table. */
static const struct block_device_operations md_fops;

static int start_readonly;

/*
 * The original mechanism for creating an md device is to create
 * a device node in /dev and to open it.  This causes races with device-close.
 * The preferred method is to write to the "new_array" module parameter.
 * This can avoid races.
 * Setting create_on_open to false disables the original mechanism
 * so all the races disappear.
 */
static bool create_on_open = true;
252 
253 struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
254 			    struct mddev *mddev)
255 {
256 	if (!mddev || !bioset_initialized(&mddev->bio_set))
257 		return bio_alloc(gfp_mask, nr_iovecs);
258 
259 	return bio_alloc_bioset(gfp_mask, nr_iovecs, &mddev->bio_set);
260 }
261 EXPORT_SYMBOL_GPL(bio_alloc_mddev);
262 
263 static struct bio *md_bio_alloc_sync(struct mddev *mddev)
264 {
265 	if (!mddev || !bioset_initialized(&mddev->sync_set))
266 		return bio_alloc(GFP_NOIO, 1);
267 
268 	return bio_alloc_bioset(GFP_NOIO, 1, &mddev->sync_set);
269 }
270 
/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
/*
 * Record one event: bump the global counter, then wake anyone sleeping
 * on md_event_waiters.  @mddev is not used by this function.
 */
void md_new_event(struct mddev *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}
EXPORT_SYMBOL_GPL(md_new_event);
289 
/*
 * Enables to iterate over all existing md arrays
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);

/*
 * Iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop will own
 * a reference to the current mddev and must mddev_put it.
 *
 * How the three for() clauses cooperate:
 *  - init: take the lock and point _tmp at the first list node;
 *  - condition: with the lock still held, take a reference on the node
 *    _tmp points at (unless it is the list head), drop the lock, release
 *    the reference on the previous _mddev, then make _mddev the new entry;
 *  - increment: re-take the lock and advance _tmp.
 * The loop body therefore runs unlocked, holding a reference on _mddev.
 */
#define for_each_mddev(_mddev,_tmp)					\
									\
	for (({ spin_lock(&all_mddevs_lock);				\
		_tmp = all_mddevs.next;					\
		_mddev = NULL;});					\
	     ({ if (_tmp != &all_mddevs)				\
			mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
		spin_unlock(&all_mddevs_lock);				\
		if (_mddev) mddev_put(_mddev);				\
		_mddev = list_entry(_tmp, struct mddev, all_mddevs);	\
		_tmp != &all_mddevs;});					\
	     ({ spin_lock(&all_mddevs_lock);				\
		_tmp = _tmp->next;})					\
		)
318 
319 /* Rather than calling directly into the personality make_request function,
320  * IO requests come here first so that we can check if the device is
321  * being suspended pending a reconfiguration.
322  * We hold a refcount over the call to ->make_request.  By the time that
323  * call has finished, the bio has been linked into some internal structure
324  * and so is visible to ->quiesce(), so we don't need the refcount any more.
325  */
326 static bool is_suspended(struct mddev *mddev, struct bio *bio)
327 {
328 	if (mddev->suspended)
329 		return true;
330 	if (bio_data_dir(bio) != WRITE)
331 		return false;
332 	if (mddev->suspend_lo >= mddev->suspend_hi)
333 		return false;
334 	if (bio->bi_iter.bi_sector >= mddev->suspend_hi)
335 		return false;
336 	if (bio_end_sector(bio) < mddev->suspend_lo)
337 		return false;
338 	return true;
339 }
340 
/*
 * Pass @bio to the personality's ->make_request, first sleeping on
 * mddev->sb_wait for as long as is_suspended() says the bio must be held
 * back.  mddev->active_io is elevated across the ->make_request call.
 * If ->make_request returns false, the count is dropped again and the
 * whole suspend check is repeated.
 */
void md_handle_request(struct mddev *mddev, struct bio *bio)
{
check_suspended:
	rcu_read_lock();
	if (is_suspended(mddev, bio)) {
		DEFINE_WAIT(__wait);
		for (;;) {
			prepare_to_wait(&mddev->sb_wait, &__wait,
					TASK_UNINTERRUPTIBLE);
			if (!is_suspended(mddev, bio))
				break;
			/* leave the RCU read-side section before sleeping */
			rcu_read_unlock();
			schedule();
			rcu_read_lock();
		}
		finish_wait(&mddev->sb_wait, &__wait);
	}
	/*
	 * Incremented while still inside rcu_read_lock(): mddev_suspend()
	 * does synchronize_rcu() before it waits for active_io to hit zero.
	 */
	atomic_inc(&mddev->active_io);
	rcu_read_unlock();

	if (!mddev->pers->make_request(mddev, bio)) {
		/* not accepted: undo the count, wake waiters, start over */
		atomic_dec(&mddev->active_io);
		wake_up(&mddev->sb_wait);
		goto check_suspended;
	}

	/* last in-flight request while suspended: wake sb_wait waiters */
	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
		wake_up(&mddev->sb_wait);
}
EXPORT_SYMBOL(md_handle_request);
371 
372 static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
373 {
374 	const int rw = bio_data_dir(bio);
375 	const int sgrp = op_stat_group(bio_op(bio));
376 	struct mddev *mddev = q->queuedata;
377 	unsigned int sectors;
378 
379 	if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
380 		bio_io_error(bio);
381 		return BLK_QC_T_NONE;
382 	}
383 
384 	blk_queue_split(q, &bio);
385 
386 	if (mddev == NULL || mddev->pers == NULL) {
387 		bio_io_error(bio);
388 		return BLK_QC_T_NONE;
389 	}
390 	if (mddev->ro == 1 && unlikely(rw == WRITE)) {
391 		if (bio_sectors(bio) != 0)
392 			bio->bi_status = BLK_STS_IOERR;
393 		bio_endio(bio);
394 		return BLK_QC_T_NONE;
395 	}
396 
397 	/*
398 	 * save the sectors now since our bio can
399 	 * go away inside make_request
400 	 */
401 	sectors = bio_sectors(bio);
402 	/* bio could be mergeable after passing to underlayer */
403 	bio->bi_opf &= ~REQ_NOMERGE;
404 
405 	md_handle_request(mddev, bio);
406 
407 	part_stat_lock();
408 	part_stat_inc(&mddev->gendisk->part0, ios[sgrp]);
409 	part_stat_add(&mddev->gendisk->part0, sectors[sgrp], sectors);
410 	part_stat_unlock();
411 
412 	return BLK_QC_T_NONE;
413 }
414 
/* mddev_suspend makes sure no new requests are submitted
 * to the device, and that any requests that have been submitted
 * are completely handled.
 * Once mddev_detach() is called and completes, the module will be
 * completely unused.
 *
 * Calls nest: ->suspended is a counter and only the outermost call
 * (0 -> 1) performs the work below.  The caller must hold
 * mddev->reconfig_mutex (checked with lockdep).
 */
void mddev_suspend(struct mddev *mddev)
{
	/* warn (once) if invoked from the array's own mddev->thread task */
	WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk);
	lockdep_assert_held(&mddev->reconfig_mutex);
	if (mddev->suspended++)
		return;
	/* let md_handle_request() callers inside rcu_read_lock() finish */
	synchronize_rcu();
	wake_up(&mddev->sb_wait);
	set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags);
	smp_mb__after_atomic();
	/* active_io is dropped by md_handle_request() after ->make_request */
	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
	mddev->pers->quiesce(mddev, 1);
	clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
	wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));

	del_timer_sync(&mddev->safemode_timer);
}
EXPORT_SYMBOL_GPL(mddev_suspend);
439 
/*
 * Undo one mddev_suspend().  Only when the suspend count drops back to
 * zero are the sb_wait sleepers woken, the personality un-quiesced and
 * the md threads kicked with MD_RECOVERY_NEEDED set.
 * The caller must hold mddev->reconfig_mutex (checked with lockdep).
 */
void mddev_resume(struct mddev *mddev)
{
	lockdep_assert_held(&mddev->reconfig_mutex);
	if (--mddev->suspended)
		return;
	wake_up(&mddev->sb_wait);
	mddev->pers->quiesce(mddev, 0);

	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
}
EXPORT_SYMBOL_GPL(mddev_resume);
453 
454 int mddev_congested(struct mddev *mddev, int bits)
455 {
456 	struct md_personality *pers = mddev->pers;
457 	int ret = 0;
458 
459 	rcu_read_lock();
460 	if (mddev->suspended)
461 		ret = 1;
462 	else if (pers && pers->congested)
463 		ret = pers->congested(mddev, bits);
464 	rcu_read_unlock();
465 	return ret;
466 }
467 EXPORT_SYMBOL_GPL(mddev_congested);
/* void-pointer adapter around mddev_congested(); @data is the mddev. */
static int md_congested(void *data, int bits)
{
	return mddev_congested(data, bits);
}
473 
/*
 * Generic flush handling for md
 */

/*
 * Completion handler for the per-rdev flush bios sent by submit_flushes().
 * Drops the rdev's pending count and, when this was the last outstanding
 * flush, queues mddev->flush_work on md_wq.  The bio is released here.
 */
static void md_end_flush(struct bio *bio)
{
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	rdev_dec_pending(rdev, mddev);

	if (atomic_dec_and_test(&mddev->flush_pending)) {
		/* The pre-request flush has finished */
		queue_work(md_wq, &mddev->flush_work);
	}
	bio_put(bio);
}
491 
static void md_submit_flush_data(struct work_struct *ws);

/*
 * Work handler installed by md_flush_request().  Sends an empty
 * REQ_PREFLUSH write to every rdev that has raid_disk >= 0 and is not
 * Faulty, and re-arms flush_work so that md_submit_flush_data() runs
 * once flush_pending drops to zero.
 */
static void submit_flushes(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct md_rdev *rdev;

	mddev->start_flush = ktime_get_boottime();
	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
	/*
	 * Start at 1; the matching decrement is the atomic_dec_and_test()
	 * after the loop, so the count cannot reach zero while flushes are
	 * still being submitted.
	 */
	atomic_set(&mddev->flush_pending, 1);
	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev)
		if (rdev->raid_disk >= 0 &&
		    !test_bit(Faulty, &rdev->flags)) {
			/* Take two references, one is dropped
			 * when request finishes, one after
			 * we reclaim rcu_read_lock
			 */
			struct bio *bi;
			atomic_inc(&rdev->nr_pending);
			atomic_inc(&rdev->nr_pending);
			/* allocation and submission happen outside RCU */
			rcu_read_unlock();
			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
			bi->bi_end_io = md_end_flush;
			bi->bi_private = rdev;
			bio_set_dev(bi, rdev->bdev);
			bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
			atomic_inc(&mddev->flush_pending);
			submit_bio(bi);
			rcu_read_lock();
			rdev_dec_pending(rdev, mddev);
		}
	rcu_read_unlock();
	if (atomic_dec_and_test(&mddev->flush_pending))
		queue_work(md_wq, &mddev->flush_work);
}
528 
/*
 * Second-stage work handler (armed by submit_flushes()): records the
 * finished flush in last_flush, releases the flush_bio slot and then
 * either completes the bio (empty barrier) or passes its data part to
 * md_handle_request() with REQ_PREFLUSH cleared.
 */
static void md_submit_flush_data(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct bio *bio = mddev->flush_bio;

	/*
	 * must reset flush_bio before calling into md_handle_request to avoid a
	 * deadlock, because other bios passed md_handle_request suspend check
	 * could wait for this and below md_handle_request could wait for those
	 * bios because of suspend check
	 */
	mddev->last_flush = mddev->start_flush;
	mddev->flush_bio = NULL;
	/* md_flush_request() waits on sb_wait for flush_bio to clear */
	wake_up(&mddev->sb_wait);

	if (bio->bi_iter.bi_size == 0) {
		/* an empty barrier - all done */
		bio_endio(bio);
	} else {
		bio->bi_opf &= ~REQ_PREFLUSH;
		md_handle_request(mddev, bio);
	}
}
552 
553 /*
554  * Manages consolidation of flushes and submitting any flushes needed for
555  * a bio with REQ_PREFLUSH.  Returns true if the bio is finished or is
556  * being finished in another context.  Returns false if the flushing is
557  * complete but still needs the I/O portion of the bio to be processed.
558  */
559 bool md_flush_request(struct mddev *mddev, struct bio *bio)
560 {
561 	ktime_t start = ktime_get_boottime();
562 	spin_lock_irq(&mddev->lock);
563 	wait_event_lock_irq(mddev->sb_wait,
564 			    !mddev->flush_bio ||
565 			    ktime_after(mddev->last_flush, start),
566 			    mddev->lock);
567 	if (!ktime_after(mddev->last_flush, start)) {
568 		WARN_ON(mddev->flush_bio);
569 		mddev->flush_bio = bio;
570 		bio = NULL;
571 	}
572 	spin_unlock_irq(&mddev->lock);
573 
574 	if (!bio) {
575 		INIT_WORK(&mddev->flush_work, submit_flushes);
576 		queue_work(md_wq, &mddev->flush_work);
577 	} else {
578 		/* flush was performed for some other bio while we waited. */
579 		if (bio->bi_iter.bi_size == 0)
580 			/* an empty barrier - all done */
581 			bio_endio(bio);
582 		else {
583 			bio->bi_opf &= ~REQ_PREFLUSH;
584 			return false;
585 		}
586 	}
587 	return true;
588 }
589 EXPORT_SYMBOL(md_flush_request);
590 
/* Take a reference on @mddev (mddev->active) and return it. */
static inline struct mddev *mddev_get(struct mddev *mddev)
{
	atomic_inc(&mddev->active);
	return mddev;
}
596 
597 static void mddev_delayed_delete(struct work_struct *ws);
598 
599 static void mddev_put(struct mddev *mddev)
600 {
601 	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
602 		return;
603 	if (!mddev->raid_disks && list_empty(&mddev->disks) &&
604 	    mddev->ctime == 0 && !mddev->hold_active) {
605 		/* Array is not configured at all, and not held active,
606 		 * so destroy it */
607 		list_del_init(&mddev->all_mddevs);
608 
609 		/*
610 		 * Call queue_work inside the spinlock so that
611 		 * flush_workqueue() after mddev_find will succeed in waiting
612 		 * for the work to be done.
613 		 */
614 		INIT_WORK(&mddev->del_work, mddev_delayed_delete);
615 		queue_work(md_misc_wq, &mddev->del_work);
616 	}
617 	spin_unlock(&all_mddevs_lock);
618 }
619 
static void md_safemode_timeout(struct timer_list *t);

/*
 * Initialise the embedded objects and default field values of @mddev:
 * kobject, mutexes, lists, safemode timer, counters, lock and wait
 * queues.  The reference count (->active) starts at 1.
 */
void mddev_init(struct mddev *mddev)
{
	kobject_init(&mddev->kobj, &md_ktype);
	mutex_init(&mddev->open_mutex);
	mutex_init(&mddev->reconfig_mutex);
	mutex_init(&mddev->bitmap_info.mutex);
	INIT_LIST_HEAD(&mddev->disks);
	INIT_LIST_HEAD(&mddev->all_mddevs);
	timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
	atomic_set(&mddev->active, 1);
	atomic_set(&mddev->openers, 0);
	atomic_set(&mddev->active_io, 0);
	spin_lock_init(&mddev->lock);
	atomic_set(&mddev->flush_pending, 0);
	init_waitqueue_head(&mddev->sb_wait);
	init_waitqueue_head(&mddev->recovery_wait);
	/* defaults: no reshape position, full resync range, no level */
	mddev->reshape_position = MaxSector;
	mddev->reshape_backwards = 0;
	mddev->last_sync_action = "none";
	mddev->resync_min = 0;
	mddev->resync_max = MaxSector;
	mddev->level = LEVEL_NONE;
}
EXPORT_SYMBOL_GPL(mddev_init);
646 
/*
 * Find the mddev for @unit and take a reference on it, or create and
 * register a new one if none exists.  With @unit == 0 a currently unused
 * minor on MD_MAJOR is picked instead.
 *
 * The function runs in up to two passes.  The first pass searches under
 * all_mddevs_lock; if nothing is found the lock is dropped, a new mddev
 * is allocated and initialised, and the search is retried so that the
 * lookup and the insertion happen under one hold of the lock.
 *
 * Returns NULL if the allocation fails or, for @unit == 0, if every
 * minor is in use.
 */
static struct mddev *mddev_find(dev_t unit)
{
	struct mddev *mddev, *new = NULL;

	/* for a non-MD_MAJOR unit, clear the low MdpMinorShift minor bits */
	if (unit && MAJOR(unit) != MD_MAJOR)
		unit &= ~((1<<MdpMinorShift)-1);

 retry:
	spin_lock(&all_mddevs_lock);

	if (unit) {
		list_for_each_entry(mddev, &all_mddevs, all_mddevs)
			if (mddev->unit == unit) {
				mddev_get(mddev);
				spin_unlock(&all_mddevs_lock);
				/* drop the speculative allocation, if any */
				kfree(new);
				return mddev;
			}

		if (new) {
			list_add(&new->all_mddevs, &all_mddevs);
			spin_unlock(&all_mddevs_lock);
			new->hold_active = UNTIL_IOCTL;
			return new;
		}
	} else if (new) {
		/* find an unused unit number */
		static int next_minor = 512;
		int start = next_minor;
		int is_free = 0;
		int dev = 0;
		while (!is_free) {
			dev = MKDEV(MD_MAJOR, next_minor);
			next_minor++;
			if (next_minor > MINORMASK)
				next_minor = 0;
			if (next_minor == start) {
				/* Oh dear, all in use. */
				spin_unlock(&all_mddevs_lock);
				kfree(new);
				return NULL;
			}

			is_free = 1;
			list_for_each_entry(mddev, &all_mddevs, all_mddevs)
				if (mddev->unit == dev) {
					is_free = 0;
					break;
				}
		}
		new->unit = dev;
		new->md_minor = MINOR(dev);
		new->hold_active = UNTIL_STOP;
		list_add(&new->all_mddevs, &all_mddevs);
		spin_unlock(&all_mddevs_lock);
		return new;
	}
	spin_unlock(&all_mddevs_lock);

	/* first pass found nothing: allocate unlocked, then retry */
	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return NULL;

	new->unit = unit;
	if (MAJOR(unit) == MD_MAJOR)
		new->md_minor = MINOR(unit);
	else
		new->md_minor = MINOR(unit) >> MdpMinorShift;

	mddev_init(new);

	goto retry;
}
720 
static struct attribute_group md_redundancy_group;

/*
 * Release mddev->reconfig_mutex.  If a sysfs attribute group is pending
 * removal (mddev->to_remove), it is removed after the mutex has been
 * dropped.  Finally the md thread and any sb_wait sleepers are woken.
 */
void mddev_unlock(struct mddev *mddev)
{
	if (mddev->to_remove) {
		/* These cannot be removed under reconfig_mutex as
		 * an access to the files will try to take reconfig_mutex
		 * while holding the file unremovable, which leads to
		 * a deadlock.
		 * So set sysfs_active while the removal is happening,
		 * and anything else which might set ->to_remove or may
		 * otherwise change the sysfs namespace will fail with
		 * -EBUSY if sysfs_active is still set.
		 * We set sysfs_active under reconfig_mutex and elsewhere
		 * test it under the same mutex to ensure its correct value
		 * is seen.
		 */
		struct attribute_group *to_remove = mddev->to_remove;
		mddev->to_remove = NULL;
		mddev->sysfs_active = 1;
		mutex_unlock(&mddev->reconfig_mutex);

		if (mddev->kobj.sd) {
			if (to_remove != &md_redundancy_group)
				sysfs_remove_group(&mddev->kobj, to_remove);
			/*
			 * The redundancy group goes away when there is no
			 * personality or it has no ->sync_request.
			 */
			if (mddev->pers == NULL ||
			    mddev->pers->sync_request == NULL) {
				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
				if (mddev->sysfs_action)
					sysfs_put(mddev->sysfs_action);
				mddev->sysfs_action = NULL;
			}
		}
		mddev->sysfs_active = 0;
	} else
		mutex_unlock(&mddev->reconfig_mutex);

	/* As we've dropped the mutex we need a spinlock to
	 * make sure the thread doesn't disappear
	 */
	spin_lock(&pers_lock);
	md_wakeup_thread(mddev->thread);
	wake_up(&mddev->sb_wait);
	spin_unlock(&pers_lock);
}
EXPORT_SYMBOL_GPL(mddev_unlock);
767 
768 struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
769 {
770 	struct md_rdev *rdev;
771 
772 	rdev_for_each_rcu(rdev, mddev)
773 		if (rdev->desc_nr == nr)
774 			return rdev;
775 
776 	return NULL;
777 }
778 EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);
779 
780 static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
781 {
782 	struct md_rdev *rdev;
783 
784 	rdev_for_each(rdev, mddev)
785 		if (rdev->bdev->bd_dev == dev)
786 			return rdev;
787 
788 	return NULL;
789 }
790 
791 struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev)
792 {
793 	struct md_rdev *rdev;
794 
795 	rdev_for_each_rcu(rdev, mddev)
796 		if (rdev->bdev->bd_dev == dev)
797 			return rdev;
798 
799 	return NULL;
800 }
801 EXPORT_SYMBOL_GPL(md_find_rdev_rcu);
802 
803 static struct md_personality *find_pers(int level, char *clevel)
804 {
805 	struct md_personality *pers;
806 	list_for_each_entry(pers, &pers_list, list) {
807 		if (level != LEVEL_NONE && pers->level == level)
808 			return pers;
809 		if (strcmp(pers->name, clevel)==0)
810 			return pers;
811 	}
812 	return NULL;
813 }
814 
815 /* return the offset of the super block in 512byte sectors */
816 static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
817 {
818 	sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
819 	return MD_NEW_SIZE_SECTORS(num_sectors);
820 }
821 
822 static int alloc_disk_sb(struct md_rdev *rdev)
823 {
824 	rdev->sb_page = alloc_page(GFP_KERNEL);
825 	if (!rdev->sb_page)
826 		return -ENOMEM;
827 	return 0;
828 }
829 
830 void md_rdev_clear(struct md_rdev *rdev)
831 {
832 	if (rdev->sb_page) {
833 		put_page(rdev->sb_page);
834 		rdev->sb_loaded = 0;
835 		rdev->sb_page = NULL;
836 		rdev->sb_start = 0;
837 		rdev->sectors = 0;
838 	}
839 	if (rdev->bb_page) {
840 		put_page(rdev->bb_page);
841 		rdev->bb_page = NULL;
842 	}
843 	badblocks_exit(&rdev->badblocks);
844 }
845 EXPORT_SYMBOL_GPL(md_rdev_clear);
846 
847 static void super_written(struct bio *bio)
848 {
849 	struct md_rdev *rdev = bio->bi_private;
850 	struct mddev *mddev = rdev->mddev;
851 
852 	if (bio->bi_status) {
853 		pr_err("md: super_written gets error=%d\n", bio->bi_status);
854 		md_error(mddev, rdev);
855 		if (!test_bit(Faulty, &rdev->flags)
856 		    && (bio->bi_opf & MD_FAILFAST)) {
857 			set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
858 			set_bit(LastDev, &rdev->flags);
859 		}
860 	} else
861 		clear_bit(LastDev, &rdev->flags);
862 
863 	if (atomic_dec_and_test(&mddev->pending_writes))
864 		wake_up(&mddev->sb_wait);
865 	rdev_dec_pending(rdev, mddev);
866 	bio_put(bio);
867 }
868 
869 void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
870 		   sector_t sector, int size, struct page *page)
871 {
872 	/* write first size bytes of page to sector of rdev
873 	 * Increment mddev->pending_writes before returning
874 	 * and decrement it on completion, waking up sb_wait
875 	 * if zero is reached.
876 	 * If an error occurred, call md_error
877 	 */
878 	struct bio *bio;
879 	int ff = 0;
880 
881 	if (!page)
882 		return;
883 
884 	if (test_bit(Faulty, &rdev->flags))
885 		return;
886 
887 	bio = md_bio_alloc_sync(mddev);
888 
889 	atomic_inc(&rdev->nr_pending);
890 
891 	bio_set_dev(bio, rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev);
892 	bio->bi_iter.bi_sector = sector;
893 	bio_add_page(bio, page, size, 0);
894 	bio->bi_private = rdev;
895 	bio->bi_end_io = super_written;
896 
897 	if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
898 	    test_bit(FailFast, &rdev->flags) &&
899 	    !test_bit(LastDev, &rdev->flags))
900 		ff = MD_FAILFAST;
901 	bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA | ff;
902 
903 	atomic_inc(&mddev->pending_writes);
904 	submit_bio(bio);
905 }
906 
907 int md_super_wait(struct mddev *mddev)
908 {
909 	/* wait for all superblock writes that were scheduled to complete */
910 	wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
911 	if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags))
912 		return -EAGAIN;
913 	return 0;
914 }
915 
916 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
917 		 struct page *page, int op, int op_flags, bool metadata_op)
918 {
919 	struct bio *bio = md_bio_alloc_sync(rdev->mddev);
920 	int ret;
921 
922 	if (metadata_op && rdev->meta_bdev)
923 		bio_set_dev(bio, rdev->meta_bdev);
924 	else
925 		bio_set_dev(bio, rdev->bdev);
926 	bio_set_op_attrs(bio, op, op_flags);
927 	if (metadata_op)
928 		bio->bi_iter.bi_sector = sector + rdev->sb_start;
929 	else if (rdev->mddev->reshape_position != MaxSector &&
930 		 (rdev->mddev->reshape_backwards ==
931 		  (sector >= rdev->mddev->reshape_position)))
932 		bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
933 	else
934 		bio->bi_iter.bi_sector = sector + rdev->data_offset;
935 	bio_add_page(bio, page, size, 0);
936 
937 	submit_bio_wait(bio);
938 
939 	ret = !bio->bi_status;
940 	bio_put(bio);
941 	return ret;
942 }
943 EXPORT_SYMBOL_GPL(sync_page_io);
944 
945 static int read_disk_sb(struct md_rdev *rdev, int size)
946 {
947 	char b[BDEVNAME_SIZE];
948 
949 	if (rdev->sb_loaded)
950 		return 0;
951 
952 	if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true))
953 		goto fail;
954 	rdev->sb_loaded = 1;
955 	return 0;
956 
957 fail:
958 	pr_err("md: disabled device %s, could not read superblock.\n",
959 	       bdevname(rdev->bdev,b));
960 	return -EINVAL;
961 }
962 
963 static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
964 {
965 	return	sb1->set_uuid0 == sb2->set_uuid0 &&
966 		sb1->set_uuid1 == sb2->set_uuid1 &&
967 		sb1->set_uuid2 == sb2->set_uuid2 &&
968 		sb1->set_uuid3 == sb2->set_uuid3;
969 }
970 
971 static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
972 {
973 	int ret;
974 	mdp_super_t *tmp1, *tmp2;
975 
976 	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
977 	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
978 
979 	if (!tmp1 || !tmp2) {
980 		ret = 0;
981 		goto abort;
982 	}
983 
984 	*tmp1 = *sb1;
985 	*tmp2 = *sb2;
986 
987 	/*
988 	 * nr_disks is not constant
989 	 */
990 	tmp1->nr_disks = 0;
991 	tmp2->nr_disks = 0;
992 
993 	ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
994 abort:
995 	kfree(tmp1);
996 	kfree(tmp2);
997 	return ret;
998 }
999 
/*
 * Fold a 32-bit sum down by adding its upper and lower 16-bit halves,
 * twice, so that the carry of the first addition is absorbed as well.
 */
static u32 md_csum_fold(u32 csum)
{
	u32 once = (csum >> 16) + (csum & 0xffff);

	return (once >> 16) + (once & 0xffff);
}
1005 
1006 static unsigned int calc_sb_csum(mdp_super_t *sb)
1007 {
1008 	u64 newcsum = 0;
1009 	u32 *sb32 = (u32*)sb;
1010 	int i;
1011 	unsigned int disk_csum, csum;
1012 
1013 	disk_csum = sb->sb_csum;
1014 	sb->sb_csum = 0;
1015 
1016 	for (i = 0; i < MD_SB_BYTES/4 ; i++)
1017 		newcsum += sb32[i];
1018 	csum = (newcsum & 0xffffffff) + (newcsum>>32);
1019 
1020 #ifdef CONFIG_ALPHA
1021 	/* This used to use csum_partial, which was wrong for several
1022 	 * reasons including that different results are returned on
1023 	 * different architectures.  It isn't critical that we get exactly
1024 	 * the same return value as before (we always csum_fold before
1025 	 * testing, and that removes any differences).  However as we
1026 	 * know that csum_partial always returned a 16bit value on
1027 	 * alphas, do a fold to maximise conformity to previous behaviour.
1028 	 */
1029 	sb->sb_csum = md_csum_fold(disk_csum);
1030 #else
1031 	sb->sb_csum = disk_csum;
1032 #endif
1033 	return csum;
1034 }
1035 
/*
 * Handle superblock details.
 * We want to be able to handle multiple superblock formats
 * so we have a common interface to them all, and an array of
 * different handlers.
 * We rely on user-space to write the initial superblock, and support
 * reading and updating of superblocks.
 * Interface methods are:
 *   int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
 *      loads and validates a superblock on dev.
 *      if refdev != NULL, compare superblocks on both devices
 *    Return:
 *      0 - dev has a superblock that is compatible with refdev
 *      1 - dev has a superblock that is compatible and newer than refdev
 *          so dev should be used as the refdev in future
 *     -EINVAL superblock incompatible or invalid
 *     -othererror e.g. -EIO
 *
 *   int validate_super(struct mddev *mddev, struct md_rdev *dev)
 *      Verify that dev is acceptable into mddev.
 *       The first time, mddev->raid_disks will be 0, and data from
 *       dev should be merged in.  Subsequent calls check that dev
 *       is new enough.  Return 0 or -EINVAL
 *
 *   void sync_super(struct mddev *mddev, struct md_rdev *dev)
 *     Update the superblock for rdev with data in mddev
 *     This does not write to disc.
 *
 * NOTE(review): rdev_size_change() and allow_new_offset() are not
 * described here; judging by their names they handle resizing a member
 * device and vetting a new data offset - confirm against the handlers.
 */

struct super_type  {
	char		    *name;
	struct module	    *owner;
	int		    (*load_super)(struct md_rdev *rdev,
					  struct md_rdev *refdev,
					  int minor_version);
	int		    (*validate_super)(struct mddev *mddev,
					      struct md_rdev *rdev);
	void		    (*sync_super)(struct mddev *mddev,
					  struct md_rdev *rdev);
	unsigned long long  (*rdev_size_change)(struct md_rdev *rdev,
						sector_t num_sectors);
	int		    (*allow_new_offset)(struct md_rdev *rdev,
						unsigned long long new_offset);
};
1081 
1082 /*
1083  * Check that the given mddev has no bitmap.
1084  *
1085  * This function is called from the run method of all personalities that do not
1086  * support bitmaps. It prints an error message and returns non-zero if mddev
1087  * has a bitmap. Otherwise, it returns 0.
1088  *
1089  */
1090 int md_check_no_bitmap(struct mddev *mddev)
1091 {
1092 	if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
1093 		return 0;
1094 	pr_warn("%s: bitmaps are not supported for %s\n",
1095 		mdname(mddev), mddev->pers->name);
1096 	return 1;
1097 }
1098 EXPORT_SYMBOL(md_check_no_bitmap);
1099 
1100 /*
1101  * load_super for 0.90.0
1102  */
1103 static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1104 {
1105 	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1106 	mdp_super_t *sb;
1107 	int ret;
1108 	bool spare_disk = true;
1109 
1110 	/*
1111 	 * Calculate the position of the superblock (512byte sectors),
1112 	 * it's at the end of the disk.
1113 	 *
1114 	 * It also happens to be a multiple of 4Kb.
1115 	 */
1116 	rdev->sb_start = calc_dev_sboffset(rdev);
1117 
1118 	ret = read_disk_sb(rdev, MD_SB_BYTES);
1119 	if (ret)
1120 		return ret;
1121 
1122 	ret = -EINVAL;
1123 
1124 	bdevname(rdev->bdev, b);
1125 	sb = page_address(rdev->sb_page);
1126 
1127 	if (sb->md_magic != MD_SB_MAGIC) {
1128 		pr_warn("md: invalid raid superblock magic on %s\n", b);
1129 		goto abort;
1130 	}
1131 
1132 	if (sb->major_version != 0 ||
1133 	    sb->minor_version < 90 ||
1134 	    sb->minor_version > 91) {
1135 		pr_warn("Bad version number %d.%d on %s\n",
1136 			sb->major_version, sb->minor_version, b);
1137 		goto abort;
1138 	}
1139 
1140 	if (sb->raid_disks <= 0)
1141 		goto abort;
1142 
1143 	if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
1144 		pr_warn("md: invalid superblock checksum on %s\n", b);
1145 		goto abort;
1146 	}
1147 
1148 	rdev->preferred_minor = sb->md_minor;
1149 	rdev->data_offset = 0;
1150 	rdev->new_data_offset = 0;
1151 	rdev->sb_size = MD_SB_BYTES;
1152 	rdev->badblocks.shift = -1;
1153 
1154 	if (sb->level == LEVEL_MULTIPATH)
1155 		rdev->desc_nr = -1;
1156 	else
1157 		rdev->desc_nr = sb->this_disk.number;
1158 
1159 	/* not spare disk, or LEVEL_MULTIPATH */
1160 	if (sb->level == LEVEL_MULTIPATH ||
1161 		(rdev->desc_nr >= 0 &&
1162 		 sb->disks[rdev->desc_nr].state &
1163 		 ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))))
1164 		spare_disk = false;
1165 
1166 	if (!refdev) {
1167 		if (!spare_disk)
1168 			ret = 1;
1169 		else
1170 			ret = 0;
1171 	} else {
1172 		__u64 ev1, ev2;
1173 		mdp_super_t *refsb = page_address(refdev->sb_page);
1174 		if (!md_uuid_equal(refsb, sb)) {
1175 			pr_warn("md: %s has different UUID to %s\n",
1176 				b, bdevname(refdev->bdev,b2));
1177 			goto abort;
1178 		}
1179 		if (!md_sb_equal(refsb, sb)) {
1180 			pr_warn("md: %s has same UUID but different superblock to %s\n",
1181 				b, bdevname(refdev->bdev, b2));
1182 			goto abort;
1183 		}
1184 		ev1 = md_event(sb);
1185 		ev2 = md_event(refsb);
1186 
1187 		if (!spare_disk && ev1 > ev2)
1188 			ret = 1;
1189 		else
1190 			ret = 0;
1191 	}
1192 	rdev->sectors = rdev->sb_start;
1193 	/* Limit to 4TB as metadata cannot record more than that.
1194 	 * (not needed for Linear and RAID0 as metadata doesn't
1195 	 * record this size)
1196 	 */
1197 	if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1)
1198 		rdev->sectors = (sector_t)(2ULL << 32) - 2;
1199 
1200 	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
1201 		/* "this cannot possibly happen" ... */
1202 		ret = -EINVAL;
1203 
1204  abort:
1205 	return ret;
1206 }
1207 
1208 /*
1209  * validate_super for 0.90.0
1210  */
1211 static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1212 {
1213 	mdp_disk_t *desc;
1214 	mdp_super_t *sb = page_address(rdev->sb_page);
1215 	__u64 ev1 = md_event(sb);
1216 
1217 	rdev->raid_disk = -1;
1218 	clear_bit(Faulty, &rdev->flags);
1219 	clear_bit(In_sync, &rdev->flags);
1220 	clear_bit(Bitmap_sync, &rdev->flags);
1221 	clear_bit(WriteMostly, &rdev->flags);
1222 
1223 	if (mddev->raid_disks == 0) {
1224 		mddev->major_version = 0;
1225 		mddev->minor_version = sb->minor_version;
1226 		mddev->patch_version = sb->patch_version;
1227 		mddev->external = 0;
1228 		mddev->chunk_sectors = sb->chunk_size >> 9;
1229 		mddev->ctime = sb->ctime;
1230 		mddev->utime = sb->utime;
1231 		mddev->level = sb->level;
1232 		mddev->clevel[0] = 0;
1233 		mddev->layout = sb->layout;
1234 		mddev->raid_disks = sb->raid_disks;
1235 		mddev->dev_sectors = ((sector_t)sb->size) * 2;
1236 		mddev->events = ev1;
1237 		mddev->bitmap_info.offset = 0;
1238 		mddev->bitmap_info.space = 0;
1239 		/* bitmap can use 60 K after the 4K superblocks */
1240 		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
1241 		mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
1242 		mddev->reshape_backwards = 0;
1243 
1244 		if (mddev->minor_version >= 91) {
1245 			mddev->reshape_position = sb->reshape_position;
1246 			mddev->delta_disks = sb->delta_disks;
1247 			mddev->new_level = sb->new_level;
1248 			mddev->new_layout = sb->new_layout;
1249 			mddev->new_chunk_sectors = sb->new_chunk >> 9;
1250 			if (mddev->delta_disks < 0)
1251 				mddev->reshape_backwards = 1;
1252 		} else {
1253 			mddev->reshape_position = MaxSector;
1254 			mddev->delta_disks = 0;
1255 			mddev->new_level = mddev->level;
1256 			mddev->new_layout = mddev->layout;
1257 			mddev->new_chunk_sectors = mddev->chunk_sectors;
1258 		}
1259 		if (mddev->level == 0)
1260 			mddev->layout = -1;
1261 
1262 		if (sb->state & (1<<MD_SB_CLEAN))
1263 			mddev->recovery_cp = MaxSector;
1264 		else {
1265 			if (sb->events_hi == sb->cp_events_hi &&
1266 				sb->events_lo == sb->cp_events_lo) {
1267 				mddev->recovery_cp = sb->recovery_cp;
1268 			} else
1269 				mddev->recovery_cp = 0;
1270 		}
1271 
1272 		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
1273 		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
1274 		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
1275 		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
1276 
1277 		mddev->max_disks = MD_SB_DISKS;
1278 
1279 		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
1280 		    mddev->bitmap_info.file == NULL) {
1281 			mddev->bitmap_info.offset =
1282 				mddev->bitmap_info.default_offset;
1283 			mddev->bitmap_info.space =
1284 				mddev->bitmap_info.default_space;
1285 		}
1286 
1287 	} else if (mddev->pers == NULL) {
1288 		/* Insist on good event counter while assembling, except
1289 		 * for spares (which don't need an event count) */
1290 		++ev1;
1291 		if (sb->disks[rdev->desc_nr].state & (
1292 			    (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1293 			if (ev1 < mddev->events)
1294 				return -EINVAL;
1295 	} else if (mddev->bitmap) {
1296 		/* if adding to array with a bitmap, then we can accept an
1297 		 * older device ... but not too old.
1298 		 */
1299 		if (ev1 < mddev->bitmap->events_cleared)
1300 			return 0;
1301 		if (ev1 < mddev->events)
1302 			set_bit(Bitmap_sync, &rdev->flags);
1303 	} else {
1304 		if (ev1 < mddev->events)
1305 			/* just a hot-add of a new device, leave raid_disk at -1 */
1306 			return 0;
1307 	}
1308 
1309 	if (mddev->level != LEVEL_MULTIPATH) {
1310 		desc = sb->disks + rdev->desc_nr;
1311 
1312 		if (desc->state & (1<<MD_DISK_FAULTY))
1313 			set_bit(Faulty, &rdev->flags);
1314 		else if (desc->state & (1<<MD_DISK_SYNC) /* &&
1315 			    desc->raid_disk < mddev->raid_disks */) {
1316 			set_bit(In_sync, &rdev->flags);
1317 			rdev->raid_disk = desc->raid_disk;
1318 			rdev->saved_raid_disk = desc->raid_disk;
1319 		} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
1320 			/* active but not in sync implies recovery up to
1321 			 * reshape position.  We don't know exactly where
1322 			 * that is, so set to zero for now */
1323 			if (mddev->minor_version >= 91) {
1324 				rdev->recovery_offset = 0;
1325 				rdev->raid_disk = desc->raid_disk;
1326 			}
1327 		}
1328 		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
1329 			set_bit(WriteMostly, &rdev->flags);
1330 		if (desc->state & (1<<MD_DISK_FAILFAST))
1331 			set_bit(FailFast, &rdev->flags);
1332 	} else /* MULTIPATH are always insync */
1333 		set_bit(In_sync, &rdev->flags);
1334 	return 0;
1335 }
1336 
1337 /*
1338  * sync_super for 0.90.0
1339  */
1340 static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
1341 {
1342 	mdp_super_t *sb;
1343 	struct md_rdev *rdev2;
1344 	int next_spare = mddev->raid_disks;
1345 
1346 	/* make rdev->sb match mddev data..
1347 	 *
1348 	 * 1/ zero out disks
1349 	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
1350 	 * 3/ any empty disks < next_spare become removed
1351 	 *
1352 	 * disks[0] gets initialised to REMOVED because
1353 	 * we cannot be sure from other fields if it has
1354 	 * been initialised or not.
1355 	 */
1356 	int i;
1357 	int active=0, working=0,failed=0,spare=0,nr_disks=0;
1358 
1359 	rdev->sb_size = MD_SB_BYTES;
1360 
1361 	sb = page_address(rdev->sb_page);
1362 
1363 	memset(sb, 0, sizeof(*sb));
1364 
1365 	sb->md_magic = MD_SB_MAGIC;
1366 	sb->major_version = mddev->major_version;
1367 	sb->patch_version = mddev->patch_version;
1368 	sb->gvalid_words  = 0; /* ignored */
1369 	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
1370 	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
1371 	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
1372 	memcpy(&sb->set_uuid3, mddev->uuid+12,4);
1373 
1374 	sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
1375 	sb->level = mddev->level;
1376 	sb->size = mddev->dev_sectors / 2;
1377 	sb->raid_disks = mddev->raid_disks;
1378 	sb->md_minor = mddev->md_minor;
1379 	sb->not_persistent = 0;
1380 	sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
1381 	sb->state = 0;
1382 	sb->events_hi = (mddev->events>>32);
1383 	sb->events_lo = (u32)mddev->events;
1384 
1385 	if (mddev->reshape_position == MaxSector)
1386 		sb->minor_version = 90;
1387 	else {
1388 		sb->minor_version = 91;
1389 		sb->reshape_position = mddev->reshape_position;
1390 		sb->new_level = mddev->new_level;
1391 		sb->delta_disks = mddev->delta_disks;
1392 		sb->new_layout = mddev->new_layout;
1393 		sb->new_chunk = mddev->new_chunk_sectors << 9;
1394 	}
1395 	mddev->minor_version = sb->minor_version;
1396 	if (mddev->in_sync)
1397 	{
1398 		sb->recovery_cp = mddev->recovery_cp;
1399 		sb->cp_events_hi = (mddev->events>>32);
1400 		sb->cp_events_lo = (u32)mddev->events;
1401 		if (mddev->recovery_cp == MaxSector)
1402 			sb->state = (1<< MD_SB_CLEAN);
1403 	} else
1404 		sb->recovery_cp = 0;
1405 
1406 	sb->layout = mddev->layout;
1407 	sb->chunk_size = mddev->chunk_sectors << 9;
1408 
1409 	if (mddev->bitmap && mddev->bitmap_info.file == NULL)
1410 		sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1411 
1412 	sb->disks[0].state = (1<<MD_DISK_REMOVED);
1413 	rdev_for_each(rdev2, mddev) {
1414 		mdp_disk_t *d;
1415 		int desc_nr;
1416 		int is_active = test_bit(In_sync, &rdev2->flags);
1417 
1418 		if (rdev2->raid_disk >= 0 &&
1419 		    sb->minor_version >= 91)
1420 			/* we have nowhere to store the recovery_offset,
1421 			 * but if it is not below the reshape_position,
1422 			 * we can piggy-back on that.
1423 			 */
1424 			is_active = 1;
1425 		if (rdev2->raid_disk < 0 ||
1426 		    test_bit(Faulty, &rdev2->flags))
1427 			is_active = 0;
1428 		if (is_active)
1429 			desc_nr = rdev2->raid_disk;
1430 		else
1431 			desc_nr = next_spare++;
1432 		rdev2->desc_nr = desc_nr;
1433 		d = &sb->disks[rdev2->desc_nr];
1434 		nr_disks++;
1435 		d->number = rdev2->desc_nr;
1436 		d->major = MAJOR(rdev2->bdev->bd_dev);
1437 		d->minor = MINOR(rdev2->bdev->bd_dev);
1438 		if (is_active)
1439 			d->raid_disk = rdev2->raid_disk;
1440 		else
1441 			d->raid_disk = rdev2->desc_nr; /* compatibility */
1442 		if (test_bit(Faulty, &rdev2->flags))
1443 			d->state = (1<<MD_DISK_FAULTY);
1444 		else if (is_active) {
1445 			d->state = (1<<MD_DISK_ACTIVE);
1446 			if (test_bit(In_sync, &rdev2->flags))
1447 				d->state |= (1<<MD_DISK_SYNC);
1448 			active++;
1449 			working++;
1450 		} else {
1451 			d->state = 0;
1452 			spare++;
1453 			working++;
1454 		}
1455 		if (test_bit(WriteMostly, &rdev2->flags))
1456 			d->state |= (1<<MD_DISK_WRITEMOSTLY);
1457 		if (test_bit(FailFast, &rdev2->flags))
1458 			d->state |= (1<<MD_DISK_FAILFAST);
1459 	}
1460 	/* now set the "removed" and "faulty" bits on any missing devices */
1461 	for (i=0 ; i < mddev->raid_disks ; i++) {
1462 		mdp_disk_t *d = &sb->disks[i];
1463 		if (d->state == 0 && d->number == 0) {
1464 			d->number = i;
1465 			d->raid_disk = i;
1466 			d->state = (1<<MD_DISK_REMOVED);
1467 			d->state |= (1<<MD_DISK_FAULTY);
1468 			failed++;
1469 		}
1470 	}
1471 	sb->nr_disks = nr_disks;
1472 	sb->active_disks = active;
1473 	sb->working_disks = working;
1474 	sb->failed_disks = failed;
1475 	sb->spare_disks = spare;
1476 
1477 	sb->this_disk = sb->disks[rdev->desc_nr];
1478 	sb->sb_csum = calc_sb_csum(sb);
1479 }
1480 
1481 /*
1482  * rdev_size_change for 0.90.0
1483  */
1484 static unsigned long long
1485 super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1486 {
1487 	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1488 		return 0; /* component must fit device */
1489 	if (rdev->mddev->bitmap_info.offset)
1490 		return 0; /* can't move bitmap */
1491 	rdev->sb_start = calc_dev_sboffset(rdev);
1492 	if (!num_sectors || num_sectors > rdev->sb_start)
1493 		num_sectors = rdev->sb_start;
1494 	/* Limit to 4TB as metadata cannot record more than that.
1495 	 * 4TB == 2^32 KB, or 2*2^32 sectors.
1496 	 */
1497 	if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
1498 		num_sectors = (sector_t)(2ULL << 32) - 2;
1499 	do {
1500 		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1501 		       rdev->sb_page);
1502 	} while (md_super_wait(rdev->mddev) < 0);
1503 	return num_sectors;
1504 }
1505 
/*
 * allow_new_offset for 0.90.0
 *
 * v0.90 cannot express a non-zero data offset, so 0 is the only value
 * accepted.  Returns 1 if @new_offset is allowed, 0 otherwise; @rdev is
 * not examined.
 */
static int
super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
{
	if (new_offset != 0)
		return 0;
	return 1;
}
1512 
1513 /*
1514  * version 1 superblock
1515  */
1516 
1517 static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
1518 {
1519 	__le32 disk_csum;
1520 	u32 csum;
1521 	unsigned long long newcsum;
1522 	int size = 256 + le32_to_cpu(sb->max_dev)*2;
1523 	__le32 *isuper = (__le32*)sb;
1524 
1525 	disk_csum = sb->sb_csum;
1526 	sb->sb_csum = 0;
1527 	newcsum = 0;
1528 	for (; size >= 4; size -= 4)
1529 		newcsum += le32_to_cpu(*isuper++);
1530 
1531 	if (size == 2)
1532 		newcsum += le16_to_cpu(*(__le16*) isuper);
1533 
1534 	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1535 	sb->sb_csum = disk_csum;
1536 	return cpu_to_le32(csum);
1537 }
1538 
1539 static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1540 {
1541 	struct mdp_superblock_1 *sb;
1542 	int ret;
1543 	sector_t sb_start;
1544 	sector_t sectors;
1545 	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1546 	int bmask;
1547 	bool spare_disk = true;
1548 
1549 	/*
1550 	 * Calculate the position of the superblock in 512byte sectors.
1551 	 * It is always aligned to a 4K boundary and
1552 	 * depeding on minor_version, it can be:
1553 	 * 0: At least 8K, but less than 12K, from end of device
1554 	 * 1: At start of device
1555 	 * 2: 4K from start of device.
1556 	 */
1557 	switch(minor_version) {
1558 	case 0:
1559 		sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
1560 		sb_start -= 8*2;
1561 		sb_start &= ~(sector_t)(4*2-1);
1562 		break;
1563 	case 1:
1564 		sb_start = 0;
1565 		break;
1566 	case 2:
1567 		sb_start = 8;
1568 		break;
1569 	default:
1570 		return -EINVAL;
1571 	}
1572 	rdev->sb_start = sb_start;
1573 
1574 	/* superblock is rarely larger than 1K, but it can be larger,
1575 	 * and it is safe to read 4k, so we do that
1576 	 */
1577 	ret = read_disk_sb(rdev, 4096);
1578 	if (ret) return ret;
1579 
1580 	sb = page_address(rdev->sb_page);
1581 
1582 	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1583 	    sb->major_version != cpu_to_le32(1) ||
1584 	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1585 	    le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1586 	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1587 		return -EINVAL;
1588 
1589 	if (calc_sb_1_csum(sb) != sb->sb_csum) {
1590 		pr_warn("md: invalid superblock checksum on %s\n",
1591 			bdevname(rdev->bdev,b));
1592 		return -EINVAL;
1593 	}
1594 	if (le64_to_cpu(sb->data_size) < 10) {
1595 		pr_warn("md: data_size too small on %s\n",
1596 			bdevname(rdev->bdev,b));
1597 		return -EINVAL;
1598 	}
1599 	if (sb->pad0 ||
1600 	    sb->pad3[0] ||
1601 	    memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
1602 		/* Some padding is non-zero, might be a new feature */
1603 		return -EINVAL;
1604 
1605 	rdev->preferred_minor = 0xffff;
1606 	rdev->data_offset = le64_to_cpu(sb->data_offset);
1607 	rdev->new_data_offset = rdev->data_offset;
1608 	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
1609 	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
1610 		rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
1611 	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1612 
1613 	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1614 	bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1615 	if (rdev->sb_size & bmask)
1616 		rdev->sb_size = (rdev->sb_size | bmask) + 1;
1617 
1618 	if (minor_version
1619 	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
1620 		return -EINVAL;
1621 	if (minor_version
1622 	    && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
1623 		return -EINVAL;
1624 
1625 	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1626 		rdev->desc_nr = -1;
1627 	else
1628 		rdev->desc_nr = le32_to_cpu(sb->dev_number);
1629 
1630 	if (!rdev->bb_page) {
1631 		rdev->bb_page = alloc_page(GFP_KERNEL);
1632 		if (!rdev->bb_page)
1633 			return -ENOMEM;
1634 	}
1635 	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
1636 	    rdev->badblocks.count == 0) {
1637 		/* need to load the bad block list.
1638 		 * Currently we limit it to one page.
1639 		 */
1640 		s32 offset;
1641 		sector_t bb_sector;
1642 		__le64 *bbp;
1643 		int i;
1644 		int sectors = le16_to_cpu(sb->bblog_size);
1645 		if (sectors > (PAGE_SIZE / 512))
1646 			return -EINVAL;
1647 		offset = le32_to_cpu(sb->bblog_offset);
1648 		if (offset == 0)
1649 			return -EINVAL;
1650 		bb_sector = (long long)offset;
1651 		if (!sync_page_io(rdev, bb_sector, sectors << 9,
1652 				  rdev->bb_page, REQ_OP_READ, 0, true))
1653 			return -EIO;
1654 		bbp = (__le64 *)page_address(rdev->bb_page);
1655 		rdev->badblocks.shift = sb->bblog_shift;
1656 		for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
1657 			u64 bb = le64_to_cpu(*bbp);
1658 			int count = bb & (0x3ff);
1659 			u64 sector = bb >> 10;
1660 			sector <<= sb->bblog_shift;
1661 			count <<= sb->bblog_shift;
1662 			if (bb + 1 == 0)
1663 				break;
1664 			if (badblocks_set(&rdev->badblocks, sector, count, 1))
1665 				return -EINVAL;
1666 		}
1667 	} else if (sb->bblog_offset != 0)
1668 		rdev->badblocks.shift = 0;
1669 
1670 	if ((le32_to_cpu(sb->feature_map) &
1671 	    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) {
1672 		rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
1673 		rdev->ppl.size = le16_to_cpu(sb->ppl.size);
1674 		rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
1675 	}
1676 
1677 	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) &&
1678 	    sb->level != 0)
1679 		return -EINVAL;
1680 
1681 	/* not spare disk, or LEVEL_MULTIPATH */
1682 	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH) ||
1683 		(rdev->desc_nr >= 0 &&
1684 		rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1685 		(le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
1686 		 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)))
1687 		spare_disk = false;
1688 
1689 	if (!refdev) {
1690 		if (!spare_disk)
1691 			ret = 1;
1692 		else
1693 			ret = 0;
1694 	} else {
1695 		__u64 ev1, ev2;
1696 		struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
1697 
1698 		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1699 		    sb->level != refsb->level ||
1700 		    sb->layout != refsb->layout ||
1701 		    sb->chunksize != refsb->chunksize) {
1702 			pr_warn("md: %s has strangely different superblock to %s\n",
1703 				bdevname(rdev->bdev,b),
1704 				bdevname(refdev->bdev,b2));
1705 			return -EINVAL;
1706 		}
1707 		ev1 = le64_to_cpu(sb->events);
1708 		ev2 = le64_to_cpu(refsb->events);
1709 
1710 		if (!spare_disk && ev1 > ev2)
1711 			ret = 1;
1712 		else
1713 			ret = 0;
1714 	}
1715 	if (minor_version) {
1716 		sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
1717 		sectors -= rdev->data_offset;
1718 	} else
1719 		sectors = rdev->sb_start;
1720 	if (sectors < le64_to_cpu(sb->data_size))
1721 		return -EINVAL;
1722 	rdev->sectors = le64_to_cpu(sb->data_size);
1723 	return ret;
1724 }
1725 
1726 static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1727 {
1728 	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1729 	__u64 ev1 = le64_to_cpu(sb->events);
1730 
1731 	rdev->raid_disk = -1;
1732 	clear_bit(Faulty, &rdev->flags);
1733 	clear_bit(In_sync, &rdev->flags);
1734 	clear_bit(Bitmap_sync, &rdev->flags);
1735 	clear_bit(WriteMostly, &rdev->flags);
1736 
1737 	if (mddev->raid_disks == 0) {
1738 		mddev->major_version = 1;
1739 		mddev->patch_version = 0;
1740 		mddev->external = 0;
1741 		mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1742 		mddev->ctime = le64_to_cpu(sb->ctime);
1743 		mddev->utime = le64_to_cpu(sb->utime);
1744 		mddev->level = le32_to_cpu(sb->level);
1745 		mddev->clevel[0] = 0;
1746 		mddev->layout = le32_to_cpu(sb->layout);
1747 		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1748 		mddev->dev_sectors = le64_to_cpu(sb->size);
1749 		mddev->events = ev1;
1750 		mddev->bitmap_info.offset = 0;
1751 		mddev->bitmap_info.space = 0;
1752 		/* Default location for bitmap is 1K after superblock
1753 		 * using 3K - total of 4K
1754 		 */
1755 		mddev->bitmap_info.default_offset = 1024 >> 9;
1756 		mddev->bitmap_info.default_space = (4096-1024) >> 9;
1757 		mddev->reshape_backwards = 0;
1758 
1759 		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1760 		memcpy(mddev->uuid, sb->set_uuid, 16);
1761 
1762 		mddev->max_disks =  (4096-256)/2;
1763 
1764 		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1765 		    mddev->bitmap_info.file == NULL) {
1766 			mddev->bitmap_info.offset =
1767 				(__s32)le32_to_cpu(sb->bitmap_offset);
1768 			/* Metadata doesn't record how much space is available.
1769 			 * For 1.0, we assume we can use up to the superblock
1770 			 * if before, else to 4K beyond superblock.
1771 			 * For others, assume no change is possible.
1772 			 */
1773 			if (mddev->minor_version > 0)
1774 				mddev->bitmap_info.space = 0;
1775 			else if (mddev->bitmap_info.offset > 0)
1776 				mddev->bitmap_info.space =
1777 					8 - mddev->bitmap_info.offset;
1778 			else
1779 				mddev->bitmap_info.space =
1780 					-mddev->bitmap_info.offset;
1781 		}
1782 
1783 		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1784 			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1785 			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1786 			mddev->new_level = le32_to_cpu(sb->new_level);
1787 			mddev->new_layout = le32_to_cpu(sb->new_layout);
1788 			mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1789 			if (mddev->delta_disks < 0 ||
1790 			    (mddev->delta_disks == 0 &&
1791 			     (le32_to_cpu(sb->feature_map)
1792 			      & MD_FEATURE_RESHAPE_BACKWARDS)))
1793 				mddev->reshape_backwards = 1;
1794 		} else {
1795 			mddev->reshape_position = MaxSector;
1796 			mddev->delta_disks = 0;
1797 			mddev->new_level = mddev->level;
1798 			mddev->new_layout = mddev->layout;
1799 			mddev->new_chunk_sectors = mddev->chunk_sectors;
1800 		}
1801 
1802 		if (mddev->level == 0 &&
1803 		    !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT))
1804 			mddev->layout = -1;
1805 
1806 		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
1807 			set_bit(MD_HAS_JOURNAL, &mddev->flags);
1808 
1809 		if (le32_to_cpu(sb->feature_map) &
1810 		    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) {
1811 			if (le32_to_cpu(sb->feature_map) &
1812 			    (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
1813 				return -EINVAL;
1814 			if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) &&
1815 			    (le32_to_cpu(sb->feature_map) &
1816 					    MD_FEATURE_MULTIPLE_PPLS))
1817 				return -EINVAL;
1818 			set_bit(MD_HAS_PPL, &mddev->flags);
1819 		}
1820 	} else if (mddev->pers == NULL) {
1821 		/* Insist of good event counter while assembling, except for
1822 		 * spares (which don't need an event count) */
1823 		++ev1;
1824 		if (rdev->desc_nr >= 0 &&
1825 		    rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1826 		    (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
1827 		     le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
1828 			if (ev1 < mddev->events)
1829 				return -EINVAL;
1830 	} else if (mddev->bitmap) {
1831 		/* If adding to array with a bitmap, then we can accept an
1832 		 * older device, but not too old.
1833 		 */
1834 		if (ev1 < mddev->bitmap->events_cleared)
1835 			return 0;
1836 		if (ev1 < mddev->events)
1837 			set_bit(Bitmap_sync, &rdev->flags);
1838 	} else {
1839 		if (ev1 < mddev->events)
1840 			/* just a hot-add of a new device, leave raid_disk at -1 */
1841 			return 0;
1842 	}
1843 	if (mddev->level != LEVEL_MULTIPATH) {
1844 		int role;
1845 		if (rdev->desc_nr < 0 ||
1846 		    rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1847 			role = MD_DISK_ROLE_SPARE;
1848 			rdev->desc_nr = -1;
1849 		} else
1850 			role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1851 		switch(role) {
1852 		case MD_DISK_ROLE_SPARE: /* spare */
1853 			break;
1854 		case MD_DISK_ROLE_FAULTY: /* faulty */
1855 			set_bit(Faulty, &rdev->flags);
1856 			break;
1857 		case MD_DISK_ROLE_JOURNAL: /* journal device */
1858 			if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
1859 				/* journal device without journal feature */
1860 				pr_warn("md: journal device provided without journal feature, ignoring the device\n");
1861 				return -EINVAL;
1862 			}
1863 			set_bit(Journal, &rdev->flags);
1864 			rdev->journal_tail = le64_to_cpu(sb->journal_tail);
1865 			rdev->raid_disk = 0;
1866 			break;
1867 		default:
1868 			rdev->saved_raid_disk = role;
1869 			if ((le32_to_cpu(sb->feature_map) &
1870 			     MD_FEATURE_RECOVERY_OFFSET)) {
1871 				rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1872 				if (!(le32_to_cpu(sb->feature_map) &
1873 				      MD_FEATURE_RECOVERY_BITMAP))
1874 					rdev->saved_raid_disk = -1;
1875 			} else {
1876 				/*
1877 				 * If the array is FROZEN, then the device can't
1878 				 * be in_sync with rest of array.
1879 				 */
1880 				if (!test_bit(MD_RECOVERY_FROZEN,
1881 					      &mddev->recovery))
1882 					set_bit(In_sync, &rdev->flags);
1883 			}
1884 			rdev->raid_disk = role;
1885 			break;
1886 		}
1887 		if (sb->devflags & WriteMostly1)
1888 			set_bit(WriteMostly, &rdev->flags);
1889 		if (sb->devflags & FailFast1)
1890 			set_bit(FailFast, &rdev->flags);
1891 		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
1892 			set_bit(Replacement, &rdev->flags);
1893 	} else /* MULTIPATH are always insync */
1894 		set_bit(In_sync, &rdev->flags);
1895 
1896 	return 0;
1897 }
1898 
1899 static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1900 {
1901 	struct mdp_superblock_1 *sb;
1902 	struct md_rdev *rdev2;
1903 	int max_dev, i;
1904 	/* make rdev->sb match mddev and rdev data. */
1905 
1906 	sb = page_address(rdev->sb_page);
1907 
1908 	sb->feature_map = 0;
1909 	sb->pad0 = 0;
1910 	sb->recovery_offset = cpu_to_le64(0);
1911 	memset(sb->pad3, 0, sizeof(sb->pad3));
1912 
1913 	sb->utime = cpu_to_le64((__u64)mddev->utime);
1914 	sb->events = cpu_to_le64(mddev->events);
1915 	if (mddev->in_sync)
1916 		sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1917 	else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
1918 		sb->resync_offset = cpu_to_le64(MaxSector);
1919 	else
1920 		sb->resync_offset = cpu_to_le64(0);
1921 
1922 	sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
1923 
1924 	sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1925 	sb->size = cpu_to_le64(mddev->dev_sectors);
1926 	sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
1927 	sb->level = cpu_to_le32(mddev->level);
1928 	sb->layout = cpu_to_le32(mddev->layout);
1929 	if (test_bit(FailFast, &rdev->flags))
1930 		sb->devflags |= FailFast1;
1931 	else
1932 		sb->devflags &= ~FailFast1;
1933 
1934 	if (test_bit(WriteMostly, &rdev->flags))
1935 		sb->devflags |= WriteMostly1;
1936 	else
1937 		sb->devflags &= ~WriteMostly1;
1938 	sb->data_offset = cpu_to_le64(rdev->data_offset);
1939 	sb->data_size = cpu_to_le64(rdev->sectors);
1940 
1941 	if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
1942 		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
1943 		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1944 	}
1945 
1946 	if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
1947 	    !test_bit(In_sync, &rdev->flags)) {
1948 		sb->feature_map |=
1949 			cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1950 		sb->recovery_offset =
1951 			cpu_to_le64(rdev->recovery_offset);
1952 		if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
1953 			sb->feature_map |=
1954 				cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
1955 	}
1956 	/* Note: recovery_offset and journal_tail share space  */
1957 	if (test_bit(Journal, &rdev->flags))
1958 		sb->journal_tail = cpu_to_le64(rdev->journal_tail);
1959 	if (test_bit(Replacement, &rdev->flags))
1960 		sb->feature_map |=
1961 			cpu_to_le32(MD_FEATURE_REPLACEMENT);
1962 
1963 	if (mddev->reshape_position != MaxSector) {
1964 		sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1965 		sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1966 		sb->new_layout = cpu_to_le32(mddev->new_layout);
1967 		sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1968 		sb->new_level = cpu_to_le32(mddev->new_level);
1969 		sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
1970 		if (mddev->delta_disks == 0 &&
1971 		    mddev->reshape_backwards)
1972 			sb->feature_map
1973 				|= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
1974 		if (rdev->new_data_offset != rdev->data_offset) {
1975 			sb->feature_map
1976 				|= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
1977 			sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
1978 							     - rdev->data_offset));
1979 		}
1980 	}
1981 
1982 	if (mddev_is_clustered(mddev))
1983 		sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);
1984 
1985 	if (rdev->badblocks.count == 0)
1986 		/* Nothing to do for bad blocks*/ ;
1987 	else if (sb->bblog_offset == 0)
1988 		/* Cannot record bad blocks on this device */
1989 		md_error(mddev, rdev);
1990 	else {
1991 		struct badblocks *bb = &rdev->badblocks;
1992 		__le64 *bbp = (__le64 *)page_address(rdev->bb_page);
1993 		u64 *p = bb->page;
1994 		sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
1995 		if (bb->changed) {
1996 			unsigned seq;
1997 
1998 retry:
1999 			seq = read_seqbegin(&bb->lock);
2000 
2001 			memset(bbp, 0xff, PAGE_SIZE);
2002 
2003 			for (i = 0 ; i < bb->count ; i++) {
2004 				u64 internal_bb = p[i];
2005 				u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
2006 						| BB_LEN(internal_bb));
2007 				bbp[i] = cpu_to_le64(store_bb);
2008 			}
2009 			bb->changed = 0;
2010 			if (read_seqretry(&bb->lock, seq))
2011 				goto retry;
2012 
2013 			bb->sector = (rdev->sb_start +
2014 				      (int)le32_to_cpu(sb->bblog_offset));
2015 			bb->size = le16_to_cpu(sb->bblog_size);
2016 		}
2017 	}
2018 
2019 	max_dev = 0;
2020 	rdev_for_each(rdev2, mddev)
2021 		if (rdev2->desc_nr+1 > max_dev)
2022 			max_dev = rdev2->desc_nr+1;
2023 
2024 	if (max_dev > le32_to_cpu(sb->max_dev)) {
2025 		int bmask;
2026 		sb->max_dev = cpu_to_le32(max_dev);
2027 		rdev->sb_size = max_dev * 2 + 256;
2028 		bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
2029 		if (rdev->sb_size & bmask)
2030 			rdev->sb_size = (rdev->sb_size | bmask) + 1;
2031 	} else
2032 		max_dev = le32_to_cpu(sb->max_dev);
2033 
2034 	for (i=0; i<max_dev;i++)
2035 		sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
2036 
2037 	if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
2038 		sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
2039 
2040 	if (test_bit(MD_HAS_PPL, &mddev->flags)) {
2041 		if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags))
2042 			sb->feature_map |=
2043 			    cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS);
2044 		else
2045 			sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
2046 		sb->ppl.offset = cpu_to_le16(rdev->ppl.offset);
2047 		sb->ppl.size = cpu_to_le16(rdev->ppl.size);
2048 	}
2049 
2050 	rdev_for_each(rdev2, mddev) {
2051 		i = rdev2->desc_nr;
2052 		if (test_bit(Faulty, &rdev2->flags))
2053 			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
2054 		else if (test_bit(In_sync, &rdev2->flags))
2055 			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
2056 		else if (test_bit(Journal, &rdev2->flags))
2057 			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
2058 		else if (rdev2->raid_disk >= 0)
2059 			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
2060 		else
2061 			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
2062 	}
2063 
2064 	sb->sb_csum = calc_sb_1_csum(sb);
2065 }
2066 
2067 static unsigned long long
2068 super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
2069 {
2070 	struct mdp_superblock_1 *sb;
2071 	sector_t max_sectors;
2072 	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
2073 		return 0; /* component must fit device */
2074 	if (rdev->data_offset != rdev->new_data_offset)
2075 		return 0; /* too confusing */
2076 	if (rdev->sb_start < rdev->data_offset) {
2077 		/* minor versions 1 and 2; superblock before data */
2078 		max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
2079 		max_sectors -= rdev->data_offset;
2080 		if (!num_sectors || num_sectors > max_sectors)
2081 			num_sectors = max_sectors;
2082 	} else if (rdev->mddev->bitmap_info.offset) {
2083 		/* minor version 0 with bitmap we can't move */
2084 		return 0;
2085 	} else {
2086 		/* minor version 0; superblock after data */
2087 		sector_t sb_start;
2088 		sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
2089 		sb_start &= ~(sector_t)(4*2 - 1);
2090 		max_sectors = rdev->sectors + sb_start - rdev->sb_start;
2091 		if (!num_sectors || num_sectors > max_sectors)
2092 			num_sectors = max_sectors;
2093 		rdev->sb_start = sb_start;
2094 	}
2095 	sb = page_address(rdev->sb_page);
2096 	sb->data_size = cpu_to_le64(num_sectors);
2097 	sb->super_offset = cpu_to_le64(rdev->sb_start);
2098 	sb->sb_csum = calc_sb_1_csum(sb);
2099 	do {
2100 		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
2101 			       rdev->sb_page);
2102 	} while (md_super_wait(rdev->mddev) < 0);
2103 	return num_sectors;
2104 
2105 }
2106 
2107 static int
2108 super_1_allow_new_offset(struct md_rdev *rdev,
2109 			 unsigned long long new_offset)
2110 {
2111 	/* All necessary checks on new >= old have been done */
2112 	struct bitmap *bitmap;
2113 	if (new_offset >= rdev->data_offset)
2114 		return 1;
2115 
2116 	/* with 1.0 metadata, there is no metadata to tread on
2117 	 * so we can always move back */
2118 	if (rdev->mddev->minor_version == 0)
2119 		return 1;
2120 
2121 	/* otherwise we must be sure not to step on
2122 	 * any metadata, so stay:
2123 	 * 36K beyond start of superblock
2124 	 * beyond end of badblocks
2125 	 * beyond write-intent bitmap
2126 	 */
2127 	if (rdev->sb_start + (32+4)*2 > new_offset)
2128 		return 0;
2129 	bitmap = rdev->mddev->bitmap;
2130 	if (bitmap && !rdev->mddev->bitmap_info.file &&
2131 	    rdev->sb_start + rdev->mddev->bitmap_info.offset +
2132 	    bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
2133 		return 0;
2134 	if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
2135 		return 0;
2136 
2137 	return 1;
2138 }
2139 
/*
 * Handlers for the supported on-disk superblock formats.  The table is
 * indexed by mddev->major_version (see sync_super() and add_bound_rdev()):
 * entry 0 holds the super_90_* handlers, entry 1 the super_1_* handlers.
 */
static struct super_type super_types[] = {
	[0] = {
		.name	= "0.90.0",
		.owner	= THIS_MODULE,
		.load_super	    = super_90_load,
		.validate_super	    = super_90_validate,
		.sync_super	    = super_90_sync,
		.rdev_size_change   = super_90_rdev_size_change,
		.allow_new_offset   = super_90_allow_new_offset,
	},
	[1] = {
		.name	= "md-1",
		.owner	= THIS_MODULE,
		.load_super	    = super_1_load,
		.validate_super	    = super_1_validate,
		.sync_super	    = super_1_sync,
		.rdev_size_change   = super_1_rdev_size_change,
		.allow_new_offset   = super_1_allow_new_offset,
	},
};
2160 
2161 static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
2162 {
2163 	if (mddev->sync_super) {
2164 		mddev->sync_super(mddev, rdev);
2165 		return;
2166 	}
2167 
2168 	BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));
2169 
2170 	super_types[mddev->major_version].sync_super(mddev, rdev);
2171 }
2172 
2173 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
2174 {
2175 	struct md_rdev *rdev, *rdev2;
2176 
2177 	rcu_read_lock();
2178 	rdev_for_each_rcu(rdev, mddev1) {
2179 		if (test_bit(Faulty, &rdev->flags) ||
2180 		    test_bit(Journal, &rdev->flags) ||
2181 		    rdev->raid_disk == -1)
2182 			continue;
2183 		rdev_for_each_rcu(rdev2, mddev2) {
2184 			if (test_bit(Faulty, &rdev2->flags) ||
2185 			    test_bit(Journal, &rdev2->flags) ||
2186 			    rdev2->raid_disk == -1)
2187 				continue;
2188 			if (rdev->bdev->bd_contains ==
2189 			    rdev2->bdev->bd_contains) {
2190 				rcu_read_unlock();
2191 				return 1;
2192 			}
2193 		}
2194 	}
2195 	rcu_read_unlock();
2196 	return 0;
2197 }
2198 
/*
 * NOTE(review): presumably the list of component devices waiting to be
 * assembled into arrays; its users are not visible near this definition,
 * so confirm against them.
 */
static LIST_HEAD(pending_raid_disks);
2200 
2201 /*
2202  * Try to register data integrity profile for an mddev
2203  *
2204  * This is called when an array is started and after a disk has been kicked
2205  * from the array. It only succeeds if all working and active component devices
2206  * are integrity capable with matching profiles.
2207  */
2208 int md_integrity_register(struct mddev *mddev)
2209 {
2210 	struct md_rdev *rdev, *reference = NULL;
2211 
2212 	if (list_empty(&mddev->disks))
2213 		return 0; /* nothing to do */
2214 	if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
2215 		return 0; /* shouldn't register, or already is */
2216 	rdev_for_each(rdev, mddev) {
2217 		/* skip spares and non-functional disks */
2218 		if (test_bit(Faulty, &rdev->flags))
2219 			continue;
2220 		if (rdev->raid_disk < 0)
2221 			continue;
2222 		if (!reference) {
2223 			/* Use the first rdev as the reference */
2224 			reference = rdev;
2225 			continue;
2226 		}
2227 		/* does this rdev's profile match the reference profile? */
2228 		if (blk_integrity_compare(reference->bdev->bd_disk,
2229 				rdev->bdev->bd_disk) < 0)
2230 			return -EINVAL;
2231 	}
2232 	if (!reference || !bdev_get_integrity(reference->bdev))
2233 		return 0;
2234 	/*
2235 	 * All component devices are integrity capable and have matching
2236 	 * profiles, register the common profile for the md device.
2237 	 */
2238 	blk_integrity_register(mddev->gendisk,
2239 			       bdev_get_integrity(reference->bdev));
2240 
2241 	pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
2242 	if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE)) {
2243 		pr_err("md: failed to create integrity pool for %s\n",
2244 		       mdname(mddev));
2245 		return -EINVAL;
2246 	}
2247 	return 0;
2248 }
2249 EXPORT_SYMBOL(md_integrity_register);
2250 
2251 /*
2252  * Attempt to add an rdev, but only if it is consistent with the current
2253  * integrity profile
2254  */
2255 int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
2256 {
2257 	struct blk_integrity *bi_mddev;
2258 	char name[BDEVNAME_SIZE];
2259 
2260 	if (!mddev->gendisk)
2261 		return 0;
2262 
2263 	bi_mddev = blk_get_integrity(mddev->gendisk);
2264 
2265 	if (!bi_mddev) /* nothing to do */
2266 		return 0;
2267 
2268 	if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) {
2269 		pr_err("%s: incompatible integrity profile for %s\n",
2270 		       mdname(mddev), bdevname(rdev->bdev, name));
2271 		return -ENXIO;
2272 	}
2273 
2274 	return 0;
2275 }
2276 EXPORT_SYMBOL(md_integrity_add_rdev);
2277 
2278 static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
2279 {
2280 	char b[BDEVNAME_SIZE];
2281 	struct kobject *ko;
2282 	int err;
2283 
2284 	/* prevent duplicates */
2285 	if (find_rdev(mddev, rdev->bdev->bd_dev))
2286 		return -EEXIST;
2287 
2288 	if ((bdev_read_only(rdev->bdev) || bdev_read_only(rdev->meta_bdev)) &&
2289 	    mddev->pers)
2290 		return -EROFS;
2291 
2292 	/* make sure rdev->sectors exceeds mddev->dev_sectors */
2293 	if (!test_bit(Journal, &rdev->flags) &&
2294 	    rdev->sectors &&
2295 	    (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) {
2296 		if (mddev->pers) {
2297 			/* Cannot change size, so fail
2298 			 * If mddev->level <= 0, then we don't care
2299 			 * about aligning sizes (e.g. linear)
2300 			 */
2301 			if (mddev->level > 0)
2302 				return -ENOSPC;
2303 		} else
2304 			mddev->dev_sectors = rdev->sectors;
2305 	}
2306 
2307 	/* Verify rdev->desc_nr is unique.
2308 	 * If it is -1, assign a free number, else
2309 	 * check number is not in use
2310 	 */
2311 	rcu_read_lock();
2312 	if (rdev->desc_nr < 0) {
2313 		int choice = 0;
2314 		if (mddev->pers)
2315 			choice = mddev->raid_disks;
2316 		while (md_find_rdev_nr_rcu(mddev, choice))
2317 			choice++;
2318 		rdev->desc_nr = choice;
2319 	} else {
2320 		if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
2321 			rcu_read_unlock();
2322 			return -EBUSY;
2323 		}
2324 	}
2325 	rcu_read_unlock();
2326 	if (!test_bit(Journal, &rdev->flags) &&
2327 	    mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
2328 		pr_warn("md: %s: array is limited to %d devices\n",
2329 			mdname(mddev), mddev->max_disks);
2330 		return -EBUSY;
2331 	}
2332 	bdevname(rdev->bdev,b);
2333 	strreplace(b, '/', '!');
2334 
2335 	rdev->mddev = mddev;
2336 	pr_debug("md: bind<%s>\n", b);
2337 
2338 	if (mddev->raid_disks)
2339 		mddev_create_wb_pool(mddev, rdev, false);
2340 
2341 	if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
2342 		goto fail;
2343 
2344 	ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
2345 	if (sysfs_create_link(&rdev->kobj, ko, "block"))
2346 		/* failure here is OK */;
2347 	rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
2348 
2349 	list_add_rcu(&rdev->same_set, &mddev->disks);
2350 	bd_link_disk_holder(rdev->bdev, mddev->gendisk);
2351 
2352 	/* May as well allow recovery to be retried once */
2353 	mddev->recovery_disabled++;
2354 
2355 	return 0;
2356 
2357  fail:
2358 	pr_warn("md: failed to register dev-%s for %s\n",
2359 		b, mdname(mddev));
2360 	return err;
2361 }
2362 
2363 static void md_delayed_delete(struct work_struct *ws)
2364 {
2365 	struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
2366 	kobject_del(&rdev->kobj);
2367 	kobject_put(&rdev->kobj);
2368 }
2369 
/*
 * Detach @rdev from the array it is bound to.
 *
 * Drops the disk-holder link, unlinks the rdev from mddev->disks (RCU
 * list), releases its write-behind pool and sysfs handles, and forgets
 * any recorded bad blocks.  Removing the kobject itself is deferred to
 * md_delayed_delete() on md_misc_wq.
 */
static void unbind_rdev_from_array(struct md_rdev *rdev)
{
	char b[BDEVNAME_SIZE];

	bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
	list_del_rcu(&rdev->same_set);
	pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b));
	mddev_destroy_wb_pool(rdev->mddev, rdev);
	rdev->mddev = NULL;
	sysfs_remove_link(&rdev->kobj, "block");
	sysfs_put(rdev->sysfs_state);
	rdev->sysfs_state = NULL;
	rdev->badblocks.count = 0;
	/* We need to delay this, otherwise we can deadlock when
	 * writing to 'remove' to "dev/state".  We also need
	 * to delay it due to rcu usage.
	 */
	synchronize_rcu();
	INIT_WORK(&rdev->del_work, md_delayed_delete);
	/* this reference is dropped by md_delayed_delete() */
	kobject_get(&rdev->kobj);
	queue_work(md_misc_wq, &rdev->del_work);
}
2392 
2393 /*
2394  * prevent the device from being mounted, repartitioned or
2395  * otherwise reused by a RAID array (or any other kernel
2396  * subsystem), by bd_claiming the device.
2397  */
2398 static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared)
2399 {
2400 	int err = 0;
2401 	struct block_device *bdev;
2402 	char b[BDEVNAME_SIZE];
2403 
2404 	bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
2405 				 shared ? (struct md_rdev *)lock_rdev : rdev);
2406 	if (IS_ERR(bdev)) {
2407 		pr_warn("md: could not open %s.\n", __bdevname(dev, b));
2408 		return PTR_ERR(bdev);
2409 	}
2410 	rdev->bdev = bdev;
2411 	return err;
2412 }
2413 
2414 static void unlock_rdev(struct md_rdev *rdev)
2415 {
2416 	struct block_device *bdev = rdev->bdev;
2417 	rdev->bdev = NULL;
2418 	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2419 }
2420 
/*
 * Forward declaration: export_rdev() calls this for AutoDetected devices
 * when md is built in (!MODULE).
 */
void md_autodetect_dev(dev_t dev);
2422 
/*
 * Release an rdev that is no longer (or never was) bound to an array:
 * clear its per-device state, give up the exclusive claim on the block
 * device and drop a reference on the rdev's kobject.
 */
static void export_rdev(struct md_rdev *rdev)
{
	char b[BDEVNAME_SIZE];

	pr_debug("md: export_rdev(%s)\n", bdevname(rdev->bdev,b));
	md_rdev_clear(rdev);
#ifndef MODULE
	/*
	 * Hand auto-detected devices back to the autodetect code; this
	 * must happen before unlock_rdev() clears rdev->bdev.
	 */
	if (test_bit(AutoDetected, &rdev->flags))
		md_autodetect_dev(rdev->bdev->bd_dev);
#endif
	unlock_rdev(rdev);
	kobject_put(&rdev->kobj);
}
2436 
/*
 * Remove @rdev from its array entirely: unbind it from the array and then
 * release the device itself.
 */
void md_kick_rdev_from_array(struct md_rdev *rdev)
{
	unbind_rdev_from_array(rdev);
	export_rdev(rdev);
}
EXPORT_SYMBOL_GPL(md_kick_rdev_from_array);
2443 
2444 static void export_array(struct mddev *mddev)
2445 {
2446 	struct md_rdev *rdev;
2447 
2448 	while (!list_empty(&mddev->disks)) {
2449 		rdev = list_first_entry(&mddev->disks, struct md_rdev,
2450 					same_set);
2451 		md_kick_rdev_from_array(rdev);
2452 	}
2453 	mddev->raid_disks = 0;
2454 	mddev->major_version = 0;
2455 }
2456 
/*
 * Try to mark the array as in_sync (no writes pending).
 *
 * Must be called with mddev->lock held; the lock is dropped and re-taken
 * while writes_pending is switched to atomic mode.  If the array is still
 * not in_sync afterwards and no writes are pending, in_sync is set,
 * MD_SB_CHANGE_CLEAN is flagged and the sysfs state is notified.
 * A safemode value of 1 is reset to 0.
 *
 * Returns the resulting value of mddev->in_sync.
 */
static bool set_in_sync(struct mddev *mddev)
{
	lockdep_assert_held(&mddev->lock);
	if (!mddev->in_sync) {
		/* count concurrent checkers so only the last one switches back */
		mddev->sync_checkers++;
		spin_unlock(&mddev->lock);
		percpu_ref_switch_to_atomic_sync(&mddev->writes_pending);
		spin_lock(&mddev->lock);
		/* in_sync may have changed while the lock was dropped */
		if (!mddev->in_sync &&
		    percpu_ref_is_zero(&mddev->writes_pending)) {
			mddev->in_sync = 1;
			/*
			 * Ensure ->in_sync is visible before we clear
			 * ->sync_checkers.
			 */
			smp_mb();
			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
			sysfs_notify_dirent_safe(mddev->sysfs_state);
		}
		if (--mddev->sync_checkers == 0)
			percpu_ref_switch_to_percpu(&mddev->writes_pending);
	}
	if (mddev->safemode == 1)
		mddev->safemode = 0;
	return mddev->in_sync;
}
2483 
2484 static void sync_sbs(struct mddev *mddev, int nospares)
2485 {
2486 	/* Update each superblock (in-memory image), but
2487 	 * if we are allowed to, skip spares which already
2488 	 * have the right event counter, or have one earlier
2489 	 * (which would mean they aren't being marked as dirty
2490 	 * with the rest of the array)
2491 	 */
2492 	struct md_rdev *rdev;
2493 	rdev_for_each(rdev, mddev) {
2494 		if (rdev->sb_events == mddev->events ||
2495 		    (nospares &&
2496 		     rdev->raid_disk < 0 &&
2497 		     rdev->sb_events+1 == mddev->events)) {
2498 			/* Don't update this superblock */
2499 			rdev->sb_loaded = 2;
2500 		} else {
2501 			sync_super(mddev, rdev);
2502 			rdev->sb_loaded = 1;
2503 		}
2504 	}
2505 }
2506 
2507 static bool does_sb_need_changing(struct mddev *mddev)
2508 {
2509 	struct md_rdev *rdev;
2510 	struct mdp_superblock_1 *sb;
2511 	int role;
2512 
2513 	/* Find a good rdev */
2514 	rdev_for_each(rdev, mddev)
2515 		if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags))
2516 			break;
2517 
2518 	/* No good device found. */
2519 	if (!rdev)
2520 		return false;
2521 
2522 	sb = page_address(rdev->sb_page);
2523 	/* Check if a device has become faulty or a spare become active */
2524 	rdev_for_each(rdev, mddev) {
2525 		role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
2526 		/* Device activated? */
2527 		if (role == 0xffff && rdev->raid_disk >=0 &&
2528 		    !test_bit(Faulty, &rdev->flags))
2529 			return true;
2530 		/* Device turned faulty? */
2531 		if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd))
2532 			return true;
2533 	}
2534 
2535 	/* Check if any mddev parameters have changed */
2536 	if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
2537 	    (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
2538 	    (mddev->layout != le32_to_cpu(sb->layout)) ||
2539 	    (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
2540 	    (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
2541 		return true;
2542 
2543 	return false;
2544 }
2545 
/*
 * Bring the superblocks of all member devices up to date.
 *
 * @mddev:        array whose metadata should be written
 * @force_change: non-zero if device membership/state changed, so spares
 *                must be updated too
 *
 * On a read-only array nothing is written; a forced change is only
 * remembered in MD_SB_CHANGE_DEVS.  For arrays without persistent
 * metadata the change flags are cleared and waiters are woken.  Otherwise
 * the event counter is advanced (or rolled back for a pure clean<->dirty
 * transition), the in-memory superblocks are refreshed under mddev->lock
 * and then written to every non-faulty member.  The whole sequence is
 * repeated if in_sync changed or new change flags were set meanwhile.
 */
void md_update_sb(struct mddev *mddev, int force_change)
{
	struct md_rdev *rdev;
	int sync_req;
	int nospares = 0;
	int any_badblocks_changed = 0;
	int ret = -1;

	if (mddev->ro) {
		if (force_change)
			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		return;
	}

repeat:
	if (mddev_is_clustered(mddev)) {
		if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
			force_change = 1;
		if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
			nospares = 1;
		ret = md_cluster_ops->metadata_update_start(mddev);
		/* Has someone else already updated the sb? */
		if (!does_sb_need_changing(mddev)) {
			if (ret == 0)
				md_cluster_ops->metadata_update_cancel(mddev);
			bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
							 BIT(MD_SB_CHANGE_DEVS) |
							 BIT(MD_SB_CHANGE_CLEAN));
			return;
		}
	}

	/*
	 * First make sure individual recovery_offsets are correct
	 * curr_resync_completed can only be used during recovery.
	 * During reshape/resync it might use array-addresses rather
	 * than device addresses.
	 */
	rdev_for_each(rdev, mddev) {
		if (rdev->raid_disk >= 0 &&
		    mddev->delta_disks >= 0 &&
		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
		    test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
		    !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
		    !test_bit(Journal, &rdev->flags) &&
		    !test_bit(In_sync, &rdev->flags) &&
		    mddev->curr_resync_completed > rdev->recovery_offset)
				rdev->recovery_offset = mddev->curr_resync_completed;

	}
	if (!mddev->persistent) {
		/* no metadata of ours to write: just settle the flags */
		clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
		clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		if (!mddev->external) {
			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
			rdev_for_each(rdev, mddev) {
				/* changed bad blocks cannot be recorded: fail the device */
				if (rdev->badblocks.changed) {
					rdev->badblocks.changed = 0;
					ack_all_badblocks(&rdev->badblocks);
					md_error(mddev, rdev);
				}
				clear_bit(Blocked, &rdev->flags);
				clear_bit(BlockedBadBlocks, &rdev->flags);
				wake_up(&rdev->blocked_wait);
			}
		}
		wake_up(&mddev->sb_wait);
		return;
	}

	spin_lock(&mddev->lock);

	mddev->utime = ktime_get_real_seconds();

	if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
		force_change = 1;
	if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
		/* just a clean<-> dirty transition, possibly leave spares alone,
		 * though if events isn't the right even/odd, we will have to do
		 * spares after all
		 */
		nospares = 1;
	if (force_change)
		nospares = 0;
	if (mddev->degraded)
		/* If the array is degraded, then skipping spares is both
		 * dangerous and fairly pointless.
		 * Dangerous because a device that was removed from the array
		 * might have a event_count that still looks up-to-date,
		 * so it can be re-added without a resync.
		 * Pointless because if there are any spares to skip,
		 * then a recovery will happen and soon that array won't
		 * be degraded any more and the spare can go back to sleep then.
		 */
		nospares = 0;

	/* remembered so a change of in_sync during the write is noticed below */
	sync_req = mddev->in_sync;

	/* If this is just a dirty<->clean transition, and the array is clean
	 * and 'events' is odd, we can roll back to the previous clean state */
	if (nospares
	    && (mddev->in_sync && mddev->recovery_cp == MaxSector)
	    && mddev->can_decrease_events
	    && mddev->events != 1) {
		mddev->events--;
		mddev->can_decrease_events = 0;
	} else {
		/* otherwise we have to go forward and ... */
		mddev->events ++;
		mddev->can_decrease_events = nospares;
	}

	/*
	 * This 64-bit counter should never wrap.
	 * Either we are in around ~1 trillion A.C., assuming
	 * 1 reboot per second, or we have a bug...
	 */
	WARN_ON(mddev->events == 0);

	rdev_for_each(rdev, mddev) {
		if (rdev->badblocks.changed)
			any_badblocks_changed++;
		if (test_bit(Faulty, &rdev->flags))
			set_bit(FaultRecorded, &rdev->flags);
	}

	sync_sbs(mddev, nospares);
	spin_unlock(&mddev->lock);

	pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
		 mdname(mddev), mddev->in_sync);

	if (mddev->queue)
		blk_add_trace_msg(mddev->queue, "md md_update_sb");
rewrite:
	md_bitmap_update_sb(mddev->bitmap);
	rdev_for_each(rdev, mddev) {
		char b[BDEVNAME_SIZE];

		/* sync_sbs() set sb_loaded to 1 only for superblocks it refreshed */
		if (rdev->sb_loaded != 1)
			continue; /* no noise on spare devices */

		if (!test_bit(Faulty, &rdev->flags)) {
			md_super_write(mddev,rdev,
				       rdev->sb_start, rdev->sb_size,
				       rdev->sb_page);
			pr_debug("md: (write) %s's sb offset: %llu\n",
				 bdevname(rdev->bdev, b),
				 (unsigned long long)rdev->sb_start);
			rdev->sb_events = mddev->events;
			/* a non-zero size means a bad block log page is waiting to be written */
			if (rdev->badblocks.size) {
				md_super_write(mddev, rdev,
					       rdev->badblocks.sector,
					       rdev->badblocks.size << 9,
					       rdev->bb_page);
				rdev->badblocks.size = 0;
			}

		} else
			pr_debug("md: %s (skipping faulty)\n",
				 bdevname(rdev->bdev, b));

		if (mddev->level == LEVEL_MULTIPATH)
			/* only need to write one superblock... */
			break;
	}
	if (md_super_wait(mddev) < 0)
		goto rewrite;
	/* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */

	if (mddev_is_clustered(mddev) && ret == 0)
		md_cluster_ops->metadata_update_finish(mddev);

	if (mddev->in_sync != sync_req ||
	    !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
			       BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)))
		/* have to write it out again */
		goto repeat;
	wake_up(&mddev->sb_wait);
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		sysfs_notify(&mddev->kobj, NULL, "sync_completed");

	rdev_for_each(rdev, mddev) {
		/* the failure is now recorded, so the device need not stay Blocked */
		if (test_and_clear_bit(FaultRecorded, &rdev->flags))
			clear_bit(Blocked, &rdev->flags);

		if (any_badblocks_changed)
			ack_all_badblocks(&rdev->badblocks);
		clear_bit(BlockedBadBlocks, &rdev->flags);
		wake_up(&rdev->blocked_wait);
	}
}
EXPORT_SYMBOL(md_update_sb);
2739 
2740 static int add_bound_rdev(struct md_rdev *rdev)
2741 {
2742 	struct mddev *mddev = rdev->mddev;
2743 	int err = 0;
2744 	bool add_journal = test_bit(Journal, &rdev->flags);
2745 
2746 	if (!mddev->pers->hot_remove_disk || add_journal) {
2747 		/* If there is hot_add_disk but no hot_remove_disk
2748 		 * then added disks for geometry changes,
2749 		 * and should be added immediately.
2750 		 */
2751 		super_types[mddev->major_version].
2752 			validate_super(mddev, rdev);
2753 		if (add_journal)
2754 			mddev_suspend(mddev);
2755 		err = mddev->pers->hot_add_disk(mddev, rdev);
2756 		if (add_journal)
2757 			mddev_resume(mddev);
2758 		if (err) {
2759 			md_kick_rdev_from_array(rdev);
2760 			return err;
2761 		}
2762 	}
2763 	sysfs_notify_dirent_safe(rdev->sysfs_state);
2764 
2765 	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2766 	if (mddev->degraded)
2767 		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
2768 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2769 	md_new_event(mddev);
2770 	md_wakeup_thread(mddev->thread);
2771 	return 0;
2772 }
2773 
/* Words written to sysfs files may, or may not, be \n terminated.
 * We want to accept either case.  For this we use cmd_match.
 */
static int cmd_match(const char *cmd, const char *str)
{
	/* Returns 1 if cmd, as written into a sysfs file, equals str
	 * exactly or equals str followed by a single newline; 0 otherwise.
	 */

	/* step over the part the two strings have in common */
	for (; *cmd != '\0' && *cmd == *str; cmd++, str++)
		;

	/* tolerate one trailing newline on the written command */
	if (*cmd == '\n')
		cmd++;

	/* both strings must now be fully consumed */
	return *cmd == '\0' && *str == '\0';
}
2793 
/*
 * A sysfs attribute of a member device together with its handlers.
 * Handlers such as state_show() and state_store() have these signatures.
 */
struct rdev_sysfs_entry {
	struct attribute attr;
	/* format the attribute into the supplied buffer; returns the length */
	ssize_t (*show)(struct md_rdev *, char *);
	/* parse a value written to the attribute (buffer and its length) */
	ssize_t (*store)(struct md_rdev *, const char *, size_t);
};
2799 
2800 static ssize_t
2801 state_show(struct md_rdev *rdev, char *page)
2802 {
2803 	char *sep = ",";
2804 	size_t len = 0;
2805 	unsigned long flags = READ_ONCE(rdev->flags);
2806 
2807 	if (test_bit(Faulty, &flags) ||
2808 	    (!test_bit(ExternalBbl, &flags) &&
2809 	    rdev->badblocks.unacked_exist))
2810 		len += sprintf(page+len, "faulty%s", sep);
2811 	if (test_bit(In_sync, &flags))
2812 		len += sprintf(page+len, "in_sync%s", sep);
2813 	if (test_bit(Journal, &flags))
2814 		len += sprintf(page+len, "journal%s", sep);
2815 	if (test_bit(WriteMostly, &flags))
2816 		len += sprintf(page+len, "write_mostly%s", sep);
2817 	if (test_bit(Blocked, &flags) ||
2818 	    (rdev->badblocks.unacked_exist
2819 	     && !test_bit(Faulty, &flags)))
2820 		len += sprintf(page+len, "blocked%s", sep);
2821 	if (!test_bit(Faulty, &flags) &&
2822 	    !test_bit(Journal, &flags) &&
2823 	    !test_bit(In_sync, &flags))
2824 		len += sprintf(page+len, "spare%s", sep);
2825 	if (test_bit(WriteErrorSeen, &flags))
2826 		len += sprintf(page+len, "write_error%s", sep);
2827 	if (test_bit(WantReplacement, &flags))
2828 		len += sprintf(page+len, "want_replacement%s", sep);
2829 	if (test_bit(Replacement, &flags))
2830 		len += sprintf(page+len, "replacement%s", sep);
2831 	if (test_bit(ExternalBbl, &flags))
2832 		len += sprintf(page+len, "external_bbl%s", sep);
2833 	if (test_bit(FailFast, &flags))
2834 		len += sprintf(page+len, "failfast%s", sep);
2835 
2836 	if (len)
2837 		len -= strlen(sep);
2838 
2839 	return len+sprintf(page+len, "\n");
2840 }
2841 
2842 static ssize_t
2843 state_store(struct md_rdev *rdev, const char *buf, size_t len)
2844 {
2845 	/* can write
2846 	 *  faulty  - simulates an error
2847 	 *  remove  - disconnects the device
2848 	 *  writemostly - sets write_mostly
2849 	 *  -writemostly - clears write_mostly
2850 	 *  blocked - sets the Blocked flags
2851 	 *  -blocked - clears the Blocked and possibly simulates an error
2852 	 *  insync - sets Insync providing device isn't active
2853 	 *  -insync - clear Insync for a device with a slot assigned,
2854 	 *            so that it gets rebuilt based on bitmap
2855 	 *  write_error - sets WriteErrorSeen
2856 	 *  -write_error - clears WriteErrorSeen
2857 	 *  {,-}failfast - set/clear FailFast
2858 	 */
2859 	int err = -EINVAL;
2860 	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
2861 		md_error(rdev->mddev, rdev);
2862 		if (test_bit(Faulty, &rdev->flags))
2863 			err = 0;
2864 		else
2865 			err = -EBUSY;
2866 	} else if (cmd_match(buf, "remove")) {
2867 		if (rdev->mddev->pers) {
2868 			clear_bit(Blocked, &rdev->flags);
2869 			remove_and_add_spares(rdev->mddev, rdev);
2870 		}
2871 		if (rdev->raid_disk >= 0)
2872 			err = -EBUSY;
2873 		else {
2874 			struct mddev *mddev = rdev->mddev;
2875 			err = 0;
2876 			if (mddev_is_clustered(mddev))
2877 				err = md_cluster_ops->remove_disk(mddev, rdev);
2878 
2879 			if (err == 0) {
2880 				md_kick_rdev_from_array(rdev);
2881 				if (mddev->pers) {
2882 					set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2883 					md_wakeup_thread(mddev->thread);
2884 				}
2885 				md_new_event(mddev);
2886 			}
2887 		}
2888 	} else if (cmd_match(buf, "writemostly")) {
2889 		set_bit(WriteMostly, &rdev->flags);
2890 		mddev_create_wb_pool(rdev->mddev, rdev, false);
2891 		err = 0;
2892 	} else if (cmd_match(buf, "-writemostly")) {
2893 		mddev_destroy_wb_pool(rdev->mddev, rdev);
2894 		clear_bit(WriteMostly, &rdev->flags);
2895 		err = 0;
2896 	} else if (cmd_match(buf, "blocked")) {
2897 		set_bit(Blocked, &rdev->flags);
2898 		err = 0;
2899 	} else if (cmd_match(buf, "-blocked")) {
2900 		if (!test_bit(Faulty, &rdev->flags) &&
2901 		    !test_bit(ExternalBbl, &rdev->flags) &&
2902 		    rdev->badblocks.unacked_exist) {
2903 			/* metadata handler doesn't understand badblocks,
2904 			 * so we need to fail the device
2905 			 */
2906 			md_error(rdev->mddev, rdev);
2907 		}
2908 		clear_bit(Blocked, &rdev->flags);
2909 		clear_bit(BlockedBadBlocks, &rdev->flags);
2910 		wake_up(&rdev->blocked_wait);
2911 		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2912 		md_wakeup_thread(rdev->mddev->thread);
2913 
2914 		err = 0;
2915 	} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2916 		set_bit(In_sync, &rdev->flags);
2917 		err = 0;
2918 	} else if (cmd_match(buf, "failfast")) {
2919 		set_bit(FailFast, &rdev->flags);
2920 		err = 0;
2921 	} else if (cmd_match(buf, "-failfast")) {
2922 		clear_bit(FailFast, &rdev->flags);
2923 		err = 0;
2924 	} else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
2925 		   !test_bit(Journal, &rdev->flags)) {
2926 		if (rdev->mddev->pers == NULL) {
2927 			clear_bit(In_sync, &rdev->flags);
2928 			rdev->saved_raid_disk = rdev->raid_disk;
2929 			rdev->raid_disk = -1;
2930 			err = 0;
2931 		}
2932 	} else if (cmd_match(buf, "write_error")) {
2933 		set_bit(WriteErrorSeen, &rdev->flags);
2934 		err = 0;
2935 	} else if (cmd_match(buf, "-write_error")) {
2936 		clear_bit(WriteErrorSeen, &rdev->flags);
2937 		err = 0;
2938 	} else if (cmd_match(buf, "want_replacement")) {
2939 		/* Any non-spare device that is not a replacement can
2940 		 * become want_replacement at any time, but we then need to
2941 		 * check if recovery is needed.
2942 		 */
2943 		if (rdev->raid_disk >= 0 &&
2944 		    !test_bit(Journal, &rdev->flags) &&
2945 		    !test_bit(Replacement, &rdev->flags))
2946 			set_bit(WantReplacement, &rdev->flags);
2947 		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2948 		md_wakeup_thread(rdev->mddev->thread);
2949 		err = 0;
2950 	} else if (cmd_match(buf, "-want_replacement")) {
2951 		/* Clearing 'want_replacement' is always allowed.
2952 		 * Once replacements starts it is too late though.
2953 		 */
2954 		err = 0;
2955 		clear_bit(WantReplacement, &rdev->flags);
2956 	} else if (cmd_match(buf, "replacement")) {
2957 		/* Can only set a device as a replacement when array has not
2958 		 * yet been started.  Once running, replacement is automatic
2959 		 * from spares, or by assigning 'slot'.
2960 		 */
2961 		if (rdev->mddev->pers)
2962 			err = -EBUSY;
2963 		else {
2964 			set_bit(Replacement, &rdev->flags);
2965 			err = 0;
2966 		}
2967 	} else if (cmd_match(buf, "-replacement")) {
2968 		/* Similarly, can only clear Replacement before start */
2969 		if (rdev->mddev->pers)
2970 			err = -EBUSY;
2971 		else {
2972 			clear_bit(Replacement, &rdev->flags);
2973 			err = 0;
2974 		}
2975 	} else if (cmd_match(buf, "re-add")) {
2976 		if (!rdev->mddev->pers)
2977 			err = -EINVAL;
2978 		else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) &&
2979 				rdev->saved_raid_disk >= 0) {
2980 			/* clear_bit is performed _after_ all the devices
2981 			 * have their local Faulty bit cleared. If any writes
2982 			 * happen in the meantime in the local node, they
2983 			 * will land in the local bitmap, which will be synced
2984 			 * by this node eventually
2985 			 */
2986 			if (!mddev_is_clustered(rdev->mddev) ||
2987 			    (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) {
2988 				clear_bit(Faulty, &rdev->flags);
2989 				err = add_bound_rdev(rdev);
2990 			}
2991 		} else
2992 			err = -EBUSY;
2993 	} else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) {
2994 		set_bit(ExternalBbl, &rdev->flags);
2995 		rdev->badblocks.shift = 0;
2996 		err = 0;
2997 	} else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) {
2998 		clear_bit(ExternalBbl, &rdev->flags);
2999 		err = 0;
3000 	}
3001 	if (!err)
3002 		sysfs_notify_dirent_safe(rdev->sysfs_state);
3003 	return err ? err : len;
3004 }
/* sysfs 'state' attribute of an rdev: readable by everyone, writable by the
 * owner, served by state_show() and state_store().
 */
static struct rdev_sysfs_entry rdev_state =
__ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store);
3007 
3008 static ssize_t
3009 errors_show(struct md_rdev *rdev, char *page)
3010 {
3011 	return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
3012 }
3013 
3014 static ssize_t
3015 errors_store(struct md_rdev *rdev, const char *buf, size_t len)
3016 {
3017 	unsigned int n;
3018 	int rv;
3019 
3020 	rv = kstrtouint(buf, 10, &n);
3021 	if (rv < 0)
3022 		return rv;
3023 	atomic_set(&rdev->corrected_errors, n);
3024 	return len;
3025 }
3026 static struct rdev_sysfs_entry rdev_errors =
3027 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
3028 
3029 static ssize_t
3030 slot_show(struct md_rdev *rdev, char *page)
3031 {
3032 	if (test_bit(Journal, &rdev->flags))
3033 		return sprintf(page, "journal\n");
3034 	else if (rdev->raid_disk < 0)
3035 		return sprintf(page, "none\n");
3036 	else
3037 		return sprintf(page, "%d\n", rdev->raid_disk);
3038 }
3039 
3040 static ssize_t
3041 slot_store(struct md_rdev *rdev, const char *buf, size_t len)
3042 {
3043 	int slot;
3044 	int err;
3045 
3046 	if (test_bit(Journal, &rdev->flags))
3047 		return -EBUSY;
3048 	if (strncmp(buf, "none", 4)==0)
3049 		slot = -1;
3050 	else {
3051 		err = kstrtouint(buf, 10, (unsigned int *)&slot);
3052 		if (err < 0)
3053 			return err;
3054 	}
3055 	if (rdev->mddev->pers && slot == -1) {
3056 		/* Setting 'slot' on an active array requires also
3057 		 * updating the 'rd%d' link, and communicating
3058 		 * with the personality with ->hot_*_disk.
3059 		 * For now we only support removing
3060 		 * failed/spare devices.  This normally happens automatically,
3061 		 * but not when the metadata is externally managed.
3062 		 */
3063 		if (rdev->raid_disk == -1)
3064 			return -EEXIST;
3065 		/* personality does all needed checks */
3066 		if (rdev->mddev->pers->hot_remove_disk == NULL)
3067 			return -EINVAL;
3068 		clear_bit(Blocked, &rdev->flags);
3069 		remove_and_add_spares(rdev->mddev, rdev);
3070 		if (rdev->raid_disk >= 0)
3071 			return -EBUSY;
3072 		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
3073 		md_wakeup_thread(rdev->mddev->thread);
3074 	} else if (rdev->mddev->pers) {
3075 		/* Activating a spare .. or possibly reactivating
3076 		 * if we ever get bitmaps working here.
3077 		 */
3078 		int err;
3079 
3080 		if (rdev->raid_disk != -1)
3081 			return -EBUSY;
3082 
3083 		if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
3084 			return -EBUSY;
3085 
3086 		if (rdev->mddev->pers->hot_add_disk == NULL)
3087 			return -EINVAL;
3088 
3089 		if (slot >= rdev->mddev->raid_disks &&
3090 		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
3091 			return -ENOSPC;
3092 
3093 		rdev->raid_disk = slot;
3094 		if (test_bit(In_sync, &rdev->flags))
3095 			rdev->saved_raid_disk = slot;
3096 		else
3097 			rdev->saved_raid_disk = -1;
3098 		clear_bit(In_sync, &rdev->flags);
3099 		clear_bit(Bitmap_sync, &rdev->flags);
3100 		err = rdev->mddev->pers->
3101 			hot_add_disk(rdev->mddev, rdev);
3102 		if (err) {
3103 			rdev->raid_disk = -1;
3104 			return err;
3105 		} else
3106 			sysfs_notify_dirent_safe(rdev->sysfs_state);
3107 		if (sysfs_link_rdev(rdev->mddev, rdev))
3108 			/* failure here is OK */;
3109 		/* don't wakeup anyone, leave that to userspace. */
3110 	} else {
3111 		if (slot >= rdev->mddev->raid_disks &&
3112 		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
3113 			return -ENOSPC;
3114 		rdev->raid_disk = slot;
3115 		/* assume it is working */
3116 		clear_bit(Faulty, &rdev->flags);
3117 		clear_bit(WriteMostly, &rdev->flags);
3118 		set_bit(In_sync, &rdev->flags);
3119 		sysfs_notify_dirent_safe(rdev->sysfs_state);
3120 	}
3121 	return len;
3122 }
3123 
3124 static struct rdev_sysfs_entry rdev_slot =
3125 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
3126 
3127 static ssize_t
3128 offset_show(struct md_rdev *rdev, char *page)
3129 {
3130 	return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
3131 }
3132 
3133 static ssize_t
3134 offset_store(struct md_rdev *rdev, const char *buf, size_t len)
3135 {
3136 	unsigned long long offset;
3137 	if (kstrtoull(buf, 10, &offset) < 0)
3138 		return -EINVAL;
3139 	if (rdev->mddev->pers && rdev->raid_disk >= 0)
3140 		return -EBUSY;
3141 	if (rdev->sectors && rdev->mddev->external)
3142 		/* Must set offset before size, so overlap checks
3143 		 * can be sane */
3144 		return -EBUSY;
3145 	rdev->data_offset = offset;
3146 	rdev->new_data_offset = offset;
3147 	return len;
3148 }
3149 
3150 static struct rdev_sysfs_entry rdev_offset =
3151 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
3152 
3153 static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
3154 {
3155 	return sprintf(page, "%llu\n",
3156 		       (unsigned long long)rdev->new_data_offset);
3157 }
3158 
3159 static ssize_t new_offset_store(struct md_rdev *rdev,
3160 				const char *buf, size_t len)
3161 {
3162 	unsigned long long new_offset;
3163 	struct mddev *mddev = rdev->mddev;
3164 
3165 	if (kstrtoull(buf, 10, &new_offset) < 0)
3166 		return -EINVAL;
3167 
3168 	if (mddev->sync_thread ||
3169 	    test_bit(MD_RECOVERY_RUNNING,&mddev->recovery))
3170 		return -EBUSY;
3171 	if (new_offset == rdev->data_offset)
3172 		/* reset is always permitted */
3173 		;
3174 	else if (new_offset > rdev->data_offset) {
3175 		/* must not push array size beyond rdev_sectors */
3176 		if (new_offset - rdev->data_offset
3177 		    + mddev->dev_sectors > rdev->sectors)
3178 				return -E2BIG;
3179 	}
3180 	/* Metadata worries about other space details. */
3181 
3182 	/* decreasing the offset is inconsistent with a backwards
3183 	 * reshape.
3184 	 */
3185 	if (new_offset < rdev->data_offset &&
3186 	    mddev->reshape_backwards)
3187 		return -EINVAL;
3188 	/* Increasing offset is inconsistent with forwards
3189 	 * reshape.  reshape_direction should be set to
3190 	 * 'backwards' first.
3191 	 */
3192 	if (new_offset > rdev->data_offset &&
3193 	    !mddev->reshape_backwards)
3194 		return -EINVAL;
3195 
3196 	if (mddev->pers && mddev->persistent &&
3197 	    !super_types[mddev->major_version]
3198 	    .allow_new_offset(rdev, new_offset))
3199 		return -E2BIG;
3200 	rdev->new_data_offset = new_offset;
3201 	if (new_offset > rdev->data_offset)
3202 		mddev->reshape_backwards = 1;
3203 	else if (new_offset < rdev->data_offset)
3204 		mddev->reshape_backwards = 0;
3205 
3206 	return len;
3207 }
3208 static struct rdev_sysfs_entry rdev_new_offset =
3209 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
3210 
3211 static ssize_t
3212 rdev_size_show(struct md_rdev *rdev, char *page)
3213 {
3214 	return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
3215 }
3216 
3217 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
3218 {
3219 	/* check if two start/length pairs overlap */
3220 	if (s1+l1 <= s2)
3221 		return 0;
3222 	if (s2+l2 <= s1)
3223 		return 0;
3224 	return 1;
3225 }
3226 
3227 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
3228 {
3229 	unsigned long long blocks;
3230 	sector_t new;
3231 
3232 	if (kstrtoull(buf, 10, &blocks) < 0)
3233 		return -EINVAL;
3234 
3235 	if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
3236 		return -EINVAL; /* sector conversion overflow */
3237 
3238 	new = blocks * 2;
3239 	if (new != blocks * 2)
3240 		return -EINVAL; /* unsigned long long to sector_t overflow */
3241 
3242 	*sectors = new;
3243 	return 0;
3244 }
3245 
/* Change the usable size of @rdev from a sysfs write.
 *
 * @buf is parsed by strict_blocks_to_sectors(), i.e. a decimal count that is
 * doubled to obtain sectors.  Journal devices are refused (-EBUSY), as is a
 * device whose data_offset and new_data_offset differ (-EINVAL).  The new
 * size may not be smaller than mddev->dev_sectors.
 *
 * Returns @len on success or a negative errno.
 */
static ssize_t
rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
{
	struct mddev *my_mddev = rdev->mddev;
	sector_t oldsectors = rdev->sectors;
	sector_t sectors;

	if (test_bit(Journal, &rdev->flags))
		return -EBUSY;
	if (strict_blocks_to_sectors(buf, &sectors) < 0)
		return -EINVAL;
	if (rdev->data_offset != rdev->new_data_offset)
		return -EINVAL; /* too confusing */
	if (my_mddev->pers && rdev->raid_disk >= 0) {
		/* device is an active member of a running array */
		if (my_mddev->persistent) {
			/* the superblock handler decides the size actually
			 * granted; 0 means the change was refused
			 */
			sectors = super_types[my_mddev->major_version].
				rdev_size_change(rdev, sectors);
			if (!sectors)
				return -EBUSY;
		} else if (!sectors)
			/* 0 requested: use everything on the block device
			 * past data_offset
			 */
			sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
				rdev->data_offset;
		if (!my_mddev->pers->resize)
			/* Cannot change size for RAID0 or Linear etc */
			return -EINVAL;
	}
	if (sectors < my_mddev->dev_sectors)
		return -EINVAL; /* component must fit device */

	/* commit first; rolled back below if an overlap is found */
	rdev->sectors = sectors;
	if (sectors > oldsectors && my_mddev->external) {
		/* Need to check that all other rdevs with the same
		 * ->bdev do not overlap.  'rcu' is sufficient to walk
		 * the rdev lists safely.
		 * This check does not provide a hard guarantee, it
		 * just helps avoid dangerous mistakes.
		 */
		struct mddev *mddev;
		int overlap = 0;
		struct list_head *tmp;

		rcu_read_lock();
		for_each_mddev(mddev, tmp) {
			struct md_rdev *rdev2;

			rdev_for_each(rdev2, mddev)
				if (rdev->bdev == rdev2->bdev &&
				    rdev != rdev2 &&
				    overlaps(rdev->data_offset, rdev->sectors,
					     rdev2->data_offset,
					     rdev2->sectors)) {
					overlap = 1;
					break;
				}
			if (overlap) {
				/* NOTE(review): presumably for_each_mddev
				 * holds a reference on the current mddev
				 * that must be dropped when leaving the
				 * loop early - confirm against the macro.
				 */
				mddev_put(mddev);
				break;
			}
		}
		rcu_read_unlock();
		if (overlap) {
			/* Someone else could have slipped in a size
			 * change here, but doing so is just silly.
			 * We put oldsectors back because we *know* it is
			 * safe, and trust userspace not to race with
			 * itself
			 */
			rdev->sectors = oldsectors;
			return -EBUSY;
		}
	}
	return len;
}

static struct rdev_sysfs_entry rdev_size =
__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
3322 
3323 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page)
3324 {
3325 	unsigned long long recovery_start = rdev->recovery_offset;
3326 
3327 	if (test_bit(In_sync, &rdev->flags) ||
3328 	    recovery_start == MaxSector)
3329 		return sprintf(page, "none\n");
3330 
3331 	return sprintf(page, "%llu\n", recovery_start);
3332 }
3333 
3334 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len)
3335 {
3336 	unsigned long long recovery_start;
3337 
3338 	if (cmd_match(buf, "none"))
3339 		recovery_start = MaxSector;
3340 	else if (kstrtoull(buf, 10, &recovery_start))
3341 		return -EINVAL;
3342 
3343 	if (rdev->mddev->pers &&
3344 	    rdev->raid_disk >= 0)
3345 		return -EBUSY;
3346 
3347 	rdev->recovery_offset = recovery_start;
3348 	if (recovery_start == MaxSector)
3349 		set_bit(In_sync, &rdev->flags);
3350 	else
3351 		clear_bit(In_sync, &rdev->flags);
3352 	return len;
3353 }
3354 
3355 static struct rdev_sysfs_entry rdev_recovery_start =
3356 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
3357 
/* sysfs access to bad-blocks list.
 * We present two files.
 * 'bad_blocks' lists sector numbers and lengths of ranges that
 *    are recorded as bad.  The list is truncated to fit within
 *    the one-page limit of sysfs.
 *    Writing "sector length" to this file adds an acknowledged
 *    bad block range to the list.
 * 'unacknowledged_bad_blocks' lists bad blocks that have not yet
 *    been acknowledged.  Writing to this file adds bad blocks
 *    without acknowledging them.  This is largely for testing.
 */
3369 static ssize_t bb_show(struct md_rdev *rdev, char *page)
3370 {
3371 	return badblocks_show(&rdev->badblocks, page, 0);
3372 }
3373 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len)
3374 {
3375 	int rv = badblocks_store(&rdev->badblocks, page, len, 0);
3376 	/* Maybe that ack was all we needed */
3377 	if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
3378 		wake_up(&rdev->blocked_wait);
3379 	return rv;
3380 }
3381 static struct rdev_sysfs_entry rdev_bad_blocks =
3382 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
3383 
3384 static ssize_t ubb_show(struct md_rdev *rdev, char *page)
3385 {
3386 	return badblocks_show(&rdev->badblocks, page, 1);
3387 }
3388 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
3389 {
3390 	return badblocks_store(&rdev->badblocks, page, len, 1);
3391 }
3392 static struct rdev_sysfs_entry rdev_unack_bad_blocks =
3393 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
3394 
3395 static ssize_t
3396 ppl_sector_show(struct md_rdev *rdev, char *page)
3397 {
3398 	return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector);
3399 }
3400 
3401 static ssize_t
3402 ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len)
3403 {
3404 	unsigned long long sector;
3405 
3406 	if (kstrtoull(buf, 10, &sector) < 0)
3407 		return -EINVAL;
3408 	if (sector != (sector_t)sector)
3409 		return -EINVAL;
3410 
3411 	if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3412 	    rdev->raid_disk >= 0)
3413 		return -EBUSY;
3414 
3415 	if (rdev->mddev->persistent) {
3416 		if (rdev->mddev->major_version == 0)
3417 			return -EINVAL;
3418 		if ((sector > rdev->sb_start &&
3419 		     sector - rdev->sb_start > S16_MAX) ||
3420 		    (sector < rdev->sb_start &&
3421 		     rdev->sb_start - sector > -S16_MIN))
3422 			return -EINVAL;
3423 		rdev->ppl.offset = sector - rdev->sb_start;
3424 	} else if (!rdev->mddev->external) {
3425 		return -EBUSY;
3426 	}
3427 	rdev->ppl.sector = sector;
3428 	return len;
3429 }
3430 
3431 static struct rdev_sysfs_entry rdev_ppl_sector =
3432 __ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store);
3433 
3434 static ssize_t
3435 ppl_size_show(struct md_rdev *rdev, char *page)
3436 {
3437 	return sprintf(page, "%u\n", rdev->ppl.size);
3438 }
3439 
3440 static ssize_t
3441 ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3442 {
3443 	unsigned int size;
3444 
3445 	if (kstrtouint(buf, 10, &size) < 0)
3446 		return -EINVAL;
3447 
3448 	if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3449 	    rdev->raid_disk >= 0)
3450 		return -EBUSY;
3451 
3452 	if (rdev->mddev->persistent) {
3453 		if (rdev->mddev->major_version == 0)
3454 			return -EINVAL;
3455 		if (size > U16_MAX)
3456 			return -EINVAL;
3457 	} else if (!rdev->mddev->external) {
3458 		return -EBUSY;
3459 	}
3460 	rdev->ppl.size = size;
3461 	return len;
3462 }
3463 
3464 static struct rdev_sysfs_entry rdev_ppl_size =
3465 __ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store);
3466 
/* NULL-terminated list of the per-rdev sysfs attributes; installed as
 * rdev_ktype.default_attrs.
 */
static struct attribute *rdev_default_attrs[] = {
	&rdev_state.attr,
	&rdev_errors.attr,
	&rdev_slot.attr,
	&rdev_offset.attr,
	&rdev_new_offset.attr,
	&rdev_size.attr,
	&rdev_recovery_start.attr,
	&rdev_bad_blocks.attr,
	&rdev_unack_bad_blocks.attr,
	&rdev_ppl_sector.attr,
	&rdev_ppl_size.attr,
	NULL,
};
3481 static ssize_t
3482 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3483 {
3484 	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3485 	struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3486 
3487 	if (!entry->show)
3488 		return -EIO;
3489 	if (!rdev->mddev)
3490 		return -ENODEV;
3491 	return entry->show(rdev, page);
3492 }
3493 
3494 static ssize_t
3495 rdev_attr_store(struct kobject *kobj, struct attribute *attr,
3496 	      const char *page, size_t length)
3497 {
3498 	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3499 	struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3500 	ssize_t rv;
3501 	struct mddev *mddev = rdev->mddev;
3502 
3503 	if (!entry->store)
3504 		return -EIO;
3505 	if (!capable(CAP_SYS_ADMIN))
3506 		return -EACCES;
3507 	rv = mddev ? mddev_lock(mddev) : -ENODEV;
3508 	if (!rv) {
3509 		if (rdev->mddev == NULL)
3510 			rv = -ENODEV;
3511 		else
3512 			rv = entry->store(rdev, page, length);
3513 		mddev_unlock(mddev);
3514 	}
3515 	return rv;
3516 }
3517 
3518 static void rdev_free(struct kobject *ko)
3519 {
3520 	struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj);
3521 	kfree(rdev);
3522 }
/* Route sysfs reads and writes on rdev attributes through the dispatchers. */
static const struct sysfs_ops rdev_sysfs_ops = {
	.show		= rdev_attr_show,
	.store		= rdev_attr_store,
};
/* kobject type for md_rdev: release handler, sysfs ops and the default
 * attribute list.
 */
static struct kobj_type rdev_ktype = {
	.release	= rdev_free,
	.sysfs_ops	= &rdev_sysfs_ops,
	.default_attrs	= rdev_default_attrs,
};
3532 
3533 int md_rdev_init(struct md_rdev *rdev)
3534 {
3535 	rdev->desc_nr = -1;
3536 	rdev->saved_raid_disk = -1;
3537 	rdev->raid_disk = -1;
3538 	rdev->flags = 0;
3539 	rdev->data_offset = 0;
3540 	rdev->new_data_offset = 0;
3541 	rdev->sb_events = 0;
3542 	rdev->last_read_error = 0;
3543 	rdev->sb_loaded = 0;
3544 	rdev->bb_page = NULL;
3545 	atomic_set(&rdev->nr_pending, 0);
3546 	atomic_set(&rdev->read_errors, 0);
3547 	atomic_set(&rdev->corrected_errors, 0);
3548 
3549 	INIT_LIST_HEAD(&rdev->same_set);
3550 	init_waitqueue_head(&rdev->blocked_wait);
3551 
3552 	/* Add space to store bad block list.
3553 	 * This reserves the space even on arrays where it cannot
3554 	 * be used - I wonder if that matters
3555 	 */
3556 	return badblocks_init(&rdev->badblocks, 0);
3557 }
3558 EXPORT_SYMBOL_GPL(md_rdev_init);
/*
 * Import a device. If 'super_format' >= 0, then sanity check the superblock
 * by loading it with the matching super_types[] handler.
 *
 * The import is abandoned and an ERR_PTR() is returned if:
 *
 *   - memory for the rdev cannot be allocated (-ENOMEM)
 *   - md_rdev_init(), alloc_disk_sb() or lock_rdev() fails
 *   - the device is nonexistent (zero size) (-EINVAL)
 *   - the superblock cannot be loaded or is not valid
 *
 * On success the new rdev is returned with its kobject initialised.
 */
static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
{
	char b[BDEVNAME_SIZE];
	int err;
	struct md_rdev *rdev;
	sector_t size;

	rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
	if (!rdev)
		return ERR_PTR(-ENOMEM);

	err = md_rdev_init(rdev);
	if (err)
		goto abort_free;
	err = alloc_disk_sb(rdev);
	if (err)
		goto abort_free;

	/* NOTE(review): the meaning of super_format == -2 for lock_rdev()
	 * is not visible here - confirm against lock_rdev().
	 */
	err = lock_rdev(rdev, newdev, super_format == -2);
	if (err)
		goto abort_free;

	kobject_init(&rdev->kobj, &rdev_ktype);

	/* device size in units of 1 << BLOCK_SIZE_BITS bytes */
	size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS;
	if (!size) {
		pr_warn("md: %s has zero or unknown size, marking faulty!\n",
			bdevname(rdev->bdev,b));
		err = -EINVAL;
		goto abort_free;
	}

	if (super_format >= 0) {
		err = super_types[super_format].
			load_super(rdev, NULL, super_minor);
		if (err == -EINVAL) {
			pr_warn("md: %s does not have a valid v%d.%d superblock, not importing!\n",
				bdevname(rdev->bdev,b),
				super_format, super_minor);
			goto abort_free;
		}
		if (err < 0) {
			pr_warn("md: could not read %s's sb, not importing!\n",
				bdevname(rdev->bdev,b));
			goto abort_free;
		}
	}

	return rdev;

abort_free:
	/* bdev is only set once lock_rdev() has succeeded */
	if (rdev->bdev)
		unlock_rdev(rdev);
	md_rdev_clear(rdev);
	kfree(rdev);
	return ERR_PTR(err);
}
3626 
/*
 * Check a full RAID array for plausibility
 *
 * Loads the superblock of every rdev in the array, picks the freshest one,
 * validates the array against it and then validates each remaining device.
 * Devices that fail a check are kicked from the array.
 *
 * Returns 0 on success, or -EINVAL if no device yields a usable superblock.
 */

static int analyze_sbs(struct mddev *mddev)
{
	int i;
	struct md_rdev *rdev, *freshest, *tmp;
	char b[BDEVNAME_SIZE];

	freshest = NULL;
	/* load_super() result: 1 makes this rdev the new freshest, 0 keeps
	 * the current one, anything else removes the device.
	 */
	rdev_for_each_safe(rdev, tmp, mddev)
		switch (super_types[mddev->major_version].
			load_super(rdev, freshest, mddev->minor_version)) {
		case 1:
			freshest = rdev;
			break;
		case 0:
			break;
		default:
			pr_warn("md: fatal superblock inconsistency in %s -- removing from array\n",
				bdevname(rdev->bdev,b));
			md_kick_rdev_from_array(rdev);
		}

	/* Cannot find a valid fresh disk */
	if (!freshest) {
		pr_warn("md: cannot find a valid disk\n");
		return -EINVAL;
	}

	/* the return value of this first validate_super() call is ignored */
	super_types[mddev->major_version].
		validate_super(mddev, freshest);

	i = 0;
	rdev_for_each_safe(rdev, tmp, mddev) {
		/* enforce the device limit, when one is set */
		if (mddev->max_disks &&
		    (rdev->desc_nr >= mddev->max_disks ||
		     i > mddev->max_disks)) {
			pr_warn("md: %s: %s: only %d devices permitted\n",
				mdname(mddev), bdevname(rdev->bdev, b),
				mddev->max_disks);
			md_kick_rdev_from_array(rdev);
			continue;
		}
		if (rdev != freshest) {
			if (super_types[mddev->major_version].
			    validate_super(mddev, rdev)) {
				pr_warn("md: kicking non-fresh %s from array!\n",
					bdevname(rdev->bdev,b));
				md_kick_rdev_from_array(rdev);
				continue;
			}
		}
		if (mddev->level == LEVEL_MULTIPATH) {
			/* multipath: number the devices in list order and
			 * treat each as in sync
			 */
			rdev->desc_nr = i++;
			rdev->raid_disk = rdev->desc_nr;
			set_bit(In_sync, &rdev->flags);
		} else if (rdev->raid_disk >=
			    (mddev->raid_disks - min(0, mddev->delta_disks)) &&
			   !test_bit(Journal, &rdev->flags)) {
			/* slot lies beyond the array: detach the device
			 * from its slot
			 */
			rdev->raid_disk = -1;
			clear_bit(In_sync, &rdev->flags);
		}
	}

	return 0;
}
3695 
/* Read a fixed-point number.
 * Numbers in sysfs attributes should be in "standard" units where
 * possible, so time should be in seconds.
 * However we internally use a much smaller unit such as
 * milliseconds or jiffies.
 * This function takes a decimal number with a possible fractional
 * component, and produces an integer which is the result of
 * multiplying that number by 10^'scale'.
 * all without any floating-point arithmetic.
 *
 * Fractional digits beyond 'scale' are skipped.  One trailing newline is
 * accepted; any other trailing character yields -EINVAL.  Returns 0 on
 * success with the scaled value stored in *res.
 */
int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
{
	unsigned long acc = 0;
	long frac_digits = -1;	/* -1 until the decimal point has been seen */

	for (; isdigit(*cp) || (*cp == '.' && frac_digits < 0); cp++) {
		if (*cp == '.') {
			frac_digits = 0;
			continue;
		}
		/* already have 'scale' fractional digits: skip the rest */
		if (frac_digits >= scale)
			continue;

		acc = acc * 10 + (unsigned int)(*cp - '0');
		if (frac_digits >= 0)
			frac_digits++;
	}

	if (*cp == '\n')
		cp++;
	if (*cp)
		return -EINVAL;

	if (frac_digits < 0)
		frac_digits = 0;
	*res = acc * int_pow(10, scale - frac_digits);
	return 0;
}
3731 
3732 static ssize_t
3733 safe_delay_show(struct mddev *mddev, char *page)
3734 {
3735 	int msec = (mddev->safemode_delay*1000)/HZ;
3736 	return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
3737 }
3738 static ssize_t
3739 safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
3740 {
3741 	unsigned long msec;
3742 
3743 	if (mddev_is_clustered(mddev)) {
3744 		pr_warn("md: Safemode is disabled for clustered mode\n");
3745 		return -EINVAL;
3746 	}
3747 
3748 	if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
3749 		return -EINVAL;
3750 	if (msec == 0)
3751 		mddev->safemode_delay = 0;
3752 	else {
3753 		unsigned long old_delay = mddev->safemode_delay;
3754 		unsigned long new_delay = (msec*HZ)/1000;
3755 
3756 		if (new_delay == 0)
3757 			new_delay = 1;
3758 		mddev->safemode_delay = new_delay;
3759 		if (new_delay < old_delay || old_delay == 0)
3760 			mod_timer(&mddev->safemode_timer, jiffies+1);
3761 	}
3762 	return len;
3763 }
3764 static struct md_sysfs_entry md_safe_delay =
3765 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
3766 
3767 static ssize_t
3768 level_show(struct mddev *mddev, char *page)
3769 {
3770 	struct md_personality *p;
3771 	int ret;
3772 	spin_lock(&mddev->lock);
3773 	p = mddev->pers;
3774 	if (p)
3775 		ret = sprintf(page, "%s\n", p->name);
3776 	else if (mddev->clevel[0])
3777 		ret = sprintf(page, "%s\n", mddev->clevel);
3778 	else if (mddev->level != LEVEL_NONE)
3779 		ret = sprintf(page, "%d\n", mddev->level);
3780 	else
3781 		ret = 0;
3782 	spin_unlock(&mddev->lock);
3783 	return ret;
3784 }
3785 
/* Set or change the RAID level of @mddev from a sysfs write.
 *
 * With no personality active, the text is just recorded in mddev->clevel
 * and mddev->level is set to LEVEL_NONE.  Otherwise this attempts to switch
 * the running array to the personality named in @buf via that personality's
 * ->takeover() method.
 *
 * Runs with the mddev locked.  Returns @len on success or a negative errno:
 * -EINVAL for an empty or over-long name, an unknown personality, or one
 * that cannot take over; -EROFS for a read-only array; -EBUSY while
 * resync/recovery/reshape or another sysfs operation is in progress; or the
 * error returned by ->takeover().
 */
static ssize_t
level_store(struct mddev *mddev, const char *buf, size_t len)
{
	char clevel[16];
	ssize_t rv;
	size_t slen = len;
	struct md_personality *pers, *oldpers;
	long level;
	void *priv, *oldpriv;
	struct md_rdev *rdev;

	/* the name, including its terminator, must fit in clevel[] */
	if (slen == 0 || slen >= sizeof(clevel))
		return -EINVAL;

	rv = mddev_lock(mddev);
	if (rv)
		return rv;

	if (mddev->pers == NULL) {
		/* array not running: only remember the requested level,
		 * dropping one trailing newline
		 */
		strncpy(mddev->clevel, buf, slen);
		if (mddev->clevel[slen-1] == '\n')
			slen--;
		mddev->clevel[slen] = 0;
		mddev->level = LEVEL_NONE;
		rv = len;
		goto out_unlock;
	}
	rv = -EROFS;
	if (mddev->ro)
		goto out_unlock;

	/* request to change the personality.  Need to ensure:
	 *  - array is not engaged in resync/recovery/reshape
	 *  - old personality can be suspended
	 *  - new personality will access other array.
	 */

	rv = -EBUSY;
	if (mddev->sync_thread ||
	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
	    mddev->reshape_position != MaxSector ||
	    mddev->sysfs_active)
		goto out_unlock;

	rv = -EINVAL;
	if (!mddev->pers->quiesce) {
		pr_warn("md: %s: %s does not support online personality change\n",
			mdname(mddev), mddev->pers->name);
		goto out_unlock;
	}

	/* Now find the new personality */
	strncpy(clevel, buf, slen);
	if (clevel[slen-1] == '\n')
		slen--;
	clevel[slen] = 0;
	/* a non-numeric name is looked up by string only */
	if (kstrtol(clevel, 10, &level))
		level = LEVEL_NONE;

	if (request_module("md-%s", clevel) != 0)
		request_module("md-level-%s", clevel);
	spin_lock(&pers_lock);
	pers = find_pers(level, clevel);
	if (!pers || !try_module_get(pers->owner)) {
		spin_unlock(&pers_lock);
		pr_warn("md: personality %s not loaded\n", clevel);
		rv = -EINVAL;
		goto out_unlock;
	}
	spin_unlock(&pers_lock);

	/* from here on a module reference on pers->owner is held and must
	 * be dropped on every failure path
	 */
	if (pers == mddev->pers) {
		/* Nothing to do! */
		module_put(pers->owner);
		rv = len;
		goto out_unlock;
	}
	if (!pers->takeover) {
		module_put(pers->owner);
		pr_warn("md: %s: %s does not support personality takeover\n",
			mdname(mddev), clevel);
		rv = -EINVAL;
		goto out_unlock;
	}

	rdev_for_each(rdev, mddev)
		rdev->new_raid_disk = rdev->raid_disk;

	/* ->takeover must set new_* and/or delta_disks
	 * if it succeeds, and may set them when it fails.
	 */
	priv = pers->takeover(mddev);
	if (IS_ERR(priv)) {
		/* undo whatever ->takeover() may have changed */
		mddev->new_level = mddev->level;
		mddev->new_layout = mddev->layout;
		mddev->new_chunk_sectors = mddev->chunk_sectors;
		mddev->raid_disks -= mddev->delta_disks;
		mddev->delta_disks = 0;
		mddev->reshape_backwards = 0;
		module_put(pers->owner);
		pr_warn("md: %s: %s would not accept array\n",
			mdname(mddev), clevel);
		rv = PTR_ERR(priv);
		goto out_unlock;
	}

	/* Looks like we have a winner */
	mddev_suspend(mddev);
	mddev_detach(mddev);

	/* swap in the new personality and commit the new_* geometry */
	spin_lock(&mddev->lock);
	oldpers = mddev->pers;
	oldpriv = mddev->private;
	mddev->pers = pers;
	mddev->private = priv;
	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
	mddev->level = mddev->new_level;
	mddev->layout = mddev->new_layout;
	mddev->chunk_sectors = mddev->new_chunk_sectors;
	mddev->delta_disks = 0;
	mddev->reshape_backwards = 0;
	mddev->degraded = 0;
	spin_unlock(&mddev->lock);

	if (oldpers->sync_request == NULL &&
	    mddev->external) {
		/* We are converting from a no-redundancy array
		 * to a redundancy array and metadata is managed
		 * externally so we need to be sure that writes
		 * won't block due to a need to transition
		 *      clean->dirty
		 * until external management is started.
		 */
		mddev->in_sync = 0;
		mddev->safemode_delay = 0;
		mddev->safemode = 0;
	}

	oldpers->free(mddev, oldpriv);

	if (oldpers->sync_request == NULL &&
	    pers->sync_request != NULL) {
		/* need to add the md_redundancy_group */
		if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
			pr_warn("md: cannot register extra attributes for %s\n",
				mdname(mddev));
		mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
	}
	if (oldpers->sync_request != NULL &&
	    pers->sync_request == NULL) {
		/* need to remove the md_redundancy_group */
		if (mddev->to_remove == NULL)
			mddev->to_remove = &md_redundancy_group;
	}

	module_put(oldpers->owner);

	/* first pass: unlink every device whose slot changes ... */
	rdev_for_each(rdev, mddev) {
		if (rdev->raid_disk < 0)
			continue;
		if (rdev->new_raid_disk >= mddev->raid_disks)
			rdev->new_raid_disk = -1;
		if (rdev->new_raid_disk == rdev->raid_disk)
			continue;
		sysfs_unlink_rdev(mddev, rdev);
	}
	/* ... second pass: apply the new slots and relink */
	rdev_for_each(rdev, mddev) {
		if (rdev->raid_disk < 0)
			continue;
		if (rdev->new_raid_disk == rdev->raid_disk)
			continue;
		rdev->raid_disk = rdev->new_raid_disk;
		if (rdev->raid_disk < 0)
			clear_bit(In_sync, &rdev->flags);
		else {
			if (sysfs_link_rdev(mddev, rdev))
				pr_warn("md: cannot register rd%d for %s after level change\n",
					rdev->raid_disk, mdname(mddev));
		}
	}

	if (pers->sync_request == NULL) {
		/* this is now an array without redundancy, so
		 * it must always be in_sync
		 */
		mddev->in_sync = 1;
		del_timer_sync(&mddev->safemode_timer);
	}
	blk_set_stacking_limits(&mddev->queue->limits);
	/* NOTE(review): the return value of ->run() is not checked here */
	pers->run(mddev);
	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
	mddev_resume(mddev);
	if (!mddev->thread)
		md_update_sb(mddev, 1);
	sysfs_notify(&mddev->kobj, NULL, "level");
	md_new_event(mddev);
	rv = len;
out_unlock:
	mddev_unlock(mddev);
	return rv;
}

static struct md_sysfs_entry md_level =
__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
3990 
3991 static ssize_t
3992 layout_show(struct mddev *mddev, char *page)
3993 {
3994 	/* just a number, not meaningful for all levels */
3995 	if (mddev->reshape_position != MaxSector &&
3996 	    mddev->layout != mddev->new_layout)
3997 		return sprintf(page, "%d (%d)\n",
3998 			       mddev->new_layout, mddev->layout);
3999 	return sprintf(page, "%d\n", mddev->layout);
4000 }
4001 
4002 static ssize_t
4003 layout_store(struct mddev *mddev, const char *buf, size_t len)
4004 {
4005 	unsigned int n;
4006 	int err;
4007 
4008 	err = kstrtouint(buf, 10, &n);
4009 	if (err < 0)
4010 		return err;
4011 	err = mddev_lock(mddev);
4012 	if (err)
4013 		return err;
4014 
4015 	if (mddev->pers) {
4016 		if (mddev->pers->check_reshape == NULL)
4017 			err = -EBUSY;
4018 		else if (mddev->ro)
4019 			err = -EROFS;
4020 		else {
4021 			mddev->new_layout = n;
4022 			err = mddev->pers->check_reshape(mddev);
4023 			if (err)
4024 				mddev->new_layout = mddev->layout;
4025 		}
4026 	} else {
4027 		mddev->new_layout = n;
4028 		if (mddev->reshape_position == MaxSector)
4029 			mddev->layout = n;
4030 	}
4031 	mddev_unlock(mddev);
4032 	return err ?: len;
4033 }
4034 static struct md_sysfs_entry md_layout =
4035 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
4036 
4037 static ssize_t
4038 raid_disks_show(struct mddev *mddev, char *page)
4039 {
4040 	if (mddev->raid_disks == 0)
4041 		return 0;
4042 	if (mddev->reshape_position != MaxSector &&
4043 	    mddev->delta_disks != 0)
4044 		return sprintf(page, "%d (%d)\n", mddev->raid_disks,
4045 			       mddev->raid_disks - mddev->delta_disks);
4046 	return sprintf(page, "%d\n", mddev->raid_disks);
4047 }
4048 
4049 static int update_raid_disks(struct mddev *mddev, int raid_disks);
4050 
4051 static ssize_t
4052 raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
4053 {
4054 	unsigned int n;
4055 	int err;
4056 
4057 	err = kstrtouint(buf, 10, &n);
4058 	if (err < 0)
4059 		return err;
4060 
4061 	err = mddev_lock(mddev);
4062 	if (err)
4063 		return err;
4064 	if (mddev->pers)
4065 		err = update_raid_disks(mddev, n);
4066 	else if (mddev->reshape_position != MaxSector) {
4067 		struct md_rdev *rdev;
4068 		int olddisks = mddev->raid_disks - mddev->delta_disks;
4069 
4070 		err = -EINVAL;
4071 		rdev_for_each(rdev, mddev) {
4072 			if (olddisks < n &&
4073 			    rdev->data_offset < rdev->new_data_offset)
4074 				goto out_unlock;
4075 			if (olddisks > n &&
4076 			    rdev->data_offset > rdev->new_data_offset)
4077 				goto out_unlock;
4078 		}
4079 		err = 0;
4080 		mddev->delta_disks = n - olddisks;
4081 		mddev->raid_disks = n;
4082 		mddev->reshape_backwards = (mddev->delta_disks < 0);
4083 	} else
4084 		mddev->raid_disks = n;
4085 out_unlock:
4086 	mddev_unlock(mddev);
4087 	return err ? err : len;
4088 }
4089 static struct md_sysfs_entry md_raid_disks =
4090 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
4091 
4092 static ssize_t
4093 chunk_size_show(struct mddev *mddev, char *page)
4094 {
4095 	if (mddev->reshape_position != MaxSector &&
4096 	    mddev->chunk_sectors != mddev->new_chunk_sectors)
4097 		return sprintf(page, "%d (%d)\n",
4098 			       mddev->new_chunk_sectors << 9,
4099 			       mddev->chunk_sectors << 9);
4100 	return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
4101 }
4102 
4103 static ssize_t
4104 chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
4105 {
4106 	unsigned long n;
4107 	int err;
4108 
4109 	err = kstrtoul(buf, 10, &n);
4110 	if (err < 0)
4111 		return err;
4112 
4113 	err = mddev_lock(mddev);
4114 	if (err)
4115 		return err;
4116 	if (mddev->pers) {
4117 		if (mddev->pers->check_reshape == NULL)
4118 			err = -EBUSY;
4119 		else if (mddev->ro)
4120 			err = -EROFS;
4121 		else {
4122 			mddev->new_chunk_sectors = n >> 9;
4123 			err = mddev->pers->check_reshape(mddev);
4124 			if (err)
4125 				mddev->new_chunk_sectors = mddev->chunk_sectors;
4126 		}
4127 	} else {
4128 		mddev->new_chunk_sectors = n >> 9;
4129 		if (mddev->reshape_position == MaxSector)
4130 			mddev->chunk_sectors = n >> 9;
4131 	}
4132 	mddev_unlock(mddev);
4133 	return err ?: len;
4134 }
4135 static struct md_sysfs_entry md_chunk_size =
4136 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
4137 
4138 static ssize_t
4139 resync_start_show(struct mddev *mddev, char *page)
4140 {
4141 	if (mddev->recovery_cp == MaxSector)
4142 		return sprintf(page, "none\n");
4143 	return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
4144 }
4145 
4146 static ssize_t
4147 resync_start_store(struct mddev *mddev, const char *buf, size_t len)
4148 {
4149 	unsigned long long n;
4150 	int err;
4151 
4152 	if (cmd_match(buf, "none"))
4153 		n = MaxSector;
4154 	else {
4155 		err = kstrtoull(buf, 10, &n);
4156 		if (err < 0)
4157 			return err;
4158 		if (n != (sector_t)n)
4159 			return -EINVAL;
4160 	}
4161 
4162 	err = mddev_lock(mddev);
4163 	if (err)
4164 		return err;
4165 	if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
4166 		err = -EBUSY;
4167 
4168 	if (!err) {
4169 		mddev->recovery_cp = n;
4170 		if (mddev->pers)
4171 			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
4172 	}
4173 	mddev_unlock(mddev);
4174 	return err ?: len;
4175 }
4176 static struct md_sysfs_entry md_resync_start =
4177 __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR,
4178 		resync_start_show, resync_start_store);
4179 
4180 /*
4181  * The array state can be:
4182  *
4183  * clear
4184  *     No devices, no size, no level
4185  *     Equivalent to STOP_ARRAY ioctl
4186  * inactive
4187  *     May have some settings, but array is not active
4188  *        all IO results in error
4189  *     When written, doesn't tear down array, but just stops it
4190  * suspended (not supported yet)
4191  *     All IO requests will block. The array can be reconfigured.
4192  *     Writing this, if accepted, will block until array is quiescent
4193  * readonly
4194  *     no resync can happen.  no superblocks get written.
4195  *     write requests fail
4196  * read-auto
4197  *     like readonly, but behaves like 'clean' on a write request.
4198  *
4199  * clean - no pending writes, but otherwise active.
4200  *     When written to inactive array, starts without resync
4201  *     If a write request arrives then
4202  *       if metadata is known, mark 'dirty' and switch to 'active'.
4203  *       if not known, block and switch to write-pending
4204  *     If written to an active array that has pending writes, then fails.
4205  * active
4206  *     fully active: IO and resync can be happening.
4207  *     When written to inactive array, starts with resync
4208  *
4209  * write-pending
4210  *     clean, but writes are blocked waiting for 'active' to be written.
4211  *
4212  * active-idle
4213  *     like active, but no writes have been seen for a while (100msec).
4214  *
4215  * broken
4216  *     RAID0/LINEAR-only: same as clean, but array is missing a member.
4217  *     It's useful because RAID0/LINEAR mounted-arrays aren't stopped
4218  *     when a member is gone, so this state will at least alert the
4219  *     user that something is wrong.
4220  */
4221 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
4222 		   write_pending, active_idle, broken, bad_word};
4223 static char *array_states[] = {
4224 	"clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
4225 	"write-pending", "active-idle", "broken", NULL };
4226 
/*
 * Return the index of the first entry of the NULL-terminated @list that
 * cmd_match() accepts for @word, or the index of the NULL terminator
 * when nothing matches.
 */
static int match_word(const char *word, char **list)
{
	int idx = 0;

	while (list[idx] && !cmd_match(word, list[idx]))
		idx++;

	return idx;
}
4235 
4236 static ssize_t
4237 array_state_show(struct mddev *mddev, char *page)
4238 {
4239 	enum array_state st = inactive;
4240 
4241 	if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) {
4242 		switch(mddev->ro) {
4243 		case 1:
4244 			st = readonly;
4245 			break;
4246 		case 2:
4247 			st = read_auto;
4248 			break;
4249 		case 0:
4250 			spin_lock(&mddev->lock);
4251 			if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
4252 				st = write_pending;
4253 			else if (mddev->in_sync)
4254 				st = clean;
4255 			else if (mddev->safemode)
4256 				st = active_idle;
4257 			else
4258 				st = active;
4259 			spin_unlock(&mddev->lock);
4260 		}
4261 
4262 		if (test_bit(MD_BROKEN, &mddev->flags) && st == clean)
4263 			st = broken;
4264 	} else {
4265 		if (list_empty(&mddev->disks) &&
4266 		    mddev->raid_disks == 0 &&
4267 		    mddev->dev_sectors == 0)
4268 			st = clear;
4269 		else
4270 			st = inactive;
4271 	}
4272 	return sprintf(page, "%s\n", array_states[st]);
4273 }
4274 
4275 static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev);
4276 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev);
4277 static int do_md_run(struct mddev *mddev);
4278 static int restart_array(struct mddev *mddev);
4279 
4280 static ssize_t
4281 array_state_store(struct mddev *mddev, const char *buf, size_t len)
4282 {
4283 	int err = 0;
4284 	enum array_state st = match_word(buf, array_states);
4285 
4286 	if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) {
4287 		/* don't take reconfig_mutex when toggling between
4288 		 * clean and active
4289 		 */
4290 		spin_lock(&mddev->lock);
4291 		if (st == active) {
4292 			restart_array(mddev);
4293 			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4294 			md_wakeup_thread(mddev->thread);
4295 			wake_up(&mddev->sb_wait);
4296 		} else /* st == clean */ {
4297 			restart_array(mddev);
4298 			if (!set_in_sync(mddev))
4299 				err = -EBUSY;
4300 		}
4301 		if (!err)
4302 			sysfs_notify_dirent_safe(mddev->sysfs_state);
4303 		spin_unlock(&mddev->lock);
4304 		return err ?: len;
4305 	}
4306 	err = mddev_lock(mddev);
4307 	if (err)
4308 		return err;
4309 	err = -EINVAL;
4310 	switch(st) {
4311 	case bad_word:
4312 		break;
4313 	case clear:
4314 		/* stopping an active array */
4315 		err = do_md_stop(mddev, 0, NULL);
4316 		break;
4317 	case inactive:
4318 		/* stopping an active array */
4319 		if (mddev->pers)
4320 			err = do_md_stop(mddev, 2, NULL);
4321 		else
4322 			err = 0; /* already inactive */
4323 		break;
4324 	case suspended:
4325 		break; /* not supported yet */
4326 	case readonly:
4327 		if (mddev->pers)
4328 			err = md_set_readonly(mddev, NULL);
4329 		else {
4330 			mddev->ro = 1;
4331 			set_disk_ro(mddev->gendisk, 1);
4332 			err = do_md_run(mddev);
4333 		}
4334 		break;
4335 	case read_auto:
4336 		if (mddev->pers) {
4337 			if (mddev->ro == 0)
4338 				err = md_set_readonly(mddev, NULL);
4339 			else if (mddev->ro == 1)
4340 				err = restart_array(mddev);
4341 			if (err == 0) {
4342 				mddev->ro = 2;
4343 				set_disk_ro(mddev->gendisk, 0);
4344 			}
4345 		} else {
4346 			mddev->ro = 2;
4347 			err = do_md_run(mddev);
4348 		}
4349 		break;
4350 	case clean:
4351 		if (mddev->pers) {
4352 			err = restart_array(mddev);
4353 			if (err)
4354 				break;
4355 			spin_lock(&mddev->lock);
4356 			if (!set_in_sync(mddev))
4357 				err = -EBUSY;
4358 			spin_unlock(&mddev->lock);
4359 		} else
4360 			err = -EINVAL;
4361 		break;
4362 	case active:
4363 		if (mddev->pers) {
4364 			err = restart_array(mddev);
4365 			if (err)
4366 				break;
4367 			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4368 			wake_up(&mddev->sb_wait);
4369 			err = 0;
4370 		} else {
4371 			mddev->ro = 0;
4372 			set_disk_ro(mddev->gendisk, 0);
4373 			err = do_md_run(mddev);
4374 		}
4375 		break;
4376 	case write_pending:
4377 	case active_idle:
4378 	case broken:
4379 		/* these cannot be set */
4380 		break;
4381 	}
4382 
4383 	if (!err) {
4384 		if (mddev->hold_active == UNTIL_IOCTL)
4385 			mddev->hold_active = 0;
4386 		sysfs_notify_dirent_safe(mddev->sysfs_state);
4387 	}
4388 	mddev_unlock(mddev);
4389 	return err ?: len;
4390 }
4391 static struct md_sysfs_entry md_array_state =
4392 __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
4393 
4394 static ssize_t
4395 max_corrected_read_errors_show(struct mddev *mddev, char *page) {
4396 	return sprintf(page, "%d\n",
4397 		       atomic_read(&mddev->max_corr_read_errors));
4398 }
4399 
4400 static ssize_t
4401 max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
4402 {
4403 	unsigned int n;
4404 	int rv;
4405 
4406 	rv = kstrtouint(buf, 10, &n);
4407 	if (rv < 0)
4408 		return rv;
4409 	atomic_set(&mddev->max_corr_read_errors, n);
4410 	return len;
4411 }
4412 
4413 static struct md_sysfs_entry max_corr_read_errors =
4414 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
4415 	max_corrected_read_errors_store);
4416 
4417 static ssize_t
4418 null_show(struct mddev *mddev, char *page)
4419 {
4420 	return -EINVAL;
4421 }
4422 
4423 static ssize_t
4424 new_dev_store(struct mddev *mddev, const char *buf, size_t len)
4425 {
4426 	/* buf must be %d:%d\n? giving major and minor numbers */
4427 	/* The new device is added to the array.
4428 	 * If the array has a persistent superblock, we read the
4429 	 * superblock to initialise info and check validity.
4430 	 * Otherwise, only checking done is that in bind_rdev_to_array,
4431 	 * which mainly checks size.
4432 	 */
4433 	char *e;
4434 	int major = simple_strtoul(buf, &e, 10);
4435 	int minor;
4436 	dev_t dev;
4437 	struct md_rdev *rdev;
4438 	int err;
4439 
4440 	if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
4441 		return -EINVAL;
4442 	minor = simple_strtoul(e+1, &e, 10);
4443 	if (*e && *e != '\n')
4444 		return -EINVAL;
4445 	dev = MKDEV(major, minor);
4446 	if (major != MAJOR(dev) ||
4447 	    minor != MINOR(dev))
4448 		return -EOVERFLOW;
4449 
4450 	flush_workqueue(md_misc_wq);
4451 
4452 	err = mddev_lock(mddev);
4453 	if (err)
4454 		return err;
4455 	if (mddev->persistent) {
4456 		rdev = md_import_device(dev, mddev->major_version,
4457 					mddev->minor_version);
4458 		if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
4459 			struct md_rdev *rdev0
4460 				= list_entry(mddev->disks.next,
4461 					     struct md_rdev, same_set);
4462 			err = super_types[mddev->major_version]
4463 				.load_super(rdev, rdev0, mddev->minor_version);
4464 			if (err < 0)
4465 				goto out;
4466 		}
4467 	} else if (mddev->external)
4468 		rdev = md_import_device(dev, -2, -1);
4469 	else
4470 		rdev = md_import_device(dev, -1, -1);
4471 
4472 	if (IS_ERR(rdev)) {
4473 		mddev_unlock(mddev);
4474 		return PTR_ERR(rdev);
4475 	}
4476 	err = bind_rdev_to_array(rdev, mddev);
4477  out:
4478 	if (err)
4479 		export_rdev(rdev);
4480 	mddev_unlock(mddev);
4481 	if (!err)
4482 		md_new_event(mddev);
4483 	return err ? err : len;
4484 }
4485 
4486 static struct md_sysfs_entry md_new_device =
4487 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
4488 
4489 static ssize_t
4490 bitmap_store(struct mddev *mddev, const char *buf, size_t len)
4491 {
4492 	char *end;
4493 	unsigned long chunk, end_chunk;
4494 	int err;
4495 
4496 	err = mddev_lock(mddev);
4497 	if (err)
4498 		return err;
4499 	if (!mddev->bitmap)
4500 		goto out;
4501 	/* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
4502 	while (*buf) {
4503 		chunk = end_chunk = simple_strtoul(buf, &end, 0);
4504 		if (buf == end) break;
4505 		if (*end == '-') { /* range */
4506 			buf = end + 1;
4507 			end_chunk = simple_strtoul(buf, &end, 0);
4508 			if (buf == end) break;
4509 		}
4510 		if (*end && !isspace(*end)) break;
4511 		md_bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
4512 		buf = skip_spaces(end);
4513 	}
4514 	md_bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
4515 out:
4516 	mddev_unlock(mddev);
4517 	return len;
4518 }
4519 
4520 static struct md_sysfs_entry md_bitmap =
4521 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
4522 
4523 static ssize_t
4524 size_show(struct mddev *mddev, char *page)
4525 {
4526 	return sprintf(page, "%llu\n",
4527 		(unsigned long long)mddev->dev_sectors / 2);
4528 }
4529 
4530 static int update_size(struct mddev *mddev, sector_t num_sectors);
4531 
4532 static ssize_t
4533 size_store(struct mddev *mddev, const char *buf, size_t len)
4534 {
4535 	/* If array is inactive, we can reduce the component size, but
4536 	 * not increase it (except from 0).
4537 	 * If array is active, we can try an on-line resize
4538 	 */
4539 	sector_t sectors;
4540 	int err = strict_blocks_to_sectors(buf, &sectors);
4541 
4542 	if (err < 0)
4543 		return err;
4544 	err = mddev_lock(mddev);
4545 	if (err)
4546 		return err;
4547 	if (mddev->pers) {
4548 		err = update_size(mddev, sectors);
4549 		if (err == 0)
4550 			md_update_sb(mddev, 1);
4551 	} else {
4552 		if (mddev->dev_sectors == 0 ||
4553 		    mddev->dev_sectors > sectors)
4554 			mddev->dev_sectors = sectors;
4555 		else
4556 			err = -ENOSPC;
4557 	}
4558 	mddev_unlock(mddev);
4559 	return err ? err : len;
4560 }
4561 
4562 static struct md_sysfs_entry md_size =
4563 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
4564 
4565 /* Metadata version.
4566  * This is one of
4567  *   'none' for arrays with no metadata (good luck...)
4568  *   'external' for arrays with externally managed metadata,
4569  * or N.M for internally known formats
4570  */
4571 static ssize_t
4572 metadata_show(struct mddev *mddev, char *page)
4573 {
4574 	if (mddev->persistent)
4575 		return sprintf(page, "%d.%d\n",
4576 			       mddev->major_version, mddev->minor_version);
4577 	else if (mddev->external)
4578 		return sprintf(page, "external:%s\n", mddev->metadata_type);
4579 	else
4580 		return sprintf(page, "none\n");
4581 }
4582 
4583 static ssize_t
4584 metadata_store(struct mddev *mddev, const char *buf, size_t len)
4585 {
4586 	int major, minor;
4587 	char *e;
4588 	int err;
4589 	/* Changing the details of 'external' metadata is
4590 	 * always permitted.  Otherwise there must be
4591 	 * no devices attached to the array.
4592 	 */
4593 
4594 	err = mddev_lock(mddev);
4595 	if (err)
4596 		return err;
4597 	err = -EBUSY;
4598 	if (mddev->external && strncmp(buf, "external:", 9) == 0)
4599 		;
4600 	else if (!list_empty(&mddev->disks))
4601 		goto out_unlock;
4602 
4603 	err = 0;
4604 	if (cmd_match(buf, "none")) {
4605 		mddev->persistent = 0;
4606 		mddev->external = 0;
4607 		mddev->major_version = 0;
4608 		mddev->minor_version = 90;
4609 		goto out_unlock;
4610 	}
4611 	if (strncmp(buf, "external:", 9) == 0) {
4612 		size_t namelen = len-9;
4613 		if (namelen >= sizeof(mddev->metadata_type))
4614 			namelen = sizeof(mddev->metadata_type)-1;
4615 		strncpy(mddev->metadata_type, buf+9, namelen);
4616 		mddev->metadata_type[namelen] = 0;
4617 		if (namelen && mddev->metadata_type[namelen-1] == '\n')
4618 			mddev->metadata_type[--namelen] = 0;
4619 		mddev->persistent = 0;
4620 		mddev->external = 1;
4621 		mddev->major_version = 0;
4622 		mddev->minor_version = 90;
4623 		goto out_unlock;
4624 	}
4625 	major = simple_strtoul(buf, &e, 10);
4626 	err = -EINVAL;
4627 	if (e==buf || *e != '.')
4628 		goto out_unlock;
4629 	buf = e+1;
4630 	minor = simple_strtoul(buf, &e, 10);
4631 	if (e==buf || (*e && *e != '\n') )
4632 		goto out_unlock;
4633 	err = -ENOENT;
4634 	if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
4635 		goto out_unlock;
4636 	mddev->major_version = major;
4637 	mddev->minor_version = minor;
4638 	mddev->persistent = 1;
4639 	mddev->external = 0;
4640 	err = 0;
4641 out_unlock:
4642 	mddev_unlock(mddev);
4643 	return err ?: len;
4644 }
4645 
4646 static struct md_sysfs_entry md_metadata =
4647 __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
4648 
4649 static ssize_t
4650 action_show(struct mddev *mddev, char *page)
4651 {
4652 	char *type = "idle";
4653 	unsigned long recovery = mddev->recovery;
4654 	if (test_bit(MD_RECOVERY_FROZEN, &recovery))
4655 		type = "frozen";
4656 	else if (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
4657 	    (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) {
4658 		if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
4659 			type = "reshape";
4660 		else if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
4661 			if (!test_bit(MD_RECOVERY_REQUESTED, &recovery))
4662 				type = "resync";
4663 			else if (test_bit(MD_RECOVERY_CHECK, &recovery))
4664 				type = "check";
4665 			else
4666 				type = "repair";
4667 		} else if (test_bit(MD_RECOVERY_RECOVER, &recovery))
4668 			type = "recover";
4669 		else if (mddev->reshape_position != MaxSector)
4670 			type = "reshape";
4671 	}
4672 	return sprintf(page, "%s\n", type);
4673 }
4674 
4675 static ssize_t
4676 action_store(struct mddev *mddev, const char *page, size_t len)
4677 {
4678 	if (!mddev->pers || !mddev->pers->sync_request)
4679 		return -EINVAL;
4680 
4681 
4682 	if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
4683 		if (cmd_match(page, "frozen"))
4684 			set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4685 		else
4686 			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4687 		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
4688 		    mddev_lock(mddev) == 0) {
4689 			flush_workqueue(md_misc_wq);
4690 			if (mddev->sync_thread) {
4691 				set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4692 				md_reap_sync_thread(mddev);
4693 			}
4694 			mddev_unlock(mddev);
4695 		}
4696 	} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4697 		return -EBUSY;
4698 	else if (cmd_match(page, "resync"))
4699 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4700 	else if (cmd_match(page, "recover")) {
4701 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4702 		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
4703 	} else if (cmd_match(page, "reshape")) {
4704 		int err;
4705 		if (mddev->pers->start_reshape == NULL)
4706 			return -EINVAL;
4707 		err = mddev_lock(mddev);
4708 		if (!err) {
4709 			if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4710 				err =  -EBUSY;
4711 			else {
4712 				clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4713 				err = mddev->pers->start_reshape(mddev);
4714 			}
4715 			mddev_unlock(mddev);
4716 		}
4717 		if (err)
4718 			return err;
4719 		sysfs_notify(&mddev->kobj, NULL, "degraded");
4720 	} else {
4721 		if (cmd_match(page, "check"))
4722 			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4723 		else if (!cmd_match(page, "repair"))
4724 			return -EINVAL;
4725 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4726 		set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
4727 		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4728 	}
4729 	if (mddev->ro == 2) {
4730 		/* A write to sync_action is enough to justify
4731 		 * canceling read-auto mode
4732 		 */
4733 		mddev->ro = 0;
4734 		md_wakeup_thread(mddev->sync_thread);
4735 	}
4736 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4737 	md_wakeup_thread(mddev->thread);
4738 	sysfs_notify_dirent_safe(mddev->sysfs_action);
4739 	return len;
4740 }
4741 
4742 static struct md_sysfs_entry md_scan_mode =
4743 __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
4744 
4745 static ssize_t
4746 last_sync_action_show(struct mddev *mddev, char *page)
4747 {
4748 	return sprintf(page, "%s\n", mddev->last_sync_action);
4749 }
4750 
4751 static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action);
4752 
4753 static ssize_t
4754 mismatch_cnt_show(struct mddev *mddev, char *page)
4755 {
4756 	return sprintf(page, "%llu\n",
4757 		       (unsigned long long)
4758 		       atomic64_read(&mddev->resync_mismatches));
4759 }
4760 
4761 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
4762 
4763 static ssize_t
4764 sync_min_show(struct mddev *mddev, char *page)
4765 {
4766 	return sprintf(page, "%d (%s)\n", speed_min(mddev),
4767 		       mddev->sync_speed_min ? "local": "system");
4768 }
4769 
4770 static ssize_t
4771 sync_min_store(struct mddev *mddev, const char *buf, size_t len)
4772 {
4773 	unsigned int min;
4774 	int rv;
4775 
4776 	if (strncmp(buf, "system", 6)==0) {
4777 		min = 0;
4778 	} else {
4779 		rv = kstrtouint(buf, 10, &min);
4780 		if (rv < 0)
4781 			return rv;
4782 		if (min == 0)
4783 			return -EINVAL;
4784 	}
4785 	mddev->sync_speed_min = min;
4786 	return len;
4787 }
4788 
4789 static struct md_sysfs_entry md_sync_min =
4790 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
4791 
4792 static ssize_t
4793 sync_max_show(struct mddev *mddev, char *page)
4794 {
4795 	return sprintf(page, "%d (%s)\n", speed_max(mddev),
4796 		       mddev->sync_speed_max ? "local": "system");
4797 }
4798 
4799 static ssize_t
4800 sync_max_store(struct mddev *mddev, const char *buf, size_t len)
4801 {
4802 	unsigned int max;
4803 	int rv;
4804 
4805 	if (strncmp(buf, "system", 6)==0) {
4806 		max = 0;
4807 	} else {
4808 		rv = kstrtouint(buf, 10, &max);
4809 		if (rv < 0)
4810 			return rv;
4811 		if (max == 0)
4812 			return -EINVAL;
4813 	}
4814 	mddev->sync_speed_max = max;
4815 	return len;
4816 }
4817 
4818 static struct md_sysfs_entry md_sync_max =
4819 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
4820 
4821 static ssize_t
4822 degraded_show(struct mddev *mddev, char *page)
4823 {
4824 	return sprintf(page, "%d\n", mddev->degraded);
4825 }
4826 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
4827 
4828 static ssize_t
4829 sync_force_parallel_show(struct mddev *mddev, char *page)
4830 {
4831 	return sprintf(page, "%d\n", mddev->parallel_resync);
4832 }
4833 
4834 static ssize_t
4835 sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
4836 {
4837 	long n;
4838 
4839 	if (kstrtol(buf, 10, &n))
4840 		return -EINVAL;
4841 
4842 	if (n != 0 && n != 1)
4843 		return -EINVAL;
4844 
4845 	mddev->parallel_resync = n;
4846 
4847 	if (mddev->sync_thread)
4848 		wake_up(&resync_wait);
4849 
4850 	return len;
4851 }
4852 
4853 /* force parallel resync, even with shared block devices */
4854 static struct md_sysfs_entry md_sync_force_parallel =
4855 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
4856        sync_force_parallel_show, sync_force_parallel_store);
4857 
4858 static ssize_t
4859 sync_speed_show(struct mddev *mddev, char *page)
4860 {
4861 	unsigned long resync, dt, db;
4862 	if (mddev->curr_resync == 0)
4863 		return sprintf(page, "none\n");
4864 	resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
4865 	dt = (jiffies - mddev->resync_mark) / HZ;
4866 	if (!dt) dt++;
4867 	db = resync - mddev->resync_mark_cnt;
4868 	return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
4869 }
4870 
4871 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
4872 
4873 static ssize_t
4874 sync_completed_show(struct mddev *mddev, char *page)
4875 {
4876 	unsigned long long max_sectors, resync;
4877 
4878 	if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4879 		return sprintf(page, "none\n");
4880 
4881 	if (mddev->curr_resync == 1 ||
4882 	    mddev->curr_resync == 2)
4883 		return sprintf(page, "delayed\n");
4884 
4885 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
4886 	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4887 		max_sectors = mddev->resync_max_sectors;
4888 	else
4889 		max_sectors = mddev->dev_sectors;
4890 
4891 	resync = mddev->curr_resync_completed;
4892 	return sprintf(page, "%llu / %llu\n", resync, max_sectors);
4893 }
4894 
4895 static struct md_sysfs_entry md_sync_completed =
4896 	__ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL);
4897 
4898 static ssize_t
4899 min_sync_show(struct mddev *mddev, char *page)
4900 {
4901 	return sprintf(page, "%llu\n",
4902 		       (unsigned long long)mddev->resync_min);
4903 }
4904 static ssize_t
4905 min_sync_store(struct mddev *mddev, const char *buf, size_t len)
4906 {
4907 	unsigned long long min;
4908 	int err;
4909 
4910 	if (kstrtoull(buf, 10, &min))
4911 		return -EINVAL;
4912 
4913 	spin_lock(&mddev->lock);
4914 	err = -EINVAL;
4915 	if (min > mddev->resync_max)
4916 		goto out_unlock;
4917 
4918 	err = -EBUSY;
4919 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4920 		goto out_unlock;
4921 
4922 	/* Round down to multiple of 4K for safety */
4923 	mddev->resync_min = round_down(min, 8);
4924 	err = 0;
4925 
4926 out_unlock:
4927 	spin_unlock(&mddev->lock);
4928 	return err ?: len;
4929 }
4930 
4931 static struct md_sysfs_entry md_min_sync =
4932 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
4933 
4934 static ssize_t
4935 max_sync_show(struct mddev *mddev, char *page)
4936 {
4937 	if (mddev->resync_max == MaxSector)
4938 		return sprintf(page, "max\n");
4939 	else
4940 		return sprintf(page, "%llu\n",
4941 			       (unsigned long long)mddev->resync_max);
4942 }
4943 static ssize_t
4944 max_sync_store(struct mddev *mddev, const char *buf, size_t len)
4945 {
4946 	int err;
4947 	spin_lock(&mddev->lock);
4948 	if (strncmp(buf, "max", 3) == 0)
4949 		mddev->resync_max = MaxSector;
4950 	else {
4951 		unsigned long long max;
4952 		int chunk;
4953 
4954 		err = -EINVAL;
4955 		if (kstrtoull(buf, 10, &max))
4956 			goto out_unlock;
4957 		if (max < mddev->resync_min)
4958 			goto out_unlock;
4959 
4960 		err = -EBUSY;
4961 		if (max < mddev->resync_max &&
4962 		    mddev->ro == 0 &&
4963 		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4964 			goto out_unlock;
4965 
4966 		/* Must be a multiple of chunk_size */
4967 		chunk = mddev->chunk_sectors;
4968 		if (chunk) {
4969 			sector_t temp = max;
4970 
4971 			err = -EINVAL;
4972 			if (sector_div(temp, chunk))
4973 				goto out_unlock;
4974 		}
4975 		mddev->resync_max = max;
4976 	}
4977 	wake_up(&mddev->recovery_wait);
4978 	err = 0;
4979 out_unlock:
4980 	spin_unlock(&mddev->lock);
4981 	return err ?: len;
4982 }
4983 
4984 static struct md_sysfs_entry md_max_sync =
4985 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
4986 
4987 static ssize_t
4988 suspend_lo_show(struct mddev *mddev, char *page)
4989 {
4990 	return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
4991 }
4992 
4993 static ssize_t
4994 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
4995 {
4996 	unsigned long long new;
4997 	int err;
4998 
4999 	err = kstrtoull(buf, 10, &new);
5000 	if (err < 0)
5001 		return err;
5002 	if (new != (sector_t)new)
5003 		return -EINVAL;
5004 
5005 	err = mddev_lock(mddev);
5006 	if (err)
5007 		return err;
5008 	err = -EINVAL;
5009 	if (mddev->pers == NULL ||
5010 	    mddev->pers->quiesce == NULL)
5011 		goto unlock;
5012 	mddev_suspend(mddev);
5013 	mddev->suspend_lo = new;
5014 	mddev_resume(mddev);
5015 
5016 	err = 0;
5017 unlock:
5018 	mddev_unlock(mddev);
5019 	return err ?: len;
5020 }
5021 static struct md_sysfs_entry md_suspend_lo =
5022 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
5023 
5024 static ssize_t
5025 suspend_hi_show(struct mddev *mddev, char *page)
5026 {
5027 	return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
5028 }
5029 
5030 static ssize_t
5031 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
5032 {
5033 	unsigned long long new;
5034 	int err;
5035 
5036 	err = kstrtoull(buf, 10, &new);
5037 	if (err < 0)
5038 		return err;
5039 	if (new != (sector_t)new)
5040 		return -EINVAL;
5041 
5042 	err = mddev_lock(mddev);
5043 	if (err)
5044 		return err;
5045 	err = -EINVAL;
5046 	if (mddev->pers == NULL)
5047 		goto unlock;
5048 
5049 	mddev_suspend(mddev);
5050 	mddev->suspend_hi = new;
5051 	mddev_resume(mddev);
5052 
5053 	err = 0;
5054 unlock:
5055 	mddev_unlock(mddev);
5056 	return err ?: len;
5057 }
5058 static struct md_sysfs_entry md_suspend_hi =
5059 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
5060 
5061 static ssize_t
5062 reshape_position_show(struct mddev *mddev, char *page)
5063 {
5064 	if (mddev->reshape_position != MaxSector)
5065 		return sprintf(page, "%llu\n",
5066 			       (unsigned long long)mddev->reshape_position);
5067 	strcpy(page, "none\n");
5068 	return 5;
5069 }
5070 
5071 static ssize_t
5072 reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
5073 {
5074 	struct md_rdev *rdev;
5075 	unsigned long long new;
5076 	int err;
5077 
5078 	err = kstrtoull(buf, 10, &new);
5079 	if (err < 0)
5080 		return err;
5081 	if (new != (sector_t)new)
5082 		return -EINVAL;
5083 	err = mddev_lock(mddev);
5084 	if (err)
5085 		return err;
5086 	err = -EBUSY;
5087 	if (mddev->pers)
5088 		goto unlock;
5089 	mddev->reshape_position = new;
5090 	mddev->delta_disks = 0;
5091 	mddev->reshape_backwards = 0;
5092 	mddev->new_level = mddev->level;
5093 	mddev->new_layout = mddev->layout;
5094 	mddev->new_chunk_sectors = mddev->chunk_sectors;
5095 	rdev_for_each(rdev, mddev)
5096 		rdev->new_data_offset = rdev->data_offset;
5097 	err = 0;
5098 unlock:
5099 	mddev_unlock(mddev);
5100 	return err ?: len;
5101 }
5102 
5103 static struct md_sysfs_entry md_reshape_position =
5104 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
5105        reshape_position_store);
5106 
5107 static ssize_t
5108 reshape_direction_show(struct mddev *mddev, char *page)
5109 {
5110 	return sprintf(page, "%s\n",
5111 		       mddev->reshape_backwards ? "backwards" : "forwards");
5112 }
5113 
5114 static ssize_t
5115 reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
5116 {
5117 	int backwards = 0;
5118 	int err;
5119 
5120 	if (cmd_match(buf, "forwards"))
5121 		backwards = 0;
5122 	else if (cmd_match(buf, "backwards"))
5123 		backwards = 1;
5124 	else
5125 		return -EINVAL;
5126 	if (mddev->reshape_backwards == backwards)
5127 		return len;
5128 
5129 	err = mddev_lock(mddev);
5130 	if (err)
5131 		return err;
5132 	/* check if we are allowed to change */
5133 	if (mddev->delta_disks)
5134 		err = -EBUSY;
5135 	else if (mddev->persistent &&
5136 	    mddev->major_version == 0)
5137 		err =  -EINVAL;
5138 	else
5139 		mddev->reshape_backwards = backwards;
5140 	mddev_unlock(mddev);
5141 	return err ?: len;
5142 }
5143 
5144 static struct md_sysfs_entry md_reshape_direction =
5145 __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
5146        reshape_direction_store);
5147 
5148 static ssize_t
5149 array_size_show(struct mddev *mddev, char *page)
5150 {
5151 	if (mddev->external_size)
5152 		return sprintf(page, "%llu\n",
5153 			       (unsigned long long)mddev->array_sectors/2);
5154 	else
5155 		return sprintf(page, "default\n");
5156 }
5157 
5158 static ssize_t
5159 array_size_store(struct mddev *mddev, const char *buf, size_t len)
5160 {
5161 	sector_t sectors;
5162 	int err;
5163 
5164 	err = mddev_lock(mddev);
5165 	if (err)
5166 		return err;
5167 
5168 	/* cluster raid doesn't support change array_sectors */
5169 	if (mddev_is_clustered(mddev)) {
5170 		mddev_unlock(mddev);
5171 		return -EINVAL;
5172 	}
5173 
5174 	if (strncmp(buf, "default", 7) == 0) {
5175 		if (mddev->pers)
5176 			sectors = mddev->pers->size(mddev, 0, 0);
5177 		else
5178 			sectors = mddev->array_sectors;
5179 
5180 		mddev->external_size = 0;
5181 	} else {
5182 		if (strict_blocks_to_sectors(buf, &sectors) < 0)
5183 			err = -EINVAL;
5184 		else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
5185 			err = -E2BIG;
5186 		else
5187 			mddev->external_size = 1;
5188 	}
5189 
5190 	if (!err) {
5191 		mddev->array_sectors = sectors;
5192 		if (mddev->pers) {
5193 			set_capacity(mddev->gendisk, mddev->array_sectors);
5194 			revalidate_disk(mddev->gendisk);
5195 		}
5196 	}
5197 	mddev_unlock(mddev);
5198 	return err ?: len;
5199 }
5200 
5201 static struct md_sysfs_entry md_array_size =
5202 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
5203        array_size_store);
5204 
5205 static ssize_t
5206 consistency_policy_show(struct mddev *mddev, char *page)
5207 {
5208 	int ret;
5209 
5210 	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
5211 		ret = sprintf(page, "journal\n");
5212 	} else if (test_bit(MD_HAS_PPL, &mddev->flags)) {
5213 		ret = sprintf(page, "ppl\n");
5214 	} else if (mddev->bitmap) {
5215 		ret = sprintf(page, "bitmap\n");
5216 	} else if (mddev->pers) {
5217 		if (mddev->pers->sync_request)
5218 			ret = sprintf(page, "resync\n");
5219 		else
5220 			ret = sprintf(page, "none\n");
5221 	} else {
5222 		ret = sprintf(page, "unknown\n");
5223 	}
5224 
5225 	return ret;
5226 }
5227 
5228 static ssize_t
5229 consistency_policy_store(struct mddev *mddev, const char *buf, size_t len)
5230 {
5231 	int err = 0;
5232 
5233 	if (mddev->pers) {
5234 		if (mddev->pers->change_consistency_policy)
5235 			err = mddev->pers->change_consistency_policy(mddev, buf);
5236 		else
5237 			err = -EBUSY;
5238 	} else if (mddev->external && strncmp(buf, "ppl", 3) == 0) {
5239 		set_bit(MD_HAS_PPL, &mddev->flags);
5240 	} else {
5241 		err = -EINVAL;
5242 	}
5243 
5244 	return err ? err : len;
5245 }
5246 
5247 static struct md_sysfs_entry md_consistency_policy =
5248 __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
5249        consistency_policy_store);
5250 
5251 static ssize_t fail_last_dev_show(struct mddev *mddev, char *page)
5252 {
5253 	return sprintf(page, "%d\n", mddev->fail_last_dev);
5254 }
5255 
5256 /*
5257  * Setting fail_last_dev to true to allow last device to be forcibly removed
5258  * from RAID1/RAID10.
5259  */
5260 static ssize_t
5261 fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len)
5262 {
5263 	int ret;
5264 	bool value;
5265 
5266 	ret = kstrtobool(buf, &value);
5267 	if (ret)
5268 		return ret;
5269 
5270 	if (value != mddev->fail_last_dev)
5271 		mddev->fail_last_dev = value;
5272 
5273 	return len;
5274 }
5275 static struct md_sysfs_entry md_fail_last_dev =
5276 __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show,
5277        fail_last_dev_store);
5278 
5279 static struct attribute *md_default_attrs[] = {
5280 	&md_level.attr,
5281 	&md_layout.attr,
5282 	&md_raid_disks.attr,
5283 	&md_chunk_size.attr,
5284 	&md_size.attr,
5285 	&md_resync_start.attr,
5286 	&md_metadata.attr,
5287 	&md_new_device.attr,
5288 	&md_safe_delay.attr,
5289 	&md_array_state.attr,
5290 	&md_reshape_position.attr,
5291 	&md_reshape_direction.attr,
5292 	&md_array_size.attr,
5293 	&max_corr_read_errors.attr,
5294 	&md_consistency_policy.attr,
5295 	&md_fail_last_dev.attr,
5296 	NULL,
5297 };
5298 
5299 static struct attribute *md_redundancy_attrs[] = {
5300 	&md_scan_mode.attr,
5301 	&md_last_scan_mode.attr,
5302 	&md_mismatches.attr,
5303 	&md_sync_min.attr,
5304 	&md_sync_max.attr,
5305 	&md_sync_speed.attr,
5306 	&md_sync_force_parallel.attr,
5307 	&md_sync_completed.attr,
5308 	&md_min_sync.attr,
5309 	&md_max_sync.attr,
5310 	&md_suspend_lo.attr,
5311 	&md_suspend_hi.attr,
5312 	&md_bitmap.attr,
5313 	&md_degraded.attr,
5314 	NULL,
5315 };
5316 static struct attribute_group md_redundancy_group = {
5317 	.name = NULL,
5318 	.attrs = md_redundancy_attrs,
5319 };
5320 
5321 static ssize_t
5322 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
5323 {
5324 	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
5325 	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
5326 	ssize_t rv;
5327 
5328 	if (!entry->show)
5329 		return -EIO;
5330 	spin_lock(&all_mddevs_lock);
5331 	if (list_empty(&mddev->all_mddevs)) {
5332 		spin_unlock(&all_mddevs_lock);
5333 		return -EBUSY;
5334 	}
5335 	mddev_get(mddev);
5336 	spin_unlock(&all_mddevs_lock);
5337 
5338 	rv = entry->show(mddev, page);
5339 	mddev_put(mddev);
5340 	return rv;
5341 }
5342 
5343 static ssize_t
5344 md_attr_store(struct kobject *kobj, struct attribute *attr,
5345 	      const char *page, size_t length)
5346 {
5347 	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
5348 	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
5349 	ssize_t rv;
5350 
5351 	if (!entry->store)
5352 		return -EIO;
5353 	if (!capable(CAP_SYS_ADMIN))
5354 		return -EACCES;
5355 	spin_lock(&all_mddevs_lock);
5356 	if (list_empty(&mddev->all_mddevs)) {
5357 		spin_unlock(&all_mddevs_lock);
5358 		return -EBUSY;
5359 	}
5360 	mddev_get(mddev);
5361 	spin_unlock(&all_mddevs_lock);
5362 	rv = entry->store(mddev, page, length);
5363 	mddev_put(mddev);
5364 	return rv;
5365 }
5366 
5367 static void md_free(struct kobject *ko)
5368 {
5369 	struct mddev *mddev = container_of(ko, struct mddev, kobj);
5370 
5371 	if (mddev->sysfs_state)
5372 		sysfs_put(mddev->sysfs_state);
5373 
5374 	if (mddev->gendisk)
5375 		del_gendisk(mddev->gendisk);
5376 	if (mddev->queue)
5377 		blk_cleanup_queue(mddev->queue);
5378 	if (mddev->gendisk)
5379 		put_disk(mddev->gendisk);
5380 	percpu_ref_exit(&mddev->writes_pending);
5381 
5382 	bioset_exit(&mddev->bio_set);
5383 	bioset_exit(&mddev->sync_set);
5384 	kfree(mddev);
5385 }
5386 
5387 static const struct sysfs_ops md_sysfs_ops = {
5388 	.show	= md_attr_show,
5389 	.store	= md_attr_store,
5390 };
5391 static struct kobj_type md_ktype = {
5392 	.release	= md_free,
5393 	.sysfs_ops	= &md_sysfs_ops,
5394 	.default_attrs	= md_default_attrs,
5395 };
5396 
5397 int mdp_major = 0;
5398 
5399 static void mddev_delayed_delete(struct work_struct *ws)
5400 {
5401 	struct mddev *mddev = container_of(ws, struct mddev, del_work);
5402 
5403 	sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
5404 	kobject_del(&mddev->kobj);
5405 	kobject_put(&mddev->kobj);
5406 }
5407 
5408 static void no_op(struct percpu_ref *r) {}
5409 
5410 int mddev_init_writes_pending(struct mddev *mddev)
5411 {
5412 	if (mddev->writes_pending.percpu_count_ptr)
5413 		return 0;
5414 	if (percpu_ref_init(&mddev->writes_pending, no_op,
5415 			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL) < 0)
5416 		return -ENOMEM;
5417 	/* We want to start with the refcount at zero */
5418 	percpu_ref_put(&mddev->writes_pending);
5419 	return 0;
5420 }
5421 EXPORT_SYMBOL_GPL(mddev_init_writes_pending);
5422 
5423 static int md_alloc(dev_t dev, char *name)
5424 {
5425 	/*
5426 	 * If dev is zero, name is the name of a device to allocate with
5427 	 * an arbitrary minor number.  It will be "md_???"
5428 	 * If dev is non-zero it must be a device number with a MAJOR of
5429 	 * MD_MAJOR or mdp_major.  In this case, if "name" is NULL, then
5430 	 * the device is being created by opening a node in /dev.
5431 	 * If "name" is not NULL, the device is being created by
5432 	 * writing to /sys/module/md_mod/parameters/new_array.
5433 	 */
5434 	static DEFINE_MUTEX(disks_mutex);
5435 	struct mddev *mddev = mddev_find(dev);
5436 	struct gendisk *disk;
5437 	int partitioned;
5438 	int shift;
5439 	int unit;
5440 	int error;
5441 
5442 	if (!mddev)
5443 		return -ENODEV;
5444 
5445 	partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
5446 	shift = partitioned ? MdpMinorShift : 0;
5447 	unit = MINOR(mddev->unit) >> shift;
5448 
5449 	/* wait for any previous instance of this device to be
5450 	 * completely removed (mddev_delayed_delete).
5451 	 */
5452 	flush_workqueue(md_misc_wq);
5453 
5454 	mutex_lock(&disks_mutex);
5455 	error = -EEXIST;
5456 	if (mddev->gendisk)
5457 		goto abort;
5458 
5459 	if (name && !dev) {
5460 		/* Need to ensure that 'name' is not a duplicate.
5461 		 */
5462 		struct mddev *mddev2;
5463 		spin_lock(&all_mddevs_lock);
5464 
5465 		list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
5466 			if (mddev2->gendisk &&
5467 			    strcmp(mddev2->gendisk->disk_name, name) == 0) {
5468 				spin_unlock(&all_mddevs_lock);
5469 				goto abort;
5470 			}
5471 		spin_unlock(&all_mddevs_lock);
5472 	}
5473 	if (name && dev)
5474 		/*
5475 		 * Creating /dev/mdNNN via "newarray", so adjust hold_active.
5476 		 */
5477 		mddev->hold_active = UNTIL_STOP;
5478 
5479 	error = -ENOMEM;
5480 	mddev->queue = blk_alloc_queue(GFP_KERNEL);
5481 	if (!mddev->queue)
5482 		goto abort;
5483 	mddev->queue->queuedata = mddev;
5484 
5485 	blk_queue_make_request(mddev->queue, md_make_request);
5486 	blk_set_stacking_limits(&mddev->queue->limits);
5487 
5488 	disk = alloc_disk(1 << shift);
5489 	if (!disk) {
5490 		blk_cleanup_queue(mddev->queue);
5491 		mddev->queue = NULL;
5492 		goto abort;
5493 	}
5494 	disk->major = MAJOR(mddev->unit);
5495 	disk->first_minor = unit << shift;
5496 	if (name)
5497 		strcpy(disk->disk_name, name);
5498 	else if (partitioned)
5499 		sprintf(disk->disk_name, "md_d%d", unit);
5500 	else
5501 		sprintf(disk->disk_name, "md%d", unit);
5502 	disk->fops = &md_fops;
5503 	disk->private_data = mddev;
5504 	disk->queue = mddev->queue;
5505 	blk_queue_write_cache(mddev->queue, true, true);
5506 	/* Allow extended partitions.  This makes the
5507 	 * 'mdp' device redundant, but we can't really
5508 	 * remove it now.
5509 	 */
5510 	disk->flags |= GENHD_FL_EXT_DEVT;
5511 	mddev->gendisk = disk;
5512 	/* As soon as we call add_disk(), another thread could get
5513 	 * through to md_open, so make sure it doesn't get too far
5514 	 */
5515 	mutex_lock(&mddev->open_mutex);
5516 	add_disk(disk);
5517 
5518 	error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md");
5519 	if (error) {
5520 		/* This isn't possible, but as kobject_init_and_add is marked
5521 		 * __must_check, we must do something with the result
5522 		 */
5523 		pr_debug("md: cannot register %s/md - name in use\n",
5524 			 disk->disk_name);
5525 		error = 0;
5526 	}
5527 	if (mddev->kobj.sd &&
5528 	    sysfs_create_group(&mddev->kobj, &md_bitmap_group))
5529 		pr_debug("pointless warning\n");
5530 	mutex_unlock(&mddev->open_mutex);
5531  abort:
5532 	mutex_unlock(&disks_mutex);
5533 	if (!error && mddev->kobj.sd) {
5534 		kobject_uevent(&mddev->kobj, KOBJ_ADD);
5535 		mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
5536 	}
5537 	mddev_put(mddev);
5538 	return error;
5539 }
5540 
5541 static struct kobject *md_probe(dev_t dev, int *part, void *data)
5542 {
5543 	if (create_on_open)
5544 		md_alloc(dev, NULL);
5545 	return NULL;
5546 }
5547 
5548 static int add_named_array(const char *val, const struct kernel_param *kp)
5549 {
5550 	/*
5551 	 * val must be "md_*" or "mdNNN".
5552 	 * For "md_*" we allocate an array with a large free minor number, and
5553 	 * set the name to val.  val must not already be an active name.
5554 	 * For "mdNNN" we allocate an array with the minor number NNN
5555 	 * which must not already be in use.
5556 	 */
5557 	int len = strlen(val);
5558 	char buf[DISK_NAME_LEN];
5559 	unsigned long devnum;
5560 
5561 	while (len && val[len-1] == '\n')
5562 		len--;
5563 	if (len >= DISK_NAME_LEN)
5564 		return -E2BIG;
5565 	strlcpy(buf, val, len+1);
5566 	if (strncmp(buf, "md_", 3) == 0)
5567 		return md_alloc(0, buf);
5568 	if (strncmp(buf, "md", 2) == 0 &&
5569 	    isdigit(buf[2]) &&
5570 	    kstrtoul(buf+2, 10, &devnum) == 0 &&
5571 	    devnum <= MINORMASK)
5572 		return md_alloc(MKDEV(MD_MAJOR, devnum), NULL);
5573 
5574 	return -EINVAL;
5575 }
5576 
5577 static void md_safemode_timeout(struct timer_list *t)
5578 {
5579 	struct mddev *mddev = from_timer(mddev, t, safemode_timer);
5580 
5581 	mddev->safemode = 1;
5582 	if (mddev->external)
5583 		sysfs_notify_dirent_safe(mddev->sysfs_state);
5584 
5585 	md_wakeup_thread(mddev->thread);
5586 }
5587 
5588 static int start_dirty_degraded;
5589 
5590 int md_run(struct mddev *mddev)
5591 {
5592 	int err;
5593 	struct md_rdev *rdev;
5594 	struct md_personality *pers;
5595 
5596 	if (list_empty(&mddev->disks))
5597 		/* cannot run an array with no devices.. */
5598 		return -EINVAL;
5599 
5600 	if (mddev->pers)
5601 		return -EBUSY;
5602 	/* Cannot run until previous stop completes properly */
5603 	if (mddev->sysfs_active)
5604 		return -EBUSY;
5605 
5606 	/*
5607 	 * Analyze all RAID superblock(s)
5608 	 */
5609 	if (!mddev->raid_disks) {
5610 		if (!mddev->persistent)
5611 			return -EINVAL;
5612 		err = analyze_sbs(mddev);
5613 		if (err)
5614 			return -EINVAL;
5615 	}
5616 
5617 	if (mddev->level != LEVEL_NONE)
5618 		request_module("md-level-%d", mddev->level);
5619 	else if (mddev->clevel[0])
5620 		request_module("md-%s", mddev->clevel);
5621 
5622 	/*
5623 	 * Drop all container device buffers, from now on
5624 	 * the only valid external interface is through the md
5625 	 * device.
5626 	 */
5627 	mddev->has_superblocks = false;
5628 	rdev_for_each(rdev, mddev) {
5629 		if (test_bit(Faulty, &rdev->flags))
5630 			continue;
5631 		sync_blockdev(rdev->bdev);
5632 		invalidate_bdev(rdev->bdev);
5633 		if (mddev->ro != 1 &&
5634 		    (bdev_read_only(rdev->bdev) ||
5635 		     bdev_read_only(rdev->meta_bdev))) {
5636 			mddev->ro = 1;
5637 			if (mddev->gendisk)
5638 				set_disk_ro(mddev->gendisk, 1);
5639 		}
5640 
5641 		if (rdev->sb_page)
5642 			mddev->has_superblocks = true;
5643 
5644 		/* perform some consistency tests on the device.
5645 		 * We don't want the data to overlap the metadata,
5646 		 * Internal Bitmap issues have been handled elsewhere.
5647 		 */
5648 		if (rdev->meta_bdev) {
5649 			/* Nothing to check */;
5650 		} else if (rdev->data_offset < rdev->sb_start) {
5651 			if (mddev->dev_sectors &&
5652 			    rdev->data_offset + mddev->dev_sectors
5653 			    > rdev->sb_start) {
5654 				pr_warn("md: %s: data overlaps metadata\n",
5655 					mdname(mddev));
5656 				return -EINVAL;
5657 			}
5658 		} else {
5659 			if (rdev->sb_start + rdev->sb_size/512
5660 			    > rdev->data_offset) {
5661 				pr_warn("md: %s: metadata overlaps data\n",
5662 					mdname(mddev));
5663 				return -EINVAL;
5664 			}
5665 		}
5666 		sysfs_notify_dirent_safe(rdev->sysfs_state);
5667 	}
5668 
5669 	if (!bioset_initialized(&mddev->bio_set)) {
5670 		err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
5671 		if (err)
5672 			return err;
5673 	}
5674 	if (!bioset_initialized(&mddev->sync_set)) {
5675 		err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
5676 		if (err)
5677 			return err;
5678 	}
5679 
5680 	spin_lock(&pers_lock);
5681 	pers = find_pers(mddev->level, mddev->clevel);
5682 	if (!pers || !try_module_get(pers->owner)) {
5683 		spin_unlock(&pers_lock);
5684 		if (mddev->level != LEVEL_NONE)
5685 			pr_warn("md: personality for level %d is not loaded!\n",
5686 				mddev->level);
5687 		else
5688 			pr_warn("md: personality for level %s is not loaded!\n",
5689 				mddev->clevel);
5690 		err = -EINVAL;
5691 		goto abort;
5692 	}
5693 	spin_unlock(&pers_lock);
5694 	if (mddev->level != pers->level) {
5695 		mddev->level = pers->level;
5696 		mddev->new_level = pers->level;
5697 	}
5698 	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
5699 
5700 	if (mddev->reshape_position != MaxSector &&
5701 	    pers->start_reshape == NULL) {
5702 		/* This personality cannot handle reshaping... */
5703 		module_put(pers->owner);
5704 		err = -EINVAL;
5705 		goto abort;
5706 	}
5707 
5708 	if (pers->sync_request) {
5709 		/* Warn if this is a potentially silly
5710 		 * configuration.
5711 		 */
5712 		char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
5713 		struct md_rdev *rdev2;
5714 		int warned = 0;
5715 
5716 		rdev_for_each(rdev, mddev)
5717 			rdev_for_each(rdev2, mddev) {
5718 				if (rdev < rdev2 &&
5719 				    rdev->bdev->bd_contains ==
5720 				    rdev2->bdev->bd_contains) {
5721 					pr_warn("%s: WARNING: %s appears to be on the same physical disk as %s.\n",
5722 						mdname(mddev),
5723 						bdevname(rdev->bdev,b),
5724 						bdevname(rdev2->bdev,b2));
5725 					warned = 1;
5726 				}
5727 			}
5728 
5729 		if (warned)
5730 			pr_warn("True protection against single-disk failure might be compromised.\n");
5731 	}
5732 
5733 	mddev->recovery = 0;
5734 	/* may be over-ridden by personality */
5735 	mddev->resync_max_sectors = mddev->dev_sectors;
5736 
5737 	mddev->ok_start_degraded = start_dirty_degraded;
5738 
5739 	if (start_readonly && mddev->ro == 0)
5740 		mddev->ro = 2; /* read-only, but switch on first write */
5741 
5742 	err = pers->run(mddev);
5743 	if (err)
5744 		pr_warn("md: pers->run() failed ...\n");
5745 	else if (pers->size(mddev, 0, 0) < mddev->array_sectors) {
5746 		WARN_ONCE(!mddev->external_size,
5747 			  "%s: default size too small, but 'external_size' not in effect?\n",
5748 			  __func__);
5749 		pr_warn("md: invalid array_size %llu > default size %llu\n",
5750 			(unsigned long long)mddev->array_sectors / 2,
5751 			(unsigned long long)pers->size(mddev, 0, 0) / 2);
5752 		err = -EINVAL;
5753 	}
5754 	if (err == 0 && pers->sync_request &&
5755 	    (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
5756 		struct bitmap *bitmap;
5757 
5758 		bitmap = md_bitmap_create(mddev, -1);
5759 		if (IS_ERR(bitmap)) {
5760 			err = PTR_ERR(bitmap);
5761 			pr_warn("%s: failed to create bitmap (%d)\n",
5762 				mdname(mddev), err);
5763 		} else
5764 			mddev->bitmap = bitmap;
5765 
5766 	}
5767 	if (err)
5768 		goto bitmap_abort;
5769 
5770 	if (mddev->bitmap_info.max_write_behind > 0) {
5771 		bool creat_pool = false;
5772 
5773 		rdev_for_each(rdev, mddev) {
5774 			if (test_bit(WriteMostly, &rdev->flags) &&
5775 			    rdev_init_wb(rdev))
5776 				creat_pool = true;
5777 		}
5778 		if (creat_pool && mddev->wb_info_pool == NULL) {
5779 			mddev->wb_info_pool =
5780 				mempool_create_kmalloc_pool(NR_WB_INFOS,
5781 						    sizeof(struct wb_info));
5782 			if (!mddev->wb_info_pool) {
5783 				err = -ENOMEM;
5784 				goto bitmap_abort;
5785 			}
5786 		}
5787 	}
5788 
5789 	if (mddev->queue) {
5790 		bool nonrot = true;
5791 
5792 		rdev_for_each(rdev, mddev) {
5793 			if (rdev->raid_disk >= 0 &&
5794 			    !blk_queue_nonrot(bdev_get_queue(rdev->bdev))) {
5795 				nonrot = false;
5796 				break;
5797 			}
5798 		}
5799 		if (mddev->degraded)
5800 			nonrot = false;
5801 		if (nonrot)
5802 			blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue);
5803 		else
5804 			blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue);
5805 		mddev->queue->backing_dev_info->congested_data = mddev;
5806 		mddev->queue->backing_dev_info->congested_fn = md_congested;
5807 	}
5808 	if (pers->sync_request) {
5809 		if (mddev->kobj.sd &&
5810 		    sysfs_create_group(&mddev->kobj, &md_redundancy_group))
5811 			pr_warn("md: cannot register extra attributes for %s\n",
5812 				mdname(mddev));
5813 		mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
5814 	} else if (mddev->ro == 2) /* auto-readonly not meaningful */
5815 		mddev->ro = 0;
5816 
5817 	atomic_set(&mddev->max_corr_read_errors,
5818 		   MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
5819 	mddev->safemode = 0;
5820 	if (mddev_is_clustered(mddev))
5821 		mddev->safemode_delay = 0;
5822 	else
5823 		mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
5824 	mddev->in_sync = 1;
5825 	smp_wmb();
5826 	spin_lock(&mddev->lock);
5827 	mddev->pers = pers;
5828 	spin_unlock(&mddev->lock);
5829 	rdev_for_each(rdev, mddev)
5830 		if (rdev->raid_disk >= 0)
5831 			sysfs_link_rdev(mddev, rdev); /* failure here is OK */
5832 
5833 	if (mddev->degraded && !mddev->ro)
5834 		/* This ensures that recovering status is reported immediately
5835 		 * via sysfs - until a lack of spares is confirmed.
5836 		 */
5837 		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5838 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5839 
5840 	if (mddev->sb_flags)
5841 		md_update_sb(mddev, 0);
5842 
5843 	md_new_event(mddev);
5844 	return 0;
5845 
5846 bitmap_abort:
5847 	mddev_detach(mddev);
5848 	if (mddev->private)
5849 		pers->free(mddev, mddev->private);
5850 	mddev->private = NULL;
5851 	module_put(pers->owner);
5852 	md_bitmap_destroy(mddev);
5853 abort:
5854 	bioset_exit(&mddev->bio_set);
5855 	bioset_exit(&mddev->sync_set);
5856 	return err;
5857 }
5858 EXPORT_SYMBOL_GPL(md_run);
5859 
5860 static int do_md_run(struct mddev *mddev)
5861 {
5862 	int err;
5863 
5864 	set_bit(MD_NOT_READY, &mddev->flags);
5865 	err = md_run(mddev);
5866 	if (err)
5867 		goto out;
5868 	err = md_bitmap_load(mddev);
5869 	if (err) {
5870 		md_bitmap_destroy(mddev);
5871 		goto out;
5872 	}
5873 
5874 	if (mddev_is_clustered(mddev))
5875 		md_allow_write(mddev);
5876 
5877 	/* run start up tasks that require md_thread */
5878 	md_start(mddev);
5879 
5880 	md_wakeup_thread(mddev->thread);
5881 	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
5882 
5883 	set_capacity(mddev->gendisk, mddev->array_sectors);
5884 	revalidate_disk(mddev->gendisk);
5885 	clear_bit(MD_NOT_READY, &mddev->flags);
5886 	mddev->changed = 1;
5887 	kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
5888 	sysfs_notify_dirent_safe(mddev->sysfs_state);
5889 	sysfs_notify_dirent_safe(mddev->sysfs_action);
5890 	sysfs_notify(&mddev->kobj, NULL, "degraded");
5891 out:
5892 	clear_bit(MD_NOT_READY, &mddev->flags);
5893 	return err;
5894 }
5895 
5896 int md_start(struct mddev *mddev)
5897 {
5898 	int ret = 0;
5899 
5900 	if (mddev->pers->start) {
5901 		set_bit(MD_RECOVERY_WAIT, &mddev->recovery);
5902 		md_wakeup_thread(mddev->thread);
5903 		ret = mddev->pers->start(mddev);
5904 		clear_bit(MD_RECOVERY_WAIT, &mddev->recovery);
5905 		md_wakeup_thread(mddev->sync_thread);
5906 	}
5907 	return ret;
5908 }
5909 EXPORT_SYMBOL_GPL(md_start);
5910 
5911 static int restart_array(struct mddev *mddev)
5912 {
5913 	struct gendisk *disk = mddev->gendisk;
5914 	struct md_rdev *rdev;
5915 	bool has_journal = false;
5916 	bool has_readonly = false;
5917 
5918 	/* Complain if it has no devices */
5919 	if (list_empty(&mddev->disks))
5920 		return -ENXIO;
5921 	if (!mddev->pers)
5922 		return -EINVAL;
5923 	if (!mddev->ro)
5924 		return -EBUSY;
5925 
5926 	rcu_read_lock();
5927 	rdev_for_each_rcu(rdev, mddev) {
5928 		if (test_bit(Journal, &rdev->flags) &&
5929 		    !test_bit(Faulty, &rdev->flags))
5930 			has_journal = true;
5931 		if (bdev_read_only(rdev->bdev))
5932 			has_readonly = true;
5933 	}
5934 	rcu_read_unlock();
5935 	if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal)
5936 		/* Don't restart rw with journal missing/faulty */
5937 			return -EINVAL;
5938 	if (has_readonly)
5939 		return -EROFS;
5940 
5941 	mddev->safemode = 0;
5942 	mddev->ro = 0;
5943 	set_disk_ro(disk, 0);
5944 	pr_debug("md: %s switched to read-write mode.\n", mdname(mddev));
5945 	/* Kick recovery or resync if necessary */
5946 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5947 	md_wakeup_thread(mddev->thread);
5948 	md_wakeup_thread(mddev->sync_thread);
5949 	sysfs_notify_dirent_safe(mddev->sysfs_state);
5950 	return 0;
5951 }
5952 
5953 static void md_clean(struct mddev *mddev)
5954 {
5955 	mddev->array_sectors = 0;
5956 	mddev->external_size = 0;
5957 	mddev->dev_sectors = 0;
5958 	mddev->raid_disks = 0;
5959 	mddev->recovery_cp = 0;
5960 	mddev->resync_min = 0;
5961 	mddev->resync_max = MaxSector;
5962 	mddev->reshape_position = MaxSector;
5963 	mddev->external = 0;
5964 	mddev->persistent = 0;
5965 	mddev->level = LEVEL_NONE;
5966 	mddev->clevel[0] = 0;
5967 	mddev->flags = 0;
5968 	mddev->sb_flags = 0;
5969 	mddev->ro = 0;
5970 	mddev->metadata_type[0] = 0;
5971 	mddev->chunk_sectors = 0;
5972 	mddev->ctime = mddev->utime = 0;
5973 	mddev->layout = 0;
5974 	mddev->max_disks = 0;
5975 	mddev->events = 0;
5976 	mddev->can_decrease_events = 0;
5977 	mddev->delta_disks = 0;
5978 	mddev->reshape_backwards = 0;
5979 	mddev->new_level = LEVEL_NONE;
5980 	mddev->new_layout = 0;
5981 	mddev->new_chunk_sectors = 0;
5982 	mddev->curr_resync = 0;
5983 	atomic64_set(&mddev->resync_mismatches, 0);
5984 	mddev->suspend_lo = mddev->suspend_hi = 0;
5985 	mddev->sync_speed_min = mddev->sync_speed_max = 0;
5986 	mddev->recovery = 0;
5987 	mddev->in_sync = 0;
5988 	mddev->changed = 0;
5989 	mddev->degraded = 0;
5990 	mddev->safemode = 0;
5991 	mddev->private = NULL;
5992 	mddev->cluster_info = NULL;
5993 	mddev->bitmap_info.offset = 0;
5994 	mddev->bitmap_info.default_offset = 0;
5995 	mddev->bitmap_info.default_space = 0;
5996 	mddev->bitmap_info.chunksize = 0;
5997 	mddev->bitmap_info.daemon_sleep = 0;
5998 	mddev->bitmap_info.max_write_behind = 0;
5999 	mddev->bitmap_info.nodes = 0;
6000 }
6001 
6002 static void __md_stop_writes(struct mddev *mddev)
6003 {
6004 	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6005 	flush_workqueue(md_misc_wq);
6006 	if (mddev->sync_thread) {
6007 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6008 		md_reap_sync_thread(mddev);
6009 	}
6010 
6011 	del_timer_sync(&mddev->safemode_timer);
6012 
6013 	if (mddev->pers && mddev->pers->quiesce) {
6014 		mddev->pers->quiesce(mddev, 1);
6015 		mddev->pers->quiesce(mddev, 0);
6016 	}
6017 	md_bitmap_flush(mddev);
6018 
6019 	if (mddev->ro == 0 &&
6020 	    ((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
6021 	     mddev->sb_flags)) {
6022 		/* mark array as shutdown cleanly */
6023 		if (!mddev_is_clustered(mddev))
6024 			mddev->in_sync = 1;
6025 		md_update_sb(mddev, 1);
6026 	}
6027 	mempool_destroy(mddev->wb_info_pool);
6028 	mddev->wb_info_pool = NULL;
6029 }
6030 
/* Locked wrapper around __md_stop_writes(). */
void md_stop_writes(struct mddev *mddev)
{
	mddev_lock_nointr(mddev);

	__md_stop_writes(mddev);

	mddev_unlock(mddev);
}
EXPORT_SYMBOL_GPL(md_stop_writes);
6038 
6039 static void mddev_detach(struct mddev *mddev)
6040 {
6041 	md_bitmap_wait_behind_writes(mddev);
6042 	if (mddev->pers && mddev->pers->quiesce) {
6043 		mddev->pers->quiesce(mddev, 1);
6044 		mddev->pers->quiesce(mddev, 0);
6045 	}
6046 	md_unregister_thread(&mddev->thread);
6047 	if (mddev->queue)
6048 		blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
6049 }
6050 
6051 static void __md_stop(struct mddev *mddev)
6052 {
6053 	struct md_personality *pers = mddev->pers;
6054 	md_bitmap_destroy(mddev);
6055 	mddev_detach(mddev);
6056 	/* Ensure ->event_work is done */
6057 	flush_workqueue(md_misc_wq);
6058 	spin_lock(&mddev->lock);
6059 	mddev->pers = NULL;
6060 	spin_unlock(&mddev->lock);
6061 	pers->free(mddev, mddev->private);
6062 	mddev->private = NULL;
6063 	if (pers->sync_request && mddev->to_remove == NULL)
6064 		mddev->to_remove = &md_redundancy_group;
6065 	module_put(pers->owner);
6066 	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6067 }
6068 
6069 void md_stop(struct mddev *mddev)
6070 {
6071 	/* stop the array and free an attached data structures.
6072 	 * This is called from dm-raid
6073 	 */
6074 	__md_stop(mddev);
6075 	bioset_exit(&mddev->bio_set);
6076 	bioset_exit(&mddev->sync_set);
6077 }
6078 
6079 EXPORT_SYMBOL_GPL(md_stop);
6080 
6081 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
6082 {
6083 	int err = 0;
6084 	int did_freeze = 0;
6085 
6086 	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
6087 		did_freeze = 1;
6088 		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6089 		md_wakeup_thread(mddev->thread);
6090 	}
6091 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
6092 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6093 	if (mddev->sync_thread)
6094 		/* Thread might be blocked waiting for metadata update
6095 		 * which will now never happen */
6096 		wake_up_process(mddev->sync_thread->tsk);
6097 
6098 	if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
6099 		return -EBUSY;
6100 	mddev_unlock(mddev);
6101 	wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING,
6102 					  &mddev->recovery));
6103 	wait_event(mddev->sb_wait,
6104 		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
6105 	mddev_lock_nointr(mddev);
6106 
6107 	mutex_lock(&mddev->open_mutex);
6108 	if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
6109 	    mddev->sync_thread ||
6110 	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
6111 		pr_warn("md: %s still in use.\n",mdname(mddev));
6112 		if (did_freeze) {
6113 			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6114 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6115 			md_wakeup_thread(mddev->thread);
6116 		}
6117 		err = -EBUSY;
6118 		goto out;
6119 	}
6120 	if (mddev->pers) {
6121 		__md_stop_writes(mddev);
6122 
6123 		err  = -ENXIO;
6124 		if (mddev->ro==1)
6125 			goto out;
6126 		mddev->ro = 1;
6127 		set_disk_ro(mddev->gendisk, 1);
6128 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6129 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6130 		md_wakeup_thread(mddev->thread);
6131 		sysfs_notify_dirent_safe(mddev->sysfs_state);
6132 		err = 0;
6133 	}
6134 out:
6135 	mutex_unlock(&mddev->open_mutex);
6136 	return err;
6137 }
6138 
6139 /* mode:
6140  *   0 - completely stop and dis-assemble array
6141  *   2 - stop but do not disassemble array
6142  */
6143 static int do_md_stop(struct mddev *mddev, int mode,
6144 		      struct block_device *bdev)
6145 {
6146 	struct gendisk *disk = mddev->gendisk;
6147 	struct md_rdev *rdev;
6148 	int did_freeze = 0;
6149 
6150 	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
6151 		did_freeze = 1;
6152 		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6153 		md_wakeup_thread(mddev->thread);
6154 	}
6155 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
6156 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6157 	if (mddev->sync_thread)
6158 		/* Thread might be blocked waiting for metadata update
6159 		 * which will now never happen */
6160 		wake_up_process(mddev->sync_thread->tsk);
6161 
6162 	mddev_unlock(mddev);
6163 	wait_event(resync_wait, (mddev->sync_thread == NULL &&
6164 				 !test_bit(MD_RECOVERY_RUNNING,
6165 					   &mddev->recovery)));
6166 	mddev_lock_nointr(mddev);
6167 
6168 	mutex_lock(&mddev->open_mutex);
6169 	if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
6170 	    mddev->sysfs_active ||
6171 	    mddev->sync_thread ||
6172 	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
6173 		pr_warn("md: %s still in use.\n",mdname(mddev));
6174 		mutex_unlock(&mddev->open_mutex);
6175 		if (did_freeze) {
6176 			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6177 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6178 			md_wakeup_thread(mddev->thread);
6179 		}
6180 		return -EBUSY;
6181 	}
6182 	if (mddev->pers) {
6183 		if (mddev->ro)
6184 			set_disk_ro(disk, 0);
6185 
6186 		__md_stop_writes(mddev);
6187 		__md_stop(mddev);
6188 		mddev->queue->backing_dev_info->congested_fn = NULL;
6189 
6190 		/* tell userspace to handle 'inactive' */
6191 		sysfs_notify_dirent_safe(mddev->sysfs_state);
6192 
6193 		rdev_for_each(rdev, mddev)
6194 			if (rdev->raid_disk >= 0)
6195 				sysfs_unlink_rdev(mddev, rdev);
6196 
6197 		set_capacity(disk, 0);
6198 		mutex_unlock(&mddev->open_mutex);
6199 		mddev->changed = 1;
6200 		revalidate_disk(disk);
6201 
6202 		if (mddev->ro)
6203 			mddev->ro = 0;
6204 	} else
6205 		mutex_unlock(&mddev->open_mutex);
6206 	/*
6207 	 * Free resources if final stop
6208 	 */
6209 	if (mode == 0) {
6210 		pr_info("md: %s stopped.\n", mdname(mddev));
6211 
6212 		if (mddev->bitmap_info.file) {
6213 			struct file *f = mddev->bitmap_info.file;
6214 			spin_lock(&mddev->lock);
6215 			mddev->bitmap_info.file = NULL;
6216 			spin_unlock(&mddev->lock);
6217 			fput(f);
6218 		}
6219 		mddev->bitmap_info.offset = 0;
6220 
6221 		export_array(mddev);
6222 
6223 		md_clean(mddev);
6224 		if (mddev->hold_active == UNTIL_STOP)
6225 			mddev->hold_active = 0;
6226 	}
6227 	md_new_event(mddev);
6228 	sysfs_notify_dirent_safe(mddev->sysfs_state);
6229 	return 0;
6230 }
6231 
6232 #ifndef MODULE
6233 static void autorun_array(struct mddev *mddev)
6234 {
6235 	struct md_rdev *rdev;
6236 	int err;
6237 
6238 	if (list_empty(&mddev->disks))
6239 		return;
6240 
6241 	pr_info("md: running: ");
6242 
6243 	rdev_for_each(rdev, mddev) {
6244 		char b[BDEVNAME_SIZE];
6245 		pr_cont("<%s>", bdevname(rdev->bdev,b));
6246 	}
6247 	pr_cont("\n");
6248 
6249 	err = do_md_run(mddev);
6250 	if (err) {
6251 		pr_warn("md: do_md_run() returned %d\n", err);
6252 		do_md_stop(mddev, 0, NULL);
6253 	}
6254 }
6255 
6256 /*
6257  * lets try to run arrays based on all disks that have arrived
6258  * until now. (those are in pending_raid_disks)
6259  *
6260  * the method: pick the first pending disk, collect all disks with
6261  * the same UUID, remove all from the pending list and put them into
6262  * the 'same_array' list. Then order this list based on superblock
6263  * update time (freshest comes first), kick out 'old' disks and
6264  * compare superblocks. If everything's fine then run it.
6265  *
6266  * If "unit" is allocated, then bump its reference count
6267  */
6268 static void autorun_devices(int part)
6269 {
6270 	struct md_rdev *rdev0, *rdev, *tmp;
6271 	struct mddev *mddev;
6272 	char b[BDEVNAME_SIZE];
6273 
6274 	pr_info("md: autorun ...\n");
6275 	while (!list_empty(&pending_raid_disks)) {
6276 		int unit;
6277 		dev_t dev;
6278 		LIST_HEAD(candidates);
6279 		rdev0 = list_entry(pending_raid_disks.next,
6280 					 struct md_rdev, same_set);
6281 
6282 		pr_debug("md: considering %s ...\n", bdevname(rdev0->bdev,b));
6283 		INIT_LIST_HEAD(&candidates);
6284 		rdev_for_each_list(rdev, tmp, &pending_raid_disks)
6285 			if (super_90_load(rdev, rdev0, 0) >= 0) {
6286 				pr_debug("md:  adding %s ...\n",
6287 					 bdevname(rdev->bdev,b));
6288 				list_move(&rdev->same_set, &candidates);
6289 			}
6290 		/*
6291 		 * now we have a set of devices, with all of them having
6292 		 * mostly sane superblocks. It's time to allocate the
6293 		 * mddev.
6294 		 */
6295 		if (part) {
6296 			dev = MKDEV(mdp_major,
6297 				    rdev0->preferred_minor << MdpMinorShift);
6298 			unit = MINOR(dev) >> MdpMinorShift;
6299 		} else {
6300 			dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
6301 			unit = MINOR(dev);
6302 		}
6303 		if (rdev0->preferred_minor != unit) {
6304 			pr_warn("md: unit number in %s is bad: %d\n",
6305 				bdevname(rdev0->bdev, b), rdev0->preferred_minor);
6306 			break;
6307 		}
6308 
6309 		md_probe(dev, NULL, NULL);
6310 		mddev = mddev_find(dev);
6311 		if (!mddev || !mddev->gendisk) {
6312 			if (mddev)
6313 				mddev_put(mddev);
6314 			break;
6315 		}
6316 		if (mddev_lock(mddev))
6317 			pr_warn("md: %s locked, cannot run\n", mdname(mddev));
6318 		else if (mddev->raid_disks || mddev->major_version
6319 			 || !list_empty(&mddev->disks)) {
6320 			pr_warn("md: %s already running, cannot run %s\n",
6321 				mdname(mddev), bdevname(rdev0->bdev,b));
6322 			mddev_unlock(mddev);
6323 		} else {
6324 			pr_debug("md: created %s\n", mdname(mddev));
6325 			mddev->persistent = 1;
6326 			rdev_for_each_list(rdev, tmp, &candidates) {
6327 				list_del_init(&rdev->same_set);
6328 				if (bind_rdev_to_array(rdev, mddev))
6329 					export_rdev(rdev);
6330 			}
6331 			autorun_array(mddev);
6332 			mddev_unlock(mddev);
6333 		}
6334 		/* on success, candidates will be empty, on error
6335 		 * it won't...
6336 		 */
6337 		rdev_for_each_list(rdev, tmp, &candidates) {
6338 			list_del_init(&rdev->same_set);
6339 			export_rdev(rdev);
6340 		}
6341 		mddev_put(mddev);
6342 	}
6343 	pr_info("md: ... autorun DONE.\n");
6344 }
6345 #endif /* !MODULE */
6346 
6347 static int get_version(void __user *arg)
6348 {
6349 	mdu_version_t ver;
6350 
6351 	ver.major = MD_MAJOR_VERSION;
6352 	ver.minor = MD_MINOR_VERSION;
6353 	ver.patchlevel = MD_PATCHLEVEL_VERSION;
6354 
6355 	if (copy_to_user(arg, &ver, sizeof(ver)))
6356 		return -EFAULT;
6357 
6358 	return 0;
6359 }
6360 
6361 static int get_array_info(struct mddev *mddev, void __user *arg)
6362 {
6363 	mdu_array_info_t info;
6364 	int nr,working,insync,failed,spare;
6365 	struct md_rdev *rdev;
6366 
6367 	nr = working = insync = failed = spare = 0;
6368 	rcu_read_lock();
6369 	rdev_for_each_rcu(rdev, mddev) {
6370 		nr++;
6371 		if (test_bit(Faulty, &rdev->flags))
6372 			failed++;
6373 		else {
6374 			working++;
6375 			if (test_bit(In_sync, &rdev->flags))
6376 				insync++;
6377 			else if (test_bit(Journal, &rdev->flags))
6378 				/* TODO: add journal count to md_u.h */
6379 				;
6380 			else
6381 				spare++;
6382 		}
6383 	}
6384 	rcu_read_unlock();
6385 
6386 	info.major_version = mddev->major_version;
6387 	info.minor_version = mddev->minor_version;
6388 	info.patch_version = MD_PATCHLEVEL_VERSION;
6389 	info.ctime         = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
6390 	info.level         = mddev->level;
6391 	info.size          = mddev->dev_sectors / 2;
6392 	if (info.size != mddev->dev_sectors / 2) /* overflow */
6393 		info.size = -1;
6394 	info.nr_disks      = nr;
6395 	info.raid_disks    = mddev->raid_disks;
6396 	info.md_minor      = mddev->md_minor;
6397 	info.not_persistent= !mddev->persistent;
6398 
6399 	info.utime         = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
6400 	info.state         = 0;
6401 	if (mddev->in_sync)
6402 		info.state = (1<<MD_SB_CLEAN);
6403 	if (mddev->bitmap && mddev->bitmap_info.offset)
6404 		info.state |= (1<<MD_SB_BITMAP_PRESENT);
6405 	if (mddev_is_clustered(mddev))
6406 		info.state |= (1<<MD_SB_CLUSTERED);
6407 	info.active_disks  = insync;
6408 	info.working_disks = working;
6409 	info.failed_disks  = failed;
6410 	info.spare_disks   = spare;
6411 
6412 	info.layout        = mddev->layout;
6413 	info.chunk_size    = mddev->chunk_sectors << 9;
6414 
6415 	if (copy_to_user(arg, &info, sizeof(info)))
6416 		return -EFAULT;
6417 
6418 	return 0;
6419 }
6420 
6421 static int get_bitmap_file(struct mddev *mddev, void __user * arg)
6422 {
6423 	mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
6424 	char *ptr;
6425 	int err;
6426 
6427 	file = kzalloc(sizeof(*file), GFP_NOIO);
6428 	if (!file)
6429 		return -ENOMEM;
6430 
6431 	err = 0;
6432 	spin_lock(&mddev->lock);
6433 	/* bitmap enabled */
6434 	if (mddev->bitmap_info.file) {
6435 		ptr = file_path(mddev->bitmap_info.file, file->pathname,
6436 				sizeof(file->pathname));
6437 		if (IS_ERR(ptr))
6438 			err = PTR_ERR(ptr);
6439 		else
6440 			memmove(file->pathname, ptr,
6441 				sizeof(file->pathname)-(ptr-file->pathname));
6442 	}
6443 	spin_unlock(&mddev->lock);
6444 
6445 	if (err == 0 &&
6446 	    copy_to_user(arg, file, sizeof(*file)))
6447 		err = -EFAULT;
6448 
6449 	kfree(file);
6450 	return err;
6451 }
6452 
6453 static int get_disk_info(struct mddev *mddev, void __user * arg)
6454 {
6455 	mdu_disk_info_t info;
6456 	struct md_rdev *rdev;
6457 
6458 	if (copy_from_user(&info, arg, sizeof(info)))
6459 		return -EFAULT;
6460 
6461 	rcu_read_lock();
6462 	rdev = md_find_rdev_nr_rcu(mddev, info.number);
6463 	if (rdev) {
6464 		info.major = MAJOR(rdev->bdev->bd_dev);
6465 		info.minor = MINOR(rdev->bdev->bd_dev);
6466 		info.raid_disk = rdev->raid_disk;
6467 		info.state = 0;
6468 		if (test_bit(Faulty, &rdev->flags))
6469 			info.state |= (1<<MD_DISK_FAULTY);
6470 		else if (test_bit(In_sync, &rdev->flags)) {
6471 			info.state |= (1<<MD_DISK_ACTIVE);
6472 			info.state |= (1<<MD_DISK_SYNC);
6473 		}
6474 		if (test_bit(Journal, &rdev->flags))
6475 			info.state |= (1<<MD_DISK_JOURNAL);
6476 		if (test_bit(WriteMostly, &rdev->flags))
6477 			info.state |= (1<<MD_DISK_WRITEMOSTLY);
6478 		if (test_bit(FailFast, &rdev->flags))
6479 			info.state |= (1<<MD_DISK_FAILFAST);
6480 	} else {
6481 		info.major = info.minor = 0;
6482 		info.raid_disk = -1;
6483 		info.state = (1<<MD_DISK_REMOVED);
6484 	}
6485 	rcu_read_unlock();
6486 
6487 	if (copy_to_user(arg, &info, sizeof(info)))
6488 		return -EFAULT;
6489 
6490 	return 0;
6491 }
6492 
6493 static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
6494 {
6495 	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
6496 	struct md_rdev *rdev;
6497 	dev_t dev = MKDEV(info->major,info->minor);
6498 
6499 	if (mddev_is_clustered(mddev) &&
6500 		!(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) {
6501 		pr_warn("%s: Cannot add to clustered mddev.\n",
6502 			mdname(mddev));
6503 		return -EINVAL;
6504 	}
6505 
6506 	if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
6507 		return -EOVERFLOW;
6508 
6509 	if (!mddev->raid_disks) {
6510 		int err;
6511 		/* expecting a device which has a superblock */
6512 		rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
6513 		if (IS_ERR(rdev)) {
6514 			pr_warn("md: md_import_device returned %ld\n",
6515 				PTR_ERR(rdev));
6516 			return PTR_ERR(rdev);
6517 		}
6518 		if (!list_empty(&mddev->disks)) {
6519 			struct md_rdev *rdev0
6520 				= list_entry(mddev->disks.next,
6521 					     struct md_rdev, same_set);
6522 			err = super_types[mddev->major_version]
6523 				.load_super(rdev, rdev0, mddev->minor_version);
6524 			if (err < 0) {
6525 				pr_warn("md: %s has different UUID to %s\n",
6526 					bdevname(rdev->bdev,b),
6527 					bdevname(rdev0->bdev,b2));
6528 				export_rdev(rdev);
6529 				return -EINVAL;
6530 			}
6531 		}
6532 		err = bind_rdev_to_array(rdev, mddev);
6533 		if (err)
6534 			export_rdev(rdev);
6535 		return err;
6536 	}
6537 
6538 	/*
6539 	 * add_new_disk can be used once the array is assembled
6540 	 * to add "hot spares".  They must already have a superblock
6541 	 * written
6542 	 */
6543 	if (mddev->pers) {
6544 		int err;
6545 		if (!mddev->pers->hot_add_disk) {
6546 			pr_warn("%s: personality does not support diskops!\n",
6547 				mdname(mddev));
6548 			return -EINVAL;
6549 		}
6550 		if (mddev->persistent)
6551 			rdev = md_import_device(dev, mddev->major_version,
6552 						mddev->minor_version);
6553 		else
6554 			rdev = md_import_device(dev, -1, -1);
6555 		if (IS_ERR(rdev)) {
6556 			pr_warn("md: md_import_device returned %ld\n",
6557 				PTR_ERR(rdev));
6558 			return PTR_ERR(rdev);
6559 		}
6560 		/* set saved_raid_disk if appropriate */
6561 		if (!mddev->persistent) {
6562 			if (info->state & (1<<MD_DISK_SYNC)  &&
6563 			    info->raid_disk < mddev->raid_disks) {
6564 				rdev->raid_disk = info->raid_disk;
6565 				set_bit(In_sync, &rdev->flags);
6566 				clear_bit(Bitmap_sync, &rdev->flags);
6567 			} else
6568 				rdev->raid_disk = -1;
6569 			rdev->saved_raid_disk = rdev->raid_disk;
6570 		} else
6571 			super_types[mddev->major_version].
6572 				validate_super(mddev, rdev);
6573 		if ((info->state & (1<<MD_DISK_SYNC)) &&
6574 		     rdev->raid_disk != info->raid_disk) {
6575 			/* This was a hot-add request, but events doesn't
6576 			 * match, so reject it.
6577 			 */
6578 			export_rdev(rdev);
6579 			return -EINVAL;
6580 		}
6581 
6582 		clear_bit(In_sync, &rdev->flags); /* just to be sure */
6583 		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
6584 			set_bit(WriteMostly, &rdev->flags);
6585 		else
6586 			clear_bit(WriteMostly, &rdev->flags);
6587 		if (info->state & (1<<MD_DISK_FAILFAST))
6588 			set_bit(FailFast, &rdev->flags);
6589 		else
6590 			clear_bit(FailFast, &rdev->flags);
6591 
6592 		if (info->state & (1<<MD_DISK_JOURNAL)) {
6593 			struct md_rdev *rdev2;
6594 			bool has_journal = false;
6595 
6596 			/* make sure no existing journal disk */
6597 			rdev_for_each(rdev2, mddev) {
6598 				if (test_bit(Journal, &rdev2->flags)) {
6599 					has_journal = true;
6600 					break;
6601 				}
6602 			}
6603 			if (has_journal || mddev->bitmap) {
6604 				export_rdev(rdev);
6605 				return -EBUSY;
6606 			}
6607 			set_bit(Journal, &rdev->flags);
6608 		}
6609 		/*
6610 		 * check whether the device shows up in other nodes
6611 		 */
6612 		if (mddev_is_clustered(mddev)) {
6613 			if (info->state & (1 << MD_DISK_CANDIDATE))
6614 				set_bit(Candidate, &rdev->flags);
6615 			else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
6616 				/* --add initiated by this node */
6617 				err = md_cluster_ops->add_new_disk(mddev, rdev);
6618 				if (err) {
6619 					export_rdev(rdev);
6620 					return err;
6621 				}
6622 			}
6623 		}
6624 
6625 		rdev->raid_disk = -1;
6626 		err = bind_rdev_to_array(rdev, mddev);
6627 
6628 		if (err)
6629 			export_rdev(rdev);
6630 
6631 		if (mddev_is_clustered(mddev)) {
6632 			if (info->state & (1 << MD_DISK_CANDIDATE)) {
6633 				if (!err) {
6634 					err = md_cluster_ops->new_disk_ack(mddev,
6635 						err == 0);
6636 					if (err)
6637 						md_kick_rdev_from_array(rdev);
6638 				}
6639 			} else {
6640 				if (err)
6641 					md_cluster_ops->add_new_disk_cancel(mddev);
6642 				else
6643 					err = add_bound_rdev(rdev);
6644 			}
6645 
6646 		} else if (!err)
6647 			err = add_bound_rdev(rdev);
6648 
6649 		return err;
6650 	}
6651 
6652 	/* otherwise, add_new_disk is only allowed
6653 	 * for major_version==0 superblocks
6654 	 */
6655 	if (mddev->major_version != 0) {
6656 		pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev));
6657 		return -EINVAL;
6658 	}
6659 
6660 	if (!(info->state & (1<<MD_DISK_FAULTY))) {
6661 		int err;
6662 		rdev = md_import_device(dev, -1, 0);
6663 		if (IS_ERR(rdev)) {
6664 			pr_warn("md: error, md_import_device() returned %ld\n",
6665 				PTR_ERR(rdev));
6666 			return PTR_ERR(rdev);
6667 		}
6668 		rdev->desc_nr = info->number;
6669 		if (info->raid_disk < mddev->raid_disks)
6670 			rdev->raid_disk = info->raid_disk;
6671 		else
6672 			rdev->raid_disk = -1;
6673 
6674 		if (rdev->raid_disk < mddev->raid_disks)
6675 			if (info->state & (1<<MD_DISK_SYNC))
6676 				set_bit(In_sync, &rdev->flags);
6677 
6678 		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
6679 			set_bit(WriteMostly, &rdev->flags);
6680 		if (info->state & (1<<MD_DISK_FAILFAST))
6681 			set_bit(FailFast, &rdev->flags);
6682 
6683 		if (!mddev->persistent) {
6684 			pr_debug("md: nonpersistent superblock ...\n");
6685 			rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
6686 		} else
6687 			rdev->sb_start = calc_dev_sboffset(rdev);
6688 		rdev->sectors = rdev->sb_start;
6689 
6690 		err = bind_rdev_to_array(rdev, mddev);
6691 		if (err) {
6692 			export_rdev(rdev);
6693 			return err;
6694 		}
6695 	}
6696 
6697 	return 0;
6698 }
6699 
6700 static int hot_remove_disk(struct mddev *mddev, dev_t dev)
6701 {
6702 	char b[BDEVNAME_SIZE];
6703 	struct md_rdev *rdev;
6704 
6705 	if (!mddev->pers)
6706 		return -ENODEV;
6707 
6708 	rdev = find_rdev(mddev, dev);
6709 	if (!rdev)
6710 		return -ENXIO;
6711 
6712 	if (rdev->raid_disk < 0)
6713 		goto kick_rdev;
6714 
6715 	clear_bit(Blocked, &rdev->flags);
6716 	remove_and_add_spares(mddev, rdev);
6717 
6718 	if (rdev->raid_disk >= 0)
6719 		goto busy;
6720 
6721 kick_rdev:
6722 	if (mddev_is_clustered(mddev))
6723 		md_cluster_ops->remove_disk(mddev, rdev);
6724 
6725 	md_kick_rdev_from_array(rdev);
6726 	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6727 	if (mddev->thread)
6728 		md_wakeup_thread(mddev->thread);
6729 	else
6730 		md_update_sb(mddev, 1);
6731 	md_new_event(mddev);
6732 
6733 	return 0;
6734 busy:
6735 	pr_debug("md: cannot remove active disk %s from %s ...\n",
6736 		 bdevname(rdev->bdev,b), mdname(mddev));
6737 	return -EBUSY;
6738 }
6739 
6740 static int hot_add_disk(struct mddev *mddev, dev_t dev)
6741 {
6742 	char b[BDEVNAME_SIZE];
6743 	int err;
6744 	struct md_rdev *rdev;
6745 
6746 	if (!mddev->pers)
6747 		return -ENODEV;
6748 
6749 	if (mddev->major_version != 0) {
6750 		pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n",
6751 			mdname(mddev));
6752 		return -EINVAL;
6753 	}
6754 	if (!mddev->pers->hot_add_disk) {
6755 		pr_warn("%s: personality does not support diskops!\n",
6756 			mdname(mddev));
6757 		return -EINVAL;
6758 	}
6759 
6760 	rdev = md_import_device(dev, -1, 0);
6761 	if (IS_ERR(rdev)) {
6762 		pr_warn("md: error, md_import_device() returned %ld\n",
6763 			PTR_ERR(rdev));
6764 		return -EINVAL;
6765 	}
6766 
6767 	if (mddev->persistent)
6768 		rdev->sb_start = calc_dev_sboffset(rdev);
6769 	else
6770 		rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
6771 
6772 	rdev->sectors = rdev->sb_start;
6773 
6774 	if (test_bit(Faulty, &rdev->flags)) {
6775 		pr_warn("md: can not hot-add faulty %s disk to %s!\n",
6776 			bdevname(rdev->bdev,b), mdname(mddev));
6777 		err = -EINVAL;
6778 		goto abort_export;
6779 	}
6780 
6781 	clear_bit(In_sync, &rdev->flags);
6782 	rdev->desc_nr = -1;
6783 	rdev->saved_raid_disk = -1;
6784 	err = bind_rdev_to_array(rdev, mddev);
6785 	if (err)
6786 		goto abort_export;
6787 
6788 	/*
6789 	 * The rest should better be atomic, we can have disk failures
6790 	 * noticed in interrupt contexts ...
6791 	 */
6792 
6793 	rdev->raid_disk = -1;
6794 
6795 	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6796 	if (!mddev->thread)
6797 		md_update_sb(mddev, 1);
6798 	/*
6799 	 * Kick recovery, maybe this spare has to be added to the
6800 	 * array immediately.
6801 	 */
6802 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6803 	md_wakeup_thread(mddev->thread);
6804 	md_new_event(mddev);
6805 	return 0;
6806 
6807 abort_export:
6808 	export_rdev(rdev);
6809 	return err;
6810 }
6811 
6812 static int set_bitmap_file(struct mddev *mddev, int fd)
6813 {
6814 	int err = 0;
6815 
6816 	if (mddev->pers) {
6817 		if (!mddev->pers->quiesce || !mddev->thread)
6818 			return -EBUSY;
6819 		if (mddev->recovery || mddev->sync_thread)
6820 			return -EBUSY;
6821 		/* we should be able to change the bitmap.. */
6822 	}
6823 
6824 	if (fd >= 0) {
6825 		struct inode *inode;
6826 		struct file *f;
6827 
6828 		if (mddev->bitmap || mddev->bitmap_info.file)
6829 			return -EEXIST; /* cannot add when bitmap is present */
6830 		f = fget(fd);
6831 
6832 		if (f == NULL) {
6833 			pr_warn("%s: error: failed to get bitmap file\n",
6834 				mdname(mddev));
6835 			return -EBADF;
6836 		}
6837 
6838 		inode = f->f_mapping->host;
6839 		if (!S_ISREG(inode->i_mode)) {
6840 			pr_warn("%s: error: bitmap file must be a regular file\n",
6841 				mdname(mddev));
6842 			err = -EBADF;
6843 		} else if (!(f->f_mode & FMODE_WRITE)) {
6844 			pr_warn("%s: error: bitmap file must open for write\n",
6845 				mdname(mddev));
6846 			err = -EBADF;
6847 		} else if (atomic_read(&inode->i_writecount) != 1) {
6848 			pr_warn("%s: error: bitmap file is already in use\n",
6849 				mdname(mddev));
6850 			err = -EBUSY;
6851 		}
6852 		if (err) {
6853 			fput(f);
6854 			return err;
6855 		}
6856 		mddev->bitmap_info.file = f;
6857 		mddev->bitmap_info.offset = 0; /* file overrides offset */
6858 	} else if (mddev->bitmap == NULL)
6859 		return -ENOENT; /* cannot remove what isn't there */
6860 	err = 0;
6861 	if (mddev->pers) {
6862 		if (fd >= 0) {
6863 			struct bitmap *bitmap;
6864 
6865 			bitmap = md_bitmap_create(mddev, -1);
6866 			mddev_suspend(mddev);
6867 			if (!IS_ERR(bitmap)) {
6868 				mddev->bitmap = bitmap;
6869 				err = md_bitmap_load(mddev);
6870 			} else
6871 				err = PTR_ERR(bitmap);
6872 			if (err) {
6873 				md_bitmap_destroy(mddev);
6874 				fd = -1;
6875 			}
6876 			mddev_resume(mddev);
6877 		} else if (fd < 0) {
6878 			mddev_suspend(mddev);
6879 			md_bitmap_destroy(mddev);
6880 			mddev_resume(mddev);
6881 		}
6882 	}
6883 	if (fd < 0) {
6884 		struct file *f = mddev->bitmap_info.file;
6885 		if (f) {
6886 			spin_lock(&mddev->lock);
6887 			mddev->bitmap_info.file = NULL;
6888 			spin_unlock(&mddev->lock);
6889 			fput(f);
6890 		}
6891 	}
6892 
6893 	return err;
6894 }
6895 
6896 /*
6897  * set_array_info is used two different ways
6898  * The original usage is when creating a new array.
6899  * In this usage, raid_disks is > 0 and it together with
6900  *  level, size, not_persistent,layout,chunksize determine the
6901  *  shape of the array.
6902  *  This will always create an array with a type-0.90.0 superblock.
6903  * The newer usage is when assembling an array.
6904  *  In this case raid_disks will be 0, and the major_version field is
6905  *  use to determine which style super-blocks are to be found on the devices.
6906  *  The minor and patch _version numbers are also kept incase the
6907  *  super_block handler wishes to interpret them.
6908  */
6909 static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
6910 {
6911 
6912 	if (info->raid_disks == 0) {
6913 		/* just setting version number for superblock loading */
6914 		if (info->major_version < 0 ||
6915 		    info->major_version >= ARRAY_SIZE(super_types) ||
6916 		    super_types[info->major_version].name == NULL) {
6917 			/* maybe try to auto-load a module? */
6918 			pr_warn("md: superblock version %d not known\n",
6919 				info->major_version);
6920 			return -EINVAL;
6921 		}
6922 		mddev->major_version = info->major_version;
6923 		mddev->minor_version = info->minor_version;
6924 		mddev->patch_version = info->patch_version;
6925 		mddev->persistent = !info->not_persistent;
6926 		/* ensure mddev_put doesn't delete this now that there
6927 		 * is some minimal configuration.
6928 		 */
6929 		mddev->ctime         = ktime_get_real_seconds();
6930 		return 0;
6931 	}
6932 	mddev->major_version = MD_MAJOR_VERSION;
6933 	mddev->minor_version = MD_MINOR_VERSION;
6934 	mddev->patch_version = MD_PATCHLEVEL_VERSION;
6935 	mddev->ctime         = ktime_get_real_seconds();
6936 
6937 	mddev->level         = info->level;
6938 	mddev->clevel[0]     = 0;
6939 	mddev->dev_sectors   = 2 * (sector_t)info->size;
6940 	mddev->raid_disks    = info->raid_disks;
6941 	/* don't set md_minor, it is determined by which /dev/md* was
6942 	 * openned
6943 	 */
6944 	if (info->state & (1<<MD_SB_CLEAN))
6945 		mddev->recovery_cp = MaxSector;
6946 	else
6947 		mddev->recovery_cp = 0;
6948 	mddev->persistent    = ! info->not_persistent;
6949 	mddev->external	     = 0;
6950 
6951 	mddev->layout        = info->layout;
6952 	if (mddev->level == 0)
6953 		/* Cannot trust RAID0 layout info here */
6954 		mddev->layout = -1;
6955 	mddev->chunk_sectors = info->chunk_size >> 9;
6956 
6957 	if (mddev->persistent) {
6958 		mddev->max_disks = MD_SB_DISKS;
6959 		mddev->flags = 0;
6960 		mddev->sb_flags = 0;
6961 	}
6962 	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6963 
6964 	mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
6965 	mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
6966 	mddev->bitmap_info.offset = 0;
6967 
6968 	mddev->reshape_position = MaxSector;
6969 
6970 	/*
6971 	 * Generate a 128 bit UUID
6972 	 */
6973 	get_random_bytes(mddev->uuid, 16);
6974 
6975 	mddev->new_level = mddev->level;
6976 	mddev->new_chunk_sectors = mddev->chunk_sectors;
6977 	mddev->new_layout = mddev->layout;
6978 	mddev->delta_disks = 0;
6979 	mddev->reshape_backwards = 0;
6980 
6981 	return 0;
6982 }
6983 
6984 void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
6985 {
6986 	lockdep_assert_held(&mddev->reconfig_mutex);
6987 
6988 	if (mddev->external_size)
6989 		return;
6990 
6991 	mddev->array_sectors = array_sectors;
6992 }
6993 EXPORT_SYMBOL(md_set_array_sectors);
6994 
6995 static int update_size(struct mddev *mddev, sector_t num_sectors)
6996 {
6997 	struct md_rdev *rdev;
6998 	int rv;
6999 	int fit = (num_sectors == 0);
7000 	sector_t old_dev_sectors = mddev->dev_sectors;
7001 
7002 	if (mddev->pers->resize == NULL)
7003 		return -EINVAL;
7004 	/* The "num_sectors" is the number of sectors of each device that
7005 	 * is used.  This can only make sense for arrays with redundancy.
7006 	 * linear and raid0 always use whatever space is available. We can only
7007 	 * consider changing this number if no resync or reconstruction is
7008 	 * happening, and if the new size is acceptable. It must fit before the
7009 	 * sb_start or, if that is <data_offset, it must fit before the size
7010 	 * of each device.  If num_sectors is zero, we find the largest size
7011 	 * that fits.
7012 	 */
7013 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
7014 	    mddev->sync_thread)
7015 		return -EBUSY;
7016 	if (mddev->ro)
7017 		return -EROFS;
7018 
7019 	rdev_for_each(rdev, mddev) {
7020 		sector_t avail = rdev->sectors;
7021 
7022 		if (fit && (num_sectors == 0 || num_sectors > avail))
7023 			num_sectors = avail;
7024 		if (avail < num_sectors)
7025 			return -ENOSPC;
7026 	}
7027 	rv = mddev->pers->resize(mddev, num_sectors);
7028 	if (!rv) {
7029 		if (mddev_is_clustered(mddev))
7030 			md_cluster_ops->update_size(mddev, old_dev_sectors);
7031 		else if (mddev->queue) {
7032 			set_capacity(mddev->gendisk, mddev->array_sectors);
7033 			revalidate_disk(mddev->gendisk);
7034 		}
7035 	}
7036 	return rv;
7037 }
7038 
7039 static int update_raid_disks(struct mddev *mddev, int raid_disks)
7040 {
7041 	int rv;
7042 	struct md_rdev *rdev;
7043 	/* change the number of raid disks */
7044 	if (mddev->pers->check_reshape == NULL)
7045 		return -EINVAL;
7046 	if (mddev->ro)
7047 		return -EROFS;
7048 	if (raid_disks <= 0 ||
7049 	    (mddev->max_disks && raid_disks >= mddev->max_disks))
7050 		return -EINVAL;
7051 	if (mddev->sync_thread ||
7052 	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
7053 	    mddev->reshape_position != MaxSector)
7054 		return -EBUSY;
7055 
7056 	rdev_for_each(rdev, mddev) {
7057 		if (mddev->raid_disks < raid_disks &&
7058 		    rdev->data_offset < rdev->new_data_offset)
7059 			return -EINVAL;
7060 		if (mddev->raid_disks > raid_disks &&
7061 		    rdev->data_offset > rdev->new_data_offset)
7062 			return -EINVAL;
7063 	}
7064 
7065 	mddev->delta_disks = raid_disks - mddev->raid_disks;
7066 	if (mddev->delta_disks < 0)
7067 		mddev->reshape_backwards = 1;
7068 	else if (mddev->delta_disks > 0)
7069 		mddev->reshape_backwards = 0;
7070 
7071 	rv = mddev->pers->check_reshape(mddev);
7072 	if (rv < 0) {
7073 		mddev->delta_disks = 0;
7074 		mddev->reshape_backwards = 0;
7075 	}
7076 	return rv;
7077 }
7078 
7079 /*
7080  * update_array_info is used to change the configuration of an
7081  * on-line array.
7082  * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size
7083  * fields in the info are checked against the array.
7084  * Any differences that cannot be handled will cause an error.
7085  * Normally, only one change can be managed at a time.
7086  */
7087 static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
7088 {
7089 	int rv = 0;
7090 	int cnt = 0;
7091 	int state = 0;
7092 
7093 	/* calculate expected state,ignoring low bits */
7094 	if (mddev->bitmap && mddev->bitmap_info.offset)
7095 		state |= (1 << MD_SB_BITMAP_PRESENT);
7096 
7097 	if (mddev->major_version != info->major_version ||
7098 	    mddev->minor_version != info->minor_version ||
7099 /*	    mddev->patch_version != info->patch_version || */
7100 	    mddev->ctime         != info->ctime         ||
7101 	    mddev->level         != info->level         ||
7102 /*	    mddev->layout        != info->layout        || */
7103 	    mddev->persistent	 != !info->not_persistent ||
7104 	    mddev->chunk_sectors != info->chunk_size >> 9 ||
7105 	    /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
7106 	    ((state^info->state) & 0xfffffe00)
7107 		)
7108 		return -EINVAL;
7109 	/* Check there is only one change */
7110 	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
7111 		cnt++;
7112 	if (mddev->raid_disks != info->raid_disks)
7113 		cnt++;
7114 	if (mddev->layout != info->layout)
7115 		cnt++;
7116 	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
7117 		cnt++;
7118 	if (cnt == 0)
7119 		return 0;
7120 	if (cnt > 1)
7121 		return -EINVAL;
7122 
7123 	if (mddev->layout != info->layout) {
7124 		/* Change layout
7125 		 * we don't need to do anything at the md level, the
7126 		 * personality will take care of it all.
7127 		 */
7128 		if (mddev->pers->check_reshape == NULL)
7129 			return -EINVAL;
7130 		else {
7131 			mddev->new_layout = info->layout;
7132 			rv = mddev->pers->check_reshape(mddev);
7133 			if (rv)
7134 				mddev->new_layout = mddev->layout;
7135 			return rv;
7136 		}
7137 	}
7138 	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
7139 		rv = update_size(mddev, (sector_t)info->size * 2);
7140 
7141 	if (mddev->raid_disks    != info->raid_disks)
7142 		rv = update_raid_disks(mddev, info->raid_disks);
7143 
7144 	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
7145 		if (mddev->pers->quiesce == NULL || mddev->thread == NULL) {
7146 			rv = -EINVAL;
7147 			goto err;
7148 		}
7149 		if (mddev->recovery || mddev->sync_thread) {
7150 			rv = -EBUSY;
7151 			goto err;
7152 		}
7153 		if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
7154 			struct bitmap *bitmap;
7155 			/* add the bitmap */
7156 			if (mddev->bitmap) {
7157 				rv = -EEXIST;
7158 				goto err;
7159 			}
7160 			if (mddev->bitmap_info.default_offset == 0) {
7161 				rv = -EINVAL;
7162 				goto err;
7163 			}
7164 			mddev->bitmap_info.offset =
7165 				mddev->bitmap_info.default_offset;
7166 			mddev->bitmap_info.space =
7167 				mddev->bitmap_info.default_space;
7168 			bitmap = md_bitmap_create(mddev, -1);
7169 			mddev_suspend(mddev);
7170 			if (!IS_ERR(bitmap)) {
7171 				mddev->bitmap = bitmap;
7172 				rv = md_bitmap_load(mddev);
7173 			} else
7174 				rv = PTR_ERR(bitmap);
7175 			if (rv)
7176 				md_bitmap_destroy(mddev);
7177 			mddev_resume(mddev);
7178 		} else {
7179 			/* remove the bitmap */
7180 			if (!mddev->bitmap) {
7181 				rv = -ENOENT;
7182 				goto err;
7183 			}
7184 			if (mddev->bitmap->storage.file) {
7185 				rv = -EINVAL;
7186 				goto err;
7187 			}
7188 			if (mddev->bitmap_info.nodes) {
7189 				/* hold PW on all the bitmap lock */
7190 				if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) {
7191 					pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n");
7192 					rv = -EPERM;
7193 					md_cluster_ops->unlock_all_bitmaps(mddev);
7194 					goto err;
7195 				}
7196 
7197 				mddev->bitmap_info.nodes = 0;
7198 				md_cluster_ops->leave(mddev);
7199 			}
7200 			mddev_suspend(mddev);
7201 			md_bitmap_destroy(mddev);
7202 			mddev_resume(mddev);
7203 			mddev->bitmap_info.offset = 0;
7204 		}
7205 	}
7206 	md_update_sb(mddev, 1);
7207 	return rv;
7208 err:
7209 	return rv;
7210 }
7211 
7212 static int set_disk_faulty(struct mddev *mddev, dev_t dev)
7213 {
7214 	struct md_rdev *rdev;
7215 	int err = 0;
7216 
7217 	if (mddev->pers == NULL)
7218 		return -ENODEV;
7219 
7220 	rcu_read_lock();
7221 	rdev = md_find_rdev_rcu(mddev, dev);
7222 	if (!rdev)
7223 		err =  -ENODEV;
7224 	else {
7225 		md_error(mddev, rdev);
7226 		if (!test_bit(Faulty, &rdev->flags))
7227 			err = -EBUSY;
7228 	}
7229 	rcu_read_unlock();
7230 	return err;
7231 }
7232 
7233 /*
7234  * We have a problem here : there is no easy way to give a CHS
7235  * virtual geometry. We currently pretend that we have a 2 heads
7236  * 4 sectors (with a BIG number of cylinders...). This drives
7237  * dosfs just mad... ;-)
7238  */
7239 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
7240 {
7241 	struct mddev *mddev = bdev->bd_disk->private_data;
7242 
7243 	geo->heads = 2;
7244 	geo->sectors = 4;
7245 	geo->cylinders = mddev->array_sectors / 8;
7246 	return 0;
7247 }
7248 
7249 static inline bool md_ioctl_valid(unsigned int cmd)
7250 {
7251 	switch (cmd) {
7252 	case ADD_NEW_DISK:
7253 	case BLKROSET:
7254 	case GET_ARRAY_INFO:
7255 	case GET_BITMAP_FILE:
7256 	case GET_DISK_INFO:
7257 	case HOT_ADD_DISK:
7258 	case HOT_REMOVE_DISK:
7259 	case RAID_AUTORUN:
7260 	case RAID_VERSION:
7261 	case RESTART_ARRAY_RW:
7262 	case RUN_ARRAY:
7263 	case SET_ARRAY_INFO:
7264 	case SET_BITMAP_FILE:
7265 	case SET_DISK_FAULTY:
7266 	case STOP_ARRAY:
7267 	case STOP_ARRAY_RO:
7268 	case CLUSTERED_DISK_NACK:
7269 		return true;
7270 	default:
7271 		return false;
7272 	}
7273 }
7274 
7275 static int md_ioctl(struct block_device *bdev, fmode_t mode,
7276 			unsigned int cmd, unsigned long arg)
7277 {
7278 	int err = 0;
7279 	void __user *argp = (void __user *)arg;
7280 	struct mddev *mddev = NULL;
7281 	int ro;
7282 	bool did_set_md_closing = false;
7283 
7284 	if (!md_ioctl_valid(cmd))
7285 		return -ENOTTY;
7286 
7287 	switch (cmd) {
7288 	case RAID_VERSION:
7289 	case GET_ARRAY_INFO:
7290 	case GET_DISK_INFO:
7291 		break;
7292 	default:
7293 		if (!capable(CAP_SYS_ADMIN))
7294 			return -EACCES;
7295 	}
7296 
7297 	/*
7298 	 * Commands dealing with the RAID driver but not any
7299 	 * particular array:
7300 	 */
7301 	switch (cmd) {
7302 	case RAID_VERSION:
7303 		err = get_version(argp);
7304 		goto out;
7305 
7306 #ifndef MODULE
7307 	case RAID_AUTORUN:
7308 		err = 0;
7309 		autostart_arrays(arg);
7310 		goto out;
7311 #endif
7312 	default:;
7313 	}
7314 
7315 	/*
7316 	 * Commands creating/starting a new array:
7317 	 */
7318 
7319 	mddev = bdev->bd_disk->private_data;
7320 
7321 	if (!mddev) {
7322 		BUG();
7323 		goto out;
7324 	}
7325 
7326 	/* Some actions do not requires the mutex */
7327 	switch (cmd) {
7328 	case GET_ARRAY_INFO:
7329 		if (!mddev->raid_disks && !mddev->external)
7330 			err = -ENODEV;
7331 		else
7332 			err = get_array_info(mddev, argp);
7333 		goto out;
7334 
7335 	case GET_DISK_INFO:
7336 		if (!mddev->raid_disks && !mddev->external)
7337 			err = -ENODEV;
7338 		else
7339 			err = get_disk_info(mddev, argp);
7340 		goto out;
7341 
7342 	case SET_DISK_FAULTY:
7343 		err = set_disk_faulty(mddev, new_decode_dev(arg));
7344 		goto out;
7345 
7346 	case GET_BITMAP_FILE:
7347 		err = get_bitmap_file(mddev, argp);
7348 		goto out;
7349 
7350 	}
7351 
7352 	if (cmd == ADD_NEW_DISK)
7353 		/* need to ensure md_delayed_delete() has completed */
7354 		flush_workqueue(md_misc_wq);
7355 
7356 	if (cmd == HOT_REMOVE_DISK)
7357 		/* need to ensure recovery thread has run */
7358 		wait_event_interruptible_timeout(mddev->sb_wait,
7359 						 !test_bit(MD_RECOVERY_NEEDED,
7360 							   &mddev->recovery),
7361 						 msecs_to_jiffies(5000));
7362 	if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
7363 		/* Need to flush page cache, and ensure no-one else opens
7364 		 * and writes
7365 		 */
7366 		mutex_lock(&mddev->open_mutex);
7367 		if (mddev->pers && atomic_read(&mddev->openers) > 1) {
7368 			mutex_unlock(&mddev->open_mutex);
7369 			err = -EBUSY;
7370 			goto out;
7371 		}
7372 		WARN_ON_ONCE(test_bit(MD_CLOSING, &mddev->flags));
7373 		set_bit(MD_CLOSING, &mddev->flags);
7374 		did_set_md_closing = true;
7375 		mutex_unlock(&mddev->open_mutex);
7376 		sync_blockdev(bdev);
7377 	}
7378 	err = mddev_lock(mddev);
7379 	if (err) {
7380 		pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n",
7381 			 err, cmd);
7382 		goto out;
7383 	}
7384 
7385 	if (cmd == SET_ARRAY_INFO) {
7386 		mdu_array_info_t info;
7387 		if (!arg)
7388 			memset(&info, 0, sizeof(info));
7389 		else if (copy_from_user(&info, argp, sizeof(info))) {
7390 			err = -EFAULT;
7391 			goto unlock;
7392 		}
7393 		if (mddev->pers) {
7394 			err = update_array_info(mddev, &info);
7395 			if (err) {
7396 				pr_warn("md: couldn't update array info. %d\n", err);
7397 				goto unlock;
7398 			}
7399 			goto unlock;
7400 		}
7401 		if (!list_empty(&mddev->disks)) {
7402 			pr_warn("md: array %s already has disks!\n", mdname(mddev));
7403 			err = -EBUSY;
7404 			goto unlock;
7405 		}
7406 		if (mddev->raid_disks) {
7407 			pr_warn("md: array %s already initialised!\n", mdname(mddev));
7408 			err = -EBUSY;
7409 			goto unlock;
7410 		}
7411 		err = set_array_info(mddev, &info);
7412 		if (err) {
7413 			pr_warn("md: couldn't set array info. %d\n", err);
7414 			goto unlock;
7415 		}
7416 		goto unlock;
7417 	}
7418 
7419 	/*
7420 	 * Commands querying/configuring an existing array:
7421 	 */
7422 	/* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
7423 	 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
7424 	if ((!mddev->raid_disks && !mddev->external)
7425 	    && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
7426 	    && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
7427 	    && cmd != GET_BITMAP_FILE) {
7428 		err = -ENODEV;
7429 		goto unlock;
7430 	}
7431 
7432 	/*
7433 	 * Commands even a read-only array can execute:
7434 	 */
7435 	switch (cmd) {
7436 	case RESTART_ARRAY_RW:
7437 		err = restart_array(mddev);
7438 		goto unlock;
7439 
7440 	case STOP_ARRAY:
7441 		err = do_md_stop(mddev, 0, bdev);
7442 		goto unlock;
7443 
7444 	case STOP_ARRAY_RO:
7445 		err = md_set_readonly(mddev, bdev);
7446 		goto unlock;
7447 
7448 	case HOT_REMOVE_DISK:
7449 		err = hot_remove_disk(mddev, new_decode_dev(arg));
7450 		goto unlock;
7451 
7452 	case ADD_NEW_DISK:
7453 		/* We can support ADD_NEW_DISK on read-only arrays
7454 		 * only if we are re-adding a preexisting device.
7455 		 * So require mddev->pers and MD_DISK_SYNC.
7456 		 */
7457 		if (mddev->pers) {
7458 			mdu_disk_info_t info;
7459 			if (copy_from_user(&info, argp, sizeof(info)))
7460 				err = -EFAULT;
7461 			else if (!(info.state & (1<<MD_DISK_SYNC)))
7462 				/* Need to clear read-only for this */
7463 				break;
7464 			else
7465 				err = add_new_disk(mddev, &info);
7466 			goto unlock;
7467 		}
7468 		break;
7469 
7470 	case BLKROSET:
7471 		if (get_user(ro, (int __user *)(arg))) {
7472 			err = -EFAULT;
7473 			goto unlock;
7474 		}
7475 		err = -EINVAL;
7476 
7477 		/* if the bdev is going readonly the value of mddev->ro
7478 		 * does not matter, no writes are coming
7479 		 */
7480 		if (ro)
7481 			goto unlock;
7482 
7483 		/* are we are already prepared for writes? */
7484 		if (mddev->ro != 1)
7485 			goto unlock;
7486 
7487 		/* transitioning to readauto need only happen for
7488 		 * arrays that call md_write_start
7489 		 */
7490 		if (mddev->pers) {
7491 			err = restart_array(mddev);
7492 			if (err == 0) {
7493 				mddev->ro = 2;
7494 				set_disk_ro(mddev->gendisk, 0);
7495 			}
7496 		}
7497 		goto unlock;
7498 	}
7499 
7500 	/*
7501 	 * The remaining ioctls are changing the state of the
7502 	 * superblock, so we do not allow them on read-only arrays.
7503 	 */
7504 	if (mddev->ro && mddev->pers) {
7505 		if (mddev->ro == 2) {
7506 			mddev->ro = 0;
7507 			sysfs_notify_dirent_safe(mddev->sysfs_state);
7508 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7509 			/* mddev_unlock will wake thread */
7510 			/* If a device failed while we were read-only, we
7511 			 * need to make sure the metadata is updated now.
7512 			 */
7513 			if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) {
7514 				mddev_unlock(mddev);
7515 				wait_event(mddev->sb_wait,
7516 					   !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) &&
7517 					   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
7518 				mddev_lock_nointr(mddev);
7519 			}
7520 		} else {
7521 			err = -EROFS;
7522 			goto unlock;
7523 		}
7524 	}
7525 
7526 	switch (cmd) {
7527 	case ADD_NEW_DISK:
7528 	{
7529 		mdu_disk_info_t info;
7530 		if (copy_from_user(&info, argp, sizeof(info)))
7531 			err = -EFAULT;
7532 		else
7533 			err = add_new_disk(mddev, &info);
7534 		goto unlock;
7535 	}
7536 
7537 	case CLUSTERED_DISK_NACK:
7538 		if (mddev_is_clustered(mddev))
7539 			md_cluster_ops->new_disk_ack(mddev, false);
7540 		else
7541 			err = -EINVAL;
7542 		goto unlock;
7543 
7544 	case HOT_ADD_DISK:
7545 		err = hot_add_disk(mddev, new_decode_dev(arg));
7546 		goto unlock;
7547 
7548 	case RUN_ARRAY:
7549 		err = do_md_run(mddev);
7550 		goto unlock;
7551 
7552 	case SET_BITMAP_FILE:
7553 		err = set_bitmap_file(mddev, (int)arg);
7554 		goto unlock;
7555 
7556 	default:
7557 		err = -EINVAL;
7558 		goto unlock;
7559 	}
7560 
7561 unlock:
7562 	if (mddev->hold_active == UNTIL_IOCTL &&
7563 	    err != -EINVAL)
7564 		mddev->hold_active = 0;
7565 	mddev_unlock(mddev);
7566 out:
7567 	if(did_set_md_closing)
7568 		clear_bit(MD_CLOSING, &mddev->flags);
7569 	return err;
7570 }
7571 #ifdef CONFIG_COMPAT
7572 static int md_compat_ioctl(struct block_device *bdev, fmode_t mode,
7573 		    unsigned int cmd, unsigned long arg)
7574 {
7575 	switch (cmd) {
7576 	case HOT_REMOVE_DISK:
7577 	case HOT_ADD_DISK:
7578 	case SET_DISK_FAULTY:
7579 	case SET_BITMAP_FILE:
7580 		/* These take in integer arg, do not convert */
7581 		break;
7582 	default:
7583 		arg = (unsigned long)compat_ptr(arg);
7584 		break;
7585 	}
7586 
7587 	return md_ioctl(bdev, mode, cmd, arg);
7588 }
7589 #endif /* CONFIG_COMPAT */
7590 
7591 static int md_open(struct block_device *bdev, fmode_t mode)
7592 {
7593 	/*
7594 	 * Succeed if we can lock the mddev, which confirms that
7595 	 * it isn't being stopped right now.
7596 	 */
7597 	struct mddev *mddev = mddev_find(bdev->bd_dev);
7598 	int err;
7599 
7600 	if (!mddev)
7601 		return -ENODEV;
7602 
7603 	if (mddev->gendisk != bdev->bd_disk) {
7604 		/* we are racing with mddev_put which is discarding this
7605 		 * bd_disk.
7606 		 */
7607 		mddev_put(mddev);
7608 		/* Wait until bdev->bd_disk is definitely gone */
7609 		flush_workqueue(md_misc_wq);
7610 		/* Then retry the open from the top */
7611 		return -ERESTARTSYS;
7612 	}
7613 	BUG_ON(mddev != bdev->bd_disk->private_data);
7614 
7615 	if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
7616 		goto out;
7617 
7618 	if (test_bit(MD_CLOSING, &mddev->flags)) {
7619 		mutex_unlock(&mddev->open_mutex);
7620 		err = -ENODEV;
7621 		goto out;
7622 	}
7623 
7624 	err = 0;
7625 	atomic_inc(&mddev->openers);
7626 	mutex_unlock(&mddev->open_mutex);
7627 
7628 	check_disk_change(bdev);
7629  out:
7630 	if (err)
7631 		mddev_put(mddev);
7632 	return err;
7633 }
7634 
7635 static void md_release(struct gendisk *disk, fmode_t mode)
7636 {
7637 	struct mddev *mddev = disk->private_data;
7638 
7639 	BUG_ON(!mddev);
7640 	atomic_dec(&mddev->openers);
7641 	mddev_put(mddev);
7642 }
7643 
7644 static int md_media_changed(struct gendisk *disk)
7645 {
7646 	struct mddev *mddev = disk->private_data;
7647 
7648 	return mddev->changed;
7649 }
7650 
7651 static int md_revalidate(struct gendisk *disk)
7652 {
7653 	struct mddev *mddev = disk->private_data;
7654 
7655 	mddev->changed = 0;
7656 	return 0;
7657 }
7658 static const struct block_device_operations md_fops =
7659 {
7660 	.owner		= THIS_MODULE,
7661 	.open		= md_open,
7662 	.release	= md_release,
7663 	.ioctl		= md_ioctl,
7664 #ifdef CONFIG_COMPAT
7665 	.compat_ioctl	= md_compat_ioctl,
7666 #endif
7667 	.getgeo		= md_getgeo,
7668 	.media_changed  = md_media_changed,
7669 	.revalidate_disk= md_revalidate,
7670 };
7671 
7672 static int md_thread(void *arg)
7673 {
7674 	struct md_thread *thread = arg;
7675 
7676 	/*
7677 	 * md_thread is a 'system-thread', it's priority should be very
7678 	 * high. We avoid resource deadlocks individually in each
7679 	 * raid personality. (RAID5 does preallocation) We also use RR and
7680 	 * the very same RT priority as kswapd, thus we will never get
7681 	 * into a priority inversion deadlock.
7682 	 *
7683 	 * we definitely have to have equal or higher priority than
7684 	 * bdflush, otherwise bdflush will deadlock if there are too
7685 	 * many dirty RAID5 blocks.
7686 	 */
7687 
7688 	allow_signal(SIGKILL);
7689 	while (!kthread_should_stop()) {
7690 
7691 		/* We need to wait INTERRUPTIBLE so that
7692 		 * we don't add to the load-average.
7693 		 * That means we need to be sure no signals are
7694 		 * pending
7695 		 */
7696 		if (signal_pending(current))
7697 			flush_signals(current);
7698 
7699 		wait_event_interruptible_timeout
7700 			(thread->wqueue,
7701 			 test_bit(THREAD_WAKEUP, &thread->flags)
7702 			 || kthread_should_stop() || kthread_should_park(),
7703 			 thread->timeout);
7704 
7705 		clear_bit(THREAD_WAKEUP, &thread->flags);
7706 		if (kthread_should_park())
7707 			kthread_parkme();
7708 		if (!kthread_should_stop())
7709 			thread->run(thread);
7710 	}
7711 
7712 	return 0;
7713 }
7714 
7715 void md_wakeup_thread(struct md_thread *thread)
7716 {
7717 	if (thread) {
7718 		pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
7719 		set_bit(THREAD_WAKEUP, &thread->flags);
7720 		wake_up(&thread->wqueue);
7721 	}
7722 }
7723 EXPORT_SYMBOL(md_wakeup_thread);
7724 
7725 struct md_thread *md_register_thread(void (*run) (struct md_thread *),
7726 		struct mddev *mddev, const char *name)
7727 {
7728 	struct md_thread *thread;
7729 
7730 	thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL);
7731 	if (!thread)
7732 		return NULL;
7733 
7734 	init_waitqueue_head(&thread->wqueue);
7735 
7736 	thread->run = run;
7737 	thread->mddev = mddev;
7738 	thread->timeout = MAX_SCHEDULE_TIMEOUT;
7739 	thread->tsk = kthread_run(md_thread, thread,
7740 				  "%s_%s",
7741 				  mdname(thread->mddev),
7742 				  name);
7743 	if (IS_ERR(thread->tsk)) {
7744 		kfree(thread);
7745 		return NULL;
7746 	}
7747 	return thread;
7748 }
7749 EXPORT_SYMBOL(md_register_thread);
7750 
7751 void md_unregister_thread(struct md_thread **threadp)
7752 {
7753 	struct md_thread *thread = *threadp;
7754 	if (!thread)
7755 		return;
7756 	pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
7757 	/* Locking ensures that mddev_unlock does not wake_up a
7758 	 * non-existent thread
7759 	 */
7760 	spin_lock(&pers_lock);
7761 	*threadp = NULL;
7762 	spin_unlock(&pers_lock);
7763 
7764 	kthread_stop(thread->tsk);
7765 	kfree(thread);
7766 }
7767 EXPORT_SYMBOL(md_unregister_thread);
7768 
7769 void md_error(struct mddev *mddev, struct md_rdev *rdev)
7770 {
7771 	if (!rdev || test_bit(Faulty, &rdev->flags))
7772 		return;
7773 
7774 	if (!mddev->pers || !mddev->pers->error_handler)
7775 		return;
7776 	mddev->pers->error_handler(mddev,rdev);
7777 	if (mddev->degraded)
7778 		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
7779 	sysfs_notify_dirent_safe(rdev->sysfs_state);
7780 	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7781 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7782 	md_wakeup_thread(mddev->thread);
7783 	if (mddev->event_work.func)
7784 		queue_work(md_misc_wq, &mddev->event_work);
7785 	md_new_event(mddev);
7786 }
7787 EXPORT_SYMBOL(md_error);
7788 
7789 /* seq_file implementation /proc/mdstat */
7790 
7791 static void status_unused(struct seq_file *seq)
7792 {
7793 	int i = 0;
7794 	struct md_rdev *rdev;
7795 
7796 	seq_printf(seq, "unused devices: ");
7797 
7798 	list_for_each_entry(rdev, &pending_raid_disks, same_set) {
7799 		char b[BDEVNAME_SIZE];
7800 		i++;
7801 		seq_printf(seq, "%s ",
7802 			      bdevname(rdev->bdev,b));
7803 	}
7804 	if (!i)
7805 		seq_printf(seq, "<none>");
7806 
7807 	seq_printf(seq, "\n");
7808 }
7809 
7810 static int status_resync(struct seq_file *seq, struct mddev *mddev)
7811 {
7812 	sector_t max_sectors, resync, res;
7813 	unsigned long dt, db = 0;
7814 	sector_t rt, curr_mark_cnt, resync_mark_cnt;
7815 	int scale, recovery_active;
7816 	unsigned int per_milli;
7817 
7818 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
7819 	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
7820 		max_sectors = mddev->resync_max_sectors;
7821 	else
7822 		max_sectors = mddev->dev_sectors;
7823 
7824 	resync = mddev->curr_resync;
7825 	if (resync <= 3) {
7826 		if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
7827 			/* Still cleaning up */
7828 			resync = max_sectors;
7829 	} else if (resync > max_sectors)
7830 		resync = max_sectors;
7831 	else
7832 		resync -= atomic_read(&mddev->recovery_active);
7833 
7834 	if (resync == 0) {
7835 		if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) {
7836 			struct md_rdev *rdev;
7837 
7838 			rdev_for_each(rdev, mddev)
7839 				if (rdev->raid_disk >= 0 &&
7840 				    !test_bit(Faulty, &rdev->flags) &&
7841 				    rdev->recovery_offset != MaxSector &&
7842 				    rdev->recovery_offset) {
7843 					seq_printf(seq, "\trecover=REMOTE");
7844 					return 1;
7845 				}
7846 			if (mddev->reshape_position != MaxSector)
7847 				seq_printf(seq, "\treshape=REMOTE");
7848 			else
7849 				seq_printf(seq, "\tresync=REMOTE");
7850 			return 1;
7851 		}
7852 		if (mddev->recovery_cp < MaxSector) {
7853 			seq_printf(seq, "\tresync=PENDING");
7854 			return 1;
7855 		}
7856 		return 0;
7857 	}
7858 	if (resync < 3) {
7859 		seq_printf(seq, "\tresync=DELAYED");
7860 		return 1;
7861 	}
7862 
7863 	WARN_ON(max_sectors == 0);
7864 	/* Pick 'scale' such that (resync>>scale)*1000 will fit
7865 	 * in a sector_t, and (max_sectors>>scale) will fit in a
7866 	 * u32, as those are the requirements for sector_div.
7867 	 * Thus 'scale' must be at least 10
7868 	 */
7869 	scale = 10;
7870 	if (sizeof(sector_t) > sizeof(unsigned long)) {
7871 		while ( max_sectors/2 > (1ULL<<(scale+32)))
7872 			scale++;
7873 	}
7874 	res = (resync>>scale)*1000;
7875 	sector_div(res, (u32)((max_sectors>>scale)+1));
7876 
7877 	per_milli = res;
7878 	{
7879 		int i, x = per_milli/50, y = 20-x;
7880 		seq_printf(seq, "[");
7881 		for (i = 0; i < x; i++)
7882 			seq_printf(seq, "=");
7883 		seq_printf(seq, ">");
7884 		for (i = 0; i < y; i++)
7885 			seq_printf(seq, ".");
7886 		seq_printf(seq, "] ");
7887 	}
7888 	seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
7889 		   (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
7890 		    "reshape" :
7891 		    (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
7892 		     "check" :
7893 		     (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
7894 		      "resync" : "recovery"))),
7895 		   per_milli/10, per_milli % 10,
7896 		   (unsigned long long) resync/2,
7897 		   (unsigned long long) max_sectors/2);
7898 
7899 	/*
7900 	 * dt: time from mark until now
7901 	 * db: blocks written from mark until now
7902 	 * rt: remaining time
7903 	 *
7904 	 * rt is a sector_t, which is always 64bit now. We are keeping
7905 	 * the original algorithm, but it is not really necessary.
7906 	 *
7907 	 * Original algorithm:
7908 	 *   So we divide before multiply in case it is 32bit and close
7909 	 *   to the limit.
7910 	 *   We scale the divisor (db) by 32 to avoid losing precision
7911 	 *   near the end of resync when the number of remaining sectors
7912 	 *   is close to 'db'.
7913 	 *   We then divide rt by 32 after multiplying by db to compensate.
7914 	 *   The '+1' avoids division by zero if db is very small.
7915 	 */
7916 	dt = ((jiffies - mddev->resync_mark) / HZ);
7917 	if (!dt) dt++;
7918 
7919 	curr_mark_cnt = mddev->curr_mark_cnt;
7920 	recovery_active = atomic_read(&mddev->recovery_active);
7921 	resync_mark_cnt = mddev->resync_mark_cnt;
7922 
7923 	if (curr_mark_cnt >= (recovery_active + resync_mark_cnt))
7924 		db = curr_mark_cnt - (recovery_active + resync_mark_cnt);
7925 
7926 	rt = max_sectors - resync;    /* number of remaining sectors */
7927 	rt = div64_u64(rt, db/32+1);
7928 	rt *= dt;
7929 	rt >>= 5;
7930 
7931 	seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
7932 		   ((unsigned long)rt % 60)/6);
7933 
7934 	seq_printf(seq, " speed=%ldK/sec", db/2/dt);
7935 	return 1;
7936 }
7937 
7938 static void *md_seq_start(struct seq_file *seq, loff_t *pos)
7939 {
7940 	struct list_head *tmp;
7941 	loff_t l = *pos;
7942 	struct mddev *mddev;
7943 
7944 	if (l >= 0x10000)
7945 		return NULL;
7946 	if (!l--)
7947 		/* header */
7948 		return (void*)1;
7949 
7950 	spin_lock(&all_mddevs_lock);
7951 	list_for_each(tmp,&all_mddevs)
7952 		if (!l--) {
7953 			mddev = list_entry(tmp, struct mddev, all_mddevs);
7954 			mddev_get(mddev);
7955 			spin_unlock(&all_mddevs_lock);
7956 			return mddev;
7957 		}
7958 	spin_unlock(&all_mddevs_lock);
7959 	if (!l--)
7960 		return (void*)2;/* tail */
7961 	return NULL;
7962 }
7963 
7964 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
7965 {
7966 	struct list_head *tmp;
7967 	struct mddev *next_mddev, *mddev = v;
7968 
7969 	++*pos;
7970 	if (v == (void*)2)
7971 		return NULL;
7972 
7973 	spin_lock(&all_mddevs_lock);
7974 	if (v == (void*)1)
7975 		tmp = all_mddevs.next;
7976 	else
7977 		tmp = mddev->all_mddevs.next;
7978 	if (tmp != &all_mddevs)
7979 		next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs));
7980 	else {
7981 		next_mddev = (void*)2;
7982 		*pos = 0x10000;
7983 	}
7984 	spin_unlock(&all_mddevs_lock);
7985 
7986 	if (v != (void*)1)
7987 		mddev_put(mddev);
7988 	return next_mddev;
7989 
7990 }
7991 
/*
 * seq_file ->stop for /proc/mdstat: drop the reference on the current
 * array, unless @v is NULL or one of the header/tail tokens (1, 2).
 */
static void md_seq_stop(struct seq_file *seq, void *v)
{
	struct mddev *mddev = v;

	if (!mddev)
		return;
	if (v == (void*)1 || v == (void*)2)
		return;

	mddev_put(mddev);
}
7999 
8000 static int md_seq_show(struct seq_file *seq, void *v)
8001 {
8002 	struct mddev *mddev = v;
8003 	sector_t sectors;
8004 	struct md_rdev *rdev;
8005 
8006 	if (v == (void*)1) {
8007 		struct md_personality *pers;
8008 		seq_printf(seq, "Personalities : ");
8009 		spin_lock(&pers_lock);
8010 		list_for_each_entry(pers, &pers_list, list)
8011 			seq_printf(seq, "[%s] ", pers->name);
8012 
8013 		spin_unlock(&pers_lock);
8014 		seq_printf(seq, "\n");
8015 		seq->poll_event = atomic_read(&md_event_count);
8016 		return 0;
8017 	}
8018 	if (v == (void*)2) {
8019 		status_unused(seq);
8020 		return 0;
8021 	}
8022 
8023 	spin_lock(&mddev->lock);
8024 	if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
8025 		seq_printf(seq, "%s : %sactive", mdname(mddev),
8026 						mddev->pers ? "" : "in");
8027 		if (mddev->pers) {
8028 			if (mddev->ro==1)
8029 				seq_printf(seq, " (read-only)");
8030 			if (mddev->ro==2)
8031 				seq_printf(seq, " (auto-read-only)");
8032 			seq_printf(seq, " %s", mddev->pers->name);
8033 		}
8034 
8035 		sectors = 0;
8036 		rcu_read_lock();
8037 		rdev_for_each_rcu(rdev, mddev) {
8038 			char b[BDEVNAME_SIZE];
8039 			seq_printf(seq, " %s[%d]",
8040 				bdevname(rdev->bdev,b), rdev->desc_nr);
8041 			if (test_bit(WriteMostly, &rdev->flags))
8042 				seq_printf(seq, "(W)");
8043 			if (test_bit(Journal, &rdev->flags))
8044 				seq_printf(seq, "(J)");
8045 			if (test_bit(Faulty, &rdev->flags)) {
8046 				seq_printf(seq, "(F)");
8047 				continue;
8048 			}
8049 			if (rdev->raid_disk < 0)
8050 				seq_printf(seq, "(S)"); /* spare */
8051 			if (test_bit(Replacement, &rdev->flags))
8052 				seq_printf(seq, "(R)");
8053 			sectors += rdev->sectors;
8054 		}
8055 		rcu_read_unlock();
8056 
8057 		if (!list_empty(&mddev->disks)) {
8058 			if (mddev->pers)
8059 				seq_printf(seq, "\n      %llu blocks",
8060 					   (unsigned long long)
8061 					   mddev->array_sectors / 2);
8062 			else
8063 				seq_printf(seq, "\n      %llu blocks",
8064 					   (unsigned long long)sectors / 2);
8065 		}
8066 		if (mddev->persistent) {
8067 			if (mddev->major_version != 0 ||
8068 			    mddev->minor_version != 90) {
8069 				seq_printf(seq," super %d.%d",
8070 					   mddev->major_version,
8071 					   mddev->minor_version);
8072 			}
8073 		} else if (mddev->external)
8074 			seq_printf(seq, " super external:%s",
8075 				   mddev->metadata_type);
8076 		else
8077 			seq_printf(seq, " super non-persistent");
8078 
8079 		if (mddev->pers) {
8080 			mddev->pers->status(seq, mddev);
8081 			seq_printf(seq, "\n      ");
8082 			if (mddev->pers->sync_request) {
8083 				if (status_resync(seq, mddev))
8084 					seq_printf(seq, "\n      ");
8085 			}
8086 		} else
8087 			seq_printf(seq, "\n       ");
8088 
8089 		md_bitmap_status(seq, mddev->bitmap);
8090 
8091 		seq_printf(seq, "\n");
8092 	}
8093 	spin_unlock(&mddev->lock);
8094 
8095 	return 0;
8096 }
8097 
8098 static const struct seq_operations md_seq_ops = {
8099 	.start  = md_seq_start,
8100 	.next   = md_seq_next,
8101 	.stop   = md_seq_stop,
8102 	.show   = md_seq_show,
8103 };
8104 
8105 static int md_seq_open(struct inode *inode, struct file *file)
8106 {
8107 	struct seq_file *seq;
8108 	int error;
8109 
8110 	error = seq_open(file, &md_seq_ops);
8111 	if (error)
8112 		return error;
8113 
8114 	seq = file->private_data;
8115 	seq->poll_event = atomic_read(&md_event_count);
8116 	return error;
8117 }
8118 
8119 static int md_unloading;
8120 static __poll_t mdstat_poll(struct file *filp, poll_table *wait)
8121 {
8122 	struct seq_file *seq = filp->private_data;
8123 	__poll_t mask;
8124 
8125 	if (md_unloading)
8126 		return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
8127 	poll_wait(filp, &md_event_waiters, wait);
8128 
8129 	/* always allow read */
8130 	mask = EPOLLIN | EPOLLRDNORM;
8131 
8132 	if (seq->poll_event != atomic_read(&md_event_count))
8133 		mask |= EPOLLERR | EPOLLPRI;
8134 	return mask;
8135 }
8136 
8137 static const struct file_operations md_seq_fops = {
8138 	.owner		= THIS_MODULE,
8139 	.open           = md_seq_open,
8140 	.read           = seq_read,
8141 	.llseek         = seq_lseek,
8142 	.release	= seq_release,
8143 	.poll		= mdstat_poll,
8144 };
8145 
8146 int register_md_personality(struct md_personality *p)
8147 {
8148 	pr_debug("md: %s personality registered for level %d\n",
8149 		 p->name, p->level);
8150 	spin_lock(&pers_lock);
8151 	list_add_tail(&p->list, &pers_list);
8152 	spin_unlock(&pers_lock);
8153 	return 0;
8154 }
8155 EXPORT_SYMBOL(register_md_personality);
8156 
8157 int unregister_md_personality(struct md_personality *p)
8158 {
8159 	pr_debug("md: %s personality unregistered\n", p->name);
8160 	spin_lock(&pers_lock);
8161 	list_del_init(&p->list);
8162 	spin_unlock(&pers_lock);
8163 	return 0;
8164 }
8165 EXPORT_SYMBOL(unregister_md_personality);
8166 
8167 int register_md_cluster_operations(struct md_cluster_operations *ops,
8168 				   struct module *module)
8169 {
8170 	int ret = 0;
8171 	spin_lock(&pers_lock);
8172 	if (md_cluster_ops != NULL)
8173 		ret = -EALREADY;
8174 	else {
8175 		md_cluster_ops = ops;
8176 		md_cluster_mod = module;
8177 	}
8178 	spin_unlock(&pers_lock);
8179 	return ret;
8180 }
8181 EXPORT_SYMBOL(register_md_cluster_operations);
8182 
8183 int unregister_md_cluster_operations(void)
8184 {
8185 	spin_lock(&pers_lock);
8186 	md_cluster_ops = NULL;
8187 	spin_unlock(&pers_lock);
8188 	return 0;
8189 }
8190 EXPORT_SYMBOL(unregister_md_cluster_operations);
8191 
8192 int md_setup_cluster(struct mddev *mddev, int nodes)
8193 {
8194 	if (!md_cluster_ops)
8195 		request_module("md-cluster");
8196 	spin_lock(&pers_lock);
8197 	/* ensure module won't be unloaded */
8198 	if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
8199 		pr_warn("can't find md-cluster module or get it's reference.\n");
8200 		spin_unlock(&pers_lock);
8201 		return -ENOENT;
8202 	}
8203 	spin_unlock(&pers_lock);
8204 
8205 	return md_cluster_ops->join(mddev, nodes);
8206 }
8207 
8208 void md_cluster_stop(struct mddev *mddev)
8209 {
8210 	if (!md_cluster_ops)
8211 		return;
8212 	md_cluster_ops->leave(mddev);
8213 	module_put(md_cluster_mod);
8214 }
8215 
8216 static int is_mddev_idle(struct mddev *mddev, int init)
8217 {
8218 	struct md_rdev *rdev;
8219 	int idle;
8220 	int curr_events;
8221 
8222 	idle = 1;
8223 	rcu_read_lock();
8224 	rdev_for_each_rcu(rdev, mddev) {
8225 		struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
8226 		curr_events = (int)part_stat_read_accum(&disk->part0, sectors) -
8227 			      atomic_read(&disk->sync_io);
8228 		/* sync IO will cause sync_io to increase before the disk_stats
8229 		 * as sync_io is counted when a request starts, and
8230 		 * disk_stats is counted when it completes.
8231 		 * So resync activity will cause curr_events to be smaller than
8232 		 * when there was no such activity.
8233 		 * non-sync IO will cause disk_stat to increase without
8234 		 * increasing sync_io so curr_events will (eventually)
8235 		 * be larger than it was before.  Once it becomes
8236 		 * substantially larger, the test below will cause
8237 		 * the array to appear non-idle, and resync will slow
8238 		 * down.
8239 		 * If there is a lot of outstanding resync activity when
8240 		 * we set last_event to curr_events, then all that activity
8241 		 * completing might cause the array to appear non-idle
8242 		 * and resync will be slowed down even though there might
8243 		 * not have been non-resync activity.  This will only
8244 		 * happen once though.  'last_events' will soon reflect
8245 		 * the state where there is little or no outstanding
8246 		 * resync requests, and further resync activity will
8247 		 * always make curr_events less than last_events.
8248 		 *
8249 		 */
8250 		if (init || curr_events - rdev->last_events > 64) {
8251 			rdev->last_events = curr_events;
8252 			idle = 0;
8253 		}
8254 	}
8255 	rcu_read_unlock();
8256 	return idle;
8257 }
8258 
8259 void md_done_sync(struct mddev *mddev, int blocks, int ok)
8260 {
8261 	/* another "blocks" (512byte) blocks have been synced */
8262 	atomic_sub(blocks, &mddev->recovery_active);
8263 	wake_up(&mddev->recovery_wait);
8264 	if (!ok) {
8265 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8266 		set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
8267 		md_wakeup_thread(mddev->thread);
8268 		// stop recovery, signal do_sync ....
8269 	}
8270 }
8271 EXPORT_SYMBOL(md_done_sync);
8272 
8273 /* md_write_start(mddev, bi)
8274  * If we need to update some array metadata (e.g. 'active' flag
8275  * in superblock) before writing, schedule a superblock update
8276  * and wait for it to complete.
8277  * A return value of 'false' means that the write wasn't recorded
8278  * and cannot proceed as the array is being suspend.
8279  */
8280 bool md_write_start(struct mddev *mddev, struct bio *bi)
8281 {
8282 	int did_change = 0;
8283 
8284 	if (bio_data_dir(bi) != WRITE)
8285 		return true;
8286 
8287 	BUG_ON(mddev->ro == 1);
8288 	if (mddev->ro == 2) {
8289 		/* need to switch to read/write */
8290 		mddev->ro = 0;
8291 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8292 		md_wakeup_thread(mddev->thread);
8293 		md_wakeup_thread(mddev->sync_thread);
8294 		did_change = 1;
8295 	}
8296 	rcu_read_lock();
8297 	percpu_ref_get(&mddev->writes_pending);
8298 	smp_mb(); /* Match smp_mb in set_in_sync() */
8299 	if (mddev->safemode == 1)
8300 		mddev->safemode = 0;
8301 	/* sync_checkers is always 0 when writes_pending is in per-cpu mode */
8302 	if (mddev->in_sync || mddev->sync_checkers) {
8303 		spin_lock(&mddev->lock);
8304 		if (mddev->in_sync) {
8305 			mddev->in_sync = 0;
8306 			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8307 			set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8308 			md_wakeup_thread(mddev->thread);
8309 			did_change = 1;
8310 		}
8311 		spin_unlock(&mddev->lock);
8312 	}
8313 	rcu_read_unlock();
8314 	if (did_change)
8315 		sysfs_notify_dirent_safe(mddev->sysfs_state);
8316 	if (!mddev->has_superblocks)
8317 		return true;
8318 	wait_event(mddev->sb_wait,
8319 		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
8320 		   mddev->suspended);
8321 	if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
8322 		percpu_ref_put(&mddev->writes_pending);
8323 		return false;
8324 	}
8325 	return true;
8326 }
8327 EXPORT_SYMBOL(md_write_start);
8328 
8329 /* md_write_inc can only be called when md_write_start() has
8330  * already been called at least once of the current request.
8331  * It increments the counter and is useful when a single request
8332  * is split into several parts.  Each part causes an increment and
8333  * so needs a matching md_write_end().
8334  * Unlike md_write_start(), it is safe to call md_write_inc() inside
8335  * a spinlocked region.
8336  */
8337 void md_write_inc(struct mddev *mddev, struct bio *bi)
8338 {
8339 	if (bio_data_dir(bi) != WRITE)
8340 		return;
8341 	WARN_ON_ONCE(mddev->in_sync || mddev->ro);
8342 	percpu_ref_get(&mddev->writes_pending);
8343 }
8344 EXPORT_SYMBOL(md_write_inc);
8345 
8346 void md_write_end(struct mddev *mddev)
8347 {
8348 	percpu_ref_put(&mddev->writes_pending);
8349 
8350 	if (mddev->safemode == 2)
8351 		md_wakeup_thread(mddev->thread);
8352 	else if (mddev->safemode_delay)
8353 		/* The roundup() ensures this only performs locking once
8354 		 * every ->safemode_delay jiffies
8355 		 */
8356 		mod_timer(&mddev->safemode_timer,
8357 			  roundup(jiffies, mddev->safemode_delay) +
8358 			  mddev->safemode_delay);
8359 }
8360 
8361 EXPORT_SYMBOL(md_write_end);
8362 
8363 /* md_allow_write(mddev)
8364  * Calling this ensures that the array is marked 'active' so that writes
8365  * may proceed without blocking.  It is important to call this before
8366  * attempting a GFP_KERNEL allocation while holding the mddev lock.
8367  * Must be called with mddev_lock held.
8368  */
8369 void md_allow_write(struct mddev *mddev)
8370 {
8371 	if (!mddev->pers)
8372 		return;
8373 	if (mddev->ro)
8374 		return;
8375 	if (!mddev->pers->sync_request)
8376 		return;
8377 
8378 	spin_lock(&mddev->lock);
8379 	if (mddev->in_sync) {
8380 		mddev->in_sync = 0;
8381 		set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8382 		set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8383 		if (mddev->safemode_delay &&
8384 		    mddev->safemode == 0)
8385 			mddev->safemode = 1;
8386 		spin_unlock(&mddev->lock);
8387 		md_update_sb(mddev, 0);
8388 		sysfs_notify_dirent_safe(mddev->sysfs_state);
8389 		/* wait for the dirty state to be recorded in the metadata */
8390 		wait_event(mddev->sb_wait,
8391 			   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
8392 	} else
8393 		spin_unlock(&mddev->lock);
8394 }
8395 EXPORT_SYMBOL_GPL(md_allow_write);
8396 
/*
 * Tunables used by md_do_sync():
 *  SYNC_MARKS       - number of (time, sector count) samples kept in
 *                     mark[] / mark_cnt[] for the speed calculation
 *  SYNC_MARK_STEP   - minimum time (in jiffies) between moving on to the
 *                     next sample
 *  UPDATE_FREQUENCY - time (in jiffies) after which curr_resync_completed
 *                     is refreshed even if no other trigger fired
 */
8397 #define SYNC_MARKS	10
8398 #define	SYNC_MARK_STEP	(3*HZ)
8399 #define UPDATE_FREQUENCY (5*60*HZ)
/*
 * md_do_sync() - run one resync / check / repair / reshape / recovery pass.
 * @thread: the md_thread executing this function; thread->mddev is the array.
 *
 * Outline:
 *  - return at once if MD_RECOVERY_DONE or MD_RECOVERY_WAIT is set, or
 *    (after setting MD_RECOVERY_INTR) if the array is read-only;
 *  - on a clustered array call md_cluster_ops->resync_start() first;
 *  - unless ->parallel_resync is set, wait until no other array that
 *    match_mddev_units() reports as sharing units is ahead of us;
 *  - pick the starting sector 'j' and the end of the pass 'max_sectors';
 *  - call ->pers->sync_request() repeatedly, checking the speed against
 *    speed_min()/speed_max() and backing off while is_mddev_idle()
 *    reports other IO;
 *  - record the checkpoint (recovery_cp or per-device recovery_offset),
 *    set MD_RECOVERY_DONE and wake mddev->thread.
 */
8400 void md_do_sync(struct md_thread *thread)
8401 {
8402 	struct mddev *mddev = thread->mddev;
8403 	struct mddev *mddev2;
8404 	unsigned int currspeed = 0, window;
8405 	sector_t max_sectors,j, io_sectors, recovery_done;
8406 	unsigned long mark[SYNC_MARKS];
8407 	unsigned long update_time;
8408 	sector_t mark_cnt[SYNC_MARKS];
8409 	int last_mark,m;
8410 	struct list_head *tmp;
8411 	sector_t last_check;
8412 	int skipped = 0;
8413 	struct md_rdev *rdev;
8414 	char *desc, *action = NULL;
8415 	struct blk_plug plug;
8416 	int ret;
8417 
8418 	/* just in case thread restarts... */
8419 	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
8420 	    test_bit(MD_RECOVERY_WAIT, &mddev->recovery))
8421 		return;
8422 	if (mddev->ro) {/* never try to sync a read-only array */
8423 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8424 		return;
8425 	}
8426 
	/*
	 * Clustered array: call ->resync_start() first and skip the pass if
	 * it fails.  The pass is also skipped when none of SYNC / RESHAPE /
	 * RECOVER is set while curr_resync_completed is still below
	 * resync_max_sectors.
	 */
8427 	if (mddev_is_clustered(mddev)) {
8428 		ret = md_cluster_ops->resync_start(mddev);
8429 		if (ret)
8430 			goto skip;
8431 
8432 		set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags);
8433 		if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
8434 			test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ||
8435 			test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
8436 		     && ((unsigned long long)mddev->curr_resync_completed
8437 			 < (unsigned long long)mddev->resync_max_sectors))
8438 			goto skip;
8439 	}
8440 
	/* Pick the description used in log messages and, for check/repair, the action name. */
8441 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
8442 		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
8443 			desc = "data-check";
8444 			action = "check";
8445 		} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
8446 			desc = "requested-resync";
8447 			action = "repair";
8448 		} else
8449 			desc = "resync";
8450 	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
8451 		desc = "reshape";
8452 	else
8453 		desc = "recovery";
8454 
	/* 'action' is only set for check/repair; otherwise the description doubles as the action. */
8455 	mddev->last_sync_action = action ?: desc;
8456 
8457 	/* we overload curr_resync somewhat here.
8458 	 * 0 == not engaged in resync at all
8459 	 * 2 == checking that there is no conflict with another sync
8460 	 * 1 == like 2, but have yielded to allow conflicting resync to
8461 	 *		commence
8462 	 * other == active in resync - this many blocks
8463 	 *
8464 	 * Before starting a resync we must have set curr_resync to
8465 	 * 2, and then checked that every "conflicting" array has curr_resync
8466 	 * less than ours.  When we find one that is the same or higher
8467 	 * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
8468 	 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
8469 	 * This will mean we have to start checking from the beginning again.
8470 	 *
8471 	 */
8472 
8473 	do {
		/* minor of the array we last logged a "delaying" message for; avoids repeats */
8474 		int mddev2_minor = -1;
8475 		mddev->curr_resync = 2;
8476 
8477 	try_again:
8478 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8479 			goto skip;
8480 		for_each_mddev(mddev2, tmp) {
8481 			if (mddev2 == mddev)
8482 				continue;
8483 			if (!mddev->parallel_resync
8484 			&&  mddev2->curr_resync
8485 			&&  match_mddev_units(mddev, mddev2)) {
8486 				DEFINE_WAIT(wq);
8487 				if (mddev < mddev2 && mddev->curr_resync == 2) {
8488 					/* arbitrarily yield */
8489 					mddev->curr_resync = 1;
8490 					wake_up(&resync_wait);
8491 				}
8492 				if (mddev > mddev2 && mddev->curr_resync == 1)
8493 					/* no need to wait here, we can wait the next
8494 					 * time 'round when curr_resync == 2
8495 					 */
8496 					continue;
8497 				/* We need to wait 'interruptible' so as not to
8498 				 * contribute to the load average, and not to
8499 				 * be caught by 'softlockup'
8500 				 */
8501 				prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
8502 				if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
8503 				    mddev2->curr_resync >= mddev->curr_resync) {
8504 					if (mddev2_minor != mddev2->md_minor) {
8505 						mddev2_minor = mddev2->md_minor;
8506 						pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n",
8507 							desc, mdname(mddev),
8508 							mdname(mddev2));
8509 					}
					/*
					 * NOTE(review): presumably drops the reference that
					 * for_each_mddev() holds on mddev2, since the loop is
					 * left via goto below - confirm against the macro.
					 */
8510 					mddev_put(mddev2);
8511 					if (signal_pending(current))
8512 						flush_signals(current);
8513 					schedule();
8514 					finish_wait(&resync_wait, &wq);
8515 					goto try_again;
8516 				}
8517 				finish_wait(&resync_wait, &wq);
8518 			}
8519 		}
8520 	} while (mddev->curr_resync < 2);
8521 
	/* Choose the starting sector 'j' and the end of the pass 'max_sectors'. */
8522 	j = 0;
8523 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
8524 		/* resync follows the size requested by the personality,
8525 		 * which defaults to physical size, but can be virtual size
8526 		 */
8527 		max_sectors = mddev->resync_max_sectors;
8528 		atomic64_set(&mddev->resync_mismatches, 0);
8529 		/* we don't use the checkpoint if there's a bitmap */
8530 		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
8531 			j = mddev->resync_min;
8532 		else if (!mddev->bitmap)
8533 			j = mddev->recovery_cp;
8534 
8535 	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
8536 		max_sectors = mddev->resync_max_sectors;
8537 		/*
8538 		 * If the original node aborts reshaping then we continue the
8539 		 * reshaping, so set j again to avoid restart reshape from the
8540 		 * first beginning
8541 		 */
8542 		if (mddev_is_clustered(mddev) &&
8543 		    mddev->reshape_position != MaxSector)
8544 			j = mddev->reshape_position;
8545 	} else {
8546 		/* recovery follows the physical size of devices */
8547 		max_sectors = mddev->dev_sectors;
		/* start from the smallest recovery_offset of any device still being recovered */
8548 		j = MaxSector;
8549 		rcu_read_lock();
8550 		rdev_for_each_rcu(rdev, mddev)
8551 			if (rdev->raid_disk >= 0 &&
8552 			    !test_bit(Journal, &rdev->flags) &&
8553 			    !test_bit(Faulty, &rdev->flags) &&
8554 			    !test_bit(In_sync, &rdev->flags) &&
8555 			    rdev->recovery_offset < j)
8556 				j = rdev->recovery_offset;
8557 		rcu_read_unlock();
8558 
8559 		/* If there is a bitmap, we need to make sure all
8560 		 * writes that started before we added a spare
8561 		 * complete before we start doing a recovery.
8562 		 * Otherwise the write might complete and (via
8563 		 * bitmap_endwrite) set a bit in the bitmap after the
8564 		 * recovery has checked that bit and skipped that
8565 		 * region.
8566 		 */
8567 		if (mddev->bitmap) {
8568 			mddev->pers->quiesce(mddev, 1);
8569 			mddev->pers->quiesce(mddev, 0);
8570 		}
8571 	}
8572 
8573 	pr_info("md: %s of RAID array %s\n", desc, mdname(mddev));
8574 	pr_debug("md: minimum _guaranteed_  speed: %d KB/sec/disk.\n", speed_min(mddev));
8575 	pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n",
8576 		 speed_max(mddev), desc);
8577 
8578 	is_mddev_idle(mddev, 1); /* this initializes IO event counters */
8579 
	/* All speed samples start out as "now, nothing done yet". */
8580 	io_sectors = 0;
8581 	for (m = 0; m < SYNC_MARKS; m++) {
8582 		mark[m] = jiffies;
8583 		mark_cnt[m] = io_sectors;
8584 	}
8585 	last_mark = 0;
8586 	mddev->resync_mark = mark[last_mark];
8587 	mddev->resync_mark_cnt = mark_cnt[last_mark];
8588 
8589 	/*
8590 	 * Tune reconstruction:
8591 	 */
8592 	window = 32 * (PAGE_SIZE / 512);
8593 	pr_debug("md: using %dk window, over a total of %lluk.\n",
8594 		 window/2, (unsigned long long)max_sectors/2);
8595 
8596 	atomic_set(&mddev->recovery_active, 0);
8597 	last_check = 0;
8598 
	/*
	 * curr_resync values 0..2 are reserved for the states described
	 * above, so the start sector is only published when it is larger.
	 */
8599 	if (j>2) {
8600 		pr_debug("md: resuming %s of %s from checkpoint.\n",
8601 			 desc, mdname(mddev));
8602 		mddev->curr_resync = j;
8603 	} else
8604 		mddev->curr_resync = 3; /* no longer delayed */
8605 	mddev->curr_resync_completed = j;
8606 	sysfs_notify(&mddev->kobj, NULL, "sync_completed");
8607 	md_new_event(mddev);
8608 	update_time = jiffies;
8609 
	/* Main loop: one ->sync_request() call per iteration until max_sectors is reached. */
8610 	blk_start_plug(&plug);
8611 	while (j < max_sectors) {
8612 		sector_t sectors;
8613 
8614 		skipped = 0;
8615 
8616 		if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
8617 		    ((mddev->curr_resync > mddev->curr_resync_completed &&
8618 		      (mddev->curr_resync - mddev->curr_resync_completed)
8619 		      > (max_sectors >> 4)) ||
8620 		     time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
8621 		     (j - mddev->curr_resync_completed)*2
8622 		     >= mddev->resync_max - mddev->curr_resync_completed ||
8623 		     mddev->curr_resync_completed > mddev->resync_max
8624 			    )) {
8625 			/* time to update curr_resync_completed */
8626 			wait_event(mddev->recovery_wait,
8627 				   atomic_read(&mddev->recovery_active) == 0);
8628 			mddev->curr_resync_completed = j;
8629 			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
8630 			    j > mddev->recovery_cp)
8631 				mddev->recovery_cp = j;
8632 			update_time = jiffies;
8633 			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8634 			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
8635 		}
8636 
8637 		while (j >= mddev->resync_max &&
8638 		       !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8639 			/* As this condition is controlled by user-space,
8640 			 * we can block indefinitely, so use '_interruptible'
8641 			 * to avoid triggering warnings.
8642 			 */
8643 			flush_signals(current); /* just in case */
8644 			wait_event_interruptible(mddev->recovery_wait,
8645 						 mddev->resync_max > j
8646 						 || test_bit(MD_RECOVERY_INTR,
8647 							     &mddev->recovery));
8648 		}
8649 
8650 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8651 			break;
8652 
		/*
		 * The return value is the number of sectors handled starting
		 * at j; a return of 0 is treated as an interruption.
		 */
8653 		sectors = mddev->pers->sync_request(mddev, j, &skipped);
8654 		if (sectors == 0) {
8655 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8656 			break;
8657 		}
8658 
8659 		if (!skipped) { /* actual IO requested */
8660 			io_sectors += sectors;
8661 			atomic_add(sectors, &mddev->recovery_active);
8662 		}
8663 
8664 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8665 			break;
8666 
8667 		j += sectors;
8668 		if (j > max_sectors)
8669 			/* when skipping, extra large numbers can be returned. */
8670 			j = max_sectors;
8671 		if (j > 2)
8672 			mddev->curr_resync = j;
8673 		mddev->curr_mark_cnt = io_sectors;
8674 		if (last_check == 0)
8675 			/* this is the earliest that rebuild will be
8676 			 * visible in /proc/mdstat
8677 			 */
8678 			md_new_event(mddev);
8679 
		/* Only run the speed check below once per 'window' sectors of real IO. */
8680 		if (last_check + window > io_sectors || j == max_sectors)
8681 			continue;
8682 
8683 		last_check = io_sectors;
8684 	repeat:
8685 		if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
8686 			/* step marks */
8687 			int next = (last_mark+1) % SYNC_MARKS;
8688 
8689 			mddev->resync_mark = mark[next];
8690 			mddev->resync_mark_cnt = mark_cnt[next];
8691 			mark[next] = jiffies;
8692 			mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
8693 			last_mark = next;
8694 		}
8695 
8696 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8697 			break;
8698 
8699 		/*
8700 		 * this loop exits only if either when we are slower than
8701 		 * the 'hard' speed limit, or the system was IO-idle for
8702 		 * a jiffy.
8703 		 * the system might be non-idle CPU-wise, but we only care
8704 		 * about not overloading the IO subsystem. (things like an
8705 		 * e2fsck being done on the RAID array should execute fast)
8706 		 */
8707 		cond_resched();
8708 
		/*
		 * currspeed: sectors completed since resync_mark, halved
		 * (512-byte sectors to KB), per elapsed second.  The "+1"
		 * in the divisor avoids dividing by zero.
		 */
8709 		recovery_done = io_sectors - atomic_read(&mddev->recovery_active);
8710 		currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2
8711 			/((jiffies-mddev->resync_mark)/HZ +1) +1;
8712 
8713 		if (currspeed > speed_min(mddev)) {
8714 			if (currspeed > speed_max(mddev)) {
8715 				msleep(500);
8716 				goto repeat;
8717 			}
8718 			if (!is_mddev_idle(mddev, 0)) {
8719 				/*
8720 				 * Give other IO more of a chance.
8721 				 * The faster the devices, the less we wait.
8722 				 */
8723 				wait_event(mddev->recovery_wait,
8724 					   !atomic_read(&mddev->recovery_active));
8725 			}
8726 		}
8727 	}
8728 	pr_info("md: %s: %s %s.\n",mdname(mddev), desc,
8729 		test_bit(MD_RECOVERY_INTR, &mddev->recovery)
8730 		? "interrupted" : "done");
8731 	/*
8732 	 * this also signals 'finished resyncing' to md_stop
8733 	 */
8734 	blk_finish_plug(&plug);
8735 	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
8736 
8737 	if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
8738 	    !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
8739 	    mddev->curr_resync > 3) {
8740 		mddev->curr_resync_completed = mddev->curr_resync;
8741 		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
8742 	}
	/*
	 * NOTE(review): final call with sector == max_sectors; presumably
	 * lets the personality finish/clean up the pass - confirm against
	 * the ->sync_request() implementations.
	 */
8743 	mddev->pers->sync_request(mddev, max_sectors, &skipped);
8744 
	/*
	 * Record how far we got: recovery_cp for a resync, per-device
	 * recovery_offset for a recovery.  Nothing is recorded for a
	 * "check" pass or when curr_resync never got past 3.
	 */
8745 	if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
8746 	    mddev->curr_resync > 3) {
8747 		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
8748 			if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8749 				if (mddev->curr_resync >= mddev->recovery_cp) {
8750 					pr_debug("md: checkpointing %s of %s.\n",
8751 						 desc, mdname(mddev));
8752 					if (test_bit(MD_RECOVERY_ERROR,
8753 						&mddev->recovery))
8754 						mddev->recovery_cp =
8755 							mddev->curr_resync_completed;
8756 					else
8757 						mddev->recovery_cp =
8758 							mddev->curr_resync;
8759 				}
8760 			} else
8761 				mddev->recovery_cp = MaxSector;
8762 		} else {
8763 			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8764 				mddev->curr_resync = MaxSector;
8765 			if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
8766 			    test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
8767 				rcu_read_lock();
8768 				rdev_for_each_rcu(rdev, mddev)
8769 					if (rdev->raid_disk >= 0 &&
8770 					    mddev->delta_disks >= 0 &&
8771 					    !test_bit(Journal, &rdev->flags) &&
8772 					    !test_bit(Faulty, &rdev->flags) &&
8773 					    !test_bit(In_sync, &rdev->flags) &&
8774 					    rdev->recovery_offset < mddev->curr_resync)
8775 						rdev->recovery_offset = mddev->curr_resync;
8776 				rcu_read_unlock();
8777 			}
8778 		}
8779 	}
8780  skip:
8781 	/* set CHANGE_PENDING here since maybe another update is needed,
8782 	 * so other nodes are informed. It should be harmless for normal
8783 	 * raid */
8784 	set_mask_bits(&mddev->sb_flags, 0,
8785 		      BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));
8786 
	/* An uninterrupted reshape that added disks: publish the new array size. */
8787 	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
8788 			!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
8789 			mddev->delta_disks > 0 &&
8790 			mddev->pers->finish_reshape &&
8791 			mddev->pers->size &&
8792 			mddev->queue) {
8793 		mddev_lock_nointr(mddev);
8794 		md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
8795 		mddev_unlock(mddev);
8796 		if (!mddev_is_clustered(mddev)) {
8797 			set_capacity(mddev->gendisk, mddev->array_sectors);
8798 			revalidate_disk(mddev->gendisk);
8799 		}
8800 	}
8801 
8802 	spin_lock(&mddev->lock);
8803 	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8804 		/* We completed so min/max setting can be forgotten if used. */
8805 		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
8806 			mddev->resync_min = 0;
8807 		mddev->resync_max = MaxSector;
8808 	} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
8809 		mddev->resync_min = mddev->curr_resync_completed;
8810 	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
8811 	mddev->curr_resync = 0;
8812 	spin_unlock(&mddev->lock);
8813 
	/* Let arrays waiting on us proceed and hand over to the array's own thread. */
8814 	wake_up(&resync_wait);
8815 	md_wakeup_thread(mddev->thread);
8816 	return;
8817 }
8818 EXPORT_SYMBOL_GPL(md_do_sync);
8819 
8820 static int remove_and_add_spares(struct mddev *mddev,
8821 				 struct md_rdev *this)
8822 {
8823 	struct md_rdev *rdev;
8824 	int spares = 0;
8825 	int removed = 0;
8826 	bool remove_some = false;
8827 
8828 	if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
8829 		/* Mustn't remove devices when resync thread is running */
8830 		return 0;
8831 
8832 	rdev_for_each(rdev, mddev) {
8833 		if ((this == NULL || rdev == this) &&
8834 		    rdev->raid_disk >= 0 &&
8835 		    !test_bit(Blocked, &rdev->flags) &&
8836 		    test_bit(Faulty, &rdev->flags) &&
8837 		    atomic_read(&rdev->nr_pending)==0) {
8838 			/* Faulty non-Blocked devices with nr_pending == 0
8839 			 * never get nr_pending incremented,
8840 			 * never get Faulty cleared, and never get Blocked set.
8841 			 * So we can synchronize_rcu now rather than once per device
8842 			 */
8843 			remove_some = true;
8844 			set_bit(RemoveSynchronized, &rdev->flags);
8845 		}
8846 	}
8847 
8848 	if (remove_some)
8849 		synchronize_rcu();
8850 	rdev_for_each(rdev, mddev) {
8851 		if ((this == NULL || rdev == this) &&
8852 		    rdev->raid_disk >= 0 &&
8853 		    !test_bit(Blocked, &rdev->flags) &&
8854 		    ((test_bit(RemoveSynchronized, &rdev->flags) ||
8855 		     (!test_bit(In_sync, &rdev->flags) &&
8856 		      !test_bit(Journal, &rdev->flags))) &&
8857 		    atomic_read(&rdev->nr_pending)==0)) {
8858 			if (mddev->pers->hot_remove_disk(
8859 				    mddev, rdev) == 0) {
8860 				sysfs_unlink_rdev(mddev, rdev);
8861 				rdev->saved_raid_disk = rdev->raid_disk;
8862 				rdev->raid_disk = -1;
8863 				removed++;
8864 			}
8865 		}
8866 		if (remove_some && test_bit(RemoveSynchronized, &rdev->flags))
8867 			clear_bit(RemoveSynchronized, &rdev->flags);
8868 	}
8869 
8870 	if (removed && mddev->kobj.sd)
8871 		sysfs_notify(&mddev->kobj, NULL, "degraded");
8872 
8873 	if (this && removed)
8874 		goto no_add;
8875 
8876 	rdev_for_each(rdev, mddev) {
8877 		if (this && this != rdev)
8878 			continue;
8879 		if (test_bit(Candidate, &rdev->flags))
8880 			continue;
8881 		if (rdev->raid_disk >= 0 &&
8882 		    !test_bit(In_sync, &rdev->flags) &&
8883 		    !test_bit(Journal, &rdev->flags) &&
8884 		    !test_bit(Faulty, &rdev->flags))
8885 			spares++;
8886 		if (rdev->raid_disk >= 0)
8887 			continue;
8888 		if (test_bit(Faulty, &rdev->flags))
8889 			continue;
8890 		if (!test_bit(Journal, &rdev->flags)) {
8891 			if (mddev->ro &&
8892 			    ! (rdev->saved_raid_disk >= 0 &&
8893 			       !test_bit(Bitmap_sync, &rdev->flags)))
8894 				continue;
8895 
8896 			rdev->recovery_offset = 0;
8897 		}
8898 		if (mddev->pers->
8899 		    hot_add_disk(mddev, rdev) == 0) {
8900 			if (sysfs_link_rdev(mddev, rdev))
8901 				/* failure here is OK */;
8902 			if (!test_bit(Journal, &rdev->flags))
8903 				spares++;
8904 			md_new_event(mddev);
8905 			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8906 		}
8907 	}
8908 no_add:
8909 	if (removed)
8910 		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8911 	return spares;
8912 }
8913 
8914 static void md_start_sync(struct work_struct *ws)
8915 {
8916 	struct mddev *mddev = container_of(ws, struct mddev, del_work);
8917 
8918 	mddev->sync_thread = md_register_thread(md_do_sync,
8919 						mddev,
8920 						"resync");
8921 	if (!mddev->sync_thread) {
8922 		pr_warn("%s: could not start resync thread...\n",
8923 			mdname(mddev));
8924 		/* leave the spares where they are, it shouldn't hurt */
8925 		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8926 		clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
8927 		clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
8928 		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
8929 		clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8930 		wake_up(&resync_wait);
8931 		if (test_and_clear_bit(MD_RECOVERY_RECOVER,
8932 				       &mddev->recovery))
8933 			if (mddev->sysfs_action)
8934 				sysfs_notify_dirent_safe(mddev->sysfs_action);
8935 	} else
8936 		md_wakeup_thread(mddev->sync_thread);
8937 	sysfs_notify_dirent_safe(mddev->sysfs_action);
8938 	md_new_event(mddev);
8939 }
8940 
8941 /*
8942  * This routine is regularly called by all per-raid-array threads to
8943  * deal with generic issues like resync and super-block update.
8944  * Raid personalities that don't have a thread (linear/raid0) do not
8945  * need this as they never do any recovery or update the superblock.
8946  *
8947  * It does not do any resync itself, but rather "forks" off other threads
8948  * to do that as needed.
8949  * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
8950  * "->recovery" and create a thread at ->sync_thread.
8951  * When the thread finishes it sets MD_RECOVERY_DONE
8952  * and wakes up this thread which will reap the thread and finish up.
8953  * This thread also removes any faulty devices (with nr_pending == 0).
8954  *
8955  * The overall approach is:
8956  *  1/ if the superblock needs updating, update it.
8957  *  2/ If a recovery thread is running, don't do anything else.
8958  *  3/ If recovery has finished, clean up, possibly marking spares active.
8959  *  4/ If there are any faulty devices, remove them.
8960  *  5/ If array is degraded, try to add spares devices
8961  *  6/ If array has spares or is not in-sync, start a resync thread.
8962  */
8963 void md_check_recovery(struct mddev *mddev)
8964 {
8965 	if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) {
8966 		/* Write superblock - thread that called mddev_suspend()
8967 		 * holds reconfig_mutex for us.
8968 		 */
8969 		set_bit(MD_UPDATING_SB, &mddev->flags);
8970 		smp_mb__after_atomic();
		/* Re-check after publishing MD_UPDATING_SB: permission may have been withdrawn. */
8971 		if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
8972 			md_update_sb(mddev, 0);
8973 		clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
8974 		wake_up(&mddev->sb_wait);
8975 	}
8976 
	/* Nothing else is done while the array is suspended. */
8977 	if (mddev->suspended)
8978 		return;
8979 
8980 	if (mddev->bitmap)
8981 		md_bitmap_daemon_work(mddev);
8982 
	/*
	 * A pending signal on this thread puts a non-external array whose
	 * personality has a ->sync_request method into "immediate safe
	 * mode" (safemode == 2); the signal itself is then discarded.
	 */
8983 	if (signal_pending(current)) {
8984 		if (mddev->pers->sync_request && !mddev->external) {
8985 			pr_debug("md: %s in immediate safe mode\n",
8986 				 mdname(mddev));
8987 			mddev->safemode = 2;
8988 		}
8989 		flush_signals(current);
8990 	}
8991 
8992 	if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
8993 		return;
	/*
	 * Return unless there is something to do: a superblock flag other
	 * than CHANGE_PENDING, recovery needed or done, a non-external
	 * array in safemode 1, or safemode 2 on an array that is not
	 * in_sync while recovery_cp == MaxSector.
	 */
8994 	if ( ! (
8995 		(mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
8996 		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
8997 		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
8998 		(mddev->external == 0 && mddev->safemode == 1) ||
8999 		(mddev->safemode == 2
9000 		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
9001 		))
9002 		return;
9003 
	/*
	 * NOTE(review): presumably a non-blocking attempt to take the
	 * array's reconfig mutex; if it fails nothing below runs on this
	 * call - confirm against mddev_trylock().
	 */
9004 	if (mddev_trylock(mddev)) {
9005 		int spares = 0;
		/* Sample safemode before it is reset just below. */
9006 		bool try_set_sync = mddev->safemode != 0;
9007 
9008 		if (!mddev->external && mddev->safemode == 1)
9009 			mddev->safemode = 0;
9010 
9011 		if (mddev->ro) {
9012 			struct md_rdev *rdev;
9013 			if (!mddev->external && mddev->in_sync)
9014 				/* 'Blocked' flag not needed as failed devices
9015 				 * will be recorded if array switched to read/write.
9016 				 * Leaving it set will prevent the device
9017 				 * from being removed.
9018 				 */
9019 				rdev_for_each(rdev, mddev)
9020 					clear_bit(Blocked, &rdev->flags);
9021 			/* On a read-only array we can:
9022 			 * - remove failed devices
9023 			 * - add already-in_sync devices if the array itself
9024 			 *   is in-sync.
9025 			 * As we only add devices that are already in-sync,
9026 			 * we can activate the spares immediately.
9027 			 */
9028 			remove_and_add_spares(mddev, NULL);
9029 			/* There is no thread, but we need to call
9030 			 * ->spare_active and clear saved_raid_disk
9031 			 */
9032 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
9033 			md_reap_sync_thread(mddev);
9034 			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9035 			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9036 			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
9037 			goto unlock;
9038 		}
9039 
9040 		if (mddev_is_clustered(mddev)) {
9041 			struct md_rdev *rdev;
9042 			/* kick the device if another node issued a
9043 			 * remove disk.
9044 			 */
9045 			rdev_for_each(rdev, mddev) {
9046 				if (test_and_clear_bit(ClusterRemove, &rdev->flags) &&
9047 						rdev->raid_disk < 0)
9048 					md_kick_rdev_from_array(rdev);
9049 			}
9050 		}
9051 
9052 		if (try_set_sync && !mddev->external && !mddev->in_sync) {
9053 			spin_lock(&mddev->lock);
9054 			set_in_sync(mddev);
9055 			spin_unlock(&mddev->lock);
9056 		}
9057 
9058 		if (mddev->sb_flags)
9059 			md_update_sb(mddev, 0);
9060 
9061 		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
9062 		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
9063 			/* resync/recovery still happening */
9064 			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9065 			goto unlock;
9066 		}
		/* A sync thread that is no longer running (see test above) is reaped here. */
9067 		if (mddev->sync_thread) {
9068 			md_reap_sync_thread(mddev);
9069 			goto unlock;
9070 		}
9071 		/* Set RUNNING before clearing NEEDED to avoid
9072 		 * any transients in the value of "sync_action".
9073 		 */
9074 		mddev->curr_resync_completed = 0;
9075 		spin_lock(&mddev->lock);
9076 		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9077 		spin_unlock(&mddev->lock);
9078 		/* Clear some bits that don't mean anything, but
9079 		 * might be left set
9080 		 */
9081 		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
9082 		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
9083 
9084 		if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
9085 		    test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
9086 			goto not_running;
9087 		/* no recovery is running.
9088 		 * remove any failed drives, then
9089 		 * add spares if possible.
9090 		 * Spares are also removed and re-added, to allow
9091 		 * the personality to fail the re-add.
9092 		 */
9093 
		/*
		 * Decide what kind of pass to start, in priority order:
		 * continue a reshape, recover onto spares, resync from
		 * recovery_cp, or a pass already requested via RECOVERY_SYNC.
		 */
9094 		if (mddev->reshape_position != MaxSector) {
9095 			if (mddev->pers->check_reshape == NULL ||
9096 			    mddev->pers->check_reshape(mddev) != 0)
9097 				/* Cannot proceed */
9098 				goto not_running;
9099 			set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
9100 			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9101 		} else if ((spares = remove_and_add_spares(mddev, NULL))) {
9102 			clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9103 			clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
9104 			clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
9105 			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9106 		} else if (mddev->recovery_cp < MaxSector) {
9107 			set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9108 			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9109 		} else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
9110 			/* nothing to be done ... */
9111 			goto not_running;
9112 
9113 		if (mddev->pers->sync_request) {
9114 			if (spares) {
9115 				/* We are adding a device or devices to an array
9116 				 * which has the bitmap stored on all devices.
9117 				 * So make sure all bitmap pages get written
9118 				 */
9119 				md_bitmap_write_all(mddev->bitmap);
9120 			}
			/* The sync thread itself is created by md_start_sync(), queued on md_misc_wq. */
9121 			INIT_WORK(&mddev->del_work, md_start_sync);
9122 			queue_work(md_misc_wq, &mddev->del_work);
9123 			goto unlock;
9124 		}
9125 	not_running:
		/* No pass was started: undo MD_RECOVERY_RUNNING and wake anyone waiting on it. */
9126 		if (!mddev->sync_thread) {
9127 			clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9128 			wake_up(&resync_wait);
9129 			if (test_and_clear_bit(MD_RECOVERY_RECOVER,
9130 					       &mddev->recovery))
9131 				if (mddev->sysfs_action)
9132 					sysfs_notify_dirent_safe(mddev->sysfs_action);
9133 		}
9134 	unlock:
9135 		wake_up(&mddev->sb_wait);
9136 		mddev_unlock(mddev);
9137 	}
9138 }
9139 EXPORT_SYMBOL(md_check_recovery);
9140 
/*
 * md_reap_sync_thread() - collect the outcome of a finished sync thread.
 * @mddev: array whose sync thread is being torn down.
 *
 * Unregisters mddev->sync_thread, gives the personality the chance to
 * activate spares and to finish a reshape, writes the superblock, clears
 * the per-run MD_RECOVERY_* bits and finally sets MD_RECOVERY_NEEDED again
 * before notifying waiters and sysfs.
 */
void md_reap_sync_thread(struct mddev *mddev)
{
	struct md_rdev *rdev;
	/* Sampled before finish_reshape()/md_update_sb(); handed to update_size below */
	sector_t old_dev_sectors = mddev->dev_sectors;
	/* Only ever set for clustered arrays whose personality finished a reshape */
	bool is_reshaped = false;

	/* resync has finished, collect result */
	md_unregister_thread(&mddev->sync_thread);
	/*
	 * spare_active() is only called when neither MD_RECOVERY_INTR nor
	 * MD_RECOVERY_REQUESTED is set and not every raid disk is degraded.
	 */
	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
	    mddev->degraded != mddev->raid_disks) {
		/* success...*/
		/* activate any spares */
		if (mddev->pers->spare_active(mddev)) {
			sysfs_notify(&mddev->kobj, NULL,
				     "degraded");
			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		}
	}
	/* finish_reshape is an optional personality hook */
	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
	    mddev->pers->finish_reshape) {
		mddev->pers->finish_reshape(mddev);
		if (mddev_is_clustered(mddev))
			is_reshaped = true;
	}

	/* If array is no-longer degraded, then any saved_raid_disk
	 * information must be scrapped.
	 */
	if (!mddev->degraded)
		rdev_for_each(rdev, mddev)
			rdev->saved_raid_disk = -1;

	md_update_sb(mddev, 1);
	/* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can
	 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by
	 * clustered raid */
	if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
		md_cluster_ops->resync_finish(mddev);
	/* Drop every flag that described the run that has just ended */
	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
	clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
	/*
	 * We call md_cluster_ops->update_size here because sync_size could
	 * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared,
	 * so it is time to update size across cluster.
	 */
	if (mddev_is_clustered(mddev) && is_reshaped
				      && !test_bit(MD_CLOSING, &mddev->flags))
		md_cluster_ops->update_size(mddev, old_dev_sectors);
	/* MD_RECOVERY_RUNNING is clear now; let anyone sleeping on resync_wait re-check */
	wake_up(&resync_wait);
	/* flag recovery needed just to double check */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	md_new_event(mddev);
	/* event_work is only queued when a handler function has been installed */
	if (mddev->event_work.func)
		queue_work(md_misc_wq, &mddev->event_work);
}
9202 EXPORT_SYMBOL(md_reap_sync_thread);
9203 
9204 void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
9205 {
9206 	sysfs_notify_dirent_safe(rdev->sysfs_state);
9207 	wait_event_timeout(rdev->blocked_wait,
9208 			   !test_bit(Blocked, &rdev->flags) &&
9209 			   !test_bit(BlockedBadBlocks, &rdev->flags),
9210 			   msecs_to_jiffies(5000));
9211 	rdev_dec_pending(rdev, mddev);
9212 }
9213 EXPORT_SYMBOL(md_wait_for_blocked_rdev);
9214 
9215 void md_finish_reshape(struct mddev *mddev)
9216 {
9217 	/* called be personality module when reshape completes. */
9218 	struct md_rdev *rdev;
9219 
9220 	rdev_for_each(rdev, mddev) {
9221 		if (rdev->data_offset > rdev->new_data_offset)
9222 			rdev->sectors += rdev->data_offset - rdev->new_data_offset;
9223 		else
9224 			rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
9225 		rdev->data_offset = rdev->new_data_offset;
9226 	}
9227 }
9228 EXPORT_SYMBOL(md_finish_reshape);
9229 
9230 /* Bad block management */
9231 
9232 /* Returns 1 on success, 0 on failure */
9233 int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
9234 		       int is_new)
9235 {
9236 	struct mddev *mddev = rdev->mddev;
9237 	int rv;
9238 	if (is_new)
9239 		s += rdev->new_data_offset;
9240 	else
9241 		s += rdev->data_offset;
9242 	rv = badblocks_set(&rdev->badblocks, s, sectors, 0);
9243 	if (rv == 0) {
9244 		/* Make sure they get written out promptly */
9245 		if (test_bit(ExternalBbl, &rdev->flags))
9246 			sysfs_notify(&rdev->kobj, NULL,
9247 				     "unacknowledged_bad_blocks");
9248 		sysfs_notify_dirent_safe(rdev->sysfs_state);
9249 		set_mask_bits(&mddev->sb_flags, 0,
9250 			      BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
9251 		md_wakeup_thread(rdev->mddev->thread);
9252 		return 1;
9253 	} else
9254 		return 0;
9255 }
9256 EXPORT_SYMBOL_GPL(rdev_set_badblocks);
9257 
9258 int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
9259 			 int is_new)
9260 {
9261 	int rv;
9262 	if (is_new)
9263 		s += rdev->new_data_offset;
9264 	else
9265 		s += rdev->data_offset;
9266 	rv = badblocks_clear(&rdev->badblocks, s, sectors);
9267 	if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags))
9268 		sysfs_notify(&rdev->kobj, NULL, "bad_blocks");
9269 	return rv;
9270 }
9271 EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
9272 
9273 static int md_notify_reboot(struct notifier_block *this,
9274 			    unsigned long code, void *x)
9275 {
9276 	struct list_head *tmp;
9277 	struct mddev *mddev;
9278 	int need_delay = 0;
9279 
9280 	for_each_mddev(mddev, tmp) {
9281 		if (mddev_trylock(mddev)) {
9282 			if (mddev->pers)
9283 				__md_stop_writes(mddev);
9284 			if (mddev->persistent)
9285 				mddev->safemode = 2;
9286 			mddev_unlock(mddev);
9287 		}
9288 		need_delay = 1;
9289 	}
9290 	/*
9291 	 * certain more exotic SCSI devices are known to be
9292 	 * volatile wrt too early system reboots. While the
9293 	 * right place to handle this issue is the given
9294 	 * driver, we do want to have a safe RAID driver ...
9295 	 */
9296 	if (need_delay)
9297 		mdelay(1000*1);
9298 
9299 	return NOTIFY_DONE;
9300 }
9301 
/*
 * Notifier block that hooks md_notify_reboot() into the reboot notifier
 * chain; md_init() registers it and md_exit() unregisters it.
 */
static struct notifier_block md_notifier = {
	.notifier_call	= md_notify_reboot,
	.next		= NULL,
	.priority	= INT_MAX, /* before any real devices */
};
9307 
9308 static void md_geninit(void)
9309 {
9310 	pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
9311 
9312 	proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
9313 }
9314 
9315 static int __init md_init(void)
9316 {
9317 	int ret = -ENOMEM;
9318 
9319 	md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
9320 	if (!md_wq)
9321 		goto err_wq;
9322 
9323 	md_misc_wq = alloc_workqueue("md_misc", 0, 0);
9324 	if (!md_misc_wq)
9325 		goto err_misc_wq;
9326 
9327 	if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
9328 		goto err_md;
9329 
9330 	if ((ret = register_blkdev(0, "mdp")) < 0)
9331 		goto err_mdp;
9332 	mdp_major = ret;
9333 
9334 	blk_register_region(MKDEV(MD_MAJOR, 0), 512, THIS_MODULE,
9335 			    md_probe, NULL, NULL);
9336 	blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
9337 			    md_probe, NULL, NULL);
9338 
9339 	register_reboot_notifier(&md_notifier);
9340 	raid_table_header = register_sysctl_table(raid_root_table);
9341 
9342 	md_geninit();
9343 	return 0;
9344 
9345 err_mdp:
9346 	unregister_blkdev(MD_MAJOR, "md");
9347 err_md:
9348 	destroy_workqueue(md_misc_wq);
9349 err_misc_wq:
9350 	destroy_workqueue(md_wq);
9351 err_wq:
9352 	return ret;
9353 }
9354 
9355 static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
9356 {
9357 	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
9358 	struct md_rdev *rdev2;
9359 	int role, ret;
9360 	char b[BDEVNAME_SIZE];
9361 
9362 	/*
9363 	 * If size is changed in another node then we need to
9364 	 * do resize as well.
9365 	 */
9366 	if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
9367 		ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
9368 		if (ret)
9369 			pr_info("md-cluster: resize failed\n");
9370 		else
9371 			md_bitmap_update_sb(mddev->bitmap);
9372 	}
9373 
9374 	/* Check for change of roles in the active devices */
9375 	rdev_for_each(rdev2, mddev) {
9376 		if (test_bit(Faulty, &rdev2->flags))
9377 			continue;
9378 
9379 		/* Check if the roles changed */
9380 		role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);
9381 
9382 		if (test_bit(Candidate, &rdev2->flags)) {
9383 			if (role == 0xfffe) {
9384 				pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b));
9385 				md_kick_rdev_from_array(rdev2);
9386 				continue;
9387 			}
9388 			else
9389 				clear_bit(Candidate, &rdev2->flags);
9390 		}
9391 
9392 		if (role != rdev2->raid_disk) {
9393 			/*
9394 			 * got activated except reshape is happening.
9395 			 */
9396 			if (rdev2->raid_disk == -1 && role != 0xffff &&
9397 			    !(le32_to_cpu(sb->feature_map) &
9398 			      MD_FEATURE_RESHAPE_ACTIVE)) {
9399 				rdev2->saved_raid_disk = role;
9400 				ret = remove_and_add_spares(mddev, rdev2);
9401 				pr_info("Activated spare: %s\n",
9402 					bdevname(rdev2->bdev,b));
9403 				/* wakeup mddev->thread here, so array could
9404 				 * perform resync with the new activated disk */
9405 				set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9406 				md_wakeup_thread(mddev->thread);
9407 			}
9408 			/* device faulty
9409 			 * We just want to do the minimum to mark the disk
9410 			 * as faulty. The recovery is performed by the
9411 			 * one who initiated the error.
9412 			 */
9413 			if ((role == 0xfffe) || (role == 0xfffd)) {
9414 				md_error(mddev, rdev2);
9415 				clear_bit(Blocked, &rdev2->flags);
9416 			}
9417 		}
9418 	}
9419 
9420 	if (mddev->raid_disks != le32_to_cpu(sb->raid_disks))
9421 		update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
9422 
9423 	/*
9424 	 * Since mddev->delta_disks has already updated in update_raid_disks,
9425 	 * so it is time to check reshape.
9426 	 */
9427 	if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
9428 	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
9429 		/*
9430 		 * reshape is happening in the remote node, we need to
9431 		 * update reshape_position and call start_reshape.
9432 		 */
9433 		mddev->reshape_position = le64_to_cpu(sb->reshape_position);
9434 		if (mddev->pers->update_reshape_pos)
9435 			mddev->pers->update_reshape_pos(mddev);
9436 		if (mddev->pers->start_reshape)
9437 			mddev->pers->start_reshape(mddev);
9438 	} else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
9439 		   mddev->reshape_position != MaxSector &&
9440 		   !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
9441 		/* reshape is just done in another node. */
9442 		mddev->reshape_position = MaxSector;
9443 		if (mddev->pers->update_reshape_pos)
9444 			mddev->pers->update_reshape_pos(mddev);
9445 	}
9446 
9447 	/* Finally set the event to be up to date */
9448 	mddev->events = le64_to_cpu(sb->events);
9449 }
9450 
9451 static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
9452 {
9453 	int err;
9454 	struct page *swapout = rdev->sb_page;
9455 	struct mdp_superblock_1 *sb;
9456 
9457 	/* Store the sb page of the rdev in the swapout temporary
9458 	 * variable in case we err in the future
9459 	 */
9460 	rdev->sb_page = NULL;
9461 	err = alloc_disk_sb(rdev);
9462 	if (err == 0) {
9463 		ClearPageUptodate(rdev->sb_page);
9464 		rdev->sb_loaded = 0;
9465 		err = super_types[mddev->major_version].
9466 			load_super(rdev, NULL, mddev->minor_version);
9467 	}
9468 	if (err < 0) {
9469 		pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
9470 				__func__, __LINE__, rdev->desc_nr, err);
9471 		if (rdev->sb_page)
9472 			put_page(rdev->sb_page);
9473 		rdev->sb_page = swapout;
9474 		rdev->sb_loaded = 1;
9475 		return err;
9476 	}
9477 
9478 	sb = page_address(rdev->sb_page);
9479 	/* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET
9480 	 * is not set
9481 	 */
9482 
9483 	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
9484 		rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
9485 
9486 	/* The other node finished recovery, call spare_active to set
9487 	 * device In_sync and mddev->degraded
9488 	 */
9489 	if (rdev->recovery_offset == MaxSector &&
9490 	    !test_bit(In_sync, &rdev->flags) &&
9491 	    mddev->pers->spare_active(mddev))
9492 		sysfs_notify(&mddev->kobj, NULL, "degraded");
9493 
9494 	put_page(swapout);
9495 	return 0;
9496 }
9497 
9498 void md_reload_sb(struct mddev *mddev, int nr)
9499 {
9500 	struct md_rdev *rdev;
9501 	int err;
9502 
9503 	/* Find the rdev */
9504 	rdev_for_each_rcu(rdev, mddev) {
9505 		if (rdev->desc_nr == nr)
9506 			break;
9507 	}
9508 
9509 	if (!rdev || rdev->desc_nr != nr) {
9510 		pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
9511 		return;
9512 	}
9513 
9514 	err = read_rdev(mddev, rdev);
9515 	if (err < 0)
9516 		return;
9517 
9518 	check_sb_changes(mddev, rdev);
9519 
9520 	/* Read all rdev's to update recovery_offset */
9521 	rdev_for_each_rcu(rdev, mddev) {
9522 		if (!test_bit(Faulty, &rdev->flags))
9523 			read_rdev(mddev, rdev);
9524 	}
9525 }
9526 EXPORT_SYMBOL(md_reload_sb);
9527 
9528 #ifndef MODULE
9529 
9530 /*
9531  * Searches all registered partitions for autorun RAID arrays
9532  * at boot time.
9533  */
9534 
/* Serialises every access to all_detected_devices */
static DEFINE_MUTEX(detected_devices_mutex);
/* Device numbers queued by md_autodetect_dev(), consumed by autostart_arrays() */
static LIST_HEAD(all_detected_devices);
/* One queued device number */
struct detected_devices_node {
	struct list_head list;	/* link in all_detected_devices */
	dev_t dev;		/* device number to import later */
};
9541 
9542 void md_autodetect_dev(dev_t dev)
9543 {
9544 	struct detected_devices_node *node_detected_dev;
9545 
9546 	node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
9547 	if (node_detected_dev) {
9548 		node_detected_dev->dev = dev;
9549 		mutex_lock(&detected_devices_mutex);
9550 		list_add_tail(&node_detected_dev->list, &all_detected_devices);
9551 		mutex_unlock(&detected_devices_mutex);
9552 	}
9553 }
9554 
9555 static void autostart_arrays(int part)
9556 {
9557 	struct md_rdev *rdev;
9558 	struct detected_devices_node *node_detected_dev;
9559 	dev_t dev;
9560 	int i_scanned, i_passed;
9561 
9562 	i_scanned = 0;
9563 	i_passed = 0;
9564 
9565 	pr_info("md: Autodetecting RAID arrays.\n");
9566 
9567 	mutex_lock(&detected_devices_mutex);
9568 	while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
9569 		i_scanned++;
9570 		node_detected_dev = list_entry(all_detected_devices.next,
9571 					struct detected_devices_node, list);
9572 		list_del(&node_detected_dev->list);
9573 		dev = node_detected_dev->dev;
9574 		kfree(node_detected_dev);
9575 		mutex_unlock(&detected_devices_mutex);
9576 		rdev = md_import_device(dev,0, 90);
9577 		mutex_lock(&detected_devices_mutex);
9578 		if (IS_ERR(rdev))
9579 			continue;
9580 
9581 		if (test_bit(Faulty, &rdev->flags))
9582 			continue;
9583 
9584 		set_bit(AutoDetected, &rdev->flags);
9585 		list_add(&rdev->same_set, &pending_raid_disks);
9586 		i_passed++;
9587 	}
9588 	mutex_unlock(&detected_devices_mutex);
9589 
9590 	pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed);
9591 
9592 	autorun_devices(part);
9593 }
9594 
9595 #endif /* !MODULE */
9596 
9597 static __exit void md_exit(void)
9598 {
9599 	struct mddev *mddev;
9600 	struct list_head *tmp;
9601 	int delay = 1;
9602 
9603 	blk_unregister_region(MKDEV(MD_MAJOR,0), 512);
9604 	blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
9605 
9606 	unregister_blkdev(MD_MAJOR,"md");
9607 	unregister_blkdev(mdp_major, "mdp");
9608 	unregister_reboot_notifier(&md_notifier);
9609 	unregister_sysctl_table(raid_table_header);
9610 
9611 	/* We cannot unload the modules while some process is
9612 	 * waiting for us in select() or poll() - wake them up
9613 	 */
9614 	md_unloading = 1;
9615 	while (waitqueue_active(&md_event_waiters)) {
9616 		/* not safe to leave yet */
9617 		wake_up(&md_event_waiters);
9618 		msleep(delay);
9619 		delay += delay;
9620 	}
9621 	remove_proc_entry("mdstat", NULL);
9622 
9623 	for_each_mddev(mddev, tmp) {
9624 		export_array(mddev);
9625 		mddev->ctime = 0;
9626 		mddev->hold_active = 0;
9627 		/*
9628 		 * for_each_mddev() will call mddev_put() at the end of each
9629 		 * iteration.  As the mddev is now fully clear, this will
9630 		 * schedule the mddev for destruction by a workqueue, and the
9631 		 * destroy_workqueue() below will wait for that to complete.
9632 		 */
9633 	}
9634 	destroy_workqueue(md_misc_wq);
9635 	destroy_workqueue(md_wq);
9636 }
9637 
/* md_init() is run as a subsys_initcall; md_exit() is the module exit hook */
subsys_initcall(md_init);
module_exit(md_exit)
9640 
9641 static int get_ro(char *buffer, const struct kernel_param *kp)
9642 {
9643 	return sprintf(buffer, "%d", start_readonly);
9644 }
9645 static int set_ro(const char *val, const struct kernel_param *kp)
9646 {
9647 	return kstrtouint(val, 10, (unsigned int *)&start_readonly);
9648 }
9649 
/* start_ro: owner read/write, routed through set_ro()/get_ro() above */
module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
/* start_dirty_degraded: plain int, world-readable, owner-writable */
module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
/* new_array: owner write-only; a write invokes add_named_array(), no getter */
module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
/* create_on_open: plain bool, owner read/write */
module_param(create_on_open, bool, S_IRUSR|S_IWUSR);

/* Module metadata */
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MD RAID framework");
MODULE_ALIAS("md");
MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
9659