xref: /linux/drivers/md/md.c (revision 9b960d8cd6f712cb2c03e2bdd4d5ca058238037f)
1  // SPDX-License-Identifier: GPL-2.0-or-later
2  /*
3     md.c : Multiple Devices driver for Linux
4       Copyright (C) 1998, 1999, 2000 Ingo Molnar
5  
6       completely rewritten, based on the MD driver code from Marc Zyngier
7  
8     Changes:
9  
10     - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
11     - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
12     - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
13     - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
14     - kmod support by: Cyrus Durgin
15     - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
16     - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
17  
18     - lots of fixes and improvements to the RAID1/RAID5 and generic
19       RAID code (such as request based resynchronization):
20  
21       Neil Brown <neilb@cse.unsw.edu.au>.
22  
23     - persistent bitmap code
24       Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
25  
26  
27     Errors, Warnings, etc.
28     Please use:
29       pr_crit() for error conditions that risk data loss
30       pr_err() for error conditions that are unexpected, like an IO error
31           or internal inconsistency
32     pr_warn() for error conditions that could have been predicted, like
33           adding a device to an array when it has incompatible metadata
34     pr_info() for interesting, very rare events, like an array starting
35           or stopping, or resync starting or stopping
36       pr_debug() for everything else.
37  
38  */
39  
40  #include <linux/sched/mm.h>
41  #include <linux/sched/signal.h>
42  #include <linux/kthread.h>
43  #include <linux/blkdev.h>
44  #include <linux/blk-integrity.h>
45  #include <linux/badblocks.h>
46  #include <linux/sysctl.h>
47  #include <linux/seq_file.h>
48  #include <linux/fs.h>
49  #include <linux/poll.h>
50  #include <linux/ctype.h>
51  #include <linux/string.h>
52  #include <linux/hdreg.h>
53  #include <linux/proc_fs.h>
54  #include <linux/random.h>
55  #include <linux/major.h>
56  #include <linux/module.h>
57  #include <linux/reboot.h>
58  #include <linux/file.h>
59  #include <linux/compat.h>
60  #include <linux/delay.h>
61  #include <linux/raid/md_p.h>
62  #include <linux/raid/md_u.h>
63  #include <linux/raid/detect.h>
64  #include <linux/slab.h>
65  #include <linux/percpu-refcount.h>
66  #include <linux/part_stat.h>
67  
68  #include "md.h"
69  #include "md-bitmap.h"
70  #include "md-cluster.h"
71  
72  static const char *action_name[NR_SYNC_ACTIONS] = {
73  	[ACTION_RESYNC]		= "resync",
74  	[ACTION_RECOVER]	= "recover",
75  	[ACTION_CHECK]		= "check",
76  	[ACTION_REPAIR]		= "repair",
77  	[ACTION_RESHAPE]	= "reshape",
78  	[ACTION_FROZEN]		= "frozen",
79  	[ACTION_IDLE]		= "idle",
80  };
81  
82  static DEFINE_XARRAY(md_submodule);
83  
84  static const struct kobj_type md_ktype;
85  
86  static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
87  static struct workqueue_struct *md_wq;
88  
89  /*
90   * This workqueue is used by sync_work to register a new sync_thread, by
91   * del_work to remove rdevs, and by event_work, which is only set by dm-raid.
92   *
93   * Note that sync_work will grab reconfig_mutex, hence never flush this
94   * workqueue with reconfig_mutex grabbed.
95   */
96  static struct workqueue_struct *md_misc_wq;
97  struct workqueue_struct *md_bitmap_wq;
98  
99  static int remove_and_add_spares(struct mddev *mddev,
100  				 struct md_rdev *this);
101  static void mddev_detach(struct mddev *mddev);
102  static void export_rdev(struct md_rdev *rdev, struct mddev *mddev);
103  static void md_wakeup_thread_directly(struct md_thread __rcu *thread);
104  
105  /*
106   * Default number of read corrections we'll attempt on an rdev
107   * before ejecting it from the array. We divide the read error
108   * count by 2 for every hour elapsed between read errors.
109   */
110  #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
111  /* Default safemode delay: 200 msec */
112  #define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
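/*
 * Worked example (illustrative): with HZ=1000 this is (200 * 1000)/1000 + 1
 * = 201 jiffies, i.e. just over 200 msec; the "+1" errs on the long side so
 * integer division never makes the delay shorter than intended.
 */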
113  /*
114   * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
115   * is 1000 KB/sec, so the extra system load does not show up that much.
116   * Increase it if you want to have more _guaranteed_ speed. Note that
117   * the RAID driver will use the maximum available bandwidth if the IO
118   * subsystem is idle. There is also an 'absolute maximum' reconstruction
119   * speed limit - in case reconstruction slows down your system despite
120   * idle IO detection.
121   *
122   * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
123   * or /sys/block/mdX/md/sync_speed_{min,max}
124   */
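/*
 * Illustrative example of the knobs above (values are hypothetical):
 *
 *	echo 50000  > /proc/sys/dev/raid/speed_limit_min
 *	echo 500000 > /proc/sys/dev/raid/speed_limit_max
 *
 * guarantees roughly 50 MB/sec to resync/recovery on every array and caps
 * it at roughly 500 MB/sec system-wide; the per-array sync_speed_{min,max}
 * files below /sys/block/mdX/md/ override these for a single array.
 */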
125  
126  static int sysctl_speed_limit_min = 1000;
127  static int sysctl_speed_limit_max = 200000;
128  static inline int speed_min(struct mddev *mddev)
129  {
130  	return mddev->sync_speed_min ?
131  		mddev->sync_speed_min : sysctl_speed_limit_min;
132  }
133  
134  static inline int speed_max(struct mddev *mddev)
135  {
136  	return mddev->sync_speed_max ?
137  		mddev->sync_speed_max : sysctl_speed_limit_max;
138  }
139  
140  static void rdev_uninit_serial(struct md_rdev *rdev)
141  {
142  	if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
143  		return;
144  
145  	kvfree(rdev->serial);
146  	rdev->serial = NULL;
147  }
148  
149  static void rdevs_uninit_serial(struct mddev *mddev)
150  {
151  	struct md_rdev *rdev;
152  
153  	rdev_for_each(rdev, mddev)
154  		rdev_uninit_serial(rdev);
155  }
156  
157  static int rdev_init_serial(struct md_rdev *rdev)
158  {
159  	/* serial_nums equals BARRIER_BUCKETS_NR */
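	/*
	 * e.g. with 4K pages and a 4-byte atomic_t (typical values, shown
	 * only for illustration) this is 1 << (12 - 2) = 1024 buckets.
	 */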
160  	int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t))));
161  	struct serial_in_rdev *serial = NULL;
162  
163  	if (test_bit(CollisionCheck, &rdev->flags))
164  		return 0;
165  
166  	serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums,
167  			  GFP_KERNEL);
168  	if (!serial)
169  		return -ENOMEM;
170  
171  	for (i = 0; i < serial_nums; i++) {
172  		struct serial_in_rdev *serial_tmp = &serial[i];
173  
174  		spin_lock_init(&serial_tmp->serial_lock);
175  		serial_tmp->serial_rb = RB_ROOT_CACHED;
176  		init_waitqueue_head(&serial_tmp->serial_io_wait);
177  	}
178  
179  	rdev->serial = serial;
180  	set_bit(CollisionCheck, &rdev->flags);
181  
182  	return 0;
183  }
184  
185  static int rdevs_init_serial(struct mddev *mddev)
186  {
187  	struct md_rdev *rdev;
188  	int ret = 0;
189  
190  	rdev_for_each(rdev, mddev) {
191  		ret = rdev_init_serial(rdev);
192  		if (ret)
193  			break;
194  	}
195  
196  	/* Free all resources if the pool does not exist */
197  	if (ret && !mddev->serial_info_pool)
198  		rdevs_uninit_serial(mddev);
199  
200  	return ret;
201  }
202  
203  /*
204   * rdev needs to enable serialization if it meets the conditions:
205   * 1. it is a multi-queue device flagged with writemostly.
206   * 2. the write-behind mode is enabled.
207   */
208  static int rdev_need_serial(struct md_rdev *rdev)
209  {
210  	return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 &&
211  		rdev->bdev->bd_disk->queue->nr_hw_queues != 1 &&
212  		test_bit(WriteMostly, &rdev->flags));
213  }
214  
215  /*
216   * Init resources for rdev(s), then create serial_info_pool if:
217   * 1. rdev is the first device which returns true from rdev_need_serial.
218   * 2. rdev is NULL, meaning we want to enable serialization for all rdevs.
219   */
220  void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
221  {
222  	int ret = 0;
223  
224  	if (rdev && !rdev_need_serial(rdev) &&
225  	    !test_bit(CollisionCheck, &rdev->flags))
226  		return;
227  
228  	if (!rdev)
229  		ret = rdevs_init_serial(mddev);
230  	else
231  		ret = rdev_init_serial(rdev);
232  	if (ret)
233  		return;
234  
235  	if (mddev->serial_info_pool == NULL) {
236  		/*
237  		 * already in memalloc noio context by
238  		 * mddev_suspend()
239  		 */
240  		mddev->serial_info_pool =
241  			mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
242  						sizeof(struct serial_info));
243  		if (!mddev->serial_info_pool) {
244  			rdevs_uninit_serial(mddev);
245  			pr_err("can't alloc memory pool for serialization\n");
246  		}
247  	}
248  }
249  
250  /*
251   * Free resources from rdev(s), and destroy serial_info_pool under these conditions:
252   * 1. rdev is the last device flagged with CollisionCheck.
253   * 2. when the bitmap is destroyed while the policy is not enabled.
254   * 3. when disabling the policy, the pool is destroyed only if no rdev needs it.
255   */
256  void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
257  {
258  	if (rdev && !test_bit(CollisionCheck, &rdev->flags))
259  		return;
260  
261  	if (mddev->serial_info_pool) {
262  		struct md_rdev *temp;
263  		int num = 0; /* used to track if other rdevs need the pool */
264  
265  		rdev_for_each(temp, mddev) {
266  			if (!rdev) {
267  				if (!mddev->serialize_policy ||
268  				    !rdev_need_serial(temp))
269  					rdev_uninit_serial(temp);
270  				else
271  					num++;
272  			} else if (temp != rdev &&
273  				   test_bit(CollisionCheck, &temp->flags))
274  				num++;
275  		}
276  
277  		if (rdev)
278  			rdev_uninit_serial(rdev);
279  
280  		if (num)
281  			pr_info("The mempool could be used by other devices\n");
282  		else {
283  			mempool_destroy(mddev->serial_info_pool);
284  			mddev->serial_info_pool = NULL;
285  		}
286  	}
287  }
288  
289  static struct ctl_table_header *raid_table_header;
290  
291  static const struct ctl_table raid_table[] = {
292  	{
293  		.procname	= "speed_limit_min",
294  		.data		= &sysctl_speed_limit_min,
295  		.maxlen		= sizeof(int),
296  		.mode		= S_IRUGO|S_IWUSR,
297  		.proc_handler	= proc_dointvec,
298  	},
299  	{
300  		.procname	= "speed_limit_max",
301  		.data		= &sysctl_speed_limit_max,
302  		.maxlen		= sizeof(int),
303  		.mode		= S_IRUGO|S_IWUSR,
304  		.proc_handler	= proc_dointvec,
305  	},
306  };
307  
308  static int start_readonly;
309  
310  /*
311   * The original mechanism for creating an md device is to create
312   * a device node in /dev and to open it.  This causes races with device-close.
313   * The preferred method is to write to the "new_array" module parameter.
314   * This can avoid races.
315   * Setting create_on_open to false disables the original mechanism
316   * so all the races disappear.
317   */
318  static bool create_on_open = true;
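/*
 * For illustration, the preferred race-free path described above is roughly:
 *
 *	echo md_example > /sys/module/md_mod/parameters/new_array
 *
 * (the array name is hypothetical; see Documentation/admin-guide/md.rst),
 * which creates the array without ever opening a not-yet-created node.
 */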
319  
320  /*
321   * We have a system wide 'event count' that is incremented
322   * on any 'interesting' event, and readers of /proc/mdstat
323   * can use 'poll' or 'select' to find out when the event
324   * count increases.
325   *
326   * Events are:
327   *  start array, stop array, error, add device, remove device,
328   *  start build, activate spare
329   */
330  static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
331  static atomic_t md_event_count;
332  void md_new_event(void)
333  {
334  	atomic_inc(&md_event_count);
335  	wake_up(&md_event_waiters);
336  }
337  EXPORT_SYMBOL_GPL(md_new_event);
338  
339  /*
340   * Enables iteration over all existing md arrays.
341   * all_mddevs_lock protects this list.
342   */
343  static LIST_HEAD(all_mddevs);
344  static DEFINE_SPINLOCK(all_mddevs_lock);
345  
346  static bool is_md_suspended(struct mddev *mddev)
347  {
348  	return percpu_ref_is_dying(&mddev->active_io);
349  }
350  /* Rather than calling directly into the personality make_request function,
351   * IO requests come here first so that we can check if the device is
352   * being suspended pending a reconfiguration.
353   * We hold a refcount over the call to ->make_request.  By the time that
354   * call has finished, the bio has been linked into some internal structure
355   * and so is visible to ->quiesce(), so we don't need the refcount any more.
356   */
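/*
 * A note on the partial-suspend window checked below: when the array is not
 * fully suspended, reads always proceed; a WRITE is held back only if it
 * overlaps (or ends exactly at the start of) the suspend_lo..suspend_hi
 * range.
 */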
357  static bool is_suspended(struct mddev *mddev, struct bio *bio)
358  {
359  	if (is_md_suspended(mddev))
360  		return true;
361  	if (bio_data_dir(bio) != WRITE)
362  		return false;
363  	if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi))
364  		return false;
365  	if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi))
366  		return false;
367  	if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo))
368  		return false;
369  	return true;
370  }
371  
372  bool md_handle_request(struct mddev *mddev, struct bio *bio)
373  {
374  check_suspended:
375  	if (is_suspended(mddev, bio)) {
376  		DEFINE_WAIT(__wait);
377  		/* Bail out if REQ_NOWAIT is set for the bio */
378  		if (bio->bi_opf & REQ_NOWAIT) {
379  			bio_wouldblock_error(bio);
380  			return true;
381  		}
382  		for (;;) {
383  			prepare_to_wait(&mddev->sb_wait, &__wait,
384  					TASK_UNINTERRUPTIBLE);
385  			if (!is_suspended(mddev, bio))
386  				break;
387  			schedule();
388  		}
389  		finish_wait(&mddev->sb_wait, &__wait);
390  	}
391  	if (!percpu_ref_tryget_live(&mddev->active_io))
392  		goto check_suspended;
393  
394  	if (!mddev->pers->make_request(mddev, bio)) {
395  		percpu_ref_put(&mddev->active_io);
396  		if (!mddev->gendisk && mddev->pers->prepare_suspend)
397  			return false;
398  		goto check_suspended;
399  	}
400  
401  	percpu_ref_put(&mddev->active_io);
402  	return true;
403  }
404  EXPORT_SYMBOL(md_handle_request);
405  
406  static void md_submit_bio(struct bio *bio)
407  {
408  	const int rw = bio_data_dir(bio);
409  	struct mddev *mddev = bio->bi_bdev->bd_disk->private_data;
410  
411  	if (mddev == NULL || mddev->pers == NULL) {
412  		bio_io_error(bio);
413  		return;
414  	}
415  
416  	if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
417  		bio_io_error(bio);
418  		return;
419  	}
420  
421  	bio = bio_split_to_limits(bio);
422  	if (!bio)
423  		return;
424  
425  	if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) {
426  		if (bio_sectors(bio) != 0)
427  			bio->bi_status = BLK_STS_IOERR;
428  		bio_endio(bio);
429  		return;
430  	}
431  
432  	/* bio could be mergeable after passing to the underlying layer */
433  	bio->bi_opf &= ~REQ_NOMERGE;
434  
435  	md_handle_request(mddev, bio);
436  }
437  
438  /*
439   * Make sure no new requests are submitted to the device, and any requests that
440   * have been submitted are completely handled.
441   */
442  int mddev_suspend(struct mddev *mddev, bool interruptible)
443  {
444  	int err = 0;
445  
446  	/*
447  	 * Holding reconfig_mutex while waiting for normal IO would deadlock,
448  	 * because no other context could update the super_block, and normal
449  	 * IO can depend on the super_block being updated.
450  	 */
451  	lockdep_assert_not_held(&mddev->reconfig_mutex);
452  
453  	if (interruptible)
454  		err = mutex_lock_interruptible(&mddev->suspend_mutex);
455  	else
456  		mutex_lock(&mddev->suspend_mutex);
457  	if (err)
458  		return err;
459  
460  	if (mddev->suspended) {
461  		WRITE_ONCE(mddev->suspended, mddev->suspended + 1);
462  		mutex_unlock(&mddev->suspend_mutex);
463  		return 0;
464  	}
465  
466  	percpu_ref_kill(&mddev->active_io);
467  	if (interruptible)
468  		err = wait_event_interruptible(mddev->sb_wait,
469  				percpu_ref_is_zero(&mddev->active_io));
470  	else
471  		wait_event(mddev->sb_wait,
472  				percpu_ref_is_zero(&mddev->active_io));
473  	if (err) {
474  		percpu_ref_resurrect(&mddev->active_io);
475  		mutex_unlock(&mddev->suspend_mutex);
476  		return err;
477  	}
478  
479  	/*
480  	 * For raid456, IO might be waiting for reshape to make progress;
481  	 * allow a new reshape to start while waiting for IO to complete,
482  	 * to prevent deadlock.
483  	 */
484  	WRITE_ONCE(mddev->suspended, mddev->suspended + 1);
485  
486  	/* restrict memory reclaim I/O while the raid array is suspended */
487  	mddev->noio_flag = memalloc_noio_save();
488  
489  	mutex_unlock(&mddev->suspend_mutex);
490  	return 0;
491  }
492  EXPORT_SYMBOL_GPL(mddev_suspend);
493  
494  static void __mddev_resume(struct mddev *mddev, bool recovery_needed)
495  {
496  	lockdep_assert_not_held(&mddev->reconfig_mutex);
497  
498  	mutex_lock(&mddev->suspend_mutex);
499  	WRITE_ONCE(mddev->suspended, mddev->suspended - 1);
500  	if (mddev->suspended) {
501  		mutex_unlock(&mddev->suspend_mutex);
502  		return;
503  	}
504  
505  	/* entered the memalloc scope from mddev_suspend() */
506  	memalloc_noio_restore(mddev->noio_flag);
507  
508  	percpu_ref_resurrect(&mddev->active_io);
509  	wake_up(&mddev->sb_wait);
510  
511  	if (recovery_needed)
512  		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
513  	md_wakeup_thread(mddev->thread);
514  	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
515  
516  	mutex_unlock(&mddev->suspend_mutex);
517  }
518  
519  void mddev_resume(struct mddev *mddev)
520  {
521  	return __mddev_resume(mddev, true);
522  }
523  EXPORT_SYMBOL_GPL(mddev_resume);
524  
525  /* sync bdev before setting the device to read-only or stopping the raid */
526  static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_num)
527  {
528  	mutex_lock(&mddev->open_mutex);
529  	if (mddev->pers && atomic_read(&mddev->openers) > opener_num) {
530  		mutex_unlock(&mddev->open_mutex);
531  		return -EBUSY;
532  	}
533  	if (test_and_set_bit(MD_CLOSING, &mddev->flags)) {
534  		mutex_unlock(&mddev->open_mutex);
535  		return -EBUSY;
536  	}
537  	mutex_unlock(&mddev->open_mutex);
538  
539  	sync_blockdev(mddev->gendisk->part0);
540  	return 0;
541  }
542  
543  /*
544   * The only difference from bio_chain_endio() is that the current
545   * bi_status of bio does not affect the bi_status of parent.
546   */
547  static void md_end_flush(struct bio *bio)
548  {
549  	struct bio *parent = bio->bi_private;
550  
551  	/*
552  	 * If any flush IO fails before a power failure,
553  	 * disk data may be lost.
554  	 */
555  	if (bio->bi_status)
556  		pr_err("md: %pg flush io error %d\n", bio->bi_bdev,
557  			blk_status_to_errno(bio->bi_status));
558  
559  	bio_put(bio);
560  	bio_endio(parent);
561  }
562  
563  bool md_flush_request(struct mddev *mddev, struct bio *bio)
564  {
565  	struct md_rdev *rdev;
566  	struct bio *new;
567  
568  	/*
569  	 * md_flush_request() should be called under md_handle_request() and
570  	 * 'active_io' is already grabbed. Hence it's safe to get rdev directly
571  	 * without rcu protection.
572  	 */
573  	WARN_ON(percpu_ref_is_zero(&mddev->active_io));
574  
575  	rdev_for_each(rdev, mddev) {
576  		if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
577  			continue;
578  
579  		new = bio_alloc_bioset(rdev->bdev, 0,
580  				       REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO,
581  				       &mddev->bio_set);
582  		new->bi_private = bio;
583  		new->bi_end_io = md_end_flush;
584  		bio_inc_remaining(bio);
585  		submit_bio(new);
586  	}
587  
588  	if (bio_sectors(bio) == 0) {
589  		bio_endio(bio);
590  		return true;
591  	}
592  
593  	bio->bi_opf &= ~REQ_PREFLUSH;
594  	return false;
595  }
596  EXPORT_SYMBOL(md_flush_request);
597  
598  static inline struct mddev *mddev_get(struct mddev *mddev)
599  {
600  	lockdep_assert_held(&all_mddevs_lock);
601  
602  	if (test_bit(MD_DELETED, &mddev->flags))
603  		return NULL;
604  	atomic_inc(&mddev->active);
605  	return mddev;
606  }
607  
608  static void mddev_delayed_delete(struct work_struct *ws);
609  
610  static void __mddev_put(struct mddev *mddev)
611  {
612  	if (mddev->raid_disks || !list_empty(&mddev->disks) ||
613  	    mddev->ctime || mddev->hold_active)
614  		return;
615  
616  	/* Array is not configured at all, and not held active, so destroy it */
617  	set_bit(MD_DELETED, &mddev->flags);
618  
619  	/*
620  	 * Call queue_work inside the spinlock so that flush_workqueue() after
621  	 * mddev_find will succeed in waiting for the work to be done.
622  	 */
623  	queue_work(md_misc_wq, &mddev->del_work);
624  }
625  
626  static void mddev_put_locked(struct mddev *mddev)
627  {
628  	if (atomic_dec_and_test(&mddev->active))
629  		__mddev_put(mddev);
630  }
631  
632  void mddev_put(struct mddev *mddev)
633  {
634  	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
635  		return;
636  
637  	__mddev_put(mddev);
638  	spin_unlock(&all_mddevs_lock);
639  }
640  
641  static void md_safemode_timeout(struct timer_list *t);
642  static void md_start_sync(struct work_struct *ws);
643  
644  static void active_io_release(struct percpu_ref *ref)
645  {
646  	struct mddev *mddev = container_of(ref, struct mddev, active_io);
647  
648  	wake_up(&mddev->sb_wait);
649  }
650  
651  static void no_op(struct percpu_ref *r) {}
652  
653  int mddev_init(struct mddev *mddev)
654  {
655  
656  	if (percpu_ref_init(&mddev->active_io, active_io_release,
657  			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
658  		return -ENOMEM;
659  
660  	if (percpu_ref_init(&mddev->writes_pending, no_op,
661  			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
662  		percpu_ref_exit(&mddev->active_io);
663  		return -ENOMEM;
664  	}
665  
666  	/* We want to start with the refcount at zero */
667  	percpu_ref_put(&mddev->writes_pending);
668  
669  	mutex_init(&mddev->open_mutex);
670  	mutex_init(&mddev->reconfig_mutex);
671  	mutex_init(&mddev->suspend_mutex);
672  	mutex_init(&mddev->bitmap_info.mutex);
673  	INIT_LIST_HEAD(&mddev->disks);
674  	INIT_LIST_HEAD(&mddev->all_mddevs);
675  	INIT_LIST_HEAD(&mddev->deleting);
676  	timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
677  	atomic_set(&mddev->active, 1);
678  	atomic_set(&mddev->openers, 0);
679  	atomic_set(&mddev->sync_seq, 0);
680  	spin_lock_init(&mddev->lock);
681  	init_waitqueue_head(&mddev->sb_wait);
682  	init_waitqueue_head(&mddev->recovery_wait);
683  	mddev->reshape_position = MaxSector;
684  	mddev->reshape_backwards = 0;
685  	mddev->last_sync_action = ACTION_IDLE;
686  	mddev->resync_min = 0;
687  	mddev->resync_max = MaxSector;
688  	mddev->level = LEVEL_NONE;
689  	mddev_set_bitmap_ops(mddev);
690  
691  	INIT_WORK(&mddev->sync_work, md_start_sync);
692  	INIT_WORK(&mddev->del_work, mddev_delayed_delete);
693  
694  	return 0;
695  }
696  EXPORT_SYMBOL_GPL(mddev_init);
697  
698  void mddev_destroy(struct mddev *mddev)
699  {
700  	percpu_ref_exit(&mddev->active_io);
701  	percpu_ref_exit(&mddev->writes_pending);
702  }
703  EXPORT_SYMBOL_GPL(mddev_destroy);
704  
705  static struct mddev *mddev_find_locked(dev_t unit)
706  {
707  	struct mddev *mddev;
708  
709  	list_for_each_entry(mddev, &all_mddevs, all_mddevs)
710  		if (mddev->unit == unit)
711  			return mddev;
712  
713  	return NULL;
714  }
715  
716  /* find an unused unit number */
717  static dev_t mddev_alloc_unit(void)
718  {
719  	static int next_minor = 512;
720  	int start = next_minor;
721  	bool is_free = 0;
722  	dev_t dev = 0;
723  
724  	while (!is_free) {
725  		dev = MKDEV(MD_MAJOR, next_minor);
726  		next_minor++;
727  		if (next_minor > MINORMASK)
728  			next_minor = 0;
729  		if (next_minor == start)
730  			return 0;		/* Oh dear, all in use. */
731  		is_free = !mddev_find_locked(dev);
732  	}
733  
734  	return dev;
735  }
736  
737  static struct mddev *mddev_alloc(dev_t unit)
738  {
739  	struct mddev *new;
740  	int error;
741  
742  	if (unit && MAJOR(unit) != MD_MAJOR)
743  		unit &= ~((1 << MdpMinorShift) - 1);
744  
745  	new = kzalloc(sizeof(*new), GFP_KERNEL);
746  	if (!new)
747  		return ERR_PTR(-ENOMEM);
748  
749  	error = mddev_init(new);
750  	if (error)
751  		goto out_free_new;
752  
753  	spin_lock(&all_mddevs_lock);
754  	if (unit) {
755  		error = -EEXIST;
756  		if (mddev_find_locked(unit))
757  			goto out_destroy_new;
758  		new->unit = unit;
759  		if (MAJOR(unit) == MD_MAJOR)
760  			new->md_minor = MINOR(unit);
761  		else
762  			new->md_minor = MINOR(unit) >> MdpMinorShift;
763  		new->hold_active = UNTIL_IOCTL;
764  	} else {
765  		error = -ENODEV;
766  		new->unit = mddev_alloc_unit();
767  		if (!new->unit)
768  			goto out_destroy_new;
769  		new->md_minor = MINOR(new->unit);
770  		new->hold_active = UNTIL_STOP;
771  	}
772  
773  	list_add(&new->all_mddevs, &all_mddevs);
774  	spin_unlock(&all_mddevs_lock);
775  	return new;
776  
777  out_destroy_new:
778  	spin_unlock(&all_mddevs_lock);
779  	mddev_destroy(new);
780  out_free_new:
781  	kfree(new);
782  	return ERR_PTR(error);
783  }
784  
785  static void mddev_free(struct mddev *mddev)
786  {
787  	spin_lock(&all_mddevs_lock);
788  	list_del(&mddev->all_mddevs);
789  	spin_unlock(&all_mddevs_lock);
790  
791  	mddev_destroy(mddev);
792  	kfree(mddev);
793  }
794  
795  static const struct attribute_group md_redundancy_group;
796  
797  void mddev_unlock(struct mddev *mddev)
798  {
799  	struct md_rdev *rdev;
800  	struct md_rdev *tmp;
801  	LIST_HEAD(delete);
802  
803  	if (!list_empty(&mddev->deleting))
804  		list_splice_init(&mddev->deleting, &delete);
805  
806  	if (mddev->to_remove) {
807  		/* These cannot be removed under reconfig_mutex as
808  		 * an access to the files will try to take reconfig_mutex
809  		 * while holding the file unremovable, which leads to
810  		 * a deadlock.
811  		 * So keep sysfs_active set while the removal is happening,
812  		 * and anything else which might set ->to_remove or may
813  		 * otherwise change the sysfs namespace will fail with
814  		 * -EBUSY if sysfs_active is still set.
815  		 * We set sysfs_active under reconfig_mutex and elsewhere
816  		 * test it under the same mutex to ensure its correct value
817  		 * is seen.
818  		 */
819  		const struct attribute_group *to_remove = mddev->to_remove;
820  		mddev->to_remove = NULL;
821  		mddev->sysfs_active = 1;
822  		mutex_unlock(&mddev->reconfig_mutex);
823  
824  		if (mddev->kobj.sd) {
825  			if (to_remove != &md_redundancy_group)
826  				sysfs_remove_group(&mddev->kobj, to_remove);
827  			if (mddev->pers == NULL ||
828  			    mddev->pers->sync_request == NULL) {
829  				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
830  				if (mddev->sysfs_action)
831  					sysfs_put(mddev->sysfs_action);
832  				if (mddev->sysfs_completed)
833  					sysfs_put(mddev->sysfs_completed);
834  				if (mddev->sysfs_degraded)
835  					sysfs_put(mddev->sysfs_degraded);
836  				mddev->sysfs_action = NULL;
837  				mddev->sysfs_completed = NULL;
838  				mddev->sysfs_degraded = NULL;
839  			}
840  		}
841  		mddev->sysfs_active = 0;
842  	} else
843  		mutex_unlock(&mddev->reconfig_mutex);
844  
845  	md_wakeup_thread(mddev->thread);
846  	wake_up(&mddev->sb_wait);
847  
848  	list_for_each_entry_safe(rdev, tmp, &delete, same_set) {
849  		list_del_init(&rdev->same_set);
850  		kobject_del(&rdev->kobj);
851  		export_rdev(rdev, mddev);
852  	}
853  }
854  EXPORT_SYMBOL_GPL(mddev_unlock);
855  
856  struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
857  {
858  	struct md_rdev *rdev;
859  
860  	rdev_for_each_rcu(rdev, mddev)
861  		if (rdev->desc_nr == nr)
862  			return rdev;
863  
864  	return NULL;
865  }
866  EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);
867  
868  static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
869  {
870  	struct md_rdev *rdev;
871  
872  	rdev_for_each(rdev, mddev)
873  		if (rdev->bdev->bd_dev == dev)
874  			return rdev;
875  
876  	return NULL;
877  }
878  
879  struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev)
880  {
881  	struct md_rdev *rdev;
882  
883  	rdev_for_each_rcu(rdev, mddev)
884  		if (rdev->bdev->bd_dev == dev)
885  			return rdev;
886  
887  	return NULL;
888  }
889  EXPORT_SYMBOL_GPL(md_find_rdev_rcu);
890  
891  static struct md_personality *get_pers(int level, char *clevel)
892  {
893  	struct md_personality *ret = NULL;
894  	struct md_submodule_head *head;
895  	unsigned long i;
896  
897  	xa_lock(&md_submodule);
898  	xa_for_each(&md_submodule, i, head) {
899  		if (head->type != MD_PERSONALITY)
900  			continue;
901  		if ((level != LEVEL_NONE && head->id == level) ||
902  		    !strcmp(head->name, clevel)) {
903  			if (try_module_get(head->owner))
904  				ret = (void *)head;
905  			break;
906  		}
907  	}
908  	xa_unlock(&md_submodule);
909  
910  	if (!ret) {
911  		if (level != LEVEL_NONE)
912  			pr_warn("md: personality for level %d is not loaded!\n",
913  				level);
914  		else
915  			pr_warn("md: personality for level %s is not loaded!\n",
916  				clevel);
917  	}
918  
919  	return ret;
920  }
921  
922  static void put_pers(struct md_personality *pers)
923  {
924  	module_put(pers->head.owner);
925  }
926  
927  /* return the offset of the super block in 512byte sectors */
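/*
 * For v0.90 metadata, MD_NEW_SIZE_SECTORS() (see md_p.h) rounds the device
 * size down to a 64KiB boundary and steps back one more 64KiB, so the
 * superblock sits at the start of the last aligned 64KiB of the device.
 */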
928  static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
929  {
930  	return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev));
931  }
932  
933  static int alloc_disk_sb(struct md_rdev *rdev)
934  {
935  	rdev->sb_page = alloc_page(GFP_KERNEL);
936  	if (!rdev->sb_page)
937  		return -ENOMEM;
938  	return 0;
939  }
940  
941  void md_rdev_clear(struct md_rdev *rdev)
942  {
943  	if (rdev->sb_page) {
944  		put_page(rdev->sb_page);
945  		rdev->sb_loaded = 0;
946  		rdev->sb_page = NULL;
947  		rdev->sb_start = 0;
948  		rdev->sectors = 0;
949  	}
950  	if (rdev->bb_page) {
951  		put_page(rdev->bb_page);
952  		rdev->bb_page = NULL;
953  	}
954  	badblocks_exit(&rdev->badblocks);
955  }
956  EXPORT_SYMBOL_GPL(md_rdev_clear);
957  
958  static void super_written(struct bio *bio)
959  {
960  	struct md_rdev *rdev = bio->bi_private;
961  	struct mddev *mddev = rdev->mddev;
962  
963  	if (bio->bi_status) {
964  		pr_err("md: %s gets error=%d\n", __func__,
965  		       blk_status_to_errno(bio->bi_status));
966  		md_error(mddev, rdev);
967  		if (!test_bit(Faulty, &rdev->flags)
968  		    && (bio->bi_opf & MD_FAILFAST)) {
969  			set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
970  			set_bit(LastDev, &rdev->flags);
971  		}
972  	} else
973  		clear_bit(LastDev, &rdev->flags);
974  
975  	bio_put(bio);
976  
977  	rdev_dec_pending(rdev, mddev);
978  
979  	if (atomic_dec_and_test(&mddev->pending_writes))
980  		wake_up(&mddev->sb_wait);
981  }
982  
983  void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
984  		   sector_t sector, int size, struct page *page)
985  {
986  	/* write first size bytes of page to sector of rdev
987  	 * Increment mddev->pending_writes before returning
988  	 * and decrement it on completion, waking up sb_wait
989  	 * if zero is reached.
990  	 * If an error occurred, call md_error
991  	 */
992  	struct bio *bio;
993  
994  	if (!page)
995  		return;
996  
997  	if (test_bit(Faulty, &rdev->flags))
998  		return;
999  
1000  	bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev,
1001  			      1,
1002  			      REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_META
1003  				  | REQ_PREFLUSH | REQ_FUA,
1004  			      GFP_NOIO, &mddev->sync_set);
1005  
1006  	atomic_inc(&rdev->nr_pending);
1007  
1008  	bio->bi_iter.bi_sector = sector;
1009  	__bio_add_page(bio, page, size, 0);
1010  	bio->bi_private = rdev;
1011  	bio->bi_end_io = super_written;
1012  
1013  	if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
1014  	    test_bit(FailFast, &rdev->flags) &&
1015  	    !test_bit(LastDev, &rdev->flags))
1016  		bio->bi_opf |= MD_FAILFAST;
1017  
1018  	atomic_inc(&mddev->pending_writes);
1019  	submit_bio(bio);
1020  }
1021  
1022  int md_super_wait(struct mddev *mddev)
1023  {
1024  	/* wait for all superblock writes that were scheduled to complete */
1025  	wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
1026  	if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags))
1027  		return -EAGAIN;
1028  	return 0;
1029  }
1030  
1031  int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
1032  		 struct page *page, blk_opf_t opf, bool metadata_op)
1033  {
1034  	struct bio bio;
1035  	struct bio_vec bvec;
1036  
1037  	if (metadata_op && rdev->meta_bdev)
1038  		bio_init(&bio, rdev->meta_bdev, &bvec, 1, opf);
1039  	else
1040  		bio_init(&bio, rdev->bdev, &bvec, 1, opf);
1041  
1042  	if (metadata_op)
1043  		bio.bi_iter.bi_sector = sector + rdev->sb_start;
1044  	else if (rdev->mddev->reshape_position != MaxSector &&
1045  		 (rdev->mddev->reshape_backwards ==
1046  		  (sector >= rdev->mddev->reshape_position)))
1047  		bio.bi_iter.bi_sector = sector + rdev->new_data_offset;
1048  	else
1049  		bio.bi_iter.bi_sector = sector + rdev->data_offset;
1050  	__bio_add_page(&bio, page, size, 0);
1051  
1052  	submit_bio_wait(&bio);
1053  
1054  	return !bio.bi_status;
1055  }
1056  EXPORT_SYMBOL_GPL(sync_page_io);
1057  
1058  static int read_disk_sb(struct md_rdev *rdev, int size)
1059  {
1060  	if (rdev->sb_loaded)
1061  		return 0;
1062  
1063  	if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true))
1064  		goto fail;
1065  	rdev->sb_loaded = 1;
1066  	return 0;
1067  
1068  fail:
1069  	pr_err("md: disabled device %pg, could not read superblock.\n",
1070  	       rdev->bdev);
1071  	return -EINVAL;
1072  }
1073  
1074  static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
1075  {
1076  	return	sb1->set_uuid0 == sb2->set_uuid0 &&
1077  		sb1->set_uuid1 == sb2->set_uuid1 &&
1078  		sb1->set_uuid2 == sb2->set_uuid2 &&
1079  		sb1->set_uuid3 == sb2->set_uuid3;
1080  }
1081  
1082  static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
1083  {
1084  	int ret;
1085  	mdp_super_t *tmp1, *tmp2;
1086  
1087  	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
1088  	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
1089  
1090  	if (!tmp1 || !tmp2) {
1091  		ret = 0;
1092  		goto abort;
1093  	}
1094  
1095  	*tmp1 = *sb1;
1096  	*tmp2 = *sb2;
1097  
1098  	/*
1099  	 * nr_disks is not constant
1100  	 */
1101  	tmp1->nr_disks = 0;
1102  	tmp2->nr_disks = 0;
1103  
1104  	ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
1105  abort:
1106  	kfree(tmp1);
1107  	kfree(tmp2);
1108  	return ret;
1109  }
1110  
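/*
 * Fold a 32-bit sum into 16 bits with end-around carry; two passes are
 * sufficient because the first addition can leave at most one carry bit.
 */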
1111  static u32 md_csum_fold(u32 csum)
1112  {
1113  	csum = (csum & 0xffff) + (csum >> 16);
1114  	return (csum & 0xffff) + (csum >> 16);
1115  }
1116  
1117  static unsigned int calc_sb_csum(mdp_super_t *sb)
1118  {
1119  	u64 newcsum = 0;
1120  	u32 *sb32 = (u32*)sb;
1121  	int i;
1122  	unsigned int disk_csum, csum;
1123  
1124  	disk_csum = sb->sb_csum;
1125  	sb->sb_csum = 0;
1126  
1127  	for (i = 0; i < MD_SB_BYTES/4 ; i++)
1128  		newcsum += sb32[i];
1129  	csum = (newcsum & 0xffffffff) + (newcsum>>32);
1130  
1131  #ifdef CONFIG_ALPHA
1132  	/* This used to use csum_partial, which was wrong for several
1133  	 * reasons including that different results are returned on
1134  	 * different architectures.  It isn't critical that we get exactly
1135  	 * the same return value as before (we always csum_fold before
1136  	 * testing, and that removes any differences).  However as we
1137  	 * know that csum_partial always returned a 16bit value on
1138  	 * alphas, do a fold to maximise conformity to previous behaviour.
1139  	 */
1140  	sb->sb_csum = md_csum_fold(disk_csum);
1141  #else
1142  	sb->sb_csum = disk_csum;
1143  #endif
1144  	return csum;
1145  }
1146  
1147  /*
1148   * Handle superblock details.
1149   * We want to be able to handle multiple superblock formats
1150   * so we have a common interface to them all, and an array of
1151   * different handlers.
1152   * We rely on user-space to write the initial superblock, and support
1153   * reading and updating of superblocks.
1154   * Interface methods are:
1155   *   int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
1156   *      loads and validates a superblock on dev.
1157   *      if refdev != NULL, compare superblocks on both devices
1158   *    Return:
1159   *      0 - dev has a superblock that is compatible with refdev
1160   *      1 - dev has a superblock that is compatible and newer than refdev
1161   *          so dev should be used as the refdev in future
1162   *     -EINVAL superblock incompatible or invalid
1163   *     -othererror e.g. -EIO
1164   *
1165   *   int validate_super(struct mddev *mddev, struct md_rdev *dev)
1166   *      Verify that dev is acceptable into mddev.
1167   *       The first time, mddev->raid_disks will be 0, and data from
1168   *       dev should be merged in.  Subsequent calls check that dev
1169   *       is new enough.  Return 0 or -EINVAL
1170   *
1171   *   void sync_super(struct mddev *mddev, struct md_rdev *dev)
1172   *     Update the superblock for rdev with data in mddev
1173   *     This does not write to disc.
1174   *
1175   */
1176  
1177  struct super_type  {
1178  	char		    *name;
1179  	struct module	    *owner;
1180  	int		    (*load_super)(struct md_rdev *rdev,
1181  					  struct md_rdev *refdev,
1182  					  int minor_version);
1183  	int		    (*validate_super)(struct mddev *mddev,
1184  					      struct md_rdev *freshest,
1185  					      struct md_rdev *rdev);
1186  	void		    (*sync_super)(struct mddev *mddev,
1187  					  struct md_rdev *rdev);
1188  	unsigned long long  (*rdev_size_change)(struct md_rdev *rdev,
1189  						sector_t num_sectors);
1190  	int		    (*allow_new_offset)(struct md_rdev *rdev,
1191  						unsigned long long new_offset);
1192  };
1193  
1194  /*
1195   * Check that the given mddev has no bitmap.
1196   *
1197   * This function is called from the run method of all personalities that do not
1198   * support bitmaps. It prints an error message and returns non-zero if mddev
1199   * has a bitmap. Otherwise, it returns 0.
1200   *
1201   */
1202  int md_check_no_bitmap(struct mddev *mddev)
1203  {
1204  	if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
1205  		return 0;
1206  	pr_warn("%s: bitmaps are not supported for %s\n",
1207  		mdname(mddev), mddev->pers->head.name);
1208  	return 1;
1209  }
1210  EXPORT_SYMBOL(md_check_no_bitmap);
1211  
1212  /*
1213   * load_super for 0.90.0
1214   */
1215  static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1216  {
1217  	mdp_super_t *sb;
1218  	int ret;
1219  	bool spare_disk = true;
1220  
1221  	/*
1222  	 * Calculate the position of the superblock (512byte sectors),
1223  	 * it's at the end of the disk.
1224  	 *
1225  	 * It also happens to be a multiple of 4Kb.
1226  	 */
1227  	rdev->sb_start = calc_dev_sboffset(rdev);
1228  
1229  	ret = read_disk_sb(rdev, MD_SB_BYTES);
1230  	if (ret)
1231  		return ret;
1232  
1233  	ret = -EINVAL;
1234  
1235  	sb = page_address(rdev->sb_page);
1236  
1237  	if (sb->md_magic != MD_SB_MAGIC) {
1238  		pr_warn("md: invalid raid superblock magic on %pg\n",
1239  			rdev->bdev);
1240  		goto abort;
1241  	}
1242  
1243  	if (sb->major_version != 0 ||
1244  	    sb->minor_version < 90 ||
1245  	    sb->minor_version > 91) {
1246  		pr_warn("Bad version number %d.%d on %pg\n",
1247  			sb->major_version, sb->minor_version, rdev->bdev);
1248  		goto abort;
1249  	}
1250  
1251  	if (sb->raid_disks <= 0)
1252  		goto abort;
1253  
1254  	if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
1255  		pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev);
1256  		goto abort;
1257  	}
1258  
1259  	rdev->preferred_minor = sb->md_minor;
1260  	rdev->data_offset = 0;
1261  	rdev->new_data_offset = 0;
1262  	rdev->sb_size = MD_SB_BYTES;
1263  	rdev->badblocks.shift = -1;
1264  
1265  	rdev->desc_nr = sb->this_disk.number;
1266  
1267  	/* not spare disk */
1268  	if (rdev->desc_nr >= 0 && rdev->desc_nr < MD_SB_DISKS &&
1269  	    sb->disks[rdev->desc_nr].state & ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1270  		spare_disk = false;
1271  
1272  	if (!refdev) {
1273  		if (!spare_disk)
1274  			ret = 1;
1275  		else
1276  			ret = 0;
1277  	} else {
1278  		__u64 ev1, ev2;
1279  		mdp_super_t *refsb = page_address(refdev->sb_page);
1280  		if (!md_uuid_equal(refsb, sb)) {
1281  			pr_warn("md: %pg has different UUID to %pg\n",
1282  				rdev->bdev, refdev->bdev);
1283  			goto abort;
1284  		}
1285  		if (!md_sb_equal(refsb, sb)) {
1286  			pr_warn("md: %pg has same UUID but different superblock to %pg\n",
1287  				rdev->bdev, refdev->bdev);
1288  			goto abort;
1289  		}
1290  		ev1 = md_event(sb);
1291  		ev2 = md_event(refsb);
1292  
1293  		if (!spare_disk && ev1 > ev2)
1294  			ret = 1;
1295  		else
1296  			ret = 0;
1297  	}
1298  	rdev->sectors = rdev->sb_start;
1299  	/* Limit to 4TB as metadata cannot record more than that.
1300  	 * (not needed for Linear and RAID0 as metadata doesn't
1301  	 * record this size)
1302  	 */
1303  	if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1)
1304  		rdev->sectors = (sector_t)(2ULL << 32) - 2;
1305  
1306  	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
1307  		/* "this cannot possibly happen" ... */
1308  		ret = -EINVAL;
1309  
1310   abort:
1311  	return ret;
1312  }
1313  
1314  static u64 md_bitmap_events_cleared(struct mddev *mddev)
1315  {
1316  	struct md_bitmap_stats stats;
1317  	int err;
1318  
1319  	err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
1320  	if (err)
1321  		return 0;
1322  
1323  	return stats.events_cleared;
1324  }
1325  
1326  /*
1327   * validate_super for 0.90.0
1328   * note: we are not using "freshest" for 0.9 superblock
1329   */
1330  static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
1331  {
1332  	mdp_disk_t *desc;
1333  	mdp_super_t *sb = page_address(rdev->sb_page);
1334  	__u64 ev1 = md_event(sb);
1335  
1336  	rdev->raid_disk = -1;
1337  	clear_bit(Faulty, &rdev->flags);
1338  	clear_bit(In_sync, &rdev->flags);
1339  	clear_bit(Bitmap_sync, &rdev->flags);
1340  	clear_bit(WriteMostly, &rdev->flags);
1341  
1342  	if (mddev->raid_disks == 0) {
1343  		mddev->major_version = 0;
1344  		mddev->minor_version = sb->minor_version;
1345  		mddev->patch_version = sb->patch_version;
1346  		mddev->external = 0;
1347  		mddev->chunk_sectors = sb->chunk_size >> 9;
1348  		mddev->ctime = sb->ctime;
1349  		mddev->utime = sb->utime;
1350  		mddev->level = sb->level;
1351  		mddev->clevel[0] = 0;
1352  		mddev->layout = sb->layout;
1353  		mddev->raid_disks = sb->raid_disks;
1354  		mddev->dev_sectors = ((sector_t)sb->size) * 2;
1355  		mddev->events = ev1;
1356  		mddev->bitmap_info.offset = 0;
1357  		mddev->bitmap_info.space = 0;
1358  		/* bitmap can use 60 K after the 4K superblocks */
1359  		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
1360  		mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
1361  		mddev->reshape_backwards = 0;
1362  
1363  		if (mddev->minor_version >= 91) {
1364  			mddev->reshape_position = sb->reshape_position;
1365  			mddev->delta_disks = sb->delta_disks;
1366  			mddev->new_level = sb->new_level;
1367  			mddev->new_layout = sb->new_layout;
1368  			mddev->new_chunk_sectors = sb->new_chunk >> 9;
1369  			if (mddev->delta_disks < 0)
1370  				mddev->reshape_backwards = 1;
1371  		} else {
1372  			mddev->reshape_position = MaxSector;
1373  			mddev->delta_disks = 0;
1374  			mddev->new_level = mddev->level;
1375  			mddev->new_layout = mddev->layout;
1376  			mddev->new_chunk_sectors = mddev->chunk_sectors;
1377  		}
1378  		if (mddev->level == 0)
1379  			mddev->layout = -1;
1380  
1381  		if (sb->state & (1<<MD_SB_CLEAN))
1382  			mddev->recovery_cp = MaxSector;
1383  		else {
1384  			if (sb->events_hi == sb->cp_events_hi &&
1385  				sb->events_lo == sb->cp_events_lo) {
1386  				mddev->recovery_cp = sb->recovery_cp;
1387  			} else
1388  				mddev->recovery_cp = 0;
1389  		}
1390  
1391  		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
1392  		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
1393  		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
1394  		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
1395  
1396  		mddev->max_disks = MD_SB_DISKS;
1397  
1398  		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
1399  		    mddev->bitmap_info.file == NULL) {
1400  			mddev->bitmap_info.offset =
1401  				mddev->bitmap_info.default_offset;
1402  			mddev->bitmap_info.space =
1403  				mddev->bitmap_info.default_space;
1404  		}
1405  
1406  	} else if (mddev->pers == NULL) {
1407  		/* Insist on good event counter while assembling, except
1408  		 * for spares (which don't need an event count) */
1409  		++ev1;
1410  		if (sb->disks[rdev->desc_nr].state & (
1411  			    (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1412  			if (ev1 < mddev->events)
1413  				return -EINVAL;
1414  	} else if (mddev->bitmap) {
1415  		/* if adding to array with a bitmap, then we can accept an
1416  		 * older device ... but not too old.
1417  		 */
1418  		if (ev1 < md_bitmap_events_cleared(mddev))
1419  			return 0;
1420  		if (ev1 < mddev->events)
1421  			set_bit(Bitmap_sync, &rdev->flags);
1422  	} else {
1423  		if (ev1 < mddev->events)
1424  			/* just a hot-add of a new device, leave raid_disk at -1 */
1425  			return 0;
1426  	}
1427  
1428  	desc = sb->disks + rdev->desc_nr;
1429  
1430  	if (desc->state & (1<<MD_DISK_FAULTY))
1431  		set_bit(Faulty, &rdev->flags);
1432  	else if (desc->state & (1<<MD_DISK_SYNC)) {
1433  		set_bit(In_sync, &rdev->flags);
1434  		rdev->raid_disk = desc->raid_disk;
1435  		rdev->saved_raid_disk = desc->raid_disk;
1436  	} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
1437  		/* active but not in sync implies recovery up to
1438  		 * reshape position.  We don't know exactly where
1439  		 * that is, so set to zero for now
1440  		 */
1441  		if (mddev->minor_version >= 91) {
1442  			rdev->recovery_offset = 0;
1443  			rdev->raid_disk = desc->raid_disk;
1444  		}
1445  	}
1446  	if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
1447  		set_bit(WriteMostly, &rdev->flags);
1448  	if (desc->state & (1<<MD_DISK_FAILFAST))
1449  		set_bit(FailFast, &rdev->flags);
1450  	return 0;
1451  }
1452  
1453  /*
1454   * sync_super for 0.90.0
1455   */
1456  static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
1457  {
1458  	mdp_super_t *sb;
1459  	struct md_rdev *rdev2;
1460  	int next_spare = mddev->raid_disks;
1461  
1462  	/* make rdev->sb match mddev data..
1463  	 *
1464  	 * 1/ zero out disks
1465  	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
1466  	 * 3/ any empty disks < next_spare become removed
1467  	 *
1468  	 * disks[0] gets initialised to REMOVED because
1469  	 * we cannot be sure from other fields if it has
1470  	 * been initialised or not.
1471  	 */
1472  	int i;
1473  	int active=0, working=0,failed=0,spare=0,nr_disks=0;
1474  
1475  	rdev->sb_size = MD_SB_BYTES;
1476  
1477  	sb = page_address(rdev->sb_page);
1478  
1479  	memset(sb, 0, sizeof(*sb));
1480  
1481  	sb->md_magic = MD_SB_MAGIC;
1482  	sb->major_version = mddev->major_version;
1483  	sb->patch_version = mddev->patch_version;
1484  	sb->gvalid_words  = 0; /* ignored */
1485  	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
1486  	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
1487  	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
1488  	memcpy(&sb->set_uuid3, mddev->uuid+12,4);
1489  
1490  	sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
1491  	sb->level = mddev->level;
1492  	sb->size = mddev->dev_sectors / 2;
1493  	sb->raid_disks = mddev->raid_disks;
1494  	sb->md_minor = mddev->md_minor;
1495  	sb->not_persistent = 0;
1496  	sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
1497  	sb->state = 0;
1498  	sb->events_hi = (mddev->events>>32);
1499  	sb->events_lo = (u32)mddev->events;
1500  
1501  	if (mddev->reshape_position == MaxSector)
1502  		sb->minor_version = 90;
1503  	else {
1504  		sb->minor_version = 91;
1505  		sb->reshape_position = mddev->reshape_position;
1506  		sb->new_level = mddev->new_level;
1507  		sb->delta_disks = mddev->delta_disks;
1508  		sb->new_layout = mddev->new_layout;
1509  		sb->new_chunk = mddev->new_chunk_sectors << 9;
1510  	}
1511  	mddev->minor_version = sb->minor_version;
1512  	if (mddev->in_sync)
1513  	{
1514  		sb->recovery_cp = mddev->recovery_cp;
1515  		sb->cp_events_hi = (mddev->events>>32);
1516  		sb->cp_events_lo = (u32)mddev->events;
1517  		if (mddev->recovery_cp == MaxSector)
1518  			sb->state = (1<< MD_SB_CLEAN);
1519  	} else
1520  		sb->recovery_cp = 0;
1521  
1522  	sb->layout = mddev->layout;
1523  	sb->chunk_size = mddev->chunk_sectors << 9;
1524  
1525  	if (mddev->bitmap && mddev->bitmap_info.file == NULL)
1526  		sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1527  
1528  	sb->disks[0].state = (1<<MD_DISK_REMOVED);
1529  	rdev_for_each(rdev2, mddev) {
1530  		mdp_disk_t *d;
1531  		int desc_nr;
1532  		int is_active = test_bit(In_sync, &rdev2->flags);
1533  
1534  		if (rdev2->raid_disk >= 0 &&
1535  		    sb->minor_version >= 91)
1536  			/* we have nowhere to store the recovery_offset,
1537  			 * but if it is not below the reshape_position,
1538  			 * we can piggy-back on that.
1539  			 */
1540  			is_active = 1;
1541  		if (rdev2->raid_disk < 0 ||
1542  		    test_bit(Faulty, &rdev2->flags))
1543  			is_active = 0;
1544  		if (is_active)
1545  			desc_nr = rdev2->raid_disk;
1546  		else
1547  			desc_nr = next_spare++;
1548  		rdev2->desc_nr = desc_nr;
1549  		d = &sb->disks[rdev2->desc_nr];
1550  		nr_disks++;
1551  		d->number = rdev2->desc_nr;
1552  		d->major = MAJOR(rdev2->bdev->bd_dev);
1553  		d->minor = MINOR(rdev2->bdev->bd_dev);
1554  		if (is_active)
1555  			d->raid_disk = rdev2->raid_disk;
1556  		else
1557  			d->raid_disk = rdev2->desc_nr; /* compatibility */
1558  		if (test_bit(Faulty, &rdev2->flags))
1559  			d->state = (1<<MD_DISK_FAULTY);
1560  		else if (is_active) {
1561  			d->state = (1<<MD_DISK_ACTIVE);
1562  			if (test_bit(In_sync, &rdev2->flags))
1563  				d->state |= (1<<MD_DISK_SYNC);
1564  			active++;
1565  			working++;
1566  		} else {
1567  			d->state = 0;
1568  			spare++;
1569  			working++;
1570  		}
1571  		if (test_bit(WriteMostly, &rdev2->flags))
1572  			d->state |= (1<<MD_DISK_WRITEMOSTLY);
1573  		if (test_bit(FailFast, &rdev2->flags))
1574  			d->state |= (1<<MD_DISK_FAILFAST);
1575  	}
1576  	/* now set the "removed" and "faulty" bits on any missing devices */
1577  	for (i=0 ; i < mddev->raid_disks ; i++) {
1578  		mdp_disk_t *d = &sb->disks[i];
1579  		if (d->state == 0 && d->number == 0) {
1580  			d->number = i;
1581  			d->raid_disk = i;
1582  			d->state = (1<<MD_DISK_REMOVED);
1583  			d->state |= (1<<MD_DISK_FAULTY);
1584  			failed++;
1585  		}
1586  	}
1587  	sb->nr_disks = nr_disks;
1588  	sb->active_disks = active;
1589  	sb->working_disks = working;
1590  	sb->failed_disks = failed;
1591  	sb->spare_disks = spare;
1592  
1593  	sb->this_disk = sb->disks[rdev->desc_nr];
1594  	sb->sb_csum = calc_sb_csum(sb);
1595  }
1596  
1597  /*
1598   * rdev_size_change for 0.90.0
1599   */
1600  static unsigned long long
1601  super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1602  {
1603  	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1604  		return 0; /* component must fit device */
1605  	if (rdev->mddev->bitmap_info.offset)
1606  		return 0; /* can't move bitmap */
1607  	rdev->sb_start = calc_dev_sboffset(rdev);
1608  	if (!num_sectors || num_sectors > rdev->sb_start)
1609  		num_sectors = rdev->sb_start;
1610  	/* Limit to 4TB as metadata cannot record more than that.
1611  	 * 4TB == 2^32 KB, or 2*2^32 sectors.
1612  	 */
1613  	if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
1614  		num_sectors = (sector_t)(2ULL << 32) - 2;
1615  	do {
1616  		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1617  		       rdev->sb_page);
1618  	} while (md_super_wait(rdev->mddev) < 0);
1619  	return num_sectors;
1620  }
1621  
1622  static int
1623  super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
1624  {
1625  	/* non-zero offset changes not possible with v0.90 */
1626  	return new_offset == 0;
1627  }
1628  
1629  /*
1630   * version 1 superblock
1631   */
1632  
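/*
 * The checksum below covers the 256-byte fixed part of the v1.x superblock
 * plus one 16-bit role entry per device, hence the "256 + max_dev * 2"
 * byte count.
 */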
1633  static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
1634  {
1635  	__le32 disk_csum;
1636  	u32 csum;
1637  	unsigned long long newcsum;
1638  	int size = 256 + le32_to_cpu(sb->max_dev)*2;
1639  	__le32 *isuper = (__le32*)sb;
1640  
1641  	disk_csum = sb->sb_csum;
1642  	sb->sb_csum = 0;
1643  	newcsum = 0;
1644  	for (; size >= 4; size -= 4)
1645  		newcsum += le32_to_cpu(*isuper++);
1646  
1647  	if (size == 2)
1648  		newcsum += le16_to_cpu(*(__le16*) isuper);
1649  
1650  	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1651  	sb->sb_csum = disk_csum;
1652  	return cpu_to_le32(csum);
1653  }
1654  
1655  static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1656  {
1657  	struct mdp_superblock_1 *sb;
1658  	int ret;
1659  	sector_t sb_start;
1660  	sector_t sectors;
1661  	int bmask;
1662  	bool spare_disk = true;
1663  
1664  	/*
1665  	 * Calculate the position of the superblock in 512byte sectors.
1666  	 * It is always aligned to a 4K boundary and
1667  	 * depending on minor_version, it can be:
1668  	 * 0: At least 8K, but less than 12K, from end of device
1669  	 * 1: At start of device
1670  	 * 2: 4K from start of device.
1671  	 */
1672  	switch(minor_version) {
1673  	case 0:
1674  		sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2;
1675  		sb_start &= ~(sector_t)(4*2-1);
1676  		break;
1677  	case 1:
1678  		sb_start = 0;
1679  		break;
1680  	case 2:
1681  		sb_start = 8;
1682  		break;
1683  	default:
1684  		return -EINVAL;
1685  	}
1686  	rdev->sb_start = sb_start;
1687  
1688  	/* superblock is rarely larger than 1K, but it can be larger,
1689  	 * and it is safe to read 4k, so we do that
1690  	 */
1691  	ret = read_disk_sb(rdev, 4096);
1692  	if (ret) return ret;
1693  
1694  	sb = page_address(rdev->sb_page);
1695  
1696  	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1697  	    sb->major_version != cpu_to_le32(1) ||
1698  	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1699  	    le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1700  	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1701  		return -EINVAL;
1702  
1703  	if (calc_sb_1_csum(sb) != sb->sb_csum) {
1704  		pr_warn("md: invalid superblock checksum on %pg\n",
1705  			rdev->bdev);
1706  		return -EINVAL;
1707  	}
1708  	if (le64_to_cpu(sb->data_size) < 10) {
1709  		pr_warn("md: data_size too small on %pg\n",
1710  			rdev->bdev);
1711  		return -EINVAL;
1712  	}
1713  	if (sb->pad0 ||
1714  	    sb->pad3[0] ||
1715  	    memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
1716  		/* Some padding is non-zero, might be a new feature */
1717  		return -EINVAL;
1718  
1719  	rdev->preferred_minor = 0xffff;
1720  	rdev->data_offset = le64_to_cpu(sb->data_offset);
1721  	rdev->new_data_offset = rdev->data_offset;
1722  	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
1723  	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
1724  		rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
1725  	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1726  
1727  	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1728  	bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1729  	if (rdev->sb_size & bmask)
1730  		rdev->sb_size = (rdev->sb_size | bmask) + 1;
1731  
1732  	if (minor_version
1733  	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
1734  		return -EINVAL;
1735  	if (minor_version
1736  	    && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
1737  		return -EINVAL;
1738  
1739  	rdev->desc_nr = le32_to_cpu(sb->dev_number);
1740  
1741  	if (!rdev->bb_page) {
1742  		rdev->bb_page = alloc_page(GFP_KERNEL);
1743  		if (!rdev->bb_page)
1744  			return -ENOMEM;
1745  	}
1746  	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
1747  	    rdev->badblocks.count == 0) {
1748  		/* need to load the bad block list.
1749  		 * Currently we limit it to one page.
1750  		 */
1751  		s32 offset;
1752  		sector_t bb_sector;
1753  		__le64 *bbp;
1754  		int i;
1755  		int sectors = le16_to_cpu(sb->bblog_size);
1756  		if (sectors > (PAGE_SIZE / 512))
1757  			return -EINVAL;
1758  		offset = le32_to_cpu(sb->bblog_offset);
1759  		if (offset == 0)
1760  			return -EINVAL;
1761  		bb_sector = (long long)offset;
1762  		if (!sync_page_io(rdev, bb_sector, sectors << 9,
1763  				  rdev->bb_page, REQ_OP_READ, true))
1764  			return -EIO;
1765  		bbp = (__le64 *)page_address(rdev->bb_page);
1766  		rdev->badblocks.shift = sb->bblog_shift;
1767  		for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
1768  			u64 bb = le64_to_cpu(*bbp);
1769  			int count = bb & (0x3ff);
1770  			u64 sector = bb >> 10;
1771  			sector <<= sb->bblog_shift;
1772  			count <<= sb->bblog_shift;
1773  			if (bb + 1 == 0)
1774  				break;
1775  			if (!badblocks_set(&rdev->badblocks, sector, count, 1))
1776  				return -EINVAL;
1777  		}
1778  	} else if (sb->bblog_offset != 0)
1779  		rdev->badblocks.shift = 0;
1780  
1781  	if ((le32_to_cpu(sb->feature_map) &
1782  	    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) {
1783  		rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
1784  		rdev->ppl.size = le16_to_cpu(sb->ppl.size);
1785  		rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
1786  	}
1787  
1788  	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) &&
1789  	    sb->level != 0)
1790  		return -EINVAL;
1791  
1792  	/* not spare disk */
1793  	if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1794  	    (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
1795  	     le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
1796  		spare_disk = false;
1797  
1798  	if (!refdev) {
1799  		if (!spare_disk)
1800  			ret = 1;
1801  		else
1802  			ret = 0;
1803  	} else {
1804  		__u64 ev1, ev2;
1805  		struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
1806  
1807  		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1808  		    sb->level != refsb->level ||
1809  		    sb->layout != refsb->layout ||
1810  		    sb->chunksize != refsb->chunksize) {
1811  			pr_warn("md: %pg has strangely different superblock to %pg\n",
1812  				rdev->bdev,
1813  				refdev->bdev);
1814  			return -EINVAL;
1815  		}
1816  		ev1 = le64_to_cpu(sb->events);
1817  		ev2 = le64_to_cpu(refsb->events);
1818  
1819  		if (!spare_disk && ev1 > ev2)
1820  			ret = 1;
1821  		else
1822  			ret = 0;
1823  	}
1824  	if (minor_version)
1825  		sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset;
1826  	else
1827  		sectors = rdev->sb_start;
1828  	if (sectors < le64_to_cpu(sb->data_size))
1829  		return -EINVAL;
1830  	rdev->sectors = le64_to_cpu(sb->data_size);
1831  	return ret;
1832  }
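/*
 * A standalone sketch of the on-disk bad block record decoded above, assuming
 * each __le64 entry packs a 10-bit length in the low bits and a 54-bit start
 * sector in the high bits, both in units of 2^bblog_shift sectors.  The name
 * sketch_bb_unpack() is hypothetical and not part of md.c.
 */
#if 0
#include <stdint.h>
#include <stdbool.h>

static bool sketch_bb_unpack(uint64_t raw, uint64_t *start, uint32_t *len)
{
	if (raw + 1 == 0)	/* an all-ones entry terminates the table */
		return false;
	*len = raw & 0x3ff;	/* low 10 bits: length */
	*start = raw >> 10;	/* high 54 bits: first bad sector */
	return true;
}
#endif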
1833  
1834  static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
1835  {
1836  	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1837  	__u64 ev1 = le64_to_cpu(sb->events);
1838  	int role;
1839  
1840  	rdev->raid_disk = -1;
1841  	clear_bit(Faulty, &rdev->flags);
1842  	clear_bit(In_sync, &rdev->flags);
1843  	clear_bit(Bitmap_sync, &rdev->flags);
1844  	clear_bit(WriteMostly, &rdev->flags);
1845  
1846  	if (mddev->raid_disks == 0) {
1847  		mddev->major_version = 1;
1848  		mddev->patch_version = 0;
1849  		mddev->external = 0;
1850  		mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1851  		mddev->ctime = le64_to_cpu(sb->ctime);
1852  		mddev->utime = le64_to_cpu(sb->utime);
1853  		mddev->level = le32_to_cpu(sb->level);
1854  		mddev->clevel[0] = 0;
1855  		mddev->layout = le32_to_cpu(sb->layout);
1856  		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1857  		mddev->dev_sectors = le64_to_cpu(sb->size);
1858  		mddev->events = ev1;
1859  		mddev->bitmap_info.offset = 0;
1860  		mddev->bitmap_info.space = 0;
1861  		/* Default location for bitmap is 1K after superblock
1862  		 * using 3K - total of 4K
1863  		 */
1864  		mddev->bitmap_info.default_offset = 1024 >> 9;
1865  		mddev->bitmap_info.default_space = (4096-1024) >> 9;
1866  		mddev->reshape_backwards = 0;
1867  
1868  		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1869  		memcpy(mddev->uuid, sb->set_uuid, 16);
1870  
1871  		mddev->max_disks =  (4096-256)/2;
1872  
1873  		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1874  		    mddev->bitmap_info.file == NULL) {
1875  			mddev->bitmap_info.offset =
1876  				(__s32)le32_to_cpu(sb->bitmap_offset);
1877  			/* Metadata doesn't record how much space is available.
1878  			 * For 1.0, we assume the bitmap can use up to the superblock
1879  			 * if it sits before it, else up to 4K beyond the superblock.
1880  			 * For others, assume no change is possible.
1881  			 */
1882  			if (mddev->minor_version > 0)
1883  				mddev->bitmap_info.space = 0;
1884  			else if (mddev->bitmap_info.offset > 0)
1885  				mddev->bitmap_info.space =
1886  					8 - mddev->bitmap_info.offset;
1887  			else
1888  				mddev->bitmap_info.space =
1889  					-mddev->bitmap_info.offset;
1890  		}
1891  
1892  		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1893  			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1894  			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1895  			mddev->new_level = le32_to_cpu(sb->new_level);
1896  			mddev->new_layout = le32_to_cpu(sb->new_layout);
1897  			mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1898  			if (mddev->delta_disks < 0 ||
1899  			    (mddev->delta_disks == 0 &&
1900  			     (le32_to_cpu(sb->feature_map)
1901  			      & MD_FEATURE_RESHAPE_BACKWARDS)))
1902  				mddev->reshape_backwards = 1;
1903  		} else {
1904  			mddev->reshape_position = MaxSector;
1905  			mddev->delta_disks = 0;
1906  			mddev->new_level = mddev->level;
1907  			mddev->new_layout = mddev->layout;
1908  			mddev->new_chunk_sectors = mddev->chunk_sectors;
1909  		}
1910  
1911  		if (mddev->level == 0 &&
1912  		    !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT))
1913  			mddev->layout = -1;
1914  
1915  		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
1916  			set_bit(MD_HAS_JOURNAL, &mddev->flags);
1917  
1918  		if (le32_to_cpu(sb->feature_map) &
1919  		    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) {
1920  			if (le32_to_cpu(sb->feature_map) &
1921  			    (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
1922  				return -EINVAL;
1923  			if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) &&
1924  			    (le32_to_cpu(sb->feature_map) &
1925  					    MD_FEATURE_MULTIPLE_PPLS))
1926  				return -EINVAL;
1927  			set_bit(MD_HAS_PPL, &mddev->flags);
1928  		}
1929  	} else if (mddev->pers == NULL) {
1930  		/* Insist on a good event counter while assembling, except for
1931  		 * spares (which don't need an event count).
1932  		 * Similar to mdadm, we allow event counter difference of 1
1933  		 * from the freshest device.
1934  		 */
1935  		if (rdev->desc_nr >= 0 &&
1936  		    rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1937  		    (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
1938  		     le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
1939  			if (ev1 + 1 < mddev->events)
1940  				return -EINVAL;
1941  	} else if (mddev->bitmap) {
1942  		/* If adding to array with a bitmap, then we can accept an
1943  		 * older device, but not too old.
1944  		 */
1945  		if (ev1 < md_bitmap_events_cleared(mddev))
1946  			return 0;
1947  		if (ev1 < mddev->events)
1948  			set_bit(Bitmap_sync, &rdev->flags);
1949  	} else {
1950  		if (ev1 < mddev->events)
1951  			/* just a hot-add of a new device, leave raid_disk at -1 */
1952  			return 0;
1953  	}
1954  
1955  	if (rdev->desc_nr < 0 ||
1956  	    rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1957  		role = MD_DISK_ROLE_SPARE;
1958  		rdev->desc_nr = -1;
1959  	} else if (mddev->pers == NULL && freshest && ev1 < mddev->events) {
1960  		/*
1961  		 * If we are assembling, and our event counter is smaller than the
1962  		 * highest event counter, we cannot trust our superblock about the role.
1963  		 * It could happen that our rdev was marked as Faulty, and all other
1964  		 * superblocks were updated with +1 event counter.
1965  		 * Then, before the next superblock update, which typically happens when
1966  		 * remove_and_add_spares() removes the device from the array, there was
1967  		 * a crash or reboot.
1968  		 * If we allow current rdev without consulting the freshest superblock,
1969  		 * we could cause data corruption.
1970  		 * Note that in this case our event counter is smaller by 1 than the
1971  		 * highest, otherwise, this rdev would not be allowed into array;
1972  		 * both kernel and mdadm allow event counter difference of 1.
1973  		 */
1974  		struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page);
1975  		u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev);
1976  
1977  		if (rdev->desc_nr >= freshest_max_dev) {
1978  			/* this is unexpected, better not proceed */
1979  			pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n",
1980  				mdname(mddev), rdev->bdev, rdev->desc_nr,
1981  				freshest->bdev, freshest_max_dev);
1982  			return -EUCLEAN;
1983  		}
1984  
1985  		role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]);
1986  		pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n",
1987  			 mdname(mddev), rdev->bdev, role, role, freshest->bdev);
1988  	} else {
1989  		role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1990  	}
1991  	switch (role) {
1992  	case MD_DISK_ROLE_SPARE: /* spare */
1993  		break;
1994  	case MD_DISK_ROLE_FAULTY: /* faulty */
1995  		set_bit(Faulty, &rdev->flags);
1996  		break;
1997  	case MD_DISK_ROLE_JOURNAL: /* journal device */
1998  		if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
1999  			/* journal device without journal feature */
2000  			pr_warn("md: journal device provided without journal feature, ignoring the device\n");
2001  			return -EINVAL;
2002  		}
2003  		set_bit(Journal, &rdev->flags);
2004  		rdev->journal_tail = le64_to_cpu(sb->journal_tail);
2005  		rdev->raid_disk = 0;
2006  		break;
2007  	default:
2008  		rdev->saved_raid_disk = role;
2009  		if ((le32_to_cpu(sb->feature_map) &
2010  		     MD_FEATURE_RECOVERY_OFFSET)) {
2011  			rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
2012  			if (!(le32_to_cpu(sb->feature_map) &
2013  			      MD_FEATURE_RECOVERY_BITMAP))
2014  				rdev->saved_raid_disk = -1;
2015  		} else {
2016  			/*
2017  			 * If the array is FROZEN, then the device can't
2018  			 * be in_sync with rest of array.
2019  			 */
2020  			if (!test_bit(MD_RECOVERY_FROZEN,
2021  				      &mddev->recovery))
2022  				set_bit(In_sync, &rdev->flags);
2023  		}
2024  		rdev->raid_disk = role;
2025  		break;
2026  	}
2027  	if (sb->devflags & WriteMostly1)
2028  		set_bit(WriteMostly, &rdev->flags);
2029  	if (sb->devflags & FailFast1)
2030  		set_bit(FailFast, &rdev->flags);
2031  	if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
2032  		set_bit(Replacement, &rdev->flags);
2033  
2034  	return 0;
2035  }
2036  
2037  static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
2038  {
2039  	struct mdp_superblock_1 *sb;
2040  	struct md_rdev *rdev2;
2041  	int max_dev, i;
2042  	/* make rdev->sb match mddev and rdev data. */
2043  
2044  	sb = page_address(rdev->sb_page);
2045  
2046  	sb->feature_map = 0;
2047  	sb->pad0 = 0;
2048  	sb->recovery_offset = cpu_to_le64(0);
2049  	memset(sb->pad3, 0, sizeof(sb->pad3));
2050  
2051  	sb->utime = cpu_to_le64((__u64)mddev->utime);
2052  	sb->events = cpu_to_le64(mddev->events);
2053  	if (mddev->in_sync)
2054  		sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
2055  	else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
2056  		sb->resync_offset = cpu_to_le64(MaxSector);
2057  	else
2058  		sb->resync_offset = cpu_to_le64(0);
2059  
2060  	sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
2061  
2062  	sb->raid_disks = cpu_to_le32(mddev->raid_disks);
2063  	sb->size = cpu_to_le64(mddev->dev_sectors);
2064  	sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
2065  	sb->level = cpu_to_le32(mddev->level);
2066  	sb->layout = cpu_to_le32(mddev->layout);
2067  	if (test_bit(FailFast, &rdev->flags))
2068  		sb->devflags |= FailFast1;
2069  	else
2070  		sb->devflags &= ~FailFast1;
2071  
2072  	if (test_bit(WriteMostly, &rdev->flags))
2073  		sb->devflags |= WriteMostly1;
2074  	else
2075  		sb->devflags &= ~WriteMostly1;
2076  	sb->data_offset = cpu_to_le64(rdev->data_offset);
2077  	sb->data_size = cpu_to_le64(rdev->sectors);
2078  
2079  	if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
2080  		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
2081  		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
2082  	}
2083  
2084  	if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
2085  	    !test_bit(In_sync, &rdev->flags)) {
2086  		sb->feature_map |=
2087  			cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
2088  		sb->recovery_offset =
2089  			cpu_to_le64(rdev->recovery_offset);
2090  		if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
2091  			sb->feature_map |=
2092  				cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
2093  	}
2094  	/* Note: recovery_offset and journal_tail share space  */
2095  	if (test_bit(Journal, &rdev->flags))
2096  		sb->journal_tail = cpu_to_le64(rdev->journal_tail);
2097  	if (test_bit(Replacement, &rdev->flags))
2098  		sb->feature_map |=
2099  			cpu_to_le32(MD_FEATURE_REPLACEMENT);
2100  
2101  	if (mddev->reshape_position != MaxSector) {
2102  		sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
2103  		sb->reshape_position = cpu_to_le64(mddev->reshape_position);
2104  		sb->new_layout = cpu_to_le32(mddev->new_layout);
2105  		sb->delta_disks = cpu_to_le32(mddev->delta_disks);
2106  		sb->new_level = cpu_to_le32(mddev->new_level);
2107  		sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
2108  		if (mddev->delta_disks == 0 &&
2109  		    mddev->reshape_backwards)
2110  			sb->feature_map
2111  				|= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
2112  		if (rdev->new_data_offset != rdev->data_offset) {
2113  			sb->feature_map
2114  				|= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
2115  			sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
2116  							     - rdev->data_offset));
2117  		}
2118  	}
2119  
2120  	if (mddev_is_clustered(mddev))
2121  		sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);
2122  
2123  	if (rdev->badblocks.count == 0)
2124  		/* Nothing to do for bad blocks*/ ;
2125  	else if (sb->bblog_offset == 0)
2126  		/* Cannot record bad blocks on this device */
2127  		md_error(mddev, rdev);
2128  	else {
2129  		struct badblocks *bb = &rdev->badblocks;
2130  		__le64 *bbp = (__le64 *)page_address(rdev->bb_page);
2131  		u64 *p = bb->page;
2132  		sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
2133  		if (bb->changed) {
2134  			unsigned seq;
2135  
2136  retry:
2137  			seq = read_seqbegin(&bb->lock);
2138  
2139  			memset(bbp, 0xff, PAGE_SIZE);
2140  
2141  			for (i = 0 ; i < bb->count ; i++) {
2142  				u64 internal_bb = p[i];
2143  				u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
2144  						| BB_LEN(internal_bb));
2145  				bbp[i] = cpu_to_le64(store_bb);
2146  			}
2147  			bb->changed = 0;
2148  			if (read_seqretry(&bb->lock, seq))
2149  				goto retry;
2150  
2151  			bb->sector = (rdev->sb_start +
2152  				      (int)le32_to_cpu(sb->bblog_offset));
2153  			bb->size = le16_to_cpu(sb->bblog_size);
2154  		}
2155  	}
2156  
2157  	max_dev = 0;
2158  	rdev_for_each(rdev2, mddev)
2159  		if (rdev2->desc_nr+1 > max_dev)
2160  			max_dev = rdev2->desc_nr+1;
2161  
2162  	if (max_dev > le32_to_cpu(sb->max_dev)) {
2163  		int bmask;
2164  		sb->max_dev = cpu_to_le32(max_dev);
2165  		rdev->sb_size = max_dev * 2 + 256;
2166  		bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
2167  		if (rdev->sb_size & bmask)
2168  			rdev->sb_size = (rdev->sb_size | bmask) + 1;
2169  	} else
2170  		max_dev = le32_to_cpu(sb->max_dev);
2171  
2172  	for (i=0; i<max_dev;i++)
2173  		sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
2174  
2175  	if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
2176  		sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
2177  
2178  	if (test_bit(MD_HAS_PPL, &mddev->flags)) {
2179  		if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags))
2180  			sb->feature_map |=
2181  			    cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS);
2182  		else
2183  			sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
2184  		sb->ppl.offset = cpu_to_le16(rdev->ppl.offset);
2185  		sb->ppl.size = cpu_to_le16(rdev->ppl.size);
2186  	}
2187  
2188  	rdev_for_each(rdev2, mddev) {
2189  		i = rdev2->desc_nr;
2190  		if (test_bit(Faulty, &rdev2->flags))
2191  			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
2192  		else if (test_bit(In_sync, &rdev2->flags))
2193  			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
2194  		else if (test_bit(Journal, &rdev2->flags))
2195  			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
2196  		else if (rdev2->raid_disk >= 0)
2197  			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
2198  		else
2199  			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
2200  	}
2201  
2202  	sb->sb_csum = calc_sb_1_csum(sb);
2203  }
2204  
2205  static sector_t super_1_choose_bm_space(sector_t dev_size)
2206  {
2207  	sector_t bm_space;
2208  
2209  	/* if the device is bigger than 8Gig, save 64k for bitmap
2210  	 * usage; if bigger than 200Gig, save 128k
2211  	 */
2212  	if (dev_size < 64*2)
2213  		bm_space = 0;
2214  	else if (dev_size - 64*2 >= 200*1024*1024*2)
2215  		bm_space = 128*2;
2216  	else if (dev_size - 4*2 > 8*1024*1024*2)
2217  		bm_space = 64*2;
2218  	else
2219  		bm_space = 4*2;
2220  	return bm_space;
2221  }
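/*
 * Worked examples for the thresholds above (sizes in 512-byte sectors):
 * a 1 GiB device (2,097,152 sectors) reserves 8 sectors (4K), a 100 GiB
 * device (209,715,200 sectors) reserves 128 sectors (64K), and a 1 TiB
 * device (2,147,483,648 sectors) reserves 256 sectors (128K).
 */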
2222  
2223  static unsigned long long
2224  super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
2225  {
2226  	struct mdp_superblock_1 *sb;
2227  	sector_t max_sectors;
2228  	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
2229  		return 0; /* component must fit device */
2230  	if (rdev->data_offset != rdev->new_data_offset)
2231  		return 0; /* too confusing */
2232  	if (rdev->sb_start < rdev->data_offset) {
2233  		/* minor versions 1 and 2; superblock before data */
2234  		max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset;
2235  		if (!num_sectors || num_sectors > max_sectors)
2236  			num_sectors = max_sectors;
2237  	} else if (rdev->mddev->bitmap_info.offset) {
2238  		/* minor version 0 with bitmap we can't move */
2239  		return 0;
2240  	} else {
2241  		/* minor version 0; superblock after data */
2242  		sector_t sb_start, bm_space;
2243  		sector_t dev_size = bdev_nr_sectors(rdev->bdev);
2244  
2245  		/* 8K is for superblock */
2246  		sb_start = dev_size - 8*2;
2247  		sb_start &= ~(sector_t)(4*2 - 1);
2248  
2249  		bm_space = super_1_choose_bm_space(dev_size);
2250  
2251  		/* Space that can be used to store data must leave room for the
2252  		 * superblock, the bitmap space and the bad block space (4K)
2253  		 */
2254  		max_sectors = sb_start - bm_space - 4*2;
2255  
2256  		if (!num_sectors || num_sectors > max_sectors)
2257  			num_sectors = max_sectors;
2258  		rdev->sb_start = sb_start;
2259  	}
2260  	sb = page_address(rdev->sb_page);
2261  	sb->data_size = cpu_to_le64(num_sectors);
2262  	sb->super_offset = cpu_to_le64(rdev->sb_start);
2263  	sb->sb_csum = calc_sb_1_csum(sb);
2264  	do {
2265  		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
2266  			       rdev->sb_page);
2267  	} while (md_super_wait(rdev->mddev) < 0);
2268  	return num_sectors;
2269  
2270  }
2271  
2272  static int
2273  super_1_allow_new_offset(struct md_rdev *rdev,
2274  			 unsigned long long new_offset)
2275  {
2276  	/* All necessary checks on new >= old have been done */
2277  	if (new_offset >= rdev->data_offset)
2278  		return 1;
2279  
2280  	/* with 1.0 metadata, there is no metadata to tread on
2281  	 * so we can always move back */
2282  	if (rdev->mddev->minor_version == 0)
2283  		return 1;
2284  
2285  	/* otherwise we must be sure not to step on
2286  	 * any metadata, so stay:
2287  	 * 36K beyond start of superblock
2288  	 * beyond end of badblocks
2289  	 * beyond write-intent bitmap
2290  	 */
2291  	if (rdev->sb_start + (32+4)*2 > new_offset)
2292  		return 0;
2293  
2294  	if (!rdev->mddev->bitmap_info.file) {
2295  		struct mddev *mddev = rdev->mddev;
2296  		struct md_bitmap_stats stats;
2297  		int err;
2298  
2299  		err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
2300  		if (!err && rdev->sb_start + mddev->bitmap_info.offset +
2301  		    stats.file_pages * (PAGE_SIZE >> 9) > new_offset)
2302  			return 0;
2303  	}
2304  
2305  	if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
2306  		return 0;
2307  
2308  	return 1;
2309  }
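/*
 * Worked example for the checks above: with a 1.2 superblock (sb_start = 8),
 * (32+4)*2 = 72 sectors means a new data offset below sector 80 (40K into
 * the device) is rejected before the bitmap and bad block checks even run.
 */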
2310  
2311  static struct super_type super_types[] = {
2312  	[0] = {
2313  		.name	= "0.90.0",
2314  		.owner	= THIS_MODULE,
2315  		.load_super	    = super_90_load,
2316  		.validate_super	    = super_90_validate,
2317  		.sync_super	    = super_90_sync,
2318  		.rdev_size_change   = super_90_rdev_size_change,
2319  		.allow_new_offset   = super_90_allow_new_offset,
2320  	},
2321  	[1] = {
2322  		.name	= "md-1",
2323  		.owner	= THIS_MODULE,
2324  		.load_super	    = super_1_load,
2325  		.validate_super	    = super_1_validate,
2326  		.sync_super	    = super_1_sync,
2327  		.rdev_size_change   = super_1_rdev_size_change,
2328  		.allow_new_offset   = super_1_allow_new_offset,
2329  	},
2330  };
2331  
2332  static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
2333  {
2334  	if (mddev->sync_super) {
2335  		mddev->sync_super(mddev, rdev);
2336  		return;
2337  	}
2338  
2339  	BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));
2340  
2341  	super_types[mddev->major_version].sync_super(mddev, rdev);
2342  }
2343  
2344  static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
2345  {
2346  	struct md_rdev *rdev, *rdev2;
2347  
2348  	rcu_read_lock();
2349  	rdev_for_each_rcu(rdev, mddev1) {
2350  		if (test_bit(Faulty, &rdev->flags) ||
2351  		    test_bit(Journal, &rdev->flags) ||
2352  		    rdev->raid_disk == -1)
2353  			continue;
2354  		rdev_for_each_rcu(rdev2, mddev2) {
2355  			if (test_bit(Faulty, &rdev2->flags) ||
2356  			    test_bit(Journal, &rdev2->flags) ||
2357  			    rdev2->raid_disk == -1)
2358  				continue;
2359  			if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) {
2360  				rcu_read_unlock();
2361  				return 1;
2362  			}
2363  		}
2364  	}
2365  	rcu_read_unlock();
2366  	return 0;
2367  }
2368  
2369  static LIST_HEAD(pending_raid_disks);
2370  
2371  /*
2372   * Try to register data integrity profile for an mddev
2373   *
2374   * This is called when an array is started and after a disk has been kicked
2375   * from the array. It only succeeds if all working and active component devices
2376   * are integrity capable with matching profiles.
2377   */
2378  int md_integrity_register(struct mddev *mddev)
2379  {
2380  	if (list_empty(&mddev->disks))
2381  		return 0; /* nothing to do */
2382  	if (mddev_is_dm(mddev) || !blk_get_integrity(mddev->gendisk))
2383  		return 0; /* shouldn't register */
2384  
2385  	pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
2386  	return 0;
2387  }
2388  EXPORT_SYMBOL(md_integrity_register);
2389  
2390  static bool rdev_read_only(struct md_rdev *rdev)
2391  {
2392  	return bdev_read_only(rdev->bdev) ||
2393  		(rdev->meta_bdev && bdev_read_only(rdev->meta_bdev));
2394  }
2395  
2396  static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
2397  {
2398  	char b[BDEVNAME_SIZE];
2399  	int err;
2400  
2401  	/* prevent duplicates */
2402  	if (find_rdev(mddev, rdev->bdev->bd_dev))
2403  		return -EEXIST;
2404  
2405  	if (rdev_read_only(rdev) && mddev->pers)
2406  		return -EROFS;
2407  
2408  	/* make sure rdev->sectors exceeds mddev->dev_sectors */
2409  	if (!test_bit(Journal, &rdev->flags) &&
2410  	    rdev->sectors &&
2411  	    (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) {
2412  		if (mddev->pers) {
2413  			/* Cannot change size, so fail.
2414  			 * If mddev->level <= 0, then we don't care
2415  			 * about aligning sizes (e.g. linear)
2416  			 */
2417  			if (mddev->level > 0)
2418  				return -ENOSPC;
2419  		} else
2420  			mddev->dev_sectors = rdev->sectors;
2421  	}
2422  
2423  	/* Verify rdev->desc_nr is unique.
2424  	 * If it is -1, assign a free number, else
2425  	 * check number is not in use
2426  	 */
2427  	rcu_read_lock();
2428  	if (rdev->desc_nr < 0) {
2429  		int choice = 0;
2430  		if (mddev->pers)
2431  			choice = mddev->raid_disks;
2432  		while (md_find_rdev_nr_rcu(mddev, choice))
2433  			choice++;
2434  		rdev->desc_nr = choice;
2435  	} else {
2436  		if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
2437  			rcu_read_unlock();
2438  			return -EBUSY;
2439  		}
2440  	}
2441  	rcu_read_unlock();
2442  	if (!test_bit(Journal, &rdev->flags) &&
2443  	    mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
2444  		pr_warn("md: %s: array is limited to %d devices\n",
2445  			mdname(mddev), mddev->max_disks);
2446  		return -EBUSY;
2447  	}
2448  	snprintf(b, sizeof(b), "%pg", rdev->bdev);
2449  	strreplace(b, '/', '!');
2450  
2451  	rdev->mddev = mddev;
2452  	pr_debug("md: bind<%s>\n", b);
2453  
2454  	if (mddev->raid_disks)
2455  		mddev_create_serial_pool(mddev, rdev);
2456  
2457  	if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
2458  		goto fail;
2459  
2460  	/* failure here is OK */
2461  	err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block");
2462  	rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
2463  	rdev->sysfs_unack_badblocks =
2464  		sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks");
2465  	rdev->sysfs_badblocks =
2466  		sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks");
2467  
2468  	list_add_rcu(&rdev->same_set, &mddev->disks);
2469  	bd_link_disk_holder(rdev->bdev, mddev->gendisk);
2470  
2471  	/* May as well allow recovery to be retried once */
2472  	mddev->recovery_disabled++;
2473  
2474  	return 0;
2475  
2476   fail:
2477  	pr_warn("md: failed to register dev-%s for %s\n",
2478  		b, mdname(mddev));
2479  	mddev_destroy_serial_pool(mddev, rdev);
2480  	return err;
2481  }
2482  
2483  void md_autodetect_dev(dev_t dev);
2484  
2485  /* just for claiming the bdev */
2486  static struct md_rdev claim_rdev;
2487  
2488  static void export_rdev(struct md_rdev *rdev, struct mddev *mddev)
2489  {
2490  	pr_debug("md: export_rdev(%pg)\n", rdev->bdev);
2491  	md_rdev_clear(rdev);
2492  #ifndef MODULE
2493  	if (test_bit(AutoDetected, &rdev->flags))
2494  		md_autodetect_dev(rdev->bdev->bd_dev);
2495  #endif
2496  	fput(rdev->bdev_file);
2497  	rdev->bdev = NULL;
2498  	kobject_put(&rdev->kobj);
2499  }
2500  
2501  static void md_kick_rdev_from_array(struct md_rdev *rdev)
2502  {
2503  	struct mddev *mddev = rdev->mddev;
2504  
2505  	bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
2506  	list_del_rcu(&rdev->same_set);
2507  	pr_debug("md: unbind<%pg>\n", rdev->bdev);
2508  	mddev_destroy_serial_pool(rdev->mddev, rdev);
2509  	WRITE_ONCE(rdev->mddev, NULL);
2510  	sysfs_remove_link(&rdev->kobj, "block");
2511  	sysfs_put(rdev->sysfs_state);
2512  	sysfs_put(rdev->sysfs_unack_badblocks);
2513  	sysfs_put(rdev->sysfs_badblocks);
2514  	rdev->sysfs_state = NULL;
2515  	rdev->sysfs_unack_badblocks = NULL;
2516  	rdev->sysfs_badblocks = NULL;
2517  	rdev->badblocks.count = 0;
2518  
2519  	synchronize_rcu();
2520  
2521  	/*
2522  	 * kobject_del() will wait for all in-progress writers to be done, and
2523  	 * those writers may hold reconfig_mutex; hence it can't be called under
2524  	 * reconfig_mutex and is delayed to mddev_unlock().
2525  	 */
2526  	list_add(&rdev->same_set, &mddev->deleting);
2527  }
2528  
2529  static void export_array(struct mddev *mddev)
2530  {
2531  	struct md_rdev *rdev;
2532  
2533  	while (!list_empty(&mddev->disks)) {
2534  		rdev = list_first_entry(&mddev->disks, struct md_rdev,
2535  					same_set);
2536  		md_kick_rdev_from_array(rdev);
2537  	}
2538  	mddev->raid_disks = 0;
2539  	mddev->major_version = 0;
2540  }
2541  
2542  static bool set_in_sync(struct mddev *mddev)
2543  {
2544  	lockdep_assert_held(&mddev->lock);
2545  	if (!mddev->in_sync) {
2546  		mddev->sync_checkers++;
2547  		spin_unlock(&mddev->lock);
2548  		percpu_ref_switch_to_atomic_sync(&mddev->writes_pending);
2549  		spin_lock(&mddev->lock);
2550  		if (!mddev->in_sync &&
2551  		    percpu_ref_is_zero(&mddev->writes_pending)) {
2552  			mddev->in_sync = 1;
2553  			/*
2554  			 * Ensure ->in_sync is visible before we clear
2555  			 * ->sync_checkers.
2556  			 */
2557  			smp_mb();
2558  			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
2559  			sysfs_notify_dirent_safe(mddev->sysfs_state);
2560  		}
2561  		if (--mddev->sync_checkers == 0)
2562  			percpu_ref_switch_to_percpu(&mddev->writes_pending);
2563  	}
2564  	if (mddev->safemode == 1)
2565  		mddev->safemode = 0;
2566  	return mddev->in_sync;
2567  }
2568  
2569  static void sync_sbs(struct mddev *mddev, int nospares)
2570  {
2571  	/* Update each superblock (in-memory image), but
2572  	 * if we are allowed to, skip spares which already
2573  	 * have the right event counter, or have one earlier
2574  	 * (which would mean they aren't being marked as dirty
2575  	 * with the rest of the array)
2576  	 */
2577  	struct md_rdev *rdev;
2578  	rdev_for_each(rdev, mddev) {
2579  		if (rdev->sb_events == mddev->events ||
2580  		    (nospares &&
2581  		     rdev->raid_disk < 0 &&
2582  		     rdev->sb_events+1 == mddev->events)) {
2583  			/* Don't update this superblock */
2584  			rdev->sb_loaded = 2;
2585  		} else {
2586  			sync_super(mddev, rdev);
2587  			rdev->sb_loaded = 1;
2588  		}
2589  	}
2590  }
2591  
2592  static bool does_sb_need_changing(struct mddev *mddev)
2593  {
2594  	struct md_rdev *rdev = NULL, *iter;
2595  	struct mdp_superblock_1 *sb;
2596  	int role;
2597  
2598  	/* Find a good rdev */
2599  	rdev_for_each(iter, mddev)
2600  		if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) {
2601  			rdev = iter;
2602  			break;
2603  		}
2604  
2605  	/* No good device found. */
2606  	if (!rdev)
2607  		return false;
2608  
2609  	sb = page_address(rdev->sb_page);
2610  	/* Check if a device has become faulty or a spare become active */
2611  	rdev_for_each(rdev, mddev) {
2612  		role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
2613  		/* Device activated? */
2614  		if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 &&
2615  		    !test_bit(Faulty, &rdev->flags))
2616  			return true;
2617  		/* Device turned faulty? */
2618  		if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX))
2619  			return true;
2620  	}
2621  
2622  	/* Check if any mddev parameters have changed */
2623  	if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
2624  	    (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
2625  	    (mddev->layout != le32_to_cpu(sb->layout)) ||
2626  	    (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
2627  	    (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
2628  		return true;
2629  
2630  	return false;
2631  }
2632  
2633  void md_update_sb(struct mddev *mddev, int force_change)
2634  {
2635  	struct md_rdev *rdev;
2636  	int sync_req;
2637  	int nospares = 0;
2638  	int any_badblocks_changed = 0;
2639  	int ret = -1;
2640  
2641  	if (!md_is_rdwr(mddev)) {
2642  		if (force_change)
2643  			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2644  		return;
2645  	}
2646  
2647  repeat:
2648  	if (mddev_is_clustered(mddev)) {
2649  		if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2650  			force_change = 1;
2651  		if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
2652  			nospares = 1;
2653  		ret = mddev->cluster_ops->metadata_update_start(mddev);
2654  		/* Has someone else updated the sb? */
2655  		if (!does_sb_need_changing(mddev)) {
2656  			if (ret == 0)
2657  				mddev->cluster_ops->metadata_update_cancel(mddev);
2658  			bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2659  							 BIT(MD_SB_CHANGE_DEVS) |
2660  							 BIT(MD_SB_CHANGE_CLEAN));
2661  			return;
2662  		}
2663  	}
2664  
2665  	/*
2666  	 * First make sure individual recovery_offsets are correct
2667  	 * curr_resync_completed can only be used during recovery.
2668  	 * During reshape/resync it might use array-addresses rather
2669  	 * than device addresses.
2670  	 */
2671  	rdev_for_each(rdev, mddev) {
2672  		if (rdev->raid_disk >= 0 &&
2673  		    mddev->delta_disks >= 0 &&
2674  		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
2675  		    test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
2676  		    !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
2677  		    !test_bit(Journal, &rdev->flags) &&
2678  		    !test_bit(In_sync, &rdev->flags) &&
2679  		    mddev->curr_resync_completed > rdev->recovery_offset)
2680  				rdev->recovery_offset = mddev->curr_resync_completed;
2681  
2682  	}
2683  	if (!mddev->persistent) {
2684  		clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
2685  		clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2686  		if (!mddev->external) {
2687  			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
2688  			rdev_for_each(rdev, mddev) {
2689  				if (rdev->badblocks.changed) {
2690  					rdev->badblocks.changed = 0;
2691  					ack_all_badblocks(&rdev->badblocks);
2692  					md_error(mddev, rdev);
2693  				}
2694  				clear_bit(Blocked, &rdev->flags);
2695  				clear_bit(BlockedBadBlocks, &rdev->flags);
2696  				wake_up(&rdev->blocked_wait);
2697  			}
2698  		}
2699  		wake_up(&mddev->sb_wait);
2700  		return;
2701  	}
2702  
2703  	spin_lock(&mddev->lock);
2704  
2705  	mddev->utime = ktime_get_real_seconds();
2706  
2707  	if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2708  		force_change = 1;
2709  	if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
2710  		/* just a clean <-> dirty transition; possibly leave spares alone,
2711  		 * though if events isn't the right even/odd, we will have to do
2712  		 * spares after all
2713  		 */
2714  		nospares = 1;
2715  	if (force_change)
2716  		nospares = 0;
2717  	if (mddev->degraded)
2718  		/* If the array is degraded, then skipping spares is both
2719  		 * dangerous and fairly pointless.
2720  		 * Dangerous because a device that was removed from the array
2721  		 * might have an event_count that still looks up-to-date,
2722  		 * so it can be re-added without a resync.
2723  		 * Pointless because if there are any spares to skip,
2724  		 * then a recovery will happen and soon that array won't
2725  		 * be degraded any more and the spare can go back to sleep then.
2726  		 */
2727  		nospares = 0;
2728  
2729  	sync_req = mddev->in_sync;
2730  
2731  	/* If this is just a dirty<->clean transition, and the array is clean
2732  	 * and 'events' is odd, we can roll back to the previous clean state */
2733  	if (nospares
2734  	    && (mddev->in_sync && mddev->recovery_cp == MaxSector)
2735  	    && mddev->can_decrease_events
2736  	    && mddev->events != 1) {
2737  		mddev->events--;
2738  		mddev->can_decrease_events = 0;
2739  	} else {
2740  		/* otherwise we have to go forward and ... */
2741  		mddev->events ++;
2742  		mddev->can_decrease_events = nospares;
2743  	}
2744  
2745  	/*
2746  	 * This 64-bit counter should never wrap.
2747  	 * Either we are in around ~1 trillion A.C., assuming
2748  	 * 1 reboot per second, or we have a bug...
2749  	 */
2750  	WARN_ON(mddev->events == 0);
2751  
2752  	rdev_for_each(rdev, mddev) {
2753  		if (rdev->badblocks.changed)
2754  			any_badblocks_changed++;
2755  		if (test_bit(Faulty, &rdev->flags))
2756  			set_bit(FaultRecorded, &rdev->flags);
2757  	}
2758  
2759  	sync_sbs(mddev, nospares);
2760  	spin_unlock(&mddev->lock);
2761  
2762  	pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
2763  		 mdname(mddev), mddev->in_sync);
2764  
2765  	mddev_add_trace_msg(mddev, "md md_update_sb");
2766  rewrite:
2767  	mddev->bitmap_ops->update_sb(mddev->bitmap);
2768  	rdev_for_each(rdev, mddev) {
2769  		if (rdev->sb_loaded != 1)
2770  			continue; /* no noise on spare devices */
2771  
2772  		if (!test_bit(Faulty, &rdev->flags)) {
2773  			md_super_write(mddev,rdev,
2774  				       rdev->sb_start, rdev->sb_size,
2775  				       rdev->sb_page);
2776  			pr_debug("md: (write) %pg's sb offset: %llu\n",
2777  				 rdev->bdev,
2778  				 (unsigned long long)rdev->sb_start);
2779  			rdev->sb_events = mddev->events;
2780  			if (rdev->badblocks.size) {
2781  				md_super_write(mddev, rdev,
2782  					       rdev->badblocks.sector,
2783  					       rdev->badblocks.size << 9,
2784  					       rdev->bb_page);
2785  				rdev->badblocks.size = 0;
2786  			}
2787  
2788  		} else
2789  			pr_debug("md: %pg (skipping faulty)\n",
2790  				 rdev->bdev);
2791  	}
2792  	if (md_super_wait(mddev) < 0)
2793  		goto rewrite;
2794  	/* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */
2795  
2796  	if (mddev_is_clustered(mddev) && ret == 0)
2797  		mddev->cluster_ops->metadata_update_finish(mddev);
2798  
2799  	if (mddev->in_sync != sync_req ||
2800  	    !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2801  			       BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)))
2802  		/* have to write it out again */
2803  		goto repeat;
2804  	wake_up(&mddev->sb_wait);
2805  	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2806  		sysfs_notify_dirent_safe(mddev->sysfs_completed);
2807  
2808  	rdev_for_each(rdev, mddev) {
2809  		if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2810  			clear_bit(Blocked, &rdev->flags);
2811  
2812  		if (any_badblocks_changed)
2813  			ack_all_badblocks(&rdev->badblocks);
2814  		clear_bit(BlockedBadBlocks, &rdev->flags);
2815  		wake_up(&rdev->blocked_wait);
2816  	}
2817  }
2818  EXPORT_SYMBOL(md_update_sb);
2819  
2820  static int add_bound_rdev(struct md_rdev *rdev)
2821  {
2822  	struct mddev *mddev = rdev->mddev;
2823  	int err = 0;
2824  	bool add_journal = test_bit(Journal, &rdev->flags);
2825  
2826  	if (!mddev->pers->hot_remove_disk || add_journal) {
2827  		/* If there is hot_add_disk but no hot_remove_disk
2828  		 * then added disks are for geometry changes
2829  		 * and should be added immediately.
2830  		 */
2831  		super_types[mddev->major_version].
2832  			validate_super(mddev, NULL/*freshest*/, rdev);
2833  		err = mddev->pers->hot_add_disk(mddev, rdev);
2834  		if (err) {
2835  			md_kick_rdev_from_array(rdev);
2836  			return err;
2837  		}
2838  	}
2839  	sysfs_notify_dirent_safe(rdev->sysfs_state);
2840  
2841  	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2842  	if (mddev->degraded)
2843  		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
2844  	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2845  	md_new_event();
2846  	return 0;
2847  }
2848  
2849  /* words written to sysfs files may, or may not, be \n terminated.
2850   * We want to accept either case. For this we use cmd_match.
2851   */
2852  static int cmd_match(const char *cmd, const char *str)
2853  {
2854  	/* See if cmd, written into a sysfs file, matches
2855  	 * str.  They must either be the same, or cmd can
2856  	 * have a trailing newline
2857  	 */
2858  	while (*cmd && *str && *cmd == *str) {
2859  		cmd++;
2860  		str++;
2861  	}
2862  	if (*cmd == '\n')
2863  		cmd++;
2864  	if (*str || *cmd)
2865  		return 0;
2866  	return 1;
2867  }
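/*
 * A standalone sketch of how cmd_match() treats trailing newlines, assuming
 * the function body is copied verbatim from above; the test strings are
 * illustrative only, and the block is kept under #if 0.
 */
#if 0
#include <assert.h>

int main(void)
{
	assert(cmd_match("faulty", "faulty"));		/* exact match accepted */
	assert(cmd_match("faulty\n", "faulty"));	/* trailing newline accepted */
	assert(!cmd_match("fault", "faulty"));		/* truncated input rejected */
	assert(!cmd_match("faultyX", "faulty"));	/* extra characters rejected */
	return 0;
}
#endif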
2868  
2869  struct rdev_sysfs_entry {
2870  	struct attribute attr;
2871  	ssize_t (*show)(struct md_rdev *, char *);
2872  	ssize_t (*store)(struct md_rdev *, const char *, size_t);
2873  };
2874  
2875  static ssize_t
2876  state_show(struct md_rdev *rdev, char *page)
2877  {
2878  	char *sep = ",";
2879  	size_t len = 0;
2880  	unsigned long flags = READ_ONCE(rdev->flags);
2881  
2882  	if (test_bit(Faulty, &flags) ||
2883  	    (!test_bit(ExternalBbl, &flags) &&
2884  	    rdev->badblocks.unacked_exist))
2885  		len += sprintf(page+len, "faulty%s", sep);
2886  	if (test_bit(In_sync, &flags))
2887  		len += sprintf(page+len, "in_sync%s", sep);
2888  	if (test_bit(Journal, &flags))
2889  		len += sprintf(page+len, "journal%s", sep);
2890  	if (test_bit(WriteMostly, &flags))
2891  		len += sprintf(page+len, "write_mostly%s", sep);
2892  	if (test_bit(Blocked, &flags) ||
2893  	    (rdev->badblocks.unacked_exist
2894  	     && !test_bit(Faulty, &flags)))
2895  		len += sprintf(page+len, "blocked%s", sep);
2896  	if (!test_bit(Faulty, &flags) &&
2897  	    !test_bit(Journal, &flags) &&
2898  	    !test_bit(In_sync, &flags))
2899  		len += sprintf(page+len, "spare%s", sep);
2900  	if (test_bit(WriteErrorSeen, &flags))
2901  		len += sprintf(page+len, "write_error%s", sep);
2902  	if (test_bit(WantReplacement, &flags))
2903  		len += sprintf(page+len, "want_replacement%s", sep);
2904  	if (test_bit(Replacement, &flags))
2905  		len += sprintf(page+len, "replacement%s", sep);
2906  	if (test_bit(ExternalBbl, &flags))
2907  		len += sprintf(page+len, "external_bbl%s", sep);
2908  	if (test_bit(FailFast, &flags))
2909  		len += sprintf(page+len, "failfast%s", sep);
2910  
2911  	if (len)
2912  		len -= strlen(sep);
2913  
2914  	return len+sprintf(page+len, "\n");
2915  }
2916  
2917  static ssize_t
2918  state_store(struct md_rdev *rdev, const char *buf, size_t len)
2919  {
2920  	/* can write
2921  	 *  faulty  - simulates an error
2922  	 *  remove  - disconnects the device
2923  	 *  writemostly - sets write_mostly
2924  	 *  -writemostly - clears write_mostly
2925  	 *  blocked - sets the Blocked flags
2926  	 *  -blocked - clears the Blocked and possibly simulates an error
2927  	 *  insync - sets Insync providing device isn't active
2928  	 *  -insync - clear Insync for a device with a slot assigned,
2929  	 *            so that it gets rebuilt based on bitmap
2930  	 *  write_error - sets WriteErrorSeen
2931  	 *  -write_error - clears WriteErrorSeen
2932  	 *  {,-}failfast - set/clear FailFast
2933  	 */
2934  
2935  	struct mddev *mddev = rdev->mddev;
2936  	int err = -EINVAL;
2937  	bool need_update_sb = false;
2938  
2939  	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
2940  		md_error(rdev->mddev, rdev);
2941  
2942  		if (test_bit(MD_BROKEN, &rdev->mddev->flags))
2943  			err = -EBUSY;
2944  		else
2945  			err = 0;
2946  	} else if (cmd_match(buf, "remove")) {
2947  		if (rdev->mddev->pers) {
2948  			clear_bit(Blocked, &rdev->flags);
2949  			remove_and_add_spares(rdev->mddev, rdev);
2950  		}
2951  		if (rdev->raid_disk >= 0)
2952  			err = -EBUSY;
2953  		else {
2954  			err = 0;
2955  			if (mddev_is_clustered(mddev))
2956  				err = mddev->cluster_ops->remove_disk(mddev, rdev);
2957  
2958  			if (err == 0) {
2959  				md_kick_rdev_from_array(rdev);
2960  				if (mddev->pers)
2961  					set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2962  				md_new_event();
2963  			}
2964  		}
2965  	} else if (cmd_match(buf, "writemostly")) {
2966  		set_bit(WriteMostly, &rdev->flags);
2967  		mddev_create_serial_pool(rdev->mddev, rdev);
2968  		need_update_sb = true;
2969  		err = 0;
2970  	} else if (cmd_match(buf, "-writemostly")) {
2971  		mddev_destroy_serial_pool(rdev->mddev, rdev);
2972  		clear_bit(WriteMostly, &rdev->flags);
2973  		need_update_sb = true;
2974  		err = 0;
2975  	} else if (cmd_match(buf, "blocked")) {
2976  		set_bit(Blocked, &rdev->flags);
2977  		err = 0;
2978  	} else if (cmd_match(buf, "-blocked")) {
2979  		if (!test_bit(Faulty, &rdev->flags) &&
2980  		    !test_bit(ExternalBbl, &rdev->flags) &&
2981  		    rdev->badblocks.unacked_exist) {
2982  			/* metadata handler doesn't understand badblocks,
2983  			 * so we need to fail the device
2984  			 */
2985  			md_error(rdev->mddev, rdev);
2986  		}
2987  		clear_bit(Blocked, &rdev->flags);
2988  		clear_bit(BlockedBadBlocks, &rdev->flags);
2989  		wake_up(&rdev->blocked_wait);
2990  		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2991  
2992  		err = 0;
2993  	} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2994  		set_bit(In_sync, &rdev->flags);
2995  		err = 0;
2996  	} else if (cmd_match(buf, "failfast")) {
2997  		set_bit(FailFast, &rdev->flags);
2998  		need_update_sb = true;
2999  		err = 0;
3000  	} else if (cmd_match(buf, "-failfast")) {
3001  		clear_bit(FailFast, &rdev->flags);
3002  		need_update_sb = true;
3003  		err = 0;
3004  	} else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
3005  		   !test_bit(Journal, &rdev->flags)) {
3006  		if (rdev->mddev->pers == NULL) {
3007  			clear_bit(In_sync, &rdev->flags);
3008  			rdev->saved_raid_disk = rdev->raid_disk;
3009  			rdev->raid_disk = -1;
3010  			err = 0;
3011  		}
3012  	} else if (cmd_match(buf, "write_error")) {
3013  		set_bit(WriteErrorSeen, &rdev->flags);
3014  		err = 0;
3015  	} else if (cmd_match(buf, "-write_error")) {
3016  		clear_bit(WriteErrorSeen, &rdev->flags);
3017  		err = 0;
3018  	} else if (cmd_match(buf, "want_replacement")) {
3019  		/* Any non-spare device that is not a replacement can
3020  		 * become want_replacement at any time, but we then need to
3021  		 * check if recovery is needed.
3022  		 */
3023  		if (rdev->raid_disk >= 0 &&
3024  		    !test_bit(Journal, &rdev->flags) &&
3025  		    !test_bit(Replacement, &rdev->flags))
3026  			set_bit(WantReplacement, &rdev->flags);
3027  		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
3028  		err = 0;
3029  	} else if (cmd_match(buf, "-want_replacement")) {
3030  		/* Clearing 'want_replacement' is always allowed.
3031  		 * Once replacement starts it is too late though.
3032  		 */
3033  		err = 0;
3034  		clear_bit(WantReplacement, &rdev->flags);
3035  	} else if (cmd_match(buf, "replacement")) {
3036  		/* Can only set a device as a replacement when array has not
3037  		 * yet been started.  Once running, replacement is automatic
3038  		 * from spares, or by assigning 'slot'.
3039  		 */
3040  		if (rdev->mddev->pers)
3041  			err = -EBUSY;
3042  		else {
3043  			set_bit(Replacement, &rdev->flags);
3044  			err = 0;
3045  		}
3046  	} else if (cmd_match(buf, "-replacement")) {
3047  		/* Similarly, can only clear Replacement before start */
3048  		if (rdev->mddev->pers)
3049  			err = -EBUSY;
3050  		else {
3051  			clear_bit(Replacement, &rdev->flags);
3052  			err = 0;
3053  		}
3054  	} else if (cmd_match(buf, "re-add")) {
3055  		if (!rdev->mddev->pers)
3056  			err = -EINVAL;
3057  		else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) &&
3058  				rdev->saved_raid_disk >= 0) {
3059  			/* clear_bit is performed _after_ all the devices
3060  			 * have their local Faulty bit cleared. If any writes
3061  			 * happen in the meantime in the local node, they
3062  			 * will land in the local bitmap, which will be synced
3063  			 * by this node eventually
3064  			 */
3065  			if (!mddev_is_clustered(rdev->mddev) ||
3066  			    (err = mddev->cluster_ops->gather_bitmaps(rdev)) == 0) {
3067  				clear_bit(Faulty, &rdev->flags);
3068  				err = add_bound_rdev(rdev);
3069  			}
3070  		} else
3071  			err = -EBUSY;
3072  	} else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) {
3073  		set_bit(ExternalBbl, &rdev->flags);
3074  		rdev->badblocks.shift = 0;
3075  		err = 0;
3076  	} else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) {
3077  		clear_bit(ExternalBbl, &rdev->flags);
3078  		err = 0;
3079  	}
3080  	if (need_update_sb)
3081  		md_update_sb(mddev, 1);
3082  	if (!err)
3083  		sysfs_notify_dirent_safe(rdev->sysfs_state);
3084  	return err ? err : len;
3085  }
3086  static struct rdev_sysfs_entry rdev_state =
3087  __ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store);
3088  
3089  static ssize_t
3090  errors_show(struct md_rdev *rdev, char *page)
3091  {
3092  	return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
3093  }
3094  
3095  static ssize_t
3096  errors_store(struct md_rdev *rdev, const char *buf, size_t len)
3097  {
3098  	unsigned int n;
3099  	int rv;
3100  
3101  	rv = kstrtouint(buf, 10, &n);
3102  	if (rv < 0)
3103  		return rv;
3104  	atomic_set(&rdev->corrected_errors, n);
3105  	return len;
3106  }
3107  static struct rdev_sysfs_entry rdev_errors =
3108  __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
3109  
3110  static ssize_t
3111  slot_show(struct md_rdev *rdev, char *page)
3112  {
3113  	if (test_bit(Journal, &rdev->flags))
3114  		return sprintf(page, "journal\n");
3115  	else if (rdev->raid_disk < 0)
3116  		return sprintf(page, "none\n");
3117  	else
3118  		return sprintf(page, "%d\n", rdev->raid_disk);
3119  }
3120  
3121  static ssize_t
3122  slot_store(struct md_rdev *rdev, const char *buf, size_t len)
3123  {
3124  	int slot;
3125  	int err;
3126  
3127  	if (test_bit(Journal, &rdev->flags))
3128  		return -EBUSY;
3129  	if (strncmp(buf, "none", 4)==0)
3130  		slot = -1;
3131  	else {
3132  		err = kstrtouint(buf, 10, (unsigned int *)&slot);
3133  		if (err < 0)
3134  			return err;
3135  		if (slot < 0)
3136  			/* overflow */
3137  			return -ENOSPC;
3138  	}
3139  	if (rdev->mddev->pers && slot == -1) {
3140  		/* Setting 'slot' on an active array requires also
3141  		 * updating the 'rd%d' link, and communicating
3142  		 * with the personality with ->hot_*_disk.
3143  		 * For now we only support removing
3144  		 * failed/spare devices.  This normally happens automatically,
3145  		 * but not when the metadata is externally managed.
3146  		 */
3147  		if (rdev->raid_disk == -1)
3148  			return -EEXIST;
3149  		/* personality does all needed checks */
3150  		if (rdev->mddev->pers->hot_remove_disk == NULL)
3151  			return -EINVAL;
3152  		clear_bit(Blocked, &rdev->flags);
3153  		remove_and_add_spares(rdev->mddev, rdev);
3154  		if (rdev->raid_disk >= 0)
3155  			return -EBUSY;
3156  		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
3157  	} else if (rdev->mddev->pers) {
3158  		/* Activating a spare .. or possibly reactivating
3159  		 * if we ever get bitmaps working here.
3160  		 */
3161  		int err;
3162  
3163  		if (rdev->raid_disk != -1)
3164  			return -EBUSY;
3165  
3166  		if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
3167  			return -EBUSY;
3168  
3169  		if (rdev->mddev->pers->hot_add_disk == NULL)
3170  			return -EINVAL;
3171  
3172  		if (slot >= rdev->mddev->raid_disks &&
3173  		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
3174  			return -ENOSPC;
3175  
3176  		rdev->raid_disk = slot;
3177  		if (test_bit(In_sync, &rdev->flags))
3178  			rdev->saved_raid_disk = slot;
3179  		else
3180  			rdev->saved_raid_disk = -1;
3181  		clear_bit(In_sync, &rdev->flags);
3182  		clear_bit(Bitmap_sync, &rdev->flags);
3183  		err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev);
3184  		if (err) {
3185  			rdev->raid_disk = -1;
3186  			return err;
3187  		} else
3188  			sysfs_notify_dirent_safe(rdev->sysfs_state);
3189  		/* failure here is OK */;
3190  		sysfs_link_rdev(rdev->mddev, rdev);
3191  		/* don't wakeup anyone, leave that to userspace. */
3192  	} else {
3193  		if (slot >= rdev->mddev->raid_disks &&
3194  		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
3195  			return -ENOSPC;
3196  		rdev->raid_disk = slot;
3197  		/* assume it is working */
3198  		clear_bit(Faulty, &rdev->flags);
3199  		clear_bit(WriteMostly, &rdev->flags);
3200  		set_bit(In_sync, &rdev->flags);
3201  		sysfs_notify_dirent_safe(rdev->sysfs_state);
3202  	}
3203  	return len;
3204  }
3205  
3206  static struct rdev_sysfs_entry rdev_slot =
3207  __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
3208  
3209  static ssize_t
3210  offset_show(struct md_rdev *rdev, char *page)
3211  {
3212  	return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
3213  }
3214  
3215  static ssize_t
3216  offset_store(struct md_rdev *rdev, const char *buf, size_t len)
3217  {
3218  	unsigned long long offset;
3219  	if (kstrtoull(buf, 10, &offset) < 0)
3220  		return -EINVAL;
3221  	if (rdev->mddev->pers && rdev->raid_disk >= 0)
3222  		return -EBUSY;
3223  	if (rdev->sectors && rdev->mddev->external)
3224  		/* Must set offset before size, so overlap checks
3225  		 * can be sane */
3226  		return -EBUSY;
3227  	rdev->data_offset = offset;
3228  	rdev->new_data_offset = offset;
3229  	return len;
3230  }
3231  
3232  static struct rdev_sysfs_entry rdev_offset =
3233  __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
3234  
3235  static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
3236  {
3237  	return sprintf(page, "%llu\n",
3238  		       (unsigned long long)rdev->new_data_offset);
3239  }
3240  
3241  static ssize_t new_offset_store(struct md_rdev *rdev,
3242  				const char *buf, size_t len)
3243  {
3244  	unsigned long long new_offset;
3245  	struct mddev *mddev = rdev->mddev;
3246  
3247  	if (kstrtoull(buf, 10, &new_offset) < 0)
3248  		return -EINVAL;
3249  
3250  	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3251  		return -EBUSY;
3252  	if (new_offset == rdev->data_offset)
3253  		/* reset is always permitted */
3254  		;
3255  	else if (new_offset > rdev->data_offset) {
3256  		/* must not push array size beyond rdev_sectors */
3257  		if (new_offset - rdev->data_offset
3258  		    + mddev->dev_sectors > rdev->sectors)
3259  				return -E2BIG;
3260  	}
3261  	/* Metadata worries about other space details. */
3262  
3263  	/* decreasing the offset is inconsistent with a backwards
3264  	 * reshape.
3265  	 */
3266  	if (new_offset < rdev->data_offset &&
3267  	    mddev->reshape_backwards)
3268  		return -EINVAL;
3269  	/* Increasing offset is inconsistent with forwards
3270  	 * reshape.  reshape_direction should be set to
3271  	 * 'backwards' first.
3272  	 */
3273  	if (new_offset > rdev->data_offset &&
3274  	    !mddev->reshape_backwards)
3275  		return -EINVAL;
3276  
3277  	if (mddev->pers && mddev->persistent &&
3278  	    !super_types[mddev->major_version]
3279  	    .allow_new_offset(rdev, new_offset))
3280  		return -E2BIG;
3281  	rdev->new_data_offset = new_offset;
3282  	if (new_offset > rdev->data_offset)
3283  		mddev->reshape_backwards = 1;
3284  	else if (new_offset < rdev->data_offset)
3285  		mddev->reshape_backwards = 0;
3286  
3287  	return len;
3288  }
3289  static struct rdev_sysfs_entry rdev_new_offset =
3290  __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
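/*
 * Editorial note (interpretation, not from the original source):
 * new_offset_store() above sets reshape_backwards from the direction of
 * the move -- roughly, copying data to a larger offset has to proceed
 * from the end of the device towards the start so that source data is
 * not overwritten, while shrinking the offset implies a forwards reshape.
 */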
3291  
3292  static ssize_t
3293  rdev_size_show(struct md_rdev *rdev, char *page)
3294  {
3295  	return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
3296  }
3297  
3298  static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b)
3299  {
3300  	/* check if two start/length pairs overlap */
3301  	if (a->data_offset + a->sectors <= b->data_offset)
3302  		return false;
3303  	if (b->data_offset + b->sectors <= a->data_offset)
3304  		return false;
3305  	return true;
3306  }
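/*
 * Worked example (editorial): the ranges are half-open, so a region
 * starting at sector 0 with 100 sectors and one starting at sector 100
 * do NOT overlap (they merely touch), while 0 + 101 sectors and
 * 100 + 50 sectors do overlap.
 */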
3307  
3308  static bool md_rdev_overlaps(struct md_rdev *rdev)
3309  {
3310  	struct mddev *mddev;
3311  	struct md_rdev *rdev2;
3312  
3313  	spin_lock(&all_mddevs_lock);
3314  	list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
3315  		if (test_bit(MD_DELETED, &mddev->flags))
3316  			continue;
3317  		rdev_for_each(rdev2, mddev) {
3318  			if (rdev != rdev2 && rdev->bdev == rdev2->bdev &&
3319  			    md_rdevs_overlap(rdev, rdev2)) {
3320  				spin_unlock(&all_mddevs_lock);
3321  				return true;
3322  			}
3323  		}
3324  	}
3325  	spin_unlock(&all_mddevs_lock);
3326  	return false;
3327  }
3328  
3329  static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
3330  {
3331  	unsigned long long blocks;
3332  	sector_t new;
3333  
3334  	if (kstrtoull(buf, 10, &blocks) < 0)
3335  		return -EINVAL;
3336  
3337  	if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
3338  		return -EINVAL; /* sector conversion overflow */
3339  
3340  	new = blocks * 2;
3341  	if (new != blocks * 2)
3342  		return -EINVAL; /* unsigned long long to sector_t overflow */
3343  
3344  	*sectors = new;
3345  	return 0;
3346  }
3347  
3348  static ssize_t
3349  rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3350  {
3351  	struct mddev *my_mddev = rdev->mddev;
3352  	sector_t oldsectors = rdev->sectors;
3353  	sector_t sectors;
3354  
3355  	if (test_bit(Journal, &rdev->flags))
3356  		return -EBUSY;
3357  	if (strict_blocks_to_sectors(buf, &sectors) < 0)
3358  		return -EINVAL;
3359  	if (rdev->data_offset != rdev->new_data_offset)
3360  		return -EINVAL; /* too confusing */
3361  	if (my_mddev->pers && rdev->raid_disk >= 0) {
3362  		if (my_mddev->persistent) {
3363  			sectors = super_types[my_mddev->major_version].
3364  				rdev_size_change(rdev, sectors);
3365  			if (!sectors)
3366  				return -EBUSY;
3367  		} else if (!sectors)
3368  			sectors = bdev_nr_sectors(rdev->bdev) -
3369  				rdev->data_offset;
3370  		if (!my_mddev->pers->resize)
3371  			/* Cannot change size for RAID0 or Linear etc */
3372  			return -EINVAL;
3373  	}
3374  	if (sectors < my_mddev->dev_sectors)
3375  		return -EINVAL; /* component must fit device */
3376  
3377  	rdev->sectors = sectors;
3378  
3379  	/*
3380  	 * Check that all other rdevs with the same bdev do not overlap.  This
3381  	 * check does not provide a hard guarantee, it just helps avoid
3382  	 * dangerous mistakes.
3383  	 */
3384  	if (sectors > oldsectors && my_mddev->external &&
3385  	    md_rdev_overlaps(rdev)) {
3386  		/*
3387  		 * Someone else could have slipped in a size change here, but
3388  		 * doing so is just silly.  We put oldsectors back because we
3389  		 * know it is safe, and trust userspace not to race with itself.
3390  		 */
3391  		rdev->sectors = oldsectors;
3392  		return -EBUSY;
3393  	}
3394  	return len;
3395  }
3396  
3397  static struct rdev_sysfs_entry rdev_size =
3398  __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
3399  
3400  static ssize_t recovery_start_show(struct md_rdev *rdev, char *page)
3401  {
3402  	unsigned long long recovery_start = rdev->recovery_offset;
3403  
3404  	if (test_bit(In_sync, &rdev->flags) ||
3405  	    recovery_start == MaxSector)
3406  		return sprintf(page, "none\n");
3407  
3408  	return sprintf(page, "%llu\n", recovery_start);
3409  }
3410  
3411  static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len)
3412  {
3413  	unsigned long long recovery_start;
3414  
3415  	if (cmd_match(buf, "none"))
3416  		recovery_start = MaxSector;
3417  	else if (kstrtoull(buf, 10, &recovery_start))
3418  		return -EINVAL;
3419  
3420  	if (rdev->mddev->pers &&
3421  	    rdev->raid_disk >= 0)
3422  		return -EBUSY;
3423  
3424  	rdev->recovery_offset = recovery_start;
3425  	if (recovery_start == MaxSector)
3426  		set_bit(In_sync, &rdev->flags);
3427  	else
3428  		clear_bit(In_sync, &rdev->flags);
3429  	return len;
3430  }
3431  
3432  static struct rdev_sysfs_entry rdev_recovery_start =
3433  __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
3434  
3435  /* sysfs access to bad-blocks list.
3436   * We present two files.
3437   * 'bad_blocks' lists sector numbers and lengths of ranges that
3438   *    are recorded as bad.  The list is truncated to fit within
3439   *    the one-page limit of sysfs.
3440   *    Writing "sector length" to this file adds an acknowledged
3441   *    bad block to the list.
3442   * 'unacknowledged_bad_blocks' lists bad blocks that have not yet
3443   *    been acknowledged.  Writing to this file adds bad blocks
3444   *    without acknowledging them.  This is largely for testing.
3445   */
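/*
 * Illustrative usage (editorial sketch, paths assumed): for an array md0
 * with member sdb1 the files appear as
 * /sys/block/md0/md/dev-sdb1/bad_blocks and
 * /sys/block/md0/md/dev-sdb1/unacknowledged_bad_blocks.
 *
 *   echo "2000 8" > /sys/block/md0/md/dev-sdb1/bad_blocks   # 8 sectors at 2000
 *   cat /sys/block/md0/md/dev-sdb1/bad_blocks               # "sector length" pairs
 */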
3446  static ssize_t bb_show(struct md_rdev *rdev, char *page)
3447  {
3448  	return badblocks_show(&rdev->badblocks, page, 0);
3449  }
3450  static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len)
3451  {
3452  	int rv = badblocks_store(&rdev->badblocks, page, len, 0);
3453  	/* Maybe that ack was all we needed */
3454  	if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
3455  		wake_up(&rdev->blocked_wait);
3456  	return rv;
3457  }
3458  static struct rdev_sysfs_entry rdev_bad_blocks =
3459  __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
3460  
3461  static ssize_t ubb_show(struct md_rdev *rdev, char *page)
3462  {
3463  	return badblocks_show(&rdev->badblocks, page, 1);
3464  }
3465  static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
3466  {
3467  	return badblocks_store(&rdev->badblocks, page, len, 1);
3468  }
3469  static struct rdev_sysfs_entry rdev_unack_bad_blocks =
3470  __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
3471  
3472  static ssize_t
3473  ppl_sector_show(struct md_rdev *rdev, char *page)
3474  {
3475  	return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector);
3476  }
3477  
3478  static ssize_t
3479  ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len)
3480  {
3481  	unsigned long long sector;
3482  
3483  	if (kstrtoull(buf, 10, &sector) < 0)
3484  		return -EINVAL;
3485  	if (sector != (sector_t)sector)
3486  		return -EINVAL;
3487  
3488  	if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3489  	    rdev->raid_disk >= 0)
3490  		return -EBUSY;
3491  
3492  	if (rdev->mddev->persistent) {
3493  		if (rdev->mddev->major_version == 0)
3494  			return -EINVAL;
3495  		if ((sector > rdev->sb_start &&
3496  		     sector - rdev->sb_start > S16_MAX) ||
3497  		    (sector < rdev->sb_start &&
3498  		     rdev->sb_start - sector > -S16_MIN))
3499  			return -EINVAL;
3500  		rdev->ppl.offset = sector - rdev->sb_start;
3501  	} else if (!rdev->mddev->external) {
3502  		return -EBUSY;
3503  	}
3504  	rdev->ppl.sector = sector;
3505  	return len;
3506  }
3507  
3508  static struct rdev_sysfs_entry rdev_ppl_sector =
3509  __ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store);
3510  
3511  static ssize_t
3512  ppl_size_show(struct md_rdev *rdev, char *page)
3513  {
3514  	return sprintf(page, "%u\n", rdev->ppl.size);
3515  }
3516  
3517  static ssize_t
3518  ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3519  {
3520  	unsigned int size;
3521  
3522  	if (kstrtouint(buf, 10, &size) < 0)
3523  		return -EINVAL;
3524  
3525  	if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3526  	    rdev->raid_disk >= 0)
3527  		return -EBUSY;
3528  
3529  	if (rdev->mddev->persistent) {
3530  		if (rdev->mddev->major_version == 0)
3531  			return -EINVAL;
3532  		if (size > U16_MAX)
3533  			return -EINVAL;
3534  	} else if (!rdev->mddev->external) {
3535  		return -EBUSY;
3536  	}
3537  	rdev->ppl.size = size;
3538  	return len;
3539  }
3540  
3541  static struct rdev_sysfs_entry rdev_ppl_size =
3542  __ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store);
3543  
3544  static struct attribute *rdev_default_attrs[] = {
3545  	&rdev_state.attr,
3546  	&rdev_errors.attr,
3547  	&rdev_slot.attr,
3548  	&rdev_offset.attr,
3549  	&rdev_new_offset.attr,
3550  	&rdev_size.attr,
3551  	&rdev_recovery_start.attr,
3552  	&rdev_bad_blocks.attr,
3553  	&rdev_unack_bad_blocks.attr,
3554  	&rdev_ppl_sector.attr,
3555  	&rdev_ppl_size.attr,
3556  	NULL,
3557  };
3558  ATTRIBUTE_GROUPS(rdev_default);
3559  static ssize_t
3560  rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3561  {
3562  	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3563  	struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3564  
3565  	if (!entry->show)
3566  		return -EIO;
3567  	if (!rdev->mddev)
3568  		return -ENODEV;
3569  	return entry->show(rdev, page);
3570  }
3571  
3572  static ssize_t
3573  rdev_attr_store(struct kobject *kobj, struct attribute *attr,
3574  	      const char *page, size_t length)
3575  {
3576  	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3577  	struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3578  	struct kernfs_node *kn = NULL;
3579  	bool suspend = false;
3580  	ssize_t rv;
3581  	struct mddev *mddev = READ_ONCE(rdev->mddev);
3582  
3583  	if (!entry->store)
3584  		return -EIO;
3585  	if (!capable(CAP_SYS_ADMIN))
3586  		return -EACCES;
3587  	if (!mddev)
3588  		return -ENODEV;
3589  
3590  	if (entry->store == state_store) {
3591  		if (cmd_match(page, "remove"))
3592  			kn = sysfs_break_active_protection(kobj, attr);
3593  		if (cmd_match(page, "remove") || cmd_match(page, "re-add") ||
3594  		    cmd_match(page, "writemostly") ||
3595  		    cmd_match(page, "-writemostly"))
3596  			suspend = true;
3597  	}
3598  
3599  	rv = suspend ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev);
3600  	if (!rv) {
3601  		if (rdev->mddev == NULL)
3602  			rv = -ENODEV;
3603  		else
3604  			rv = entry->store(rdev, page, length);
3605  		suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev);
3606  	}
3607  
3608  	if (kn)
3609  		sysfs_unbreak_active_protection(kn);
3610  
3611  	return rv;
3612  }
3613  
3614  static void rdev_free(struct kobject *ko)
3615  {
3616  	struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj);
3617  	kfree(rdev);
3618  }
3619  static const struct sysfs_ops rdev_sysfs_ops = {
3620  	.show		= rdev_attr_show,
3621  	.store		= rdev_attr_store,
3622  };
3623  static const struct kobj_type rdev_ktype = {
3624  	.release	= rdev_free,
3625  	.sysfs_ops	= &rdev_sysfs_ops,
3626  	.default_groups	= rdev_default_groups,
3627  };
3628  
3629  int md_rdev_init(struct md_rdev *rdev)
3630  {
3631  	rdev->desc_nr = -1;
3632  	rdev->saved_raid_disk = -1;
3633  	rdev->raid_disk = -1;
3634  	rdev->flags = 0;
3635  	rdev->data_offset = 0;
3636  	rdev->new_data_offset = 0;
3637  	rdev->sb_events = 0;
3638  	rdev->last_read_error = 0;
3639  	rdev->sb_loaded = 0;
3640  	rdev->bb_page = NULL;
3641  	atomic_set(&rdev->nr_pending, 0);
3642  	atomic_set(&rdev->read_errors, 0);
3643  	atomic_set(&rdev->corrected_errors, 0);
3644  
3645  	INIT_LIST_HEAD(&rdev->same_set);
3646  	init_waitqueue_head(&rdev->blocked_wait);
3647  
3648  	/* Add space to store bad block list.
3649  	 * This reserves the space even on arrays where it cannot
3650  	 * be used - I wonder if that matters
3651  	 */
3652  	return badblocks_init(&rdev->badblocks, 0);
3653  }
3654  EXPORT_SYMBOL_GPL(md_rdev_init);
3655  
3656  /*
3657   * Import a device. If 'super_format' >= 0, then sanity check the superblock
3658   *
3659   * mark the device faulty if:
3660   *
3661   *   - the device is nonexistent (zero size)
3662   *   - the device has no valid superblock
3663   *
3664   * a faulty rdev _never_ has rdev->sb set.
3665   */
3666  static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
3667  {
3668  	struct md_rdev *rdev;
3669  	sector_t size;
3670  	int err;
3671  
3672  	rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
3673  	if (!rdev)
3674  		return ERR_PTR(-ENOMEM);
3675  
3676  	err = md_rdev_init(rdev);
3677  	if (err)
3678  		goto out_free_rdev;
3679  	err = alloc_disk_sb(rdev);
3680  	if (err)
3681  		goto out_clear_rdev;
3682  
3683  	rdev->bdev_file = bdev_file_open_by_dev(newdev,
3684  			BLK_OPEN_READ | BLK_OPEN_WRITE,
3685  			super_format == -2 ? &claim_rdev : rdev, NULL);
3686  	if (IS_ERR(rdev->bdev_file)) {
3687  		pr_warn("md: could not open device unknown-block(%u,%u).\n",
3688  			MAJOR(newdev), MINOR(newdev));
3689  		err = PTR_ERR(rdev->bdev_file);
3690  		goto out_clear_rdev;
3691  	}
3692  	rdev->bdev = file_bdev(rdev->bdev_file);
3693  
3694  	kobject_init(&rdev->kobj, &rdev_ktype);
3695  
3696  	size = bdev_nr_bytes(rdev->bdev) >> BLOCK_SIZE_BITS;
3697  	if (!size) {
3698  		pr_warn("md: %pg has zero or unknown size, marking faulty!\n",
3699  			rdev->bdev);
3700  		err = -EINVAL;
3701  		goto out_blkdev_put;
3702  	}
3703  
3704  	if (super_format >= 0) {
3705  		err = super_types[super_format].
3706  			load_super(rdev, NULL, super_minor);
3707  		if (err == -EINVAL) {
3708  			pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n",
3709  				rdev->bdev,
3710  				super_format, super_minor);
3711  			goto out_blkdev_put;
3712  		}
3713  		if (err < 0) {
3714  			pr_warn("md: could not read %pg's sb, not importing!\n",
3715  				rdev->bdev);
3716  			goto out_blkdev_put;
3717  		}
3718  	}
3719  
3720  	return rdev;
3721  
3722  out_blkdev_put:
3723  	fput(rdev->bdev_file);
3724  out_clear_rdev:
3725  	md_rdev_clear(rdev);
3726  out_free_rdev:
3727  	kfree(rdev);
3728  	return ERR_PTR(err);
3729  }
3730  
3731  /*
3732   * Check a full RAID array for plausibility
3733   */
3734  
3735  static int analyze_sbs(struct mddev *mddev)
3736  {
3737  	int i;
3738  	struct md_rdev *rdev, *freshest, *tmp;
3739  
3740  	freshest = NULL;
3741  	rdev_for_each_safe(rdev, tmp, mddev)
3742  		switch (super_types[mddev->major_version].
3743  			load_super(rdev, freshest, mddev->minor_version)) {
3744  		case 1:
3745  			freshest = rdev;
3746  			break;
3747  		case 0:
3748  			break;
3749  		default:
3750  			pr_warn("md: fatal superblock inconsistency in %pg -- removing from array\n",
3751  				rdev->bdev);
3752  			md_kick_rdev_from_array(rdev);
3753  		}
3754  
3755  	/* Cannot find a valid fresh disk */
3756  	if (!freshest) {
3757  		pr_warn("md: cannot find a valid disk\n");
3758  		return -EINVAL;
3759  	}
3760  
3761  	super_types[mddev->major_version].
3762  		validate_super(mddev, NULL/*freshest*/, freshest);
3763  
3764  	i = 0;
3765  	rdev_for_each_safe(rdev, tmp, mddev) {
3766  		if (mddev->max_disks &&
3767  		    (rdev->desc_nr >= mddev->max_disks ||
3768  		     i > mddev->max_disks)) {
3769  			pr_warn("md: %s: %pg: only %d devices permitted\n",
3770  				mdname(mddev), rdev->bdev,
3771  				mddev->max_disks);
3772  			md_kick_rdev_from_array(rdev);
3773  			continue;
3774  		}
3775  		if (rdev != freshest) {
3776  			if (super_types[mddev->major_version].
3777  			    validate_super(mddev, freshest, rdev)) {
3778  				pr_warn("md: kicking non-fresh %pg from array!\n",
3779  					rdev->bdev);
3780  				md_kick_rdev_from_array(rdev);
3781  				continue;
3782  			}
3783  		}
3784  		if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks)) &&
3785  		    !test_bit(Journal, &rdev->flags)) {
3786  			rdev->raid_disk = -1;
3787  			clear_bit(In_sync, &rdev->flags);
3788  		}
3789  	}
3790  
3791  	return 0;
3792  }
3793  
3794  /* Read a fixed-point number.
3795   * Numbers in sysfs attributes should be in "standard" units where
3796   * possible, so time should be in seconds.
3797   * However we internally use a much smaller unit such as
3798   * milliseconds or jiffies.
3799   * This function takes a decimal number with a possible fractional
3800   * component, and produces an integer which is the result of
3801   * multiplying that number by 10^'scale'.
3802   * all without any floating-point arithmetic.
3803   */
3804  int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
3805  {
3806  	unsigned long result = 0;
3807  	long decimals = -1;
3808  	while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
3809  		if (*cp == '.')
3810  			decimals = 0;
3811  		else if (decimals < scale) {
3812  			unsigned int value;
3813  			value = *cp - '0';
3814  			result = result * 10 + value;
3815  			if (decimals >= 0)
3816  				decimals++;
3817  		}
3818  		cp++;
3819  	}
3820  	if (*cp == '\n')
3821  		cp++;
3822  	if (*cp)
3823  		return -EINVAL;
3824  	if (decimals < 0)
3825  		decimals = 0;
3826  	*res = result * int_pow(10, scale - decimals);
3827  	return 0;
3828  }
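/*
 * Worked example (editorial): with scale=3, "5.3\n" is parsed as
 * result=53 with one decimal digit seen, so *res = 53 * 10^(3-1) = 5300
 * (5.3 seconds expressed in milliseconds); a plain "5" yields 5000.
 */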
3829  
3830  static ssize_t
3831  safe_delay_show(struct mddev *mddev, char *page)
3832  {
3833  	unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ;
3834  
3835  	return sprintf(page, "%u.%03u\n", msec/1000, msec%1000);
3836  }
3837  static ssize_t
3838  safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
3839  {
3840  	unsigned long msec;
3841  
3842  	if (mddev_is_clustered(mddev)) {
3843  		pr_warn("md: Safemode is disabled for clustered mode\n");
3844  		return -EINVAL;
3845  	}
3846  
3847  	if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ)
3848  		return -EINVAL;
3849  	if (msec == 0)
3850  		mddev->safemode_delay = 0;
3851  	else {
3852  		unsigned long old_delay = mddev->safemode_delay;
3853  		unsigned long new_delay = (msec*HZ)/1000;
3854  
3855  		if (new_delay == 0)
3856  			new_delay = 1;
3857  		mddev->safemode_delay = new_delay;
3858  		if (new_delay < old_delay || old_delay == 0)
3859  			mod_timer(&mddev->safemode_timer, jiffies+1);
3860  	}
3861  	return len;
3862  }
3863  static struct md_sysfs_entry md_safe_delay =
3864  __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
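/*
 * Illustrative usage (editorial sketch, paths assumed): safe_mode_delay
 * takes seconds with an optional fractional part, parsed by
 * strict_strtoul_scaled(..., 3) into milliseconds and then converted to
 * jiffies:
 *
 *   echo 0.200 > /sys/block/md0/md/safe_mode_delay   # ~200 ms
 *   cat /sys/block/md0/md/safe_mode_delay            # typically "0.200"
 */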
3865  
3866  static ssize_t
3867  level_show(struct mddev *mddev, char *page)
3868  {
3869  	struct md_personality *p;
3870  	int ret;
3871  	spin_lock(&mddev->lock);
3872  	p = mddev->pers;
3873  	if (p)
3874  		ret = sprintf(page, "%s\n", p->head.name);
3875  	else if (mddev->clevel[0])
3876  		ret = sprintf(page, "%s\n", mddev->clevel);
3877  	else if (mddev->level != LEVEL_NONE)
3878  		ret = sprintf(page, "%d\n", mddev->level);
3879  	else
3880  		ret = 0;
3881  	spin_unlock(&mddev->lock);
3882  	return ret;
3883  }
3884  
3885  static ssize_t
3886  level_store(struct mddev *mddev, const char *buf, size_t len)
3887  {
3888  	char clevel[16];
3889  	ssize_t rv;
3890  	size_t slen = len;
3891  	struct md_personality *pers, *oldpers;
3892  	long level;
3893  	void *priv, *oldpriv;
3894  	struct md_rdev *rdev;
3895  
3896  	if (slen == 0 || slen >= sizeof(clevel))
3897  		return -EINVAL;
3898  
3899  	rv = mddev_suspend_and_lock(mddev);
3900  	if (rv)
3901  		return rv;
3902  
3903  	if (mddev->pers == NULL) {
3904  		memcpy(mddev->clevel, buf, slen);
3905  		if (mddev->clevel[slen-1] == '\n')
3906  			slen--;
3907  		mddev->clevel[slen] = 0;
3908  		mddev->level = LEVEL_NONE;
3909  		rv = len;
3910  		goto out_unlock;
3911  	}
3912  	rv = -EROFS;
3913  	if (!md_is_rdwr(mddev))
3914  		goto out_unlock;
3915  
3916  	/* request to change the personality.  Need to ensure:
3917  	 *  - array is not engaged in resync/recovery/reshape
3918  	 *  - old personality can be suspended
3919  	 *  - new personality will access other array.
3920  	 */
3921  
3922  	rv = -EBUSY;
3923  	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3924  	    mddev->reshape_position != MaxSector ||
3925  	    mddev->sysfs_active)
3926  		goto out_unlock;
3927  
3928  	rv = -EINVAL;
3929  	if (!mddev->pers->quiesce) {
3930  		pr_warn("md: %s: %s does not support online personality change\n",
3931  			mdname(mddev), mddev->pers->head.name);
3932  		goto out_unlock;
3933  	}
3934  
3935  	/* Now find the new personality */
3936  	memcpy(clevel, buf, slen);
3937  	if (clevel[slen-1] == '\n')
3938  		slen--;
3939  	clevel[slen] = 0;
3940  	if (kstrtol(clevel, 10, &level))
3941  		level = LEVEL_NONE;
3942  
3943  	if (request_module("md-%s", clevel) != 0)
3944  		request_module("md-level-%s", clevel);
3945  	pers = get_pers(level, clevel);
3946  	if (!pers) {
3947  		rv = -EINVAL;
3948  		goto out_unlock;
3949  	}
3950  
3951  	if (pers == mddev->pers) {
3952  		/* Nothing to do! */
3953  		put_pers(pers);
3954  		rv = len;
3955  		goto out_unlock;
3956  	}
3957  	if (!pers->takeover) {
3958  		put_pers(pers);
3959  		pr_warn("md: %s: %s does not support personality takeover\n",
3960  			mdname(mddev), clevel);
3961  		rv = -EINVAL;
3962  		goto out_unlock;
3963  	}
3964  
3965  	rdev_for_each(rdev, mddev)
3966  		rdev->new_raid_disk = rdev->raid_disk;
3967  
3968  	/* ->takeover must set new_* and/or delta_disks
3969  	 * if it succeeds, and may set them when it fails.
3970  	 */
3971  	priv = pers->takeover(mddev);
3972  	if (IS_ERR(priv)) {
3973  		mddev->new_level = mddev->level;
3974  		mddev->new_layout = mddev->layout;
3975  		mddev->new_chunk_sectors = mddev->chunk_sectors;
3976  		mddev->raid_disks -= mddev->delta_disks;
3977  		mddev->delta_disks = 0;
3978  		mddev->reshape_backwards = 0;
3979  		put_pers(pers);
3980  		pr_warn("md: %s: %s would not accept array\n",
3981  			mdname(mddev), clevel);
3982  		rv = PTR_ERR(priv);
3983  		goto out_unlock;
3984  	}
3985  
3986  	/* Looks like we have a winner */
3987  	mddev_detach(mddev);
3988  
3989  	spin_lock(&mddev->lock);
3990  	oldpers = mddev->pers;
3991  	oldpriv = mddev->private;
3992  	mddev->pers = pers;
3993  	mddev->private = priv;
3994  	strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel));
3995  	mddev->level = mddev->new_level;
3996  	mddev->layout = mddev->new_layout;
3997  	mddev->chunk_sectors = mddev->new_chunk_sectors;
3998  	mddev->delta_disks = 0;
3999  	mddev->reshape_backwards = 0;
4000  	mddev->degraded = 0;
4001  	spin_unlock(&mddev->lock);
4002  
4003  	if (oldpers->sync_request == NULL &&
4004  	    mddev->external) {
4005  		/* We are converting from a no-redundancy array
4006  		 * to a redundancy array and metadata is managed
4007  		 * externally so we need to be sure that writes
4008  		 * won't block due to a need to transition
4009  		 *      clean->dirty
4010  		 * until external management is started.
4011  		 */
4012  		mddev->in_sync = 0;
4013  		mddev->safemode_delay = 0;
4014  		mddev->safemode = 0;
4015  	}
4016  
4017  	oldpers->free(mddev, oldpriv);
4018  
4019  	if (oldpers->sync_request == NULL &&
4020  	    pers->sync_request != NULL) {
4021  		/* need to add the md_redundancy_group */
4022  		if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
4023  			pr_warn("md: cannot register extra attributes for %s\n",
4024  				mdname(mddev));
4025  		mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
4026  		mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
4027  		mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
4028  	}
4029  	if (oldpers->sync_request != NULL &&
4030  	    pers->sync_request == NULL) {
4031  		/* need to remove the md_redundancy_group */
4032  		if (mddev->to_remove == NULL)
4033  			mddev->to_remove = &md_redundancy_group;
4034  	}
4035  
4036  	put_pers(oldpers);
4037  
4038  	rdev_for_each(rdev, mddev) {
4039  		if (rdev->raid_disk < 0)
4040  			continue;
4041  		if (rdev->new_raid_disk >= mddev->raid_disks)
4042  			rdev->new_raid_disk = -1;
4043  		if (rdev->new_raid_disk == rdev->raid_disk)
4044  			continue;
4045  		sysfs_unlink_rdev(mddev, rdev);
4046  	}
4047  	rdev_for_each(rdev, mddev) {
4048  		if (rdev->raid_disk < 0)
4049  			continue;
4050  		if (rdev->new_raid_disk == rdev->raid_disk)
4051  			continue;
4052  		rdev->raid_disk = rdev->new_raid_disk;
4053  		if (rdev->raid_disk < 0)
4054  			clear_bit(In_sync, &rdev->flags);
4055  		else {
4056  			if (sysfs_link_rdev(mddev, rdev))
4057  				pr_warn("md: cannot register rd%d for %s after level change\n",
4058  					rdev->raid_disk, mdname(mddev));
4059  		}
4060  	}
4061  
4062  	if (pers->sync_request == NULL) {
4063  		/* this is now an array without redundancy, so
4064  		 * it must always be in_sync
4065  		 */
4066  		mddev->in_sync = 1;
4067  		del_timer_sync(&mddev->safemode_timer);
4068  	}
4069  	pers->run(mddev);
4070  	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
4071  	if (!mddev->thread)
4072  		md_update_sb(mddev, 1);
4073  	sysfs_notify_dirent_safe(mddev->sysfs_level);
4074  	md_new_event();
4075  	rv = len;
4076  out_unlock:
4077  	mddev_unlock_and_resume(mddev);
4078  	return rv;
4079  }
4080  
4081  static struct md_sysfs_entry md_level =
4082  __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
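/*
 * Illustrative usage (editorial sketch, path assumed): writing a
 * personality name to "level" on a running array attempts an online
 * takeover, provided the target personality implements ->takeover
 * (for example the raid5 -> raid6 conversion):
 *
 *   echo raid6 > /sys/block/md0/md/level
 */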
4083  
4084  static ssize_t
4085  new_level_show(struct mddev *mddev, char *page)
4086  {
4087  	return sprintf(page, "%d\n", mddev->new_level);
4088  }
4089  
4090  static ssize_t
4091  new_level_store(struct mddev *mddev, const char *buf, size_t len)
4092  {
4093  	unsigned int n;
4094  	int err;
4095  
4096  	err = kstrtouint(buf, 10, &n);
4097  	if (err < 0)
4098  		return err;
4099  	err = mddev_lock(mddev);
4100  	if (err)
4101  		return err;
4102  
4103  	mddev->new_level = n;
4104  	md_update_sb(mddev, 1);
4105  
4106  	mddev_unlock(mddev);
4107  	return len;
4108  }
4109  static struct md_sysfs_entry md_new_level =
4110  __ATTR(new_level, 0664, new_level_show, new_level_store);
4111  
4112  static ssize_t
4113  layout_show(struct mddev *mddev, char *page)
4114  {
4115  	/* just a number, not meaningful for all levels */
4116  	if (mddev->reshape_position != MaxSector &&
4117  	    mddev->layout != mddev->new_layout)
4118  		return sprintf(page, "%d (%d)\n",
4119  			       mddev->new_layout, mddev->layout);
4120  	return sprintf(page, "%d\n", mddev->layout);
4121  }
4122  
4123  static ssize_t
4124  layout_store(struct mddev *mddev, const char *buf, size_t len)
4125  {
4126  	unsigned int n;
4127  	int err;
4128  
4129  	err = kstrtouint(buf, 10, &n);
4130  	if (err < 0)
4131  		return err;
4132  	err = mddev_lock(mddev);
4133  	if (err)
4134  		return err;
4135  
4136  	if (mddev->pers) {
4137  		if (mddev->pers->check_reshape == NULL)
4138  			err = -EBUSY;
4139  		else if (!md_is_rdwr(mddev))
4140  			err = -EROFS;
4141  		else {
4142  			mddev->new_layout = n;
4143  			err = mddev->pers->check_reshape(mddev);
4144  			if (err)
4145  				mddev->new_layout = mddev->layout;
4146  		}
4147  	} else {
4148  		mddev->new_layout = n;
4149  		if (mddev->reshape_position == MaxSector)
4150  			mddev->layout = n;
4151  	}
4152  	mddev_unlock(mddev);
4153  	return err ?: len;
4154  }
4155  static struct md_sysfs_entry md_layout =
4156  __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
4157  
4158  static ssize_t
4159  raid_disks_show(struct mddev *mddev, char *page)
4160  {
4161  	if (mddev->raid_disks == 0)
4162  		return 0;
4163  	if (mddev->reshape_position != MaxSector &&
4164  	    mddev->delta_disks != 0)
4165  		return sprintf(page, "%d (%d)\n", mddev->raid_disks,
4166  			       mddev->raid_disks - mddev->delta_disks);
4167  	return sprintf(page, "%d\n", mddev->raid_disks);
4168  }
4169  
4170  static int update_raid_disks(struct mddev *mddev, int raid_disks);
4171  
4172  static ssize_t
4173  raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
4174  {
4175  	unsigned int n;
4176  	int err;
4177  
4178  	err = kstrtouint(buf, 10, &n);
4179  	if (err < 0)
4180  		return err;
4181  
4182  	err = mddev_lock(mddev);
4183  	if (err)
4184  		return err;
4185  	if (mddev->pers)
4186  		err = update_raid_disks(mddev, n);
4187  	else if (mddev->reshape_position != MaxSector) {
4188  		struct md_rdev *rdev;
4189  		int olddisks = mddev->raid_disks - mddev->delta_disks;
4190  
4191  		err = -EINVAL;
4192  		rdev_for_each(rdev, mddev) {
4193  			if (olddisks < n &&
4194  			    rdev->data_offset < rdev->new_data_offset)
4195  				goto out_unlock;
4196  			if (olddisks > n &&
4197  			    rdev->data_offset > rdev->new_data_offset)
4198  				goto out_unlock;
4199  		}
4200  		err = 0;
4201  		mddev->delta_disks = n - olddisks;
4202  		mddev->raid_disks = n;
4203  		mddev->reshape_backwards = (mddev->delta_disks < 0);
4204  	} else
4205  		mddev->raid_disks = n;
4206  out_unlock:
4207  	mddev_unlock(mddev);
4208  	return err ? err : len;
4209  }
4210  static struct md_sysfs_entry md_raid_disks =
4211  __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
4212  
4213  static ssize_t
4214  uuid_show(struct mddev *mddev, char *page)
4215  {
4216  	return sprintf(page, "%pU\n", mddev->uuid);
4217  }
4218  static struct md_sysfs_entry md_uuid =
4219  __ATTR(uuid, S_IRUGO, uuid_show, NULL);
4220  
4221  static ssize_t
4222  chunk_size_show(struct mddev *mddev, char *page)
4223  {
4224  	if (mddev->reshape_position != MaxSector &&
4225  	    mddev->chunk_sectors != mddev->new_chunk_sectors)
4226  		return sprintf(page, "%d (%d)\n",
4227  			       mddev->new_chunk_sectors << 9,
4228  			       mddev->chunk_sectors << 9);
4229  	return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
4230  }
4231  
4232  static ssize_t
4233  chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
4234  {
4235  	unsigned long n;
4236  	int err;
4237  
4238  	err = kstrtoul(buf, 10, &n);
4239  	if (err < 0)
4240  		return err;
4241  
4242  	err = mddev_lock(mddev);
4243  	if (err)
4244  		return err;
4245  	if (mddev->pers) {
4246  		if (mddev->pers->check_reshape == NULL)
4247  			err = -EBUSY;
4248  		else if (!md_is_rdwr(mddev))
4249  			err = -EROFS;
4250  		else {
4251  			mddev->new_chunk_sectors = n >> 9;
4252  			err = mddev->pers->check_reshape(mddev);
4253  			if (err)
4254  				mddev->new_chunk_sectors = mddev->chunk_sectors;
4255  		}
4256  	} else {
4257  		mddev->new_chunk_sectors = n >> 9;
4258  		if (mddev->reshape_position == MaxSector)
4259  			mddev->chunk_sectors = n >> 9;
4260  	}
4261  	mddev_unlock(mddev);
4262  	return err ?: len;
4263  }
4264  static struct md_sysfs_entry md_chunk_size =
4265  __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
4266  
4267  static ssize_t
4268  resync_start_show(struct mddev *mddev, char *page)
4269  {
4270  	if (mddev->recovery_cp == MaxSector)
4271  		return sprintf(page, "none\n");
4272  	return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
4273  }
4274  
4275  static ssize_t
4276  resync_start_store(struct mddev *mddev, const char *buf, size_t len)
4277  {
4278  	unsigned long long n;
4279  	int err;
4280  
4281  	if (cmd_match(buf, "none"))
4282  		n = MaxSector;
4283  	else {
4284  		err = kstrtoull(buf, 10, &n);
4285  		if (err < 0)
4286  			return err;
4287  		if (n != (sector_t)n)
4288  			return -EINVAL;
4289  	}
4290  
4291  	err = mddev_lock(mddev);
4292  	if (err)
4293  		return err;
4294  	if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
4295  		err = -EBUSY;
4296  
4297  	if (!err) {
4298  		mddev->recovery_cp = n;
4299  		if (mddev->pers)
4300  			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
4301  	}
4302  	mddev_unlock(mddev);
4303  	return err ?: len;
4304  }
4305  static struct md_sysfs_entry md_resync_start =
4306  __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR,
4307  		resync_start_show, resync_start_store);
4308  
4309  /*
4310   * The array state can be:
4311   *
4312   * clear
4313   *     No devices, no size, no level
4314   *     Equivalent to STOP_ARRAY ioctl
4315   * inactive
4316   *     May have some settings, but array is not active
4317   *        all IO results in error
4318   *     When written, doesn't tear down array, but just stops it
4319   * suspended (not supported yet)
4320   *     All IO requests will block. The array can be reconfigured.
4321   *     Writing this, if accepted, will block until array is quiescent
4322   * readonly
4323   *     no resync can happen.  no superblocks get written.
4324   *     write requests fail
4325   * read-auto
4326   *     like readonly, but behaves like 'clean' on a write request.
4327   *
4328   * clean - no pending writes, but otherwise active.
4329   *     When written to inactive array, starts without resync
4330   *     If a write request arrives then
4331   *       if metadata is known, mark 'dirty' and switch to 'active'.
4332   *       if not known, block and switch to write-pending
4333   *     If written to an active array that has pending writes, then fails.
4334   * active
4335   *     fully active: IO and resync can be happening.
4336   *     When written to inactive array, starts with resync
4337   *
4338   * write-pending
4339   *     clean, but writes are blocked waiting for 'active' to be written.
4340   *
4341   * active-idle
4342   *     like active, but no writes have been seen for a while (100msec).
4343   *
4344   * broken
4345   *     Array is failed. It's useful because mounted arrays aren't stopped
4346   *     when the array fails, so this state will at least alert the user that
4347   *     something is wrong.
4348   */
4349  enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
4350  		   write_pending, active_idle, broken, bad_word};
4351  static char *array_states[] = {
4352  	"clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
4353  	"write-pending", "active-idle", "broken", NULL };
4354  
4355  static int match_word(const char *word, char **list)
4356  {
4357  	int n;
4358  	for (n=0; list[n]; n++)
4359  		if (cmd_match(word, list[n]))
4360  			break;
4361  	return n;
4362  }
4363  
4364  static ssize_t
4365  array_state_show(struct mddev *mddev, char *page)
4366  {
4367  	enum array_state st = inactive;
4368  
4369  	if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) {
4370  		switch(mddev->ro) {
4371  		case MD_RDONLY:
4372  			st = readonly;
4373  			break;
4374  		case MD_AUTO_READ:
4375  			st = read_auto;
4376  			break;
4377  		case MD_RDWR:
4378  			spin_lock(&mddev->lock);
4379  			if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
4380  				st = write_pending;
4381  			else if (mddev->in_sync)
4382  				st = clean;
4383  			else if (mddev->safemode)
4384  				st = active_idle;
4385  			else
4386  				st = active;
4387  			spin_unlock(&mddev->lock);
4388  		}
4389  
4390  		if (test_bit(MD_BROKEN, &mddev->flags) && st == clean)
4391  			st = broken;
4392  	} else {
4393  		if (list_empty(&mddev->disks) &&
4394  		    mddev->raid_disks == 0 &&
4395  		    mddev->dev_sectors == 0)
4396  			st = clear;
4397  		else
4398  			st = inactive;
4399  	}
4400  	return sprintf(page, "%s\n", array_states[st]);
4401  }
4402  
4403  static int do_md_stop(struct mddev *mddev, int ro);
4404  static int md_set_readonly(struct mddev *mddev);
4405  static int restart_array(struct mddev *mddev);
4406  
4407  static ssize_t
4408  array_state_store(struct mddev *mddev, const char *buf, size_t len)
4409  {
4410  	int err = 0;
4411  	enum array_state st = match_word(buf, array_states);
4412  
4413  	/* No lock dependent actions */
4414  	switch (st) {
4415  	case suspended:		/* not supported yet */
4416  	case write_pending:	/* cannot be set */
4417  	case active_idle:	/* cannot be set */
4418  	case broken:		/* cannot be set */
4419  	case bad_word:
4420  		return -EINVAL;
4421  	case clear:
4422  	case readonly:
4423  	case inactive:
4424  	case read_auto:
4425  		if (!mddev->pers || !md_is_rdwr(mddev))
4426  			break;
4427  		/* writes via sysfs do not open the mddev, so the opener count should be 0 */
4428  		err = mddev_set_closing_and_sync_blockdev(mddev, 0);
4429  		if (err)
4430  			return err;
4431  		break;
4432  	default:
4433  		break;
4434  	}
4435  
4436  	if (mddev->pers && (st == active || st == clean) &&
4437  	    mddev->ro != MD_RDONLY) {
4438  		/* don't take reconfig_mutex when toggling between
4439  		 * clean and active
4440  		 */
4441  		spin_lock(&mddev->lock);
4442  		if (st == active) {
4443  			restart_array(mddev);
4444  			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4445  			md_wakeup_thread(mddev->thread);
4446  			wake_up(&mddev->sb_wait);
4447  		} else /* st == clean */ {
4448  			restart_array(mddev);
4449  			if (!set_in_sync(mddev))
4450  				err = -EBUSY;
4451  		}
4452  		if (!err)
4453  			sysfs_notify_dirent_safe(mddev->sysfs_state);
4454  		spin_unlock(&mddev->lock);
4455  		return err ?: len;
4456  	}
4457  	err = mddev_lock(mddev);
4458  	if (err)
4459  		return err;
4460  
4461  	switch (st) {
4462  	case inactive:
4463  		/* stop an active array, return 0 otherwise */
4464  		if (mddev->pers)
4465  			err = do_md_stop(mddev, 2);
4466  		break;
4467  	case clear:
4468  		err = do_md_stop(mddev, 0);
4469  		break;
4470  	case readonly:
4471  		if (mddev->pers)
4472  			err = md_set_readonly(mddev);
4473  		else {
4474  			mddev->ro = MD_RDONLY;
4475  			set_disk_ro(mddev->gendisk, 1);
4476  			err = do_md_run(mddev);
4477  		}
4478  		break;
4479  	case read_auto:
4480  		if (mddev->pers) {
4481  			if (md_is_rdwr(mddev))
4482  				err = md_set_readonly(mddev);
4483  			else if (mddev->ro == MD_RDONLY)
4484  				err = restart_array(mddev);
4485  			if (err == 0) {
4486  				mddev->ro = MD_AUTO_READ;
4487  				set_disk_ro(mddev->gendisk, 0);
4488  			}
4489  		} else {
4490  			mddev->ro = MD_AUTO_READ;
4491  			err = do_md_run(mddev);
4492  		}
4493  		break;
4494  	case clean:
4495  		if (mddev->pers) {
4496  			err = restart_array(mddev);
4497  			if (err)
4498  				break;
4499  			spin_lock(&mddev->lock);
4500  			if (!set_in_sync(mddev))
4501  				err = -EBUSY;
4502  			spin_unlock(&mddev->lock);
4503  		} else
4504  			err = -EINVAL;
4505  		break;
4506  	case active:
4507  		if (mddev->pers) {
4508  			err = restart_array(mddev);
4509  			if (err)
4510  				break;
4511  			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4512  			wake_up(&mddev->sb_wait);
4513  			err = 0;
4514  		} else {
4515  			mddev->ro = MD_RDWR;
4516  			set_disk_ro(mddev->gendisk, 0);
4517  			err = do_md_run(mddev);
4518  		}
4519  		break;
4520  	default:
4521  		err = -EINVAL;
4522  		break;
4523  	}
4524  
4525  	if (!err) {
4526  		if (mddev->hold_active == UNTIL_IOCTL)
4527  			mddev->hold_active = 0;
4528  		sysfs_notify_dirent_safe(mddev->sysfs_state);
4529  	}
4530  	mddev_unlock(mddev);
4531  
4532  	if (st == readonly || st == read_auto || st == inactive ||
4533  	    (err && st == clear))
4534  		clear_bit(MD_CLOSING, &mddev->flags);
4535  
4536  	return err ?: len;
4537  }
4538  static struct md_sysfs_entry md_array_state =
4539  __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
4540  
4541  static ssize_t
4542  max_corrected_read_errors_show(struct mddev *mddev, char *page) {
4543  	return sprintf(page, "%d\n",
4544  		       atomic_read(&mddev->max_corr_read_errors));
4545  }
4546  
4547  static ssize_t
4548  max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
4549  {
4550  	unsigned int n;
4551  	int rv;
4552  
4553  	rv = kstrtouint(buf, 10, &n);
4554  	if (rv < 0)
4555  		return rv;
4556  	if (n > INT_MAX)
4557  		return -EINVAL;
4558  	atomic_set(&mddev->max_corr_read_errors, n);
4559  	return len;
4560  }
4561  
4562  static struct md_sysfs_entry max_corr_read_errors =
4563  __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
4564  	max_corrected_read_errors_store);
4565  
4566  static ssize_t
4567  null_show(struct mddev *mddev, char *page)
4568  {
4569  	return -EINVAL;
4570  }
4571  
4572  static ssize_t
4573  new_dev_store(struct mddev *mddev, const char *buf, size_t len)
4574  {
4575  	/* buf must be %d:%d\n? giving major and minor numbers */
4576  	/* The new device is added to the array.
4577  	 * If the array has a persistent superblock, we read the
4578  	 * superblock to initialise info and check validity.
4579  	 * Otherwise, only checking done is that in bind_rdev_to_array,
4580  	 * which mainly checks size.
4581  	 */
4582  	char *e;
4583  	int major = simple_strtoul(buf, &e, 10);
4584  	int minor;
4585  	dev_t dev;
4586  	struct md_rdev *rdev;
4587  	int err;
4588  
4589  	if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
4590  		return -EINVAL;
4591  	minor = simple_strtoul(e+1, &e, 10);
4592  	if (*e && *e != '\n')
4593  		return -EINVAL;
4594  	dev = MKDEV(major, minor);
4595  	if (major != MAJOR(dev) ||
4596  	    minor != MINOR(dev))
4597  		return -EOVERFLOW;
4598  
4599  	err = mddev_suspend_and_lock(mddev);
4600  	if (err)
4601  		return err;
4602  	if (mddev->persistent) {
4603  		rdev = md_import_device(dev, mddev->major_version,
4604  					mddev->minor_version);
4605  		if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
4606  			struct md_rdev *rdev0
4607  				= list_entry(mddev->disks.next,
4608  					     struct md_rdev, same_set);
4609  			err = super_types[mddev->major_version]
4610  				.load_super(rdev, rdev0, mddev->minor_version);
4611  			if (err < 0)
4612  				goto out;
4613  		}
4614  	} else if (mddev->external)
4615  		rdev = md_import_device(dev, -2, -1);
4616  	else
4617  		rdev = md_import_device(dev, -1, -1);
4618  
4619  	if (IS_ERR(rdev)) {
4620  		mddev_unlock_and_resume(mddev);
4621  		return PTR_ERR(rdev);
4622  	}
4623  	err = bind_rdev_to_array(rdev, mddev);
4624   out:
4625  	if (err)
4626  		export_rdev(rdev, mddev);
4627  	mddev_unlock_and_resume(mddev);
4628  	if (!err)
4629  		md_new_event();
4630  	return err ? err : len;
4631  }
4632  
4633  static struct md_sysfs_entry md_new_device =
4634  __ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
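/*
 * Illustrative usage (editorial sketch, path and device numbers assumed):
 * new_dev takes the "major:minor" of the block device to add, e.g. 8:17
 * for /dev/sdb1 on most systems:
 *
 *   echo 8:17 > /sys/block/md0/md/new_dev
 */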
4635  
4636  static ssize_t
4637  bitmap_store(struct mddev *mddev, const char *buf, size_t len)
4638  {
4639  	char *end;
4640  	unsigned long chunk, end_chunk;
4641  	int err;
4642  
4643  	err = mddev_lock(mddev);
4644  	if (err)
4645  		return err;
4646  	if (!mddev->bitmap)
4647  		goto out;
4648  	/* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
4649  	while (*buf) {
4650  		chunk = end_chunk = simple_strtoul(buf, &end, 0);
4651  		if (buf == end)
4652  			break;
4653  
4654  		if (*end == '-') { /* range */
4655  			buf = end + 1;
4656  			end_chunk = simple_strtoul(buf, &end, 0);
4657  			if (buf == end)
4658  				break;
4659  		}
4660  
4661  		if (*end && !isspace(*end))
4662  			break;
4663  
4664  		mddev->bitmap_ops->dirty_bits(mddev, chunk, end_chunk);
4665  		buf = skip_spaces(end);
4666  	}
4667  	mddev->bitmap_ops->unplug(mddev, true); /* flush the bits to disk */
4668  out:
4669  	mddev_unlock(mddev);
4670  	return len;
4671  }
4672  
4673  static struct md_sysfs_entry md_bitmap =
4674  __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
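/*
 * Illustrative usage (editorial sketch, path assumed): bitmap_set_bits
 * accepts whitespace-separated chunk numbers or "first-last" ranges and
 * marks them dirty in the write-intent bitmap:
 *
 *   echo "100-200 500" > /sys/block/md0/md/bitmap_set_bits
 */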
4675  
4676  static ssize_t
4677  size_show(struct mddev *mddev, char *page)
4678  {
4679  	return sprintf(page, "%llu\n",
4680  		(unsigned long long)mddev->dev_sectors / 2);
4681  }
4682  
4683  static int update_size(struct mddev *mddev, sector_t num_sectors);
4684  
4685  static ssize_t
4686  size_store(struct mddev *mddev, const char *buf, size_t len)
4687  {
4688  	/* If array is inactive, we can reduce the component size, but
4689  	 * not increase it (except from 0).
4690  	 * If array is active, we can try an on-line resize
4691  	 */
4692  	sector_t sectors;
4693  	int err = strict_blocks_to_sectors(buf, &sectors);
4694  
4695  	if (err < 0)
4696  		return err;
4697  	err = mddev_lock(mddev);
4698  	if (err)
4699  		return err;
4700  	if (mddev->pers) {
4701  		err = update_size(mddev, sectors);
4702  		if (err == 0)
4703  			md_update_sb(mddev, 1);
4704  	} else {
4705  		if (mddev->dev_sectors == 0 ||
4706  		    mddev->dev_sectors > sectors)
4707  			mddev->dev_sectors = sectors;
4708  		else
4709  			err = -ENOSPC;
4710  	}
4711  	mddev_unlock(mddev);
4712  	return err ? err : len;
4713  }
4714  
4715  static struct md_sysfs_entry md_size =
4716  __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
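/*
 * Illustrative usage (editorial sketch, path assumed): component_size is
 * expressed in 1 KiB blocks (the code converts blocks to 512-byte
 * sectors and back):
 *
 *   echo 1048576 > /sys/block/md0/md/component_size   # 1 GiB per device
 *   cat /sys/block/md0/md/component_size
 */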
4717  
4718  /* Metadata version.
4719   * This is one of
4720   *   'none' for arrays with no metadata (good luck...)
4721   *   'external' for arrays with externally managed metadata,
4722   * or N.M for internally known formats
4723   */
4724  static ssize_t
4725  metadata_show(struct mddev *mddev, char *page)
4726  {
4727  	if (mddev->persistent)
4728  		return sprintf(page, "%d.%d\n",
4729  			       mddev->major_version, mddev->minor_version);
4730  	else if (mddev->external)
4731  		return sprintf(page, "external:%s\n", mddev->metadata_type);
4732  	else
4733  		return sprintf(page, "none\n");
4734  }
4735  
4736  static ssize_t
4737  metadata_store(struct mddev *mddev, const char *buf, size_t len)
4738  {
4739  	int major, minor;
4740  	char *e;
4741  	int err;
4742  	/* Changing the details of 'external' metadata is
4743  	 * always permitted.  Otherwise there must be
4744  	 * no devices attached to the array.
4745  	 */
4746  
4747  	err = mddev_lock(mddev);
4748  	if (err)
4749  		return err;
4750  	err = -EBUSY;
4751  	if (mddev->external && strncmp(buf, "external:", 9) == 0)
4752  		;
4753  	else if (!list_empty(&mddev->disks))
4754  		goto out_unlock;
4755  
4756  	err = 0;
4757  	if (cmd_match(buf, "none")) {
4758  		mddev->persistent = 0;
4759  		mddev->external = 0;
4760  		mddev->major_version = 0;
4761  		mddev->minor_version = 90;
4762  		goto out_unlock;
4763  	}
4764  	if (strncmp(buf, "external:", 9) == 0) {
4765  		size_t namelen = len-9;
4766  		if (namelen >= sizeof(mddev->metadata_type))
4767  			namelen = sizeof(mddev->metadata_type)-1;
4768  		memcpy(mddev->metadata_type, buf+9, namelen);
4769  		mddev->metadata_type[namelen] = 0;
4770  		if (namelen && mddev->metadata_type[namelen-1] == '\n')
4771  			mddev->metadata_type[--namelen] = 0;
4772  		mddev->persistent = 0;
4773  		mddev->external = 1;
4774  		mddev->major_version = 0;
4775  		mddev->minor_version = 90;
4776  		goto out_unlock;
4777  	}
4778  	major = simple_strtoul(buf, &e, 10);
4779  	err = -EINVAL;
4780  	if (e==buf || *e != '.')
4781  		goto out_unlock;
4782  	buf = e+1;
4783  	minor = simple_strtoul(buf, &e, 10);
4784  	if (e==buf || (*e && *e != '\n') )
4785  		goto out_unlock;
4786  	err = -ENOENT;
4787  	if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
4788  		goto out_unlock;
4789  	mddev->major_version = major;
4790  	mddev->minor_version = minor;
4791  	mddev->persistent = 1;
4792  	mddev->external = 0;
4793  	err = 0;
4794  out_unlock:
4795  	mddev_unlock(mddev);
4796  	return err ?: len;
4797  }
4798  
4799  static struct md_sysfs_entry md_metadata =
4800  __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
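/*
 * Illustrative values (editorial sketch): "1.2" selects in-kernel v1.2
 * metadata, "external:imsm" (type name assumed) marks externally managed
 * metadata, and "none" means no persistent superblock.  Except for
 * adjusting an existing "external:" string, this can only be written
 * while no devices are attached to the array.
 */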
4801  
4802  enum sync_action md_sync_action(struct mddev *mddev)
4803  {
4804  	unsigned long recovery = mddev->recovery;
4805  
4806  	/*
4807  	 * frozen has the highest priority, means running sync_thread will be
4808  	 * stopped immediately, and no new sync_thread can start.
4809  	 */
4810  	if (test_bit(MD_RECOVERY_FROZEN, &recovery))
4811  		return ACTION_FROZEN;
4812  
4813  	/*
4814  	 * read-only array can't register sync_thread, and it can only
4815  	 * add/remove spares.
4816  	 */
4817  	if (!md_is_rdwr(mddev))
4818  		return ACTION_IDLE;
4819  
4820  	/*
4821  	 * idle means no sync_thread is running, and no new sync_thread is
4822  	 * requested.
4823  	 */
4824  	if (!test_bit(MD_RECOVERY_RUNNING, &recovery) &&
4825  	    !test_bit(MD_RECOVERY_NEEDED, &recovery))
4826  		return ACTION_IDLE;
4827  
4828  	if (test_bit(MD_RECOVERY_RESHAPE, &recovery) ||
4829  	    mddev->reshape_position != MaxSector)
4830  		return ACTION_RESHAPE;
4831  
4832  	if (test_bit(MD_RECOVERY_RECOVER, &recovery))
4833  		return ACTION_RECOVER;
4834  
4835  	if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
4836  		/*
4837  		 * MD_RECOVERY_CHECK must be paired with
4838  		 * MD_RECOVERY_REQUESTED.
4839  		 */
4840  		if (test_bit(MD_RECOVERY_CHECK, &recovery))
4841  			return ACTION_CHECK;
4842  		if (test_bit(MD_RECOVERY_REQUESTED, &recovery))
4843  			return ACTION_REPAIR;
4844  		return ACTION_RESYNC;
4845  	}
4846  
4847  	/*
4848  	 * MD_RECOVERY_NEEDED or MD_RECOVERY_RUNNING is set, however, no
4849  	 * sync_action is specified.
4850  	 */
4851  	return ACTION_IDLE;
4852  }
4853  
4854  enum sync_action md_sync_action_by_name(const char *page)
4855  {
4856  	enum sync_action action;
4857  
4858  	for (action = 0; action < NR_SYNC_ACTIONS; ++action) {
4859  		if (cmd_match(page, action_name[action]))
4860  			return action;
4861  	}
4862  
4863  	return NR_SYNC_ACTIONS;
4864  }
4865  
4866  const char *md_sync_action_name(enum sync_action action)
4867  {
4868  	return action_name[action];
4869  }
4870  
4871  static ssize_t
4872  action_show(struct mddev *mddev, char *page)
4873  {
4874  	enum sync_action action = md_sync_action(mddev);
4875  
4876  	return sprintf(page, "%s\n", md_sync_action_name(action));
4877  }
4878  
4879  /**
4880   * stop_sync_thread() - wait for sync_thread to stop if it's running.
4881   * @mddev:	the array.
4882   * @locked:	if set, reconfig_mutex will still be held after this function
4883   *		returns; if not set, reconfig_mutex will be released after this
4884   *		function returns.
4885   */
4886  static void stop_sync_thread(struct mddev *mddev, bool locked)
4887  {
4888  	int sync_seq = atomic_read(&mddev->sync_seq);
4889  
4890  	if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
4891  		if (!locked)
4892  			mddev_unlock(mddev);
4893  		return;
4894  	}
4895  
4896  	mddev_unlock(mddev);
4897  
4898  	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4899  	/*
4900  	 * Thread might be blocked waiting for metadata update which will now
4901  	 * never happen
4902  	 */
4903  	md_wakeup_thread_directly(mddev->sync_thread);
4904  	if (work_pending(&mddev->sync_work))
4905  		flush_work(&mddev->sync_work);
4906  
4907  	wait_event(resync_wait,
4908  		   !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
4909  		   (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery) &&
4910  		    sync_seq != atomic_read(&mddev->sync_seq)));
4911  
4912  	if (locked)
4913  		mddev_lock_nointr(mddev);
4914  }
4915  
4916  void md_idle_sync_thread(struct mddev *mddev)
4917  {
4918  	lockdep_assert_held(&mddev->reconfig_mutex);
4919  
4920  	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4921  	stop_sync_thread(mddev, true);
4922  }
4923  EXPORT_SYMBOL_GPL(md_idle_sync_thread);
4924  
4925  void md_frozen_sync_thread(struct mddev *mddev)
4926  {
4927  	lockdep_assert_held(&mddev->reconfig_mutex);
4928  
4929  	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4930  	stop_sync_thread(mddev, true);
4931  }
4932  EXPORT_SYMBOL_GPL(md_frozen_sync_thread);
4933  
4934  void md_unfrozen_sync_thread(struct mddev *mddev)
4935  {
4936  	lockdep_assert_held(&mddev->reconfig_mutex);
4937  
4938  	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4939  	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4940  	md_wakeup_thread(mddev->thread);
4941  	sysfs_notify_dirent_safe(mddev->sysfs_action);
4942  }
4943  EXPORT_SYMBOL_GPL(md_unfrozen_sync_thread);
4944  
4945  static int mddev_start_reshape(struct mddev *mddev)
4946  {
4947  	int ret;
4948  
4949  	if (mddev->pers->start_reshape == NULL)
4950  		return -EINVAL;
4951  
4952  	if (mddev->reshape_position == MaxSector ||
4953  	    mddev->pers->check_reshape == NULL ||
4954  	    mddev->pers->check_reshape(mddev)) {
4955  		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4956  		ret = mddev->pers->start_reshape(mddev);
4957  		if (ret)
4958  			return ret;
4959  	} else {
4960  		/*
4961  		 * If reshape is still in progress, and md_check_recovery() can
4962  		 * continue to reshape, don't restart reshape because data can
4963  		 * be corrupted for raid456.
4964  		 */
4965  		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4966  	}
4967  
4968  	sysfs_notify_dirent_safe(mddev->sysfs_degraded);
4969  	return 0;
4970  }
4971  
4972  static ssize_t
4973  action_store(struct mddev *mddev, const char *page, size_t len)
4974  {
4975  	int ret;
4976  	enum sync_action action;
4977  
4978  	if (!mddev->pers || !mddev->pers->sync_request)
4979  		return -EINVAL;
4980  
4981  retry:
4982  	if (work_busy(&mddev->sync_work))
4983  		flush_work(&mddev->sync_work);
4984  
4985  	ret = mddev_lock(mddev);
4986  	if (ret)
4987  		return ret;
4988  
4989  	if (work_busy(&mddev->sync_work)) {
4990  		mddev_unlock(mddev);
4991  		goto retry;
4992  	}
4993  
4994  	action = md_sync_action_by_name(page);
4995  
4996  	/* TODO: mdadm relies on "idle" to start sync_thread. */
4997  	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
4998  		switch (action) {
4999  		case ACTION_FROZEN:
5000  			md_frozen_sync_thread(mddev);
5001  			ret = len;
5002  			goto out;
5003  		case ACTION_IDLE:
5004  			md_idle_sync_thread(mddev);
5005  			break;
5006  		case ACTION_RESHAPE:
5007  		case ACTION_RECOVER:
5008  		case ACTION_CHECK:
5009  		case ACTION_REPAIR:
5010  		case ACTION_RESYNC:
5011  			ret = -EBUSY;
5012  			goto out;
5013  		default:
5014  			ret = -EINVAL;
5015  			goto out;
5016  		}
5017  	} else {
5018  		switch (action) {
5019  		case ACTION_FROZEN:
5020  			set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5021  			ret = len;
5022  			goto out;
5023  		case ACTION_RESHAPE:
5024  			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5025  			ret = mddev_start_reshape(mddev);
5026  			if (ret)
5027  				goto out;
5028  			break;
5029  		case ACTION_RECOVER:
5030  			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5031  			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5032  			break;
5033  		case ACTION_CHECK:
5034  			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
5035  			fallthrough;
5036  		case ACTION_REPAIR:
5037  			set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
5038  			set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5039  			fallthrough;
5040  		case ACTION_RESYNC:
5041  		case ACTION_IDLE:
5042  			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5043  			break;
5044  		default:
5045  			ret = -EINVAL;
5046  			goto out;
5047  		}
5048  	}
5049  
5050  	if (mddev->ro == MD_AUTO_READ) {
5051  		/* A write to sync_action is enough to justify
5052  		 * canceling read-auto mode
5053  		 */
5054  		mddev->ro = MD_RDWR;
5055  		md_wakeup_thread(mddev->sync_thread);
5056  	}
5057  
5058  	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5059  	md_wakeup_thread(mddev->thread);
5060  	sysfs_notify_dirent_safe(mddev->sysfs_action);
5061  	ret = len;
5062  
5063  out:
5064  	mddev_unlock(mddev);
5065  	return ret;
5066  }
5067  
5068  static struct md_sysfs_entry md_scan_mode =
5069  __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
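/*
 * Illustrative sketch (not part of the original source): action_store() is
 * reached when userspace writes one of the action names handled above
 * ("idle", "frozen", "check", "repair", "resync", "recover", "reshape") to
 * the per-array sysfs attribute, e.g. something like:
 *
 *     echo check > /sys/block/md0/md/sync_action
 *
 * The "md0" device name is only an example; the handler itself does not
 * depend on which array the attribute belongs to.
 */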
5070  
5071  static ssize_t
5072  last_sync_action_show(struct mddev *mddev, char *page)
5073  {
5074  	return sprintf(page, "%s\n",
5075  		       md_sync_action_name(mddev->last_sync_action));
5076  }
5077  
5078  static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action);
5079  
5080  static ssize_t
5081  mismatch_cnt_show(struct mddev *mddev, char *page)
5082  {
5083  	return sprintf(page, "%llu\n",
5084  		       (unsigned long long)
5085  		       atomic64_read(&mddev->resync_mismatches));
5086  }
5087  
5088  static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
5089  
5090  static ssize_t
5091  sync_min_show(struct mddev *mddev, char *page)
5092  {
5093  	return sprintf(page, "%d (%s)\n", speed_min(mddev),
5094  		       mddev->sync_speed_min ? "local": "system");
5095  }
5096  
5097  static ssize_t
5098  sync_min_store(struct mddev *mddev, const char *buf, size_t len)
5099  {
5100  	unsigned int min;
5101  	int rv;
5102  
5103  	if (strncmp(buf, "system", 6)==0) {
5104  		min = 0;
5105  	} else {
5106  		rv = kstrtouint(buf, 10, &min);
5107  		if (rv < 0)
5108  			return rv;
5109  		if (min == 0)
5110  			return -EINVAL;
5111  	}
5112  	mddev->sync_speed_min = min;
5113  	return len;
5114  }
5115  
5116  static struct md_sysfs_entry md_sync_min =
5117  __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
5118  
5119  static ssize_t
5120  sync_max_show(struct mddev *mddev, char *page)
5121  {
5122  	return sprintf(page, "%d (%s)\n", speed_max(mddev),
5123  		       mddev->sync_speed_max ? "local": "system");
5124  }
5125  
5126  static ssize_t
5127  sync_max_store(struct mddev *mddev, const char *buf, size_t len)
5128  {
5129  	unsigned int max;
5130  	int rv;
5131  
5132  	if (strncmp(buf, "system", 6)==0) {
5133  		max = 0;
5134  	} else {
5135  		rv = kstrtouint(buf, 10, &max);
5136  		if (rv < 0)
5137  			return rv;
5138  		if (max == 0)
5139  			return -EINVAL;
5140  	}
5141  	mddev->sync_speed_max = max;
5142  	return len;
5143  }
5144  
5145  static struct md_sysfs_entry md_sync_max =
5146  __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
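/*
 * Illustrative note (assumption, not from the original source): per the two
 * store handlers above, sync_speed_min/sync_speed_max each accept either the
 * literal string "system" (fall back to the global limits) or a non-zero
 * integer, conventionally in KiB/s, e.g.:
 *
 *     echo 50000  > /sys/block/md0/md/sync_speed_max
 *     echo system > /sys/block/md0/md/sync_speed_min
 */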
5147  
5148  static ssize_t
5149  degraded_show(struct mddev *mddev, char *page)
5150  {
5151  	return sprintf(page, "%d\n", mddev->degraded);
5152  }
5153  static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
5154  
5155  static ssize_t
5156  sync_force_parallel_show(struct mddev *mddev, char *page)
5157  {
5158  	return sprintf(page, "%d\n", mddev->parallel_resync);
5159  }
5160  
5161  static ssize_t
5162  sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
5163  {
5164  	long n;
5165  
5166  	if (kstrtol(buf, 10, &n))
5167  		return -EINVAL;
5168  
5169  	if (n != 0 && n != 1)
5170  		return -EINVAL;
5171  
5172  	mddev->parallel_resync = n;
5173  
5174  	if (mddev->sync_thread)
5175  		wake_up(&resync_wait);
5176  
5177  	return len;
5178  }
5179  
5180  /* force parallel resync, even with shared block devices */
5181  static struct md_sysfs_entry md_sync_force_parallel =
5182  __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
5183         sync_force_parallel_show, sync_force_parallel_store);
5184  
5185  static ssize_t
5186  sync_speed_show(struct mddev *mddev, char *page)
5187  {
5188  	unsigned long resync, dt, db;
5189  	if (mddev->curr_resync == MD_RESYNC_NONE)
5190  		return sprintf(page, "none\n");
5191  	resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
5192  	dt = (jiffies - mddev->resync_mark) / HZ;
5193  	if (!dt) dt++;
5194  	db = resync - mddev->resync_mark_cnt;
5195  	return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
5196  }
5197  
5198  static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
5199  
5200  static ssize_t
5201  sync_completed_show(struct mddev *mddev, char *page)
5202  {
5203  	unsigned long long max_sectors, resync;
5204  
5205  	if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5206  		return sprintf(page, "none\n");
5207  
5208  	if (mddev->curr_resync == MD_RESYNC_YIELDED ||
5209  	    mddev->curr_resync == MD_RESYNC_DELAYED)
5210  		return sprintf(page, "delayed\n");
5211  
5212  	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
5213  	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
5214  		max_sectors = mddev->resync_max_sectors;
5215  	else
5216  		max_sectors = mddev->dev_sectors;
5217  
5218  	resync = mddev->curr_resync_completed;
5219  	return sprintf(page, "%llu / %llu\n", resync, max_sectors);
5220  }
5221  
5222  static struct md_sysfs_entry md_sync_completed =
5223  	__ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL);
5224  
5225  static ssize_t
5226  min_sync_show(struct mddev *mddev, char *page)
5227  {
5228  	return sprintf(page, "%llu\n",
5229  		       (unsigned long long)mddev->resync_min);
5230  }
5231  static ssize_t
5232  min_sync_store(struct mddev *mddev, const char *buf, size_t len)
5233  {
5234  	unsigned long long min;
5235  	int err;
5236  
5237  	if (kstrtoull(buf, 10, &min))
5238  		return -EINVAL;
5239  
5240  	spin_lock(&mddev->lock);
5241  	err = -EINVAL;
5242  	if (min > mddev->resync_max)
5243  		goto out_unlock;
5244  
5245  	err = -EBUSY;
5246  	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5247  		goto out_unlock;
5248  
5249  	/* Round down to multiple of 4K for safety */
5250  	mddev->resync_min = round_down(min, 8);
5251  	err = 0;
5252  
5253  out_unlock:
5254  	spin_unlock(&mddev->lock);
5255  	return err ?: len;
5256  }
5257  
5258  static struct md_sysfs_entry md_min_sync =
5259  __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
5260  
5261  static ssize_t
5262  max_sync_show(struct mddev *mddev, char *page)
5263  {
5264  	if (mddev->resync_max == MaxSector)
5265  		return sprintf(page, "max\n");
5266  	else
5267  		return sprintf(page, "%llu\n",
5268  			       (unsigned long long)mddev->resync_max);
5269  }
5270  static ssize_t
5271  max_sync_store(struct mddev *mddev, const char *buf, size_t len)
5272  {
5273  	int err;
5274  	spin_lock(&mddev->lock);
5275  	if (strncmp(buf, "max", 3) == 0)
5276  		mddev->resync_max = MaxSector;
5277  	else {
5278  		unsigned long long max;
5279  		int chunk;
5280  
5281  		err = -EINVAL;
5282  		if (kstrtoull(buf, 10, &max))
5283  			goto out_unlock;
5284  		if (max < mddev->resync_min)
5285  			goto out_unlock;
5286  
5287  		err = -EBUSY;
5288  		if (max < mddev->resync_max && md_is_rdwr(mddev) &&
5289  		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5290  			goto out_unlock;
5291  
5292  		/* Must be a multiple of chunk_size */
5293  		chunk = mddev->chunk_sectors;
5294  		if (chunk) {
5295  			sector_t temp = max;
5296  
5297  			err = -EINVAL;
5298  			if (sector_div(temp, chunk))
5299  				goto out_unlock;
5300  		}
5301  		mddev->resync_max = max;
5302  	}
5303  	wake_up(&mddev->recovery_wait);
5304  	err = 0;
5305  out_unlock:
5306  	spin_unlock(&mddev->lock);
5307  	return err ?: len;
5308  }
5309  
5310  static struct md_sysfs_entry md_max_sync =
5311  __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
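/*
 * Illustrative note (not from the original source): sync_min/sync_max bound
 * the sector range a requested resync may cover.  The store handlers above
 * require sync_min <= sync_max, round sync_min down to a 4K multiple, and
 * accept the literal "max" for sync_max (any other value must be a multiple
 * of the chunk size when one is set), e.g.:
 *
 *     echo 0   > /sys/block/md0/md/sync_min
 *     echo max > /sys/block/md0/md/sync_max
 */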
5312  
5313  static ssize_t
5314  suspend_lo_show(struct mddev *mddev, char *page)
5315  {
5316  	return sprintf(page, "%llu\n",
5317  		       (unsigned long long)READ_ONCE(mddev->suspend_lo));
5318  }
5319  
5320  static ssize_t
5321  suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
5322  {
5323  	unsigned long long new;
5324  	int err;
5325  
5326  	err = kstrtoull(buf, 10, &new);
5327  	if (err < 0)
5328  		return err;
5329  	if (new != (sector_t)new)
5330  		return -EINVAL;
5331  
5332  	err = mddev_suspend(mddev, true);
5333  	if (err)
5334  		return err;
5335  
5336  	WRITE_ONCE(mddev->suspend_lo, new);
5337  	mddev_resume(mddev);
5338  
5339  	return len;
5340  }
5341  static struct md_sysfs_entry md_suspend_lo =
5342  __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
5343  
5344  static ssize_t
5345  suspend_hi_show(struct mddev *mddev, char *page)
5346  {
5347  	return sprintf(page, "%llu\n",
5348  		       (unsigned long long)READ_ONCE(mddev->suspend_hi));
5349  }
5350  
5351  static ssize_t
5352  suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
5353  {
5354  	unsigned long long new;
5355  	int err;
5356  
5357  	err = kstrtoull(buf, 10, &new);
5358  	if (err < 0)
5359  		return err;
5360  	if (new != (sector_t)new)
5361  		return -EINVAL;
5362  
5363  	err = mddev_suspend(mddev, true);
5364  	if (err)
5365  		return err;
5366  
5367  	WRITE_ONCE(mddev->suspend_hi, new);
5368  	mddev_resume(mddev);
5369  
5370  	return len;
5371  }
5372  static struct md_sysfs_entry md_suspend_hi =
5373  __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
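/*
 * Illustrative note (not from the original source): as both store handlers
 * above show, updating suspend_lo/suspend_hi briefly suspends the array
 * (mddev_suspend()/mddev_resume()) around the WRITE_ONCE() update, so the
 * new boundary takes effect without racing against in-flight I/O.
 */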
5374  
5375  static ssize_t
5376  reshape_position_show(struct mddev *mddev, char *page)
5377  {
5378  	if (mddev->reshape_position != MaxSector)
5379  		return sprintf(page, "%llu\n",
5380  			       (unsigned long long)mddev->reshape_position);
5381  	strcpy(page, "none\n");
5382  	return 5;
5383  }
5384  
5385  static ssize_t
5386  reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
5387  {
5388  	struct md_rdev *rdev;
5389  	unsigned long long new;
5390  	int err;
5391  
5392  	err = kstrtoull(buf, 10, &new);
5393  	if (err < 0)
5394  		return err;
5395  	if (new != (sector_t)new)
5396  		return -EINVAL;
5397  	err = mddev_lock(mddev);
5398  	if (err)
5399  		return err;
5400  	err = -EBUSY;
5401  	if (mddev->pers)
5402  		goto unlock;
5403  	mddev->reshape_position = new;
5404  	mddev->delta_disks = 0;
5405  	mddev->reshape_backwards = 0;
5406  	mddev->new_level = mddev->level;
5407  	mddev->new_layout = mddev->layout;
5408  	mddev->new_chunk_sectors = mddev->chunk_sectors;
5409  	rdev_for_each(rdev, mddev)
5410  		rdev->new_data_offset = rdev->data_offset;
5411  	err = 0;
5412  unlock:
5413  	mddev_unlock(mddev);
5414  	return err ?: len;
5415  }
5416  
5417  static struct md_sysfs_entry md_reshape_position =
5418  __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
5419         reshape_position_store);
5420  
5421  static ssize_t
5422  reshape_direction_show(struct mddev *mddev, char *page)
5423  {
5424  	return sprintf(page, "%s\n",
5425  		       mddev->reshape_backwards ? "backwards" : "forwards");
5426  }
5427  
5428  static ssize_t
5429  reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
5430  {
5431  	int backwards = 0;
5432  	int err;
5433  
5434  	if (cmd_match(buf, "forwards"))
5435  		backwards = 0;
5436  	else if (cmd_match(buf, "backwards"))
5437  		backwards = 1;
5438  	else
5439  		return -EINVAL;
5440  	if (mddev->reshape_backwards == backwards)
5441  		return len;
5442  
5443  	err = mddev_lock(mddev);
5444  	if (err)
5445  		return err;
5446  	/* check if we are allowed to change */
5447  	if (mddev->delta_disks)
5448  		err = -EBUSY;
5449  	else if (mddev->persistent &&
5450  	    mddev->major_version == 0)
5451  		err =  -EINVAL;
5452  	else
5453  		mddev->reshape_backwards = backwards;
5454  	mddev_unlock(mddev);
5455  	return err ?: len;
5456  }
5457  
5458  static struct md_sysfs_entry md_reshape_direction =
5459  __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
5460         reshape_direction_store);
5461  
5462  static ssize_t
5463  array_size_show(struct mddev *mddev, char *page)
5464  {
5465  	if (mddev->external_size)
5466  		return sprintf(page, "%llu\n",
5467  			       (unsigned long long)mddev->array_sectors/2);
5468  	else
5469  		return sprintf(page, "default\n");
5470  }
5471  
5472  static ssize_t
5473  array_size_store(struct mddev *mddev, const char *buf, size_t len)
5474  {
5475  	sector_t sectors;
5476  	int err;
5477  
5478  	err = mddev_lock(mddev);
5479  	if (err)
5480  		return err;
5481  
5482  	/* cluster raid doesn't support changing array_sectors */
5483  	if (mddev_is_clustered(mddev)) {
5484  		mddev_unlock(mddev);
5485  		return -EINVAL;
5486  	}
5487  
5488  	if (strncmp(buf, "default", 7) == 0) {
5489  		if (mddev->pers)
5490  			sectors = mddev->pers->size(mddev, 0, 0);
5491  		else
5492  			sectors = mddev->array_sectors;
5493  
5494  		mddev->external_size = 0;
5495  	} else {
5496  		if (strict_blocks_to_sectors(buf, &sectors) < 0)
5497  			err = -EINVAL;
5498  		else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
5499  			err = -E2BIG;
5500  		else
5501  			mddev->external_size = 1;
5502  	}
5503  
5504  	if (!err) {
5505  		mddev->array_sectors = sectors;
5506  		if (mddev->pers)
5507  			set_capacity_and_notify(mddev->gendisk,
5508  						mddev->array_sectors);
5509  	}
5510  	mddev_unlock(mddev);
5511  	return err ?: len;
5512  }
5513  
5514  static struct md_sysfs_entry md_array_size =
5515  __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
5516         array_size_store);
5517  
5518  static ssize_t
5519  consistency_policy_show(struct mddev *mddev, char *page)
5520  {
5521  	int ret;
5522  
5523  	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
5524  		ret = sprintf(page, "journal\n");
5525  	} else if (test_bit(MD_HAS_PPL, &mddev->flags)) {
5526  		ret = sprintf(page, "ppl\n");
5527  	} else if (mddev->bitmap) {
5528  		ret = sprintf(page, "bitmap\n");
5529  	} else if (mddev->pers) {
5530  		if (mddev->pers->sync_request)
5531  			ret = sprintf(page, "resync\n");
5532  		else
5533  			ret = sprintf(page, "none\n");
5534  	} else {
5535  		ret = sprintf(page, "unknown\n");
5536  	}
5537  
5538  	return ret;
5539  }
5540  
5541  static ssize_t
5542  consistency_policy_store(struct mddev *mddev, const char *buf, size_t len)
5543  {
5544  	int err = 0;
5545  
5546  	if (mddev->pers) {
5547  		if (mddev->pers->change_consistency_policy)
5548  			err = mddev->pers->change_consistency_policy(mddev, buf);
5549  		else
5550  			err = -EBUSY;
5551  	} else if (mddev->external && strncmp(buf, "ppl", 3) == 0) {
5552  		set_bit(MD_HAS_PPL, &mddev->flags);
5553  	} else {
5554  		err = -EINVAL;
5555  	}
5556  
5557  	return err ? err : len;
5558  }
5559  
5560  static struct md_sysfs_entry md_consistency_policy =
5561  __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
5562         consistency_policy_store);
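/*
 * Illustrative sketch (assumption, not from the original source): for an
 * externally managed array that is not yet running, PPL can be requested
 * with something like:
 *
 *     echo ppl > /sys/block/md0/md/consistency_policy
 *
 * For a running array the write is handed to the personality's
 * ->change_consistency_policy() hook instead, as implemented above.
 */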
5563  
5564  static ssize_t fail_last_dev_show(struct mddev *mddev, char *page)
5565  {
5566  	return sprintf(page, "%d\n", mddev->fail_last_dev);
5567  }
5568  
5569  /*
5570   * Setting fail_last_dev to true allows the last device to be forcibly
5571   * removed from RAID1/RAID10.
5572   */
5573  static ssize_t
5574  fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len)
5575  {
5576  	int ret;
5577  	bool value;
5578  
5579  	ret = kstrtobool(buf, &value);
5580  	if (ret)
5581  		return ret;
5582  
5583  	if (value != mddev->fail_last_dev)
5584  		mddev->fail_last_dev = value;
5585  
5586  	return len;
5587  }
5588  static struct md_sysfs_entry md_fail_last_dev =
5589  __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show,
5590         fail_last_dev_store);
5591  
5592  static ssize_t serialize_policy_show(struct mddev *mddev, char *page)
5593  {
5594  	if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1))
5595  		return sprintf(page, "n/a\n");
5596  	else
5597  		return sprintf(page, "%d\n", mddev->serialize_policy);
5598  }
5599  
5600  /*
5601   * Setting serialize_policy to true enforces that write IO is not reordered
5602   * for raid1.
5603   */
5604  static ssize_t
5605  serialize_policy_store(struct mddev *mddev, const char *buf, size_t len)
5606  {
5607  	int err;
5608  	bool value;
5609  
5610  	err = kstrtobool(buf, &value);
5611  	if (err)
5612  		return err;
5613  
5614  	if (value == mddev->serialize_policy)
5615  		return len;
5616  
5617  	err = mddev_suspend_and_lock(mddev);
5618  	if (err)
5619  		return err;
5620  	if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) {
5621  		pr_err("md: serialize_policy is only effective for raid1\n");
5622  		err = -EINVAL;
5623  		goto unlock;
5624  	}
5625  
5626  	if (value)
5627  		mddev_create_serial_pool(mddev, NULL);
5628  	else
5629  		mddev_destroy_serial_pool(mddev, NULL);
5630  	mddev->serialize_policy = value;
5631  unlock:
5632  	mddev_unlock_and_resume(mddev);
5633  	return err ?: len;
5634  }
5635  
5636  static struct md_sysfs_entry md_serialize_policy =
5637  __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show,
5638         serialize_policy_store);
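/*
 * Illustrative note (not from the original source): serialize_policy takes
 * kstrtobool()-style input ("0"/"1", "y"/"n", ...) and is only meaningful
 * for raid1; the store handler above suspends the array while it creates or
 * destroys the serial pool so the policy change cannot race with writes.
 */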
5639  
5640  
5641  static struct attribute *md_default_attrs[] = {
5642  	&md_level.attr,
5643  	&md_new_level.attr,
5644  	&md_layout.attr,
5645  	&md_raid_disks.attr,
5646  	&md_uuid.attr,
5647  	&md_chunk_size.attr,
5648  	&md_size.attr,
5649  	&md_resync_start.attr,
5650  	&md_metadata.attr,
5651  	&md_new_device.attr,
5652  	&md_safe_delay.attr,
5653  	&md_array_state.attr,
5654  	&md_reshape_position.attr,
5655  	&md_reshape_direction.attr,
5656  	&md_array_size.attr,
5657  	&max_corr_read_errors.attr,
5658  	&md_consistency_policy.attr,
5659  	&md_fail_last_dev.attr,
5660  	&md_serialize_policy.attr,
5661  	NULL,
5662  };
5663  
5664  static const struct attribute_group md_default_group = {
5665  	.attrs = md_default_attrs,
5666  };
5667  
5668  static struct attribute *md_redundancy_attrs[] = {
5669  	&md_scan_mode.attr,
5670  	&md_last_scan_mode.attr,
5671  	&md_mismatches.attr,
5672  	&md_sync_min.attr,
5673  	&md_sync_max.attr,
5674  	&md_sync_speed.attr,
5675  	&md_sync_force_parallel.attr,
5676  	&md_sync_completed.attr,
5677  	&md_min_sync.attr,
5678  	&md_max_sync.attr,
5679  	&md_suspend_lo.attr,
5680  	&md_suspend_hi.attr,
5681  	&md_bitmap.attr,
5682  	&md_degraded.attr,
5683  	NULL,
5684  };
5685  static const struct attribute_group md_redundancy_group = {
5686  	.name = NULL,
5687  	.attrs = md_redundancy_attrs,
5688  };
5689  
5690  static const struct attribute_group *md_attr_groups[] = {
5691  	&md_default_group,
5692  	&md_bitmap_group,
5693  	NULL,
5694  };
5695  
5696  static ssize_t
5697  md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
5698  {
5699  	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
5700  	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
5701  	ssize_t rv;
5702  
5703  	if (!entry->show)
5704  		return -EIO;
5705  	spin_lock(&all_mddevs_lock);
5706  	if (!mddev_get(mddev)) {
5707  		spin_unlock(&all_mddevs_lock);
5708  		return -EBUSY;
5709  	}
5710  	spin_unlock(&all_mddevs_lock);
5711  
5712  	rv = entry->show(mddev, page);
5713  	mddev_put(mddev);
5714  	return rv;
5715  }
5716  
5717  static ssize_t
5718  md_attr_store(struct kobject *kobj, struct attribute *attr,
5719  	      const char *page, size_t length)
5720  {
5721  	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
5722  	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
5723  	ssize_t rv;
5724  
5725  	if (!entry->store)
5726  		return -EIO;
5727  	if (!capable(CAP_SYS_ADMIN))
5728  		return -EACCES;
5729  	spin_lock(&all_mddevs_lock);
5730  	if (!mddev_get(mddev)) {
5731  		spin_unlock(&all_mddevs_lock);
5732  		return -EBUSY;
5733  	}
5734  	spin_unlock(&all_mddevs_lock);
5735  	rv = entry->store(mddev, page, length);
5736  	mddev_put(mddev);
5737  	return rv;
5738  }
5739  
5740  static void md_kobj_release(struct kobject *ko)
5741  {
5742  	struct mddev *mddev = container_of(ko, struct mddev, kobj);
5743  
5744  	if (mddev->sysfs_state)
5745  		sysfs_put(mddev->sysfs_state);
5746  	if (mddev->sysfs_level)
5747  		sysfs_put(mddev->sysfs_level);
5748  
5749  	del_gendisk(mddev->gendisk);
5750  	put_disk(mddev->gendisk);
5751  }
5752  
5753  static const struct sysfs_ops md_sysfs_ops = {
5754  	.show	= md_attr_show,
5755  	.store	= md_attr_store,
5756  };
5757  static const struct kobj_type md_ktype = {
5758  	.release	= md_kobj_release,
5759  	.sysfs_ops	= &md_sysfs_ops,
5760  	.default_groups	= md_attr_groups,
5761  };
5762  
5763  int mdp_major = 0;
5764  
5765  /* stack the limit for all rdevs into lim */
5766  int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim,
5767  		unsigned int flags)
5768  {
5769  	struct md_rdev *rdev;
5770  
5771  	rdev_for_each(rdev, mddev) {
5772  		queue_limits_stack_bdev(lim, rdev->bdev, rdev->data_offset,
5773  					mddev->gendisk->disk_name);
5774  		if ((flags & MDDEV_STACK_INTEGRITY) &&
5775  		    !queue_limits_stack_integrity_bdev(lim, rdev->bdev))
5776  			return -EINVAL;
5777  	}
5778  
5779  	return 0;
5780  }
5781  EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits);
5782  
5783  /* apply the extra stacking limits from a new rdev into mddev */
5784  int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev)
5785  {
5786  	struct queue_limits lim;
5787  
5788  	if (mddev_is_dm(mddev))
5789  		return 0;
5790  
5791  	lim = queue_limits_start_update(mddev->gendisk->queue);
5792  	queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset,
5793  				mddev->gendisk->disk_name);
5794  
5795  	if (!queue_limits_stack_integrity_bdev(&lim, rdev->bdev)) {
5796  		pr_err("%s: incompatible integrity profile for %pg\n",
5797  		       mdname(mddev), rdev->bdev);
5798  		queue_limits_cancel_update(mddev->gendisk->queue);
5799  		return -ENXIO;
5800  	}
5801  
5802  	return queue_limits_commit_update(mddev->gendisk->queue, &lim);
5803  }
5804  EXPORT_SYMBOL_GPL(mddev_stack_new_rdev);
5805  
5806  /* update the optimal I/O size after a reshape */
5807  void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes)
5808  {
5809  	struct queue_limits lim;
5810  
5811  	if (mddev_is_dm(mddev))
5812  		return;
5813  
5814  	/* don't bother updating io_opt if we can't suspend the array */
5815  	if (mddev_suspend(mddev, false) < 0)
5816  		return;
5817  	lim = queue_limits_start_update(mddev->gendisk->queue);
5818  	lim.io_opt = lim.io_min * nr_stripes;
5819  	queue_limits_commit_update(mddev->gendisk->queue, &lim);
5820  	mddev_resume(mddev);
5821  }
5822  EXPORT_SYMBOL_GPL(mddev_update_io_opt);
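/*
 * Hypothetical usage sketch (not part of this file): a personality that has
 * just finished growing the array might refresh io_opt with the new stripe
 * width.  "example_conf" and its fields are placeholders for that
 * personality's private data:
 *
 *	static void example_finish_reshape(struct mddev *mddev)
 *	{
 *		struct example_conf *conf = mddev->private;
 *
 *		mddev_update_io_opt(mddev,
 *				    conf->raid_disks - conf->max_degraded);
 *	}
 */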
5823  
5824  static void mddev_delayed_delete(struct work_struct *ws)
5825  {
5826  	struct mddev *mddev = container_of(ws, struct mddev, del_work);
5827  
5828  	kobject_put(&mddev->kobj);
5829  }
5830  
5831  void md_init_stacking_limits(struct queue_limits *lim)
5832  {
5833  	blk_set_stacking_limits(lim);
5834  	lim->features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA |
5835  			BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT;
5836  }
5837  EXPORT_SYMBOL_GPL(md_init_stacking_limits);
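/*
 * Hypothetical usage sketch (not part of this file): a personality's ->run()
 * path would typically seed a queue_limits with md_init_stacking_limits()
 * and then fold in every member rdev via mddev_stack_rdev_limits() before
 * committing the result to the md gendisk.  The helper name
 * example_set_limits() is a placeholder:
 *
 *	static int example_set_limits(struct mddev *mddev)
 *	{
 *		struct queue_limits lim;
 *		int err;
 *
 *		md_init_stacking_limits(&lim);
 *		err = mddev_stack_rdev_limits(mddev, &lim,
 *					      MDDEV_STACK_INTEGRITY);
 *		if (err)
 *			return err;
 *		return queue_limits_set(mddev->gendisk->queue, &lim);
 *	}
 */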
5838  
5839  struct mddev *md_alloc(dev_t dev, char *name)
5840  {
5841  	/*
5842  	 * If dev is zero, name is the name of a device to allocate with
5843  	 * an arbitrary minor number.  It will be "md_???"
5844  	 * If dev is non-zero it must be a device number with a MAJOR of
5845  	 * MD_MAJOR or mdp_major.  In this case, if "name" is NULL, then
5846  	 * the device is being created by opening a node in /dev.
5847  	 * If "name" is not NULL, the device is being created by
5848  	 * writing to /sys/module/md_mod/parameters/new_array.
5849  	 */
5850  	static DEFINE_MUTEX(disks_mutex);
5851  	struct mddev *mddev;
5852  	struct gendisk *disk;
5853  	int partitioned;
5854  	int shift;
5855  	int unit;
5856  	int error;
5857  
5858  	/*
5859  	 * Wait for any previous instance of this device to be completely
5860  	 * removed (mddev_delayed_delete).
5861  	 */
5862  	flush_workqueue(md_misc_wq);
5863  
5864  	mutex_lock(&disks_mutex);
5865  	mddev = mddev_alloc(dev);
5866  	if (IS_ERR(mddev)) {
5867  		error = PTR_ERR(mddev);
5868  		goto out_unlock;
5869  	}
5870  
5871  	partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
5872  	shift = partitioned ? MdpMinorShift : 0;
5873  	unit = MINOR(mddev->unit) >> shift;
5874  
5875  	if (name && !dev) {
5876  		/* Need to ensure that 'name' is not a duplicate.
5877  		 */
5878  		struct mddev *mddev2;
5879  		spin_lock(&all_mddevs_lock);
5880  
5881  		list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
5882  			if (mddev2->gendisk &&
5883  			    strcmp(mddev2->gendisk->disk_name, name) == 0) {
5884  				spin_unlock(&all_mddevs_lock);
5885  				error = -EEXIST;
5886  				goto out_free_mddev;
5887  			}
5888  		spin_unlock(&all_mddevs_lock);
5889  	}
5890  	if (name && dev)
5891  		/*
5892  		 * Creating /dev/mdNNN via "new_array", so adjust hold_active.
5893  		 */
5894  		mddev->hold_active = UNTIL_STOP;
5895  
5896  	disk = blk_alloc_disk(NULL, NUMA_NO_NODE);
5897  	if (IS_ERR(disk)) {
5898  		error = PTR_ERR(disk);
5899  		goto out_free_mddev;
5900  	}
5901  
5902  	disk->major = MAJOR(mddev->unit);
5903  	disk->first_minor = unit << shift;
5904  	disk->minors = 1 << shift;
5905  	if (name)
5906  		strcpy(disk->disk_name, name);
5907  	else if (partitioned)
5908  		sprintf(disk->disk_name, "md_d%d", unit);
5909  	else
5910  		sprintf(disk->disk_name, "md%d", unit);
5911  	disk->fops = &md_fops;
5912  	disk->private_data = mddev;
5913  
5914  	disk->events |= DISK_EVENT_MEDIA_CHANGE;
5915  	mddev->gendisk = disk;
5916  	error = add_disk(disk);
5917  	if (error)
5918  		goto out_put_disk;
5919  
5920  	kobject_init(&mddev->kobj, &md_ktype);
5921  	error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md");
5922  	if (error) {
5923  		/*
5924  		 * The disk is already live at this point.  Clear the hold flag
5925  		 * and let mddev_put take care of the deletion, as it isn't any
5926  		 * different from a normal close on last release now.
5927  		 */
5928  		mddev->hold_active = 0;
5929  		mutex_unlock(&disks_mutex);
5930  		mddev_put(mddev);
5931  		return ERR_PTR(error);
5932  	}
5933  
5934  	kobject_uevent(&mddev->kobj, KOBJ_ADD);
5935  	mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
5936  	mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level");
5937  	mutex_unlock(&disks_mutex);
5938  	return mddev;
5939  
5940  out_put_disk:
5941  	put_disk(disk);
5942  out_free_mddev:
5943  	mddev_free(mddev);
5944  out_unlock:
5945  	mutex_unlock(&disks_mutex);
5946  	return ERR_PTR(error);
5947  }
5948  
5949  static int md_alloc_and_put(dev_t dev, char *name)
5950  {
5951  	struct mddev *mddev = md_alloc(dev, name);
5952  
5953  	if (IS_ERR(mddev))
5954  		return PTR_ERR(mddev);
5955  	mddev_put(mddev);
5956  	return 0;
5957  }
5958  
5959  static void md_probe(dev_t dev)
5960  {
5961  	if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512)
5962  		return;
5963  	if (create_on_open)
5964  		md_alloc_and_put(dev, NULL);
5965  }
5966  
5967  static int add_named_array(const char *val, const struct kernel_param *kp)
5968  {
5969  	/*
5970  	 * val must be "md_*" or "mdNNN".
5971  	 * For "md_*" we allocate an array with a large free minor number, and
5972  	 * set the name to val.  val must not already be an active name.
5973  	 * For "mdNNN" we allocate an array with the minor number NNN
5974  	 * which must not already be in use.
5975  	 */
5976  	int len = strlen(val);
5977  	char buf[DISK_NAME_LEN];
5978  	unsigned long devnum;
5979  
5980  	while (len && val[len-1] == '\n')
5981  		len--;
5982  	if (len >= DISK_NAME_LEN)
5983  		return -E2BIG;
5984  	strscpy(buf, val, len+1);
5985  	if (strncmp(buf, "md_", 3) == 0)
5986  		return md_alloc_and_put(0, buf);
5987  	if (strncmp(buf, "md", 2) == 0 &&
5988  	    isdigit(buf[2]) &&
5989  	    kstrtoul(buf+2, 10, &devnum) == 0 &&
5990  	    devnum <= MINORMASK)
5991  		return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL);
5992  
5993  	return -EINVAL;
5994  }
5995  
5996  static void md_safemode_timeout(struct timer_list *t)
5997  {
5998  	struct mddev *mddev = from_timer(mddev, t, safemode_timer);
5999  
6000  	mddev->safemode = 1;
6001  	if (mddev->external)
6002  		sysfs_notify_dirent_safe(mddev->sysfs_state);
6003  
6004  	md_wakeup_thread(mddev->thread);
6005  }
6006  
6007  static int start_dirty_degraded;
6008  
6009  int md_run(struct mddev *mddev)
6010  {
6011  	int err;
6012  	struct md_rdev *rdev;
6013  	struct md_personality *pers;
6014  	bool nowait = true;
6015  
6016  	if (list_empty(&mddev->disks))
6017  		/* cannot run an array with no devices. */
6018  		return -EINVAL;
6019  
6020  	if (mddev->pers)
6021  		return -EBUSY;
6022  	/* Cannot run until previous stop completes properly */
6023  	if (mddev->sysfs_active)
6024  		return -EBUSY;
6025  
6026  	/*
6027  	 * Analyze all RAID superblock(s)
6028  	 */
6029  	if (!mddev->raid_disks) {
6030  		if (!mddev->persistent)
6031  			return -EINVAL;
6032  		err = analyze_sbs(mddev);
6033  		if (err)
6034  			return -EINVAL;
6035  	}
6036  
6037  	if (mddev->level != LEVEL_NONE)
6038  		request_module("md-level-%d", mddev->level);
6039  	else if (mddev->clevel[0])
6040  		request_module("md-%s", mddev->clevel);
6041  
6042  	/*
6043  	 * Drop all container device buffers, from now on
6044  	 * the only valid external interface is through the md
6045  	 * device.
6046  	 */
6047  	mddev->has_superblocks = false;
6048  	rdev_for_each(rdev, mddev) {
6049  		if (test_bit(Faulty, &rdev->flags))
6050  			continue;
6051  		sync_blockdev(rdev->bdev);
6052  		invalidate_bdev(rdev->bdev);
6053  		if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) {
6054  			mddev->ro = MD_RDONLY;
6055  			if (!mddev_is_dm(mddev))
6056  				set_disk_ro(mddev->gendisk, 1);
6057  		}
6058  
6059  		if (rdev->sb_page)
6060  			mddev->has_superblocks = true;
6061  
6062  		/* perform some consistency tests on the device.
6063  		 * We don't want the data to overlap the metadata;
6064  		 * internal bitmap issues have been handled elsewhere.
6065  		 */
6066  		if (rdev->meta_bdev) {
6067  			/* Nothing to check */;
6068  		} else if (rdev->data_offset < rdev->sb_start) {
6069  			if (mddev->dev_sectors &&
6070  			    rdev->data_offset + mddev->dev_sectors
6071  			    > rdev->sb_start) {
6072  				pr_warn("md: %s: data overlaps metadata\n",
6073  					mdname(mddev));
6074  				return -EINVAL;
6075  			}
6076  		} else {
6077  			if (rdev->sb_start + rdev->sb_size/512
6078  			    > rdev->data_offset) {
6079  				pr_warn("md: %s: metadata overlaps data\n",
6080  					mdname(mddev));
6081  				return -EINVAL;
6082  			}
6083  		}
6084  		sysfs_notify_dirent_safe(rdev->sysfs_state);
6085  		nowait = nowait && bdev_nowait(rdev->bdev);
6086  	}
6087  
6088  	if (!bioset_initialized(&mddev->bio_set)) {
6089  		err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
6090  		if (err)
6091  			return err;
6092  	}
6093  	if (!bioset_initialized(&mddev->sync_set)) {
6094  		err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
6095  		if (err)
6096  			goto exit_bio_set;
6097  	}
6098  
6099  	if (!bioset_initialized(&mddev->io_clone_set)) {
6100  		err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE,
6101  				  offsetof(struct md_io_clone, bio_clone), 0);
6102  		if (err)
6103  			goto exit_sync_set;
6104  	}
6105  
6106  	pers = get_pers(mddev->level, mddev->clevel);
6107  	if (!pers) {
6108  		err = -EINVAL;
6109  		goto abort;
6110  	}
6111  	if (mddev->level != pers->head.id) {
6112  		mddev->level = pers->head.id;
6113  		mddev->new_level = pers->head.id;
6114  	}
6115  	strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel));
6116  
6117  	if (mddev->reshape_position != MaxSector &&
6118  	    pers->start_reshape == NULL) {
6119  		/* This personality cannot handle reshaping... */
6120  		put_pers(pers);
6121  		err = -EINVAL;
6122  		goto abort;
6123  	}
6124  
6125  	if (pers->sync_request) {
6126  		/* Warn if this is a potentially silly
6127  		 * configuration.
6128  		 */
6129  		struct md_rdev *rdev2;
6130  		int warned = 0;
6131  
6132  		rdev_for_each(rdev, mddev)
6133  			rdev_for_each(rdev2, mddev) {
6134  				if (rdev < rdev2 &&
6135  				    rdev->bdev->bd_disk ==
6136  				    rdev2->bdev->bd_disk) {
6137  					pr_warn("%s: WARNING: %pg appears to be on the same physical disk as %pg.\n",
6138  						mdname(mddev),
6139  						rdev->bdev,
6140  						rdev2->bdev);
6141  					warned = 1;
6142  				}
6143  			}
6144  
6145  		if (warned)
6146  			pr_warn("True protection against single-disk failure might be compromised.\n");
6147  	}
6148  
6149  	/* dm-raid expects sync_thread to be frozen until resume */
6150  	if (mddev->gendisk)
6151  		mddev->recovery = 0;
6152  
6153  	/* may be overridden by personality */
6154  	mddev->resync_max_sectors = mddev->dev_sectors;
6155  
6156  	mddev->ok_start_degraded = start_dirty_degraded;
6157  
6158  	if (start_readonly && md_is_rdwr(mddev))
6159  		mddev->ro = MD_AUTO_READ; /* read-only, but switch on first write */
6160  
6161  	err = pers->run(mddev);
6162  	if (err)
6163  		pr_warn("md: pers->run() failed ...\n");
6164  	else if (pers->size(mddev, 0, 0) < mddev->array_sectors) {
6165  		WARN_ONCE(!mddev->external_size,
6166  			  "%s: default size too small, but 'external_size' not in effect?\n",
6167  			  __func__);
6168  		pr_warn("md: invalid array_size %llu > default size %llu\n",
6169  			(unsigned long long)mddev->array_sectors / 2,
6170  			(unsigned long long)pers->size(mddev, 0, 0) / 2);
6171  		err = -EINVAL;
6172  	}
6173  	if (err == 0 && pers->sync_request &&
6174  	    (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
6175  		err = mddev->bitmap_ops->create(mddev, -1);
6176  		if (err)
6177  			pr_warn("%s: failed to create bitmap (%d)\n",
6178  				mdname(mddev), err);
6179  	}
6180  	if (err)
6181  		goto bitmap_abort;
6182  
6183  	if (mddev->bitmap_info.max_write_behind > 0) {
6184  		bool create_pool = false;
6185  
6186  		rdev_for_each(rdev, mddev) {
6187  			if (test_bit(WriteMostly, &rdev->flags) &&
6188  			    rdev_init_serial(rdev))
6189  				create_pool = true;
6190  		}
6191  		if (create_pool && mddev->serial_info_pool == NULL) {
6192  			mddev->serial_info_pool =
6193  				mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
6194  						    sizeof(struct serial_info));
6195  			if (!mddev->serial_info_pool) {
6196  				err = -ENOMEM;
6197  				goto bitmap_abort;
6198  			}
6199  		}
6200  	}
6201  
6202  	if (pers->sync_request) {
6203  		if (mddev->kobj.sd &&
6204  		    sysfs_create_group(&mddev->kobj, &md_redundancy_group))
6205  			pr_warn("md: cannot register extra attributes for %s\n",
6206  				mdname(mddev));
6207  		mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
6208  		mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
6209  		mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
6210  	} else if (mddev->ro == MD_AUTO_READ)
6211  		mddev->ro = MD_RDWR;
6212  
6213  	atomic_set(&mddev->max_corr_read_errors,
6214  		   MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
6215  	mddev->safemode = 0;
6216  	if (mddev_is_clustered(mddev))
6217  		mddev->safemode_delay = 0;
6218  	else
6219  		mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
6220  	mddev->in_sync = 1;
6221  	smp_wmb();
6222  	spin_lock(&mddev->lock);
6223  	mddev->pers = pers;
6224  	spin_unlock(&mddev->lock);
6225  	rdev_for_each(rdev, mddev)
6226  		if (rdev->raid_disk >= 0)
6227  			sysfs_link_rdev(mddev, rdev); /* failure here is OK */
6228  
6229  	if (mddev->degraded && md_is_rdwr(mddev))
6230  		/* This ensures that recovering status is reported immediately
6231  		 * via sysfs - until a lack of spares is confirmed.
6232  		 */
6233  		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6234  	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6235  
6236  	if (mddev->sb_flags)
6237  		md_update_sb(mddev, 0);
6238  
6239  	md_new_event();
6240  	return 0;
6241  
6242  bitmap_abort:
6243  	mddev_detach(mddev);
6244  	if (mddev->private)
6245  		pers->free(mddev, mddev->private);
6246  	mddev->private = NULL;
6247  	put_pers(pers);
6248  	mddev->bitmap_ops->destroy(mddev);
6249  abort:
6250  	bioset_exit(&mddev->io_clone_set);
6251  exit_sync_set:
6252  	bioset_exit(&mddev->sync_set);
6253  exit_bio_set:
6254  	bioset_exit(&mddev->bio_set);
6255  	return err;
6256  }
6257  EXPORT_SYMBOL_GPL(md_run);
6258  
6259  int do_md_run(struct mddev *mddev)
6260  {
6261  	int err;
6262  
6263  	set_bit(MD_NOT_READY, &mddev->flags);
6264  	err = md_run(mddev);
6265  	if (err)
6266  		goto out;
6267  
6268  	err = mddev->bitmap_ops->load(mddev);
6269  	if (err) {
6270  		mddev->bitmap_ops->destroy(mddev);
6271  		goto out;
6272  	}
6273  
6274  	if (mddev_is_clustered(mddev))
6275  		md_allow_write(mddev);
6276  
6277  	/* run start up tasks that require md_thread */
6278  	md_start(mddev);
6279  
6280  	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
6281  
6282  	set_capacity_and_notify(mddev->gendisk, mddev->array_sectors);
6283  	clear_bit(MD_NOT_READY, &mddev->flags);
6284  	mddev->changed = 1;
6285  	kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
6286  	sysfs_notify_dirent_safe(mddev->sysfs_state);
6287  	sysfs_notify_dirent_safe(mddev->sysfs_action);
6288  	sysfs_notify_dirent_safe(mddev->sysfs_degraded);
6289  out:
6290  	clear_bit(MD_NOT_READY, &mddev->flags);
6291  	return err;
6292  }
6293  
6294  int md_start(struct mddev *mddev)
6295  {
6296  	int ret = 0;
6297  
6298  	if (mddev->pers->start) {
6299  		set_bit(MD_RECOVERY_WAIT, &mddev->recovery);
6300  		ret = mddev->pers->start(mddev);
6301  		clear_bit(MD_RECOVERY_WAIT, &mddev->recovery);
6302  		md_wakeup_thread(mddev->sync_thread);
6303  	}
6304  	return ret;
6305  }
6306  EXPORT_SYMBOL_GPL(md_start);
6307  
6308  static int restart_array(struct mddev *mddev)
6309  {
6310  	struct gendisk *disk = mddev->gendisk;
6311  	struct md_rdev *rdev;
6312  	bool has_journal = false;
6313  	bool has_readonly = false;
6314  
6315  	/* Complain if it has no devices */
6316  	if (list_empty(&mddev->disks))
6317  		return -ENXIO;
6318  	if (!mddev->pers)
6319  		return -EINVAL;
6320  	if (md_is_rdwr(mddev))
6321  		return -EBUSY;
6322  
6323  	rcu_read_lock();
6324  	rdev_for_each_rcu(rdev, mddev) {
6325  		if (test_bit(Journal, &rdev->flags) &&
6326  		    !test_bit(Faulty, &rdev->flags))
6327  			has_journal = true;
6328  		if (rdev_read_only(rdev))
6329  			has_readonly = true;
6330  	}
6331  	rcu_read_unlock();
6332  	if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal)
6333  		/* Don't restart rw with journal missing/faulty */
6334  			return -EINVAL;
6335  	if (has_readonly)
6336  		return -EROFS;
6337  
6338  	mddev->safemode = 0;
6339  	mddev->ro = MD_RDWR;
6340  	set_disk_ro(disk, 0);
6341  	pr_debug("md: %s switched to read-write mode.\n", mdname(mddev));
6342  	/* Kick recovery or resync if necessary */
6343  	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6344  	md_wakeup_thread(mddev->sync_thread);
6345  	sysfs_notify_dirent_safe(mddev->sysfs_state);
6346  	return 0;
6347  }
6348  
6349  static void md_clean(struct mddev *mddev)
6350  {
6351  	mddev->array_sectors = 0;
6352  	mddev->external_size = 0;
6353  	mddev->dev_sectors = 0;
6354  	mddev->raid_disks = 0;
6355  	mddev->recovery_cp = 0;
6356  	mddev->resync_min = 0;
6357  	mddev->resync_max = MaxSector;
6358  	mddev->reshape_position = MaxSector;
6359  	/* we still need mddev->external in export_rdev, do not clear it yet */
6360  	mddev->persistent = 0;
6361  	mddev->level = LEVEL_NONE;
6362  	mddev->clevel[0] = 0;
6363  	/*
6364  	 * Don't clear MD_CLOSING, or mddev can be opened again.
6365  	 * 'hold_active != 0' means mddev is still in the creation
6366  	 * process and will be used later.
6367  	 */
6368  	if (mddev->hold_active)
6369  		mddev->flags = 0;
6370  	else
6371  		mddev->flags &= BIT_ULL_MASK(MD_CLOSING);
6372  	mddev->sb_flags = 0;
6373  	mddev->ro = MD_RDWR;
6374  	mddev->metadata_type[0] = 0;
6375  	mddev->chunk_sectors = 0;
6376  	mddev->ctime = mddev->utime = 0;
6377  	mddev->layout = 0;
6378  	mddev->max_disks = 0;
6379  	mddev->events = 0;
6380  	mddev->can_decrease_events = 0;
6381  	mddev->delta_disks = 0;
6382  	mddev->reshape_backwards = 0;
6383  	mddev->new_level = LEVEL_NONE;
6384  	mddev->new_layout = 0;
6385  	mddev->new_chunk_sectors = 0;
6386  	mddev->curr_resync = MD_RESYNC_NONE;
6387  	atomic64_set(&mddev->resync_mismatches, 0);
6388  	mddev->suspend_lo = mddev->suspend_hi = 0;
6389  	mddev->sync_speed_min = mddev->sync_speed_max = 0;
6390  	mddev->recovery = 0;
6391  	mddev->in_sync = 0;
6392  	mddev->changed = 0;
6393  	mddev->degraded = 0;
6394  	mddev->safemode = 0;
6395  	mddev->private = NULL;
6396  	mddev->cluster_info = NULL;
6397  	mddev->bitmap_info.offset = 0;
6398  	mddev->bitmap_info.default_offset = 0;
6399  	mddev->bitmap_info.default_space = 0;
6400  	mddev->bitmap_info.chunksize = 0;
6401  	mddev->bitmap_info.daemon_sleep = 0;
6402  	mddev->bitmap_info.max_write_behind = 0;
6403  	mddev->bitmap_info.nodes = 0;
6404  }
6405  
6406  static void __md_stop_writes(struct mddev *mddev)
6407  {
6408  	del_timer_sync(&mddev->safemode_timer);
6409  
6410  	if (mddev->pers && mddev->pers->quiesce) {
6411  		mddev->pers->quiesce(mddev, 1);
6412  		mddev->pers->quiesce(mddev, 0);
6413  	}
6414  
6415  	mddev->bitmap_ops->flush(mddev);
6416  
6417  	if (md_is_rdwr(mddev) &&
6418  	    ((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
6419  	     mddev->sb_flags)) {
6420  		/* mark array as shutdown cleanly */
6421  		if (!mddev_is_clustered(mddev))
6422  			mddev->in_sync = 1;
6423  		md_update_sb(mddev, 1);
6424  	}
6425  	/* disable policy to guarantee rdevs free resources for serialization */
6426  	mddev->serialize_policy = 0;
6427  	mddev_destroy_serial_pool(mddev, NULL);
6428  }
6429  
6430  void md_stop_writes(struct mddev *mddev)
6431  {
6432  	mddev_lock_nointr(mddev);
6433  	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6434  	stop_sync_thread(mddev, true);
6435  	__md_stop_writes(mddev);
6436  	mddev_unlock(mddev);
6437  }
6438  EXPORT_SYMBOL_GPL(md_stop_writes);
6439  
6440  static void mddev_detach(struct mddev *mddev)
6441  {
6442  	mddev->bitmap_ops->wait_behind_writes(mddev);
6443  	if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) {
6444  		mddev->pers->quiesce(mddev, 1);
6445  		mddev->pers->quiesce(mddev, 0);
6446  	}
6447  	md_unregister_thread(mddev, &mddev->thread);
6448  
6449  	/* the unplug fn references 'conf' */
6450  	if (!mddev_is_dm(mddev))
6451  		blk_sync_queue(mddev->gendisk->queue);
6452  }
6453  
6454  static void __md_stop(struct mddev *mddev)
6455  {
6456  	struct md_personality *pers = mddev->pers;
6457  
6458  	mddev->bitmap_ops->destroy(mddev);
6459  	mddev_detach(mddev);
6460  	spin_lock(&mddev->lock);
6461  	mddev->pers = NULL;
6462  	spin_unlock(&mddev->lock);
6463  	if (mddev->private)
6464  		pers->free(mddev, mddev->private);
6465  	mddev->private = NULL;
6466  	if (pers->sync_request && mddev->to_remove == NULL)
6467  		mddev->to_remove = &md_redundancy_group;
6468  	put_pers(pers);
6469  	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6470  
6471  	bioset_exit(&mddev->bio_set);
6472  	bioset_exit(&mddev->sync_set);
6473  	bioset_exit(&mddev->io_clone_set);
6474  }
6475  
6476  void md_stop(struct mddev *mddev)
6477  {
6478  	lockdep_assert_held(&mddev->reconfig_mutex);
6479  
6480  	/* stop the array and free any attached data structures.
6481  	 * This is called from dm-raid.
6482  	 */
6483  	__md_stop_writes(mddev);
6484  	__md_stop(mddev);
6485  }
6486  
6487  EXPORT_SYMBOL_GPL(md_stop);
6488  
6489  /* ensure 'mddev->pers' exist before calling md_set_readonly() */
6490  static int md_set_readonly(struct mddev *mddev)
6491  {
6492  	int err = 0;
6493  	int did_freeze = 0;
6494  
6495  	if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
6496  		return -EBUSY;
6497  
6498  	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
6499  		did_freeze = 1;
6500  		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6501  	}
6502  
6503  	stop_sync_thread(mddev, false);
6504  	wait_event(mddev->sb_wait,
6505  		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
6506  	mddev_lock_nointr(mddev);
6507  
6508  	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
6509  		pr_warn("md: %s still in use.\n",mdname(mddev));
6510  		err = -EBUSY;
6511  		goto out;
6512  	}
6513  
6514  	__md_stop_writes(mddev);
6515  
6516  	if (mddev->ro == MD_RDONLY) {
6517  		err  = -ENXIO;
6518  		goto out;
6519  	}
6520  
6521  	mddev->ro = MD_RDONLY;
6522  	set_disk_ro(mddev->gendisk, 1);
6523  
6524  out:
6525  	if (!err || did_freeze) {
6526  		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6527  		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6528  		sysfs_notify_dirent_safe(mddev->sysfs_state);
6529  	}
6530  
6531  	return err;
6532  }
6533  
6534  /* mode:
6535   *   0 - completely stop and dis-assemble array
6536   *   2 - stop but do not disassemble array
6537   */
6538  static int do_md_stop(struct mddev *mddev, int mode)
6539  {
6540  	struct gendisk *disk = mddev->gendisk;
6541  	struct md_rdev *rdev;
6542  	int did_freeze = 0;
6543  
6544  	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
6545  		did_freeze = 1;
6546  		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6547  	}
6548  
6549  	stop_sync_thread(mddev, true);
6550  
6551  	if (mddev->sysfs_active ||
6552  	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
6553  		pr_warn("md: %s still in use.\n",mdname(mddev));
6554  		if (did_freeze) {
6555  			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6556  			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6557  		}
6558  		return -EBUSY;
6559  	}
6560  	if (mddev->pers) {
6561  		if (!md_is_rdwr(mddev))
6562  			set_disk_ro(disk, 0);
6563  
6564  		__md_stop_writes(mddev);
6565  		__md_stop(mddev);
6566  
6567  		/* tell userspace to handle 'inactive' */
6568  		sysfs_notify_dirent_safe(mddev->sysfs_state);
6569  
6570  		rdev_for_each(rdev, mddev)
6571  			if (rdev->raid_disk >= 0)
6572  				sysfs_unlink_rdev(mddev, rdev);
6573  
6574  		set_capacity_and_notify(disk, 0);
6575  		mddev->changed = 1;
6576  
6577  		if (!md_is_rdwr(mddev))
6578  			mddev->ro = MD_RDWR;
6579  	}
6580  	/*
6581  	 * Free resources if final stop
6582  	 */
6583  	if (mode == 0) {
6584  		pr_info("md: %s stopped.\n", mdname(mddev));
6585  
6586  		if (mddev->bitmap_info.file) {
6587  			struct file *f = mddev->bitmap_info.file;
6588  			spin_lock(&mddev->lock);
6589  			mddev->bitmap_info.file = NULL;
6590  			spin_unlock(&mddev->lock);
6591  			fput(f);
6592  		}
6593  		mddev->bitmap_info.offset = 0;
6594  
6595  		export_array(mddev);
6596  
6597  		md_clean(mddev);
6598  		if (mddev->hold_active == UNTIL_STOP)
6599  			mddev->hold_active = 0;
6600  	}
6601  	md_new_event();
6602  	sysfs_notify_dirent_safe(mddev->sysfs_state);
6603  	return 0;
6604  }
6605  
6606  #ifndef MODULE
6607  static void autorun_array(struct mddev *mddev)
6608  {
6609  	struct md_rdev *rdev;
6610  	int err;
6611  
6612  	if (list_empty(&mddev->disks))
6613  		return;
6614  
6615  	pr_info("md: running: ");
6616  
6617  	rdev_for_each(rdev, mddev) {
6618  		pr_cont("<%pg>", rdev->bdev);
6619  	}
6620  	pr_cont("\n");
6621  
6622  	err = do_md_run(mddev);
6623  	if (err) {
6624  		pr_warn("md: do_md_run() returned %d\n", err);
6625  		do_md_stop(mddev, 0);
6626  	}
6627  }
6628  
6629  /*
6630   * let's try to run arrays based on all disks that have arrived
6631   * until now. (those are in pending_raid_disks)
6632   *
6633   * the method: pick the first pending disk, collect all disks with
6634   * the same UUID, remove all from the pending list and put them into
6635   * the 'same_array' list. Then order this list based on superblock
6636   * update time (freshest comes first), kick out 'old' disks and
6637   * compare superblocks. If everything's fine then run it.
6638   *
6639   * If "unit" is allocated, then bump its reference count
6640   */
6641  static void autorun_devices(int part)
6642  {
6643  	struct md_rdev *rdev0, *rdev, *tmp;
6644  	struct mddev *mddev;
6645  
6646  	pr_info("md: autorun ...\n");
6647  	while (!list_empty(&pending_raid_disks)) {
6648  		int unit;
6649  		dev_t dev;
6650  		LIST_HEAD(candidates);
6651  		rdev0 = list_entry(pending_raid_disks.next,
6652  					 struct md_rdev, same_set);
6653  
6654  		pr_debug("md: considering %pg ...\n", rdev0->bdev);
6655  		INIT_LIST_HEAD(&candidates);
6656  		rdev_for_each_list(rdev, tmp, &pending_raid_disks)
6657  			if (super_90_load(rdev, rdev0, 0) >= 0) {
6658  				pr_debug("md:  adding %pg ...\n",
6659  					 rdev->bdev);
6660  				list_move(&rdev->same_set, &candidates);
6661  			}
6662  		/*
6663  		 * now we have a set of devices, with all of them having
6664  		 * mostly sane superblocks. It's time to allocate the
6665  		 * mddev.
6666  		 */
6667  		if (part) {
6668  			dev = MKDEV(mdp_major,
6669  				    rdev0->preferred_minor << MdpMinorShift);
6670  			unit = MINOR(dev) >> MdpMinorShift;
6671  		} else {
6672  			dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
6673  			unit = MINOR(dev);
6674  		}
6675  		if (rdev0->preferred_minor != unit) {
6676  			pr_warn("md: unit number in %pg is bad: %d\n",
6677  				rdev0->bdev, rdev0->preferred_minor);
6678  			break;
6679  		}
6680  
6681  		mddev = md_alloc(dev, NULL);
6682  		if (IS_ERR(mddev))
6683  			break;
6684  
6685  		if (mddev_suspend_and_lock(mddev))
6686  			pr_warn("md: %s locked, cannot run\n", mdname(mddev));
6687  		else if (mddev->raid_disks || mddev->major_version
6688  			 || !list_empty(&mddev->disks)) {
6689  			pr_warn("md: %s already running, cannot run %pg\n",
6690  				mdname(mddev), rdev0->bdev);
6691  			mddev_unlock_and_resume(mddev);
6692  		} else {
6693  			pr_debug("md: created %s\n", mdname(mddev));
6694  			mddev->persistent = 1;
6695  			rdev_for_each_list(rdev, tmp, &candidates) {
6696  				list_del_init(&rdev->same_set);
6697  				if (bind_rdev_to_array(rdev, mddev))
6698  					export_rdev(rdev, mddev);
6699  			}
6700  			autorun_array(mddev);
6701  			mddev_unlock_and_resume(mddev);
6702  		}
6703  		/* on success, candidates will be empty, on error
6704  		 * it won't...
6705  		 */
6706  		rdev_for_each_list(rdev, tmp, &candidates) {
6707  			list_del_init(&rdev->same_set);
6708  			export_rdev(rdev, mddev);
6709  		}
6710  		mddev_put(mddev);
6711  	}
6712  	pr_info("md: ... autorun DONE.\n");
6713  }
6714  #endif /* !MODULE */
6715  
6716  static int get_version(void __user *arg)
6717  {
6718  	mdu_version_t ver;
6719  
6720  	ver.major = MD_MAJOR_VERSION;
6721  	ver.minor = MD_MINOR_VERSION;
6722  	ver.patchlevel = MD_PATCHLEVEL_VERSION;
6723  
6724  	if (copy_to_user(arg, &ver, sizeof(ver)))
6725  		return -EFAULT;
6726  
6727  	return 0;
6728  }
6729  
6730  static int get_array_info(struct mddev *mddev, void __user *arg)
6731  {
6732  	mdu_array_info_t info;
6733  	int nr,working,insync,failed,spare;
6734  	struct md_rdev *rdev;
6735  
6736  	nr = working = insync = failed = spare = 0;
6737  	rcu_read_lock();
6738  	rdev_for_each_rcu(rdev, mddev) {
6739  		nr++;
6740  		if (test_bit(Faulty, &rdev->flags))
6741  			failed++;
6742  		else {
6743  			working++;
6744  			if (test_bit(In_sync, &rdev->flags))
6745  				insync++;
6746  			else if (test_bit(Journal, &rdev->flags))
6747  				/* TODO: add journal count to md_u.h */
6748  				;
6749  			else
6750  				spare++;
6751  		}
6752  	}
6753  	rcu_read_unlock();
6754  
6755  	info.major_version = mddev->major_version;
6756  	info.minor_version = mddev->minor_version;
6757  	info.patch_version = MD_PATCHLEVEL_VERSION;
6758  	info.ctime         = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
6759  	info.level         = mddev->level;
6760  	info.size          = mddev->dev_sectors / 2;
6761  	if (info.size != mddev->dev_sectors / 2) /* overflow */
6762  		info.size = -1;
6763  	info.nr_disks      = nr;
6764  	info.raid_disks    = mddev->raid_disks;
6765  	info.md_minor      = mddev->md_minor;
6766  	info.not_persistent= !mddev->persistent;
6767  
6768  	info.utime         = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
6769  	info.state         = 0;
6770  	if (mddev->in_sync)
6771  		info.state = (1<<MD_SB_CLEAN);
6772  	if (mddev->bitmap && mddev->bitmap_info.offset)
6773  		info.state |= (1<<MD_SB_BITMAP_PRESENT);
6774  	if (mddev_is_clustered(mddev))
6775  		info.state |= (1<<MD_SB_CLUSTERED);
6776  	info.active_disks  = insync;
6777  	info.working_disks = working;
6778  	info.failed_disks  = failed;
6779  	info.spare_disks   = spare;
6780  
6781  	info.layout        = mddev->layout;
6782  	info.chunk_size    = mddev->chunk_sectors << 9;
6783  
6784  	if (copy_to_user(arg, &info, sizeof(info)))
6785  		return -EFAULT;
6786  
6787  	return 0;
6788  }
6789  
6790  static int get_bitmap_file(struct mddev *mddev, void __user * arg)
6791  {
6792  	mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
6793  	char *ptr;
6794  	int err;
6795  
6796  	file = kzalloc(sizeof(*file), GFP_NOIO);
6797  	if (!file)
6798  		return -ENOMEM;
6799  
6800  	err = 0;
6801  	spin_lock(&mddev->lock);
6802  	/* bitmap enabled */
6803  	if (mddev->bitmap_info.file) {
6804  		ptr = file_path(mddev->bitmap_info.file, file->pathname,
6805  				sizeof(file->pathname));
6806  		if (IS_ERR(ptr))
6807  			err = PTR_ERR(ptr);
6808  		else
6809  			memmove(file->pathname, ptr,
6810  				sizeof(file->pathname)-(ptr-file->pathname));
6811  	}
6812  	spin_unlock(&mddev->lock);
6813  
6814  	if (err == 0 &&
6815  	    copy_to_user(arg, file, sizeof(*file)))
6816  		err = -EFAULT;
6817  
6818  	kfree(file);
6819  	return err;
6820  }
6821  
6822  static int get_disk_info(struct mddev *mddev, void __user * arg)
6823  {
6824  	mdu_disk_info_t info;
6825  	struct md_rdev *rdev;
6826  
6827  	if (copy_from_user(&info, arg, sizeof(info)))
6828  		return -EFAULT;
6829  
6830  	rcu_read_lock();
6831  	rdev = md_find_rdev_nr_rcu(mddev, info.number);
6832  	if (rdev) {
6833  		info.major = MAJOR(rdev->bdev->bd_dev);
6834  		info.minor = MINOR(rdev->bdev->bd_dev);
6835  		info.raid_disk = rdev->raid_disk;
6836  		info.state = 0;
6837  		if (test_bit(Faulty, &rdev->flags))
6838  			info.state |= (1<<MD_DISK_FAULTY);
6839  		else if (test_bit(In_sync, &rdev->flags)) {
6840  			info.state |= (1<<MD_DISK_ACTIVE);
6841  			info.state |= (1<<MD_DISK_SYNC);
6842  		}
6843  		if (test_bit(Journal, &rdev->flags))
6844  			info.state |= (1<<MD_DISK_JOURNAL);
6845  		if (test_bit(WriteMostly, &rdev->flags))
6846  			info.state |= (1<<MD_DISK_WRITEMOSTLY);
6847  		if (test_bit(FailFast, &rdev->flags))
6848  			info.state |= (1<<MD_DISK_FAILFAST);
6849  	} else {
6850  		info.major = info.minor = 0;
6851  		info.raid_disk = -1;
6852  		info.state = (1<<MD_DISK_REMOVED);
6853  	}
6854  	rcu_read_unlock();
6855  
6856  	if (copy_to_user(arg, &info, sizeof(info)))
6857  		return -EFAULT;
6858  
6859  	return 0;
6860  }
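
/*
 * Editorial aside (not part of md.c): a hypothetical userspace sketch of
 * GET_ARRAY_INFO/GET_DISK_INFO as served above.  get_disk_info() reports
 * major = minor = 0 with MD_DISK_REMOVED for unused slot numbers rather
 * than failing, so callers can simply probe a range of numbers; the probe
 * limit below is an assumption.
 */
#if 0	/* illustrative only */
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/raid/md_u.h>

static void dump_array(int md_fd)
{
	mdu_array_info_t array;
	mdu_disk_info_t disk;
	int n;

	if (ioctl(md_fd, GET_ARRAY_INFO, &array) < 0)
		return;
	printf("level %d, %d raid disks, %d working, %d failed\n",
	       array.level, array.raid_disks,
	       array.working_disks, array.failed_disks);

	for (n = 0; n < 128; n++) {
		disk.number = n;
		if (ioctl(md_fd, GET_DISK_INFO, &disk) < 0)
			break;
		if (!disk.major && !disk.minor)
			continue;	/* empty slot */
		printf("  slot %d: dev %d:%d raid_disk %d state 0x%x\n",
		       n, disk.major, disk.minor, disk.raid_disk, disk.state);
	}
}
#endif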
6861  
6862  int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
6863  {
6864  	struct md_rdev *rdev;
6865  	dev_t dev = MKDEV(info->major,info->minor);
6866  
6867  	if (mddev_is_clustered(mddev) &&
6868  		!(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) {
6869  		pr_warn("%s: Cannot add to clustered mddev.\n",
6870  			mdname(mddev));
6871  		return -EINVAL;
6872  	}
6873  
6874  	if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
6875  		return -EOVERFLOW;
6876  
6877  	if (!mddev->raid_disks) {
6878  		int err;
6879  		/* expecting a device which has a superblock */
6880  		rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
6881  		if (IS_ERR(rdev)) {
6882  			pr_warn("md: md_import_device returned %ld\n",
6883  				PTR_ERR(rdev));
6884  			return PTR_ERR(rdev);
6885  		}
6886  		if (!list_empty(&mddev->disks)) {
6887  			struct md_rdev *rdev0
6888  				= list_entry(mddev->disks.next,
6889  					     struct md_rdev, same_set);
6890  			err = super_types[mddev->major_version]
6891  				.load_super(rdev, rdev0, mddev->minor_version);
6892  			if (err < 0) {
6893  				pr_warn("md: %pg has different UUID to %pg\n",
6894  					rdev->bdev,
6895  					rdev0->bdev);
6896  				export_rdev(rdev, mddev);
6897  				return -EINVAL;
6898  			}
6899  		}
6900  		err = bind_rdev_to_array(rdev, mddev);
6901  		if (err)
6902  			export_rdev(rdev, mddev);
6903  		return err;
6904  	}
6905  
6906  	/*
6907  	 * md_add_new_disk can be used once the array is assembled
6908  	 * to add "hot spares".  They must already have a superblock
6909  	 * written
6910  	 */
6911  	if (mddev->pers) {
6912  		int err;
6913  		if (!mddev->pers->hot_add_disk) {
6914  			pr_warn("%s: personality does not support diskops!\n",
6915  				mdname(mddev));
6916  			return -EINVAL;
6917  		}
6918  		if (mddev->persistent)
6919  			rdev = md_import_device(dev, mddev->major_version,
6920  						mddev->minor_version);
6921  		else
6922  			rdev = md_import_device(dev, -1, -1);
6923  		if (IS_ERR(rdev)) {
6924  			pr_warn("md: md_import_device returned %ld\n",
6925  				PTR_ERR(rdev));
6926  			return PTR_ERR(rdev);
6927  		}
6928  		/* set saved_raid_disk if appropriate */
6929  		if (!mddev->persistent) {
6930  			if (info->state & (1<<MD_DISK_SYNC)  &&
6931  			    info->raid_disk < mddev->raid_disks) {
6932  				rdev->raid_disk = info->raid_disk;
6933  				clear_bit(Bitmap_sync, &rdev->flags);
6934  			} else
6935  				rdev->raid_disk = -1;
6936  			rdev->saved_raid_disk = rdev->raid_disk;
6937  		} else
6938  			super_types[mddev->major_version].
6939  				validate_super(mddev, NULL/*freshest*/, rdev);
6940  		if ((info->state & (1<<MD_DISK_SYNC)) &&
6941  		     rdev->raid_disk != info->raid_disk) {
6942  			/* This was a hot-add request, but the events don't
6943  			 * match, so reject it.
6944  			 */
6945  			export_rdev(rdev, mddev);
6946  			return -EINVAL;
6947  		}
6948  
6949  		clear_bit(In_sync, &rdev->flags); /* just to be sure */
6950  		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
6951  			set_bit(WriteMostly, &rdev->flags);
6952  		else
6953  			clear_bit(WriteMostly, &rdev->flags);
6954  		if (info->state & (1<<MD_DISK_FAILFAST))
6955  			set_bit(FailFast, &rdev->flags);
6956  		else
6957  			clear_bit(FailFast, &rdev->flags);
6958  
6959  		if (info->state & (1<<MD_DISK_JOURNAL)) {
6960  			struct md_rdev *rdev2;
6961  			bool has_journal = false;
6962  
6963  			/* make sure no existing journal disk */
6964  			rdev_for_each(rdev2, mddev) {
6965  				if (test_bit(Journal, &rdev2->flags)) {
6966  					has_journal = true;
6967  					break;
6968  				}
6969  			}
6970  			if (has_journal || mddev->bitmap) {
6971  				export_rdev(rdev, mddev);
6972  				return -EBUSY;
6973  			}
6974  			set_bit(Journal, &rdev->flags);
6975  		}
6976  		/*
6977  		 * check whether the device shows up in other nodes
6978  		 */
6979  		if (mddev_is_clustered(mddev)) {
6980  			if (info->state & (1 << MD_DISK_CANDIDATE))
6981  				set_bit(Candidate, &rdev->flags);
6982  			else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
6983  				/* --add initiated by this node */
6984  				err = mddev->cluster_ops->add_new_disk(mddev, rdev);
6985  				if (err) {
6986  					export_rdev(rdev, mddev);
6987  					return err;
6988  				}
6989  			}
6990  		}
6991  
6992  		rdev->raid_disk = -1;
6993  		err = bind_rdev_to_array(rdev, mddev);
6994  
6995  		if (err)
6996  			export_rdev(rdev, mddev);
6997  
6998  		if (mddev_is_clustered(mddev)) {
6999  			if (info->state & (1 << MD_DISK_CANDIDATE)) {
7000  				if (!err) {
7001  					err = mddev->cluster_ops->new_disk_ack(
7002  							mddev, err == 0);
7003  					if (err)
7004  						md_kick_rdev_from_array(rdev);
7005  				}
7006  			} else {
7007  				if (err)
7008  					mddev->cluster_ops->add_new_disk_cancel(mddev);
7009  				else
7010  					err = add_bound_rdev(rdev);
7011  			}
7012  
7013  		} else if (!err)
7014  			err = add_bound_rdev(rdev);
7015  
7016  		return err;
7017  	}
7018  
7019  	/* otherwise, md_add_new_disk is only allowed
7020  	 * for major_version==0 superblocks
7021  	 */
7022  	if (mddev->major_version != 0) {
7023  		pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev));
7024  		return -EINVAL;
7025  	}
7026  
7027  	if (!(info->state & (1<<MD_DISK_FAULTY))) {
7028  		int err;
7029  		rdev = md_import_device(dev, -1, 0);
7030  		if (IS_ERR(rdev)) {
7031  			pr_warn("md: error, md_import_device() returned %ld\n",
7032  				PTR_ERR(rdev));
7033  			return PTR_ERR(rdev);
7034  		}
7035  		rdev->desc_nr = info->number;
7036  		if (info->raid_disk < mddev->raid_disks)
7037  			rdev->raid_disk = info->raid_disk;
7038  		else
7039  			rdev->raid_disk = -1;
7040  
7041  		if (rdev->raid_disk < mddev->raid_disks)
7042  			if (info->state & (1<<MD_DISK_SYNC))
7043  				set_bit(In_sync, &rdev->flags);
7044  
7045  		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
7046  			set_bit(WriteMostly, &rdev->flags);
7047  		if (info->state & (1<<MD_DISK_FAILFAST))
7048  			set_bit(FailFast, &rdev->flags);
7049  
7050  		if (!mddev->persistent) {
7051  			pr_debug("md: nonpersistent superblock ...\n");
7052  			rdev->sb_start = bdev_nr_sectors(rdev->bdev);
7053  		} else
7054  			rdev->sb_start = calc_dev_sboffset(rdev);
7055  		rdev->sectors = rdev->sb_start;
7056  
7057  		err = bind_rdev_to_array(rdev, mddev);
7058  		if (err) {
7059  			export_rdev(rdev, mddev);
7060  			return err;
7061  		}
7062  	}
7063  
7064  	return 0;
7065  }
7066  
7067  static int hot_remove_disk(struct mddev *mddev, dev_t dev)
7068  {
7069  	struct md_rdev *rdev;
7070  
7071  	if (!mddev->pers)
7072  		return -ENODEV;
7073  
7074  	rdev = find_rdev(mddev, dev);
7075  	if (!rdev)
7076  		return -ENXIO;
7077  
7078  	if (rdev->raid_disk < 0)
7079  		goto kick_rdev;
7080  
7081  	clear_bit(Blocked, &rdev->flags);
7082  	remove_and_add_spares(mddev, rdev);
7083  
7084  	if (rdev->raid_disk >= 0)
7085  		goto busy;
7086  
7087  kick_rdev:
7088  	if (mddev_is_clustered(mddev) &&
7089  	    mddev->cluster_ops->remove_disk(mddev, rdev))
7090  		goto busy;
7091  
7092  	md_kick_rdev_from_array(rdev);
7093  	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
7094  	if (!mddev->thread)
7095  		md_update_sb(mddev, 1);
7096  	md_new_event();
7097  
7098  	return 0;
7099  busy:
7100  	pr_debug("md: cannot remove active disk %pg from %s ...\n",
7101  		 rdev->bdev, mdname(mddev));
7102  	return -EBUSY;
7103  }
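
/*
 * Editorial aside (not part of md.c): HOT_ADD_DISK, HOT_REMOVE_DISK and
 * SET_DISK_FAULTY pass the component device number directly in the ioctl
 * argument (decoded by new_decode_dev() above) rather than through a
 * struct.  A hypothetical sketch, assuming major/minor values small enough
 * that the classic makedev() encoding matches what new_decode_dev()
 * expects; a removal may return -EBUSY and need retrying while the
 * personality releases the device.
 */
#if 0	/* illustrative only */
#include <sys/ioctl.h>
#include <sys/sysmacros.h>
#include <linux/raid/md_u.h>

static int fail_and_remove(int md_fd, unsigned int major, unsigned int minor)
{
	unsigned long dev = makedev(major, minor);

	if (ioctl(md_fd, SET_DISK_FAULTY, dev) < 0)
		return -1;
	return ioctl(md_fd, HOT_REMOVE_DISK, dev);
}
#endif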
7104  
7105  static int hot_add_disk(struct mddev *mddev, dev_t dev)
7106  {
7107  	int err;
7108  	struct md_rdev *rdev;
7109  
7110  	if (!mddev->pers)
7111  		return -ENODEV;
7112  
7113  	if (mddev->major_version != 0) {
7114  		pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n",
7115  			mdname(mddev));
7116  		return -EINVAL;
7117  	}
7118  	if (!mddev->pers->hot_add_disk) {
7119  		pr_warn("%s: personality does not support diskops!\n",
7120  			mdname(mddev));
7121  		return -EINVAL;
7122  	}
7123  
7124  	rdev = md_import_device(dev, -1, 0);
7125  	if (IS_ERR(rdev)) {
7126  		pr_warn("md: error, md_import_device() returned %ld\n",
7127  			PTR_ERR(rdev));
7128  		return -EINVAL;
7129  	}
7130  
7131  	if (mddev->persistent)
7132  		rdev->sb_start = calc_dev_sboffset(rdev);
7133  	else
7134  		rdev->sb_start = bdev_nr_sectors(rdev->bdev);
7135  
7136  	rdev->sectors = rdev->sb_start;
7137  
7138  	if (test_bit(Faulty, &rdev->flags)) {
7139  		pr_warn("md: can not hot-add faulty %pg disk to %s!\n",
7140  			rdev->bdev, mdname(mddev));
7141  		err = -EINVAL;
7142  		goto abort_export;
7143  	}
7144  
7145  	clear_bit(In_sync, &rdev->flags);
7146  	rdev->desc_nr = -1;
7147  	rdev->saved_raid_disk = -1;
7148  	err = bind_rdev_to_array(rdev, mddev);
7149  	if (err)
7150  		goto abort_export;
7151  
7152  	/*
7153  	 * The rest had better be atomic; we can have disk failures
7154  	 * noticed in interrupt contexts ...
7155  	 */
7156  
7157  	rdev->raid_disk = -1;
7158  
7159  	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
7160  	if (!mddev->thread)
7161  		md_update_sb(mddev, 1);
7162  	/*
7163  	 * Kick recovery, maybe this spare has to be added to the
7164  	 * array immediately.
7165  	 */
7166  	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7167  	md_new_event();
7168  	return 0;
7169  
7170  abort_export:
7171  	export_rdev(rdev, mddev);
7172  	return err;
7173  }
7174  
7175  static int set_bitmap_file(struct mddev *mddev, int fd)
7176  {
7177  	int err = 0;
7178  
7179  	if (mddev->pers) {
7180  		if (!mddev->pers->quiesce || !mddev->thread)
7181  			return -EBUSY;
7182  		if (mddev->recovery || mddev->sync_thread)
7183  			return -EBUSY;
7184  		/* we should be able to change the bitmap.. */
7185  	}
7186  
7187  	if (fd >= 0) {
7188  		struct inode *inode;
7189  		struct file *f;
7190  
7191  		if (mddev->bitmap || mddev->bitmap_info.file)
7192  			return -EEXIST; /* cannot add when bitmap is present */
7193  
7194  		if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) {
7195  			pr_warn("%s: bitmap files not supported by this kernel\n",
7196  				mdname(mddev));
7197  			return -EINVAL;
7198  		}
7199  		pr_warn("%s: using deprecated bitmap file support\n",
7200  			mdname(mddev));
7201  
7202  		f = fget(fd);
7203  
7204  		if (f == NULL) {
7205  			pr_warn("%s: error: failed to get bitmap file\n",
7206  				mdname(mddev));
7207  			return -EBADF;
7208  		}
7209  
7210  		inode = f->f_mapping->host;
7211  		if (!S_ISREG(inode->i_mode)) {
7212  			pr_warn("%s: error: bitmap file must be a regular file\n",
7213  				mdname(mddev));
7214  			err = -EBADF;
7215  		} else if (!(f->f_mode & FMODE_WRITE)) {
7216  			pr_warn("%s: error: bitmap file must be open for write\n",
7217  				mdname(mddev));
7218  			err = -EBADF;
7219  		} else if (atomic_read(&inode->i_writecount) != 1) {
7220  			pr_warn("%s: error: bitmap file is already in use\n",
7221  				mdname(mddev));
7222  			err = -EBUSY;
7223  		}
7224  		if (err) {
7225  			fput(f);
7226  			return err;
7227  		}
7228  		mddev->bitmap_info.file = f;
7229  		mddev->bitmap_info.offset = 0; /* file overrides offset */
7230  	} else if (mddev->bitmap == NULL)
7231  		return -ENOENT; /* cannot remove what isn't there */
7232  	err = 0;
7233  	if (mddev->pers) {
7234  		if (fd >= 0) {
7235  			err = mddev->bitmap_ops->create(mddev, -1);
7236  			if (!err)
7237  				err = mddev->bitmap_ops->load(mddev);
7238  
7239  			if (err) {
7240  				mddev->bitmap_ops->destroy(mddev);
7241  				fd = -1;
7242  			}
7243  		} else if (fd < 0) {
7244  			mddev->bitmap_ops->destroy(mddev);
7245  		}
7246  	}
7247  
7248  	if (fd < 0) {
7249  		struct file *f = mddev->bitmap_info.file;
7250  		if (f) {
7251  			spin_lock(&mddev->lock);
7252  			mddev->bitmap_info.file = NULL;
7253  			spin_unlock(&mddev->lock);
7254  			fput(f);
7255  		}
7256  	}
7257  
7258  	return err;
7259  }
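
/*
 * Editorial aside (not part of md.c): set_bitmap_file() takes a plain file
 * descriptor as the ioctl argument; fd >= 0 attaches an external bitmap
 * file, a negative fd detaches the current one.  A hypothetical sketch of
 * the attach side (the path is an assumption, and file bitmaps are
 * deprecated, as the pr_warn() above notes):
 */
#if 0	/* illustrative only */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/raid/md_u.h>

static int attach_bitmap_file(int md_fd, const char *path)
{
	int bm_fd = open(path, O_RDWR);

	if (bm_fd < 0)
		return -1;
	return ioctl(md_fd, SET_BITMAP_FILE, bm_fd);
}
#endif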
7260  
7261  /*
7262   * md_set_array_info is used in two different ways.
7263   * The original usage is when creating a new array.
7264   * In this usage, raid_disks is > 0 and it, together with
7265   *  level, size, not_persistent, layout and chunksize, determines the
7266   *  shape of the array.
7267   *  This will always create an array with a type-0.90.0 superblock.
7268   * The newer usage is when assembling an array.
7269   *  In this case raid_disks will be 0, and the major_version field is
7270   *  used to determine which style super-blocks are to be found on the devices.
7271   *  The minor and patch _version numbers are also kept in case the
7272   *  super_block handler wishes to interpret them.
7273   */
7274  int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info)
7275  {
7276  	if (info->raid_disks == 0) {
7277  		/* just setting version number for superblock loading */
7278  		if (info->major_version < 0 ||
7279  		    info->major_version >= ARRAY_SIZE(super_types) ||
7280  		    super_types[info->major_version].name == NULL) {
7281  			/* maybe try to auto-load a module? */
7282  			pr_warn("md: superblock version %d not known\n",
7283  				info->major_version);
7284  			return -EINVAL;
7285  		}
7286  		mddev->major_version = info->major_version;
7287  		mddev->minor_version = info->minor_version;
7288  		mddev->patch_version = info->patch_version;
7289  		mddev->persistent = !info->not_persistent;
7290  		/* ensure mddev_put doesn't delete this now that there
7291  		 * is some minimal configuration.
7292  		 */
7293  		mddev->ctime         = ktime_get_real_seconds();
7294  		return 0;
7295  	}
7296  	mddev->major_version = MD_MAJOR_VERSION;
7297  	mddev->minor_version = MD_MINOR_VERSION;
7298  	mddev->patch_version = MD_PATCHLEVEL_VERSION;
7299  	mddev->ctime         = ktime_get_real_seconds();
7300  
7301  	mddev->level         = info->level;
7302  	mddev->clevel[0]     = 0;
7303  	mddev->dev_sectors   = 2 * (sector_t)info->size;
7304  	mddev->raid_disks    = info->raid_disks;
7305  	/* don't set md_minor, it is determined by which /dev/md* was
7306  	 * opened
7307  	 */
7308  	if (info->state & (1<<MD_SB_CLEAN))
7309  		mddev->recovery_cp = MaxSector;
7310  	else
7311  		mddev->recovery_cp = 0;
7312  	mddev->persistent    = ! info->not_persistent;
7313  	mddev->external	     = 0;
7314  
7315  	mddev->layout        = info->layout;
7316  	if (mddev->level == 0)
7317  		/* Cannot trust RAID0 layout info here */
7318  		mddev->layout = -1;
7319  	mddev->chunk_sectors = info->chunk_size >> 9;
7320  
7321  	if (mddev->persistent) {
7322  		mddev->max_disks = MD_SB_DISKS;
7323  		mddev->flags = 0;
7324  		mddev->sb_flags = 0;
7325  	}
7326  	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
7327  
7328  	mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
7329  	mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
7330  	mddev->bitmap_info.offset = 0;
7331  
7332  	mddev->reshape_position = MaxSector;
7333  
7334  	/*
7335  	 * Generate a 128 bit UUID
7336  	 */
7337  	get_random_bytes(mddev->uuid, 16);
7338  
7339  	mddev->new_level = mddev->level;
7340  	mddev->new_chunk_sectors = mddev->chunk_sectors;
7341  	mddev->new_layout = mddev->layout;
7342  	mddev->delta_disks = 0;
7343  	mddev->reshape_backwards = 0;
7344  
7345  	return 0;
7346  }
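
/*
 * Editorial aside (not part of md.c): a hypothetical userspace sketch of
 * the "assembly" flavour of SET_ARRAY_INFO described above, where
 * raid_disks == 0 and only the version fields tell md which superblock
 * format to expect.  The version numbers are assumptions.
 */
#if 0	/* illustrative only */
#include <string.h>
#include <sys/ioctl.h>
#include <linux/raid/md_u.h>

static int prepare_assembly(int md_fd)
{
	mdu_array_info_t info;

	memset(&info, 0, sizeof(info));
	info.major_version = 0;		/* expect v0.90 superblocks ... */
	info.minor_version = 90;	/* ... to be loaded via ADD_NEW_DISK */
	return ioctl(md_fd, SET_ARRAY_INFO, &info);
}
#endif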
7347  
7348  void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
7349  {
7350  	lockdep_assert_held(&mddev->reconfig_mutex);
7351  
7352  	if (mddev->external_size)
7353  		return;
7354  
7355  	mddev->array_sectors = array_sectors;
7356  }
7357  EXPORT_SYMBOL(md_set_array_sectors);
7358  
7359  static int update_size(struct mddev *mddev, sector_t num_sectors)
7360  {
7361  	struct md_rdev *rdev;
7362  	int rv;
7363  	int fit = (num_sectors == 0);
7364  	sector_t old_dev_sectors = mddev->dev_sectors;
7365  
7366  	if (mddev->pers->resize == NULL)
7367  		return -EINVAL;
7368  	/* The "num_sectors" is the number of sectors of each device that
7369  	 * is used.  This can only make sense for arrays with redundancy.
7370  	 * linear and raid0 always use whatever space is available. We can only
7371  	 * consider changing this number if no resync or reconstruction is
7372  	 * happening, and if the new size is acceptable. It must fit before the
7373  	 * sb_start or, if that is <data_offset, it must fit before the size
7374  	 * of each device.  If num_sectors is zero, we find the largest size
7375  	 * that fits.
7376  	 */
7377  	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
7378  		return -EBUSY;
7379  	if (!md_is_rdwr(mddev))
7380  		return -EROFS;
7381  
7382  	rdev_for_each(rdev, mddev) {
7383  		sector_t avail = rdev->sectors;
7384  
7385  		if (fit && (num_sectors == 0 || num_sectors > avail))
7386  			num_sectors = avail;
7387  		if (avail < num_sectors)
7388  			return -ENOSPC;
7389  	}
7390  	rv = mddev->pers->resize(mddev, num_sectors);
7391  	if (!rv) {
7392  		if (mddev_is_clustered(mddev))
7393  			mddev->cluster_ops->update_size(mddev, old_dev_sectors);
7394  		else if (!mddev_is_dm(mddev))
7395  			set_capacity_and_notify(mddev->gendisk,
7396  						mddev->array_sectors);
7397  	}
7398  	return rv;
7399  }
7400  
7401  static int update_raid_disks(struct mddev *mddev, int raid_disks)
7402  {
7403  	int rv;
7404  	struct md_rdev *rdev;
7405  	/* change the number of raid disks */
7406  	if (mddev->pers->check_reshape == NULL)
7407  		return -EINVAL;
7408  	if (!md_is_rdwr(mddev))
7409  		return -EROFS;
7410  	if (raid_disks <= 0 ||
7411  	    (mddev->max_disks && raid_disks >= mddev->max_disks))
7412  		return -EINVAL;
7413  	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
7414  	    test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) ||
7415  	    mddev->reshape_position != MaxSector)
7416  		return -EBUSY;
7417  
7418  	rdev_for_each(rdev, mddev) {
7419  		if (mddev->raid_disks < raid_disks &&
7420  		    rdev->data_offset < rdev->new_data_offset)
7421  			return -EINVAL;
7422  		if (mddev->raid_disks > raid_disks &&
7423  		    rdev->data_offset > rdev->new_data_offset)
7424  			return -EINVAL;
7425  	}
7426  
7427  	mddev->delta_disks = raid_disks - mddev->raid_disks;
7428  	if (mddev->delta_disks < 0)
7429  		mddev->reshape_backwards = 1;
7430  	else if (mddev->delta_disks > 0)
7431  		mddev->reshape_backwards = 0;
7432  
7433  	rv = mddev->pers->check_reshape(mddev);
7434  	if (rv < 0) {
7435  		mddev->delta_disks = 0;
7436  		mddev->reshape_backwards = 0;
7437  	}
7438  	return rv;
7439  }
7440  
7441  static int get_cluster_ops(struct mddev *mddev)
7442  {
7443  	xa_lock(&md_submodule);
7444  	mddev->cluster_ops = xa_load(&md_submodule, ID_CLUSTER);
7445  	if (mddev->cluster_ops &&
7446  	    !try_module_get(mddev->cluster_ops->head.owner))
7447  		mddev->cluster_ops = NULL;
7448  	xa_unlock(&md_submodule);
7449  
7450  	return mddev->cluster_ops == NULL ? -ENOENT : 0;
7451  }
7452  
7453  static void put_cluster_ops(struct mddev *mddev)
7454  {
7455  	if (!mddev->cluster_ops)
7456  		return;
7457  
7458  	mddev->cluster_ops->leave(mddev);
7459  	module_put(mddev->cluster_ops->head.owner);
7460  	mddev->cluster_ops = NULL;
7461  }
7462  
7463  /*
7464   * update_array_info is used to change the configuration of an
7465   * on-line array.
7466   * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size
7467   * fields in the info are checked against the array.
7468   * Any differences that cannot be handled will cause an error.
7469   * Normally, only one change can be managed at a time.
7470   */
7471  static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
7472  {
7473  	int rv = 0;
7474  	int cnt = 0;
7475  	int state = 0;
7476  
7477  	/* calculate expected state, ignoring low bits */
7478  	if (mddev->bitmap && mddev->bitmap_info.offset)
7479  		state |= (1 << MD_SB_BITMAP_PRESENT);
7480  
7481  	if (mddev->major_version != info->major_version ||
7482  	    mddev->minor_version != info->minor_version ||
7483  /*	    mddev->patch_version != info->patch_version || */
7484  	    mddev->ctime         != info->ctime         ||
7485  	    mddev->level         != info->level         ||
7486  /*	    mddev->layout        != info->layout        || */
7487  	    mddev->persistent	 != !info->not_persistent ||
7488  	    mddev->chunk_sectors != info->chunk_size >> 9 ||
7489  	    /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
7490  	    ((state^info->state) & 0xfffffe00)
7491  		)
7492  		return -EINVAL;
7493  	/* Check there is only one change */
7494  	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
7495  		cnt++;
7496  	if (mddev->raid_disks != info->raid_disks)
7497  		cnt++;
7498  	if (mddev->layout != info->layout)
7499  		cnt++;
7500  	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
7501  		cnt++;
7502  	if (cnt == 0)
7503  		return 0;
7504  	if (cnt > 1)
7505  		return -EINVAL;
7506  
7507  	if (mddev->layout != info->layout) {
7508  		/* Change layout
7509  		 * we don't need to do anything at the md level, the
7510  		 * personality will take care of it all.
7511  		 */
7512  		if (mddev->pers->check_reshape == NULL)
7513  			return -EINVAL;
7514  		else {
7515  			mddev->new_layout = info->layout;
7516  			rv = mddev->pers->check_reshape(mddev);
7517  			if (rv)
7518  				mddev->new_layout = mddev->layout;
7519  			return rv;
7520  		}
7521  	}
7522  	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
7523  		rv = update_size(mddev, (sector_t)info->size * 2);
7524  
7525  	if (mddev->raid_disks    != info->raid_disks)
7526  		rv = update_raid_disks(mddev, info->raid_disks);
7527  
7528  	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
7529  		if (mddev->pers->quiesce == NULL || mddev->thread == NULL) {
7530  			rv = -EINVAL;
7531  			goto err;
7532  		}
7533  		if (mddev->recovery || mddev->sync_thread) {
7534  			rv = -EBUSY;
7535  			goto err;
7536  		}
7537  		if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
7538  			/* add the bitmap */
7539  			if (mddev->bitmap) {
7540  				rv = -EEXIST;
7541  				goto err;
7542  			}
7543  			if (mddev->bitmap_info.default_offset == 0) {
7544  				rv = -EINVAL;
7545  				goto err;
7546  			}
7547  			mddev->bitmap_info.offset =
7548  				mddev->bitmap_info.default_offset;
7549  			mddev->bitmap_info.space =
7550  				mddev->bitmap_info.default_space;
7551  			rv = mddev->bitmap_ops->create(mddev, -1);
7552  			if (!rv)
7553  				rv = mddev->bitmap_ops->load(mddev);
7554  
7555  			if (rv)
7556  				mddev->bitmap_ops->destroy(mddev);
7557  		} else {
7558  			struct md_bitmap_stats stats;
7559  
7560  			rv = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
7561  			if (rv)
7562  				goto err;
7563  
7564  			if (stats.file) {
7565  				rv = -EINVAL;
7566  				goto err;
7567  			}
7568  
7569  			if (mddev->bitmap_info.nodes) {
7570  				/* hold PW lock on all the bitmaps */
7571  				if (mddev->cluster_ops->lock_all_bitmaps(mddev) <= 0) {
7572  					pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n");
7573  					rv = -EPERM;
7574  					mddev->cluster_ops->unlock_all_bitmaps(mddev);
7575  					goto err;
7576  				}
7577  
7578  				mddev->bitmap_info.nodes = 0;
7579  				put_cluster_ops(mddev);
7580  				mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
7581  			}
7582  			mddev->bitmap_ops->destroy(mddev);
7583  			mddev->bitmap_info.offset = 0;
7584  		}
7585  	}
7586  	md_update_sb(mddev, 1);
7587  	return rv;
7588  err:
7589  	return rv;
7590  }
7591  
7592  static int set_disk_faulty(struct mddev *mddev, dev_t dev)
7593  {
7594  	struct md_rdev *rdev;
7595  	int err = 0;
7596  
7597  	if (mddev->pers == NULL)
7598  		return -ENODEV;
7599  
7600  	rcu_read_lock();
7601  	rdev = md_find_rdev_rcu(mddev, dev);
7602  	if (!rdev)
7603  		err =  -ENODEV;
7604  	else {
7605  		md_error(mddev, rdev);
7606  		if (test_bit(MD_BROKEN, &mddev->flags))
7607  			err = -EBUSY;
7608  	}
7609  	rcu_read_unlock();
7610  	return err;
7611  }
7612  
7613  /*
7614   * We have a problem here : there is no easy way to give a CHS
7615   * virtual geometry. We currently pretend that we have 2 heads and
7616   * 4 sectors per track (with a BIG number of cylinders...). This drives
7617   * dosfs just mad... ;-)
7618   */
7619  static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
7620  {
7621  	struct mddev *mddev = bdev->bd_disk->private_data;
7622  
7623  	geo->heads = 2;
7624  	geo->sectors = 4;
7625  	geo->cylinders = mddev->array_sectors / 8;
7626  	return 0;
7627  }
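
/*
 * Editorial aside (not part of md.c): the fake geometry above (2 heads,
 * 4 sectors per track, array_sectors / 8 cylinders) is what callers of the
 * standard HDIO_GETGEO block ioctl see.  A hypothetical sketch; note that
 * hd_geometry.cylinders is only 16 bits wide, so the value truncates for
 * large arrays.
 */
#if 0	/* illustrative only */
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/hdreg.h>

static void print_geometry(int md_fd)
{
	struct hd_geometry geo;

	if (ioctl(md_fd, HDIO_GETGEO, &geo) == 0)
		printf("C/H/S %u/%u/%u, start %lu\n",
		       geo.cylinders, geo.heads, geo.sectors, geo.start);
}
#endif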
7628  
7629  static inline int md_ioctl_valid(unsigned int cmd)
7630  {
7631  	switch (cmd) {
7632  	case GET_ARRAY_INFO:
7633  	case GET_DISK_INFO:
7634  	case RAID_VERSION:
7635  		return 0;
7636  	case ADD_NEW_DISK:
7637  	case GET_BITMAP_FILE:
7638  	case HOT_ADD_DISK:
7639  	case HOT_REMOVE_DISK:
7640  	case RESTART_ARRAY_RW:
7641  	case RUN_ARRAY:
7642  	case SET_ARRAY_INFO:
7643  	case SET_BITMAP_FILE:
7644  	case SET_DISK_FAULTY:
7645  	case STOP_ARRAY:
7646  	case STOP_ARRAY_RO:
7647  	case CLUSTERED_DISK_NACK:
7648  		if (!capable(CAP_SYS_ADMIN))
7649  			return -EACCES;
7650  		return 0;
7651  	default:
7652  		return -ENOTTY;
7653  	}
7654  }
7655  
7656  static bool md_ioctl_need_suspend(unsigned int cmd)
7657  {
7658  	switch (cmd) {
7659  	case ADD_NEW_DISK:
7660  	case HOT_ADD_DISK:
7661  	case HOT_REMOVE_DISK:
7662  	case SET_BITMAP_FILE:
7663  	case SET_ARRAY_INFO:
7664  		return true;
7665  	default:
7666  		return false;
7667  	}
7668  }
7669  
7670  static int __md_set_array_info(struct mddev *mddev, void __user *argp)
7671  {
7672  	mdu_array_info_t info;
7673  	int err;
7674  
7675  	if (!argp)
7676  		memset(&info, 0, sizeof(info));
7677  	else if (copy_from_user(&info, argp, sizeof(info)))
7678  		return -EFAULT;
7679  
7680  	if (mddev->pers) {
7681  		err = update_array_info(mddev, &info);
7682  		if (err)
7683  			pr_warn("md: couldn't update array info. %d\n", err);
7684  		return err;
7685  	}
7686  
7687  	if (!list_empty(&mddev->disks)) {
7688  		pr_warn("md: array %s already has disks!\n", mdname(mddev));
7689  		return -EBUSY;
7690  	}
7691  
7692  	if (mddev->raid_disks) {
7693  		pr_warn("md: array %s already initialised!\n", mdname(mddev));
7694  		return -EBUSY;
7695  	}
7696  
7697  	err = md_set_array_info(mddev, &info);
7698  	if (err)
7699  		pr_warn("md: couldn't set array info. %d\n", err);
7700  
7701  	return err;
7702  }
7703  
7704  static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
7705  			unsigned int cmd, unsigned long arg)
7706  {
7707  	int err = 0;
7708  	void __user *argp = (void __user *)arg;
7709  	struct mddev *mddev = NULL;
7710  
7711  	err = md_ioctl_valid(cmd);
7712  	if (err)
7713  		return err;
7714  
7715  	/*
7716  	 * Commands dealing with the RAID driver but not any
7717  	 * particular array:
7718  	 */
7719  	if (cmd == RAID_VERSION)
7720  		return get_version(argp);
7721  
7722  	/*
7723  	 * Commands creating/starting a new array:
7724  	 */
7725  
7726  	mddev = bdev->bd_disk->private_data;
7727  
7728  	/* Some actions do not require the mutex */
7729  	switch (cmd) {
7730  	case GET_ARRAY_INFO:
7731  		if (!mddev->raid_disks && !mddev->external)
7732  			return -ENODEV;
7733  		return get_array_info(mddev, argp);
7734  
7735  	case GET_DISK_INFO:
7736  		if (!mddev->raid_disks && !mddev->external)
7737  			return -ENODEV;
7738  		return get_disk_info(mddev, argp);
7739  
7740  	case SET_DISK_FAULTY:
7741  		return set_disk_faulty(mddev, new_decode_dev(arg));
7742  
7743  	case GET_BITMAP_FILE:
7744  		return get_bitmap_file(mddev, argp);
7745  	}
7746  
7747  	if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
7748  		/* Need to flush page cache, and ensure no-one else opens
7749  		 * and writes
7750  		 */
7751  		err = mddev_set_closing_and_sync_blockdev(mddev, 1);
7752  		if (err)
7753  			return err;
7754  	}
7755  
7756  	if (!md_is_rdwr(mddev))
7757  		flush_work(&mddev->sync_work);
7758  
7759  	err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev) :
7760  					   mddev_lock(mddev);
7761  	if (err) {
7762  		pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n",
7763  			 err, cmd);
7764  		goto out;
7765  	}
7766  
7767  	if (cmd == SET_ARRAY_INFO) {
7768  		err = __md_set_array_info(mddev, argp);
7769  		goto unlock;
7770  	}
7771  
7772  	/*
7773  	 * Commands querying/configuring an existing array:
7774  	 */
7775  	/* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
7776  	 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
7777  	if ((!mddev->raid_disks && !mddev->external)
7778  	    && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
7779  	    && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
7780  	    && cmd != GET_BITMAP_FILE) {
7781  		err = -ENODEV;
7782  		goto unlock;
7783  	}
7784  
7785  	/*
7786  	 * Commands even a read-only array can execute:
7787  	 */
7788  	switch (cmd) {
7789  	case RESTART_ARRAY_RW:
7790  		err = restart_array(mddev);
7791  		goto unlock;
7792  
7793  	case STOP_ARRAY:
7794  		err = do_md_stop(mddev, 0);
7795  		goto unlock;
7796  
7797  	case STOP_ARRAY_RO:
7798  		if (mddev->pers)
7799  			err = md_set_readonly(mddev);
7800  		goto unlock;
7801  
7802  	case HOT_REMOVE_DISK:
7803  		err = hot_remove_disk(mddev, new_decode_dev(arg));
7804  		goto unlock;
7805  
7806  	case ADD_NEW_DISK:
7807  		/* We can support ADD_NEW_DISK on read-only arrays
7808  		 * only if we are re-adding a preexisting device.
7809  		 * So require mddev->pers and MD_DISK_SYNC.
7810  		 */
7811  		if (mddev->pers) {
7812  			mdu_disk_info_t info;
7813  			if (copy_from_user(&info, argp, sizeof(info)))
7814  				err = -EFAULT;
7815  			else if (!(info.state & (1<<MD_DISK_SYNC)))
7816  				/* Need to clear read-only for this */
7817  				break;
7818  			else
7819  				err = md_add_new_disk(mddev, &info);
7820  			goto unlock;
7821  		}
7822  		break;
7823  	}
7824  
7825  	/*
7826  	 * The remaining ioctls are changing the state of the
7827  	 * superblock, so we do not allow them on read-only arrays.
7828  	 */
7829  	if (!md_is_rdwr(mddev) && mddev->pers) {
7830  		if (mddev->ro != MD_AUTO_READ) {
7831  			err = -EROFS;
7832  			goto unlock;
7833  		}
7834  		mddev->ro = MD_RDWR;
7835  		sysfs_notify_dirent_safe(mddev->sysfs_state);
7836  		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7837  		/* mddev_unlock will wake thread */
7838  		/* If a device failed while we were read-only, we
7839  		 * need to make sure the metadata is updated now.
7840  		 */
7841  		if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) {
7842  			mddev_unlock(mddev);
7843  			wait_event(mddev->sb_wait,
7844  				   !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) &&
7845  				   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
7846  			mddev_lock_nointr(mddev);
7847  		}
7848  	}
7849  
7850  	switch (cmd) {
7851  	case ADD_NEW_DISK:
7852  	{
7853  		mdu_disk_info_t info;
7854  		if (copy_from_user(&info, argp, sizeof(info)))
7855  			err = -EFAULT;
7856  		else
7857  			err = md_add_new_disk(mddev, &info);
7858  		goto unlock;
7859  	}
7860  
7861  	case CLUSTERED_DISK_NACK:
7862  		if (mddev_is_clustered(mddev))
7863  			mddev->cluster_ops->new_disk_ack(mddev, false);
7864  		else
7865  			err = -EINVAL;
7866  		goto unlock;
7867  
7868  	case HOT_ADD_DISK:
7869  		err = hot_add_disk(mddev, new_decode_dev(arg));
7870  		goto unlock;
7871  
7872  	case RUN_ARRAY:
7873  		err = do_md_run(mddev);
7874  		goto unlock;
7875  
7876  	case SET_BITMAP_FILE:
7877  		err = set_bitmap_file(mddev, (int)arg);
7878  		goto unlock;
7879  
7880  	default:
7881  		err = -EINVAL;
7882  		goto unlock;
7883  	}
7884  
7885  unlock:
7886  	if (mddev->hold_active == UNTIL_IOCTL &&
7887  	    err != -EINVAL)
7888  		mddev->hold_active = 0;
7889  
7890  	md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) :
7891  				     mddev_unlock(mddev);
7892  
7893  out:
7894  	if (cmd == STOP_ARRAY_RO || (err && cmd == STOP_ARRAY))
7895  		clear_bit(MD_CLOSING, &mddev->flags);
7896  	return err;
7897  }
7898  #ifdef CONFIG_COMPAT
7899  static int md_compat_ioctl(struct block_device *bdev, blk_mode_t mode,
7900  		    unsigned int cmd, unsigned long arg)
7901  {
7902  	switch (cmd) {
7903  	case HOT_REMOVE_DISK:
7904  	case HOT_ADD_DISK:
7905  	case SET_DISK_FAULTY:
7906  	case SET_BITMAP_FILE:
7907  		/* These take in integer arg, do not convert */
7908  		break;
7909  	default:
7910  		arg = (unsigned long)compat_ptr(arg);
7911  		break;
7912  	}
7913  
7914  	return md_ioctl(bdev, mode, cmd, arg);
7915  }
7916  #endif /* CONFIG_COMPAT */
7917  
7918  static int md_set_read_only(struct block_device *bdev, bool ro)
7919  {
7920  	struct mddev *mddev = bdev->bd_disk->private_data;
7921  	int err;
7922  
7923  	err = mddev_lock(mddev);
7924  	if (err)
7925  		return err;
7926  
7927  	if (!mddev->raid_disks && !mddev->external) {
7928  		err = -ENODEV;
7929  		goto out_unlock;
7930  	}
7931  
7932  	/*
7933  	 * Transitioning to read-auto need only happen for arrays that call
7934  	 * md_write_start and which are not ready for writes yet.
7935  	 */
7936  	if (!ro && mddev->ro == MD_RDONLY && mddev->pers) {
7937  		err = restart_array(mddev);
7938  		if (err)
7939  			goto out_unlock;
7940  		mddev->ro = MD_AUTO_READ;
7941  	}
7942  
7943  out_unlock:
7944  	mddev_unlock(mddev);
7945  	return err;
7946  }
7947  
7948  static int md_open(struct gendisk *disk, blk_mode_t mode)
7949  {
7950  	struct mddev *mddev;
7951  	int err;
7952  
7953  	spin_lock(&all_mddevs_lock);
7954  	mddev = mddev_get(disk->private_data);
7955  	spin_unlock(&all_mddevs_lock);
7956  	if (!mddev)
7957  		return -ENODEV;
7958  
7959  	err = mutex_lock_interruptible(&mddev->open_mutex);
7960  	if (err)
7961  		goto out;
7962  
7963  	err = -ENODEV;
7964  	if (test_bit(MD_CLOSING, &mddev->flags))
7965  		goto out_unlock;
7966  
7967  	atomic_inc(&mddev->openers);
7968  	mutex_unlock(&mddev->open_mutex);
7969  
7970  	disk_check_media_change(disk);
7971  	return 0;
7972  
7973  out_unlock:
7974  	mutex_unlock(&mddev->open_mutex);
7975  out:
7976  	mddev_put(mddev);
7977  	return err;
7978  }
7979  
7980  static void md_release(struct gendisk *disk)
7981  {
7982  	struct mddev *mddev = disk->private_data;
7983  
7984  	BUG_ON(!mddev);
7985  	atomic_dec(&mddev->openers);
7986  	mddev_put(mddev);
7987  }
7988  
7989  static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing)
7990  {
7991  	struct mddev *mddev = disk->private_data;
7992  	unsigned int ret = 0;
7993  
7994  	if (mddev->changed)
7995  		ret = DISK_EVENT_MEDIA_CHANGE;
7996  	mddev->changed = 0;
7997  	return ret;
7998  }
7999  
8000  static void md_free_disk(struct gendisk *disk)
8001  {
8002  	struct mddev *mddev = disk->private_data;
8003  
8004  	mddev_free(mddev);
8005  }
8006  
8007  const struct block_device_operations md_fops =
8008  {
8009  	.owner		= THIS_MODULE,
8010  	.submit_bio	= md_submit_bio,
8011  	.open		= md_open,
8012  	.release	= md_release,
8013  	.ioctl		= md_ioctl,
8014  #ifdef CONFIG_COMPAT
8015  	.compat_ioctl	= md_compat_ioctl,
8016  #endif
8017  	.getgeo		= md_getgeo,
8018  	.check_events	= md_check_events,
8019  	.set_read_only	= md_set_read_only,
8020  	.free_disk	= md_free_disk,
8021  };
8022  
8023  static int md_thread(void *arg)
8024  {
8025  	struct md_thread *thread = arg;
8026  
8027  	/*
8028  	 * md_thread is a 'system-thread'; its priority should be very
8029  	 * high. We avoid resource deadlocks individually in each
8030  	 * raid personality. (RAID5 does preallocation) We also use RR and
8031  	 * the very same RT priority as kswapd, thus we will never get
8032  	 * into a priority inversion deadlock.
8033  	 *
8034  	 * we definitely have to have equal or higher priority than
8035  	 * bdflush, otherwise bdflush will deadlock if there are too
8036  	 * many dirty RAID5 blocks.
8037  	 */
8038  
8039  	allow_signal(SIGKILL);
8040  	while (!kthread_should_stop()) {
8041  
8042  		/* We need to wait INTERRUPTIBLE so that
8043  		 * we don't add to the load-average.
8044  		 * That means we need to be sure no signals are
8045  		 * pending
8046  		 */
8047  		if (signal_pending(current))
8048  			flush_signals(current);
8049  
8050  		wait_event_interruptible_timeout
8051  			(thread->wqueue,
8052  			 test_bit(THREAD_WAKEUP, &thread->flags)
8053  			 || kthread_should_stop() || kthread_should_park(),
8054  			 thread->timeout);
8055  
8056  		clear_bit(THREAD_WAKEUP, &thread->flags);
8057  		if (kthread_should_park())
8058  			kthread_parkme();
8059  		if (!kthread_should_stop())
8060  			thread->run(thread);
8061  	}
8062  
8063  	return 0;
8064  }
8065  
8066  static void md_wakeup_thread_directly(struct md_thread __rcu *thread)
8067  {
8068  	struct md_thread *t;
8069  
8070  	rcu_read_lock();
8071  	t = rcu_dereference(thread);
8072  	if (t)
8073  		wake_up_process(t->tsk);
8074  	rcu_read_unlock();
8075  }
8076  
8077  void md_wakeup_thread(struct md_thread __rcu *thread)
8078  {
8079  	struct md_thread *t;
8080  
8081  	rcu_read_lock();
8082  	t = rcu_dereference(thread);
8083  	if (t) {
8084  		pr_debug("md: waking up MD thread %s.\n", t->tsk->comm);
8085  		set_bit(THREAD_WAKEUP, &t->flags);
8086  		if (wq_has_sleeper(&t->wqueue))
8087  			wake_up(&t->wqueue);
8088  	}
8089  	rcu_read_unlock();
8090  }
8091  EXPORT_SYMBOL(md_wakeup_thread);
8092  
8093  struct md_thread *md_register_thread(void (*run) (struct md_thread *),
8094  		struct mddev *mddev, const char *name)
8095  {
8096  	struct md_thread *thread;
8097  
8098  	thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL);
8099  	if (!thread)
8100  		return NULL;
8101  
8102  	init_waitqueue_head(&thread->wqueue);
8103  
8104  	thread->run = run;
8105  	thread->mddev = mddev;
8106  	thread->timeout = MAX_SCHEDULE_TIMEOUT;
8107  	thread->tsk = kthread_run(md_thread, thread,
8108  				  "%s_%s",
8109  				  mdname(thread->mddev),
8110  				  name);
8111  	if (IS_ERR(thread->tsk)) {
8112  		kfree(thread);
8113  		return NULL;
8114  	}
8115  	return thread;
8116  }
8117  EXPORT_SYMBOL(md_register_thread);
8118  
8119  void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp)
8120  {
8121  	struct md_thread *thread = rcu_dereference_protected(*threadp,
8122  					lockdep_is_held(&mddev->reconfig_mutex));
8123  
8124  	if (!thread)
8125  		return;
8126  
8127  	rcu_assign_pointer(*threadp, NULL);
8128  	synchronize_rcu();
8129  
8130  	pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
8131  	kthread_stop(thread->tsk);
8132  	kfree(thread);
8133  }
8134  EXPORT_SYMBOL(md_unregister_thread);
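
/*
 * Editorial aside (not part of md.c): a hypothetical sketch of how a
 * personality might drive the thread helpers above.  The personality name
 * and daemon function are invented; real personalities follow roughly this
 * pattern from their ->run and ->free methods (the latter with
 * reconfig_mutex held, as md_unregister_thread() expects).
 */
#if 0	/* illustrative only */
static void exampled(struct md_thread *thread)
{
	struct mddev *mddev = thread->mddev;

	/* drain queued work for mddev, then return; md_thread() loops */
}

static int example_run(struct mddev *mddev)
{
	struct md_thread *t = md_register_thread(exampled, mddev, "example");

	if (!t)
		return -ENOMEM;
	rcu_assign_pointer(mddev->thread, t);
	return 0;
}

static void example_free(struct mddev *mddev, void *priv)
{
	md_unregister_thread(mddev, &mddev->thread);
}
#endif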
8135  
8136  void md_error(struct mddev *mddev, struct md_rdev *rdev)
8137  {
8138  	if (!rdev || test_bit(Faulty, &rdev->flags))
8139  		return;
8140  
8141  	if (!mddev->pers || !mddev->pers->error_handler)
8142  		return;
8143  	mddev->pers->error_handler(mddev, rdev);
8144  
8145  	if (mddev->pers->head.id == ID_RAID0 ||
8146  	    mddev->pers->head.id == ID_LINEAR)
8147  		return;
8148  
8149  	if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags))
8150  		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
8151  	sysfs_notify_dirent_safe(rdev->sysfs_state);
8152  	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8153  	if (!test_bit(MD_BROKEN, &mddev->flags)) {
8154  		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8155  		md_wakeup_thread(mddev->thread);
8156  	}
8157  	if (mddev->event_work.func)
8158  		queue_work(md_misc_wq, &mddev->event_work);
8159  	md_new_event();
8160  }
8161  EXPORT_SYMBOL(md_error);
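
/*
 * Editorial aside (not part of md.c): personalities report a failed member
 * by calling md_error() above, typically from an I/O completion handler.
 * A hypothetical sketch; the bi_private convention is invented.
 */
#if 0	/* illustrative only */
static void example_end_write(struct bio *bio)
{
	struct md_rdev *rdev = bio->bi_private;

	if (bio->bi_status)
		md_error(rdev->mddev, rdev);	/* error_handler may mark it Faulty */
	bio_put(bio);
}
#endif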
8162  
8163  /* seq_file implementation /proc/mdstat */
8164  
8165  static void status_unused(struct seq_file *seq)
8166  {
8167  	int i = 0;
8168  	struct md_rdev *rdev;
8169  
8170  	seq_printf(seq, "unused devices: ");
8171  
8172  	list_for_each_entry(rdev, &pending_raid_disks, same_set) {
8173  		i++;
8174  		seq_printf(seq, "%pg ", rdev->bdev);
8175  	}
8176  	if (!i)
8177  		seq_printf(seq, "<none>");
8178  
8179  	seq_printf(seq, "\n");
8180  }
8181  
8182  static void status_personalities(struct seq_file *seq)
8183  {
8184  	struct md_submodule_head *head;
8185  	unsigned long i;
8186  
8187  	seq_puts(seq, "Personalities : ");
8188  
8189  	xa_lock(&md_submodule);
8190  	xa_for_each(&md_submodule, i, head)
8191  		if (head->type == MD_PERSONALITY)
8192  			seq_printf(seq, "[%s] ", head->name);
8193  	xa_unlock(&md_submodule);
8194  
8195  	seq_puts(seq, "\n");
8196  }
8197  
8198  static int status_resync(struct seq_file *seq, struct mddev *mddev)
8199  {
8200  	sector_t max_sectors, resync, res;
8201  	unsigned long dt, db = 0;
8202  	sector_t rt, curr_mark_cnt, resync_mark_cnt;
8203  	int scale, recovery_active;
8204  	unsigned int per_milli;
8205  
8206  	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
8207  	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
8208  		max_sectors = mddev->resync_max_sectors;
8209  	else
8210  		max_sectors = mddev->dev_sectors;
8211  
8212  	resync = mddev->curr_resync;
8213  	if (resync < MD_RESYNC_ACTIVE) {
8214  		if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
8215  			/* Still cleaning up */
8216  			resync = max_sectors;
8217  	} else if (resync > max_sectors) {
8218  		resync = max_sectors;
8219  	} else {
8220  		res = atomic_read(&mddev->recovery_active);
8221  		/*
8222  		 * Resync has started, but the subtraction has overflowed or
8223  		 * yielded one of the special values. Force it to active to
8224  		 * ensure the status reports an active resync.
8225  		 */
8226  		if (resync < res || resync - res < MD_RESYNC_ACTIVE)
8227  			resync = MD_RESYNC_ACTIVE;
8228  		else
8229  			resync -= res;
8230  	}
8231  
8232  	if (resync == MD_RESYNC_NONE) {
8233  		if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) {
8234  			struct md_rdev *rdev;
8235  
8236  			rdev_for_each(rdev, mddev)
8237  				if (rdev->raid_disk >= 0 &&
8238  				    !test_bit(Faulty, &rdev->flags) &&
8239  				    rdev->recovery_offset != MaxSector &&
8240  				    rdev->recovery_offset) {
8241  					seq_printf(seq, "\trecover=REMOTE");
8242  					return 1;
8243  				}
8244  			if (mddev->reshape_position != MaxSector)
8245  				seq_printf(seq, "\treshape=REMOTE");
8246  			else
8247  				seq_printf(seq, "\tresync=REMOTE");
8248  			return 1;
8249  		}
8250  		if (mddev->recovery_cp < MaxSector) {
8251  			seq_printf(seq, "\tresync=PENDING");
8252  			return 1;
8253  		}
8254  		return 0;
8255  	}
8256  	if (resync < MD_RESYNC_ACTIVE) {
8257  		seq_printf(seq, "\tresync=DELAYED");
8258  		return 1;
8259  	}
8260  
8261  	WARN_ON(max_sectors == 0);
8262  	/* Pick 'scale' such that (resync>>scale)*1000 will fit
8263  	 * in a sector_t, and (max_sectors>>scale) will fit in a
8264  	 * u32, as those are the requirements for sector_div.
8265  	 * Thus 'scale' must be at least 10
8266  	 */
8267  	scale = 10;
8268  	if (sizeof(sector_t) > sizeof(unsigned long)) {
8269  		while ( max_sectors/2 > (1ULL<<(scale+32)))
8270  			scale++;
8271  	}
8272  	res = (resync>>scale)*1000;
8273  	sector_div(res, (u32)((max_sectors>>scale)+1));
8274  
8275  	per_milli = res;
8276  	{
8277  		int i, x = per_milli/50, y = 20-x;
8278  		seq_printf(seq, "[");
8279  		for (i = 0; i < x; i++)
8280  			seq_printf(seq, "=");
8281  		seq_printf(seq, ">");
8282  		for (i = 0; i < y; i++)
8283  			seq_printf(seq, ".");
8284  		seq_printf(seq, "] ");
8285  	}
8286  	seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
8287  		   (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
8288  		    "reshape" :
8289  		    (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
8290  		     "check" :
8291  		     (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
8292  		      "resync" : "recovery"))),
8293  		   per_milli/10, per_milli % 10,
8294  		   (unsigned long long) resync/2,
8295  		   (unsigned long long) max_sectors/2);
8296  
8297  	/*
8298  	 * dt: time from mark until now
8299  	 * db: blocks written from mark until now
8300  	 * rt: remaining time
8301  	 *
8302  	 * rt is a sector_t, which is always 64bit now. We are keeping
8303  	 * the original algorithm, but it is not really necessary.
8304  	 *
8305  	 * Original algorithm:
8306  	 *   So we divide before multiply in case it is 32bit and close
8307  	 *   to the limit.
8308  	 *   We scale the divisor (db) by 32 to avoid losing precision
8309  	 *   near the end of resync when the number of remaining sectors
8310  	 *   is close to 'db'.
8311  	 *   We then divide rt by 32 after multiplying by db to compensate.
8312  	 *   The '+1' avoids division by zero if db is very small.
8313  	 */
8314  	dt = ((jiffies - mddev->resync_mark) / HZ);
8315  	if (!dt) dt++;
8316  
8317  	curr_mark_cnt = mddev->curr_mark_cnt;
8318  	recovery_active = atomic_read(&mddev->recovery_active);
8319  	resync_mark_cnt = mddev->resync_mark_cnt;
8320  
8321  	if (curr_mark_cnt >= (recovery_active + resync_mark_cnt))
8322  		db = curr_mark_cnt - (recovery_active + resync_mark_cnt);
8323  
8324  	rt = max_sectors - resync;    /* number of remaining sectors */
8325  	rt = div64_u64(rt, db/32+1);
8326  	rt *= dt;
8327  	rt >>= 5;
8328  
8329  	seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
8330  		   ((unsigned long)rt % 60)/6);
8331  
8332  	seq_printf(seq, " speed=%ldK/sec", db/2/dt);
8333  	return 1;
8334  }
8335  
8336  static void *md_seq_start(struct seq_file *seq, loff_t *pos)
8337  	__acquires(&all_mddevs_lock)
8338  {
8339  	seq->poll_event = atomic_read(&md_event_count);
8340  	spin_lock(&all_mddevs_lock);
8341  
8342  	return seq_list_start_head(&all_mddevs, *pos);
8343  }
8344  
8345  static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
8346  {
8347  	return seq_list_next(v, &all_mddevs, pos);
8348  }
8349  
8350  static void md_seq_stop(struct seq_file *seq, void *v)
8351  	__releases(&all_mddevs_lock)
8352  {
8353  	spin_unlock(&all_mddevs_lock);
8354  }
8355  
8356  static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev)
8357  {
8358  	struct md_bitmap_stats stats;
8359  	unsigned long used_pages;
8360  	unsigned long chunk_kb;
8361  	int err;
8362  
8363  	err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
8364  	if (err)
8365  		return;
8366  
8367  	chunk_kb = mddev->bitmap_info.chunksize >> 10;
8368  	used_pages = stats.pages - stats.missing_pages;
8369  
8370  	seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], %lu%s chunk",
8371  		   used_pages, stats.pages, used_pages << (PAGE_SHIFT - 10),
8372  		   chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize,
8373  		   chunk_kb ? "KB" : "B");
8374  
8375  	if (stats.file) {
8376  		seq_puts(seq, ", file: ");
8377  		seq_file_path(seq, stats.file, " \t\n");
8378  	}
8379  
8380  	seq_putc(seq, '\n');
8381  }
8382  
8383  static int md_seq_show(struct seq_file *seq, void *v)
8384  {
8385  	struct mddev *mddev;
8386  	sector_t sectors;
8387  	struct md_rdev *rdev;
8388  
8389  	if (v == &all_mddevs) {
8390  		status_personalities(seq);
8391  		if (list_empty(&all_mddevs))
8392  			status_unused(seq);
8393  		return 0;
8394  	}
8395  
8396  	mddev = list_entry(v, struct mddev, all_mddevs);
8397  	if (!mddev_get(mddev))
8398  		return 0;
8399  
8400  	spin_unlock(&all_mddevs_lock);
8401  
8402  	/* prevent bitmap to be freed after checking */
8403  	/* prevent the bitmap from being freed after checking */
8404  
8405  	spin_lock(&mddev->lock);
8406  	if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
8407  		seq_printf(seq, "%s : ", mdname(mddev));
8408  		if (mddev->pers) {
8409  			if (test_bit(MD_BROKEN, &mddev->flags))
8410  				seq_printf(seq, "broken");
8411  			else
8412  				seq_printf(seq, "active");
8413  			if (mddev->ro == MD_RDONLY)
8414  				seq_printf(seq, " (read-only)");
8415  			if (mddev->ro == MD_AUTO_READ)
8416  				seq_printf(seq, " (auto-read-only)");
8417  			seq_printf(seq, " %s", mddev->pers->head.name);
8418  		} else {
8419  			seq_printf(seq, "inactive");
8420  		}
8421  
8422  		sectors = 0;
8423  		rcu_read_lock();
8424  		rdev_for_each_rcu(rdev, mddev) {
8425  			seq_printf(seq, " %pg[%d]", rdev->bdev, rdev->desc_nr);
8426  
8427  			if (test_bit(WriteMostly, &rdev->flags))
8428  				seq_printf(seq, "(W)");
8429  			if (test_bit(Journal, &rdev->flags))
8430  				seq_printf(seq, "(J)");
8431  			if (test_bit(Faulty, &rdev->flags)) {
8432  				seq_printf(seq, "(F)");
8433  				continue;
8434  			}
8435  			if (rdev->raid_disk < 0)
8436  				seq_printf(seq, "(S)"); /* spare */
8437  			if (test_bit(Replacement, &rdev->flags))
8438  				seq_printf(seq, "(R)");
8439  			sectors += rdev->sectors;
8440  		}
8441  		rcu_read_unlock();
8442  
8443  		if (!list_empty(&mddev->disks)) {
8444  			if (mddev->pers)
8445  				seq_printf(seq, "\n      %llu blocks",
8446  					   (unsigned long long)
8447  					   mddev->array_sectors / 2);
8448  			else
8449  				seq_printf(seq, "\n      %llu blocks",
8450  					   (unsigned long long)sectors / 2);
8451  		}
8452  		if (mddev->persistent) {
8453  			if (mddev->major_version != 0 ||
8454  			    mddev->minor_version != 90) {
8455  				seq_printf(seq," super %d.%d",
8456  					   mddev->major_version,
8457  					   mddev->minor_version);
8458  			}
8459  		} else if (mddev->external)
8460  			seq_printf(seq, " super external:%s",
8461  				   mddev->metadata_type);
8462  		else
8463  			seq_printf(seq, " super non-persistent");
8464  
8465  		if (mddev->pers) {
8466  			mddev->pers->status(seq, mddev);
8467  			seq_printf(seq, "\n      ");
8468  			if (mddev->pers->sync_request) {
8469  				if (status_resync(seq, mddev))
8470  					seq_printf(seq, "\n      ");
8471  			}
8472  		} else
8473  			seq_printf(seq, "\n       ");
8474  
8475  		md_bitmap_status(seq, mddev);
8476  
8477  		seq_printf(seq, "\n");
8478  	}
8479  	spin_unlock(&mddev->lock);
8480  	mutex_unlock(&mddev->bitmap_info.mutex);
8481  	spin_lock(&all_mddevs_lock);
8482  
8483  	if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs))
8484  		status_unused(seq);
8485  
8486  	mddev_put_locked(mddev);
8487  	return 0;
8488  }
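
/*
 * Editorial aside (not part of md.c): the seq_printf() calls above produce
 * /proc/mdstat output along the lines of the hypothetical example below;
 * the devices, sizes and progress figures are invented, and the "[2/2] [UU]"
 * portion comes from the personality's ->status method.
 *
 *   Personalities : [raid1] [raid6] [raid5] [raid4]
 *   md0 : active raid1 sdb1[1] sda1[0]
 *         1048512 blocks super 1.2 [2/2] [UU]
 *         [===>.................]  resync = 17.3% (181760/1048512) finish=0.6min speed=22720K/sec
 *         bitmap: 1/1 pages [4KB], 65536KB chunk
 *
 *   unused devices: <none>
 */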
8489  
8490  static const struct seq_operations md_seq_ops = {
8491  	.start  = md_seq_start,
8492  	.next   = md_seq_next,
8493  	.stop   = md_seq_stop,
8494  	.show   = md_seq_show,
8495  };
8496  
8497  static int md_seq_open(struct inode *inode, struct file *file)
8498  {
8499  	struct seq_file *seq;
8500  	int error;
8501  
8502  	error = seq_open(file, &md_seq_ops);
8503  	if (error)
8504  		return error;
8505  
8506  	seq = file->private_data;
8507  	seq->poll_event = atomic_read(&md_event_count);
8508  	return error;
8509  }
8510  
8511  static int md_unloading;
8512  static __poll_t mdstat_poll(struct file *filp, poll_table *wait)
8513  {
8514  	struct seq_file *seq = filp->private_data;
8515  	__poll_t mask;
8516  
8517  	if (md_unloading)
8518  		return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
8519  	poll_wait(filp, &md_event_waiters, wait);
8520  
8521  	/* always allow read */
8522  	mask = EPOLLIN | EPOLLRDNORM;
8523  
8524  	if (seq->poll_event != atomic_read(&md_event_count))
8525  		mask |= EPOLLERR | EPOLLPRI;
8526  	return mask;
8527  }
8528  
8529  static const struct proc_ops mdstat_proc_ops = {
8530  	.proc_open	= md_seq_open,
8531  	.proc_read	= seq_read,
8532  	.proc_lseek	= seq_lseek,
8533  	.proc_release	= seq_release,
8534  	.proc_poll	= mdstat_poll,
8535  };
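
/*
 * Editorial aside (not part of md.c): mdstat_poll() above reports
 * EPOLLERR | EPOLLPRI once md_event_count moves past the value sampled
 * when /proc/mdstat was opened (or last read), so a monitor can sleep in
 * poll() and re-read on every array event.  A hypothetical sketch:
 */
#if 0	/* illustrative only */
#include <poll.h>
#include <fcntl.h>
#include <unistd.h>

static void wait_for_md_event(void)
{
	char buf[4096];
	struct pollfd pfd = { .events = POLLPRI };

	pfd.fd = open("/proc/mdstat", O_RDONLY);
	if (pfd.fd < 0)
		return;
	read(pfd.fd, buf, sizeof(buf));	/* refresh the latched event count */
	poll(&pfd, 1, -1);		/* wakes on the next md event */
	close(pfd.fd);
}
#endif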
8536  
8537  int register_md_submodule(struct md_submodule_head *msh)
8538  {
8539  	return xa_insert(&md_submodule, msh->id, msh, GFP_KERNEL);
8540  }
8541  EXPORT_SYMBOL_GPL(register_md_submodule);
8542  
8543  void unregister_md_submodule(struct md_submodule_head *msh)
8544  {
8545  	xa_erase(&md_submodule, msh->id);
8546  }
8547  EXPORT_SYMBOL_GPL(unregister_md_submodule);
8548  
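/*
 * Take a reference on the md-cluster operations, loading the module if
 * necessary, and join the cluster with the requested number of nodes.
 * On success the safemode delay is cleared for the clustered array.
 */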
8549  int md_setup_cluster(struct mddev *mddev, int nodes)
8550  {
8551  	int ret = get_cluster_ops(mddev);
8552  
8553  	if (ret) {
8554  		request_module("md-cluster");
8555  		ret = get_cluster_ops(mddev);
8556  	}
8557  
8558  	/* ensure module won't be unloaded */
8559  	if (ret) {
8560  		pr_warn("can't find md-cluster module or get its reference.\n");
8561  		return ret;
8562  	}
8563  
8564  	ret = mddev->cluster_ops->join(mddev, nodes);
8565  	if (!ret)
8566  		mddev->safemode_delay = 0;
8567  	return ret;
8568  }
8569  
8570  void md_cluster_stop(struct mddev *mddev)
8571  {
8572  	put_cluster_ops(mddev);
8573  }
8574  
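/*
 * Return 1 if no non-resync I/O has hit the member devices since the last
 * check, 0 otherwise.  A non-zero 'init' (re)initialises the per-rdev event
 * baseline; see the comment in the body for how sync_io is balanced against
 * the block-layer statistics.
 */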
8575  static int is_mddev_idle(struct mddev *mddev, int init)
8576  {
8577  	struct md_rdev *rdev;
8578  	int idle;
8579  	int curr_events;
8580  
8581  	idle = 1;
8582  	rcu_read_lock();
8583  	rdev_for_each_rcu(rdev, mddev) {
8584  		struct gendisk *disk = rdev->bdev->bd_disk;
8585  
8586  		if (!init && !blk_queue_io_stat(disk->queue))
8587  			continue;
8588  
8589  		curr_events = (int)part_stat_read_accum(disk->part0, sectors) -
8590  			      atomic_read(&disk->sync_io);
8591  		/* sync IO will cause sync_io to increase before the disk_stats
8592  		 * as sync_io is counted when a request starts, and
8593  		 * disk_stats is counted when it completes.
8594  		 * So resync activity will cause curr_events to be smaller than
8595  		 * when there was no such activity.
8596  		 * non-sync IO will cause disk_stat to increase without
8597  		 * increasing sync_io so curr_events will (eventually)
8598  		 * be larger than it was before.  Once it becomes
8599  		 * substantially larger, the test below will cause
8600  		 * the array to appear non-idle, and resync will slow
8601  		 * down.
8602  		 * If there is a lot of outstanding resync activity when
8603  		 * we set last_event to curr_events, then all that activity
8604  		 * completing might cause the array to appear non-idle
8605  		 * and resync will be slowed down even though there might
8606  		 * not have been non-resync activity.  This will only
8607  		 * happen once though.  'last_events' will soon reflect
8608  		 * the state where there is little or no outstanding
8609  		 * resync requests, and further resync activity will
8610  		 * always make curr_events less than last_events.
8611  		 *
8612  		 */
8613  		if (init || curr_events - rdev->last_events > 64) {
8614  			rdev->last_events = curr_events;
8615  			idle = 0;
8616  		}
8617  	}
8618  	rcu_read_unlock();
8619  	return idle;
8620  }
8621  
8622  void md_done_sync(struct mddev *mddev, int blocks, int ok)
8623  {
8624  	/* another "blocks" (512byte) blocks have been synced */
8625  	atomic_sub(blocks, &mddev->recovery_active);
8626  	wake_up(&mddev->recovery_wait);
8627  	if (!ok) {
8628  		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8629  		set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
8630  		md_wakeup_thread(mddev->thread);
8631  		// stop recovery, signal do_sync ....
8632  	}
8633  }
8634  EXPORT_SYMBOL(md_done_sync);
8635  
8636  /* md_write_start(mddev, bi)
8637   * If we need to update some array metadata (e.g. 'active' flag
8638   * in superblock) before writing, schedule a superblock update
8639   * and wait for it to complete.
8640   * (Historically this returned 'false' when the write could not be recorded
8641   * because the array was being suspended; the function is now void.)
8642   */
8643  void md_write_start(struct mddev *mddev, struct bio *bi)
8644  {
8645  	int did_change = 0;
8646  
8647  	if (bio_data_dir(bi) != WRITE)
8648  		return;
8649  
8650  	BUG_ON(mddev->ro == MD_RDONLY);
8651  	if (mddev->ro == MD_AUTO_READ) {
8652  		/* need to switch to read/write */
8653  		mddev->ro = MD_RDWR;
8654  		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8655  		md_wakeup_thread(mddev->thread);
8656  		md_wakeup_thread(mddev->sync_thread);
8657  		did_change = 1;
8658  	}
8659  	rcu_read_lock();
8660  	percpu_ref_get(&mddev->writes_pending);
8661  	smp_mb(); /* Match smp_mb in set_in_sync() */
8662  	if (mddev->safemode == 1)
8663  		mddev->safemode = 0;
8664  	/* sync_checkers is always 0 when writes_pending is in per-cpu mode */
8665  	if (mddev->in_sync || mddev->sync_checkers) {
8666  		spin_lock(&mddev->lock);
8667  		if (mddev->in_sync) {
8668  			mddev->in_sync = 0;
8669  			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8670  			set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8671  			md_wakeup_thread(mddev->thread);
8672  			did_change = 1;
8673  		}
8674  		spin_unlock(&mddev->lock);
8675  	}
8676  	rcu_read_unlock();
8677  	if (did_change)
8678  		sysfs_notify_dirent_safe(mddev->sysfs_state);
8679  	if (!mddev->has_superblocks)
8680  		return;
8681  	wait_event(mddev->sb_wait,
8682  		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
8683  }
8684  EXPORT_SYMBOL(md_write_start);
8685  
8686  /* md_write_inc can only be called when md_write_start() has
8687   * already been called at least once for the current request.
8688   * It increments the counter and is useful when a single request
8689   * is split into several parts.  Each part causes an increment and
8690   * so needs a matching md_write_end().
8691   * Unlike md_write_start(), it is safe to call md_write_inc() inside
8692   * a spinlocked region.
8693   */
8694  void md_write_inc(struct mddev *mddev, struct bio *bi)
8695  {
8696  	if (bio_data_dir(bi) != WRITE)
8697  		return;
8698  	WARN_ON_ONCE(mddev->in_sync || !md_is_rdwr(mddev));
8699  	percpu_ref_get(&mddev->writes_pending);
8700  }
8701  EXPORT_SYMBOL(md_write_inc);
8702  
8703  void md_write_end(struct mddev *mddev)
8704  {
8705  	percpu_ref_put(&mddev->writes_pending);
8706  
8707  	if (mddev->safemode == 2)
8708  		md_wakeup_thread(mddev->thread);
8709  	else if (mddev->safemode_delay)
8710  		/* The roundup() ensures this only performs locking once
8711  		 * every ->safemode_delay jiffies
8712  		 */
8713  		mod_timer(&mddev->safemode_timer,
8714  			  roundup(jiffies, mddev->safemode_delay) +
8715  			  mddev->safemode_delay);
8716  }
8717  
8718  EXPORT_SYMBOL(md_write_end);
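
/*
 * Illustrative pairing of the write-accounting helpers above (a sketch,
 * not copied from any particular personality):
 *
 *	md_write_start(mddev, bio);	// before the first sub-write
 *	md_write_inc(mddev, bio);	// for each additional split
 *	...submit the cloned bios...
 *	md_write_end(mddev);		// once per start/inc as each completes
 */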
8719  
8720  /* This is used by raid0 and raid10 */
8721  void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
8722  			struct bio *bio, sector_t start, sector_t size)
8723  {
8724  	struct bio *discard_bio = NULL;
8725  
8726  	if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO,
8727  			&discard_bio) || !discard_bio)
8728  		return;
8729  
8730  	bio_chain(discard_bio, bio);
8731  	bio_clone_blkg_association(discard_bio, bio);
8732  	mddev_trace_remap(mddev, discard_bio, bio->bi_iter.bi_sector);
8733  	submit_bio_noacct(discard_bio);
8734  }
8735  EXPORT_SYMBOL_GPL(md_submit_discard_bio);
8736  
8737  static void md_bitmap_start(struct mddev *mddev,
8738  			    struct md_io_clone *md_io_clone)
8739  {
8740  	if (mddev->pers->bitmap_sector)
8741  		mddev->pers->bitmap_sector(mddev, &md_io_clone->offset,
8742  					   &md_io_clone->sectors);
8743  
8744  	mddev->bitmap_ops->startwrite(mddev, md_io_clone->offset,
8745  				      md_io_clone->sectors);
8746  }
8747  
8748  static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone)
8749  {
8750  	mddev->bitmap_ops->endwrite(mddev, md_io_clone->offset,
8751  				    md_io_clone->sectors);
8752  }
8753  
8754  static void md_end_clone_io(struct bio *bio)
8755  {
8756  	struct md_io_clone *md_io_clone = bio->bi_private;
8757  	struct bio *orig_bio = md_io_clone->orig_bio;
8758  	struct mddev *mddev = md_io_clone->mddev;
8759  
8760  	if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap)
8761  		md_bitmap_end(mddev, md_io_clone);
8762  
8763  	if (bio->bi_status && !orig_bio->bi_status)
8764  		orig_bio->bi_status = bio->bi_status;
8765  
8766  	if (md_io_clone->start_time)
8767  		bio_end_io_acct(orig_bio, md_io_clone->start_time);
8768  
8769  	bio_put(bio);
8770  	bio_endio(orig_bio);
8771  	percpu_ref_put(&mddev->active_io);
8772  }
8773  
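/*
 * Replace *bio with a clone whose completion we intercept: the embedded
 * struct md_io_clone carries I/O accounting state and, for writes, the
 * bitmap start/end bookkeeping.  The original bio is completed from
 * md_end_clone_io() once the clone finishes.
 */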
8774  static void md_clone_bio(struct mddev *mddev, struct bio **bio)
8775  {
8776  	struct block_device *bdev = (*bio)->bi_bdev;
8777  	struct md_io_clone *md_io_clone;
8778  	struct bio *clone =
8779  		bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set);
8780  
8781  	md_io_clone = container_of(clone, struct md_io_clone, bio_clone);
8782  	md_io_clone->orig_bio = *bio;
8783  	md_io_clone->mddev = mddev;
8784  	if (blk_queue_io_stat(bdev->bd_disk->queue))
8785  		md_io_clone->start_time = bio_start_io_acct(*bio);
8786  
8787  	if (bio_data_dir(*bio) == WRITE && mddev->bitmap) {
8788  		md_io_clone->offset = (*bio)->bi_iter.bi_sector;
8789  		md_io_clone->sectors = bio_sectors(*bio);
8790  		md_bitmap_start(mddev, md_io_clone);
8791  	}
8792  
8793  	clone->bi_end_io = md_end_clone_io;
8794  	clone->bi_private = md_io_clone;
8795  	*bio = clone;
8796  }
8797  
8798  void md_account_bio(struct mddev *mddev, struct bio **bio)
8799  {
8800  	percpu_ref_get(&mddev->active_io);
8801  	md_clone_bio(mddev, bio);
8802  }
8803  EXPORT_SYMBOL_GPL(md_account_bio);
8804  
8805  void md_free_cloned_bio(struct bio *bio)
8806  {
8807  	struct md_io_clone *md_io_clone = bio->bi_private;
8808  	struct bio *orig_bio = md_io_clone->orig_bio;
8809  	struct mddev *mddev = md_io_clone->mddev;
8810  
8811  	if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap)
8812  		md_bitmap_end(mddev, md_io_clone);
8813  
8814  	if (bio->bi_status && !orig_bio->bi_status)
8815  		orig_bio->bi_status = bio->bi_status;
8816  
8817  	if (md_io_clone->start_time)
8818  		bio_end_io_acct(orig_bio, md_io_clone->start_time);
8819  
8820  	bio_put(bio);
8821  	percpu_ref_put(&mddev->active_io);
8822  }
8823  EXPORT_SYMBOL_GPL(md_free_cloned_bio);
8824  
8825  /* md_allow_write(mddev)
8826   * Calling this ensures that the array is marked 'active' so that writes
8827   * may proceed without blocking.  It is important to call this before
8828   * attempting a GFP_KERNEL allocation while holding the mddev lock.
8829   * Must be called with mddev_lock held.
8830   */
8831  void md_allow_write(struct mddev *mddev)
8832  {
8833  	if (!mddev->pers)
8834  		return;
8835  	if (!md_is_rdwr(mddev))
8836  		return;
8837  	if (!mddev->pers->sync_request)
8838  		return;
8839  
8840  	spin_lock(&mddev->lock);
8841  	if (mddev->in_sync) {
8842  		mddev->in_sync = 0;
8843  		set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8844  		set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8845  		if (mddev->safemode_delay &&
8846  		    mddev->safemode == 0)
8847  			mddev->safemode = 1;
8848  		spin_unlock(&mddev->lock);
8849  		md_update_sb(mddev, 0);
8850  		sysfs_notify_dirent_safe(mddev->sysfs_state);
8851  		/* wait for the dirty state to be recorded in the metadata */
8852  		wait_event(mddev->sb_wait,
8853  			   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
8854  	} else
8855  		spin_unlock(&mddev->lock);
8856  }
8857  EXPORT_SYMBOL_GPL(md_allow_write);
8858  
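/*
 * Number of sectors the chosen sync action must cover: the array size
 * (resync_max_sectors) for resync/check/repair/reshape, the per-device
 * size (dev_sectors) for recovery.  resync_mismatches is reset for the
 * actions that compare copies.
 */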
8859  static sector_t md_sync_max_sectors(struct mddev *mddev,
8860  				    enum sync_action action)
8861  {
8862  	switch (action) {
8863  	case ACTION_RESYNC:
8864  	case ACTION_CHECK:
8865  	case ACTION_REPAIR:
8866  		atomic64_set(&mddev->resync_mismatches, 0);
8867  		fallthrough;
8868  	case ACTION_RESHAPE:
8869  		return mddev->resync_max_sectors;
8870  	case ACTION_RECOVER:
8871  		return mddev->dev_sectors;
8872  	default:
8873  		return 0;
8874  	}
8875  }
8876  
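/*
 * Starting offset for the chosen sync action: resync_min for check/repair,
 * the recovery checkpoint for a resync without a bitmap, a previously saved
 * reshape_position when continuing a clustered reshape, and the lowest
 * recovery_offset of any out-of-sync rdev for recovery.
 */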
8877  static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
8878  {
8879  	sector_t start = 0;
8880  	struct md_rdev *rdev;
8881  
8882  	switch (action) {
8883  	case ACTION_CHECK:
8884  	case ACTION_REPAIR:
8885  		return mddev->resync_min;
8886  	case ACTION_RESYNC:
8887  		if (!mddev->bitmap)
8888  			return mddev->recovery_cp;
8889  		return 0;
8890  	case ACTION_RESHAPE:
8891  		/*
8892  		 * If the original node aborts reshaping then we continue the
8893  		 * reshaping, so set again to avoid restart reshape from the
8894  		 * first beginning
8895  		 */
8896  		if (mddev_is_clustered(mddev) &&
8897  		    mddev->reshape_position != MaxSector)
8898  			return mddev->reshape_position;
8899  		return 0;
8900  	case ACTION_RECOVER:
8901  		start = MaxSector;
8902  		rcu_read_lock();
8903  		rdev_for_each_rcu(rdev, mddev)
8904  			if (rdev->raid_disk >= 0 &&
8905  			    !test_bit(Journal, &rdev->flags) &&
8906  			    !test_bit(Faulty, &rdev->flags) &&
8907  			    !test_bit(In_sync, &rdev->flags) &&
8908  			    rdev->recovery_offset < start)
8909  				start = rdev->recovery_offset;
8910  		rcu_read_unlock();
8911  
8912  		/* If there is a bitmap, we need to make sure all
8913  		 * writes that started before we added a spare
8914  		 * complete before we start doing a recovery.
8915  		 * Otherwise the write might complete and (via
8916  		 * bitmap_endwrite) set a bit in the bitmap after the
8917  		 * recovery has checked that bit and skipped that
8918  		 * region.
8919  		 */
8920  		if (mddev->bitmap) {
8921  			mddev->pers->quiesce(mddev, 1);
8922  			mddev->pers->quiesce(mddev, 0);
8923  		}
8924  		return start;
8925  	default:
8926  		return MaxSector;
8927  	}
8928  }
8929  
8930  #define SYNC_MARKS	10
8931  #define	SYNC_MARK_STEP	(3*HZ)
8932  #define UPDATE_FREQUENCY (5*60*HZ)
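/*
 * Body of the per-array sync thread.  It first yields to any conflicting
 * array that shares physical devices, then repeatedly calls the
 * personality's ->sync_request() while throttling between speed_min() and
 * speed_max(), and finally records a checkpoint or marks the sync complete.
 */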
8933  void md_do_sync(struct md_thread *thread)
8934  {
8935  	struct mddev *mddev = thread->mddev;
8936  	struct mddev *mddev2;
8937  	unsigned int currspeed = 0, window;
8938  	sector_t max_sectors,j, io_sectors, recovery_done;
8939  	unsigned long mark[SYNC_MARKS];
8940  	unsigned long update_time;
8941  	sector_t mark_cnt[SYNC_MARKS];
8942  	int last_mark,m;
8943  	sector_t last_check;
8944  	int skipped = 0;
8945  	struct md_rdev *rdev;
8946  	enum sync_action action;
8947  	const char *desc;
8948  	struct blk_plug plug;
8949  	int ret;
8950  
8951  	/* just in case the thread restarts... */
8952  	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
8953  		return;
8954  
8955  	if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8956  		goto skip;
8957  
8958  	if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) ||
8959  	    !md_is_rdwr(mddev)) {/* never try to sync a read-only array */
8960  		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8961  		goto skip;
8962  	}
8963  
8964  	if (mddev_is_clustered(mddev)) {
8965  		ret = mddev->cluster_ops->resync_start(mddev);
8966  		if (ret)
8967  			goto skip;
8968  
8969  		set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags);
8970  		if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
8971  			test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ||
8972  			test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
8973  		     && ((unsigned long long)mddev->curr_resync_completed
8974  			 < (unsigned long long)mddev->resync_max_sectors))
8975  			goto skip;
8976  	}
8977  
8978  	action = md_sync_action(mddev);
8979  	desc = md_sync_action_name(action);
8980  	mddev->last_sync_action = action;
8981  
8982  	/*
8983  	 * Before starting a resync we must have set curr_resync to
8984  	 * MD_RESYNC_DELAYED, and then checked that every "conflicting" array has curr_resync
8985  	 * less than ours.  When we find one that is the same or higher
8986  	 * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
8987  	 * to MD_RESYNC_YIELDED if we choose to yield (based arbitrarily on the address of the mddev structure).
8988  	 * This will mean we have to start checking from the beginning again.
8989  	 *
8990  	 */
8991  	if (mddev_is_clustered(mddev))
8992  		mddev->cluster_ops->resync_start_notify(mddev);
8993  	do {
8994  		int mddev2_minor = -1;
8995  		mddev->curr_resync = MD_RESYNC_DELAYED;
8996  
8997  	try_again:
8998  		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8999  			goto skip;
9000  		spin_lock(&all_mddevs_lock);
9001  		list_for_each_entry(mddev2, &all_mddevs, all_mddevs) {
9002  			if (test_bit(MD_DELETED, &mddev2->flags))
9003  				continue;
9004  			if (mddev2 == mddev)
9005  				continue;
9006  			if (!mddev->parallel_resync
9007  			&&  mddev2->curr_resync
9008  			&&  match_mddev_units(mddev, mddev2)) {
9009  				DEFINE_WAIT(wq);
9010  				if (mddev < mddev2 &&
9011  				    mddev->curr_resync == MD_RESYNC_DELAYED) {
9012  					/* arbitrarily yield */
9013  					mddev->curr_resync = MD_RESYNC_YIELDED;
9014  					wake_up(&resync_wait);
9015  				}
9016  				if (mddev > mddev2 &&
9017  				    mddev->curr_resync == MD_RESYNC_YIELDED)
9018  					/* no need to wait here, we can wait the next
9019  					 * time 'round when curr_resync == MD_RESYNC_DELAYED
9020  					 */
9021  					continue;
9022  				/* We need to wait 'interruptible' so as not to
9023  				 * contribute to the load average, and not to
9024  				 * be caught by 'softlockup'
9025  				 */
9026  				prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
9027  				if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
9028  				    mddev2->curr_resync >= mddev->curr_resync) {
9029  					if (mddev2_minor != mddev2->md_minor) {
9030  						mddev2_minor = mddev2->md_minor;
9031  						pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n",
9032  							desc, mdname(mddev),
9033  							mdname(mddev2));
9034  					}
9035  					spin_unlock(&all_mddevs_lock);
9036  
9037  					if (signal_pending(current))
9038  						flush_signals(current);
9039  					schedule();
9040  					finish_wait(&resync_wait, &wq);
9041  					goto try_again;
9042  				}
9043  				finish_wait(&resync_wait, &wq);
9044  			}
9045  		}
9046  		spin_unlock(&all_mddevs_lock);
9047  	} while (mddev->curr_resync < MD_RESYNC_DELAYED);
9048  
9049  	max_sectors = md_sync_max_sectors(mddev, action);
9050  	j = md_sync_position(mddev, action);
9051  
9052  	pr_info("md: %s of RAID array %s\n", desc, mdname(mddev));
9053  	pr_debug("md: minimum _guaranteed_  speed: %d KB/sec/disk.\n", speed_min(mddev));
9054  	pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n",
9055  		 speed_max(mddev), desc);
9056  
9057  	is_mddev_idle(mddev, 1); /* this initializes IO event counters */
9058  
9059  	io_sectors = 0;
9060  	for (m = 0; m < SYNC_MARKS; m++) {
9061  		mark[m] = jiffies;
9062  		mark_cnt[m] = io_sectors;
9063  	}
9064  	last_mark = 0;
9065  	mddev->resync_mark = mark[last_mark];
9066  	mddev->resync_mark_cnt = mark_cnt[last_mark];
9067  
9068  	/*
9069  	 * Tune reconstruction:
9070  	 */
9071  	window = 32 * (PAGE_SIZE / 512);
9072  	pr_debug("md: using %dk window, over a total of %lluk.\n",
9073  		 window/2, (unsigned long long)max_sectors/2);
9074  
9075  	atomic_set(&mddev->recovery_active, 0);
9076  	last_check = 0;
9077  
9078  	if (j >= MD_RESYNC_ACTIVE) {
9079  		pr_debug("md: resuming %s of %s from checkpoint.\n",
9080  			 desc, mdname(mddev));
9081  		mddev->curr_resync = j;
9082  	} else
9083  		mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */
9084  	mddev->curr_resync_completed = j;
9085  	sysfs_notify_dirent_safe(mddev->sysfs_completed);
9086  	md_new_event();
9087  	update_time = jiffies;
9088  
9089  	blk_start_plug(&plug);
9090  	while (j < max_sectors) {
9091  		sector_t sectors;
9092  
9093  		skipped = 0;
9094  
9095  		if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9096  		    ((mddev->curr_resync > mddev->curr_resync_completed &&
9097  		      (mddev->curr_resync - mddev->curr_resync_completed)
9098  		      > (max_sectors >> 4)) ||
9099  		     time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
9100  		     (j - mddev->curr_resync_completed)*2
9101  		     >= mddev->resync_max - mddev->curr_resync_completed ||
9102  		     mddev->curr_resync_completed > mddev->resync_max
9103  			    )) {
9104  			/* time to update curr_resync_completed */
9105  			wait_event(mddev->recovery_wait,
9106  				   atomic_read(&mddev->recovery_active) == 0);
9107  			mddev->curr_resync_completed = j;
9108  			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
9109  			    j > mddev->recovery_cp)
9110  				mddev->recovery_cp = j;
9111  			update_time = jiffies;
9112  			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
9113  			sysfs_notify_dirent_safe(mddev->sysfs_completed);
9114  		}
9115  
9116  		while (j >= mddev->resync_max &&
9117  		       !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
9118  			/* As this condition is controlled by user-space,
9119  			 * we can block indefinitely, so use '_interruptible'
9120  			 * to avoid triggering warnings.
9121  			 */
9122  			flush_signals(current); /* just in case */
9123  			wait_event_interruptible(mddev->recovery_wait,
9124  						 mddev->resync_max > j
9125  						 || test_bit(MD_RECOVERY_INTR,
9126  							     &mddev->recovery));
9127  		}
9128  
9129  		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9130  			break;
9131  
9132  		sectors = mddev->pers->sync_request(mddev, j, max_sectors,
9133  						    &skipped);
9134  		if (sectors == 0) {
9135  			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
9136  			break;
9137  		}
9138  
9139  		if (!skipped) { /* actual IO requested */
9140  			io_sectors += sectors;
9141  			atomic_add(sectors, &mddev->recovery_active);
9142  		}
9143  
9144  		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9145  			break;
9146  
9147  		j += sectors;
9148  		if (j > max_sectors)
9149  			/* when skipping, extra large numbers can be returned. */
9150  			j = max_sectors;
9151  		if (j >= MD_RESYNC_ACTIVE)
9152  			mddev->curr_resync = j;
9153  		mddev->curr_mark_cnt = io_sectors;
9154  		if (last_check == 0)
9155  			/* this is the earliest that rebuild will be
9156  			 * visible in /proc/mdstat
9157  			 */
9158  			md_new_event();
9159  
9160  		if (last_check + window > io_sectors || j == max_sectors)
9161  			continue;
9162  
9163  		last_check = io_sectors;
9164  	repeat:
9165  		if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
9166  			/* step marks */
9167  			int next = (last_mark+1) % SYNC_MARKS;
9168  
9169  			mddev->resync_mark = mark[next];
9170  			mddev->resync_mark_cnt = mark_cnt[next];
9171  			mark[next] = jiffies;
9172  			mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
9173  			last_mark = next;
9174  		}
9175  
9176  		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9177  			break;
9178  
9179  		/*
9180  		 * this loop exits only if either we are slower than
9181  		 * the 'hard' speed limit, or the system was IO-idle for
9182  		 * a jiffy.
9183  		 * the system might be non-idle CPU-wise, but we only care
9184  		 * about not overloading the IO subsystem. (things like an
9185  		 * e2fsck being done on the RAID array should execute fast)
9186  		 */
9187  		cond_resched();
9188  
9189  		recovery_done = io_sectors - atomic_read(&mddev->recovery_active);
9190  		currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2
9191  			/((jiffies-mddev->resync_mark)/HZ +1) +1;
9192  
9193  		if (currspeed > speed_min(mddev)) {
9194  			if (currspeed > speed_max(mddev)) {
9195  				msleep(500);
9196  				goto repeat;
9197  			}
9198  			if (!is_mddev_idle(mddev, 0)) {
9199  				/*
9200  				 * Give other IO more of a chance.
9201  				 * The faster the devices, the less we wait.
9202  				 */
9203  				wait_event(mddev->recovery_wait,
9204  					   !atomic_read(&mddev->recovery_active));
9205  			}
9206  		}
9207  	}
9208  	pr_info("md: %s: %s %s.\n",mdname(mddev), desc,
9209  		test_bit(MD_RECOVERY_INTR, &mddev->recovery)
9210  		? "interrupted" : "done");
9211  	/*
9212  	 * this also signals 'finished resyncing' to md_stop
9213  	 */
9214  	blk_finish_plug(&plug);
9215  	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
9216  
9217  	if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9218  	    !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
9219  	    mddev->curr_resync >= MD_RESYNC_ACTIVE) {
9220  		mddev->curr_resync_completed = mddev->curr_resync;
9221  		sysfs_notify_dirent_safe(mddev->sysfs_completed);
9222  	}
9223  	mddev->pers->sync_request(mddev, max_sectors, max_sectors, &skipped);
9224  
9225  	if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
9226  	    mddev->curr_resync > MD_RESYNC_ACTIVE) {
9227  		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
9228  			if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
9229  				if (mddev->curr_resync >= mddev->recovery_cp) {
9230  					pr_debug("md: checkpointing %s of %s.\n",
9231  						 desc, mdname(mddev));
9232  					if (test_bit(MD_RECOVERY_ERROR,
9233  						&mddev->recovery))
9234  						mddev->recovery_cp =
9235  							mddev->curr_resync_completed;
9236  					else
9237  						mddev->recovery_cp =
9238  							mddev->curr_resync;
9239  				}
9240  			} else
9241  				mddev->recovery_cp = MaxSector;
9242  		} else {
9243  			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9244  				mddev->curr_resync = MaxSector;
9245  			if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9246  			    test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
9247  				rcu_read_lock();
9248  				rdev_for_each_rcu(rdev, mddev)
9249  					if (rdev->raid_disk >= 0 &&
9250  					    mddev->delta_disks >= 0 &&
9251  					    !test_bit(Journal, &rdev->flags) &&
9252  					    !test_bit(Faulty, &rdev->flags) &&
9253  					    !test_bit(In_sync, &rdev->flags) &&
9254  					    rdev->recovery_offset < mddev->curr_resync)
9255  						rdev->recovery_offset = mddev->curr_resync;
9256  				rcu_read_unlock();
9257  			}
9258  		}
9259  	}
9260   skip:
9261  	/* set CHANGE_PENDING here since maybe another update is needed,
9262  	 * so other nodes are informed. It should be harmless for normal
9263  	 * raid */
9264  	set_mask_bits(&mddev->sb_flags, 0,
9265  		      BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));
9266  
9267  	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9268  			!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
9269  			mddev->delta_disks > 0 &&
9270  			mddev->pers->finish_reshape &&
9271  			mddev->pers->size &&
9272  			!mddev_is_dm(mddev)) {
9273  		mddev_lock_nointr(mddev);
9274  		md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
9275  		mddev_unlock(mddev);
9276  		if (!mddev_is_clustered(mddev))
9277  			set_capacity_and_notify(mddev->gendisk,
9278  						mddev->array_sectors);
9279  	}
9280  
9281  	spin_lock(&mddev->lock);
9282  	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
9283  		/* We completed so min/max setting can be forgotten if used. */
9284  		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
9285  			mddev->resync_min = 0;
9286  		mddev->resync_max = MaxSector;
9287  	} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
9288  		mddev->resync_min = mddev->curr_resync_completed;
9289  	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
9290  	mddev->curr_resync = MD_RESYNC_NONE;
9291  	spin_unlock(&mddev->lock);
9292  
9293  	wake_up(&resync_wait);
9294  	md_wakeup_thread(mddev->thread);
9295  	return;
9296  }
9297  EXPORT_SYMBOL_GPL(md_do_sync);
9298  
9299  static bool rdev_removeable(struct md_rdev *rdev)
9300  {
9301  	/* rdev is not used. */
9302  	if (rdev->raid_disk < 0)
9303  		return false;
9304  
9305  	/* There are still inflight io, don't remove this rdev. */
9306  	if (atomic_read(&rdev->nr_pending))
9307  		return false;
9308  
9309  	/*
9310  	 * An error occurred but has not yet been acknowledged by the metadata
9311  	 * handler, don't remove this rdev.
9312  	 */
9313  	if (test_bit(Blocked, &rdev->flags))
9314  		return false;
9315  
9316  	/* Faulty rdev is not used, it's safe to remove it. */
9317  	if (test_bit(Faulty, &rdev->flags))
9318  		return true;
9319  
9320  	/* Journal disk can only be removed if it's faulty. */
9321  	if (test_bit(Journal, &rdev->flags))
9322  		return false;
9323  
9324  	/*
9325  	 * 'In_sync' is cleared while 'raid_disk' is valid, which means
9326  	 * replacement has just become active from pers->spare_active(), and
9327  	 * then pers->hot_remove_disk() will replace this rdev with replacement.
9328  	 */
9329  	if (!test_bit(In_sync, &rdev->flags))
9330  		return true;
9331  
9332  	return false;
9333  }
9334  
9335  static bool rdev_is_spare(struct md_rdev *rdev)
9336  {
9337  	return !test_bit(Candidate, &rdev->flags) && rdev->raid_disk >= 0 &&
9338  	       !test_bit(In_sync, &rdev->flags) &&
9339  	       !test_bit(Journal, &rdev->flags) &&
9340  	       !test_bit(Faulty, &rdev->flags);
9341  }
9342  
9343  static bool rdev_addable(struct md_rdev *rdev)
9344  {
9345  	/* rdev is already used, don't add it again. */
9346  	if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 ||
9347  	    test_bit(Faulty, &rdev->flags))
9348  		return false;
9349  
9350  	/* Allow to add journal disk. */
9351  	if (test_bit(Journal, &rdev->flags))
9352  		return true;
9353  
9354  	/* Allow to add if array is read-write. */
9355  	if (md_is_rdwr(rdev->mddev))
9356  		return true;
9357  
9358  	/*
9359  	 * For read-only array, only allow to readd a rdev. And if bitmap is
9360  	 * For a read-only array, only allow re-adding an rdev. And if a bitmap
9361  	 * is used, don't allow re-adding an rdev that is too old.
9362  	if (rdev->saved_raid_disk >= 0 && !test_bit(Bitmap_sync, &rdev->flags))
9363  		return true;
9364  
9365  	return false;
9366  }
9367  
9368  static bool md_spares_need_change(struct mddev *mddev)
9369  {
9370  	struct md_rdev *rdev;
9371  
9372  	rcu_read_lock();
9373  	rdev_for_each_rcu(rdev, mddev) {
9374  		if (rdev_removeable(rdev) || rdev_addable(rdev)) {
9375  			rcu_read_unlock();
9376  			return true;
9377  		}
9378  	}
9379  	rcu_read_unlock();
9380  	return false;
9381  }
9382  
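/*
 * Remove rdevs the personality agrees to drop, then try to hot-add any
 * addable ones (all rdevs when 'this' is NULL, otherwise only 'this').
 * Returns the number of devices that still need recovery.
 */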
9383  static int remove_and_add_spares(struct mddev *mddev,
9384  				 struct md_rdev *this)
9385  {
9386  	struct md_rdev *rdev;
9387  	int spares = 0;
9388  	int removed = 0;
9389  
9390  	if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
9391  		/* Mustn't remove devices when resync thread is running */
9392  		return 0;
9393  
9394  	rdev_for_each(rdev, mddev) {
9395  		if ((this == NULL || rdev == this) && rdev_removeable(rdev) &&
9396  		    !mddev->pers->hot_remove_disk(mddev, rdev)) {
9397  			sysfs_unlink_rdev(mddev, rdev);
9398  			rdev->saved_raid_disk = rdev->raid_disk;
9399  			rdev->raid_disk = -1;
9400  			removed++;
9401  		}
9402  	}
9403  
9404  	if (removed && mddev->kobj.sd)
9405  		sysfs_notify_dirent_safe(mddev->sysfs_degraded);
9406  
9407  	if (this && removed)
9408  		goto no_add;
9409  
9410  	rdev_for_each(rdev, mddev) {
9411  		if (this && this != rdev)
9412  			continue;
9413  		if (rdev_is_spare(rdev))
9414  			spares++;
9415  		if (!rdev_addable(rdev))
9416  			continue;
9417  		if (!test_bit(Journal, &rdev->flags))
9418  			rdev->recovery_offset = 0;
9419  		if (mddev->pers->hot_add_disk(mddev, rdev) == 0) {
9420  			/* failure here is OK */
9421  			sysfs_link_rdev(mddev, rdev);
9422  			if (!test_bit(Journal, &rdev->flags))
9423  				spares++;
9424  			md_new_event();
9425  			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
9426  		}
9427  	}
9428  no_add:
9429  	if (removed)
9430  		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
9431  	return spares;
9432  }
9433  
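/*
 * Decide which sync action, if any, should run: resume a pending reshape,
 * resume an interrupted resync, start recovery when spares were added, or
 * leave resync/check/repair to be chosen later in md_do_sync().  Returns
 * true if a sync thread should be started.
 */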
9434  static bool md_choose_sync_action(struct mddev *mddev, int *spares)
9435  {
9436  	/* Check if reshape is in progress first. */
9437  	if (mddev->reshape_position != MaxSector) {
9438  		if (mddev->pers->check_reshape == NULL ||
9439  		    mddev->pers->check_reshape(mddev) != 0)
9440  			return false;
9441  
9442  		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
9443  		clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9444  		return true;
9445  	}
9446  
9447  	/* Check if resync is in progress. */
9448  	if (mddev->recovery_cp < MaxSector) {
9449  		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9450  		clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9451  		return true;
9452  	}
9453  
9454  	/*
9455  	 * Remove any failed drives, then add spares if possible. Spares are
9456  	 * also removed and re-added, to allow the personality to fail the
9457  	 * re-add.
9458  	 */
9459  	*spares = remove_and_add_spares(mddev, NULL);
9460  	if (*spares) {
9461  		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9462  		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
9463  		clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
9464  
9465  		/* Start new recovery. */
9466  		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9467  		return true;
9468  	}
9469  
9470  	/* Delay to choose resync/check/repair in md_do_sync(). */
9471  	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
9472  		return true;
9473  
9474  	/* Nothing to be done */
9475  	return false;
9476  }
9477  
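/*
 * Worker for mddev->sync_work: optionally suspends the array to add or
 * remove spares, picks an action via md_choose_sync_action(), and registers
 * the "resync"/"reshape" thread that runs md_do_sync().
 */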
9478  static void md_start_sync(struct work_struct *ws)
9479  {
9480  	struct mddev *mddev = container_of(ws, struct mddev, sync_work);
9481  	int spares = 0;
9482  	bool suspend = false;
9483  	char *name;
9484  
9485  	/*
9486  	 * If reshape is still in progress, spares won't be added or removed
9487  	 * from conf until reshape is done.
9488  	 */
9489  	if (mddev->reshape_position == MaxSector &&
9490  	    md_spares_need_change(mddev)) {
9491  		suspend = true;
9492  		mddev_suspend(mddev, false);
9493  	}
9494  
9495  	mddev_lock_nointr(mddev);
9496  	if (!md_is_rdwr(mddev)) {
9497  		/*
9498  		 * On a read-only array we can:
9499  		 * - remove failed devices
9500  		 * - add already-in_sync devices if the array itself is in-sync.
9501  		 * As we only add devices that are already in-sync, we can
9502  		 * activate the spares immediately.
9503  		 */
9504  		remove_and_add_spares(mddev, NULL);
9505  		goto not_running;
9506  	}
9507  
9508  	if (!md_choose_sync_action(mddev, &spares))
9509  		goto not_running;
9510  
9511  	if (!mddev->pers->sync_request)
9512  		goto not_running;
9513  
9514  	/*
9515  	 * We are adding a device or devices to an array which has the bitmap
9516  	 * stored on all devices. So make sure all bitmap pages get written.
9517  	 */
9518  	if (spares)
9519  		mddev->bitmap_ops->write_all(mddev);
9520  
9521  	name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ?
9522  			"reshape" : "resync";
9523  	rcu_assign_pointer(mddev->sync_thread,
9524  			   md_register_thread(md_do_sync, mddev, name));
9525  	if (!mddev->sync_thread) {
9526  		pr_warn("%s: could not start resync thread...\n",
9527  			mdname(mddev));
9528  		/* leave the spares where they are, it shouldn't hurt */
9529  		goto not_running;
9530  	}
9531  
9532  	mddev_unlock(mddev);
9533  	/*
9534  	 * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should
9535  	 * not set it again. Otherwise, we may cause issue like this one:
9536  	 *     https://bugzilla.kernel.org/show_bug.cgi?id=218200
9537  	 * Therefore, use __mddev_resume(mddev, false).
9538  	 */
9539  	if (suspend)
9540  		__mddev_resume(mddev, false);
9541  	md_wakeup_thread(mddev->sync_thread);
9542  	sysfs_notify_dirent_safe(mddev->sysfs_action);
9543  	md_new_event();
9544  	return;
9545  
9546  not_running:
9547  	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9548  	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
9549  	clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
9550  	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
9551  	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9552  	mddev_unlock(mddev);
9553  	/*
9554  	 * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should
9555  	 * not set it again. Otherwise, we may cause issue like this one:
9556  	 *     https://bugzilla.kernel.org/show_bug.cgi?id=218200
9557  	 * Therefore, use __mddev_resume(mddev, false).
9558  	 */
9559  	if (suspend)
9560  		__mddev_resume(mddev, false);
9561  
9562  	wake_up(&resync_wait);
9563  	if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
9564  	    mddev->sysfs_action)
9565  		sysfs_notify_dirent_safe(mddev->sysfs_action);
9566  }
9567  
9568  static void unregister_sync_thread(struct mddev *mddev)
9569  {
9570  	if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
9571  		/* resync/recovery still happening */
9572  		clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9573  		return;
9574  	}
9575  
9576  	if (WARN_ON_ONCE(!mddev->sync_thread))
9577  		return;
9578  
9579  	md_reap_sync_thread(mddev);
9580  }
9581  
9582  /*
9583   * This routine is regularly called by all per-raid-array threads to
9584   * deal with generic issues like resync and super-block update.
9585   * Raid personalities that don't have a thread (linear/raid0) do not
9586   * need this as they never do any recovery or update the superblock.
9587   *
9588   * It does not do any resync itself, but rather "forks" off other threads
9589   * to do that as needed.
9590   * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
9591   * "->recovery" and create a thread at ->sync_thread.
9592   * When the thread finishes it sets MD_RECOVERY_DONE
9593   * and wakes up this thread, which will reap the sync thread and finish up.
9594   * This thread also removes any faulty devices (with nr_pending == 0).
9595   *
9596   * The overall approach is:
9597   *  1/ if the superblock needs updating, update it.
9598   *  2/ If a recovery thread is running, don't do anything else.
9599   *  3/ If recovery has finished, clean up, possibly marking spares active.
9600   *  4/ If there are any faulty devices, remove them.
9601   *  5/ If array is degraded, try to add spare devices
9602   *  6/ If array has spares or is not in-sync, start a resync thread.
9603   */
9604  void md_check_recovery(struct mddev *mddev)
9605  {
9606  	if (mddev->bitmap)
9607  		mddev->bitmap_ops->daemon_work(mddev);
9608  
9609  	if (signal_pending(current)) {
9610  		if (mddev->pers->sync_request && !mddev->external) {
9611  			pr_debug("md: %s in immediate safe mode\n",
9612  				 mdname(mddev));
9613  			mddev->safemode = 2;
9614  		}
9615  		flush_signals(current);
9616  	}
9617  
9618  	if (!md_is_rdwr(mddev) &&
9619  	    !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) &&
9620  	    !test_bit(MD_RECOVERY_DONE, &mddev->recovery))
9621  		return;
9622  	if ( ! (
9623  		(mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
9624  		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
9625  		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
9626  		(mddev->external == 0 && mddev->safemode == 1) ||
9627  		(mddev->safemode == 2
9628  		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
9629  		))
9630  		return;
9631  
9632  	if (mddev_trylock(mddev)) {
9633  		bool try_set_sync = mddev->safemode != 0;
9634  
9635  		if (!mddev->external && mddev->safemode == 1)
9636  			mddev->safemode = 0;
9637  
9638  		if (!md_is_rdwr(mddev)) {
9639  			struct md_rdev *rdev;
9640  
9641  			if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
9642  				unregister_sync_thread(mddev);
9643  				goto unlock;
9644  			}
9645  
9646  			if (!mddev->external && mddev->in_sync)
9647  				/*
9648  				 * 'Blocked' flag not needed as failed devices
9649  				 * will be recorded if array switched to read/write.
9650  				 * Leaving it set will prevent the device
9651  				 * from being removed.
9652  				 */
9653  				rdev_for_each(rdev, mddev)
9654  					clear_bit(Blocked, &rdev->flags);
9655  
9656  			/*
9657  			 * There is no thread, but we need to call
9658  			 * ->spare_active and clear saved_raid_disk
9659  			 */
9660  			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
9661  			md_reap_sync_thread(mddev);
9662  
9663  			/*
9664  			 * Let md_start_sync() to remove and add rdevs to the
9665  			 * array.
9666  			 */
9667  			if (md_spares_need_change(mddev)) {
9668  				set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9669  				queue_work(md_misc_wq, &mddev->sync_work);
9670  			}
9671  
9672  			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9673  			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9674  			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
9675  
9676  			goto unlock;
9677  		}
9678  
9679  		if (mddev_is_clustered(mddev)) {
9680  			struct md_rdev *rdev, *tmp;
9681  			/* kick the device if another node issued a
9682  			 * remove disk.
9683  			 */
9684  			rdev_for_each_safe(rdev, tmp, mddev) {
9685  				if (test_and_clear_bit(ClusterRemove, &rdev->flags) &&
9686  						rdev->raid_disk < 0)
9687  					md_kick_rdev_from_array(rdev);
9688  			}
9689  		}
9690  
9691  		if (try_set_sync && !mddev->external && !mddev->in_sync) {
9692  			spin_lock(&mddev->lock);
9693  			set_in_sync(mddev);
9694  			spin_unlock(&mddev->lock);
9695  		}
9696  
9697  		if (mddev->sb_flags)
9698  			md_update_sb(mddev, 0);
9699  
9700  		/*
9701  		 * Never start a new sync thread if MD_RECOVERY_RUNNING is
9702  		 * still set.
9703  		 */
9704  		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
9705  			unregister_sync_thread(mddev);
9706  			goto unlock;
9707  		}
9708  
9709  		/* Set RUNNING before clearing NEEDED to avoid
9710  		 * any transients in the value of "sync_action".
9711  		 */
9712  		mddev->curr_resync_completed = 0;
9713  		spin_lock(&mddev->lock);
9714  		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9715  		spin_unlock(&mddev->lock);
9716  		/* Clear some bits that don't mean anything, but
9717  		 * might be left set
9718  		 */
9719  		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
9720  		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
9721  
9722  		if (test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) &&
9723  		    !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
9724  			queue_work(md_misc_wq, &mddev->sync_work);
9725  		} else {
9726  			clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9727  			wake_up(&resync_wait);
9728  		}
9729  
9730  	unlock:
9731  		wake_up(&mddev->sb_wait);
9732  		mddev_unlock(mddev);
9733  	}
9734  }
9735  EXPORT_SYMBOL(md_check_recovery);
9736  
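/*
 * Reap a finished sync thread: unregister it, activate spares on success,
 * finish any reshape, write the superblocks out and clear the recovery
 * state bits before signalling waiters.
 */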
9737  void md_reap_sync_thread(struct mddev *mddev)
9738  {
9739  	struct md_rdev *rdev;
9740  	sector_t old_dev_sectors = mddev->dev_sectors;
9741  	bool is_reshaped = false;
9742  
9743  	/* resync has finished, collect result */
9744  	md_unregister_thread(mddev, &mddev->sync_thread);
9745  	atomic_inc(&mddev->sync_seq);
9746  
9747  	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
9748  	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
9749  	    mddev->degraded != mddev->raid_disks) {
9750  		/* success...*/
9751  		/* activate any spares */
9752  		if (mddev->pers->spare_active(mddev)) {
9753  			sysfs_notify_dirent_safe(mddev->sysfs_degraded);
9754  			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
9755  		}
9756  	}
9757  	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9758  	    mddev->pers->finish_reshape) {
9759  		mddev->pers->finish_reshape(mddev);
9760  		if (mddev_is_clustered(mddev))
9761  			is_reshaped = true;
9762  	}
9763  
9764  	/* If array is no-longer degraded, then any saved_raid_disk
9765  	 * information must be scrapped.
9766  	 */
9767  	if (!mddev->degraded)
9768  		rdev_for_each(rdev, mddev)
9769  			rdev->saved_raid_disk = -1;
9770  
9771  	md_update_sb(mddev, 1);
9772  	/* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can
9773  	 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by
9774  	 * clustered raid */
9775  	if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
9776  		mddev->cluster_ops->resync_finish(mddev);
9777  	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9778  	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
9779  	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9780  	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
9781  	clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
9782  	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
9783  	/*
9784  	 * We call mddev->cluster_ops->update_size here because sync_size could
9785  	 * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared,
9786  	 * so it is time to update size across cluster.
9787  	 */
9788  	if (mddev_is_clustered(mddev) && is_reshaped
9789  				      && !test_bit(MD_CLOSING, &mddev->flags))
9790  		mddev->cluster_ops->update_size(mddev, old_dev_sectors);
9791  	/* flag recovery needed just to double check */
9792  	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9793  	sysfs_notify_dirent_safe(mddev->sysfs_completed);
9794  	sysfs_notify_dirent_safe(mddev->sysfs_action);
9795  	md_new_event();
9796  	if (mddev->event_work.func)
9797  		queue_work(md_misc_wq, &mddev->event_work);
9798  	wake_up(&resync_wait);
9799  }
9800  EXPORT_SYMBOL(md_reap_sync_thread);
9801  
9802  void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
9803  {
9804  	sysfs_notify_dirent_safe(rdev->sysfs_state);
9805  	wait_event_timeout(rdev->blocked_wait, !rdev_blocked(rdev),
9806  			   msecs_to_jiffies(5000));
9807  	rdev_dec_pending(rdev, mddev);
9808  }
9809  EXPORT_SYMBOL(md_wait_for_blocked_rdev);
9810  
9811  void md_finish_reshape(struct mddev *mddev)
9812  {
9813  	/* called by the personality module when reshape completes. */
9814  	struct md_rdev *rdev;
9815  
9816  	rdev_for_each(rdev, mddev) {
9817  		if (rdev->data_offset > rdev->new_data_offset)
9818  			rdev->sectors += rdev->data_offset - rdev->new_data_offset;
9819  		else
9820  			rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
9821  		rdev->data_offset = rdev->new_data_offset;
9822  	}
9823  }
9824  EXPORT_SYMBOL(md_finish_reshape);
9825  
9826  /* Bad block management */
9827  
9828  /* Returns true on success, false on failure */
9829  bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
9830  			int is_new)
9831  {
9832  	struct mddev *mddev = rdev->mddev;
9833  
9834  	/*
9835  	 * Recording new badblocks for a faulty rdev will force an unnecessary
9836  	 * superblock update. This is fragile for external management because
9837  	 * the userspace daemon may be trying to remove this device and a
9838  	 * deadlock may occur. This will probably be solved in mdadm, but it is
9839  	 * safer to avoid it here.
9840  	 */
9841  	if (test_bit(Faulty, &rdev->flags))
9842  		return true;
9843  
9844  	if (is_new)
9845  		s += rdev->new_data_offset;
9846  	else
9847  		s += rdev->data_offset;
9848  
9849  	if (!badblocks_set(&rdev->badblocks, s, sectors, 0))
9850  		return false;
9851  
9852  	/* Make sure they get written out promptly */
9853  	if (test_bit(ExternalBbl, &rdev->flags))
9854  		sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks);
9855  	sysfs_notify_dirent_safe(rdev->sysfs_state);
9856  	set_mask_bits(&mddev->sb_flags, 0,
9857  		      BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
9858  	md_wakeup_thread(rdev->mddev->thread);
9859  	return true;
9860  }
9861  EXPORT_SYMBOL_GPL(rdev_set_badblocks);
9862  
9863  void rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
9864  			  int is_new)
9865  {
9866  	if (is_new)
9867  		s += rdev->new_data_offset;
9868  	else
9869  		s += rdev->data_offset;
9870  
9871  	if (!badblocks_clear(&rdev->badblocks, s, sectors))
9872  		return;
9873  
9874  	if (test_bit(ExternalBbl, &rdev->flags))
9875  		sysfs_notify_dirent_safe(rdev->sysfs_badblocks);
9876  }
9877  EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
9878  
9879  static int md_notify_reboot(struct notifier_block *this,
9880  			    unsigned long code, void *x)
9881  {
9882  	struct mddev *mddev;
9883  	int need_delay = 0;
9884  
9885  	spin_lock(&all_mddevs_lock);
9886  	list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
9887  		if (!mddev_get(mddev))
9888  			continue;
9889  		spin_unlock(&all_mddevs_lock);
9890  		if (mddev_trylock(mddev)) {
9891  			if (mddev->pers)
9892  				__md_stop_writes(mddev);
9893  			if (mddev->persistent)
9894  				mddev->safemode = 2;
9895  			mddev_unlock(mddev);
9896  		}
9897  		need_delay = 1;
9898  		spin_lock(&all_mddevs_lock);
9899  		mddev_put_locked(mddev);
9900  	}
9901  	spin_unlock(&all_mddevs_lock);
9902  
9903  	/*
9904  	 * certain more exotic SCSI devices are known to be
9905  	 * volatile wrt too early system reboots. While the
9906  	 * right place to handle this issue is the given
9907  	 * driver, we do want to have a safe RAID driver ...
9908  	 */
9909  	if (need_delay)
9910  		msleep(1000);
9911  
9912  	return NOTIFY_DONE;
9913  }
9914  
9915  static struct notifier_block md_notifier = {
9916  	.notifier_call	= md_notify_reboot,
9917  	.next		= NULL,
9918  	.priority	= INT_MAX, /* before any real devices */
9919  };
9920  
9921  static void md_geninit(void)
9922  {
9923  	pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
9924  
9925  	proc_create("mdstat", S_IRUGO, NULL, &mdstat_proc_ops);
9926  }
9927  
9928  static int __init md_init(void)
9929  {
9930  	int ret = -ENOMEM;
9931  
9932  	md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
9933  	if (!md_wq)
9934  		goto err_wq;
9935  
9936  	md_misc_wq = alloc_workqueue("md_misc", 0, 0);
9937  	if (!md_misc_wq)
9938  		goto err_misc_wq;
9939  
9940  	md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND,
9941  				       0);
9942  	if (!md_bitmap_wq)
9943  		goto err_bitmap_wq;
9944  
9945  	ret = __register_blkdev(MD_MAJOR, "md", md_probe);
9946  	if (ret < 0)
9947  		goto err_md;
9948  
9949  	ret = __register_blkdev(0, "mdp", md_probe);
9950  	if (ret < 0)
9951  		goto err_mdp;
9952  	mdp_major = ret;
9953  
9954  	register_reboot_notifier(&md_notifier);
9955  	raid_table_header = register_sysctl("dev/raid", raid_table);
9956  
9957  	md_geninit();
9958  	return 0;
9959  
9960  err_mdp:
9961  	unregister_blkdev(MD_MAJOR, "md");
9962  err_md:
9963  	destroy_workqueue(md_bitmap_wq);
9964  err_bitmap_wq:
9965  	destroy_workqueue(md_misc_wq);
9966  err_misc_wq:
9967  	destroy_workqueue(md_wq);
9968  err_wq:
9969  	return ret;
9970  }
9971  
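/*
 * Clustered md: apply superblock changes made by another node to the local
 * mddev - resize, role changes (activating spares, marking devices faulty),
 * raid_disks updates and reshape progress.
 */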
9972  static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
9973  {
9974  	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
9975  	struct md_rdev *rdev2, *tmp;
9976  	int role, ret;
9977  
9978  	/*
9979  	 * If size is changed in another node then we need to
9980  	 * do resize as well.
9981  	 */
9982  	if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
9983  		ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
9984  		if (ret)
9985  			pr_info("md-cluster: resize failed\n");
9986  		else
9987  			mddev->bitmap_ops->update_sb(mddev->bitmap);
9988  	}
9989  
9990  	/* Check for change of roles in the active devices */
9991  	rdev_for_each_safe(rdev2, tmp, mddev) {
9992  		if (test_bit(Faulty, &rdev2->flags))
9993  			continue;
9994  
9995  		/* Check if the roles changed */
9996  		role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);
9997  
9998  		if (test_bit(Candidate, &rdev2->flags)) {
9999  			if (role == MD_DISK_ROLE_FAULTY) {
10000  				pr_info("md: Removing Candidate device %pg because add failed\n",
10001  					rdev2->bdev);
10002  				md_kick_rdev_from_array(rdev2);
10003  				continue;
10004  			}
10005  			else
10006  				clear_bit(Candidate, &rdev2->flags);
10007  		}
10008  
10009  		if (role != rdev2->raid_disk) {
10010  			/*
10011  			 * got activated except reshape is happening.
10012  			 * The device got activated, unless a reshape is happening.
10013  			if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE &&
10014  			    !(le32_to_cpu(sb->feature_map) &
10015  			      MD_FEATURE_RESHAPE_ACTIVE) &&
10016  			    !mddev->cluster_ops->resync_status_get(mddev)) {
10017  				/*
10018  				 * -1 to make raid1_add_disk() set conf->fullsync
10019  				 * to 1. This could avoid skipping sync when the
10020  				 * remote node is down during resyncing.
10021  				 */
10022  				if ((le32_to_cpu(sb->feature_map)
10023  				    & MD_FEATURE_RECOVERY_OFFSET))
10024  					rdev2->saved_raid_disk = -1;
10025  				else
10026  					rdev2->saved_raid_disk = role;
10027  				ret = remove_and_add_spares(mddev, rdev2);
10028  				pr_info("Activated spare: %pg\n",
10029  					rdev2->bdev);
10030  				/* wake up mddev->thread here, so the array can
10031  				 * perform resync with the newly activated disk */
10032  				set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
10033  				md_wakeup_thread(mddev->thread);
10034  			}
10035  			/* device faulty
10036  			 * We just want to do the minimum to mark the disk
10037  			 * as faulty. The recovery is performed by the
10038  			 * one who initiated the error.
10039  			 */
10040  			if (role == MD_DISK_ROLE_FAULTY ||
10041  			    role == MD_DISK_ROLE_JOURNAL) {
10042  				md_error(mddev, rdev2);
10043  				clear_bit(Blocked, &rdev2->flags);
10044  			}
10045  		}
10046  	}
10047  
10048  	if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) {
10049  		ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
10050  		if (ret)
10051  			pr_warn("md: updating array disks failed. %d\n", ret);
10052  	}
10053  
10054  	/*
10055  	 * Since mddev->delta_disks has already been updated in
10056  	 * update_raid_disks(), it is time to check reshape.
10057  	 */
10058  	if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
10059  	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
10060  		/*
10061  		 * reshape is happening in the remote node, we need to
10062  		 * update reshape_position and call start_reshape.
10063  		 */
10064  		mddev->reshape_position = le64_to_cpu(sb->reshape_position);
10065  		if (mddev->pers->update_reshape_pos)
10066  			mddev->pers->update_reshape_pos(mddev);
10067  		if (mddev->pers->start_reshape)
10068  			mddev->pers->start_reshape(mddev);
10069  	} else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
10070  		   mddev->reshape_position != MaxSector &&
10071  		   !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
10072  		/* reshape is just done in another node. */
10073  		mddev->reshape_position = MaxSector;
10074  		if (mddev->pers->update_reshape_pos)
10075  			mddev->pers->update_reshape_pos(mddev);
10076  	}
10077  
10078  	/* Finally set the event to be up to date */
10079  	mddev->events = le64_to_cpu(sb->events);
10080  }
10081  
10082  static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
10083  {
10084  	int err;
10085  	struct page *swapout = rdev->sb_page;
10086  	struct mdp_superblock_1 *sb;
10087  
10088  	/* Store the sb page of the rdev in the swapout temporary
10089  	 * variable in case we err in the future
10090  	 */
10091  	rdev->sb_page = NULL;
10092  	err = alloc_disk_sb(rdev);
10093  	if (err == 0) {
10094  		ClearPageUptodate(rdev->sb_page);
10095  		rdev->sb_loaded = 0;
10096  		err = super_types[mddev->major_version].
10097  			load_super(rdev, NULL, mddev->minor_version);
10098  	}
10099  	if (err < 0) {
10100  		pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
10101  				__func__, __LINE__, rdev->desc_nr, err);
10102  		if (rdev->sb_page)
10103  			put_page(rdev->sb_page);
10104  		rdev->sb_page = swapout;
10105  		rdev->sb_loaded = 1;
10106  		return err;
10107  	}
10108  
10109  	sb = page_address(rdev->sb_page);
10110  	/* Only pick up recovery_offset when MD_FEATURE_RECOVERY_OFFSET is set
10111  	 * in the reloaded superblock; otherwise keep the current value
10112  	 */
10113  
10114  	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
10115  		rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
10116  
10117  	/* The other node finished recovery, call spare_active to mark the
10118  	 * device In_sync and update mddev->degraded
10119  	 */
10120  	if (rdev->recovery_offset == MaxSector &&
10121  	    !test_bit(In_sync, &rdev->flags) &&
10122  	    mddev->pers->spare_active(mddev))
10123  		sysfs_notify_dirent_safe(mddev->sysfs_degraded);
10124  
10125  	put_page(swapout);
10126  	return 0;
10127  }
10128  
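/**
 * md_reload_sb() - reload metadata updated by another node in the cluster.
 * @mddev: array whose metadata has changed
 * @nr: desc_nr of the rdev carrying the updated superblock
 *
 * Re-read the superblock from the indicated rdev, apply role and geometry
 * changes via check_sb_changes(), and then refresh recovery_offset on all
 * non-faulty members.
 */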
10129  void md_reload_sb(struct mddev *mddev, int nr)
10130  {
10131  	struct md_rdev *rdev = NULL, *iter;
10132  	int err;
10133  
10134  	/* Find the rdev */
10135  	rdev_for_each_rcu(iter, mddev) {
10136  		if (iter->desc_nr == nr) {
10137  			rdev = iter;
10138  			break;
10139  		}
10140  	}
10141  
10142  	if (!rdev) {
10143  		pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
10144  		return;
10145  	}
10146  
10147  	err = read_rdev(mddev, rdev);
10148  	if (err < 0)
10149  		return;
10150  
10151  	check_sb_changes(mddev, rdev);
10152  
10153  	/* Read all rdev's to update recovery_offset */
10154  	rdev_for_each_rcu(rdev, mddev) {
10155  		if (!test_bit(Faulty, &rdev->flags))
10156  			read_rdev(mddev, rdev);
10157  	}
10158  }
10159  EXPORT_SYMBOL(md_reload_sb);
10160  
10161  #ifndef MODULE
10162  
10163  /*
10164   * Searches all registered partitions for autorun RAID arrays
10165   * at boot time.
10166   */
10167  
10168  static DEFINE_MUTEX(detected_devices_mutex);
10169  static LIST_HEAD(all_detected_devices);
10170  struct detected_devices_node {
10171  	struct list_head list;
10172  	dev_t dev;
10173  };
10174  
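/*
 * Record a device noticed during partition scanning so that
 * md_autostart_arrays() can try to assemble it later.  If the allocation
 * fails, the device is simply skipped.
 */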
10175  void md_autodetect_dev(dev_t dev)
10176  {
10177  	struct detected_devices_node *node_detected_dev;
10178  
10179  	node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
10180  	if (node_detected_dev) {
10181  		node_detected_dev->dev = dev;
10182  		mutex_lock(&detected_devices_mutex);
10183  		list_add_tail(&node_detected_dev->list, &all_detected_devices);
10184  		mutex_unlock(&detected_devices_mutex);
10185  	}
10186  }
10187  
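/*
 * Walk the list built by md_autodetect_dev(), import each device and add
 * it to pending_raid_disks, then let autorun_devices() assemble and start
 * whatever arrays it can.  The mutex is dropped around md_import_device(),
 * which opens the device and reads its superblock.
 */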
10188  void md_autostart_arrays(int part)
10189  {
10190  	struct md_rdev *rdev;
10191  	struct detected_devices_node *node_detected_dev;
10192  	dev_t dev;
10193  	int i_scanned, i_passed;
10194  
10195  	i_scanned = 0;
10196  	i_passed = 0;
10197  
10198  	pr_info("md: Autodetecting RAID arrays.\n");
10199  
10200  	mutex_lock(&detected_devices_mutex);
10201  	while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
10202  		i_scanned++;
10203  		node_detected_dev = list_entry(all_detected_devices.next,
10204  					struct detected_devices_node, list);
10205  		list_del(&node_detected_dev->list);
10206  		dev = node_detected_dev->dev;
10207  		kfree(node_detected_dev);
10208  		mutex_unlock(&detected_devices_mutex);
10209  		rdev = md_import_device(dev, 0, 90);
10210  		mutex_lock(&detected_devices_mutex);
10211  		if (IS_ERR(rdev))
10212  			continue;
10213  
10214  		if (test_bit(Faulty, &rdev->flags))
10215  			continue;
10216  
10217  		set_bit(AutoDetected, &rdev->flags);
10218  		list_add(&rdev->same_set, &pending_raid_disks);
10219  		i_passed++;
10220  	}
10221  	mutex_unlock(&detected_devices_mutex);
10222  
10223  	pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed);
10224  
10225  	autorun_devices(part);
10226  }
10227  
10228  #endif /* !MODULE */
10229  
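/*
 * Module unload: unregister the md/mdp block majors, the reboot notifier
 * and the sysctl table, wake up anyone sleeping in poll() on /proc/mdstat,
 * tear down every remaining array, and finally destroy the workqueues once
 * the deferred mddev destruction has completed.
 */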
10230  static __exit void md_exit(void)
10231  {
10232  	struct mddev *mddev;
10233  	int delay = 1;
10234  
10235  	unregister_blkdev(MD_MAJOR, "md");
10236  	unregister_blkdev(mdp_major, "mdp");
10237  	unregister_reboot_notifier(&md_notifier);
10238  	unregister_sysctl_table(raid_table_header);
10239  
10240  	/* We cannot unload the modules while some process is
10241  	 * waiting for us in select() or poll() - wake them up
10242  	 */
10243  	md_unloading = 1;
10244  	while (waitqueue_active(&md_event_waiters)) {
10245  		/* not safe to leave yet */
10246  		wake_up(&md_event_waiters);
10247  		msleep(delay);
10248  		delay += delay;
10249  	}
10250  	remove_proc_entry("mdstat", NULL);
10251  
10252  	spin_lock(&all_mddevs_lock);
10253  	list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
10254  		if (!mddev_get(mddev))
10255  			continue;
10256  		spin_unlock(&all_mddevs_lock);
10257  		export_array(mddev);
10258  		mddev->ctime = 0;
10259  		mddev->hold_active = 0;
10260  		/*
10261  		 * As the mddev is now fully clear, mddev_put will schedule
10262  		 * the mddev for destruction by a workqueue, and the
10263  		 * destroy_workqueue() below will wait for that to complete.
10264  		 */
10265  		spin_lock(&all_mddevs_lock);
10266  		mddev_put_locked(mddev);
10267  	}
10268  	spin_unlock(&all_mddevs_lock);
10269  
10270  	destroy_workqueue(md_misc_wq);
10271  	destroy_workqueue(md_bitmap_wq);
10272  	destroy_workqueue(md_wq);
10273  }
10274  
10275  subsys_initcall(md_init);
10276  module_exit(md_exit)
10277  
10278  static int get_ro(char *buffer, const struct kernel_param *kp)
10279  {
10280  	return sprintf(buffer, "%d\n", start_readonly);
10281  }
10282  static int set_ro(const char *val, const struct kernel_param *kp)
10283  {
10284  	return kstrtouint(val, 10, (unsigned int *)&start_readonly);
10285  }
10286  
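/*
 * Module parameters: "start_ro" makes newly started arrays auto-read-only
 * until the first write, "start_dirty_degraded" permits starting arrays
 * that are both dirty and degraded, "new_array" creates a named array,
 * and "create_on_open" controls whether opening an md device node creates
 * the corresponding mddev.
 */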
10287  module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
10288  module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
10289  module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
10290  module_param(create_on_open, bool, S_IRUSR|S_IWUSR);
10291  
10292  MODULE_LICENSE("GPL");
10293  MODULE_DESCRIPTION("MD RAID framework");
10294  MODULE_ALIAS("md");
10295  MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
10296