xref: /linux/drivers/md/md.c (revision c39b9fd728d8173ecda993524089fbc38211a17f)
1 /*
2    md.c : Multiple Devices driver for Linux
3 	  Copyright (C) 1998, 1999, 2000 Ingo Molnar
4 
5      completely rewritten, based on the MD driver code from Marc Zyngier
6 
7    Changes:
8 
9    - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
10    - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
11    - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
12    - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
13    - kmod support by: Cyrus Durgin
14    - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
15    - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
16 
17    - lots of fixes and improvements to the RAID1/RAID5 and generic
18      RAID code (such as request based resynchronization):
19 
20      Neil Brown <neilb@cse.unsw.edu.au>.
21 
22    - persistent bitmap code
23      Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
24 
25    This program is free software; you can redistribute it and/or modify
26    it under the terms of the GNU General Public License as published by
27    the Free Software Foundation; either version 2, or (at your option)
28    any later version.
29 
30    You should have received a copy of the GNU General Public License
31    (for example /usr/src/linux/COPYING); if not, write to the Free
32    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
33 */
34 
35 #include <linux/kthread.h>
36 #include <linux/blkdev.h>
37 #include <linux/sysctl.h>
38 #include <linux/seq_file.h>
39 #include <linux/fs.h>
40 #include <linux/poll.h>
41 #include <linux/ctype.h>
42 #include <linux/string.h>
43 #include <linux/hdreg.h>
44 #include <linux/proc_fs.h>
45 #include <linux/random.h>
46 #include <linux/module.h>
47 #include <linux/reboot.h>
48 #include <linux/file.h>
49 #include <linux/compat.h>
50 #include <linux/delay.h>
51 #include <linux/raid/md_p.h>
52 #include <linux/raid/md_u.h>
53 #include <linux/slab.h>
54 #include "md.h"
55 #include "bitmap.h"
56 
57 #ifndef MODULE
58 static void autostart_arrays(int part);
59 #endif
60 
61 /* pers_list is a list of registered personalities protected
62  * by pers_lock.
63  * pers_lock does extra service to protect accesses to
64  * mddev->thread when the mutex cannot be held.
65  */
66 static LIST_HEAD(pers_list);
67 static DEFINE_SPINLOCK(pers_lock);
68 
69 static void md_print_devices(void);
70 
71 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
72 static struct workqueue_struct *md_wq;
73 static struct workqueue_struct *md_misc_wq;
74 
75 static int remove_and_add_spares(struct mddev *mddev,
76 				 struct md_rdev *this);
77 
/*
 * MD_BUG - report an internal inconsistency.
 *
 * Logs the file and line of the call site via printk() and then calls
 * md_print_devices().  Any arguments are accepted and ignored.
 *
 * The body is wrapped in do { } while (0) so that "MD_BUG();" expands to
 * exactly one statement and can be used in an unbraced if/else.
 */
#define MD_BUG(x...)							\
	do {								\
		printk("md: bug in file %s, line %d\n",			\
		       __FILE__, __LINE__);				\
		md_print_devices();					\
	} while (0)
79 
80 /*
81  * Default number of read corrections we'll attempt on an rdev
82  * before ejecting it from the array. We divide the read error
83  * count by 2 for every hour elapsed between read errors.
84  */
85 #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
86 /*
87  * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
88  * is 1000 KB/sec, so the extra system load does not show up that much.
89  * Increase it if you want to have more _guaranteed_ speed. Note that
90  * the RAID driver will use the maximum available bandwidth if the IO
91  * subsystem is idle. There is also an 'absolute maximum' reconstruction
92  * speed limit - in case reconstruction slows down your system despite
93  * idle IO detection.
94  *
95  * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
96  * or /sys/block/mdX/md/sync_speed_{min,max}
97  */
98 
99 static int sysctl_speed_limit_min = 1000;
100 static int sysctl_speed_limit_max = 200000;
101 static inline int speed_min(struct mddev *mddev)
102 {
103 	return mddev->sync_speed_min ?
104 		mddev->sync_speed_min : sysctl_speed_limit_min;
105 }
106 
107 static inline int speed_max(struct mddev *mddev)
108 {
109 	return mddev->sync_speed_max ?
110 		mddev->sync_speed_max : sysctl_speed_limit_max;
111 }
112 
113 static struct ctl_table_header *raid_table_header;
114 
115 static ctl_table raid_table[] = {
116 	{
117 		.procname	= "speed_limit_min",
118 		.data		= &sysctl_speed_limit_min,
119 		.maxlen		= sizeof(int),
120 		.mode		= S_IRUGO|S_IWUSR,
121 		.proc_handler	= proc_dointvec,
122 	},
123 	{
124 		.procname	= "speed_limit_max",
125 		.data		= &sysctl_speed_limit_max,
126 		.maxlen		= sizeof(int),
127 		.mode		= S_IRUGO|S_IWUSR,
128 		.proc_handler	= proc_dointvec,
129 	},
130 	{ }
131 };
132 
133 static ctl_table raid_dir_table[] = {
134 	{
135 		.procname	= "raid",
136 		.maxlen		= 0,
137 		.mode		= S_IRUGO|S_IXUGO,
138 		.child		= raid_table,
139 	},
140 	{ }
141 };
142 
143 static ctl_table raid_root_table[] = {
144 	{
145 		.procname	= "dev",
146 		.maxlen		= 0,
147 		.mode		= 0555,
148 		.child		= raid_dir_table,
149 	},
150 	{  }
151 };
152 
153 static const struct block_device_operations md_fops;
154 
155 static int start_readonly;
156 
/* bio_alloc_mddev, bio_clone_mddev
 * like bio_alloc and bio_clone, but using the mddev's local bio set
 * when it has one
 */
160 
161 struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
162 			    struct mddev *mddev)
163 {
164 	struct bio *b;
165 
166 	if (!mddev || !mddev->bio_set)
167 		return bio_alloc(gfp_mask, nr_iovecs);
168 
169 	b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set);
170 	if (!b)
171 		return NULL;
172 	return b;
173 }
174 EXPORT_SYMBOL_GPL(bio_alloc_mddev);
175 
176 struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
177 			    struct mddev *mddev)
178 {
179 	if (!mddev || !mddev->bio_set)
180 		return bio_clone(bio, gfp_mask);
181 
182 	return bio_clone_bioset(bio, gfp_mask, mddev->bio_set);
183 }
184 EXPORT_SYMBOL_GPL(bio_clone_mddev);
185 
186 void md_trim_bio(struct bio *bio, int offset, int size)
187 {
188 	/* 'bio' is a cloned bio which we need to trim to match
189 	 * the given offset and size.
190 	 * This requires adjusting bi_sector, bi_size, and bi_io_vec
191 	 */
192 	int i;
193 	struct bio_vec *bvec;
194 	int sofar = 0;
195 
196 	size <<= 9;
197 	if (offset == 0 && size == bio->bi_size)
198 		return;
199 
200 	bio->bi_sector += offset;
201 	bio->bi_size = size;
202 	offset <<= 9;
203 	clear_bit(BIO_SEG_VALID, &bio->bi_flags);
204 
205 	while (bio->bi_idx < bio->bi_vcnt &&
206 	       bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
207 		/* remove this whole bio_vec */
208 		offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
209 		bio->bi_idx++;
210 	}
211 	if (bio->bi_idx < bio->bi_vcnt) {
212 		bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
213 		bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
214 	}
215 	/* avoid any complications with bi_idx being non-zero*/
216 	if (bio->bi_idx) {
217 		memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
218 			(bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
219 		bio->bi_vcnt -= bio->bi_idx;
220 		bio->bi_idx = 0;
221 	}
222 	/* Make sure vcnt and last bv are not too big */
223 	bio_for_each_segment(bvec, bio, i) {
224 		if (sofar + bvec->bv_len > size)
225 			bvec->bv_len = size - sofar;
226 		if (bvec->bv_len == 0) {
227 			bio->bi_vcnt = i;
228 			break;
229 		}
230 		sofar += bvec->bv_len;
231 	}
232 }
233 EXPORT_SYMBOL_GPL(md_trim_bio);
234 
235 /*
236  * We have a system wide 'event count' that is incremented
237  * on any 'interesting' event, and readers of /proc/mdstat
238  * can use 'poll' or 'select' to find out when the event
239  * count increases.
240  *
241  * Events are:
242  *  start array, stop array, error, add device, remove device,
243  *  start build, activate spare
244  */
245 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
246 static atomic_t md_event_count;
247 void md_new_event(struct mddev *mddev)
248 {
249 	atomic_inc(&md_event_count);
250 	wake_up(&md_event_waiters);
251 }
252 EXPORT_SYMBOL_GPL(md_new_event);
253 
254 /* Alternate version that can be called from interrupts
255  * when calling sysfs_notify isn't needed.
256  */
257 static void md_new_event_inintr(struct mddev *mddev)
258 {
259 	atomic_inc(&md_event_count);
260 	wake_up(&md_event_waiters);
261 }
262 
263 /*
264  * Enables to iterate over all existing md arrays
265  * all_mddevs_lock protects this list.
266  */
267 static LIST_HEAD(all_mddevs);
268 static DEFINE_SPINLOCK(all_mddevs_lock);
269 
270 
/*
 * for_each_mddev - iterate over every mddev on the all_mddevs list.
 * @_mddev: struct mddev * cursor, valid inside the loop body
 * @_tmp:   struct list_head * scratch cursor
 *
 * all_mddevs_lock is held whenever the list is being walked and is
 * dropped while the loop body runs; a reference (mddev_get) is held on
 * the current mddev for the whole time the lock is not held.  The
 * reference on the previous mddev is dropped with mddev_put() after the
 * lock has been released.
 *
 * Any code that breaks out of the loop still owns a reference to the
 * current mddev and must mddev_put() it.
 */
#define for_each_mddev(_mddev,_tmp)					\
	for (({ spin_lock(&all_mddevs_lock);				\
		_tmp = all_mddevs.next;					\
		_mddev = NULL; });					\
	     ({ if (_tmp != &all_mddevs)				\
			mddev_get(list_entry(_tmp, struct mddev,	\
					     all_mddevs));		\
		spin_unlock(&all_mddevs_lock);				\
		if (_mddev)						\
			mddev_put(_mddev);				\
		_mddev = list_entry(_tmp, struct mddev, all_mddevs);	\
		_tmp != &all_mddevs; });				\
	     ({ spin_lock(&all_mddevs_lock);				\
		_tmp = _tmp->next; })					\
		)
292 
293 
294 /* Rather than calling directly into the personality make_request function,
295  * IO requests come here first so that we can check if the device is
296  * being suspended pending a reconfiguration.
297  * We hold a refcount over the call to ->make_request.  By the time that
298  * call has finished, the bio has been linked into some internal structure
299  * and so is visible to ->quiesce(), so we don't need the refcount any more.
300  */
301 static void md_make_request(struct request_queue *q, struct bio *bio)
302 {
303 	const int rw = bio_data_dir(bio);
304 	struct mddev *mddev = q->queuedata;
305 	int cpu;
306 	unsigned int sectors;
307 
308 	if (mddev == NULL || mddev->pers == NULL
309 	    || !mddev->ready) {
310 		bio_io_error(bio);
311 		return;
312 	}
313 	if (mddev->ro == 1 && unlikely(rw == WRITE)) {
314 		bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS);
315 		return;
316 	}
317 	smp_rmb(); /* Ensure implications of  'active' are visible */
318 	rcu_read_lock();
319 	if (mddev->suspended) {
320 		DEFINE_WAIT(__wait);
321 		for (;;) {
322 			prepare_to_wait(&mddev->sb_wait, &__wait,
323 					TASK_UNINTERRUPTIBLE);
324 			if (!mddev->suspended)
325 				break;
326 			rcu_read_unlock();
327 			schedule();
328 			rcu_read_lock();
329 		}
330 		finish_wait(&mddev->sb_wait, &__wait);
331 	}
332 	atomic_inc(&mddev->active_io);
333 	rcu_read_unlock();
334 
335 	/*
336 	 * save the sectors now since our bio can
337 	 * go away inside make_request
338 	 */
339 	sectors = bio_sectors(bio);
340 	mddev->pers->make_request(mddev, bio);
341 
342 	cpu = part_stat_lock();
343 	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
344 	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
345 	part_stat_unlock();
346 
347 	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
348 		wake_up(&mddev->sb_wait);
349 }
350 
351 /* mddev_suspend makes sure no new requests are submitted
352  * to the device, and that any requests that have been submitted
353  * are completely handled.
354  * Once ->stop is called and completes, the module will be completely
355  * unused.
356  */
357 void mddev_suspend(struct mddev *mddev)
358 {
359 	BUG_ON(mddev->suspended);
360 	mddev->suspended = 1;
361 	synchronize_rcu();
362 	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
363 	mddev->pers->quiesce(mddev, 1);
364 
365 	del_timer_sync(&mddev->safemode_timer);
366 }
367 EXPORT_SYMBOL_GPL(mddev_suspend);
368 
369 void mddev_resume(struct mddev *mddev)
370 {
371 	mddev->suspended = 0;
372 	wake_up(&mddev->sb_wait);
373 	mddev->pers->quiesce(mddev, 0);
374 
375 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
376 	md_wakeup_thread(mddev->thread);
377 	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
378 }
379 EXPORT_SYMBOL_GPL(mddev_resume);
380 
381 int mddev_congested(struct mddev *mddev, int bits)
382 {
383 	return mddev->suspended;
384 }
385 EXPORT_SYMBOL(mddev_congested);
386 
387 /*
388  * Generic flush handling for md
389  */
390 
391 static void md_end_flush(struct bio *bio, int err)
392 {
393 	struct md_rdev *rdev = bio->bi_private;
394 	struct mddev *mddev = rdev->mddev;
395 
396 	rdev_dec_pending(rdev, mddev);
397 
398 	if (atomic_dec_and_test(&mddev->flush_pending)) {
399 		/* The pre-request flush has finished */
400 		queue_work(md_wq, &mddev->flush_work);
401 	}
402 	bio_put(bio);
403 }
404 
405 static void md_submit_flush_data(struct work_struct *ws);
406 
407 static void submit_flushes(struct work_struct *ws)
408 {
409 	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
410 	struct md_rdev *rdev;
411 
412 	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
413 	atomic_set(&mddev->flush_pending, 1);
414 	rcu_read_lock();
415 	rdev_for_each_rcu(rdev, mddev)
416 		if (rdev->raid_disk >= 0 &&
417 		    !test_bit(Faulty, &rdev->flags)) {
418 			/* Take two references, one is dropped
419 			 * when request finishes, one after
420 			 * we reclaim rcu_read_lock
421 			 */
422 			struct bio *bi;
423 			atomic_inc(&rdev->nr_pending);
424 			atomic_inc(&rdev->nr_pending);
425 			rcu_read_unlock();
426 			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
427 			bi->bi_end_io = md_end_flush;
428 			bi->bi_private = rdev;
429 			bi->bi_bdev = rdev->bdev;
430 			atomic_inc(&mddev->flush_pending);
431 			submit_bio(WRITE_FLUSH, bi);
432 			rcu_read_lock();
433 			rdev_dec_pending(rdev, mddev);
434 		}
435 	rcu_read_unlock();
436 	if (atomic_dec_and_test(&mddev->flush_pending))
437 		queue_work(md_wq, &mddev->flush_work);
438 }
439 
440 static void md_submit_flush_data(struct work_struct *ws)
441 {
442 	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
443 	struct bio *bio = mddev->flush_bio;
444 
445 	if (bio->bi_size == 0)
446 		/* an empty barrier - all done */
447 		bio_endio(bio, 0);
448 	else {
449 		bio->bi_rw &= ~REQ_FLUSH;
450 		mddev->pers->make_request(mddev, bio);
451 	}
452 
453 	mddev->flush_bio = NULL;
454 	wake_up(&mddev->sb_wait);
455 }
456 
457 void md_flush_request(struct mddev *mddev, struct bio *bio)
458 {
459 	spin_lock_irq(&mddev->write_lock);
460 	wait_event_lock_irq(mddev->sb_wait,
461 			    !mddev->flush_bio,
462 			    mddev->write_lock);
463 	mddev->flush_bio = bio;
464 	spin_unlock_irq(&mddev->write_lock);
465 
466 	INIT_WORK(&mddev->flush_work, submit_flushes);
467 	queue_work(md_wq, &mddev->flush_work);
468 }
469 EXPORT_SYMBOL(md_flush_request);
470 
471 void md_unplug(struct blk_plug_cb *cb, bool from_schedule)
472 {
473 	struct mddev *mddev = cb->data;
474 	md_wakeup_thread(mddev->thread);
475 	kfree(cb);
476 }
477 EXPORT_SYMBOL(md_unplug);
478 
479 static inline struct mddev *mddev_get(struct mddev *mddev)
480 {
481 	atomic_inc(&mddev->active);
482 	return mddev;
483 }
484 
485 static void mddev_delayed_delete(struct work_struct *ws);
486 
487 static void mddev_put(struct mddev *mddev)
488 {
489 	struct bio_set *bs = NULL;
490 
491 	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
492 		return;
493 	if (!mddev->raid_disks && list_empty(&mddev->disks) &&
494 	    mddev->ctime == 0 && !mddev->hold_active) {
495 		/* Array is not configured at all, and not held active,
496 		 * so destroy it */
497 		list_del_init(&mddev->all_mddevs);
498 		bs = mddev->bio_set;
499 		mddev->bio_set = NULL;
500 		if (mddev->gendisk) {
501 			/* We did a probe so need to clean up.  Call
502 			 * queue_work inside the spinlock so that
503 			 * flush_workqueue() after mddev_find will
504 			 * succeed in waiting for the work to be done.
505 			 */
506 			INIT_WORK(&mddev->del_work, mddev_delayed_delete);
507 			queue_work(md_misc_wq, &mddev->del_work);
508 		} else
509 			kfree(mddev);
510 	}
511 	spin_unlock(&all_mddevs_lock);
512 	if (bs)
513 		bioset_free(bs);
514 }
515 
516 void mddev_init(struct mddev *mddev)
517 {
518 	mutex_init(&mddev->open_mutex);
519 	mutex_init(&mddev->reconfig_mutex);
520 	mutex_init(&mddev->bitmap_info.mutex);
521 	INIT_LIST_HEAD(&mddev->disks);
522 	INIT_LIST_HEAD(&mddev->all_mddevs);
523 	init_timer(&mddev->safemode_timer);
524 	atomic_set(&mddev->active, 1);
525 	atomic_set(&mddev->openers, 0);
526 	atomic_set(&mddev->active_io, 0);
527 	spin_lock_init(&mddev->write_lock);
528 	atomic_set(&mddev->flush_pending, 0);
529 	init_waitqueue_head(&mddev->sb_wait);
530 	init_waitqueue_head(&mddev->recovery_wait);
531 	mddev->reshape_position = MaxSector;
532 	mddev->reshape_backwards = 0;
533 	mddev->resync_min = 0;
534 	mddev->resync_max = MaxSector;
535 	mddev->level = LEVEL_NONE;
536 }
537 EXPORT_SYMBOL_GPL(mddev_init);
538 
539 static struct mddev * mddev_find(dev_t unit)
540 {
541 	struct mddev *mddev, *new = NULL;
542 
543 	if (unit && MAJOR(unit) != MD_MAJOR)
544 		unit &= ~((1<<MdpMinorShift)-1);
545 
546  retry:
547 	spin_lock(&all_mddevs_lock);
548 
549 	if (unit) {
550 		list_for_each_entry(mddev, &all_mddevs, all_mddevs)
551 			if (mddev->unit == unit) {
552 				mddev_get(mddev);
553 				spin_unlock(&all_mddevs_lock);
554 				kfree(new);
555 				return mddev;
556 			}
557 
558 		if (new) {
559 			list_add(&new->all_mddevs, &all_mddevs);
560 			spin_unlock(&all_mddevs_lock);
561 			new->hold_active = UNTIL_IOCTL;
562 			return new;
563 		}
564 	} else if (new) {
565 		/* find an unused unit number */
566 		static int next_minor = 512;
567 		int start = next_minor;
568 		int is_free = 0;
569 		int dev = 0;
570 		while (!is_free) {
571 			dev = MKDEV(MD_MAJOR, next_minor);
572 			next_minor++;
573 			if (next_minor > MINORMASK)
574 				next_minor = 0;
575 			if (next_minor == start) {
576 				/* Oh dear, all in use. */
577 				spin_unlock(&all_mddevs_lock);
578 				kfree(new);
579 				return NULL;
580 			}
581 
582 			is_free = 1;
583 			list_for_each_entry(mddev, &all_mddevs, all_mddevs)
584 				if (mddev->unit == dev) {
585 					is_free = 0;
586 					break;
587 				}
588 		}
589 		new->unit = dev;
590 		new->md_minor = MINOR(dev);
591 		new->hold_active = UNTIL_STOP;
592 		list_add(&new->all_mddevs, &all_mddevs);
593 		spin_unlock(&all_mddevs_lock);
594 		return new;
595 	}
596 	spin_unlock(&all_mddevs_lock);
597 
598 	new = kzalloc(sizeof(*new), GFP_KERNEL);
599 	if (!new)
600 		return NULL;
601 
602 	new->unit = unit;
603 	if (MAJOR(unit) == MD_MAJOR)
604 		new->md_minor = MINOR(unit);
605 	else
606 		new->md_minor = MINOR(unit) >> MdpMinorShift;
607 
608 	mddev_init(new);
609 
610 	goto retry;
611 }
612 
613 static inline int mddev_lock(struct mddev * mddev)
614 {
615 	return mutex_lock_interruptible(&mddev->reconfig_mutex);
616 }
617 
618 static inline int mddev_is_locked(struct mddev *mddev)
619 {
620 	return mutex_is_locked(&mddev->reconfig_mutex);
621 }
622 
623 static inline int mddev_trylock(struct mddev * mddev)
624 {
625 	return mutex_trylock(&mddev->reconfig_mutex);
626 }
627 
628 static struct attribute_group md_redundancy_group;
629 
630 static void mddev_unlock(struct mddev * mddev)
631 {
632 	if (mddev->to_remove) {
633 		/* These cannot be removed under reconfig_mutex as
634 		 * an access to the files will try to take reconfig_mutex
635 		 * while holding the file unremovable, which leads to
636 		 * a deadlock.
637 		 * So hold set sysfs_active while the remove in happeing,
638 		 * and anything else which might set ->to_remove or my
639 		 * otherwise change the sysfs namespace will fail with
640 		 * -EBUSY if sysfs_active is still set.
641 		 * We set sysfs_active under reconfig_mutex and elsewhere
642 		 * test it under the same mutex to ensure its correct value
643 		 * is seen.
644 		 */
645 		struct attribute_group *to_remove = mddev->to_remove;
646 		mddev->to_remove = NULL;
647 		mddev->sysfs_active = 1;
648 		mutex_unlock(&mddev->reconfig_mutex);
649 
650 		if (mddev->kobj.sd) {
651 			if (to_remove != &md_redundancy_group)
652 				sysfs_remove_group(&mddev->kobj, to_remove);
653 			if (mddev->pers == NULL ||
654 			    mddev->pers->sync_request == NULL) {
655 				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
656 				if (mddev->sysfs_action)
657 					sysfs_put(mddev->sysfs_action);
658 				mddev->sysfs_action = NULL;
659 			}
660 		}
661 		mddev->sysfs_active = 0;
662 	} else
663 		mutex_unlock(&mddev->reconfig_mutex);
664 
665 	/* As we've dropped the mutex we need a spinlock to
666 	 * make sure the thread doesn't disappear
667 	 */
668 	spin_lock(&pers_lock);
669 	md_wakeup_thread(mddev->thread);
670 	spin_unlock(&pers_lock);
671 }
672 
673 static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr)
674 {
675 	struct md_rdev *rdev;
676 
677 	rdev_for_each(rdev, mddev)
678 		if (rdev->desc_nr == nr)
679 			return rdev;
680 
681 	return NULL;
682 }
683 
684 static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr)
685 {
686 	struct md_rdev *rdev;
687 
688 	rdev_for_each_rcu(rdev, mddev)
689 		if (rdev->desc_nr == nr)
690 			return rdev;
691 
692 	return NULL;
693 }
694 
695 static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
696 {
697 	struct md_rdev *rdev;
698 
699 	rdev_for_each(rdev, mddev)
700 		if (rdev->bdev->bd_dev == dev)
701 			return rdev;
702 
703 	return NULL;
704 }
705 
706 static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev)
707 {
708 	struct md_rdev *rdev;
709 
710 	rdev_for_each_rcu(rdev, mddev)
711 		if (rdev->bdev->bd_dev == dev)
712 			return rdev;
713 
714 	return NULL;
715 }
716 
717 static struct md_personality *find_pers(int level, char *clevel)
718 {
719 	struct md_personality *pers;
720 	list_for_each_entry(pers, &pers_list, list) {
721 		if (level != LEVEL_NONE && pers->level == level)
722 			return pers;
723 		if (strcmp(pers->name, clevel)==0)
724 			return pers;
725 	}
726 	return NULL;
727 }
728 
729 /* return the offset of the super block in 512byte sectors */
730 static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
731 {
732 	sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
733 	return MD_NEW_SIZE_SECTORS(num_sectors);
734 }
735 
736 static int alloc_disk_sb(struct md_rdev * rdev)
737 {
738 	if (rdev->sb_page)
739 		MD_BUG();
740 
741 	rdev->sb_page = alloc_page(GFP_KERNEL);
742 	if (!rdev->sb_page) {
743 		printk(KERN_ALERT "md: out of memory.\n");
744 		return -ENOMEM;
745 	}
746 
747 	return 0;
748 }
749 
750 void md_rdev_clear(struct md_rdev *rdev)
751 {
752 	if (rdev->sb_page) {
753 		put_page(rdev->sb_page);
754 		rdev->sb_loaded = 0;
755 		rdev->sb_page = NULL;
756 		rdev->sb_start = 0;
757 		rdev->sectors = 0;
758 	}
759 	if (rdev->bb_page) {
760 		put_page(rdev->bb_page);
761 		rdev->bb_page = NULL;
762 	}
763 	kfree(rdev->badblocks.page);
764 	rdev->badblocks.page = NULL;
765 }
766 EXPORT_SYMBOL_GPL(md_rdev_clear);
767 
768 static void super_written(struct bio *bio, int error)
769 {
770 	struct md_rdev *rdev = bio->bi_private;
771 	struct mddev *mddev = rdev->mddev;
772 
773 	if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
774 		printk("md: super_written gets error=%d, uptodate=%d\n",
775 		       error, test_bit(BIO_UPTODATE, &bio->bi_flags));
776 		WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
777 		md_error(mddev, rdev);
778 	}
779 
780 	if (atomic_dec_and_test(&mddev->pending_writes))
781 		wake_up(&mddev->sb_wait);
782 	bio_put(bio);
783 }
784 
785 void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
786 		   sector_t sector, int size, struct page *page)
787 {
788 	/* write first size bytes of page to sector of rdev
789 	 * Increment mddev->pending_writes before returning
790 	 * and decrement it on completion, waking up sb_wait
791 	 * if zero is reached.
792 	 * If an error occurred, call md_error
793 	 */
794 	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
795 
796 	bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
797 	bio->bi_sector = sector;
798 	bio_add_page(bio, page, size, 0);
799 	bio->bi_private = rdev;
800 	bio->bi_end_io = super_written;
801 
802 	atomic_inc(&mddev->pending_writes);
803 	submit_bio(WRITE_FLUSH_FUA, bio);
804 }
805 
806 void md_super_wait(struct mddev *mddev)
807 {
808 	/* wait for all superblock writes that were scheduled to complete */
809 	DEFINE_WAIT(wq);
810 	for(;;) {
811 		prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
812 		if (atomic_read(&mddev->pending_writes)==0)
813 			break;
814 		schedule();
815 	}
816 	finish_wait(&mddev->sb_wait, &wq);
817 }
818 
819 static void bi_complete(struct bio *bio, int error)
820 {
821 	complete((struct completion*)bio->bi_private);
822 }
823 
824 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
825 		 struct page *page, int rw, bool metadata_op)
826 {
827 	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
828 	struct completion event;
829 	int ret;
830 
831 	rw |= REQ_SYNC;
832 
833 	bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
834 		rdev->meta_bdev : rdev->bdev;
835 	if (metadata_op)
836 		bio->bi_sector = sector + rdev->sb_start;
837 	else if (rdev->mddev->reshape_position != MaxSector &&
838 		 (rdev->mddev->reshape_backwards ==
839 		  (sector >= rdev->mddev->reshape_position)))
840 		bio->bi_sector = sector + rdev->new_data_offset;
841 	else
842 		bio->bi_sector = sector + rdev->data_offset;
843 	bio_add_page(bio, page, size, 0);
844 	init_completion(&event);
845 	bio->bi_private = &event;
846 	bio->bi_end_io = bi_complete;
847 	submit_bio(rw, bio);
848 	wait_for_completion(&event);
849 
850 	ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
851 	bio_put(bio);
852 	return ret;
853 }
854 EXPORT_SYMBOL_GPL(sync_page_io);
855 
856 static int read_disk_sb(struct md_rdev * rdev, int size)
857 {
858 	char b[BDEVNAME_SIZE];
859 	if (!rdev->sb_page) {
860 		MD_BUG();
861 		return -EINVAL;
862 	}
863 	if (rdev->sb_loaded)
864 		return 0;
865 
866 
867 	if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true))
868 		goto fail;
869 	rdev->sb_loaded = 1;
870 	return 0;
871 
872 fail:
873 	printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
874 		bdevname(rdev->bdev,b));
875 	return -EINVAL;
876 }
877 
878 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
879 {
880 	return 	sb1->set_uuid0 == sb2->set_uuid0 &&
881 		sb1->set_uuid1 == sb2->set_uuid1 &&
882 		sb1->set_uuid2 == sb2->set_uuid2 &&
883 		sb1->set_uuid3 == sb2->set_uuid3;
884 }
885 
886 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
887 {
888 	int ret;
889 	mdp_super_t *tmp1, *tmp2;
890 
891 	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
892 	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
893 
894 	if (!tmp1 || !tmp2) {
895 		ret = 0;
896 		printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
897 		goto abort;
898 	}
899 
900 	*tmp1 = *sb1;
901 	*tmp2 = *sb2;
902 
903 	/*
904 	 * nr_disks is not constant
905 	 */
906 	tmp1->nr_disks = 0;
907 	tmp2->nr_disks = 0;
908 
909 	ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
910 abort:
911 	kfree(tmp1);
912 	kfree(tmp2);
913 	return ret;
914 }
915 
916 
917 static u32 md_csum_fold(u32 csum)
918 {
919 	csum = (csum & 0xffff) + (csum >> 16);
920 	return (csum & 0xffff) + (csum >> 16);
921 }
922 
923 static unsigned int calc_sb_csum(mdp_super_t * sb)
924 {
925 	u64 newcsum = 0;
926 	u32 *sb32 = (u32*)sb;
927 	int i;
928 	unsigned int disk_csum, csum;
929 
930 	disk_csum = sb->sb_csum;
931 	sb->sb_csum = 0;
932 
933 	for (i = 0; i < MD_SB_BYTES/4 ; i++)
934 		newcsum += sb32[i];
935 	csum = (newcsum & 0xffffffff) + (newcsum>>32);
936 
937 
938 #ifdef CONFIG_ALPHA
939 	/* This used to use csum_partial, which was wrong for several
940 	 * reasons including that different results are returned on
941 	 * different architectures.  It isn't critical that we get exactly
942 	 * the same return value as before (we always csum_fold before
943 	 * testing, and that removes any differences).  However as we
944 	 * know that csum_partial always returned a 16bit value on
945 	 * alphas, do a fold to maximise conformity to previous behaviour.
946 	 */
947 	sb->sb_csum = md_csum_fold(disk_csum);
948 #else
949 	sb->sb_csum = disk_csum;
950 #endif
951 	return csum;
952 }
953 
954 
955 /*
956  * Handle superblock details.
957  * We want to be able to handle multiple superblock formats
958  * so we have a common interface to them all, and an array of
959  * different handlers.
960  * We rely on user-space to write the initial superblock, and support
961  * reading and updating of superblocks.
962  * Interface methods are:
963  *   int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
964  *      loads and validates a superblock on dev.
965  *      if refdev != NULL, compare superblocks on both devices
966  *    Return:
967  *      0 - dev has a superblock that is compatible with refdev
968  *      1 - dev has a superblock that is compatible and newer than refdev
969  *          so dev should be used as the refdev in future
970  *     -EINVAL superblock incompatible or invalid
971  *     -othererror e.g. -EIO
972  *
973  *   int validate_super(struct mddev *mddev, struct md_rdev *dev)
974  *      Verify that dev is acceptable into mddev.
975  *       The first time, mddev->raid_disks will be 0, and data from
976  *       dev should be merged in.  Subsequent calls check that dev
977  *       is new enough.  Return 0 or -EINVAL
978  *
979  *   void sync_super(struct mddev *mddev, struct md_rdev *dev)
980  *     Update the superblock for rdev with data in mddev
981  *     This does not write to disc.
982  *
983  */
984 
985 struct super_type  {
986 	char		    *name;
987 	struct module	    *owner;
988 	int		    (*load_super)(struct md_rdev *rdev,
989 					  struct md_rdev *refdev,
990 					  int minor_version);
991 	int		    (*validate_super)(struct mddev *mddev,
992 					      struct md_rdev *rdev);
993 	void		    (*sync_super)(struct mddev *mddev,
994 					  struct md_rdev *rdev);
995 	unsigned long long  (*rdev_size_change)(struct md_rdev *rdev,
996 						sector_t num_sectors);
997 	int		    (*allow_new_offset)(struct md_rdev *rdev,
998 						unsigned long long new_offset);
999 };
1000 
1001 /*
1002  * Check that the given mddev has no bitmap.
1003  *
1004  * This function is called from the run method of all personalities that do not
1005  * support bitmaps. It prints an error message and returns non-zero if mddev
1006  * has a bitmap. Otherwise, it returns 0.
1007  *
1008  */
1009 int md_check_no_bitmap(struct mddev *mddev)
1010 {
1011 	if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
1012 		return 0;
1013 	printk(KERN_ERR "%s: bitmaps are not supported for %s\n",
1014 		mdname(mddev), mddev->pers->name);
1015 	return 1;
1016 }
1017 EXPORT_SYMBOL(md_check_no_bitmap);
1018 
1019 /*
1020  * load_super for 0.90.0
1021  */
1022 static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1023 {
1024 	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1025 	mdp_super_t *sb;
1026 	int ret;
1027 
1028 	/*
1029 	 * Calculate the position of the superblock (512byte sectors),
1030 	 * it's at the end of the disk.
1031 	 *
1032 	 * It also happens to be a multiple of 4Kb.
1033 	 */
1034 	rdev->sb_start = calc_dev_sboffset(rdev);
1035 
1036 	ret = read_disk_sb(rdev, MD_SB_BYTES);
1037 	if (ret) return ret;
1038 
1039 	ret = -EINVAL;
1040 
1041 	bdevname(rdev->bdev, b);
1042 	sb = page_address(rdev->sb_page);
1043 
1044 	if (sb->md_magic != MD_SB_MAGIC) {
1045 		printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
1046 		       b);
1047 		goto abort;
1048 	}
1049 
1050 	if (sb->major_version != 0 ||
1051 	    sb->minor_version < 90 ||
1052 	    sb->minor_version > 91) {
1053 		printk(KERN_WARNING "Bad version number %d.%d on %s\n",
1054 			sb->major_version, sb->minor_version,
1055 			b);
1056 		goto abort;
1057 	}
1058 
1059 	if (sb->raid_disks <= 0)
1060 		goto abort;
1061 
1062 	if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
1063 		printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
1064 			b);
1065 		goto abort;
1066 	}
1067 
1068 	rdev->preferred_minor = sb->md_minor;
1069 	rdev->data_offset = 0;
1070 	rdev->new_data_offset = 0;
1071 	rdev->sb_size = MD_SB_BYTES;
1072 	rdev->badblocks.shift = -1;
1073 
1074 	if (sb->level == LEVEL_MULTIPATH)
1075 		rdev->desc_nr = -1;
1076 	else
1077 		rdev->desc_nr = sb->this_disk.number;
1078 
1079 	if (!refdev) {
1080 		ret = 1;
1081 	} else {
1082 		__u64 ev1, ev2;
1083 		mdp_super_t *refsb = page_address(refdev->sb_page);
1084 		if (!uuid_equal(refsb, sb)) {
1085 			printk(KERN_WARNING "md: %s has different UUID to %s\n",
1086 				b, bdevname(refdev->bdev,b2));
1087 			goto abort;
1088 		}
1089 		if (!sb_equal(refsb, sb)) {
1090 			printk(KERN_WARNING "md: %s has same UUID"
1091 			       " but different superblock to %s\n",
1092 			       b, bdevname(refdev->bdev, b2));
1093 			goto abort;
1094 		}
1095 		ev1 = md_event(sb);
1096 		ev2 = md_event(refsb);
1097 		if (ev1 > ev2)
1098 			ret = 1;
1099 		else
1100 			ret = 0;
1101 	}
1102 	rdev->sectors = rdev->sb_start;
1103 	/* Limit to 4TB as metadata cannot record more than that.
1104 	 * (not needed for Linear and RAID0 as metadata doesn't
1105 	 * record this size)
1106 	 */
1107 	if (rdev->sectors >= (2ULL << 32) && sb->level >= 1)
1108 		rdev->sectors = (2ULL << 32) - 2;
1109 
1110 	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
1111 		/* "this cannot possibly happen" ... */
1112 		ret = -EINVAL;
1113 
1114  abort:
1115 	return ret;
1116 }
1117 
1118 /*
1119  * validate_super for 0.90.0
1120  */
1121 static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1122 {
1123 	mdp_disk_t *desc;
1124 	mdp_super_t *sb = page_address(rdev->sb_page);
1125 	__u64 ev1 = md_event(sb);
1126 
1127 	rdev->raid_disk = -1;
1128 	clear_bit(Faulty, &rdev->flags);
1129 	clear_bit(In_sync, &rdev->flags);
1130 	clear_bit(WriteMostly, &rdev->flags);
1131 
1132 	if (mddev->raid_disks == 0) {
1133 		mddev->major_version = 0;
1134 		mddev->minor_version = sb->minor_version;
1135 		mddev->patch_version = sb->patch_version;
1136 		mddev->external = 0;
1137 		mddev->chunk_sectors = sb->chunk_size >> 9;
1138 		mddev->ctime = sb->ctime;
1139 		mddev->utime = sb->utime;
1140 		mddev->level = sb->level;
1141 		mddev->clevel[0] = 0;
1142 		mddev->layout = sb->layout;
1143 		mddev->raid_disks = sb->raid_disks;
1144 		mddev->dev_sectors = ((sector_t)sb->size) * 2;
1145 		mddev->events = ev1;
1146 		mddev->bitmap_info.offset = 0;
1147 		mddev->bitmap_info.space = 0;
1148 		/* bitmap can use 60 K after the 4K superblocks */
1149 		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
1150 		mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
1151 		mddev->reshape_backwards = 0;
1152 
1153 		if (mddev->minor_version >= 91) {
1154 			mddev->reshape_position = sb->reshape_position;
1155 			mddev->delta_disks = sb->delta_disks;
1156 			mddev->new_level = sb->new_level;
1157 			mddev->new_layout = sb->new_layout;
1158 			mddev->new_chunk_sectors = sb->new_chunk >> 9;
1159 			if (mddev->delta_disks < 0)
1160 				mddev->reshape_backwards = 1;
1161 		} else {
1162 			mddev->reshape_position = MaxSector;
1163 			mddev->delta_disks = 0;
1164 			mddev->new_level = mddev->level;
1165 			mddev->new_layout = mddev->layout;
1166 			mddev->new_chunk_sectors = mddev->chunk_sectors;
1167 		}
1168 
1169 		if (sb->state & (1<<MD_SB_CLEAN))
1170 			mddev->recovery_cp = MaxSector;
1171 		else {
1172 			if (sb->events_hi == sb->cp_events_hi &&
1173 				sb->events_lo == sb->cp_events_lo) {
1174 				mddev->recovery_cp = sb->recovery_cp;
1175 			} else
1176 				mddev->recovery_cp = 0;
1177 		}
1178 
1179 		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
1180 		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
1181 		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
1182 		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
1183 
1184 		mddev->max_disks = MD_SB_DISKS;
1185 
1186 		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
1187 		    mddev->bitmap_info.file == NULL) {
1188 			mddev->bitmap_info.offset =
1189 				mddev->bitmap_info.default_offset;
1190 			mddev->bitmap_info.space =
1191 				mddev->bitmap_info.space;
1192 		}
1193 
1194 	} else if (mddev->pers == NULL) {
1195 		/* Insist on good event counter while assembling, except
1196 		 * for spares (which don't need an event count) */
1197 		++ev1;
1198 		if (sb->disks[rdev->desc_nr].state & (
1199 			    (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1200 			if (ev1 < mddev->events)
1201 				return -EINVAL;
1202 	} else if (mddev->bitmap) {
1203 		/* if adding to array with a bitmap, then we can accept an
1204 		 * older device ... but not too old.
1205 		 */
1206 		if (ev1 < mddev->bitmap->events_cleared)
1207 			return 0;
1208 	} else {
1209 		if (ev1 < mddev->events)
1210 			/* just a hot-add of a new device, leave raid_disk at -1 */
1211 			return 0;
1212 	}
1213 
1214 	if (mddev->level != LEVEL_MULTIPATH) {
1215 		desc = sb->disks + rdev->desc_nr;
1216 
1217 		if (desc->state & (1<<MD_DISK_FAULTY))
1218 			set_bit(Faulty, &rdev->flags);
1219 		else if (desc->state & (1<<MD_DISK_SYNC) /* &&
1220 			    desc->raid_disk < mddev->raid_disks */) {
1221 			set_bit(In_sync, &rdev->flags);
1222 			rdev->raid_disk = desc->raid_disk;
1223 		} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
1224 			/* active but not in sync implies recovery up to
1225 			 * reshape position.  We don't know exactly where
1226 			 * that is, so set to zero for now */
1227 			if (mddev->minor_version >= 91) {
1228 				rdev->recovery_offset = 0;
1229 				rdev->raid_disk = desc->raid_disk;
1230 			}
1231 		}
1232 		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
1233 			set_bit(WriteMostly, &rdev->flags);
1234 	} else /* MULTIPATH are always insync */
1235 		set_bit(In_sync, &rdev->flags);
1236 	return 0;
1237 }
1238 
1239 /*
1240  * sync_super for 0.90.0
1241  */
1242 static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
1243 {
1244 	mdp_super_t *sb;
1245 	struct md_rdev *rdev2;
1246 	int next_spare = mddev->raid_disks;
1247 
1248 
1249 	/* make rdev->sb match mddev data..
1250 	 *
1251 	 * 1/ zero out disks
1252 	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
1253 	 * 3/ any empty disks < next_spare become removed
1254 	 *
1255 	 * disks[0] gets initialised to REMOVED because
1256 	 * we cannot be sure from other fields if it has
1257 	 * been initialised or not.
1258 	 */
1259 	int i;
1260 	int active=0, working=0,failed=0,spare=0,nr_disks=0;
1261 
1262 	rdev->sb_size = MD_SB_BYTES;
1263 
1264 	sb = page_address(rdev->sb_page);
1265 
1266 	memset(sb, 0, sizeof(*sb));
1267 
1268 	sb->md_magic = MD_SB_MAGIC;
1269 	sb->major_version = mddev->major_version;
1270 	sb->patch_version = mddev->patch_version;
1271 	sb->gvalid_words  = 0; /* ignored */
1272 	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
1273 	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
1274 	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
1275 	memcpy(&sb->set_uuid3, mddev->uuid+12,4);
1276 
1277 	sb->ctime = mddev->ctime;
1278 	sb->level = mddev->level;
1279 	sb->size = mddev->dev_sectors / 2;
1280 	sb->raid_disks = mddev->raid_disks;
1281 	sb->md_minor = mddev->md_minor;
1282 	sb->not_persistent = 0;
1283 	sb->utime = mddev->utime;
1284 	sb->state = 0;
1285 	sb->events_hi = (mddev->events>>32);
1286 	sb->events_lo = (u32)mddev->events;
1287 
1288 	if (mddev->reshape_position == MaxSector)
1289 		sb->minor_version = 90;
1290 	else {
1291 		sb->minor_version = 91;
1292 		sb->reshape_position = mddev->reshape_position;
1293 		sb->new_level = mddev->new_level;
1294 		sb->delta_disks = mddev->delta_disks;
1295 		sb->new_layout = mddev->new_layout;
1296 		sb->new_chunk = mddev->new_chunk_sectors << 9;
1297 	}
1298 	mddev->minor_version = sb->minor_version;
1299 	if (mddev->in_sync)
1300 	{
1301 		sb->recovery_cp = mddev->recovery_cp;
1302 		sb->cp_events_hi = (mddev->events>>32);
1303 		sb->cp_events_lo = (u32)mddev->events;
1304 		if (mddev->recovery_cp == MaxSector)
1305 			sb->state = (1<< MD_SB_CLEAN);
1306 	} else
1307 		sb->recovery_cp = 0;
1308 
1309 	sb->layout = mddev->layout;
1310 	sb->chunk_size = mddev->chunk_sectors << 9;
1311 
1312 	if (mddev->bitmap && mddev->bitmap_info.file == NULL)
1313 		sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1314 
1315 	sb->disks[0].state = (1<<MD_DISK_REMOVED);
1316 	rdev_for_each(rdev2, mddev) {
1317 		mdp_disk_t *d;
1318 		int desc_nr;
1319 		int is_active = test_bit(In_sync, &rdev2->flags);
1320 
1321 		if (rdev2->raid_disk >= 0 &&
1322 		    sb->minor_version >= 91)
1323 			/* we have nowhere to store the recovery_offset,
1324 			 * but if it is not below the reshape_position,
1325 			 * we can piggy-back on that.
1326 			 */
1327 			is_active = 1;
1328 		if (rdev2->raid_disk < 0 ||
1329 		    test_bit(Faulty, &rdev2->flags))
1330 			is_active = 0;
1331 		if (is_active)
1332 			desc_nr = rdev2->raid_disk;
1333 		else
1334 			desc_nr = next_spare++;
1335 		rdev2->desc_nr = desc_nr;
1336 		d = &sb->disks[rdev2->desc_nr];
1337 		nr_disks++;
1338 		d->number = rdev2->desc_nr;
1339 		d->major = MAJOR(rdev2->bdev->bd_dev);
1340 		d->minor = MINOR(rdev2->bdev->bd_dev);
1341 		if (is_active)
1342 			d->raid_disk = rdev2->raid_disk;
1343 		else
1344 			d->raid_disk = rdev2->desc_nr; /* compatibility */
1345 		if (test_bit(Faulty, &rdev2->flags))
1346 			d->state = (1<<MD_DISK_FAULTY);
1347 		else if (is_active) {
1348 			d->state = (1<<MD_DISK_ACTIVE);
1349 			if (test_bit(In_sync, &rdev2->flags))
1350 				d->state |= (1<<MD_DISK_SYNC);
1351 			active++;
1352 			working++;
1353 		} else {
1354 			d->state = 0;
1355 			spare++;
1356 			working++;
1357 		}
1358 		if (test_bit(WriteMostly, &rdev2->flags))
1359 			d->state |= (1<<MD_DISK_WRITEMOSTLY);
1360 	}
1361 	/* now set the "removed" and "faulty" bits on any missing devices */
1362 	for (i=0 ; i < mddev->raid_disks ; i++) {
1363 		mdp_disk_t *d = &sb->disks[i];
1364 		if (d->state == 0 && d->number == 0) {
1365 			d->number = i;
1366 			d->raid_disk = i;
1367 			d->state = (1<<MD_DISK_REMOVED);
1368 			d->state |= (1<<MD_DISK_FAULTY);
1369 			failed++;
1370 		}
1371 	}
1372 	sb->nr_disks = nr_disks;
1373 	sb->active_disks = active;
1374 	sb->working_disks = working;
1375 	sb->failed_disks = failed;
1376 	sb->spare_disks = spare;
1377 
1378 	sb->this_disk = sb->disks[rdev->desc_nr];
1379 	sb->sb_csum = calc_sb_csum(sb);
1380 }
1381 
1382 /*
1383  * rdev_size_change for 0.90.0
1384  */
1385 static unsigned long long
1386 super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1387 {
1388 	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1389 		return 0; /* component must fit device */
1390 	if (rdev->mddev->bitmap_info.offset)
1391 		return 0; /* can't move bitmap */
1392 	rdev->sb_start = calc_dev_sboffset(rdev);
1393 	if (!num_sectors || num_sectors > rdev->sb_start)
1394 		num_sectors = rdev->sb_start;
1395 	/* Limit to 4TB as metadata cannot record more than that.
1396 	 * 4TB == 2^32 KB, or 2*2^32 sectors.
1397 	 */
1398 	if (num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
1399 		num_sectors = (2ULL << 32) - 2;
1400 	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1401 		       rdev->sb_page);
1402 	md_super_wait(rdev->mddev);
1403 	return num_sectors;
1404 }
1405 
/*
 * allow_new_offset for 0.90.0
 *
 * v0.90 cannot express a non-zero data offset, so the only offset that
 * is accepted (return 1) is 0; anything else is refused (return 0).
 * @rdev is not consulted.
 */
static int
super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
{
	if (new_offset != 0)
		return 0;
	return 1;
}
1412 
1413 /*
1414  * version 1 superblock
1415  */
1416 
1417 static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
1418 {
1419 	__le32 disk_csum;
1420 	u32 csum;
1421 	unsigned long long newcsum;
1422 	int size = 256 + le32_to_cpu(sb->max_dev)*2;
1423 	__le32 *isuper = (__le32*)sb;
1424 
1425 	disk_csum = sb->sb_csum;
1426 	sb->sb_csum = 0;
1427 	newcsum = 0;
1428 	for (; size >= 4; size -= 4)
1429 		newcsum += le32_to_cpu(*isuper++);
1430 
1431 	if (size == 2)
1432 		newcsum += le16_to_cpu(*(__le16*) isuper);
1433 
1434 	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1435 	sb->sb_csum = disk_csum;
1436 	return cpu_to_le32(csum);
1437 }
1438 
1439 static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
1440 			    int acknowledged);
1441 static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1442 {
1443 	struct mdp_superblock_1 *sb;
1444 	int ret;
1445 	sector_t sb_start;
1446 	sector_t sectors;
1447 	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1448 	int bmask;
1449 
1450 	/*
1451 	 * Calculate the position of the superblock in 512byte sectors.
1452 	 * It is always aligned to a 4K boundary and
1453 	 * depeding on minor_version, it can be:
1454 	 * 0: At least 8K, but less than 12K, from end of device
1455 	 * 1: At start of device
1456 	 * 2: 4K from start of device.
1457 	 */
1458 	switch(minor_version) {
1459 	case 0:
1460 		sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
1461 		sb_start -= 8*2;
1462 		sb_start &= ~(sector_t)(4*2-1);
1463 		break;
1464 	case 1:
1465 		sb_start = 0;
1466 		break;
1467 	case 2:
1468 		sb_start = 8;
1469 		break;
1470 	default:
1471 		return -EINVAL;
1472 	}
1473 	rdev->sb_start = sb_start;
1474 
1475 	/* superblock is rarely larger than 1K, but it can be larger,
1476 	 * and it is safe to read 4k, so we do that
1477 	 */
1478 	ret = read_disk_sb(rdev, 4096);
1479 	if (ret) return ret;
1480 
1481 
1482 	sb = page_address(rdev->sb_page);
1483 
1484 	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1485 	    sb->major_version != cpu_to_le32(1) ||
1486 	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1487 	    le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1488 	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1489 		return -EINVAL;
1490 
1491 	if (calc_sb_1_csum(sb) != sb->sb_csum) {
1492 		printk("md: invalid superblock checksum on %s\n",
1493 			bdevname(rdev->bdev,b));
1494 		return -EINVAL;
1495 	}
1496 	if (le64_to_cpu(sb->data_size) < 10) {
1497 		printk("md: data_size too small on %s\n",
1498 		       bdevname(rdev->bdev,b));
1499 		return -EINVAL;
1500 	}
1501 	if (sb->pad0 ||
1502 	    sb->pad3[0] ||
1503 	    memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
1504 		/* Some padding is non-zero, might be a new feature */
1505 		return -EINVAL;
1506 
1507 	rdev->preferred_minor = 0xffff;
1508 	rdev->data_offset = le64_to_cpu(sb->data_offset);
1509 	rdev->new_data_offset = rdev->data_offset;
1510 	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
1511 	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
1512 		rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
1513 	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1514 
1515 	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1516 	bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1517 	if (rdev->sb_size & bmask)
1518 		rdev->sb_size = (rdev->sb_size | bmask) + 1;
1519 
1520 	if (minor_version
1521 	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
1522 		return -EINVAL;
1523 	if (minor_version
1524 	    && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
1525 		return -EINVAL;
1526 
1527 	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1528 		rdev->desc_nr = -1;
1529 	else
1530 		rdev->desc_nr = le32_to_cpu(sb->dev_number);
1531 
1532 	if (!rdev->bb_page) {
1533 		rdev->bb_page = alloc_page(GFP_KERNEL);
1534 		if (!rdev->bb_page)
1535 			return -ENOMEM;
1536 	}
1537 	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
1538 	    rdev->badblocks.count == 0) {
1539 		/* need to load the bad block list.
1540 		 * Currently we limit it to one page.
1541 		 */
1542 		s32 offset;
1543 		sector_t bb_sector;
1544 		u64 *bbp;
1545 		int i;
1546 		int sectors = le16_to_cpu(sb->bblog_size);
1547 		if (sectors > (PAGE_SIZE / 512))
1548 			return -EINVAL;
1549 		offset = le32_to_cpu(sb->bblog_offset);
1550 		if (offset == 0)
1551 			return -EINVAL;
1552 		bb_sector = (long long)offset;
1553 		if (!sync_page_io(rdev, bb_sector, sectors << 9,
1554 				  rdev->bb_page, READ, true))
1555 			return -EIO;
1556 		bbp = (u64 *)page_address(rdev->bb_page);
1557 		rdev->badblocks.shift = sb->bblog_shift;
1558 		for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
1559 			u64 bb = le64_to_cpu(*bbp);
1560 			int count = bb & (0x3ff);
1561 			u64 sector = bb >> 10;
1562 			sector <<= sb->bblog_shift;
1563 			count <<= sb->bblog_shift;
1564 			if (bb + 1 == 0)
1565 				break;
1566 			if (md_set_badblocks(&rdev->badblocks,
1567 					     sector, count, 1) == 0)
1568 				return -EINVAL;
1569 		}
1570 	} else if (sb->bblog_offset != 0)
1571 		rdev->badblocks.shift = 0;
1572 
1573 	if (!refdev) {
1574 		ret = 1;
1575 	} else {
1576 		__u64 ev1, ev2;
1577 		struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
1578 
1579 		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1580 		    sb->level != refsb->level ||
1581 		    sb->layout != refsb->layout ||
1582 		    sb->chunksize != refsb->chunksize) {
1583 			printk(KERN_WARNING "md: %s has strangely different"
1584 				" superblock to %s\n",
1585 				bdevname(rdev->bdev,b),
1586 				bdevname(refdev->bdev,b2));
1587 			return -EINVAL;
1588 		}
1589 		ev1 = le64_to_cpu(sb->events);
1590 		ev2 = le64_to_cpu(refsb->events);
1591 
1592 		if (ev1 > ev2)
1593 			ret = 1;
1594 		else
1595 			ret = 0;
1596 	}
1597 	if (minor_version) {
1598 		sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
1599 		sectors -= rdev->data_offset;
1600 	} else
1601 		sectors = rdev->sb_start;
1602 	if (sectors < le64_to_cpu(sb->data_size))
1603 		return -EINVAL;
1604 	rdev->sectors = le64_to_cpu(sb->data_size);
1605 	return ret;
1606 }
1607 
1608 static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1609 {
1610 	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1611 	__u64 ev1 = le64_to_cpu(sb->events);
1612 
1613 	rdev->raid_disk = -1;
1614 	clear_bit(Faulty, &rdev->flags);
1615 	clear_bit(In_sync, &rdev->flags);
1616 	clear_bit(WriteMostly, &rdev->flags);
1617 
1618 	if (mddev->raid_disks == 0) {
1619 		mddev->major_version = 1;
1620 		mddev->patch_version = 0;
1621 		mddev->external = 0;
1622 		mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1623 		mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
1624 		mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
1625 		mddev->level = le32_to_cpu(sb->level);
1626 		mddev->clevel[0] = 0;
1627 		mddev->layout = le32_to_cpu(sb->layout);
1628 		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1629 		mddev->dev_sectors = le64_to_cpu(sb->size);
1630 		mddev->events = ev1;
1631 		mddev->bitmap_info.offset = 0;
1632 		mddev->bitmap_info.space = 0;
1633 		/* Default location for bitmap is 1K after superblock
1634 		 * using 3K - total of 4K
1635 		 */
1636 		mddev->bitmap_info.default_offset = 1024 >> 9;
1637 		mddev->bitmap_info.default_space = (4096-1024) >> 9;
1638 		mddev->reshape_backwards = 0;
1639 
1640 		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1641 		memcpy(mddev->uuid, sb->set_uuid, 16);
1642 
1643 		mddev->max_disks =  (4096-256)/2;
1644 
1645 		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1646 		    mddev->bitmap_info.file == NULL) {
1647 			mddev->bitmap_info.offset =
1648 				(__s32)le32_to_cpu(sb->bitmap_offset);
1649 			/* Metadata doesn't record how much space is available.
1650 			 * For 1.0, we assume we can use up to the superblock
1651 			 * if before, else to 4K beyond superblock.
1652 			 * For others, assume no change is possible.
1653 			 */
1654 			if (mddev->minor_version > 0)
1655 				mddev->bitmap_info.space = 0;
1656 			else if (mddev->bitmap_info.offset > 0)
1657 				mddev->bitmap_info.space =
1658 					8 - mddev->bitmap_info.offset;
1659 			else
1660 				mddev->bitmap_info.space =
1661 					-mddev->bitmap_info.offset;
1662 		}
1663 
1664 		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1665 			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1666 			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1667 			mddev->new_level = le32_to_cpu(sb->new_level);
1668 			mddev->new_layout = le32_to_cpu(sb->new_layout);
1669 			mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1670 			if (mddev->delta_disks < 0 ||
1671 			    (mddev->delta_disks == 0 &&
1672 			     (le32_to_cpu(sb->feature_map)
1673 			      & MD_FEATURE_RESHAPE_BACKWARDS)))
1674 				mddev->reshape_backwards = 1;
1675 		} else {
1676 			mddev->reshape_position = MaxSector;
1677 			mddev->delta_disks = 0;
1678 			mddev->new_level = mddev->level;
1679 			mddev->new_layout = mddev->layout;
1680 			mddev->new_chunk_sectors = mddev->chunk_sectors;
1681 		}
1682 
1683 	} else if (mddev->pers == NULL) {
1684 		/* Insist of good event counter while assembling, except for
1685 		 * spares (which don't need an event count) */
1686 		++ev1;
1687 		if (rdev->desc_nr >= 0 &&
1688 		    rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1689 		    le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe)
1690 			if (ev1 < mddev->events)
1691 				return -EINVAL;
1692 	} else if (mddev->bitmap) {
1693 		/* If adding to array with a bitmap, then we can accept an
1694 		 * older device, but not too old.
1695 		 */
1696 		if (ev1 < mddev->bitmap->events_cleared)
1697 			return 0;
1698 	} else {
1699 		if (ev1 < mddev->events)
1700 			/* just a hot-add of a new device, leave raid_disk at -1 */
1701 			return 0;
1702 	}
1703 	if (mddev->level != LEVEL_MULTIPATH) {
1704 		int role;
1705 		if (rdev->desc_nr < 0 ||
1706 		    rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1707 			role = 0xffff;
1708 			rdev->desc_nr = -1;
1709 		} else
1710 			role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1711 		switch(role) {
1712 		case 0xffff: /* spare */
1713 			break;
1714 		case 0xfffe: /* faulty */
1715 			set_bit(Faulty, &rdev->flags);
1716 			break;
1717 		default:
1718 			if ((le32_to_cpu(sb->feature_map) &
1719 			     MD_FEATURE_RECOVERY_OFFSET))
1720 				rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1721 			else
1722 				set_bit(In_sync, &rdev->flags);
1723 			rdev->raid_disk = role;
1724 			break;
1725 		}
1726 		if (sb->devflags & WriteMostly1)
1727 			set_bit(WriteMostly, &rdev->flags);
1728 		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
1729 			set_bit(Replacement, &rdev->flags);
1730 	} else /* MULTIPATH are always insync */
1731 		set_bit(In_sync, &rdev->flags);
1732 
1733 	return 0;
1734 }
1735 
1736 static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1737 {
1738 	struct mdp_superblock_1 *sb;
1739 	struct md_rdev *rdev2;
1740 	int max_dev, i;
1741 	/* make rdev->sb match mddev and rdev data. */
1742 
1743 	sb = page_address(rdev->sb_page);
1744 
1745 	sb->feature_map = 0;
1746 	sb->pad0 = 0;
1747 	sb->recovery_offset = cpu_to_le64(0);
1748 	memset(sb->pad3, 0, sizeof(sb->pad3));
1749 
1750 	sb->utime = cpu_to_le64((__u64)mddev->utime);
1751 	sb->events = cpu_to_le64(mddev->events);
1752 	if (mddev->in_sync)
1753 		sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1754 	else
1755 		sb->resync_offset = cpu_to_le64(0);
1756 
1757 	sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
1758 
1759 	sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1760 	sb->size = cpu_to_le64(mddev->dev_sectors);
1761 	sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
1762 	sb->level = cpu_to_le32(mddev->level);
1763 	sb->layout = cpu_to_le32(mddev->layout);
1764 
1765 	if (test_bit(WriteMostly, &rdev->flags))
1766 		sb->devflags |= WriteMostly1;
1767 	else
1768 		sb->devflags &= ~WriteMostly1;
1769 	sb->data_offset = cpu_to_le64(rdev->data_offset);
1770 	sb->data_size = cpu_to_le64(rdev->sectors);
1771 
1772 	if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
1773 		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
1774 		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1775 	}
1776 
1777 	if (rdev->raid_disk >= 0 &&
1778 	    !test_bit(In_sync, &rdev->flags)) {
1779 		sb->feature_map |=
1780 			cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1781 		sb->recovery_offset =
1782 			cpu_to_le64(rdev->recovery_offset);
1783 	}
1784 	if (test_bit(Replacement, &rdev->flags))
1785 		sb->feature_map |=
1786 			cpu_to_le32(MD_FEATURE_REPLACEMENT);
1787 
1788 	if (mddev->reshape_position != MaxSector) {
1789 		sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1790 		sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1791 		sb->new_layout = cpu_to_le32(mddev->new_layout);
1792 		sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1793 		sb->new_level = cpu_to_le32(mddev->new_level);
1794 		sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
1795 		if (mddev->delta_disks == 0 &&
1796 		    mddev->reshape_backwards)
1797 			sb->feature_map
1798 				|= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
1799 		if (rdev->new_data_offset != rdev->data_offset) {
1800 			sb->feature_map
1801 				|= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
1802 			sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
1803 							     - rdev->data_offset));
1804 		}
1805 	}
1806 
1807 	if (rdev->badblocks.count == 0)
1808 		/* Nothing to do for bad blocks*/ ;
1809 	else if (sb->bblog_offset == 0)
1810 		/* Cannot record bad blocks on this device */
1811 		md_error(mddev, rdev);
1812 	else {
1813 		struct badblocks *bb = &rdev->badblocks;
1814 		u64 *bbp = (u64 *)page_address(rdev->bb_page);
1815 		u64 *p = bb->page;
1816 		sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
1817 		if (bb->changed) {
1818 			unsigned seq;
1819 
1820 retry:
1821 			seq = read_seqbegin(&bb->lock);
1822 
1823 			memset(bbp, 0xff, PAGE_SIZE);
1824 
1825 			for (i = 0 ; i < bb->count ; i++) {
1826 				u64 internal_bb = p[i];
1827 				u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
1828 						| BB_LEN(internal_bb));
1829 				bbp[i] = cpu_to_le64(store_bb);
1830 			}
1831 			bb->changed = 0;
1832 			if (read_seqretry(&bb->lock, seq))
1833 				goto retry;
1834 
1835 			bb->sector = (rdev->sb_start +
1836 				      (int)le32_to_cpu(sb->bblog_offset));
1837 			bb->size = le16_to_cpu(sb->bblog_size);
1838 		}
1839 	}
1840 
1841 	max_dev = 0;
1842 	rdev_for_each(rdev2, mddev)
1843 		if (rdev2->desc_nr+1 > max_dev)
1844 			max_dev = rdev2->desc_nr+1;
1845 
1846 	if (max_dev > le32_to_cpu(sb->max_dev)) {
1847 		int bmask;
1848 		sb->max_dev = cpu_to_le32(max_dev);
1849 		rdev->sb_size = max_dev * 2 + 256;
1850 		bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1851 		if (rdev->sb_size & bmask)
1852 			rdev->sb_size = (rdev->sb_size | bmask) + 1;
1853 	} else
1854 		max_dev = le32_to_cpu(sb->max_dev);
1855 
1856 	for (i=0; i<max_dev;i++)
1857 		sb->dev_roles[i] = cpu_to_le16(0xfffe);
1858 
1859 	rdev_for_each(rdev2, mddev) {
1860 		i = rdev2->desc_nr;
1861 		if (test_bit(Faulty, &rdev2->flags))
1862 			sb->dev_roles[i] = cpu_to_le16(0xfffe);
1863 		else if (test_bit(In_sync, &rdev2->flags))
1864 			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1865 		else if (rdev2->raid_disk >= 0)
1866 			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1867 		else
1868 			sb->dev_roles[i] = cpu_to_le16(0xffff);
1869 	}
1870 
1871 	sb->sb_csum = calc_sb_1_csum(sb);
1872 }
1873 
1874 static unsigned long long
1875 super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1876 {
1877 	struct mdp_superblock_1 *sb;
1878 	sector_t max_sectors;
1879 	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1880 		return 0; /* component must fit device */
1881 	if (rdev->data_offset != rdev->new_data_offset)
1882 		return 0; /* too confusing */
1883 	if (rdev->sb_start < rdev->data_offset) {
1884 		/* minor versions 1 and 2; superblock before data */
1885 		max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
1886 		max_sectors -= rdev->data_offset;
1887 		if (!num_sectors || num_sectors > max_sectors)
1888 			num_sectors = max_sectors;
1889 	} else if (rdev->mddev->bitmap_info.offset) {
1890 		/* minor version 0 with bitmap we can't move */
1891 		return 0;
1892 	} else {
1893 		/* minor version 0; superblock after data */
1894 		sector_t sb_start;
1895 		sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
1896 		sb_start &= ~(sector_t)(4*2 - 1);
1897 		max_sectors = rdev->sectors + sb_start - rdev->sb_start;
1898 		if (!num_sectors || num_sectors > max_sectors)
1899 			num_sectors = max_sectors;
1900 		rdev->sb_start = sb_start;
1901 	}
1902 	sb = page_address(rdev->sb_page);
1903 	sb->data_size = cpu_to_le64(num_sectors);
1904 	sb->super_offset = rdev->sb_start;
1905 	sb->sb_csum = calc_sb_1_csum(sb);
1906 	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1907 		       rdev->sb_page);
1908 	md_super_wait(rdev->mddev);
1909 	return num_sectors;
1910 
1911 }
1912 
1913 static int
1914 super_1_allow_new_offset(struct md_rdev *rdev,
1915 			 unsigned long long new_offset)
1916 {
1917 	/* All necessary checks on new >= old have been done */
1918 	struct bitmap *bitmap;
1919 	if (new_offset >= rdev->data_offset)
1920 		return 1;
1921 
1922 	/* with 1.0 metadata, there is no metadata to tread on
1923 	 * so we can always move back */
1924 	if (rdev->mddev->minor_version == 0)
1925 		return 1;
1926 
1927 	/* otherwise we must be sure not to step on
1928 	 * any metadata, so stay:
1929 	 * 36K beyond start of superblock
1930 	 * beyond end of badblocks
1931 	 * beyond write-intent bitmap
1932 	 */
1933 	if (rdev->sb_start + (32+4)*2 > new_offset)
1934 		return 0;
1935 	bitmap = rdev->mddev->bitmap;
1936 	if (bitmap && !rdev->mddev->bitmap_info.file &&
1937 	    rdev->sb_start + rdev->mddev->bitmap_info.offset +
1938 	    bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
1939 		return 0;
1940 	if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
1941 		return 0;
1942 
1943 	return 1;
1944 }
1945 
1946 static struct super_type super_types[] = {
1947 	[0] = {
1948 		.name	= "0.90.0",
1949 		.owner	= THIS_MODULE,
1950 		.load_super	    = super_90_load,
1951 		.validate_super	    = super_90_validate,
1952 		.sync_super	    = super_90_sync,
1953 		.rdev_size_change   = super_90_rdev_size_change,
1954 		.allow_new_offset   = super_90_allow_new_offset,
1955 	},
1956 	[1] = {
1957 		.name	= "md-1",
1958 		.owner	= THIS_MODULE,
1959 		.load_super	    = super_1_load,
1960 		.validate_super	    = super_1_validate,
1961 		.sync_super	    = super_1_sync,
1962 		.rdev_size_change   = super_1_rdev_size_change,
1963 		.allow_new_offset   = super_1_allow_new_offset,
1964 	},
1965 };
1966 
1967 static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
1968 {
1969 	if (mddev->sync_super) {
1970 		mddev->sync_super(mddev, rdev);
1971 		return;
1972 	}
1973 
1974 	BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));
1975 
1976 	super_types[mddev->major_version].sync_super(mddev, rdev);
1977 }
1978 
1979 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
1980 {
1981 	struct md_rdev *rdev, *rdev2;
1982 
1983 	rcu_read_lock();
1984 	rdev_for_each_rcu(rdev, mddev1)
1985 		rdev_for_each_rcu(rdev2, mddev2)
1986 			if (rdev->bdev->bd_contains ==
1987 			    rdev2->bdev->bd_contains) {
1988 				rcu_read_unlock();
1989 				return 1;
1990 			}
1991 	rcu_read_unlock();
1992 	return 0;
1993 }
1994 
1995 static LIST_HEAD(pending_raid_disks);
1996 
1997 /*
1998  * Try to register data integrity profile for an mddev
1999  *
2000  * This is called when an array is started and after a disk has been kicked
2001  * from the array. It only succeeds if all working and active component devices
2002  * are integrity capable with matching profiles.
2003  */
2004 int md_integrity_register(struct mddev *mddev)
2005 {
2006 	struct md_rdev *rdev, *reference = NULL;
2007 
2008 	if (list_empty(&mddev->disks))
2009 		return 0; /* nothing to do */
2010 	if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
2011 		return 0; /* shouldn't register, or already is */
2012 	rdev_for_each(rdev, mddev) {
2013 		/* skip spares and non-functional disks */
2014 		if (test_bit(Faulty, &rdev->flags))
2015 			continue;
2016 		if (rdev->raid_disk < 0)
2017 			continue;
2018 		if (!reference) {
2019 			/* Use the first rdev as the reference */
2020 			reference = rdev;
2021 			continue;
2022 		}
2023 		/* does this rdev's profile match the reference profile? */
2024 		if (blk_integrity_compare(reference->bdev->bd_disk,
2025 				rdev->bdev->bd_disk) < 0)
2026 			return -EINVAL;
2027 	}
2028 	if (!reference || !bdev_get_integrity(reference->bdev))
2029 		return 0;
2030 	/*
2031 	 * All component devices are integrity capable and have matching
2032 	 * profiles, register the common profile for the md device.
2033 	 */
2034 	if (blk_integrity_register(mddev->gendisk,
2035 			bdev_get_integrity(reference->bdev)) != 0) {
2036 		printk(KERN_ERR "md: failed to register integrity for %s\n",
2037 			mdname(mddev));
2038 		return -EINVAL;
2039 	}
2040 	printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev));
2041 	if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) {
2042 		printk(KERN_ERR "md: failed to create integrity pool for %s\n",
2043 		       mdname(mddev));
2044 		return -EINVAL;
2045 	}
2046 	return 0;
2047 }
2048 EXPORT_SYMBOL(md_integrity_register);
2049 
2050 /* Disable data integrity if non-capable/non-matching disk is being added */
2051 void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
2052 {
2053 	struct blk_integrity *bi_rdev;
2054 	struct blk_integrity *bi_mddev;
2055 
2056 	if (!mddev->gendisk)
2057 		return;
2058 
2059 	bi_rdev = bdev_get_integrity(rdev->bdev);
2060 	bi_mddev = blk_get_integrity(mddev->gendisk);
2061 
2062 	if (!bi_mddev) /* nothing to do */
2063 		return;
2064 	if (rdev->raid_disk < 0) /* skip spares */
2065 		return;
2066 	if (bi_rdev && blk_integrity_compare(mddev->gendisk,
2067 					     rdev->bdev->bd_disk) >= 0)
2068 		return;
2069 	printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev));
2070 	blk_integrity_unregister(mddev->gendisk);
2071 }
2072 EXPORT_SYMBOL(md_integrity_add_rdev);
2073 
2074 static int bind_rdev_to_array(struct md_rdev * rdev, struct mddev * mddev)
2075 {
2076 	char b[BDEVNAME_SIZE];
2077 	struct kobject *ko;
2078 	char *s;
2079 	int err;
2080 
2081 	if (rdev->mddev) {
2082 		MD_BUG();
2083 		return -EINVAL;
2084 	}
2085 
2086 	/* prevent duplicates */
2087 	if (find_rdev(mddev, rdev->bdev->bd_dev))
2088 		return -EEXIST;
2089 
2090 	/* make sure rdev->sectors exceeds mddev->dev_sectors */
2091 	if (rdev->sectors && (mddev->dev_sectors == 0 ||
2092 			rdev->sectors < mddev->dev_sectors)) {
2093 		if (mddev->pers) {
2094 			/* Cannot change size, so fail
2095 			 * If mddev->level <= 0, then we don't care
2096 			 * about aligning sizes (e.g. linear)
2097 			 */
2098 			if (mddev->level > 0)
2099 				return -ENOSPC;
2100 		} else
2101 			mddev->dev_sectors = rdev->sectors;
2102 	}
2103 
2104 	/* Verify rdev->desc_nr is unique.
2105 	 * If it is -1, assign a free number, else
2106 	 * check number is not in use
2107 	 */
2108 	if (rdev->desc_nr < 0) {
2109 		int choice = 0;
2110 		if (mddev->pers) choice = mddev->raid_disks;
2111 		while (find_rdev_nr(mddev, choice))
2112 			choice++;
2113 		rdev->desc_nr = choice;
2114 	} else {
2115 		if (find_rdev_nr(mddev, rdev->desc_nr))
2116 			return -EBUSY;
2117 	}
2118 	if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
2119 		printk(KERN_WARNING "md: %s: array is limited to %d devices\n",
2120 		       mdname(mddev), mddev->max_disks);
2121 		return -EBUSY;
2122 	}
2123 	bdevname(rdev->bdev,b);
2124 	while ( (s=strchr(b, '/')) != NULL)
2125 		*s = '!';
2126 
2127 	rdev->mddev = mddev;
2128 	printk(KERN_INFO "md: bind<%s>\n", b);
2129 
2130 	if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
2131 		goto fail;
2132 
2133 	ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
2134 	if (sysfs_create_link(&rdev->kobj, ko, "block"))
2135 		/* failure here is OK */;
2136 	rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
2137 
2138 	list_add_rcu(&rdev->same_set, &mddev->disks);
2139 	bd_link_disk_holder(rdev->bdev, mddev->gendisk);
2140 
2141 	/* May as well allow recovery to be retried once */
2142 	mddev->recovery_disabled++;
2143 
2144 	return 0;
2145 
2146  fail:
2147 	printk(KERN_WARNING "md: failed to register dev-%s for %s\n",
2148 	       b, mdname(mddev));
2149 	return err;
2150 }
2151 
2152 static void md_delayed_delete(struct work_struct *ws)
2153 {
2154 	struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
2155 	kobject_del(&rdev->kobj);
2156 	kobject_put(&rdev->kobj);
2157 }
2158 
2159 static void unbind_rdev_from_array(struct md_rdev * rdev)
2160 {
2161 	char b[BDEVNAME_SIZE];
2162 	if (!rdev->mddev) {
2163 		MD_BUG();
2164 		return;
2165 	}
2166 	bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
2167 	list_del_rcu(&rdev->same_set);
2168 	printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
2169 	rdev->mddev = NULL;
2170 	sysfs_remove_link(&rdev->kobj, "block");
2171 	sysfs_put(rdev->sysfs_state);
2172 	rdev->sysfs_state = NULL;
2173 	rdev->badblocks.count = 0;
2174 	/* We need to delay this, otherwise we can deadlock when
2175 	 * writing to 'remove' to "dev/state".  We also need
2176 	 * to delay it due to rcu usage.
2177 	 */
2178 	synchronize_rcu();
2179 	INIT_WORK(&rdev->del_work, md_delayed_delete);
2180 	kobject_get(&rdev->kobj);
2181 	queue_work(md_misc_wq, &rdev->del_work);
2182 }
2183 
2184 /*
2185  * prevent the device from being mounted, repartitioned or
2186  * otherwise reused by a RAID array (or any other kernel
2187  * subsystem), by bd_claiming the device.
2188  */
2189 static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared)
2190 {
2191 	int err = 0;
2192 	struct block_device *bdev;
2193 	char b[BDEVNAME_SIZE];
2194 
2195 	bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
2196 				 shared ? (struct md_rdev *)lock_rdev : rdev);
2197 	if (IS_ERR(bdev)) {
2198 		printk(KERN_ERR "md: could not open %s.\n",
2199 			__bdevname(dev, b));
2200 		return PTR_ERR(bdev);
2201 	}
2202 	rdev->bdev = bdev;
2203 	return err;
2204 }
2205 
2206 static void unlock_rdev(struct md_rdev *rdev)
2207 {
2208 	struct block_device *bdev = rdev->bdev;
2209 	rdev->bdev = NULL;
2210 	if (!bdev)
2211 		MD_BUG();
2212 	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2213 }
2214 
/*
 * Forward declaration: export_rdev() below calls this for AutoDetected
 * devices when the driver is not built as a module.  The definition is not
 * in this part of the file.
 */
2215 void md_autodetect_dev(dev_t dev);
2216 
2217 static void export_rdev(struct md_rdev * rdev)
2218 {
2219 	char b[BDEVNAME_SIZE];
2220 	printk(KERN_INFO "md: export_rdev(%s)\n",
2221 		bdevname(rdev->bdev,b));
2222 	if (rdev->mddev)
2223 		MD_BUG();
2224 	md_rdev_clear(rdev);
2225 #ifndef MODULE
2226 	if (test_bit(AutoDetected, &rdev->flags))
2227 		md_autodetect_dev(rdev->bdev->bd_dev);
2228 #endif
2229 	unlock_rdev(rdev);
2230 	kobject_put(&rdev->kobj);
2231 }
2232 
/*
 * Remove @rdev from its array completely: first detach it with
 * unbind_rdev_from_array(), then release it with export_rdev().
 */
2233 static void kick_rdev_from_array(struct md_rdev * rdev)
2234 {
2235 	unbind_rdev_from_array(rdev);
2236 	export_rdev(rdev);
2237 }
2238 
2239 static void export_array(struct mddev *mddev)
2240 {
2241 	struct md_rdev *rdev, *tmp;
2242 
2243 	rdev_for_each_safe(rdev, tmp, mddev) {
2244 		if (!rdev->mddev) {
2245 			MD_BUG();
2246 			continue;
2247 		}
2248 		kick_rdev_from_array(rdev);
2249 	}
2250 	if (!list_empty(&mddev->disks))
2251 		MD_BUG();
2252 	mddev->raid_disks = 0;
2253 	mddev->major_version = 0;
2254 }
2255 
2256 static void print_desc(mdp_disk_t *desc)
2257 {
2258 	printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
2259 		desc->major,desc->minor,desc->raid_disk,desc->state);
2260 }
2261 
2262 static void print_sb_90(mdp_super_t *sb)
2263 {
2264 	int i;
2265 
2266 	printk(KERN_INFO
2267 		"md:  SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
2268 		sb->major_version, sb->minor_version, sb->patch_version,
2269 		sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
2270 		sb->ctime);
2271 	printk(KERN_INFO "md:     L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
2272 		sb->level, sb->size, sb->nr_disks, sb->raid_disks,
2273 		sb->md_minor, sb->layout, sb->chunk_size);
2274 	printk(KERN_INFO "md:     UT:%08x ST:%d AD:%d WD:%d"
2275 		" FD:%d SD:%d CSUM:%08x E:%08lx\n",
2276 		sb->utime, sb->state, sb->active_disks, sb->working_disks,
2277 		sb->failed_disks, sb->spare_disks,
2278 		sb->sb_csum, (unsigned long)sb->events_lo);
2279 
2280 	printk(KERN_INFO);
2281 	for (i = 0; i < MD_SB_DISKS; i++) {
2282 		mdp_disk_t *desc;
2283 
2284 		desc = sb->disks + i;
2285 		if (desc->number || desc->major || desc->minor ||
2286 		    desc->raid_disk || (desc->state && (desc->state != 4))) {
2287 			printk("     D %2d: ", i);
2288 			print_desc(desc);
2289 		}
2290 	}
2291 	printk(KERN_INFO "md:     THIS: ");
2292 	print_desc(&sb->this_disk);
2293 }
2294 
2295 static void print_sb_1(struct mdp_superblock_1 *sb)
2296 {
2297 	__u8 *uuid;
2298 
2299 	uuid = sb->set_uuid;
2300 	printk(KERN_INFO
2301 	       "md:  SB: (V:%u) (F:0x%08x) Array-ID:<%pU>\n"
2302 	       "md:    Name: \"%s\" CT:%llu\n",
2303 		le32_to_cpu(sb->major_version),
2304 		le32_to_cpu(sb->feature_map),
2305 		uuid,
2306 		sb->set_name,
2307 		(unsigned long long)le64_to_cpu(sb->ctime)
2308 		       & MD_SUPERBLOCK_1_TIME_SEC_MASK);
2309 
2310 	uuid = sb->device_uuid;
2311 	printk(KERN_INFO
2312 	       "md:       L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu"
2313 			" RO:%llu\n"
2314 	       "md:     Dev:%08x UUID: %pU\n"
2315 	       "md:       (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n"
2316 	       "md:         (MaxDev:%u) \n",
2317 		le32_to_cpu(sb->level),
2318 		(unsigned long long)le64_to_cpu(sb->size),
2319 		le32_to_cpu(sb->raid_disks),
2320 		le32_to_cpu(sb->layout),
2321 		le32_to_cpu(sb->chunksize),
2322 		(unsigned long long)le64_to_cpu(sb->data_offset),
2323 		(unsigned long long)le64_to_cpu(sb->data_size),
2324 		(unsigned long long)le64_to_cpu(sb->super_offset),
2325 		(unsigned long long)le64_to_cpu(sb->recovery_offset),
2326 		le32_to_cpu(sb->dev_number),
2327 		uuid,
2328 		sb->devflags,
2329 		(unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK,
2330 		(unsigned long long)le64_to_cpu(sb->events),
2331 		(unsigned long long)le64_to_cpu(sb->resync_offset),
2332 		le32_to_cpu(sb->sb_csum),
2333 		le32_to_cpu(sb->max_dev)
2334 		);
2335 }
2336 
2337 static void print_rdev(struct md_rdev *rdev, int major_version)
2338 {
2339 	char b[BDEVNAME_SIZE];
2340 	printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n",
2341 		bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors,
2342 	        test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
2343 	        rdev->desc_nr);
2344 	if (rdev->sb_loaded) {
2345 		printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version);
2346 		switch (major_version) {
2347 		case 0:
2348 			print_sb_90(page_address(rdev->sb_page));
2349 			break;
2350 		case 1:
2351 			print_sb_1(page_address(rdev->sb_page));
2352 			break;
2353 		}
2354 	} else
2355 		printk(KERN_INFO "md: no rdev superblock!\n");
2356 }
2357 
2358 static void md_print_devices(void)
2359 {
2360 	struct list_head *tmp;
2361 	struct md_rdev *rdev;
2362 	struct mddev *mddev;
2363 	char b[BDEVNAME_SIZE];
2364 
2365 	printk("\n");
2366 	printk("md:	**********************************\n");
2367 	printk("md:	* <COMPLETE RAID STATE PRINTOUT> *\n");
2368 	printk("md:	**********************************\n");
2369 	for_each_mddev(mddev, tmp) {
2370 
2371 		if (mddev->bitmap)
2372 			bitmap_print_sb(mddev->bitmap);
2373 		else
2374 			printk("%s: ", mdname(mddev));
2375 		rdev_for_each(rdev, mddev)
2376 			printk("<%s>", bdevname(rdev->bdev,b));
2377 		printk("\n");
2378 
2379 		rdev_for_each(rdev, mddev)
2380 			print_rdev(rdev, mddev->major_version);
2381 	}
2382 	printk("md:	**********************************\n");
2383 	printk("\n");
2384 }
2385 
2386 
2387 static void sync_sbs(struct mddev * mddev, int nospares)
2388 {
2389 	/* Update each superblock (in-memory image), but
2390 	 * if we are allowed to, skip spares which already
2391 	 * have the right event counter, or have one earlier
2392 	 * (which would mean they aren't being marked as dirty
2393 	 * with the rest of the array)
2394 	 */
2395 	struct md_rdev *rdev;
2396 	rdev_for_each(rdev, mddev) {
2397 		if (rdev->sb_events == mddev->events ||
2398 		    (nospares &&
2399 		     rdev->raid_disk < 0 &&
2400 		     rdev->sb_events+1 == mddev->events)) {
2401 			/* Don't update this superblock */
2402 			rdev->sb_loaded = 2;
2403 		} else {
2404 			sync_super(mddev, rdev);
2405 			rdev->sb_loaded = 1;
2406 		}
2407 	}
2408 }
2409 
/*
 * Write the array's superblocks out to its component devices.
 *
 * @mddev:        array whose metadata is to be updated
 * @force_change: non-zero to treat this as a device-level change; it has
 *                the same effect as MD_CHANGE_DEVS being set and disables
 *                the "leave spares alone" optimisation
 *
 * On a read-only array nothing is written; a forced change is only
 * recorded by setting MD_CHANGE_DEVS.  For an array without persistent
 * metadata the change flags are cleared and waiters are woken without any
 * I/O.  Otherwise the event counter is stepped, every in-memory superblock
 * is refreshed through sync_sbs() and written with md_super_write(), and
 * the whole sequence is repeated if in_sync changed or MD_CHANGE_DEVS was
 * set while the writes were in flight.
 */
2410 static void md_update_sb(struct mddev * mddev, int force_change)
2411 {
2412 	struct md_rdev *rdev;
2413 	int sync_req;
2414 	int nospares = 0;
2415 	int any_badblocks_changed = 0;
2416 
	/* read-only: write nothing, only remember a forced change */
2417 	if (mddev->ro) {
2418 		if (force_change)
2419 			set_bit(MD_CHANGE_DEVS, &mddev->flags);
2420 		return;
2421 	}
2422 repeat:
2423 	/* First make sure individual recovery_offsets are correct */
2424 	rdev_for_each(rdev, mddev) {
2425 		if (rdev->raid_disk >= 0 &&
2426 		    mddev->delta_disks >= 0 &&
2427 		    !test_bit(In_sync, &rdev->flags) &&
2428 		    mddev->curr_resync_completed > rdev->recovery_offset)
2429 				rdev->recovery_offset = mddev->curr_resync_completed;
2430 
2431 	}
	/* no persistent metadata: nothing to write, clear flags and wake waiters */
2432 	if (!mddev->persistent) {
2433 		clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
2434 		clear_bit(MD_CHANGE_DEVS, &mddev->flags);
2435 		if (!mddev->external) {
2436 			clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2437 			rdev_for_each(rdev, mddev) {
2438 				if (rdev->badblocks.changed) {
2439 					rdev->badblocks.changed = 0;
2440 					md_ack_all_badblocks(&rdev->badblocks);
2441 					md_error(mddev, rdev);
2442 				}
2443 				clear_bit(Blocked, &rdev->flags);
2444 				clear_bit(BlockedBadBlocks, &rdev->flags);
2445 				wake_up(&rdev->blocked_wait);
2446 			}
2447 		}
2448 		wake_up(&mddev->sb_wait);
2449 		return;
2450 	}
2451 
2452 	spin_lock_irq(&mddev->write_lock);
2453 
2454 	mddev->utime = get_seconds();
2455 
2456 	if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
2457 		force_change = 1;
2458 	if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
2459 		/* just a clean<-> dirty transition, possibly leave spares alone,
2460 		 * though if events isn't the right even/odd, we will have to do
2461 		 * spares after all
2462 		 */
2463 		nospares = 1;
2464 	if (force_change)
2465 		nospares = 0;
2466 	if (mddev->degraded)
2467 		/* If the array is degraded, then skipping spares is both
2468 		 * dangerous and fairly pointless.
2469 		 * Dangerous because a device that was removed from the array
2470 		 * might have a event_count that still looks up-to-date,
2471 		 * so it can be re-added without a resync.
2472 		 * Pointless because if there are any spares to skip,
2473 		 * then a recovery will happen and soon that array won't
2474 		 * be degraded any more and the spare can go back to sleep then.
2475 		 */
2476 		nospares = 0;
2477 
	/* remember in_sync so that a change during the write is noticed below */
2478 	sync_req = mddev->in_sync;
2479 
2480 	/* If this is just a dirty<->clean transition, and the array is clean
2481 	 * and 'events' is odd, we can roll back to the previous clean state */
2482 	if (nospares
2483 	    && (mddev->in_sync && mddev->recovery_cp == MaxSector)
2484 	    && mddev->can_decrease_events
2485 	    && mddev->events != 1) {
2486 		mddev->events--;
2487 		mddev->can_decrease_events = 0;
2488 	} else {
2489 		/* otherwise we have to go forward and ... */
2490 		mddev->events ++;
2491 		mddev->can_decrease_events = nospares;
2492 	}
2493 
2494 	if (!mddev->events) {
2495 		/*
2496 		 * oops, this 64-bit counter should never wrap.
2497 		 * Either we are in around ~1 trillion A.C., assuming
2498 		 * 1 reboot per second, or we have a bug:
2499 		 */
2500 		MD_BUG();
2501 		mddev->events --;
2502 	}
2503 
	/* count pending bad-block changes; flag Faulty devices as FaultRecorded */
2504 	rdev_for_each(rdev, mddev) {
2505 		if (rdev->badblocks.changed)
2506 			any_badblocks_changed++;
2507 		if (test_bit(Faulty, &rdev->flags))
2508 			set_bit(FaultRecorded, &rdev->flags);
2509 	}
2510 
2511 	sync_sbs(mddev, nospares);
2512 	spin_unlock_irq(&mddev->write_lock);
2513 
2514 	pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
2515 		 mdname(mddev), mddev->in_sync);
2516 
2517 	bitmap_update_sb(mddev->bitmap);
	/* write out every superblock that sync_sbs() refreshed (sb_loaded == 1) */
2518 	rdev_for_each(rdev, mddev) {
2519 		char b[BDEVNAME_SIZE];
2520 
2521 		if (rdev->sb_loaded != 1)
2522 			continue; /* no noise on spare devices */
2523 
2524 		if (!test_bit(Faulty, &rdev->flags) &&
2525 		    rdev->saved_raid_disk == -1) {
2526 			md_super_write(mddev,rdev,
2527 				       rdev->sb_start, rdev->sb_size,
2528 				       rdev->sb_page);
2529 			pr_debug("md: (write) %s's sb offset: %llu\n",
2530 				 bdevname(rdev->bdev, b),
2531 				 (unsigned long long)rdev->sb_start);
2532 			rdev->sb_events = mddev->events;
2533 			if (rdev->badblocks.size) {
2534 				md_super_write(mddev, rdev,
2535 					       rdev->badblocks.sector,
2536 					       rdev->badblocks.size << 9,
2537 					       rdev->bb_page);
2538 				rdev->badblocks.size = 0;
2539 			}
2540 
2541 		} else if (test_bit(Faulty, &rdev->flags))
2542 			pr_debug("md: %s (skipping faulty)\n",
2543 				 bdevname(rdev->bdev, b));
2544 		else
2545 			pr_debug("(skipping incremental s/r ");
2546 
2547 		if (mddev->level == LEVEL_MULTIPATH)
2548 			/* only need to write one superblock... */
2549 			break;
2550 	}
2551 	md_super_wait(mddev);
2552 	/* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
2553 
2554 	spin_lock_irq(&mddev->write_lock);
2555 	if (mddev->in_sync != sync_req ||
2556 	    test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
2557 		/* have to write it out again */
2558 		spin_unlock_irq(&mddev->write_lock);
2559 		goto repeat;
2560 	}
2561 	clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2562 	spin_unlock_irq(&mddev->write_lock);
2563 	wake_up(&mddev->sb_wait);
2564 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2565 		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2566 
	/* the write is complete: unblock devices and acknowledge bad blocks */
2567 	rdev_for_each(rdev, mddev) {
2568 		if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2569 			clear_bit(Blocked, &rdev->flags);
2570 
2571 		if (any_badblocks_changed)
2572 			md_ack_all_badblocks(&rdev->badblocks);
2573 		clear_bit(BlockedBadBlocks, &rdev->flags);
2574 		wake_up(&rdev->blocked_wait);
2575 	}
2576 }
2577 
/*
 * Words written to sysfs files may, or may not, be '\n' terminated.
 * We want to accept either case; cmd_match() is used for that.
 *
 * Returns 1 if @cmd (the text written into a sysfs file) equals @str,
 * optionally followed by a single trailing newline, and 0 otherwise.
 */
static int cmd_match(const char *cmd, const char *str)
{
	/* step over the part the two strings have in common */
	for (; *cmd != '\0' && *cmd == *str; cmd++, str++)
		;

	/* tolerate one newline at the end of the command */
	if (*cmd == '\n')
		cmd++;

	/* a match requires both strings to be used up */
	return *str == '\0' && *cmd == '\0';
}
2597 
/*
 * sysfs attribute descriptor for a component device (rdev): a generic
 * struct attribute plus show/store callbacks that are handed the rdev.
 * show() gets the output page; store() gets the written text and its
 * length.
 */
2598 struct rdev_sysfs_entry {
2599 	struct attribute attr;
2600 	ssize_t (*show)(struct md_rdev *, char *);
2601 	ssize_t (*store)(struct md_rdev *, const char *, size_t);
2602 };
2603 
2604 static ssize_t
2605 state_show(struct md_rdev *rdev, char *page)
2606 {
2607 	char *sep = "";
2608 	size_t len = 0;
2609 
2610 	if (test_bit(Faulty, &rdev->flags) ||
2611 	    rdev->badblocks.unacked_exist) {
2612 		len+= sprintf(page+len, "%sfaulty",sep);
2613 		sep = ",";
2614 	}
2615 	if (test_bit(In_sync, &rdev->flags)) {
2616 		len += sprintf(page+len, "%sin_sync",sep);
2617 		sep = ",";
2618 	}
2619 	if (test_bit(WriteMostly, &rdev->flags)) {
2620 		len += sprintf(page+len, "%swrite_mostly",sep);
2621 		sep = ",";
2622 	}
2623 	if (test_bit(Blocked, &rdev->flags) ||
2624 	    (rdev->badblocks.unacked_exist
2625 	     && !test_bit(Faulty, &rdev->flags))) {
2626 		len += sprintf(page+len, "%sblocked", sep);
2627 		sep = ",";
2628 	}
2629 	if (!test_bit(Faulty, &rdev->flags) &&
2630 	    !test_bit(In_sync, &rdev->flags)) {
2631 		len += sprintf(page+len, "%sspare", sep);
2632 		sep = ",";
2633 	}
2634 	if (test_bit(WriteErrorSeen, &rdev->flags)) {
2635 		len += sprintf(page+len, "%swrite_error", sep);
2636 		sep = ",";
2637 	}
2638 	if (test_bit(WantReplacement, &rdev->flags)) {
2639 		len += sprintf(page+len, "%swant_replacement", sep);
2640 		sep = ",";
2641 	}
2642 	if (test_bit(Replacement, &rdev->flags)) {
2643 		len += sprintf(page+len, "%sreplacement", sep);
2644 		sep = ",";
2645 	}
2646 
2647 	return len+sprintf(page+len, "\n");
2648 }
2649 
/*
 * sysfs 'state' store: apply one command word to a component device.
 *
 * @buf is compared with cmd_match(), so a trailing newline is accepted.
 * Besides the words listed in the comment inside the function, the code
 * also handles "want_replacement", "-want_replacement", "replacement" and
 * "-replacement".
 *
 * Returns @len on success.  Unknown words, "faulty" on an array without a
 * personality, and "insync" on a device whose raid_disk is not -1 return
 * -EINVAL.  -EBUSY is returned when "faulty" did not leave the device
 * Faulty, when "remove" is used on a device that still has a slot, and
 * when "replacement"/"-replacement" is used on a running array.  On
 * success the cached 'state' sysfs dirent is notified.
 */
2650 static ssize_t
2651 state_store(struct md_rdev *rdev, const char *buf, size_t len)
2652 {
2653 	/* can write
2654 	 *  faulty  - simulates an error
2655 	 *  remove  - disconnects the device
2656 	 *  writemostly - sets write_mostly
2657 	 *  -writemostly - clears write_mostly
2658 	 *  blocked - sets the Blocked flags
2659 	 *  -blocked - clears the Blocked and possibly simulates an error
2660 	 *  insync - sets Insync providing device isn't active
2661 	 *  write_error - sets WriteErrorSeen
2662 	 *  -write_error - clears WriteErrorSeen
2663 	 */
2664 	int err = -EINVAL;
2665 	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
2666 		md_error(rdev->mddev, rdev);
2667 		if (test_bit(Faulty, &rdev->flags))
2668 			err = 0;
2669 		else
2670 			err = -EBUSY;
2671 	} else if (cmd_match(buf, "remove")) {
2672 		if (rdev->raid_disk >= 0)
2673 			err = -EBUSY;
2674 		else {
			/* rdev->mddev is cleared by the kick, so keep a copy first */
2675 			struct mddev *mddev = rdev->mddev;
2676 			kick_rdev_from_array(rdev);
2677 			if (mddev->pers)
2678 				md_update_sb(mddev, 1);
2679 			md_new_event(mddev);
2680 			err = 0;
2681 		}
2682 	} else if (cmd_match(buf, "writemostly")) {
2683 		set_bit(WriteMostly, &rdev->flags);
2684 		err = 0;
2685 	} else if (cmd_match(buf, "-writemostly")) {
2686 		clear_bit(WriteMostly, &rdev->flags);
2687 		err = 0;
2688 	} else if (cmd_match(buf, "blocked")) {
2689 		set_bit(Blocked, &rdev->flags);
2690 		err = 0;
2691 	} else if (cmd_match(buf, "-blocked")) {
2692 		if (!test_bit(Faulty, &rdev->flags) &&
2693 		    rdev->badblocks.unacked_exist) {
2694 			/* metadata handler doesn't understand badblocks,
2695 			 * so we need to fail the device
2696 			 */
2697 			md_error(rdev->mddev, rdev);
2698 		}
2699 		clear_bit(Blocked, &rdev->flags);
2700 		clear_bit(BlockedBadBlocks, &rdev->flags);
2701 		wake_up(&rdev->blocked_wait);
2702 		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2703 		md_wakeup_thread(rdev->mddev->thread);
2704 
2705 		err = 0;
2706 	} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2707 		set_bit(In_sync, &rdev->flags);
2708 		err = 0;
2709 	} else if (cmd_match(buf, "write_error")) {
2710 		set_bit(WriteErrorSeen, &rdev->flags);
2711 		err = 0;
2712 	} else if (cmd_match(buf, "-write_error")) {
2713 		clear_bit(WriteErrorSeen, &rdev->flags);
2714 		err = 0;
2715 	} else if (cmd_match(buf, "want_replacement")) {
2716 		/* Any non-spare device that is not a replacement can
2717 		 * become want_replacement at any time, but we then need to
2718 		 * check if recovery is needed.
2719 		 */
2720 		if (rdev->raid_disk >= 0 &&
2721 		    !test_bit(Replacement, &rdev->flags))
2722 			set_bit(WantReplacement, &rdev->flags);
2723 		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2724 		md_wakeup_thread(rdev->mddev->thread);
2725 		err = 0;
2726 	} else if (cmd_match(buf, "-want_replacement")) {
2727 		/* Clearing 'want_replacement' is always allowed.
2728 		 * Once replacements starts it is too late though.
2729 		 */
2730 		err = 0;
2731 		clear_bit(WantReplacement, &rdev->flags);
2732 	} else if (cmd_match(buf, "replacement")) {
2733 		/* Can only set a device as a replacement when array has not
2734 		 * yet been started.  Once running, replacement is automatic
2735 		 * from spares, or by assigning 'slot'.
2736 		 */
2737 		if (rdev->mddev->pers)
2738 			err = -EBUSY;
2739 		else {
2740 			set_bit(Replacement, &rdev->flags);
2741 			err = 0;
2742 		}
2743 	} else if (cmd_match(buf, "-replacement")) {
2744 		/* Similarly, can only clear Replacement before start */
2745 		if (rdev->mddev->pers)
2746 			err = -EBUSY;
2747 		else {
2748 			clear_bit(Replacement, &rdev->flags);
2749 			err = 0;
2750 		}
2751 	}
2752 	if (!err)
2753 		sysfs_notify_dirent_safe(rdev->sysfs_state);
2754 	return err ? err : len;
2755 }
/* 'state' attribute: readable by everyone, writable by the owner */
2756 static struct rdev_sysfs_entry rdev_state =
2757 __ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
2758 
2759 static ssize_t
2760 errors_show(struct md_rdev *rdev, char *page)
2761 {
2762 	return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
2763 }
2764 
2765 static ssize_t
2766 errors_store(struct md_rdev *rdev, const char *buf, size_t len)
2767 {
2768 	char *e;
2769 	unsigned long n = simple_strtoul(buf, &e, 10);
2770 	if (*buf && (*e == 0 || *e == '\n')) {
2771 		atomic_set(&rdev->corrected_errors, n);
2772 		return len;
2773 	}
2774 	return -EINVAL;
2775 }
2776 static struct rdev_sysfs_entry rdev_errors =
2777 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
2778 
2779 static ssize_t
2780 slot_show(struct md_rdev *rdev, char *page)
2781 {
2782 	if (rdev->raid_disk < 0)
2783 		return sprintf(page, "none\n");
2784 	else
2785 		return sprintf(page, "%d\n", rdev->raid_disk);
2786 }
2787 
2788 static ssize_t
2789 slot_store(struct md_rdev *rdev, const char *buf, size_t len)
2790 {
2791 	char *e;
2792 	int err;
2793 	int slot = simple_strtoul(buf, &e, 10);
2794 	if (strncmp(buf, "none", 4)==0)
2795 		slot = -1;
2796 	else if (e==buf || (*e && *e!= '\n'))
2797 		return -EINVAL;
2798 	if (rdev->mddev->pers && slot == -1) {
2799 		/* Setting 'slot' on an active array requires also
2800 		 * updating the 'rd%d' link, and communicating
2801 		 * with the personality with ->hot_*_disk.
2802 		 * For now we only support removing
2803 		 * failed/spare devices.  This normally happens automatically,
2804 		 * but not when the metadata is externally managed.
2805 		 */
2806 		if (rdev->raid_disk == -1)
2807 			return -EEXIST;
2808 		/* personality does all needed checks */
2809 		if (rdev->mddev->pers->hot_remove_disk == NULL)
2810 			return -EINVAL;
2811 		clear_bit(Blocked, &rdev->flags);
2812 		remove_and_add_spares(rdev->mddev, rdev);
2813 		if (rdev->raid_disk >= 0)
2814 			return -EBUSY;
2815 		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2816 		md_wakeup_thread(rdev->mddev->thread);
2817 	} else if (rdev->mddev->pers) {
2818 		/* Activating a spare .. or possibly reactivating
2819 		 * if we ever get bitmaps working here.
2820 		 */
2821 
2822 		if (rdev->raid_disk != -1)
2823 			return -EBUSY;
2824 
2825 		if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
2826 			return -EBUSY;
2827 
2828 		if (rdev->mddev->pers->hot_add_disk == NULL)
2829 			return -EINVAL;
2830 
2831 		if (slot >= rdev->mddev->raid_disks &&
2832 		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
2833 			return -ENOSPC;
2834 
2835 		rdev->raid_disk = slot;
2836 		if (test_bit(In_sync, &rdev->flags))
2837 			rdev->saved_raid_disk = slot;
2838 		else
2839 			rdev->saved_raid_disk = -1;
2840 		clear_bit(In_sync, &rdev->flags);
2841 		err = rdev->mddev->pers->
2842 			hot_add_disk(rdev->mddev, rdev);
2843 		if (err) {
2844 			rdev->raid_disk = -1;
2845 			return err;
2846 		} else
2847 			sysfs_notify_dirent_safe(rdev->sysfs_state);
2848 		if (sysfs_link_rdev(rdev->mddev, rdev))
2849 			/* failure here is OK */;
2850 		/* don't wakeup anyone, leave that to userspace. */
2851 	} else {
2852 		if (slot >= rdev->mddev->raid_disks &&
2853 		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
2854 			return -ENOSPC;
2855 		rdev->raid_disk = slot;
2856 		/* assume it is working */
2857 		clear_bit(Faulty, &rdev->flags);
2858 		clear_bit(WriteMostly, &rdev->flags);
2859 		set_bit(In_sync, &rdev->flags);
2860 		sysfs_notify_dirent_safe(rdev->sysfs_state);
2861 	}
2862 	return len;
2863 }
2864 
2865 
2866 static struct rdev_sysfs_entry rdev_slot =
2867 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
2868 
2869 static ssize_t
2870 offset_show(struct md_rdev *rdev, char *page)
2871 {
2872 	return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
2873 }
2874 
2875 static ssize_t
2876 offset_store(struct md_rdev *rdev, const char *buf, size_t len)
2877 {
2878 	unsigned long long offset;
2879 	if (strict_strtoull(buf, 10, &offset) < 0)
2880 		return -EINVAL;
2881 	if (rdev->mddev->pers && rdev->raid_disk >= 0)
2882 		return -EBUSY;
2883 	if (rdev->sectors && rdev->mddev->external)
2884 		/* Must set offset before size, so overlap checks
2885 		 * can be sane */
2886 		return -EBUSY;
2887 	rdev->data_offset = offset;
2888 	rdev->new_data_offset = offset;
2889 	return len;
2890 }
2891 
2892 static struct rdev_sysfs_entry rdev_offset =
2893 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
2894 
2895 static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
2896 {
2897 	return sprintf(page, "%llu\n",
2898 		       (unsigned long long)rdev->new_data_offset);
2899 }
2900 
2901 static ssize_t new_offset_store(struct md_rdev *rdev,
2902 				const char *buf, size_t len)
2903 {
2904 	unsigned long long new_offset;
2905 	struct mddev *mddev = rdev->mddev;
2906 
2907 	if (strict_strtoull(buf, 10, &new_offset) < 0)
2908 		return -EINVAL;
2909 
2910 	if (mddev->sync_thread)
2911 		return -EBUSY;
2912 	if (new_offset == rdev->data_offset)
2913 		/* reset is always permitted */
2914 		;
2915 	else if (new_offset > rdev->data_offset) {
2916 		/* must not push array size beyond rdev_sectors */
2917 		if (new_offset - rdev->data_offset
2918 		    + mddev->dev_sectors > rdev->sectors)
2919 				return -E2BIG;
2920 	}
2921 	/* Metadata worries about other space details. */
2922 
2923 	/* decreasing the offset is inconsistent with a backwards
2924 	 * reshape.
2925 	 */
2926 	if (new_offset < rdev->data_offset &&
2927 	    mddev->reshape_backwards)
2928 		return -EINVAL;
2929 	/* Increasing offset is inconsistent with forwards
2930 	 * reshape.  reshape_direction should be set to
2931 	 * 'backwards' first.
2932 	 */
2933 	if (new_offset > rdev->data_offset &&
2934 	    !mddev->reshape_backwards)
2935 		return -EINVAL;
2936 
2937 	if (mddev->pers && mddev->persistent &&
2938 	    !super_types[mddev->major_version]
2939 	    .allow_new_offset(rdev, new_offset))
2940 		return -E2BIG;
2941 	rdev->new_data_offset = new_offset;
2942 	if (new_offset > rdev->data_offset)
2943 		mddev->reshape_backwards = 1;
2944 	else if (new_offset < rdev->data_offset)
2945 		mddev->reshape_backwards = 0;
2946 
2947 	return len;
2948 }
2949 static struct rdev_sysfs_entry rdev_new_offset =
2950 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
2951 
2952 static ssize_t
2953 rdev_size_show(struct md_rdev *rdev, char *page)
2954 {
2955 	return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
2956 }
2957 
2958 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
2959 {
2960 	/* check if two start/length pairs overlap */
2961 	if (s1+l1 <= s2)
2962 		return 0;
2963 	if (s2+l2 <= s1)
2964 		return 0;
2965 	return 1;
2966 }
2967 
2968 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
2969 {
2970 	unsigned long long blocks;
2971 	sector_t new;
2972 
2973 	if (strict_strtoull(buf, 10, &blocks) < 0)
2974 		return -EINVAL;
2975 
2976 	if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
2977 		return -EINVAL; /* sector conversion overflow */
2978 
2979 	new = blocks * 2;
2980 	if (new != blocks * 2)
2981 		return -EINVAL; /* unsigned long long to sector_t overflow */
2982 
2983 	*sectors = new;
2984 	return 0;
2985 }
2986 
/*
 * sysfs 'size' store: set the usable size (rdev->sectors) of a component
 * device.
 *
 * The text is parsed by strict_blocks_to_sectors().  For a device holding
 * a slot in a running array the metadata handler's ->rdev_size_change
 * decides the resulting size when metadata is persistent; with
 * non-persistent metadata a value of 0 means "from data_offset to the end
 * of the block device".  The personality must provide ->resize.
 *
 * When the size grows on an array with external metadata, every rdev of
 * every array that sits on the same block device is checked for overlap;
 * on overlap the old size is restored.
 *
 * Returns @len on success; -EINVAL for unparsable input, a pending
 * data-offset change, a personality without ->resize, or a size below
 * mddev->dev_sectors; -EBUSY when the metadata handler returns 0 or when
 * an overlap was found.
 */
2987 static ssize_t
2988 rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
2989 {
2990 	struct mddev *my_mddev = rdev->mddev;
2991 	sector_t oldsectors = rdev->sectors;
2992 	sector_t sectors;
2993 
2994 	if (strict_blocks_to_sectors(buf, &sectors) < 0)
2995 		return -EINVAL;
2996 	if (rdev->data_offset != rdev->new_data_offset)
2997 		return -EINVAL; /* too confusing */
2998 	if (my_mddev->pers && rdev->raid_disk >= 0) {
2999 		if (my_mddev->persistent) {
3000 			sectors = super_types[my_mddev->major_version].
3001 				rdev_size_change(rdev, sectors);
3002 			if (!sectors)
3003 				return -EBUSY;
3004 		} else if (!sectors)
3005 			sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
3006 				rdev->data_offset;
3007 		if (!my_mddev->pers->resize)
3008 			/* Cannot change size for RAID0 or Linear etc */
3009 			return -EINVAL;
3010 	}
3011 	if (sectors < my_mddev->dev_sectors)
3012 		return -EINVAL; /* component must fit device */
3013 
3014 	rdev->sectors = sectors;
3015 	if (sectors > oldsectors && my_mddev->external) {
3016 		/* need to check that all other rdevs with the same ->bdev
3017 		 * do not overlap.  We need to unlock the mddev to avoid
3018 		 * a deadlock.  We have already changed rdev->sectors, and if
3019 		 * we have to change it back, we will have the lock again.
3020 		 */
3021 		struct mddev *mddev;
3022 		int overlap = 0;
3023 		struct list_head *tmp;
3024 
3025 		mddev_unlock(my_mddev);
3026 		for_each_mddev(mddev, tmp) {
3027 			struct md_rdev *rdev2;
3028 
			/* NOTE(review): the result of mddev_lock() is not checked here -- confirm it cannot fail */
3029 			mddev_lock(mddev);
3030 			rdev_for_each(rdev2, mddev)
3031 				if (rdev->bdev == rdev2->bdev &&
3032 				    rdev != rdev2 &&
3033 				    overlaps(rdev->data_offset, rdev->sectors,
3034 					     rdev2->data_offset,
3035 					     rdev2->sectors)) {
3036 					overlap = 1;
3037 					break;
3038 				}
3039 			mddev_unlock(mddev);
3040 			if (overlap) {
				/* NOTE(review): presumably drops the reference for_each_mddev holds on the current array when the walk is left early -- confirm against the macro */
3041 				mddev_put(mddev);
3042 				break;
3043 			}
3044 		}
3045 		mddev_lock(my_mddev);
3046 		if (overlap) {
3047 			/* Someone else could have slipped in a size
3048 			 * change here, but doing so is just silly.
3049 			 * We put oldsectors back because we *know* it is
3050 			 * safe, and trust userspace not to race with
3051 			 * itself
3052 			 */
3053 			rdev->sectors = oldsectors;
3054 			return -EBUSY;
3055 		}
3056 	}
3057 	return len;
3058 }
3059 
3060 static struct rdev_sysfs_entry rdev_size =
3061 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
3062 
3063 
3064 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page)
3065 {
3066 	unsigned long long recovery_start = rdev->recovery_offset;
3067 
3068 	if (test_bit(In_sync, &rdev->flags) ||
3069 	    recovery_start == MaxSector)
3070 		return sprintf(page, "none\n");
3071 
3072 	return sprintf(page, "%llu\n", recovery_start);
3073 }
3074 
3075 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len)
3076 {
3077 	unsigned long long recovery_start;
3078 
3079 	if (cmd_match(buf, "none"))
3080 		recovery_start = MaxSector;
3081 	else if (strict_strtoull(buf, 10, &recovery_start))
3082 		return -EINVAL;
3083 
3084 	if (rdev->mddev->pers &&
3085 	    rdev->raid_disk >= 0)
3086 		return -EBUSY;
3087 
3088 	rdev->recovery_offset = recovery_start;
3089 	if (recovery_start == MaxSector)
3090 		set_bit(In_sync, &rdev->flags);
3091 	else
3092 		clear_bit(In_sync, &rdev->flags);
3093 	return len;
3094 }
3095 
3096 static struct rdev_sysfs_entry rdev_recovery_start =
3097 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
3098 
3099 
3100 static ssize_t
3101 badblocks_show(struct badblocks *bb, char *page, int unack);
3102 static ssize_t
3103 badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack);
3104 
3105 static ssize_t bb_show(struct md_rdev *rdev, char *page)
3106 {
3107 	return badblocks_show(&rdev->badblocks, page, 0);
3108 }
3109 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len)
3110 {
3111 	int rv = badblocks_store(&rdev->badblocks, page, len, 0);
3112 	/* Maybe that ack was all we needed */
3113 	if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
3114 		wake_up(&rdev->blocked_wait);
3115 	return rv;
3116 }
3117 static struct rdev_sysfs_entry rdev_bad_blocks =
3118 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
3119 
3120 
3121 static ssize_t ubb_show(struct md_rdev *rdev, char *page)
3122 {
3123 	return badblocks_show(&rdev->badblocks, page, 1);
3124 }
3125 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
3126 {
3127 	return badblocks_store(&rdev->badblocks, page, len, 1);
3128 }
3129 static struct rdev_sysfs_entry rdev_unack_bad_blocks =
3130 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
3131 
/*
 * NULL-terminated list of the per-device sysfs attributes; installed as
 * the default attribute set of rdev_ktype.
 */
static struct attribute *rdev_default_attrs[] = {
	&rdev_state.attr,
	&rdev_errors.attr,
	&rdev_slot.attr,
	&rdev_offset.attr,
	&rdev_new_offset.attr,
	&rdev_size.attr,
	&rdev_recovery_start.attr,
	&rdev_bad_blocks.attr,
	&rdev_unack_bad_blocks.attr,
	NULL,
};
3144 static ssize_t
3145 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3146 {
3147 	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3148 	struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3149 	struct mddev *mddev = rdev->mddev;
3150 	ssize_t rv;
3151 
3152 	if (!entry->show)
3153 		return -EIO;
3154 
3155 	rv = mddev ? mddev_lock(mddev) : -EBUSY;
3156 	if (!rv) {
3157 		if (rdev->mddev == NULL)
3158 			rv = -EBUSY;
3159 		else
3160 			rv = entry->show(rdev, page);
3161 		mddev_unlock(mddev);
3162 	}
3163 	return rv;
3164 }
3165 
3166 static ssize_t
3167 rdev_attr_store(struct kobject *kobj, struct attribute *attr,
3168 	      const char *page, size_t length)
3169 {
3170 	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3171 	struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3172 	ssize_t rv;
3173 	struct mddev *mddev = rdev->mddev;
3174 
3175 	if (!entry->store)
3176 		return -EIO;
3177 	if (!capable(CAP_SYS_ADMIN))
3178 		return -EACCES;
3179 	rv = mddev ? mddev_lock(mddev): -EBUSY;
3180 	if (!rv) {
3181 		if (rdev->mddev == NULL)
3182 			rv = -EBUSY;
3183 		else
3184 			rv = entry->store(rdev, page, length);
3185 		mddev_unlock(mddev);
3186 	}
3187 	return rv;
3188 }
3189 
3190 static void rdev_free(struct kobject *ko)
3191 {
3192 	struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj);
3193 	kfree(rdev);
3194 }
/* sysfs operations shared by all rdev attributes. */
static const struct sysfs_ops rdev_sysfs_ops = {
	.show		= rdev_attr_show,
	.store		= rdev_attr_store,
};
/* kobject type for md_rdev: release hook, sysfs ops and default attributes. */
static struct kobj_type rdev_ktype = {
	.release	= rdev_free,
	.sysfs_ops	= &rdev_sysfs_ops,
	.default_attrs	= rdev_default_attrs,
};
3204 
3205 int md_rdev_init(struct md_rdev *rdev)
3206 {
3207 	rdev->desc_nr = -1;
3208 	rdev->saved_raid_disk = -1;
3209 	rdev->raid_disk = -1;
3210 	rdev->flags = 0;
3211 	rdev->data_offset = 0;
3212 	rdev->new_data_offset = 0;
3213 	rdev->sb_events = 0;
3214 	rdev->last_read_error.tv_sec  = 0;
3215 	rdev->last_read_error.tv_nsec = 0;
3216 	rdev->sb_loaded = 0;
3217 	rdev->bb_page = NULL;
3218 	atomic_set(&rdev->nr_pending, 0);
3219 	atomic_set(&rdev->read_errors, 0);
3220 	atomic_set(&rdev->corrected_errors, 0);
3221 
3222 	INIT_LIST_HEAD(&rdev->same_set);
3223 	init_waitqueue_head(&rdev->blocked_wait);
3224 
3225 	/* Add space to store bad block list.
3226 	 * This reserves the space even on arrays where it cannot
3227 	 * be used - I wonder if that matters
3228 	 */
3229 	rdev->badblocks.count = 0;
3230 	rdev->badblocks.shift = -1; /* disabled until explicitly enabled */
3231 	rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL);
3232 	seqlock_init(&rdev->badblocks.lock);
3233 	if (rdev->badblocks.page == NULL)
3234 		return -ENOMEM;
3235 
3236 	return 0;
3237 }
3238 EXPORT_SYMBOL_GPL(md_rdev_init);
3239 /*
3240  * Import a device. If 'super_format' >= 0, then sanity check the superblock
3241  *
3242  * mark the device faulty if:
3243  *
3244  *   - the device is nonexistent (zero size)
3245  *   - the device has no valid superblock
3246  *
3247  * a faulty rdev _never_ has rdev->sb set.
3248  */
3249 static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
3250 {
3251 	char b[BDEVNAME_SIZE];
3252 	int err;
3253 	struct md_rdev *rdev;
3254 	sector_t size;
3255 
3256 	rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
3257 	if (!rdev) {
3258 		printk(KERN_ERR "md: could not alloc mem for new device!\n");
3259 		return ERR_PTR(-ENOMEM);
3260 	}
3261 
3262 	err = md_rdev_init(rdev);
3263 	if (err)
3264 		goto abort_free;
3265 	err = alloc_disk_sb(rdev);
3266 	if (err)
3267 		goto abort_free;
3268 
3269 	err = lock_rdev(rdev, newdev, super_format == -2);
3270 	if (err)
3271 		goto abort_free;
3272 
3273 	kobject_init(&rdev->kobj, &rdev_ktype);
3274 
3275 	size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS;
3276 	if (!size) {
3277 		printk(KERN_WARNING
3278 			"md: %s has zero or unknown size, marking faulty!\n",
3279 			bdevname(rdev->bdev,b));
3280 		err = -EINVAL;
3281 		goto abort_free;
3282 	}
3283 
3284 	if (super_format >= 0) {
3285 		err = super_types[super_format].
3286 			load_super(rdev, NULL, super_minor);
3287 		if (err == -EINVAL) {
3288 			printk(KERN_WARNING
3289 				"md: %s does not have a valid v%d.%d "
3290 			       "superblock, not importing!\n",
3291 				bdevname(rdev->bdev,b),
3292 			       super_format, super_minor);
3293 			goto abort_free;
3294 		}
3295 		if (err < 0) {
3296 			printk(KERN_WARNING
3297 				"md: could not read %s's sb, not importing!\n",
3298 				bdevname(rdev->bdev,b));
3299 			goto abort_free;
3300 		}
3301 	}
3302 
3303 	return rdev;
3304 
3305 abort_free:
3306 	if (rdev->bdev)
3307 		unlock_rdev(rdev);
3308 	md_rdev_clear(rdev);
3309 	kfree(rdev);
3310 	return ERR_PTR(err);
3311 }
3312 
3313 /*
3314  * Check a full RAID array for plausibility
3315  */
3316 
3317 
3318 static void analyze_sbs(struct mddev * mddev)
3319 {
3320 	int i;
3321 	struct md_rdev *rdev, *freshest, *tmp;
3322 	char b[BDEVNAME_SIZE];
3323 
3324 	freshest = NULL;
3325 	rdev_for_each_safe(rdev, tmp, mddev)
3326 		switch (super_types[mddev->major_version].
3327 			load_super(rdev, freshest, mddev->minor_version)) {
3328 		case 1:
3329 			freshest = rdev;
3330 			break;
3331 		case 0:
3332 			break;
3333 		default:
3334 			printk( KERN_ERR \
3335 				"md: fatal superblock inconsistency in %s"
3336 				" -- removing from array\n",
3337 				bdevname(rdev->bdev,b));
3338 			kick_rdev_from_array(rdev);
3339 		}
3340 
3341 
3342 	super_types[mddev->major_version].
3343 		validate_super(mddev, freshest);
3344 
3345 	i = 0;
3346 	rdev_for_each_safe(rdev, tmp, mddev) {
3347 		if (mddev->max_disks &&
3348 		    (rdev->desc_nr >= mddev->max_disks ||
3349 		     i > mddev->max_disks)) {
3350 			printk(KERN_WARNING
3351 			       "md: %s: %s: only %d devices permitted\n",
3352 			       mdname(mddev), bdevname(rdev->bdev, b),
3353 			       mddev->max_disks);
3354 			kick_rdev_from_array(rdev);
3355 			continue;
3356 		}
3357 		if (rdev != freshest)
3358 			if (super_types[mddev->major_version].
3359 			    validate_super(mddev, rdev)) {
3360 				printk(KERN_WARNING "md: kicking non-fresh %s"
3361 					" from array!\n",
3362 					bdevname(rdev->bdev,b));
3363 				kick_rdev_from_array(rdev);
3364 				continue;
3365 			}
3366 		if (mddev->level == LEVEL_MULTIPATH) {
3367 			rdev->desc_nr = i++;
3368 			rdev->raid_disk = rdev->desc_nr;
3369 			set_bit(In_sync, &rdev->flags);
3370 		} else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) {
3371 			rdev->raid_disk = -1;
3372 			clear_bit(In_sync, &rdev->flags);
3373 		}
3374 	}
3375 }
3376 
/* Read a fixed-point number.
 * Numbers in sysfs attributes should be in "standard" units where
 * possible, so time should be in seconds.
 * However we internally use a much smaller unit such as
 * milliseconds or jiffies.
 * This function takes a decimal number with a possible fractional
 * component, and produces an integer which is the result of
 * multiplying that number by 10^'scale'.
 * all without any floating-point arithmetic.
 *
 * Fractional digits beyond 'scale' places are ignored (truncated).
 * A single trailing '\n' is accepted.
 *
 * Returns 0 and stores the scaled value in *res on success.  Returns
 * -EINVAL, leaving *res untouched, if anything else follows the number
 * or if the scaled value does not fit in an unsigned long.
 */
int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
{
	const unsigned long max = ~0UL;
	unsigned long result = 0;
	long decimals = -1;	/* -1 until the decimal point is seen */

	while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
		if (*cp == '.')
			decimals = 0;
		else if (decimals < scale) {
			unsigned int value;
			value = *cp - '0';
			/* refuse to wrap instead of returning garbage */
			if (result > (max - value) / 10)
				return -EINVAL;
			result = result * 10 + value;
			if (decimals >= 0)
				decimals++;
		}
		cp++;
	}
	if (*cp == '\n')
		cp++;
	if (*cp)
		return -EINVAL;
	if (decimals < 0)
		decimals = 0;
	/* pad with zeros up to 'scale' fractional digits */
	while (decimals < scale) {
		if (result > max / 10)
			return -EINVAL;
		result *= 10;
		decimals ++;
	}
	*res = result;
	return 0;
}
3416 
3417 
3418 static void md_safemode_timeout(unsigned long data);
3419 
3420 static ssize_t
3421 safe_delay_show(struct mddev *mddev, char *page)
3422 {
3423 	int msec = (mddev->safemode_delay*1000)/HZ;
3424 	return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
3425 }
3426 static ssize_t
3427 safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
3428 {
3429 	unsigned long msec;
3430 
3431 	if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
3432 		return -EINVAL;
3433 	if (msec == 0)
3434 		mddev->safemode_delay = 0;
3435 	else {
3436 		unsigned long old_delay = mddev->safemode_delay;
3437 		mddev->safemode_delay = (msec*HZ)/1000;
3438 		if (mddev->safemode_delay == 0)
3439 			mddev->safemode_delay = 1;
3440 		if (mddev->safemode_delay < old_delay)
3441 			md_safemode_timeout((unsigned long)mddev);
3442 	}
3443 	return len;
3444 }
3445 static struct md_sysfs_entry md_safe_delay =
3446 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
3447 
3448 static ssize_t
3449 level_show(struct mddev *mddev, char *page)
3450 {
3451 	struct md_personality *p = mddev->pers;
3452 	if (p)
3453 		return sprintf(page, "%s\n", p->name);
3454 	else if (mddev->clevel[0])
3455 		return sprintf(page, "%s\n", mddev->clevel);
3456 	else if (mddev->level != LEVEL_NONE)
3457 		return sprintf(page, "%d\n", mddev->level);
3458 	else
3459 		return 0;
3460 }
3461 
/*
 * sysfs store for the array "level" attribute.
 *
 * With no active personality the text is just recorded in mddev->clevel
 * (a trailing newline is stripped) and mddev->level is reset to LEVEL_NONE.
 *
 * With an active personality this is a request to switch personality:
 * the new personality is looked up (loading its module if needed), its
 * ->takeover() is asked to accept the array, and on success the old
 * personality is stopped and the new one started while the array is
 * suspended.
 *
 * Returns the original 'len' on success, or a negative errno:
 *   -ENOSPC  inactive array and the text does not fit in mddev->clevel
 *   -EBUSY   sync thread running, reshape in progress, or sysfs_active set
 *   -EINVAL  current personality has no ->quiesce, bad length, the new
 *            personality is not loaded or has no ->takeover
 *   other    the error returned by ->takeover()
 */
static ssize_t
level_store(struct mddev *mddev, const char *buf, size_t len)
{
	char clevel[16];
	ssize_t rv = len;
	struct md_personality *pers;
	long level;
	void *priv;
	struct md_rdev *rdev;

	if (mddev->pers == NULL) {
		if (len == 0)
			return 0;
		if (len >= sizeof(mddev->clevel))
			return -ENOSPC;
		/* len < sizeof(clevel), so the terminator below always fits */
		strncpy(mddev->clevel, buf, len);
		if (mddev->clevel[len-1] == '\n')
			len--;
		mddev->clevel[len] = 0;
		mddev->level = LEVEL_NONE;
		return rv;
	}

	/* request to change the personality.  Need to ensure:
	 *  - array is not engaged in resync/recovery/reshape
	 *  - old personality can be suspended
	 *  - new personality will access other array.
	 */

	if (mddev->sync_thread ||
	    mddev->reshape_position != MaxSector ||
	    mddev->sysfs_active)
		return -EBUSY;

	if (!mddev->pers->quiesce) {
		printk(KERN_WARNING "md: %s: %s does not support online personality change\n",
		       mdname(mddev), mddev->pers->name);
		return -EINVAL;
	}

	/* Now find the new personality */
	if (len == 0 || len >= sizeof(clevel))
		return -EINVAL;
	strncpy(clevel, buf, len);
	if (clevel[len-1] == '\n')
		len--;
	clevel[len] = 0;
	/* a non-numeric level is looked up by name only */
	if (strict_strtol(clevel, 10, &level))
		level = LEVEL_NONE;

	if (request_module("md-%s", clevel) != 0)
		request_module("md-level-%s", clevel);
	spin_lock(&pers_lock);
	pers = find_pers(level, clevel);
	if (!pers || !try_module_get(pers->owner)) {
		spin_unlock(&pers_lock);
		printk(KERN_WARNING "md: personality %s not loaded\n", clevel);
		return -EINVAL;
	}
	spin_unlock(&pers_lock);

	/* from here on a module reference is held; drop it on every exit
	 * that does not install 'pers'
	 */
	if (pers == mddev->pers) {
		/* Nothing to do! */
		module_put(pers->owner);
		return rv;
	}
	if (!pers->takeover) {
		module_put(pers->owner);
		printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
		       mdname(mddev), clevel);
		return -EINVAL;
	}

	rdev_for_each(rdev, mddev)
		rdev->new_raid_disk = rdev->raid_disk;

	/* ->takeover must set new_* and/or delta_disks
	 * if it succeeds, and may set them when it fails.
	 */
	priv = pers->takeover(mddev);
	if (IS_ERR(priv)) {
		/* undo whatever ->takeover() may have changed */
		mddev->new_level = mddev->level;
		mddev->new_layout = mddev->layout;
		mddev->new_chunk_sectors = mddev->chunk_sectors;
		mddev->raid_disks -= mddev->delta_disks;
		mddev->delta_disks = 0;
		mddev->reshape_backwards = 0;
		module_put(pers->owner);
		printk(KERN_WARNING "md: %s: %s would not accept array\n",
		       mdname(mddev), clevel);
		return PTR_ERR(priv);
	}

	/* Looks like we have a winner */
	mddev_suspend(mddev);
	mddev->pers->stop(mddev);

	/* below, mddev->pers is still the OLD personality, 'pers' the new one */
	if (mddev->pers->sync_request == NULL &&
	    pers->sync_request != NULL) {
		/* need to add the md_redundancy_group */
		if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
			printk(KERN_WARNING
			       "md: cannot register extra attributes for %s\n",
			       mdname(mddev));
		mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, NULL, "sync_action");
	}
	if (mddev->pers->sync_request != NULL &&
	    pers->sync_request == NULL) {
		/* need to remove the md_redundancy_group */
		if (mddev->to_remove == NULL)
			mddev->to_remove = &md_redundancy_group;
	}

	if (mddev->pers->sync_request == NULL &&
	    mddev->external) {
		/* We are converting from a no-redundancy array
		 * to a redundancy array and metadata is managed
		 * externally so we need to be sure that writes
		 * won't block due to a need to transition
		 *      clean->dirty
		 * until external management is started.
		 */
		mddev->in_sync = 0;
		mddev->safemode_delay = 0;
		mddev->safemode = 0;
	}

	/* first pass: unlink every device whose slot is about to change */
	rdev_for_each(rdev, mddev) {
		if (rdev->raid_disk < 0)
			continue;
		if (rdev->new_raid_disk >= mddev->raid_disks)
			rdev->new_raid_disk = -1;
		if (rdev->new_raid_disk == rdev->raid_disk)
			continue;
		sysfs_unlink_rdev(mddev, rdev);
	}
	/* second pass: assign the new slots and re-create the links */
	rdev_for_each(rdev, mddev) {
		if (rdev->raid_disk < 0)
			continue;
		if (rdev->new_raid_disk == rdev->raid_disk)
			continue;
		rdev->raid_disk = rdev->new_raid_disk;
		if (rdev->raid_disk < 0)
			clear_bit(In_sync, &rdev->flags);
		else {
			if (sysfs_link_rdev(mddev, rdev))
				printk(KERN_WARNING "md: cannot register rd%d"
				       " for %s after level change\n",
				       rdev->raid_disk, mdname(mddev));
		}
	}

	/* release the old personality and install the new one */
	module_put(mddev->pers->owner);
	mddev->pers = pers;
	mddev->private = priv;
	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
	mddev->level = mddev->new_level;
	mddev->layout = mddev->new_layout;
	mddev->chunk_sectors = mddev->new_chunk_sectors;
	mddev->delta_disks = 0;
	mddev->reshape_backwards = 0;
	mddev->degraded = 0;
	if (mddev->pers->sync_request == NULL) {
		/* this is now an array without redundancy, so
		 * it must always be in_sync
		 */
		mddev->in_sync = 1;
		del_timer_sync(&mddev->safemode_timer);
	}
	/* NOTE(review): the return value of ->run() is not checked here */
	pers->run(mddev);
	set_bit(MD_CHANGE_DEVS, &mddev->flags);
	mddev_resume(mddev);
	sysfs_notify(&mddev->kobj, NULL, "level");
	md_new_event(mddev);
	return rv;
}

/* sysfs attribute "level": readable by all, writable by owner. */
static struct md_sysfs_entry md_level =
__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
3641 
3642 
3643 static ssize_t
3644 layout_show(struct mddev *mddev, char *page)
3645 {
3646 	/* just a number, not meaningful for all levels */
3647 	if (mddev->reshape_position != MaxSector &&
3648 	    mddev->layout != mddev->new_layout)
3649 		return sprintf(page, "%d (%d)\n",
3650 			       mddev->new_layout, mddev->layout);
3651 	return sprintf(page, "%d\n", mddev->layout);
3652 }
3653 
3654 static ssize_t
3655 layout_store(struct mddev *mddev, const char *buf, size_t len)
3656 {
3657 	char *e;
3658 	unsigned long n = simple_strtoul(buf, &e, 10);
3659 
3660 	if (!*buf || (*e && *e != '\n'))
3661 		return -EINVAL;
3662 
3663 	if (mddev->pers) {
3664 		int err;
3665 		if (mddev->pers->check_reshape == NULL)
3666 			return -EBUSY;
3667 		mddev->new_layout = n;
3668 		err = mddev->pers->check_reshape(mddev);
3669 		if (err) {
3670 			mddev->new_layout = mddev->layout;
3671 			return err;
3672 		}
3673 	} else {
3674 		mddev->new_layout = n;
3675 		if (mddev->reshape_position == MaxSector)
3676 			mddev->layout = n;
3677 	}
3678 	return len;
3679 }
3680 static struct md_sysfs_entry md_layout =
3681 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
3682 
3683 
3684 static ssize_t
3685 raid_disks_show(struct mddev *mddev, char *page)
3686 {
3687 	if (mddev->raid_disks == 0)
3688 		return 0;
3689 	if (mddev->reshape_position != MaxSector &&
3690 	    mddev->delta_disks != 0)
3691 		return sprintf(page, "%d (%d)\n", mddev->raid_disks,
3692 			       mddev->raid_disks - mddev->delta_disks);
3693 	return sprintf(page, "%d\n", mddev->raid_disks);
3694 }
3695 
3696 static int update_raid_disks(struct mddev *mddev, int raid_disks);
3697 
3698 static ssize_t
3699 raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
3700 {
3701 	char *e;
3702 	int rv = 0;
3703 	unsigned long n = simple_strtoul(buf, &e, 10);
3704 
3705 	if (!*buf || (*e && *e != '\n'))
3706 		return -EINVAL;
3707 
3708 	if (mddev->pers)
3709 		rv = update_raid_disks(mddev, n);
3710 	else if (mddev->reshape_position != MaxSector) {
3711 		struct md_rdev *rdev;
3712 		int olddisks = mddev->raid_disks - mddev->delta_disks;
3713 
3714 		rdev_for_each(rdev, mddev) {
3715 			if (olddisks < n &&
3716 			    rdev->data_offset < rdev->new_data_offset)
3717 				return -EINVAL;
3718 			if (olddisks > n &&
3719 			    rdev->data_offset > rdev->new_data_offset)
3720 				return -EINVAL;
3721 		}
3722 		mddev->delta_disks = n - olddisks;
3723 		mddev->raid_disks = n;
3724 		mddev->reshape_backwards = (mddev->delta_disks < 0);
3725 	} else
3726 		mddev->raid_disks = n;
3727 	return rv ? rv : len;
3728 }
3729 static struct md_sysfs_entry md_raid_disks =
3730 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
3731 
3732 static ssize_t
3733 chunk_size_show(struct mddev *mddev, char *page)
3734 {
3735 	if (mddev->reshape_position != MaxSector &&
3736 	    mddev->chunk_sectors != mddev->new_chunk_sectors)
3737 		return sprintf(page, "%d (%d)\n",
3738 			       mddev->new_chunk_sectors << 9,
3739 			       mddev->chunk_sectors << 9);
3740 	return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
3741 }
3742 
3743 static ssize_t
3744 chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
3745 {
3746 	char *e;
3747 	unsigned long n = simple_strtoul(buf, &e, 10);
3748 
3749 	if (!*buf || (*e && *e != '\n'))
3750 		return -EINVAL;
3751 
3752 	if (mddev->pers) {
3753 		int err;
3754 		if (mddev->pers->check_reshape == NULL)
3755 			return -EBUSY;
3756 		mddev->new_chunk_sectors = n >> 9;
3757 		err = mddev->pers->check_reshape(mddev);
3758 		if (err) {
3759 			mddev->new_chunk_sectors = mddev->chunk_sectors;
3760 			return err;
3761 		}
3762 	} else {
3763 		mddev->new_chunk_sectors = n >> 9;
3764 		if (mddev->reshape_position == MaxSector)
3765 			mddev->chunk_sectors = n >> 9;
3766 	}
3767 	return len;
3768 }
3769 static struct md_sysfs_entry md_chunk_size =
3770 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
3771 
3772 static ssize_t
3773 resync_start_show(struct mddev *mddev, char *page)
3774 {
3775 	if (mddev->recovery_cp == MaxSector)
3776 		return sprintf(page, "none\n");
3777 	return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
3778 }
3779 
3780 static ssize_t
3781 resync_start_store(struct mddev *mddev, const char *buf, size_t len)
3782 {
3783 	char *e;
3784 	unsigned long long n = simple_strtoull(buf, &e, 10);
3785 
3786 	if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
3787 		return -EBUSY;
3788 	if (cmd_match(buf, "none"))
3789 		n = MaxSector;
3790 	else if (!*buf || (*e && *e != '\n'))
3791 		return -EINVAL;
3792 
3793 	mddev->recovery_cp = n;
3794 	if (mddev->pers)
3795 		set_bit(MD_CHANGE_CLEAN, &mddev->flags);
3796 	return len;
3797 }
3798 static struct md_sysfs_entry md_resync_start =
3799 __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
3800 
3801 /*
3802  * The array state can be:
3803  *
3804  * clear
3805  *     No devices, no size, no level
3806  *     Equivalent to STOP_ARRAY ioctl
3807  * inactive
3808  *     May have some settings, but array is not active
3809  *        all IO results in error
3810  *     When written, doesn't tear down array, but just stops it
3811  * suspended (not supported yet)
3812  *     All IO requests will block. The array can be reconfigured.
3813  *     Writing this, if accepted, will block until array is quiescent
3814  * readonly
3815  *     no resync can happen.  no superblocks get written.
3816  *     write requests fail
3817  * read-auto
3818  *     like readonly, but behaves like 'clean' on a write request.
3819  *
3820  * clean - no pending writes, but otherwise active.
3821  *     When written to inactive array, starts without resync
3822  *     If a write request arrives then
3823  *       if metadata is known, mark 'dirty' and switch to 'active'.
3824  *       if not known, block and switch to write-pending
3825  *     If written to an active array that has pending writes, then fails.
3826  * active
3827  *     fully active: IO and resync can be happening.
3828  *     When written to inactive array, starts with resync
3829  *
3830  * write-pending
3831  *     clean, but writes are blocked waiting for 'active' to be written.
3832  *
3833  * active-idle
3834  *     like active, but no writes have been seen for a while (100msec).
3835  *
3836  */
/*
 * The enum values index array_states[], so both lists must stay in the
 * same order.  bad_word lines up with the terminating NULL: it is the
 * index match_word() returns when no name matches.
 */
enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
		   write_pending, active_idle, bad_word};
static char *array_states[] = {
	"clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
	"write-pending", "active-idle", NULL };
3842 
/*
 * Return the index of the first entry of the NULL-terminated 'list' that
 * cmd_match() accepts for 'word', or the index of the terminating NULL
 * if none does.
 */
static int match_word(const char *word, char **list)
{
	int idx = 0;

	while (list[idx] && !cmd_match(word, list[idx]))
		idx++;

	return idx;
}
3851 
3852 static ssize_t
3853 array_state_show(struct mddev *mddev, char *page)
3854 {
3855 	enum array_state st = inactive;
3856 
3857 	if (mddev->pers)
3858 		switch(mddev->ro) {
3859 		case 1:
3860 			st = readonly;
3861 			break;
3862 		case 2:
3863 			st = read_auto;
3864 			break;
3865 		case 0:
3866 			if (mddev->in_sync)
3867 				st = clean;
3868 			else if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
3869 				st = write_pending;
3870 			else if (mddev->safemode)
3871 				st = active_idle;
3872 			else
3873 				st = active;
3874 		}
3875 	else {
3876 		if (list_empty(&mddev->disks) &&
3877 		    mddev->raid_disks == 0 &&
3878 		    mddev->dev_sectors == 0)
3879 			st = clear;
3880 		else
3881 			st = inactive;
3882 	}
3883 	return sprintf(page, "%s\n", array_states[st]);
3884 }
3885 
3886 static int do_md_stop(struct mddev * mddev, int ro, struct block_device *bdev);
3887 static int md_set_readonly(struct mddev * mddev, struct block_device *bdev);
3888 static int do_md_run(struct mddev * mddev);
3889 static int restart_array(struct mddev *mddev);
3890 
/*
 * sysfs store for "array_state": move the array to the state named in
 * 'buf' (matched against array_states[]).
 *
 * 'err' starts as -EINVAL, so every case that does not assign it
 * (unknown word, "suspended", "write-pending", "active-idle") is
 * rejected with -EINVAL.
 *
 * On success hold_active is cleared if it was UNTIL_IOCTL, the
 * sysfs_state dirent is notified and 'len' is returned; otherwise the
 * error from the state change is returned.
 */
static ssize_t
array_state_store(struct mddev *mddev, const char *buf, size_t len)
{
	int err = -EINVAL;
	enum array_state st = match_word(buf, array_states);
	switch(st) {
	case bad_word:
		break;
	case clear:
		/* stopping an active array */
		err = do_md_stop(mddev, 0, NULL);
		break;
	case inactive:
		/* stopping an active array */
		if (mddev->pers)
			err = do_md_stop(mddev, 2, NULL);
		else
			err = 0; /* already inactive */
		break;
	case suspended:
		break; /* not supported yet */
	case readonly:
		if (mddev->pers)
			err = md_set_readonly(mddev, NULL);
		else {
			/* not running yet: start it read-only */
			mddev->ro = 1;
			set_disk_ro(mddev->gendisk, 1);
			err = do_md_run(mddev);
		}
		break;
	case read_auto:
		if (mddev->pers) {
			if (mddev->ro == 0)
				err = md_set_readonly(mddev, NULL);
			else if (mddev->ro == 1)
				err = restart_array(mddev);
			/* NOTE(review): when ro is already 2 neither branch
			 * runs and err stays -EINVAL - confirm intended.
			 */
			if (err == 0) {
				mddev->ro = 2;
				set_disk_ro(mddev->gendisk, 0);
			}
		} else {
			mddev->ro = 2;
			err = do_md_run(mddev);
		}
		break;
	case clean:
		if (mddev->pers) {
			restart_array(mddev);
			spin_lock_irq(&mddev->write_lock);
			/* only allowed while no writes are pending */
			if (atomic_read(&mddev->writes_pending) == 0) {
				if (mddev->in_sync == 0) {
					mddev->in_sync = 1;
					if (mddev->safemode == 1)
						mddev->safemode = 0;
					set_bit(MD_CHANGE_CLEAN, &mddev->flags);
				}
				err = 0;
			} else
				err = -EBUSY;
			spin_unlock_irq(&mddev->write_lock);
		} else
			err = -EINVAL;
		break;
	case active:
		if (mddev->pers) {
			restart_array(mddev);
			/* release anyone waiting on sb_wait for the pending flag */
			clear_bit(MD_CHANGE_PENDING, &mddev->flags);
			wake_up(&mddev->sb_wait);
			err = 0;
		} else {
			mddev->ro = 0;
			set_disk_ro(mddev->gendisk, 0);
			err = do_md_run(mddev);
		}
		break;
	case write_pending:
	case active_idle:
		/* these cannot be set */
		break;
	}
	if (err)
		return err;
	else {
		if (mddev->hold_active == UNTIL_IOCTL)
			mddev->hold_active = 0;
		sysfs_notify_dirent_safe(mddev->sysfs_state);
		return len;
	}
}
/* sysfs attribute "array_state": readable by all, writable by owner. */
static struct md_sysfs_entry md_array_state =
__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
3982 
3983 static ssize_t
3984 max_corrected_read_errors_show(struct mddev *mddev, char *page) {
3985 	return sprintf(page, "%d\n",
3986 		       atomic_read(&mddev->max_corr_read_errors));
3987 }
3988 
3989 static ssize_t
3990 max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
3991 {
3992 	char *e;
3993 	unsigned long n = simple_strtoul(buf, &e, 10);
3994 
3995 	if (*buf && (*e == 0 || *e == '\n')) {
3996 		atomic_set(&mddev->max_corr_read_errors, n);
3997 		return len;
3998 	}
3999 	return -EINVAL;
4000 }
4001 
4002 static struct md_sysfs_entry max_corr_read_errors =
4003 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
4004 	max_corrected_read_errors_store);
4005 
/*
 * Placeholder ->show method that always fails with -EINVAL; used by the
 * attributes in this file that are declared write-only (S_IWUSR).
 */
static ssize_t
null_show(struct mddev *mddev, char *page)
{
	return -EINVAL;
}
4011 
4012 static ssize_t
4013 new_dev_store(struct mddev *mddev, const char *buf, size_t len)
4014 {
4015 	/* buf must be %d:%d\n? giving major and minor numbers */
4016 	/* The new device is added to the array.
4017 	 * If the array has a persistent superblock, we read the
4018 	 * superblock to initialise info and check validity.
4019 	 * Otherwise, only checking done is that in bind_rdev_to_array,
4020 	 * which mainly checks size.
4021 	 */
4022 	char *e;
4023 	int major = simple_strtoul(buf, &e, 10);
4024 	int minor;
4025 	dev_t dev;
4026 	struct md_rdev *rdev;
4027 	int err;
4028 
4029 	if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
4030 		return -EINVAL;
4031 	minor = simple_strtoul(e+1, &e, 10);
4032 	if (*e && *e != '\n')
4033 		return -EINVAL;
4034 	dev = MKDEV(major, minor);
4035 	if (major != MAJOR(dev) ||
4036 	    minor != MINOR(dev))
4037 		return -EOVERFLOW;
4038 
4039 
4040 	if (mddev->persistent) {
4041 		rdev = md_import_device(dev, mddev->major_version,
4042 					mddev->minor_version);
4043 		if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
4044 			struct md_rdev *rdev0
4045 				= list_entry(mddev->disks.next,
4046 					     struct md_rdev, same_set);
4047 			err = super_types[mddev->major_version]
4048 				.load_super(rdev, rdev0, mddev->minor_version);
4049 			if (err < 0)
4050 				goto out;
4051 		}
4052 	} else if (mddev->external)
4053 		rdev = md_import_device(dev, -2, -1);
4054 	else
4055 		rdev = md_import_device(dev, -1, -1);
4056 
4057 	if (IS_ERR(rdev))
4058 		return PTR_ERR(rdev);
4059 	err = bind_rdev_to_array(rdev, mddev);
4060  out:
4061 	if (err)
4062 		export_rdev(rdev);
4063 	return err ? err : len;
4064 }
4065 
/*
 * sysfs attribute "new_dev" (mode S_IWUSR only): writes go to
 * new_dev_store(), reads hit null_show() and fail.
 */
4066 static struct md_sysfs_entry md_new_device =
4067 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
4068 
4069 static ssize_t
4070 bitmap_store(struct mddev *mddev, const char *buf, size_t len)
4071 {
4072 	char *end;
4073 	unsigned long chunk, end_chunk;
4074 
4075 	if (!mddev->bitmap)
4076 		goto out;
4077 	/* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
4078 	while (*buf) {
4079 		chunk = end_chunk = simple_strtoul(buf, &end, 0);
4080 		if (buf == end) break;
4081 		if (*end == '-') { /* range */
4082 			buf = end + 1;
4083 			end_chunk = simple_strtoul(buf, &end, 0);
4084 			if (buf == end) break;
4085 		}
4086 		if (*end && !isspace(*end)) break;
4087 		bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
4088 		buf = skip_spaces(end);
4089 	}
4090 	bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
4091 out:
4092 	return len;
4093 }
4094 
/*
 * sysfs attribute "bitmap_set_bits" (mode S_IWUSR only): writes go to
 * bitmap_store(), reads hit null_show() and fail.
 */
4095 static struct md_sysfs_entry md_bitmap =
4096 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
4097 
4098 static ssize_t
4099 size_show(struct mddev *mddev, char *page)
4100 {
4101 	return sprintf(page, "%llu\n",
4102 		(unsigned long long)mddev->dev_sectors / 2);
4103 }
4104 
4105 static int update_size(struct mddev *mddev, sector_t num_sectors);
4106 
4107 static ssize_t
4108 size_store(struct mddev *mddev, const char *buf, size_t len)
4109 {
4110 	/* If array is inactive, we can reduce the component size, but
4111 	 * not increase it (except from 0).
4112 	 * If array is active, we can try an on-line resize
4113 	 */
4114 	sector_t sectors;
4115 	int err = strict_blocks_to_sectors(buf, &sectors);
4116 
4117 	if (err < 0)
4118 		return err;
4119 	if (mddev->pers) {
4120 		err = update_size(mddev, sectors);
4121 		md_update_sb(mddev, 1);
4122 	} else {
4123 		if (mddev->dev_sectors == 0 ||
4124 		    mddev->dev_sectors > sectors)
4125 			mddev->dev_sectors = sectors;
4126 		else
4127 			err = -ENOSPC;
4128 	}
4129 	return err ? err : len;
4130 }
4131 
/*
 * sysfs attribute "component_size" (mode S_IRUGO|S_IWUSR), served by
 * size_show() and size_store().
 */
4132 static struct md_sysfs_entry md_size =
4133 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
4134 
4135 
4136 /* Metadata version.
4137  * This is one of
4138  *   'none' for arrays with no metadata (good luck...)
4139  *   'external' for arrays with externally managed metadata,
4140  * or N.M for internally known formats
4141  */
4142 static ssize_t
4143 metadata_show(struct mddev *mddev, char *page)
4144 {
4145 	if (mddev->persistent)
4146 		return sprintf(page, "%d.%d\n",
4147 			       mddev->major_version, mddev->minor_version);
4148 	else if (mddev->external)
4149 		return sprintf(page, "external:%s\n", mddev->metadata_type);
4150 	else
4151 		return sprintf(page, "none\n");
4152 }
4153 
4154 static ssize_t
4155 metadata_store(struct mddev *mddev, const char *buf, size_t len)
4156 {
4157 	int major, minor;
4158 	char *e;
4159 	/* Changing the details of 'external' metadata is
4160 	 * always permitted.  Otherwise there must be
4161 	 * no devices attached to the array.
4162 	 */
4163 	if (mddev->external && strncmp(buf, "external:", 9) == 0)
4164 		;
4165 	else if (!list_empty(&mddev->disks))
4166 		return -EBUSY;
4167 
4168 	if (cmd_match(buf, "none")) {
4169 		mddev->persistent = 0;
4170 		mddev->external = 0;
4171 		mddev->major_version = 0;
4172 		mddev->minor_version = 90;
4173 		return len;
4174 	}
4175 	if (strncmp(buf, "external:", 9) == 0) {
4176 		size_t namelen = len-9;
4177 		if (namelen >= sizeof(mddev->metadata_type))
4178 			namelen = sizeof(mddev->metadata_type)-1;
4179 		strncpy(mddev->metadata_type, buf+9, namelen);
4180 		mddev->metadata_type[namelen] = 0;
4181 		if (namelen && mddev->metadata_type[namelen-1] == '\n')
4182 			mddev->metadata_type[--namelen] = 0;
4183 		mddev->persistent = 0;
4184 		mddev->external = 1;
4185 		mddev->major_version = 0;
4186 		mddev->minor_version = 90;
4187 		return len;
4188 	}
4189 	major = simple_strtoul(buf, &e, 10);
4190 	if (e==buf || *e != '.')
4191 		return -EINVAL;
4192 	buf = e+1;
4193 	minor = simple_strtoul(buf, &e, 10);
4194 	if (e==buf || (*e && *e != '\n') )
4195 		return -EINVAL;
4196 	if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
4197 		return -ENOENT;
4198 	mddev->major_version = major;
4199 	mddev->minor_version = minor;
4200 	mddev->persistent = 1;
4201 	mddev->external = 0;
4202 	return len;
4203 }
4204 
/*
 * sysfs attribute "metadata_version" (mode S_IRUGO|S_IWUSR), served by
 * metadata_show() and metadata_store().
 */
4205 static struct md_sysfs_entry md_metadata =
4206 __ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
4207 
4208 static ssize_t
4209 action_show(struct mddev *mddev, char *page)
4210 {
4211 	char *type = "idle";
4212 	if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
4213 		type = "frozen";
4214 	else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
4215 	    (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
4216 		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4217 			type = "reshape";
4218 		else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
4219 			if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
4220 				type = "resync";
4221 			else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
4222 				type = "check";
4223 			else
4224 				type = "repair";
4225 		} else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
4226 			type = "recover";
4227 	}
4228 	return sprintf(page, "%s\n", type);
4229 }
4230 
4231 static ssize_t
4232 action_store(struct mddev *mddev, const char *page, size_t len)
4233 {
4234 	if (!mddev->pers || !mddev->pers->sync_request)
4235 		return -EINVAL;
4236 
4237 	if (cmd_match(page, "frozen"))
4238 		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4239 	else
4240 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4241 
4242 	if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
4243 		if (mddev->sync_thread) {
4244 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4245 			md_reap_sync_thread(mddev);
4246 		}
4247 	} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
4248 		   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
4249 		return -EBUSY;
4250 	else if (cmd_match(page, "resync"))
4251 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4252 	else if (cmd_match(page, "recover")) {
4253 		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
4254 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4255 	} else if (cmd_match(page, "reshape")) {
4256 		int err;
4257 		if (mddev->pers->start_reshape == NULL)
4258 			return -EINVAL;
4259 		err = mddev->pers->start_reshape(mddev);
4260 		if (err)
4261 			return err;
4262 		sysfs_notify(&mddev->kobj, NULL, "degraded");
4263 	} else {
4264 		if (cmd_match(page, "check"))
4265 			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4266 		else if (!cmd_match(page, "repair"))
4267 			return -EINVAL;
4268 		set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
4269 		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4270 	}
4271 	if (mddev->ro == 2) {
4272 		/* A write to sync_action is enough to justify
4273 		 * canceling read-auto mode
4274 		 */
4275 		mddev->ro = 0;
4276 		md_wakeup_thread(mddev->sync_thread);
4277 	}
4278 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4279 	md_wakeup_thread(mddev->thread);
4280 	sysfs_notify_dirent_safe(mddev->sysfs_action);
4281 	return len;
4282 }
4283 
4284 static ssize_t
4285 mismatch_cnt_show(struct mddev *mddev, char *page)
4286 {
4287 	return sprintf(page, "%llu\n",
4288 		       (unsigned long long)
4289 		       atomic64_read(&mddev->resync_mismatches));
4290 }
4291 
/*
 * sysfs attribute "sync_action" (mode S_IRUGO|S_IWUSR), served by
 * action_show() and action_store().
 */
4292 static struct md_sysfs_entry md_scan_mode =
4293 __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
4294 
4295 
/* Read-only sysfs attribute "mismatch_cnt"; see mismatch_cnt_show(). */
4296 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
4297 
4298 static ssize_t
4299 sync_min_show(struct mddev *mddev, char *page)
4300 {
4301 	return sprintf(page, "%d (%s)\n", speed_min(mddev),
4302 		       mddev->sync_speed_min ? "local": "system");
4303 }
4304 
4305 static ssize_t
4306 sync_min_store(struct mddev *mddev, const char *buf, size_t len)
4307 {
4308 	int min;
4309 	char *e;
4310 	if (strncmp(buf, "system", 6)==0) {
4311 		mddev->sync_speed_min = 0;
4312 		return len;
4313 	}
4314 	min = simple_strtoul(buf, &e, 10);
4315 	if (buf == e || (*e && *e != '\n') || min <= 0)
4316 		return -EINVAL;
4317 	mddev->sync_speed_min = min;
4318 	return len;
4319 }
4320 
/*
 * sysfs attribute "sync_speed_min" (mode S_IRUGO|S_IWUSR), served by
 * sync_min_show() and sync_min_store().
 */
4321 static struct md_sysfs_entry md_sync_min =
4322 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
4323 
4324 static ssize_t
4325 sync_max_show(struct mddev *mddev, char *page)
4326 {
4327 	return sprintf(page, "%d (%s)\n", speed_max(mddev),
4328 		       mddev->sync_speed_max ? "local": "system");
4329 }
4330 
4331 static ssize_t
4332 sync_max_store(struct mddev *mddev, const char *buf, size_t len)
4333 {
4334 	int max;
4335 	char *e;
4336 	if (strncmp(buf, "system", 6)==0) {
4337 		mddev->sync_speed_max = 0;
4338 		return len;
4339 	}
4340 	max = simple_strtoul(buf, &e, 10);
4341 	if (buf == e || (*e && *e != '\n') || max <= 0)
4342 		return -EINVAL;
4343 	mddev->sync_speed_max = max;
4344 	return len;
4345 }
4346 
/*
 * sysfs attribute "sync_speed_max" (mode S_IRUGO|S_IWUSR), served by
 * sync_max_show() and sync_max_store().
 */
4347 static struct md_sysfs_entry md_sync_max =
4348 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
4349 
4350 static ssize_t
4351 degraded_show(struct mddev *mddev, char *page)
4352 {
4353 	return sprintf(page, "%d\n", mddev->degraded);
4354 }
/* Read-only sysfs attribute "degraded"; see degraded_show(). */
4355 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
4356 
4357 static ssize_t
4358 sync_force_parallel_show(struct mddev *mddev, char *page)
4359 {
4360 	return sprintf(page, "%d\n", mddev->parallel_resync);
4361 }
4362 
4363 static ssize_t
4364 sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
4365 {
4366 	long n;
4367 
4368 	if (strict_strtol(buf, 10, &n))
4369 		return -EINVAL;
4370 
4371 	if (n != 0 && n != 1)
4372 		return -EINVAL;
4373 
4374 	mddev->parallel_resync = n;
4375 
4376 	if (mddev->sync_thread)
4377 		wake_up(&resync_wait);
4378 
4379 	return len;
4380 }
4381 
/*
 * sysfs attribute "sync_force_parallel" (mode S_IRUGO|S_IWUSR), served by
 * sync_force_parallel_show() and sync_force_parallel_store().
 */
4382 /* force parallel resync, even with shared block devices */
4383 static struct md_sysfs_entry md_sync_force_parallel =
4384 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
4385        sync_force_parallel_show, sync_force_parallel_store);
4386 
4387 static ssize_t
4388 sync_speed_show(struct mddev *mddev, char *page)
4389 {
4390 	unsigned long resync, dt, db;
4391 	if (mddev->curr_resync == 0)
4392 		return sprintf(page, "none\n");
4393 	resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
4394 	dt = (jiffies - mddev->resync_mark) / HZ;
4395 	if (!dt) dt++;
4396 	db = resync - mddev->resync_mark_cnt;
4397 	return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
4398 }
4399 
/* Read-only sysfs attribute "sync_speed"; see sync_speed_show(). */
4400 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
4401 
4402 static ssize_t
4403 sync_completed_show(struct mddev *mddev, char *page)
4404 {
4405 	unsigned long long max_sectors, resync;
4406 
4407 	if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4408 		return sprintf(page, "none\n");
4409 
4410 	if (mddev->curr_resync == 1 ||
4411 	    mddev->curr_resync == 2)
4412 		return sprintf(page, "delayed\n");
4413 
4414 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
4415 	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4416 		max_sectors = mddev->resync_max_sectors;
4417 	else
4418 		max_sectors = mddev->dev_sectors;
4419 
4420 	resync = mddev->curr_resync_completed;
4421 	return sprintf(page, "%llu / %llu\n", resync, max_sectors);
4422 }
4423 
/* Read-only sysfs attribute "sync_completed"; see sync_completed_show(). */
4424 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
4425 
4426 static ssize_t
4427 min_sync_show(struct mddev *mddev, char *page)
4428 {
4429 	return sprintf(page, "%llu\n",
4430 		       (unsigned long long)mddev->resync_min);
4431 }
4432 static ssize_t
4433 min_sync_store(struct mddev *mddev, const char *buf, size_t len)
4434 {
4435 	unsigned long long min;
4436 	if (strict_strtoull(buf, 10, &min))
4437 		return -EINVAL;
4438 	if (min > mddev->resync_max)
4439 		return -EINVAL;
4440 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4441 		return -EBUSY;
4442 
4443 	/* Must be a multiple of chunk_size */
4444 	if (mddev->chunk_sectors) {
4445 		sector_t temp = min;
4446 		if (sector_div(temp, mddev->chunk_sectors))
4447 			return -EINVAL;
4448 	}
4449 	mddev->resync_min = min;
4450 
4451 	return len;
4452 }
4453 
/*
 * sysfs attribute "sync_min" (mode S_IRUGO|S_IWUSR), served by
 * min_sync_show() and min_sync_store().
 */
4454 static struct md_sysfs_entry md_min_sync =
4455 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
4456 
4457 static ssize_t
4458 max_sync_show(struct mddev *mddev, char *page)
4459 {
4460 	if (mddev->resync_max == MaxSector)
4461 		return sprintf(page, "max\n");
4462 	else
4463 		return sprintf(page, "%llu\n",
4464 			       (unsigned long long)mddev->resync_max);
4465 }
4466 static ssize_t
4467 max_sync_store(struct mddev *mddev, const char *buf, size_t len)
4468 {
4469 	if (strncmp(buf, "max", 3) == 0)
4470 		mddev->resync_max = MaxSector;
4471 	else {
4472 		unsigned long long max;
4473 		if (strict_strtoull(buf, 10, &max))
4474 			return -EINVAL;
4475 		if (max < mddev->resync_min)
4476 			return -EINVAL;
4477 		if (max < mddev->resync_max &&
4478 		    mddev->ro == 0 &&
4479 		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4480 			return -EBUSY;
4481 
4482 		/* Must be a multiple of chunk_size */
4483 		if (mddev->chunk_sectors) {
4484 			sector_t temp = max;
4485 			if (sector_div(temp, mddev->chunk_sectors))
4486 				return -EINVAL;
4487 		}
4488 		mddev->resync_max = max;
4489 	}
4490 	wake_up(&mddev->recovery_wait);
4491 	return len;
4492 }
4493 
/*
 * sysfs attribute "sync_max" (mode S_IRUGO|S_IWUSR), served by
 * max_sync_show() and max_sync_store().
 */
4494 static struct md_sysfs_entry md_max_sync =
4495 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
4496 
4497 static ssize_t
4498 suspend_lo_show(struct mddev *mddev, char *page)
4499 {
4500 	return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
4501 }
4502 
4503 static ssize_t
4504 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
4505 {
4506 	char *e;
4507 	unsigned long long new = simple_strtoull(buf, &e, 10);
4508 	unsigned long long old = mddev->suspend_lo;
4509 
4510 	if (mddev->pers == NULL ||
4511 	    mddev->pers->quiesce == NULL)
4512 		return -EINVAL;
4513 	if (buf == e || (*e && *e != '\n'))
4514 		return -EINVAL;
4515 
4516 	mddev->suspend_lo = new;
4517 	if (new >= old)
4518 		/* Shrinking suspended region */
4519 		mddev->pers->quiesce(mddev, 2);
4520 	else {
4521 		/* Expanding suspended region - need to wait */
4522 		mddev->pers->quiesce(mddev, 1);
4523 		mddev->pers->quiesce(mddev, 0);
4524 	}
4525 	return len;
4526 }
/*
 * sysfs attribute "suspend_lo" (mode S_IRUGO|S_IWUSR), served by
 * suspend_lo_show() and suspend_lo_store().
 */
4527 static struct md_sysfs_entry md_suspend_lo =
4528 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
4529 
4530 
4531 static ssize_t
4532 suspend_hi_show(struct mddev *mddev, char *page)
4533 {
4534 	return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
4535 }
4536 
4537 static ssize_t
4538 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
4539 {
4540 	char *e;
4541 	unsigned long long new = simple_strtoull(buf, &e, 10);
4542 	unsigned long long old = mddev->suspend_hi;
4543 
4544 	if (mddev->pers == NULL ||
4545 	    mddev->pers->quiesce == NULL)
4546 		return -EINVAL;
4547 	if (buf == e || (*e && *e != '\n'))
4548 		return -EINVAL;
4549 
4550 	mddev->suspend_hi = new;
4551 	if (new <= old)
4552 		/* Shrinking suspended region */
4553 		mddev->pers->quiesce(mddev, 2);
4554 	else {
4555 		/* Expanding suspended region - need to wait */
4556 		mddev->pers->quiesce(mddev, 1);
4557 		mddev->pers->quiesce(mddev, 0);
4558 	}
4559 	return len;
4560 }
/*
 * sysfs attribute "suspend_hi" (mode S_IRUGO|S_IWUSR), served by
 * suspend_hi_show() and suspend_hi_store().
 */
4561 static struct md_sysfs_entry md_suspend_hi =
4562 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
4563 
4564 static ssize_t
4565 reshape_position_show(struct mddev *mddev, char *page)
4566 {
4567 	if (mddev->reshape_position != MaxSector)
4568 		return sprintf(page, "%llu\n",
4569 			       (unsigned long long)mddev->reshape_position);
4570 	strcpy(page, "none\n");
4571 	return 5;
4572 }
4573 
4574 static ssize_t
4575 reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
4576 {
4577 	struct md_rdev *rdev;
4578 	char *e;
4579 	unsigned long long new = simple_strtoull(buf, &e, 10);
4580 	if (mddev->pers)
4581 		return -EBUSY;
4582 	if (buf == e || (*e && *e != '\n'))
4583 		return -EINVAL;
4584 	mddev->reshape_position = new;
4585 	mddev->delta_disks = 0;
4586 	mddev->reshape_backwards = 0;
4587 	mddev->new_level = mddev->level;
4588 	mddev->new_layout = mddev->layout;
4589 	mddev->new_chunk_sectors = mddev->chunk_sectors;
4590 	rdev_for_each(rdev, mddev)
4591 		rdev->new_data_offset = rdev->data_offset;
4592 	return len;
4593 }
4594 
/*
 * sysfs attribute "reshape_position" (mode S_IRUGO|S_IWUSR), served by
 * reshape_position_show() and reshape_position_store().
 */
4595 static struct md_sysfs_entry md_reshape_position =
4596 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
4597        reshape_position_store);
4598 
4599 static ssize_t
4600 reshape_direction_show(struct mddev *mddev, char *page)
4601 {
4602 	return sprintf(page, "%s\n",
4603 		       mddev->reshape_backwards ? "backwards" : "forwards");
4604 }
4605 
4606 static ssize_t
4607 reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
4608 {
4609 	int backwards = 0;
4610 	if (cmd_match(buf, "forwards"))
4611 		backwards = 0;
4612 	else if (cmd_match(buf, "backwards"))
4613 		backwards = 1;
4614 	else
4615 		return -EINVAL;
4616 	if (mddev->reshape_backwards == backwards)
4617 		return len;
4618 
4619 	/* check if we are allowed to change */
4620 	if (mddev->delta_disks)
4621 		return -EBUSY;
4622 
4623 	if (mddev->persistent &&
4624 	    mddev->major_version == 0)
4625 		return -EINVAL;
4626 
4627 	mddev->reshape_backwards = backwards;
4628 	return len;
4629 }
4630 
/*
 * sysfs attribute "reshape_direction" (mode S_IRUGO|S_IWUSR), served by
 * reshape_direction_show() and reshape_direction_store().
 */
4631 static struct md_sysfs_entry md_reshape_direction =
4632 __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
4633        reshape_direction_store);
4634 
4635 static ssize_t
4636 array_size_show(struct mddev *mddev, char *page)
4637 {
4638 	if (mddev->external_size)
4639 		return sprintf(page, "%llu\n",
4640 			       (unsigned long long)mddev->array_sectors/2);
4641 	else
4642 		return sprintf(page, "default\n");
4643 }
4644 
4645 static ssize_t
4646 array_size_store(struct mddev *mddev, const char *buf, size_t len)
4647 {
4648 	sector_t sectors;
4649 
4650 	if (strncmp(buf, "default", 7) == 0) {
4651 		if (mddev->pers)
4652 			sectors = mddev->pers->size(mddev, 0, 0);
4653 		else
4654 			sectors = mddev->array_sectors;
4655 
4656 		mddev->external_size = 0;
4657 	} else {
4658 		if (strict_blocks_to_sectors(buf, &sectors) < 0)
4659 			return -EINVAL;
4660 		if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
4661 			return -E2BIG;
4662 
4663 		mddev->external_size = 1;
4664 	}
4665 
4666 	mddev->array_sectors = sectors;
4667 	if (mddev->pers) {
4668 		set_capacity(mddev->gendisk, mddev->array_sectors);
4669 		revalidate_disk(mddev->gendisk);
4670 	}
4671 	return len;
4672 }
4673 
/*
 * sysfs attribute "array_size" (mode S_IRUGO|S_IWUSR), served by
 * array_size_show() and array_size_store().
 */
4674 static struct md_sysfs_entry md_array_size =
4675 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
4676        array_size_store);
4677 
/*
 * NULL-terminated list of the attributes every md kobject gets; it is
 * installed through md_ktype.default_attrs.
 */
4678 static struct attribute *md_default_attrs[] = {
4679 	&md_level.attr,
4680 	&md_layout.attr,
4681 	&md_raid_disks.attr,
4682 	&md_chunk_size.attr,
4683 	&md_size.attr,
4684 	&md_resync_start.attr,
4685 	&md_metadata.attr,
4686 	&md_new_device.attr,
4687 	&md_safe_delay.attr,
4688 	&md_array_state.attr,
4689 	&md_reshape_position.attr,
4690 	&md_reshape_direction.attr,
4691 	&md_array_size.attr,
4692 	&max_corr_read_errors.attr,
4693 	NULL,
4694 };
4695 
/*
 * NULL-terminated list of the sync/recovery related attributes.  They are
 * exposed as a group (md_redundancy_group) which md_run() creates only
 * when the personality provides sync_request.
 */
4696 static struct attribute *md_redundancy_attrs[] = {
4697 	&md_scan_mode.attr,
4698 	&md_mismatches.attr,
4699 	&md_sync_min.attr,
4700 	&md_sync_max.attr,
4701 	&md_sync_speed.attr,
4702 	&md_sync_force_parallel.attr,
4703 	&md_sync_completed.attr,
4704 	&md_min_sync.attr,
4705 	&md_max_sync.attr,
4706 	&md_suspend_lo.attr,
4707 	&md_suspend_hi.attr,
4708 	&md_bitmap.attr,
4709 	&md_degraded.attr,
4710 	NULL,
4711 };
/*
 * Attribute group wrapping md_redundancy_attrs.  .name is NULL, so the
 * group is unnamed (per the sysfs convention the files then presumably
 * land directly in the kobject's directory -- confirm against
 * sysfs_create_group()).
 */
4712 static struct attribute_group md_redundancy_group = {
4713 	.name = NULL,
4714 	.attrs = md_redundancy_attrs,
4715 };
4716 
4717 
4718 static ssize_t
4719 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
4720 {
4721 	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
4722 	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
4723 	ssize_t rv;
4724 
4725 	if (!entry->show)
4726 		return -EIO;
4727 	spin_lock(&all_mddevs_lock);
4728 	if (list_empty(&mddev->all_mddevs)) {
4729 		spin_unlock(&all_mddevs_lock);
4730 		return -EBUSY;
4731 	}
4732 	mddev_get(mddev);
4733 	spin_unlock(&all_mddevs_lock);
4734 
4735 	rv = mddev_lock(mddev);
4736 	if (!rv) {
4737 		rv = entry->show(mddev, page);
4738 		mddev_unlock(mddev);
4739 	}
4740 	mddev_put(mddev);
4741 	return rv;
4742 }
4743 
4744 static ssize_t
4745 md_attr_store(struct kobject *kobj, struct attribute *attr,
4746 	      const char *page, size_t length)
4747 {
4748 	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
4749 	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
4750 	ssize_t rv;
4751 
4752 	if (!entry->store)
4753 		return -EIO;
4754 	if (!capable(CAP_SYS_ADMIN))
4755 		return -EACCES;
4756 	spin_lock(&all_mddevs_lock);
4757 	if (list_empty(&mddev->all_mddevs)) {
4758 		spin_unlock(&all_mddevs_lock);
4759 		return -EBUSY;
4760 	}
4761 	mddev_get(mddev);
4762 	spin_unlock(&all_mddevs_lock);
4763 	if (entry->store == new_dev_store)
4764 		flush_workqueue(md_misc_wq);
4765 	rv = mddev_lock(mddev);
4766 	if (!rv) {
4767 		rv = entry->store(mddev, page, length);
4768 		mddev_unlock(mddev);
4769 	}
4770 	mddev_put(mddev);
4771 	return rv;
4772 }
4773 
4774 static void md_free(struct kobject *ko)
4775 {
4776 	struct mddev *mddev = container_of(ko, struct mddev, kobj);
4777 
4778 	if (mddev->sysfs_state)
4779 		sysfs_put(mddev->sysfs_state);
4780 
4781 	if (mddev->gendisk) {
4782 		del_gendisk(mddev->gendisk);
4783 		put_disk(mddev->gendisk);
4784 	}
4785 	if (mddev->queue)
4786 		blk_cleanup_queue(mddev->queue);
4787 
4788 	kfree(mddev);
4789 }
4790 
/*
 * sysfs operations shared by all md attributes: reads and writes are
 * funnelled through md_attr_show() / md_attr_store().
 */
4791 static const struct sysfs_ops md_sysfs_ops = {
4792 	.show	= md_attr_show,
4793 	.store	= md_attr_store,
4794 };
/*
 * kobject type of the "md" kobject that md_alloc() registers under the
 * disk: freed via md_free(), accessed via md_sysfs_ops, and populated
 * with md_default_attrs.
 */
4795 static struct kobj_type md_ktype = {
4796 	.release	= md_free,
4797 	.sysfs_ops	= &md_sysfs_ops,
4798 	.default_attrs	= md_default_attrs,
4799 };
4800 
/*
 * NOTE(review): presumably the major number used for partitionable
 * ("mdp") arrays, filled in at registration time -- not set or read in
 * this part of the file, confirm at the register_blkdev() call site.
 */
4801 int mdp_major = 0;
4802 
4803 static void mddev_delayed_delete(struct work_struct *ws)
4804 {
4805 	struct mddev *mddev = container_of(ws, struct mddev, del_work);
4806 
4807 	sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
4808 	kobject_del(&mddev->kobj);
4809 	kobject_put(&mddev->kobj);
4810 }
4811 
4812 static int md_alloc(dev_t dev, char *name)
4813 {
4814 	static DEFINE_MUTEX(disks_mutex);
4815 	struct mddev *mddev = mddev_find(dev);
4816 	struct gendisk *disk;
4817 	int partitioned;
4818 	int shift;
4819 	int unit;
4820 	int error;
4821 
4822 	if (!mddev)
4823 		return -ENODEV;
4824 
4825 	partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
4826 	shift = partitioned ? MdpMinorShift : 0;
4827 	unit = MINOR(mddev->unit) >> shift;
4828 
4829 	/* wait for any previous instance of this device to be
4830 	 * completely removed (mddev_delayed_delete).
4831 	 */
4832 	flush_workqueue(md_misc_wq);
4833 
4834 	mutex_lock(&disks_mutex);
4835 	error = -EEXIST;
4836 	if (mddev->gendisk)
4837 		goto abort;
4838 
4839 	if (name) {
4840 		/* Need to ensure that 'name' is not a duplicate.
4841 		 */
4842 		struct mddev *mddev2;
4843 		spin_lock(&all_mddevs_lock);
4844 
4845 		list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
4846 			if (mddev2->gendisk &&
4847 			    strcmp(mddev2->gendisk->disk_name, name) == 0) {
4848 				spin_unlock(&all_mddevs_lock);
4849 				goto abort;
4850 			}
4851 		spin_unlock(&all_mddevs_lock);
4852 	}
4853 
4854 	error = -ENOMEM;
4855 	mddev->queue = blk_alloc_queue(GFP_KERNEL);
4856 	if (!mddev->queue)
4857 		goto abort;
4858 	mddev->queue->queuedata = mddev;
4859 
4860 	blk_queue_make_request(mddev->queue, md_make_request);
4861 	blk_set_stacking_limits(&mddev->queue->limits);
4862 
4863 	disk = alloc_disk(1 << shift);
4864 	if (!disk) {
4865 		blk_cleanup_queue(mddev->queue);
4866 		mddev->queue = NULL;
4867 		goto abort;
4868 	}
4869 	disk->major = MAJOR(mddev->unit);
4870 	disk->first_minor = unit << shift;
4871 	if (name)
4872 		strcpy(disk->disk_name, name);
4873 	else if (partitioned)
4874 		sprintf(disk->disk_name, "md_d%d", unit);
4875 	else
4876 		sprintf(disk->disk_name, "md%d", unit);
4877 	disk->fops = &md_fops;
4878 	disk->private_data = mddev;
4879 	disk->queue = mddev->queue;
4880 	blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA);
4881 	/* Allow extended partitions.  This makes the
4882 	 * 'mdp' device redundant, but we can't really
4883 	 * remove it now.
4884 	 */
4885 	disk->flags |= GENHD_FL_EXT_DEVT;
4886 	mddev->gendisk = disk;
4887 	/* As soon as we call add_disk(), another thread could get
4888 	 * through to md_open, so make sure it doesn't get too far
4889 	 */
4890 	mutex_lock(&mddev->open_mutex);
4891 	add_disk(disk);
4892 
4893 	error = kobject_init_and_add(&mddev->kobj, &md_ktype,
4894 				     &disk_to_dev(disk)->kobj, "%s", "md");
4895 	if (error) {
4896 		/* This isn't possible, but as kobject_init_and_add is marked
4897 		 * __must_check, we must do something with the result
4898 		 */
4899 		printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
4900 		       disk->disk_name);
4901 		error = 0;
4902 	}
4903 	if (mddev->kobj.sd &&
4904 	    sysfs_create_group(&mddev->kobj, &md_bitmap_group))
4905 		printk(KERN_DEBUG "pointless warning\n");
4906 	mutex_unlock(&mddev->open_mutex);
4907  abort:
4908 	mutex_unlock(&disks_mutex);
4909 	if (!error && mddev->kobj.sd) {
4910 		kobject_uevent(&mddev->kobj, KOBJ_ADD);
4911 		mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
4912 	}
4913 	mddev_put(mddev);
4914 	return error;
4915 }
4916 
4917 static struct kobject *md_probe(dev_t dev, int *part, void *data)
4918 {
4919 	md_alloc(dev, NULL);
4920 	return NULL;
4921 }
4922 
4923 static int add_named_array(const char *val, struct kernel_param *kp)
4924 {
4925 	/* val must be "md_*" where * is not all digits.
4926 	 * We allocate an array with a large free minor number, and
4927 	 * set the name to val.  val must not already be an active name.
4928 	 */
4929 	int len = strlen(val);
4930 	char buf[DISK_NAME_LEN];
4931 
4932 	while (len && val[len-1] == '\n')
4933 		len--;
4934 	if (len >= DISK_NAME_LEN)
4935 		return -E2BIG;
4936 	strlcpy(buf, val, len+1);
4937 	if (strncmp(buf, "md_", 3) != 0)
4938 		return -EINVAL;
4939 	return md_alloc(0, buf);
4940 }
4941 
4942 static void md_safemode_timeout(unsigned long data)
4943 {
4944 	struct mddev *mddev = (struct mddev *) data;
4945 
4946 	if (!atomic_read(&mddev->writes_pending)) {
4947 		mddev->safemode = 1;
4948 		if (mddev->external)
4949 			sysfs_notify_dirent_safe(mddev->sysfs_state);
4950 	}
4951 	md_wakeup_thread(mddev->thread);
4952 }
4953 
/* Copied into mddev->ok_start_degraded by md_run() for each array. */
4954 static int start_dirty_degraded;
4955 
4956 int md_run(struct mddev *mddev)
4957 {
4958 	int err;
4959 	struct md_rdev *rdev;
4960 	struct md_personality *pers;
4961 
4962 	if (list_empty(&mddev->disks))
4963 		/* cannot run an array with no devices.. */
4964 		return -EINVAL;
4965 
4966 	if (mddev->pers)
4967 		return -EBUSY;
4968 	/* Cannot run until previous stop completes properly */
4969 	if (mddev->sysfs_active)
4970 		return -EBUSY;
4971 
4972 	/*
4973 	 * Analyze all RAID superblock(s)
4974 	 */
4975 	if (!mddev->raid_disks) {
4976 		if (!mddev->persistent)
4977 			return -EINVAL;
4978 		analyze_sbs(mddev);
4979 	}
4980 
4981 	if (mddev->level != LEVEL_NONE)
4982 		request_module("md-level-%d", mddev->level);
4983 	else if (mddev->clevel[0])
4984 		request_module("md-%s", mddev->clevel);
4985 
4986 	/*
4987 	 * Drop all container device buffers, from now on
4988 	 * the only valid external interface is through the md
4989 	 * device.
4990 	 */
4991 	rdev_for_each(rdev, mddev) {
4992 		if (test_bit(Faulty, &rdev->flags))
4993 			continue;
4994 		sync_blockdev(rdev->bdev);
4995 		invalidate_bdev(rdev->bdev);
4996 
4997 		/* perform some consistency tests on the device.
4998 		 * We don't want the data to overlap the metadata,
4999 		 * Internal Bitmap issues have been handled elsewhere.
5000 		 */
5001 		if (rdev->meta_bdev) {
5002 			/* Nothing to check */;
5003 		} else if (rdev->data_offset < rdev->sb_start) {
5004 			if (mddev->dev_sectors &&
5005 			    rdev->data_offset + mddev->dev_sectors
5006 			    > rdev->sb_start) {
5007 				printk("md: %s: data overlaps metadata\n",
5008 				       mdname(mddev));
5009 				return -EINVAL;
5010 			}
5011 		} else {
5012 			if (rdev->sb_start + rdev->sb_size/512
5013 			    > rdev->data_offset) {
5014 				printk("md: %s: metadata overlaps data\n",
5015 				       mdname(mddev));
5016 				return -EINVAL;
5017 			}
5018 		}
5019 		sysfs_notify_dirent_safe(rdev->sysfs_state);
5020 	}
5021 
5022 	if (mddev->bio_set == NULL)
5023 		mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0);
5024 
5025 	spin_lock(&pers_lock);
5026 	pers = find_pers(mddev->level, mddev->clevel);
5027 	if (!pers || !try_module_get(pers->owner)) {
5028 		spin_unlock(&pers_lock);
5029 		if (mddev->level != LEVEL_NONE)
5030 			printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
5031 			       mddev->level);
5032 		else
5033 			printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
5034 			       mddev->clevel);
5035 		return -EINVAL;
5036 	}
5037 	mddev->pers = pers;
5038 	spin_unlock(&pers_lock);
5039 	if (mddev->level != pers->level) {
5040 		mddev->level = pers->level;
5041 		mddev->new_level = pers->level;
5042 	}
5043 	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
5044 
5045 	if (mddev->reshape_position != MaxSector &&
5046 	    pers->start_reshape == NULL) {
5047 		/* This personality cannot handle reshaping... */
5048 		mddev->pers = NULL;
5049 		module_put(pers->owner);
5050 		return -EINVAL;
5051 	}
5052 
5053 	if (pers->sync_request) {
5054 		/* Warn if this is a potentially silly
5055 		 * configuration.
5056 		 */
5057 		char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
5058 		struct md_rdev *rdev2;
5059 		int warned = 0;
5060 
5061 		rdev_for_each(rdev, mddev)
5062 			rdev_for_each(rdev2, mddev) {
5063 				if (rdev < rdev2 &&
5064 				    rdev->bdev->bd_contains ==
5065 				    rdev2->bdev->bd_contains) {
5066 					printk(KERN_WARNING
5067 					       "%s: WARNING: %s appears to be"
5068 					       " on the same physical disk as"
5069 					       " %s.\n",
5070 					       mdname(mddev),
5071 					       bdevname(rdev->bdev,b),
5072 					       bdevname(rdev2->bdev,b2));
5073 					warned = 1;
5074 				}
5075 			}
5076 
5077 		if (warned)
5078 			printk(KERN_WARNING
5079 			       "True protection against single-disk"
5080 			       " failure might be compromised.\n");
5081 	}
5082 
5083 	mddev->recovery = 0;
5084 	/* may be over-ridden by personality */
5085 	mddev->resync_max_sectors = mddev->dev_sectors;
5086 
5087 	mddev->ok_start_degraded = start_dirty_degraded;
5088 
5089 	if (start_readonly && mddev->ro == 0)
5090 		mddev->ro = 2; /* read-only, but switch on first write */
5091 
5092 	err = mddev->pers->run(mddev);
5093 	if (err)
5094 		printk(KERN_ERR "md: pers->run() failed ...\n");
5095 	else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) {
5096 		WARN_ONCE(!mddev->external_size, "%s: default size too small,"
5097 			  " but 'external_size' not in effect?\n", __func__);
5098 		printk(KERN_ERR
5099 		       "md: invalid array_size %llu > default size %llu\n",
5100 		       (unsigned long long)mddev->array_sectors / 2,
5101 		       (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2);
5102 		err = -EINVAL;
5103 		mddev->pers->stop(mddev);
5104 	}
5105 	if (err == 0 && mddev->pers->sync_request &&
5106 	    (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
5107 		err = bitmap_create(mddev);
5108 		if (err) {
5109 			printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
5110 			       mdname(mddev), err);
5111 			mddev->pers->stop(mddev);
5112 		}
5113 	}
5114 	if (err) {
5115 		module_put(mddev->pers->owner);
5116 		mddev->pers = NULL;
5117 		bitmap_destroy(mddev);
5118 		return err;
5119 	}
5120 	if (mddev->pers->sync_request) {
5121 		if (mddev->kobj.sd &&
5122 		    sysfs_create_group(&mddev->kobj, &md_redundancy_group))
5123 			printk(KERN_WARNING
5124 			       "md: cannot register extra attributes for %s\n",
5125 			       mdname(mddev));
5126 		mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
5127 	} else if (mddev->ro == 2) /* auto-readonly not meaningful */
5128 		mddev->ro = 0;
5129 
5130  	atomic_set(&mddev->writes_pending,0);
5131 	atomic_set(&mddev->max_corr_read_errors,
5132 		   MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
5133 	mddev->safemode = 0;
5134 	mddev->safemode_timer.function = md_safemode_timeout;
5135 	mddev->safemode_timer.data = (unsigned long) mddev;
5136 	mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
5137 	mddev->in_sync = 1;
5138 	smp_wmb();
5139 	mddev->ready = 1;
5140 	rdev_for_each(rdev, mddev)
5141 		if (rdev->raid_disk >= 0)
5142 			if (sysfs_link_rdev(mddev, rdev))
5143 				/* failure here is OK */;
5144 
5145 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5146 
5147 	if (mddev->flags)
5148 		md_update_sb(mddev, 0);
5149 
5150 	md_new_event(mddev);
5151 	sysfs_notify_dirent_safe(mddev->sysfs_state);
5152 	sysfs_notify_dirent_safe(mddev->sysfs_action);
5153 	sysfs_notify(&mddev->kobj, NULL, "degraded");
5154 	return 0;
5155 }
5156 EXPORT_SYMBOL_GPL(md_run);
5157 
5158 static int do_md_run(struct mddev *mddev)
5159 {
5160 	int err;
5161 
5162 	err = md_run(mddev);
5163 	if (err)
5164 		goto out;
5165 	err = bitmap_load(mddev);
5166 	if (err) {
5167 		bitmap_destroy(mddev);
5168 		goto out;
5169 	}
5170 
5171 	md_wakeup_thread(mddev->thread);
5172 	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
5173 
5174 	set_capacity(mddev->gendisk, mddev->array_sectors);
5175 	revalidate_disk(mddev->gendisk);
5176 	mddev->changed = 1;
5177 	kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
5178 out:
5179 	return err;
5180 }
5181 
5182 static int restart_array(struct mddev *mddev)
5183 {
5184 	struct gendisk *disk = mddev->gendisk;
5185 
5186 	/* Complain if it has no devices */
5187 	if (list_empty(&mddev->disks))
5188 		return -ENXIO;
5189 	if (!mddev->pers)
5190 		return -EINVAL;
5191 	if (!mddev->ro)
5192 		return -EBUSY;
5193 	mddev->safemode = 0;
5194 	mddev->ro = 0;
5195 	set_disk_ro(disk, 0);
5196 	printk(KERN_INFO "md: %s switched to read-write mode.\n",
5197 		mdname(mddev));
5198 	/* Kick recovery or resync if necessary */
5199 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5200 	md_wakeup_thread(mddev->thread);
5201 	md_wakeup_thread(mddev->sync_thread);
5202 	sysfs_notify_dirent_safe(mddev->sysfs_state);
5203 	return 0;
5204 }
5205 
5206 /* similar to deny_write_access, but accounts for our holding a reference
5207  * to the file ourselves */
5208 static int deny_bitmap_write_access(struct file * file)
5209 {
5210 	struct inode *inode = file->f_mapping->host;
5211 
5212 	spin_lock(&inode->i_lock);
5213 	if (atomic_read(&inode->i_writecount) > 1) {
5214 		spin_unlock(&inode->i_lock);
5215 		return -ETXTBSY;
5216 	}
5217 	atomic_set(&inode->i_writecount, -1);
5218 	spin_unlock(&inode->i_lock);
5219 
5220 	return 0;
5221 }
5222 
5223 void restore_bitmap_write_access(struct file *file)
5224 {
5225 	struct inode *inode = file->f_mapping->host;
5226 
5227 	spin_lock(&inode->i_lock);
5228 	atomic_set(&inode->i_writecount, 1);
5229 	spin_unlock(&inode->i_lock);
5230 }
5231 
5232 static void md_clean(struct mddev *mddev)
5233 {
5234 	mddev->array_sectors = 0;
5235 	mddev->external_size = 0;
5236 	mddev->dev_sectors = 0;
5237 	mddev->raid_disks = 0;
5238 	mddev->recovery_cp = 0;
5239 	mddev->resync_min = 0;
5240 	mddev->resync_max = MaxSector;
5241 	mddev->reshape_position = MaxSector;
5242 	mddev->external = 0;
5243 	mddev->persistent = 0;
5244 	mddev->level = LEVEL_NONE;
5245 	mddev->clevel[0] = 0;
5246 	mddev->flags = 0;
5247 	mddev->ro = 0;
5248 	mddev->metadata_type[0] = 0;
5249 	mddev->chunk_sectors = 0;
5250 	mddev->ctime = mddev->utime = 0;
5251 	mddev->layout = 0;
5252 	mddev->max_disks = 0;
5253 	mddev->events = 0;
5254 	mddev->can_decrease_events = 0;
5255 	mddev->delta_disks = 0;
5256 	mddev->reshape_backwards = 0;
5257 	mddev->new_level = LEVEL_NONE;
5258 	mddev->new_layout = 0;
5259 	mddev->new_chunk_sectors = 0;
5260 	mddev->curr_resync = 0;
5261 	atomic64_set(&mddev->resync_mismatches, 0);
5262 	mddev->suspend_lo = mddev->suspend_hi = 0;
5263 	mddev->sync_speed_min = mddev->sync_speed_max = 0;
5264 	mddev->recovery = 0;
5265 	mddev->in_sync = 0;
5266 	mddev->changed = 0;
5267 	mddev->degraded = 0;
5268 	mddev->safemode = 0;
5269 	mddev->merge_check_needed = 0;
5270 	mddev->bitmap_info.offset = 0;
5271 	mddev->bitmap_info.default_offset = 0;
5272 	mddev->bitmap_info.default_space = 0;
5273 	mddev->bitmap_info.chunksize = 0;
5274 	mddev->bitmap_info.daemon_sleep = 0;
5275 	mddev->bitmap_info.max_write_behind = 0;
5276 }
5277 
5278 static void __md_stop_writes(struct mddev *mddev)
5279 {
5280 	if (mddev->sync_thread) {
5281 		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5282 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5283 		md_reap_sync_thread(mddev);
5284 	}
5285 
5286 	del_timer_sync(&mddev->safemode_timer);
5287 
5288 	bitmap_flush(mddev);
5289 	md_super_wait(mddev);
5290 
5291 	if (mddev->ro == 0 &&
5292 	    (!mddev->in_sync || mddev->flags)) {
5293 		/* mark array as shutdown cleanly */
5294 		mddev->in_sync = 1;
5295 		md_update_sb(mddev, 1);
5296 	}
5297 }
5298 
/*
 * Exported wrapper around __md_stop_writes(): the work is bracketed by
 * mddev_lock()/mddev_unlock().  The return value of mddev_lock() is not
 * examined here.
 */
void md_stop_writes(struct mddev *mddev)
{
	mddev_lock(mddev);
	__md_stop_writes(mddev);
	mddev_unlock(mddev);
}
EXPORT_SYMBOL_GPL(md_stop_writes);
5306 
5307 static void __md_stop(struct mddev *mddev)
5308 {
5309 	mddev->ready = 0;
5310 	mddev->pers->stop(mddev);
5311 	if (mddev->pers->sync_request && mddev->to_remove == NULL)
5312 		mddev->to_remove = &md_redundancy_group;
5313 	module_put(mddev->pers->owner);
5314 	mddev->pers = NULL;
5315 	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5316 }
5317 
5318 void md_stop(struct mddev *mddev)
5319 {
5320 	/* stop the array and free an attached data structures.
5321 	 * This is called from dm-raid
5322 	 */
5323 	__md_stop(mddev);
5324 	bitmap_destroy(mddev);
5325 	if (mddev->bio_set)
5326 		bioset_free(mddev->bio_set);
5327 }
5328 
5329 EXPORT_SYMBOL_GPL(md_stop);
5330 
5331 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
5332 {
5333 	int err = 0;
5334 	mutex_lock(&mddev->open_mutex);
5335 	if (atomic_read(&mddev->openers) > !!bdev) {
5336 		printk("md: %s still in use.\n",mdname(mddev));
5337 		err = -EBUSY;
5338 		goto out;
5339 	}
5340 	if (bdev)
5341 		sync_blockdev(bdev);
5342 	if (mddev->pers) {
5343 		__md_stop_writes(mddev);
5344 
5345 		err  = -ENXIO;
5346 		if (mddev->ro==1)
5347 			goto out;
5348 		mddev->ro = 1;
5349 		set_disk_ro(mddev->gendisk, 1);
5350 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5351 		sysfs_notify_dirent_safe(mddev->sysfs_state);
5352 		err = 0;
5353 	}
5354 out:
5355 	mutex_unlock(&mddev->open_mutex);
5356 	return err;
5357 }
5358 
/*
 * do_md_stop - stop an array and, for mode 0, take it apart.
 *
 * @mddev: the array to stop.
 * @mode:  see the table below.
 * @bdev:  may be NULL.  When non-NULL one opener is tolerated and @bdev is
 *         synced first (presumably it is the caller's own open of the
 *         array -- confirm against the callers).
 *
 * Returns -EBUSY when the array has more openers than allowed or
 * ->sysfs_active is set; returns 0 in every other case.
 */
5359 /* mode:
5360  *   0 - completely stop and dis-assemble array
5361  *   2 - stop but do not disassemble array
5362  */
5363 static int do_md_stop(struct mddev * mddev, int mode,
5364 		      struct block_device *bdev)
5365 {
5366 	struct gendisk *disk = mddev->gendisk;
5367 	struct md_rdev *rdev;
5368 
5369 	mutex_lock(&mddev->open_mutex);
5370 	if (atomic_read(&mddev->openers) > !!bdev ||
5371 	    mddev->sysfs_active) {
5372 		printk("md: %s still in use.\n",mdname(mddev));
5373 		mutex_unlock(&mddev->open_mutex);
5374 		return -EBUSY;
5375 	}
5376 	if (bdev)
5377 		/* It is possible IO was issued on some other
5378 		 * open file which was closed before we took ->open_mutex.
5379 		 * As that was not the last close __blkdev_put will not
5380 		 * have called sync_blockdev, so we must.
5381 		 */
5382 		sync_blockdev(bdev);
5383 
	/* Only a running array (personality attached) needs to be stopped. */
5384 	if (mddev->pers) {
5385 		if (mddev->ro)
5386 			set_disk_ro(disk, 0);
5387 
5388 		__md_stop_writes(mddev);
5389 		__md_stop(mddev);
5390 		mddev->queue->merge_bvec_fn = NULL;
5391 		mddev->queue->backing_dev_info.congested_fn = NULL;
5392 
5393 		/* tell userspace to handle 'inactive' */
5394 		sysfs_notify_dirent_safe(mddev->sysfs_state);
5395 
5396 		rdev_for_each(rdev, mddev)
5397 			if (rdev->raid_disk >= 0)
5398 				sysfs_unlink_rdev(mddev, rdev);
5399 
5400 		set_capacity(disk, 0);
		/* ->open_mutex is dropped before the disk is revalidated. */
5401 		mutex_unlock(&mddev->open_mutex);
5402 		mddev->changed = 1;
5403 		revalidate_disk(disk);
5404 
5405 		if (mddev->ro)
5406 			mddev->ro = 0;
5407 	} else
5408 		mutex_unlock(&mddev->open_mutex);
5409 	/*
5410 	 * Free resources if final stop
5411 	 */
5412 	if (mode == 0) {
5413 		printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
5414 
5415 		bitmap_destroy(mddev);
		/* Undo set_bitmap_file(): restore write access, drop our ref. */
5416 		if (mddev->bitmap_info.file) {
5417 			restore_bitmap_write_access(mddev->bitmap_info.file);
5418 			fput(mddev->bitmap_info.file);
5419 			mddev->bitmap_info.file = NULL;
5420 		}
5421 		mddev->bitmap_info.offset = 0;
5422 
5423 		export_array(mddev);
5424 
5425 		md_clean(mddev);
5426 		kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
5427 		if (mddev->hold_active == UNTIL_STOP)
5428 			mddev->hold_active = 0;
5429 	}
	/* The steps below run for every mode. */
5430 	blk_integrity_unregister(disk);
5431 	md_new_event(mddev);
5432 	sysfs_notify_dirent_safe(mddev->sysfs_state);
5433 	return 0;
5434 }
5435 
5436 #ifndef MODULE
5437 static void autorun_array(struct mddev *mddev)
5438 {
5439 	struct md_rdev *rdev;
5440 	int err;
5441 
5442 	if (list_empty(&mddev->disks))
5443 		return;
5444 
5445 	printk(KERN_INFO "md: running: ");
5446 
5447 	rdev_for_each(rdev, mddev) {
5448 		char b[BDEVNAME_SIZE];
5449 		printk("<%s>", bdevname(rdev->bdev,b));
5450 	}
5451 	printk("\n");
5452 
5453 	err = do_md_run(mddev);
5454 	if (err) {
5455 		printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
5456 		do_md_stop(mddev, 0, NULL);
5457 	}
5458 }
5459 
5460 /*
5461  * lets try to run arrays based on all disks that have arrived
5462  * until now. (those are in pending_raid_disks)
5463  *
5464  * the method: pick the first pending disk, collect all disks with
5465  * the same UUID, remove all from the pending list and put them into
5466  * the 'same_array' list. Then order this list based on superblock
5467  * update time (freshest comes first), kick out 'old' disks and
5468  * compare superblocks. If everything's fine then run it.
5469  *
5470  * If "unit" is allocated, then bump its reference count
5471  */
/*
 * @part: when non-zero the array device is created under mdp_major with
 *        the preferred minor shifted by MdpMinorShift; otherwise it is
 *        created under MD_MAJOR with the preferred minor used directly.
 *
 * Each pass of the loop handles the first pending device (rdev0) plus every
 * pending device that super_90_load() accepts against it.
 */
5472 static void autorun_devices(int part)
5473 {
5474 	struct md_rdev *rdev0, *rdev, *tmp;
5475 	struct mddev *mddev;
5476 	char b[BDEVNAME_SIZE];
5477 
5478 	printk(KERN_INFO "md: autorun ...\n");
5479 	while (!list_empty(&pending_raid_disks)) {
5480 		int unit;
5481 		dev_t dev;
5482 		LIST_HEAD(candidates);
5483 		rdev0 = list_entry(pending_raid_disks.next,
5484 					 struct md_rdev, same_set);
5485 
5486 		printk(KERN_INFO "md: considering %s ...\n",
5487 			bdevname(rdev0->bdev,b));
5488 		INIT_LIST_HEAD(&candidates);
		/* Move every device that matches rdev0 onto 'candidates'. */
5489 		rdev_for_each_list(rdev, tmp, &pending_raid_disks)
5490 			if (super_90_load(rdev, rdev0, 0) >= 0) {
5491 				printk(KERN_INFO "md:  adding %s ...\n",
5492 					bdevname(rdev->bdev,b));
5493 				list_move(&rdev->same_set, &candidates);
5494 			}
5495 		/*
5496 		 * now we have a set of devices, with all of them having
5497 		 * mostly sane superblocks. It's time to allocate the
5498 		 * mddev.
5499 		 */
5500 		if (part) {
5501 			dev = MKDEV(mdp_major,
5502 				    rdev0->preferred_minor << MdpMinorShift);
5503 			unit = MINOR(dev) >> MdpMinorShift;
5504 		} else {
5505 			dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
5506 			unit = MINOR(dev);
5507 		}
		/*
		 * The minor must survive the round trip through dev_t.
		 * NOTE(review): this break (and the one below) leaves the loop
		 * without exporting the devices already moved to 'candidates'
		 * -- confirm whether that is intended.
		 */
5508 		if (rdev0->preferred_minor != unit) {
5509 			printk(KERN_INFO "md: unit number in %s is bad: %d\n",
5510 			       bdevname(rdev0->bdev, b), rdev0->preferred_minor);
5511 			break;
5512 		}
5513 
5514 		md_probe(dev, NULL, NULL);
5515 		mddev = mddev_find(dev);
5516 		if (!mddev || !mddev->gendisk) {
5517 			if (mddev)
5518 				mddev_put(mddev);
5519 			printk(KERN_ERR
5520 				"md: cannot allocate memory for md drive.\n");
5521 			break;
5522 		}
		/*
		 * Three outcomes: the lock could not be taken, the array is
		 * already configured, or it is fresh and can be populated.
		 */
5523 		if (mddev_lock(mddev))
5524 			printk(KERN_WARNING "md: %s locked, cannot run\n",
5525 			       mdname(mddev));
5526 		else if (mddev->raid_disks || mddev->major_version
5527 			 || !list_empty(&mddev->disks)) {
5528 			printk(KERN_WARNING
5529 				"md: %s already running, cannot run %s\n",
5530 				mdname(mddev), bdevname(rdev0->bdev,b));
5531 			mddev_unlock(mddev);
5532 		} else {
5533 			printk(KERN_INFO "md: created %s\n", mdname(mddev));
5534 			mddev->persistent = 1;
5535 			rdev_for_each_list(rdev, tmp, &candidates) {
5536 				list_del_init(&rdev->same_set);
5537 				if (bind_rdev_to_array(rdev, mddev))
5538 					export_rdev(rdev);
5539 			}
5540 			autorun_array(mddev);
5541 			mddev_unlock(mddev);
5542 		}
5543 		/* on success, candidates will be empty, on error
5544 		 * it won't...
5545 		 */
5546 		rdev_for_each_list(rdev, tmp, &candidates) {
5547 			list_del_init(&rdev->same_set);
5548 			export_rdev(rdev);
5549 		}
5550 		mddev_put(mddev);
5551 	}
5552 	printk(KERN_INFO "md: ... autorun DONE.\n");
5553 }
5554 #endif /* !MODULE */
5555 
5556 static int get_version(void __user * arg)
5557 {
5558 	mdu_version_t ver;
5559 
5560 	ver.major = MD_MAJOR_VERSION;
5561 	ver.minor = MD_MINOR_VERSION;
5562 	ver.patchlevel = MD_PATCHLEVEL_VERSION;
5563 
5564 	if (copy_to_user(arg, &ver, sizeof(ver)))
5565 		return -EFAULT;
5566 
5567 	return 0;
5568 }
5569 
5570 static int get_array_info(struct mddev * mddev, void __user * arg)
5571 {
5572 	mdu_array_info_t info;
5573 	int nr,working,insync,failed,spare;
5574 	struct md_rdev *rdev;
5575 
5576 	nr = working = insync = failed = spare = 0;
5577 	rcu_read_lock();
5578 	rdev_for_each_rcu(rdev, mddev) {
5579 		nr++;
5580 		if (test_bit(Faulty, &rdev->flags))
5581 			failed++;
5582 		else {
5583 			working++;
5584 			if (test_bit(In_sync, &rdev->flags))
5585 				insync++;
5586 			else
5587 				spare++;
5588 		}
5589 	}
5590 	rcu_read_unlock();
5591 
5592 	info.major_version = mddev->major_version;
5593 	info.minor_version = mddev->minor_version;
5594 	info.patch_version = MD_PATCHLEVEL_VERSION;
5595 	info.ctime         = mddev->ctime;
5596 	info.level         = mddev->level;
5597 	info.size          = mddev->dev_sectors / 2;
5598 	if (info.size != mddev->dev_sectors / 2) /* overflow */
5599 		info.size = -1;
5600 	info.nr_disks      = nr;
5601 	info.raid_disks    = mddev->raid_disks;
5602 	info.md_minor      = mddev->md_minor;
5603 	info.not_persistent= !mddev->persistent;
5604 
5605 	info.utime         = mddev->utime;
5606 	info.state         = 0;
5607 	if (mddev->in_sync)
5608 		info.state = (1<<MD_SB_CLEAN);
5609 	if (mddev->bitmap && mddev->bitmap_info.offset)
5610 		info.state = (1<<MD_SB_BITMAP_PRESENT);
5611 	info.active_disks  = insync;
5612 	info.working_disks = working;
5613 	info.failed_disks  = failed;
5614 	info.spare_disks   = spare;
5615 
5616 	info.layout        = mddev->layout;
5617 	info.chunk_size    = mddev->chunk_sectors << 9;
5618 
5619 	if (copy_to_user(arg, &info, sizeof(info)))
5620 		return -EFAULT;
5621 
5622 	return 0;
5623 }
5624 
5625 static int get_bitmap_file(struct mddev * mddev, void __user * arg)
5626 {
5627 	mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
5628 	char *ptr, *buf = NULL;
5629 	int err = -ENOMEM;
5630 
5631 	if (md_allow_write(mddev))
5632 		file = kmalloc(sizeof(*file), GFP_NOIO);
5633 	else
5634 		file = kmalloc(sizeof(*file), GFP_KERNEL);
5635 
5636 	if (!file)
5637 		goto out;
5638 
5639 	/* bitmap disabled, zero the first byte and copy out */
5640 	if (!mddev->bitmap || !mddev->bitmap->storage.file) {
5641 		file->pathname[0] = '\0';
5642 		goto copy_out;
5643 	}
5644 
5645 	buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
5646 	if (!buf)
5647 		goto out;
5648 
5649 	ptr = d_path(&mddev->bitmap->storage.file->f_path,
5650 		     buf, sizeof(file->pathname));
5651 	if (IS_ERR(ptr))
5652 		goto out;
5653 
5654 	strcpy(file->pathname, ptr);
5655 
5656 copy_out:
5657 	err = 0;
5658 	if (copy_to_user(arg, file, sizeof(*file)))
5659 		err = -EFAULT;
5660 out:
5661 	kfree(buf);
5662 	kfree(file);
5663 	return err;
5664 }
5665 
5666 static int get_disk_info(struct mddev * mddev, void __user * arg)
5667 {
5668 	mdu_disk_info_t info;
5669 	struct md_rdev *rdev;
5670 
5671 	if (copy_from_user(&info, arg, sizeof(info)))
5672 		return -EFAULT;
5673 
5674 	rcu_read_lock();
5675 	rdev = find_rdev_nr_rcu(mddev, info.number);
5676 	if (rdev) {
5677 		info.major = MAJOR(rdev->bdev->bd_dev);
5678 		info.minor = MINOR(rdev->bdev->bd_dev);
5679 		info.raid_disk = rdev->raid_disk;
5680 		info.state = 0;
5681 		if (test_bit(Faulty, &rdev->flags))
5682 			info.state |= (1<<MD_DISK_FAULTY);
5683 		else if (test_bit(In_sync, &rdev->flags)) {
5684 			info.state |= (1<<MD_DISK_ACTIVE);
5685 			info.state |= (1<<MD_DISK_SYNC);
5686 		}
5687 		if (test_bit(WriteMostly, &rdev->flags))
5688 			info.state |= (1<<MD_DISK_WRITEMOSTLY);
5689 	} else {
5690 		info.major = info.minor = 0;
5691 		info.raid_disk = -1;
5692 		info.state = (1<<MD_DISK_REMOVED);
5693 	}
5694 	rcu_read_unlock();
5695 
5696 	if (copy_to_user(arg, &info, sizeof(info)))
5697 		return -EFAULT;
5698 
5699 	return 0;
5700 }
5701 
/*
 * add_new_disk - attach the device described by @info to @mddev.
 *
 * Three cases are handled, in this order:
 *
 *  1. mddev->raid_disks == 0: the device is imported with the array's
 *     superblock version, checked against the first device already on the
 *     array (if any) with ->load_super(), and bound to the array.
 *  2. mddev->pers is set: the device is added to a running array.  The
 *     personality must provide ->hot_add_disk.
 *  3. otherwise: only allowed for major_version 0.  Unless @info marks the
 *     device FAULTY, an rdev is imported without loading a superblock and
 *     set up from the fields of @info.
 *
 * Returns 0 on success.  Errors visible here: -EOVERFLOW when major/minor do
 * not survive the round trip through dev_t, -EINVAL for unsupported or
 * mismatching requests, or the error returned by md_import_device(),
 * bind_rdev_to_array() or ->hot_add_disk().
 */
5702 static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info)
5703 {
5704 	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
5705 	struct md_rdev *rdev;
5706 	dev_t dev = MKDEV(info->major,info->minor);
5707 
5708 	if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
5709 		return -EOVERFLOW;
5710 
	/* Case 1: raid_disks is still 0. */
5711 	if (!mddev->raid_disks) {
5712 		int err;
5713 		/* expecting a device which has a superblock */
5714 		rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
5715 		if (IS_ERR(rdev)) {
5716 			printk(KERN_WARNING
5717 				"md: md_import_device returned %ld\n",
5718 				PTR_ERR(rdev));
5719 			return PTR_ERR(rdev);
5720 		}
		/* Compare the new superblock with the first device's. */
5721 		if (!list_empty(&mddev->disks)) {
5722 			struct md_rdev *rdev0
5723 				= list_entry(mddev->disks.next,
5724 					     struct md_rdev, same_set);
5725 			err = super_types[mddev->major_version]
5726 				.load_super(rdev, rdev0, mddev->minor_version);
5727 			if (err < 0) {
5728 				printk(KERN_WARNING
5729 					"md: %s has different UUID to %s\n",
5730 					bdevname(rdev->bdev,b),
5731 					bdevname(rdev0->bdev,b2));
5732 				export_rdev(rdev);
5733 				return -EINVAL;
5734 			}
5735 		}
5736 		err = bind_rdev_to_array(rdev, mddev);
5737 		if (err)
5738 			export_rdev(rdev);
5739 		return err;
5740 	}
5741 
5742 	/*
5743 	 * add_new_disk can be used once the array is assembled
5744 	 * to add "hot spares".  They must already have a superblock
5745 	 * written
5746 	 */
5747 	if (mddev->pers) {
5748 		int err;
5749 		if (!mddev->pers->hot_add_disk) {
5750 			printk(KERN_WARNING
5751 				"%s: personality does not support diskops!\n",
5752 			       mdname(mddev));
5753 			return -EINVAL;
5754 		}
		/* Non-persistent arrays import with version -1/-1. */
5755 		if (mddev->persistent)
5756 			rdev = md_import_device(dev, mddev->major_version,
5757 						mddev->minor_version);
5758 		else
5759 			rdev = md_import_device(dev, -1, -1);
5760 		if (IS_ERR(rdev)) {
5761 			printk(KERN_WARNING
5762 				"md: md_import_device returned %ld\n",
5763 				PTR_ERR(rdev));
5764 			return PTR_ERR(rdev);
5765 		}
5766 		/* set saved_raid_disk if appropriate */
5767 		if (!mddev->persistent) {
5768 			if (info->state & (1<<MD_DISK_SYNC)  &&
5769 			    info->raid_disk < mddev->raid_disks) {
5770 				rdev->raid_disk = info->raid_disk;
5771 				set_bit(In_sync, &rdev->flags);
5772 			} else
5773 				rdev->raid_disk = -1;
5774 		} else
5775 			super_types[mddev->major_version].
5776 				validate_super(mddev, rdev);
5777 		if ((info->state & (1<<MD_DISK_SYNC)) &&
5778 		     rdev->raid_disk != info->raid_disk) {
5779 			/* This was a hot-add request, but events doesn't
5780 			 * match, so reject it.
5781 			 */
5782 			export_rdev(rdev);
5783 			return -EINVAL;
5784 		}
5785 
		/*
		 * Record the slot in saved_raid_disk while In_sync is still
		 * set; the flag and raid_disk are both reset just below.
		 */
5786 		if (test_bit(In_sync, &rdev->flags))
5787 			rdev->saved_raid_disk = rdev->raid_disk;
5788 		else
5789 			rdev->saved_raid_disk = -1;
5790 
5791 		clear_bit(In_sync, &rdev->flags); /* just to be sure */
5792 		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
5793 			set_bit(WriteMostly, &rdev->flags);
5794 		else
5795 			clear_bit(WriteMostly, &rdev->flags);
5796 
5797 		rdev->raid_disk = -1;
5798 		err = bind_rdev_to_array(rdev, mddev);
5799 		if (!err && !mddev->pers->hot_remove_disk) {
5800 			/* If there is hot_add_disk but no hot_remove_disk
5801 			 * then added disks for geometry changes,
5802 			 * and should be added immediately.
5803 			 */
5804 			super_types[mddev->major_version].
5805 				validate_super(mddev, rdev);
5806 			err = mddev->pers->hot_add_disk(mddev, rdev);
5807 			if (err)
5808 				unbind_rdev_from_array(rdev);
5809 		}
5810 		if (err)
5811 			export_rdev(rdev);
5812 		else
5813 			sysfs_notify_dirent_safe(rdev->sysfs_state);
5814 
		/* The flags below are set whether or not the add succeeded. */
5815 		set_bit(MD_CHANGE_DEVS, &mddev->flags);
5816 		if (mddev->degraded)
5817 			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5818 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5819 		if (!err)
5820 			md_new_event(mddev);
5821 		md_wakeup_thread(mddev->thread);
5822 		return err;
5823 	}
5824 
5825 	/* otherwise, add_new_disk is only allowed
5826 	 * for major_version==0 superblocks
5827 	 */
5828 	if (mddev->major_version != 0) {
5829 		printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
5830 		       mdname(mddev));
5831 		return -EINVAL;
5832 	}
5833 
	/* Case 3: a device marked FAULTY is skipped and 0 is returned. */
5834 	if (!(info->state & (1<<MD_DISK_FAULTY))) {
5835 		int err;
5836 		rdev = md_import_device(dev, -1, 0);
5837 		if (IS_ERR(rdev)) {
5838 			printk(KERN_WARNING
5839 				"md: error, md_import_device() returned %ld\n",
5840 				PTR_ERR(rdev));
5841 			return PTR_ERR(rdev);
5842 		}
5843 		rdev->desc_nr = info->number;
5844 		if (info->raid_disk < mddev->raid_disks)
5845 			rdev->raid_disk = info->raid_disk;
5846 		else
5847 			rdev->raid_disk = -1;
5848 
5849 		if (rdev->raid_disk < mddev->raid_disks)
5850 			if (info->state & (1<<MD_DISK_SYNC))
5851 				set_bit(In_sync, &rdev->flags);
5852 
5853 		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
5854 			set_bit(WriteMostly, &rdev->flags);
5855 
		/*
		 * sb_start: device size in 512-byte units when there is no
		 * persistent superblock, else from calc_dev_sboffset().
		 */
5856 		if (!mddev->persistent) {
5857 			printk(KERN_INFO "md: nonpersistent superblock ...\n");
5858 			rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
5859 		} else
5860 			rdev->sb_start = calc_dev_sboffset(rdev);
5861 		rdev->sectors = rdev->sb_start;
5862 
5863 		err = bind_rdev_to_array(rdev, mddev);
5864 		if (err) {
5865 			export_rdev(rdev);
5866 			return err;
5867 		}
5868 	}
5869 
5870 	return 0;
5871 }
5872 
5873 static int hot_remove_disk(struct mddev * mddev, dev_t dev)
5874 {
5875 	char b[BDEVNAME_SIZE];
5876 	struct md_rdev *rdev;
5877 
5878 	rdev = find_rdev(mddev, dev);
5879 	if (!rdev)
5880 		return -ENXIO;
5881 
5882 	clear_bit(Blocked, &rdev->flags);
5883 	remove_and_add_spares(mddev, rdev);
5884 
5885 	if (rdev->raid_disk >= 0)
5886 		goto busy;
5887 
5888 	kick_rdev_from_array(rdev);
5889 	md_update_sb(mddev, 1);
5890 	md_new_event(mddev);
5891 
5892 	return 0;
5893 busy:
5894 	printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
5895 		bdevname(rdev->bdev,b), mdname(mddev));
5896 	return -EBUSY;
5897 }
5898 
5899 static int hot_add_disk(struct mddev * mddev, dev_t dev)
5900 {
5901 	char b[BDEVNAME_SIZE];
5902 	int err;
5903 	struct md_rdev *rdev;
5904 
5905 	if (!mddev->pers)
5906 		return -ENODEV;
5907 
5908 	if (mddev->major_version != 0) {
5909 		printk(KERN_WARNING "%s: HOT_ADD may only be used with"
5910 			" version-0 superblocks.\n",
5911 			mdname(mddev));
5912 		return -EINVAL;
5913 	}
5914 	if (!mddev->pers->hot_add_disk) {
5915 		printk(KERN_WARNING
5916 			"%s: personality does not support diskops!\n",
5917 			mdname(mddev));
5918 		return -EINVAL;
5919 	}
5920 
5921 	rdev = md_import_device(dev, -1, 0);
5922 	if (IS_ERR(rdev)) {
5923 		printk(KERN_WARNING
5924 			"md: error, md_import_device() returned %ld\n",
5925 			PTR_ERR(rdev));
5926 		return -EINVAL;
5927 	}
5928 
5929 	if (mddev->persistent)
5930 		rdev->sb_start = calc_dev_sboffset(rdev);
5931 	else
5932 		rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
5933 
5934 	rdev->sectors = rdev->sb_start;
5935 
5936 	if (test_bit(Faulty, &rdev->flags)) {
5937 		printk(KERN_WARNING
5938 			"md: can not hot-add faulty %s disk to %s!\n",
5939 			bdevname(rdev->bdev,b), mdname(mddev));
5940 		err = -EINVAL;
5941 		goto abort_export;
5942 	}
5943 	clear_bit(In_sync, &rdev->flags);
5944 	rdev->desc_nr = -1;
5945 	rdev->saved_raid_disk = -1;
5946 	err = bind_rdev_to_array(rdev, mddev);
5947 	if (err)
5948 		goto abort_export;
5949 
5950 	/*
5951 	 * The rest should better be atomic, we can have disk failures
5952 	 * noticed in interrupt contexts ...
5953 	 */
5954 
5955 	rdev->raid_disk = -1;
5956 
5957 	md_update_sb(mddev, 1);
5958 
5959 	/*
5960 	 * Kick recovery, maybe this spare has to be added to the
5961 	 * array immediately.
5962 	 */
5963 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5964 	md_wakeup_thread(mddev->thread);
5965 	md_new_event(mddev);
5966 	return 0;
5967 
5968 abort_export:
5969 	export_rdev(rdev);
5970 	return err;
5971 }
5972 
5973 static int set_bitmap_file(struct mddev *mddev, int fd)
5974 {
5975 	int err;
5976 
5977 	if (mddev->pers) {
5978 		if (!mddev->pers->quiesce)
5979 			return -EBUSY;
5980 		if (mddev->recovery || mddev->sync_thread)
5981 			return -EBUSY;
5982 		/* we should be able to change the bitmap.. */
5983 	}
5984 
5985 
5986 	if (fd >= 0) {
5987 		if (mddev->bitmap)
5988 			return -EEXIST; /* cannot add when bitmap is present */
5989 		mddev->bitmap_info.file = fget(fd);
5990 
5991 		if (mddev->bitmap_info.file == NULL) {
5992 			printk(KERN_ERR "%s: error: failed to get bitmap file\n",
5993 			       mdname(mddev));
5994 			return -EBADF;
5995 		}
5996 
5997 		err = deny_bitmap_write_access(mddev->bitmap_info.file);
5998 		if (err) {
5999 			printk(KERN_ERR "%s: error: bitmap file is already in use\n",
6000 			       mdname(mddev));
6001 			fput(mddev->bitmap_info.file);
6002 			mddev->bitmap_info.file = NULL;
6003 			return err;
6004 		}
6005 		mddev->bitmap_info.offset = 0; /* file overrides offset */
6006 	} else if (mddev->bitmap == NULL)
6007 		return -ENOENT; /* cannot remove what isn't there */
6008 	err = 0;
6009 	if (mddev->pers) {
6010 		mddev->pers->quiesce(mddev, 1);
6011 		if (fd >= 0) {
6012 			err = bitmap_create(mddev);
6013 			if (!err)
6014 				err = bitmap_load(mddev);
6015 		}
6016 		if (fd < 0 || err) {
6017 			bitmap_destroy(mddev);
6018 			fd = -1; /* make sure to put the file */
6019 		}
6020 		mddev->pers->quiesce(mddev, 0);
6021 	}
6022 	if (fd < 0) {
6023 		if (mddev->bitmap_info.file) {
6024 			restore_bitmap_write_access(mddev->bitmap_info.file);
6025 			fput(mddev->bitmap_info.file);
6026 		}
6027 		mddev->bitmap_info.file = NULL;
6028 	}
6029 
6030 	return err;
6031 }
6032 
6033 /*
6034  * set_array_info is used two different ways
6035  * The original usage is when creating a new array.
6036  * In this usage, raid_disks is > 0 and it together with
6037  *  level, size, not_persistent,layout,chunksize determine the
6038  *  shape of the array.
6039  *  This will always create an array with a type-0.90.0 superblock.
6040  * The newer usage is when assembling an array.
6041  *  In this case raid_disks will be 0, and the major_version field is
6042  *  use to determine which style super-blocks are to be found on the devices.
6043  *  The minor and patch _version numbers are also kept incase the
6044  *  super_block handler wishes to interpret them.
6045  */
6046 static int set_array_info(struct mddev * mddev, mdu_array_info_t *info)
6047 {
6048 
6049 	if (info->raid_disks == 0) {
6050 		/* just setting version number for superblock loading */
6051 		if (info->major_version < 0 ||
6052 		    info->major_version >= ARRAY_SIZE(super_types) ||
6053 		    super_types[info->major_version].name == NULL) {
6054 			/* maybe try to auto-load a module? */
6055 			printk(KERN_INFO
6056 				"md: superblock version %d not known\n",
6057 				info->major_version);
6058 			return -EINVAL;
6059 		}
6060 		mddev->major_version = info->major_version;
6061 		mddev->minor_version = info->minor_version;
6062 		mddev->patch_version = info->patch_version;
6063 		mddev->persistent = !info->not_persistent;
6064 		/* ensure mddev_put doesn't delete this now that there
6065 		 * is some minimal configuration.
6066 		 */
6067 		mddev->ctime         = get_seconds();
6068 		return 0;
6069 	}
6070 	mddev->major_version = MD_MAJOR_VERSION;
6071 	mddev->minor_version = MD_MINOR_VERSION;
6072 	mddev->patch_version = MD_PATCHLEVEL_VERSION;
6073 	mddev->ctime         = get_seconds();
6074 
6075 	mddev->level         = info->level;
6076 	mddev->clevel[0]     = 0;
6077 	mddev->dev_sectors   = 2 * (sector_t)info->size;
6078 	mddev->raid_disks    = info->raid_disks;
6079 	/* don't set md_minor, it is determined by which /dev/md* was
6080 	 * openned
6081 	 */
6082 	if (info->state & (1<<MD_SB_CLEAN))
6083 		mddev->recovery_cp = MaxSector;
6084 	else
6085 		mddev->recovery_cp = 0;
6086 	mddev->persistent    = ! info->not_persistent;
6087 	mddev->external	     = 0;
6088 
6089 	mddev->layout        = info->layout;
6090 	mddev->chunk_sectors = info->chunk_size >> 9;
6091 
6092 	mddev->max_disks     = MD_SB_DISKS;
6093 
6094 	if (mddev->persistent)
6095 		mddev->flags         = 0;
6096 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
6097 
6098 	mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
6099 	mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
6100 	mddev->bitmap_info.offset = 0;
6101 
6102 	mddev->reshape_position = MaxSector;
6103 
6104 	/*
6105 	 * Generate a 128 bit UUID
6106 	 */
6107 	get_random_bytes(mddev->uuid, 16);
6108 
6109 	mddev->new_level = mddev->level;
6110 	mddev->new_chunk_sectors = mddev->chunk_sectors;
6111 	mddev->new_layout = mddev->layout;
6112 	mddev->delta_disks = 0;
6113 	mddev->reshape_backwards = 0;
6114 
6115 	return 0;
6116 }
6117 
6118 void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
6119 {
6120 	WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);
6121 
6122 	if (mddev->external_size)
6123 		return;
6124 
6125 	mddev->array_sectors = array_sectors;
6126 }
6127 EXPORT_SYMBOL(md_set_array_sectors);
6128 
6129 static int update_size(struct mddev *mddev, sector_t num_sectors)
6130 {
6131 	struct md_rdev *rdev;
6132 	int rv;
6133 	int fit = (num_sectors == 0);
6134 
6135 	if (mddev->pers->resize == NULL)
6136 		return -EINVAL;
6137 	/* The "num_sectors" is the number of sectors of each device that
6138 	 * is used.  This can only make sense for arrays with redundancy.
6139 	 * linear and raid0 always use whatever space is available. We can only
6140 	 * consider changing this number if no resync or reconstruction is
6141 	 * happening, and if the new size is acceptable. It must fit before the
6142 	 * sb_start or, if that is <data_offset, it must fit before the size
6143 	 * of each device.  If num_sectors is zero, we find the largest size
6144 	 * that fits.
6145 	 */
6146 	if (mddev->sync_thread)
6147 		return -EBUSY;
6148 
6149 	rdev_for_each(rdev, mddev) {
6150 		sector_t avail = rdev->sectors;
6151 
6152 		if (fit && (num_sectors == 0 || num_sectors > avail))
6153 			num_sectors = avail;
6154 		if (avail < num_sectors)
6155 			return -ENOSPC;
6156 	}
6157 	rv = mddev->pers->resize(mddev, num_sectors);
6158 	if (!rv)
6159 		revalidate_disk(mddev->gendisk);
6160 	return rv;
6161 }
6162 
6163 static int update_raid_disks(struct mddev *mddev, int raid_disks)
6164 {
6165 	int rv;
6166 	struct md_rdev *rdev;
6167 	/* change the number of raid disks */
6168 	if (mddev->pers->check_reshape == NULL)
6169 		return -EINVAL;
6170 	if (raid_disks <= 0 ||
6171 	    (mddev->max_disks && raid_disks >= mddev->max_disks))
6172 		return -EINVAL;
6173 	if (mddev->sync_thread || mddev->reshape_position != MaxSector)
6174 		return -EBUSY;
6175 
6176 	rdev_for_each(rdev, mddev) {
6177 		if (mddev->raid_disks < raid_disks &&
6178 		    rdev->data_offset < rdev->new_data_offset)
6179 			return -EINVAL;
6180 		if (mddev->raid_disks > raid_disks &&
6181 		    rdev->data_offset > rdev->new_data_offset)
6182 			return -EINVAL;
6183 	}
6184 
6185 	mddev->delta_disks = raid_disks - mddev->raid_disks;
6186 	if (mddev->delta_disks < 0)
6187 		mddev->reshape_backwards = 1;
6188 	else if (mddev->delta_disks > 0)
6189 		mddev->reshape_backwards = 0;
6190 
6191 	rv = mddev->pers->check_reshape(mddev);
6192 	if (rv < 0) {
6193 		mddev->delta_disks = 0;
6194 		mddev->reshape_backwards = 0;
6195 	}
6196 	return rv;
6197 }
6198 
6199 
6200 /*
6201  * update_array_info is used to change the configuration of an
6202  * on-line array.
6203  * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size
6204  * fields in the info are checked against the array.
6205  * Any differences that cannot be handled will cause an error.
6206  * Normally, only one change can be managed at a time.
6207  */
6208 static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
6209 {
6210 	int rv = 0;
6211 	int cnt = 0;
6212 	int state = 0;
6213 
6214 	/* calculate expected state,ignoring low bits */
6215 	if (mddev->bitmap && mddev->bitmap_info.offset)
6216 		state |= (1 << MD_SB_BITMAP_PRESENT);
6217 
6218 	if (mddev->major_version != info->major_version ||
6219 	    mddev->minor_version != info->minor_version ||
6220 /*	    mddev->patch_version != info->patch_version || */
6221 	    mddev->ctime         != info->ctime         ||
6222 	    mddev->level         != info->level         ||
6223 /*	    mddev->layout        != info->layout        || */
6224 	    !mddev->persistent	 != info->not_persistent||
6225 	    mddev->chunk_sectors != info->chunk_size >> 9 ||
6226 	    /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
6227 	    ((state^info->state) & 0xfffffe00)
6228 		)
6229 		return -EINVAL;
6230 	/* Check there is only one change */
6231 	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
6232 		cnt++;
6233 	if (mddev->raid_disks != info->raid_disks)
6234 		cnt++;
6235 	if (mddev->layout != info->layout)
6236 		cnt++;
6237 	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
6238 		cnt++;
6239 	if (cnt == 0)
6240 		return 0;
6241 	if (cnt > 1)
6242 		return -EINVAL;
6243 
6244 	if (mddev->layout != info->layout) {
6245 		/* Change layout
6246 		 * we don't need to do anything at the md level, the
6247 		 * personality will take care of it all.
6248 		 */
6249 		if (mddev->pers->check_reshape == NULL)
6250 			return -EINVAL;
6251 		else {
6252 			mddev->new_layout = info->layout;
6253 			rv = mddev->pers->check_reshape(mddev);
6254 			if (rv)
6255 				mddev->new_layout = mddev->layout;
6256 			return rv;
6257 		}
6258 	}
6259 	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
6260 		rv = update_size(mddev, (sector_t)info->size * 2);
6261 
6262 	if (mddev->raid_disks    != info->raid_disks)
6263 		rv = update_raid_disks(mddev, info->raid_disks);
6264 
6265 	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
6266 		if (mddev->pers->quiesce == NULL)
6267 			return -EINVAL;
6268 		if (mddev->recovery || mddev->sync_thread)
6269 			return -EBUSY;
6270 		if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
6271 			/* add the bitmap */
6272 			if (mddev->bitmap)
6273 				return -EEXIST;
6274 			if (mddev->bitmap_info.default_offset == 0)
6275 				return -EINVAL;
6276 			mddev->bitmap_info.offset =
6277 				mddev->bitmap_info.default_offset;
6278 			mddev->bitmap_info.space =
6279 				mddev->bitmap_info.default_space;
6280 			mddev->pers->quiesce(mddev, 1);
6281 			rv = bitmap_create(mddev);
6282 			if (!rv)
6283 				rv = bitmap_load(mddev);
6284 			if (rv)
6285 				bitmap_destroy(mddev);
6286 			mddev->pers->quiesce(mddev, 0);
6287 		} else {
6288 			/* remove the bitmap */
6289 			if (!mddev->bitmap)
6290 				return -ENOENT;
6291 			if (mddev->bitmap->storage.file)
6292 				return -EINVAL;
6293 			mddev->pers->quiesce(mddev, 1);
6294 			bitmap_destroy(mddev);
6295 			mddev->pers->quiesce(mddev, 0);
6296 			mddev->bitmap_info.offset = 0;
6297 		}
6298 	}
6299 	md_update_sb(mddev, 1);
6300 	return rv;
6301 }
6302 
6303 static int set_disk_faulty(struct mddev *mddev, dev_t dev)
6304 {
6305 	struct md_rdev *rdev;
6306 	int err = 0;
6307 
6308 	if (mddev->pers == NULL)
6309 		return -ENODEV;
6310 
6311 	rcu_read_lock();
6312 	rdev = find_rdev_rcu(mddev, dev);
6313 	if (!rdev)
6314 		err =  -ENODEV;
6315 	else {
6316 		md_error(mddev, rdev);
6317 		if (!test_bit(Faulty, &rdev->flags))
6318 			err = -EBUSY;
6319 	}
6320 	rcu_read_unlock();
6321 	return err;
6322 }
6323 
6324 /*
6325  * We have a problem here : there is no easy way to give a CHS
6326  * virtual geometry. We currently pretend that we have a 2 heads
6327  * 4 sectors (with a BIG number of cylinders...). This drives
6328  * dosfs just mad... ;-)
6329  */
6330 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
6331 {
6332 	struct mddev *mddev = bdev->bd_disk->private_data;
6333 
6334 	geo->heads = 2;
6335 	geo->sectors = 4;
6336 	geo->cylinders = mddev->array_sectors / 8;
6337 	return 0;
6338 }
6339 
6340 static int md_ioctl(struct block_device *bdev, fmode_t mode,
6341 			unsigned int cmd, unsigned long arg)
6342 {
6343 	int err = 0;
6344 	void __user *argp = (void __user *)arg;
6345 	struct mddev *mddev = NULL;
6346 	int ro;
6347 
6348 	switch (cmd) {
6349 	case RAID_VERSION:
6350 	case GET_ARRAY_INFO:
6351 	case GET_DISK_INFO:
6352 		break;
6353 	default:
6354 		if (!capable(CAP_SYS_ADMIN))
6355 			return -EACCES;
6356 	}
6357 
6358 	/*
6359 	 * Commands dealing with the RAID driver but not any
6360 	 * particular array:
6361 	 */
6362 	switch (cmd) {
6363 	case RAID_VERSION:
6364 		err = get_version(argp);
6365 		goto done;
6366 
6367 	case PRINT_RAID_DEBUG:
6368 		err = 0;
6369 		md_print_devices();
6370 		goto done;
6371 
6372 #ifndef MODULE
6373 	case RAID_AUTORUN:
6374 		err = 0;
6375 		autostart_arrays(arg);
6376 		goto done;
6377 #endif
6378 	default:;
6379 	}
6380 
6381 	/*
6382 	 * Commands creating/starting a new array:
6383 	 */
6384 
6385 	mddev = bdev->bd_disk->private_data;
6386 
6387 	if (!mddev) {
6388 		BUG();
6389 		goto abort;
6390 	}
6391 
6392 	/* Some actions do not requires the mutex */
6393 	switch (cmd) {
6394 	case GET_ARRAY_INFO:
6395 		if (!mddev->raid_disks && !mddev->external)
6396 			err = -ENODEV;
6397 		else
6398 			err = get_array_info(mddev, argp);
6399 		goto abort;
6400 
6401 	case GET_DISK_INFO:
6402 		if (!mddev->raid_disks && !mddev->external)
6403 			err = -ENODEV;
6404 		else
6405 			err = get_disk_info(mddev, argp);
6406 		goto abort;
6407 
6408 	case SET_DISK_FAULTY:
6409 		err = set_disk_faulty(mddev, new_decode_dev(arg));
6410 		goto abort;
6411 	}
6412 
6413 	if (cmd == ADD_NEW_DISK)
6414 		/* need to ensure md_delayed_delete() has completed */
6415 		flush_workqueue(md_misc_wq);
6416 
6417 	err = mddev_lock(mddev);
6418 	if (err) {
6419 		printk(KERN_INFO
6420 			"md: ioctl lock interrupted, reason %d, cmd %d\n",
6421 			err, cmd);
6422 		goto abort;
6423 	}
6424 
6425 	if (cmd == SET_ARRAY_INFO) {
6426 		mdu_array_info_t info;
6427 		if (!arg)
6428 			memset(&info, 0, sizeof(info));
6429 		else if (copy_from_user(&info, argp, sizeof(info))) {
6430 			err = -EFAULT;
6431 			goto abort_unlock;
6432 		}
6433 		if (mddev->pers) {
6434 			err = update_array_info(mddev, &info);
6435 			if (err) {
6436 				printk(KERN_WARNING "md: couldn't update"
6437 				       " array info. %d\n", err);
6438 				goto abort_unlock;
6439 			}
6440 			goto done_unlock;
6441 		}
6442 		if (!list_empty(&mddev->disks)) {
6443 			printk(KERN_WARNING
6444 			       "md: array %s already has disks!\n",
6445 			       mdname(mddev));
6446 			err = -EBUSY;
6447 			goto abort_unlock;
6448 		}
6449 		if (mddev->raid_disks) {
6450 			printk(KERN_WARNING
6451 			       "md: array %s already initialised!\n",
6452 			       mdname(mddev));
6453 			err = -EBUSY;
6454 			goto abort_unlock;
6455 		}
6456 		err = set_array_info(mddev, &info);
6457 		if (err) {
6458 			printk(KERN_WARNING "md: couldn't set"
6459 			       " array info. %d\n", err);
6460 			goto abort_unlock;
6461 		}
6462 		goto done_unlock;
6463 	}
6464 
6465 	/*
6466 	 * Commands querying/configuring an existing array:
6467 	 */
6468 	/* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
6469 	 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
6470 	if ((!mddev->raid_disks && !mddev->external)
6471 	    && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
6472 	    && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
6473 	    && cmd != GET_BITMAP_FILE) {
6474 		err = -ENODEV;
6475 		goto abort_unlock;
6476 	}
6477 
6478 	/*
6479 	 * Commands even a read-only array can execute:
6480 	 */
6481 	switch (cmd) {
6482 	case GET_BITMAP_FILE:
6483 		err = get_bitmap_file(mddev, argp);
6484 		goto done_unlock;
6485 
6486 	case RESTART_ARRAY_RW:
6487 		err = restart_array(mddev);
6488 		goto done_unlock;
6489 
6490 	case STOP_ARRAY:
6491 		err = do_md_stop(mddev, 0, bdev);
6492 		goto done_unlock;
6493 
6494 	case STOP_ARRAY_RO:
6495 		err = md_set_readonly(mddev, bdev);
6496 		goto done_unlock;
6497 
6498 	case HOT_REMOVE_DISK:
6499 		err = hot_remove_disk(mddev, new_decode_dev(arg));
6500 		goto done_unlock;
6501 
6502 	case ADD_NEW_DISK:
6503 		/* We can support ADD_NEW_DISK on read-only arrays
6504 		 * on if we are re-adding a preexisting device.
6505 		 * So require mddev->pers and MD_DISK_SYNC.
6506 		 */
6507 		if (mddev->pers) {
6508 			mdu_disk_info_t info;
6509 			if (copy_from_user(&info, argp, sizeof(info)))
6510 				err = -EFAULT;
6511 			else if (!(info.state & (1<<MD_DISK_SYNC)))
6512 				/* Need to clear read-only for this */
6513 				break;
6514 			else
6515 				err = add_new_disk(mddev, &info);
6516 			goto done_unlock;
6517 		}
6518 		break;
6519 
6520 	case BLKROSET:
6521 		if (get_user(ro, (int __user *)(arg))) {
6522 			err = -EFAULT;
6523 			goto done_unlock;
6524 		}
6525 		err = -EINVAL;
6526 
6527 		/* if the bdev is going readonly the value of mddev->ro
6528 		 * does not matter, no writes are coming
6529 		 */
6530 		if (ro)
6531 			goto done_unlock;
6532 
6533 		/* are we are already prepared for writes? */
6534 		if (mddev->ro != 1)
6535 			goto done_unlock;
6536 
6537 		/* transitioning to readauto need only happen for
6538 		 * arrays that call md_write_start
6539 		 */
6540 		if (mddev->pers) {
6541 			err = restart_array(mddev);
6542 			if (err == 0) {
6543 				mddev->ro = 2;
6544 				set_disk_ro(mddev->gendisk, 0);
6545 			}
6546 		}
6547 		goto done_unlock;
6548 	}
6549 
6550 	/*
6551 	 * The remaining ioctls are changing the state of the
6552 	 * superblock, so we do not allow them on read-only arrays.
6553 	 * However non-MD ioctls (e.g. get-size) will still come through
6554 	 * here and hit the 'default' below, so only disallow
6555 	 * 'md' ioctls, and switch to rw mode if started auto-readonly.
6556 	 */
6557 	if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) {
6558 		if (mddev->ro == 2) {
6559 			mddev->ro = 0;
6560 			sysfs_notify_dirent_safe(mddev->sysfs_state);
6561 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6562 			/* mddev_unlock will wake thread */
6563 			/* If a device failed while we were read-only, we
6564 			 * need to make sure the metadata is updated now.
6565 			 */
6566 			if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
6567 				mddev_unlock(mddev);
6568 				wait_event(mddev->sb_wait,
6569 					   !test_bit(MD_CHANGE_DEVS, &mddev->flags) &&
6570 					   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
6571 				mddev_lock(mddev);
6572 			}
6573 		} else {
6574 			err = -EROFS;
6575 			goto abort_unlock;
6576 		}
6577 	}
6578 
6579 	switch (cmd) {
6580 	case ADD_NEW_DISK:
6581 	{
6582 		mdu_disk_info_t info;
6583 		if (copy_from_user(&info, argp, sizeof(info)))
6584 			err = -EFAULT;
6585 		else
6586 			err = add_new_disk(mddev, &info);
6587 		goto done_unlock;
6588 	}
6589 
6590 	case HOT_ADD_DISK:
6591 		err = hot_add_disk(mddev, new_decode_dev(arg));
6592 		goto done_unlock;
6593 
6594 	case RUN_ARRAY:
6595 		err = do_md_run(mddev);
6596 		goto done_unlock;
6597 
6598 	case SET_BITMAP_FILE:
6599 		err = set_bitmap_file(mddev, (int)arg);
6600 		goto done_unlock;
6601 
6602 	default:
6603 		err = -EINVAL;
6604 		goto abort_unlock;
6605 	}
6606 
6607 done_unlock:
6608 abort_unlock:
6609 	if (mddev->hold_active == UNTIL_IOCTL &&
6610 	    err != -EINVAL)
6611 		mddev->hold_active = 0;
6612 	mddev_unlock(mddev);
6613 
6614 	return err;
6615 done:
6616 	if (err)
6617 		MD_BUG();
6618 abort:
6619 	return err;
6620 }
#ifdef CONFIG_COMPAT
/*
 * md_compat_ioctl() - compat-mode wrapper around md_ioctl().
 *
 * HOT_REMOVE_DISK, HOT_ADD_DISK, SET_DISK_FAULTY and SET_BITMAP_FILE take
 * an integer argument that is passed through unchanged; for every other
 * command the argument is converted with compat_ptr() first.
 */
static int md_compat_ioctl(struct block_device *bdev, fmode_t mode,
		    unsigned int cmd, unsigned long arg)
{
	int integer_arg = (cmd == HOT_REMOVE_DISK ||
			   cmd == HOT_ADD_DISK ||
			   cmd == SET_DISK_FAULTY ||
			   cmd == SET_BITMAP_FILE);

	if (!integer_arg)
		arg = (unsigned long)compat_ptr(arg);

	return md_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */
6640 
6641 static int md_open(struct block_device *bdev, fmode_t mode)
6642 {
6643 	/*
6644 	 * Succeed if we can lock the mddev, which confirms that
6645 	 * it isn't being stopped right now.
6646 	 */
6647 	struct mddev *mddev = mddev_find(bdev->bd_dev);
6648 	int err;
6649 
6650 	if (!mddev)
6651 		return -ENODEV;
6652 
6653 	if (mddev->gendisk != bdev->bd_disk) {
6654 		/* we are racing with mddev_put which is discarding this
6655 		 * bd_disk.
6656 		 */
6657 		mddev_put(mddev);
6658 		/* Wait until bdev->bd_disk is definitely gone */
6659 		flush_workqueue(md_misc_wq);
6660 		/* Then retry the open from the top */
6661 		return -ERESTARTSYS;
6662 	}
6663 	BUG_ON(mddev != bdev->bd_disk->private_data);
6664 
6665 	if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
6666 		goto out;
6667 
6668 	err = 0;
6669 	atomic_inc(&mddev->openers);
6670 	mutex_unlock(&mddev->open_mutex);
6671 
6672 	check_disk_change(bdev);
6673  out:
6674 	return err;
6675 }
6676 
6677 static int md_release(struct gendisk *disk, fmode_t mode)
6678 {
6679  	struct mddev *mddev = disk->private_data;
6680 
6681 	BUG_ON(!mddev);
6682 	atomic_dec(&mddev->openers);
6683 	mddev_put(mddev);
6684 
6685 	return 0;
6686 }
6687 
6688 static int md_media_changed(struct gendisk *disk)
6689 {
6690 	struct mddev *mddev = disk->private_data;
6691 
6692 	return mddev->changed;
6693 }
6694 
6695 static int md_revalidate(struct gendisk *disk)
6696 {
6697 	struct mddev *mddev = disk->private_data;
6698 
6699 	mddev->changed = 0;
6700 	return 0;
6701 }
6702 static const struct block_device_operations md_fops =
6703 {
6704 	.owner		= THIS_MODULE,
6705 	.open		= md_open,
6706 	.release	= md_release,
6707 	.ioctl		= md_ioctl,
6708 #ifdef CONFIG_COMPAT
6709 	.compat_ioctl	= md_compat_ioctl,
6710 #endif
6711 	.getgeo		= md_getgeo,
6712 	.media_changed  = md_media_changed,
6713 	.revalidate_disk= md_revalidate,
6714 };
6715 
6716 static int md_thread(void * arg)
6717 {
6718 	struct md_thread *thread = arg;
6719 
6720 	/*
6721 	 * md_thread is a 'system-thread', it's priority should be very
6722 	 * high. We avoid resource deadlocks individually in each
6723 	 * raid personality. (RAID5 does preallocation) We also use RR and
6724 	 * the very same RT priority as kswapd, thus we will never get
6725 	 * into a priority inversion deadlock.
6726 	 *
6727 	 * we definitely have to have equal or higher priority than
6728 	 * bdflush, otherwise bdflush will deadlock if there are too
6729 	 * many dirty RAID5 blocks.
6730 	 */
6731 
6732 	allow_signal(SIGKILL);
6733 	while (!kthread_should_stop()) {
6734 
6735 		/* We need to wait INTERRUPTIBLE so that
6736 		 * we don't add to the load-average.
6737 		 * That means we need to be sure no signals are
6738 		 * pending
6739 		 */
6740 		if (signal_pending(current))
6741 			flush_signals(current);
6742 
6743 		wait_event_interruptible_timeout
6744 			(thread->wqueue,
6745 			 test_bit(THREAD_WAKEUP, &thread->flags)
6746 			 || kthread_should_stop(),
6747 			 thread->timeout);
6748 
6749 		clear_bit(THREAD_WAKEUP, &thread->flags);
6750 		if (!kthread_should_stop())
6751 			thread->run(thread);
6752 	}
6753 
6754 	return 0;
6755 }
6756 
6757 void md_wakeup_thread(struct md_thread *thread)
6758 {
6759 	if (thread) {
6760 		pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
6761 		set_bit(THREAD_WAKEUP, &thread->flags);
6762 		wake_up(&thread->wqueue);
6763 	}
6764 }
6765 
6766 struct md_thread *md_register_thread(void (*run) (struct md_thread *),
6767 		struct mddev *mddev, const char *name)
6768 {
6769 	struct md_thread *thread;
6770 
6771 	thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL);
6772 	if (!thread)
6773 		return NULL;
6774 
6775 	init_waitqueue_head(&thread->wqueue);
6776 
6777 	thread->run = run;
6778 	thread->mddev = mddev;
6779 	thread->timeout = MAX_SCHEDULE_TIMEOUT;
6780 	thread->tsk = kthread_run(md_thread, thread,
6781 				  "%s_%s",
6782 				  mdname(thread->mddev),
6783 				  name);
6784 	if (IS_ERR(thread->tsk)) {
6785 		kfree(thread);
6786 		return NULL;
6787 	}
6788 	return thread;
6789 }
6790 
6791 void md_unregister_thread(struct md_thread **threadp)
6792 {
6793 	struct md_thread *thread = *threadp;
6794 	if (!thread)
6795 		return;
6796 	pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
6797 	/* Locking ensures that mddev_unlock does not wake_up a
6798 	 * non-existent thread
6799 	 */
6800 	spin_lock(&pers_lock);
6801 	*threadp = NULL;
6802 	spin_unlock(&pers_lock);
6803 
6804 	kthread_stop(thread->tsk);
6805 	kfree(thread);
6806 }
6807 
6808 void md_error(struct mddev *mddev, struct md_rdev *rdev)
6809 {
6810 	if (!mddev) {
6811 		MD_BUG();
6812 		return;
6813 	}
6814 
6815 	if (!rdev || test_bit(Faulty, &rdev->flags))
6816 		return;
6817 
6818 	if (!mddev->pers || !mddev->pers->error_handler)
6819 		return;
6820 	mddev->pers->error_handler(mddev,rdev);
6821 	if (mddev->degraded)
6822 		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6823 	sysfs_notify_dirent_safe(rdev->sysfs_state);
6824 	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6825 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6826 	md_wakeup_thread(mddev->thread);
6827 	if (mddev->event_work.func)
6828 		queue_work(md_misc_wq, &mddev->event_work);
6829 	md_new_event_inintr(mddev);
6830 }
6831 
6832 /* seq_file implementation /proc/mdstat */
6833 
6834 static void status_unused(struct seq_file *seq)
6835 {
6836 	int i = 0;
6837 	struct md_rdev *rdev;
6838 
6839 	seq_printf(seq, "unused devices: ");
6840 
6841 	list_for_each_entry(rdev, &pending_raid_disks, same_set) {
6842 		char b[BDEVNAME_SIZE];
6843 		i++;
6844 		seq_printf(seq, "%s ",
6845 			      bdevname(rdev->bdev,b));
6846 	}
6847 	if (!i)
6848 		seq_printf(seq, "<none>");
6849 
6850 	seq_printf(seq, "\n");
6851 }
6852 
6853 
6854 static void status_resync(struct seq_file *seq, struct mddev * mddev)
6855 {
6856 	sector_t max_sectors, resync, res;
6857 	unsigned long dt, db;
6858 	sector_t rt;
6859 	int scale;
6860 	unsigned int per_milli;
6861 
6862 	if (mddev->curr_resync <= 3)
6863 		resync = 0;
6864 	else
6865 		resync = mddev->curr_resync
6866 			- atomic_read(&mddev->recovery_active);
6867 
6868 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
6869 	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6870 		max_sectors = mddev->resync_max_sectors;
6871 	else
6872 		max_sectors = mddev->dev_sectors;
6873 
6874 	/*
6875 	 * Should not happen.
6876 	 */
6877 	if (!max_sectors) {
6878 		MD_BUG();
6879 		return;
6880 	}
6881 	/* Pick 'scale' such that (resync>>scale)*1000 will fit
6882 	 * in a sector_t, and (max_sectors>>scale) will fit in a
6883 	 * u32, as those are the requirements for sector_div.
6884 	 * Thus 'scale' must be at least 10
6885 	 */
6886 	scale = 10;
6887 	if (sizeof(sector_t) > sizeof(unsigned long)) {
6888 		while ( max_sectors/2 > (1ULL<<(scale+32)))
6889 			scale++;
6890 	}
6891 	res = (resync>>scale)*1000;
6892 	sector_div(res, (u32)((max_sectors>>scale)+1));
6893 
6894 	per_milli = res;
6895 	{
6896 		int i, x = per_milli/50, y = 20-x;
6897 		seq_printf(seq, "[");
6898 		for (i = 0; i < x; i++)
6899 			seq_printf(seq, "=");
6900 		seq_printf(seq, ">");
6901 		for (i = 0; i < y; i++)
6902 			seq_printf(seq, ".");
6903 		seq_printf(seq, "] ");
6904 	}
6905 	seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
6906 		   (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
6907 		    "reshape" :
6908 		    (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
6909 		     "check" :
6910 		     (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
6911 		      "resync" : "recovery"))),
6912 		   per_milli/10, per_milli % 10,
6913 		   (unsigned long long) resync/2,
6914 		   (unsigned long long) max_sectors/2);
6915 
6916 	/*
6917 	 * dt: time from mark until now
6918 	 * db: blocks written from mark until now
6919 	 * rt: remaining time
6920 	 *
6921 	 * rt is a sector_t, so could be 32bit or 64bit.
6922 	 * So we divide before multiply in case it is 32bit and close
6923 	 * to the limit.
6924 	 * We scale the divisor (db) by 32 to avoid losing precision
6925 	 * near the end of resync when the number of remaining sectors
6926 	 * is close to 'db'.
6927 	 * We then divide rt by 32 after multiplying by db to compensate.
6928 	 * The '+1' avoids division by zero if db is very small.
6929 	 */
6930 	dt = ((jiffies - mddev->resync_mark) / HZ);
6931 	if (!dt) dt++;
6932 	db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
6933 		- mddev->resync_mark_cnt;
6934 
6935 	rt = max_sectors - resync;    /* number of remaining sectors */
6936 	sector_div(rt, db/32+1);
6937 	rt *= dt;
6938 	rt >>= 5;
6939 
6940 	seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
6941 		   ((unsigned long)rt % 60)/6);
6942 
6943 	seq_printf(seq, " speed=%ldK/sec", db/2/dt);
6944 }
6945 
6946 static void *md_seq_start(struct seq_file *seq, loff_t *pos)
6947 {
6948 	struct list_head *tmp;
6949 	loff_t l = *pos;
6950 	struct mddev *mddev;
6951 
6952 	if (l >= 0x10000)
6953 		return NULL;
6954 	if (!l--)
6955 		/* header */
6956 		return (void*)1;
6957 
6958 	spin_lock(&all_mddevs_lock);
6959 	list_for_each(tmp,&all_mddevs)
6960 		if (!l--) {
6961 			mddev = list_entry(tmp, struct mddev, all_mddevs);
6962 			mddev_get(mddev);
6963 			spin_unlock(&all_mddevs_lock);
6964 			return mddev;
6965 		}
6966 	spin_unlock(&all_mddevs_lock);
6967 	if (!l--)
6968 		return (void*)2;/* tail */
6969 	return NULL;
6970 }
6971 
6972 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
6973 {
6974 	struct list_head *tmp;
6975 	struct mddev *next_mddev, *mddev = v;
6976 
6977 	++*pos;
6978 	if (v == (void*)2)
6979 		return NULL;
6980 
6981 	spin_lock(&all_mddevs_lock);
6982 	if (v == (void*)1)
6983 		tmp = all_mddevs.next;
6984 	else
6985 		tmp = mddev->all_mddevs.next;
6986 	if (tmp != &all_mddevs)
6987 		next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs));
6988 	else {
6989 		next_mddev = (void*)2;
6990 		*pos = 0x10000;
6991 	}
6992 	spin_unlock(&all_mddevs_lock);
6993 
6994 	if (v != (void*)1)
6995 		mddev_put(mddev);
6996 	return next_mddev;
6997 
6998 }
6999 
7000 static void md_seq_stop(struct seq_file *seq, void *v)
7001 {
7002 	struct mddev *mddev = v;
7003 
7004 	if (mddev && v != (void*)1 && v != (void*)2)
7005 		mddev_put(mddev);
7006 }
7007 
7008 static int md_seq_show(struct seq_file *seq, void *v)
7009 {
7010 	struct mddev *mddev = v;
7011 	sector_t sectors;
7012 	struct md_rdev *rdev;
7013 
7014 	if (v == (void*)1) {
7015 		struct md_personality *pers;
7016 		seq_printf(seq, "Personalities : ");
7017 		spin_lock(&pers_lock);
7018 		list_for_each_entry(pers, &pers_list, list)
7019 			seq_printf(seq, "[%s] ", pers->name);
7020 
7021 		spin_unlock(&pers_lock);
7022 		seq_printf(seq, "\n");
7023 		seq->poll_event = atomic_read(&md_event_count);
7024 		return 0;
7025 	}
7026 	if (v == (void*)2) {
7027 		status_unused(seq);
7028 		return 0;
7029 	}
7030 
7031 	if (mddev_lock(mddev) < 0)
7032 		return -EINTR;
7033 
7034 	if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
7035 		seq_printf(seq, "%s : %sactive", mdname(mddev),
7036 						mddev->pers ? "" : "in");
7037 		if (mddev->pers) {
7038 			if (mddev->ro==1)
7039 				seq_printf(seq, " (read-only)");
7040 			if (mddev->ro==2)
7041 				seq_printf(seq, " (auto-read-only)");
7042 			seq_printf(seq, " %s", mddev->pers->name);
7043 		}
7044 
7045 		sectors = 0;
7046 		rdev_for_each(rdev, mddev) {
7047 			char b[BDEVNAME_SIZE];
7048 			seq_printf(seq, " %s[%d]",
7049 				bdevname(rdev->bdev,b), rdev->desc_nr);
7050 			if (test_bit(WriteMostly, &rdev->flags))
7051 				seq_printf(seq, "(W)");
7052 			if (test_bit(Faulty, &rdev->flags)) {
7053 				seq_printf(seq, "(F)");
7054 				continue;
7055 			}
7056 			if (rdev->raid_disk < 0)
7057 				seq_printf(seq, "(S)"); /* spare */
7058 			if (test_bit(Replacement, &rdev->flags))
7059 				seq_printf(seq, "(R)");
7060 			sectors += rdev->sectors;
7061 		}
7062 
7063 		if (!list_empty(&mddev->disks)) {
7064 			if (mddev->pers)
7065 				seq_printf(seq, "\n      %llu blocks",
7066 					   (unsigned long long)
7067 					   mddev->array_sectors / 2);
7068 			else
7069 				seq_printf(seq, "\n      %llu blocks",
7070 					   (unsigned long long)sectors / 2);
7071 		}
7072 		if (mddev->persistent) {
7073 			if (mddev->major_version != 0 ||
7074 			    mddev->minor_version != 90) {
7075 				seq_printf(seq," super %d.%d",
7076 					   mddev->major_version,
7077 					   mddev->minor_version);
7078 			}
7079 		} else if (mddev->external)
7080 			seq_printf(seq, " super external:%s",
7081 				   mddev->metadata_type);
7082 		else
7083 			seq_printf(seq, " super non-persistent");
7084 
7085 		if (mddev->pers) {
7086 			mddev->pers->status(seq, mddev);
7087 	 		seq_printf(seq, "\n      ");
7088 			if (mddev->pers->sync_request) {
7089 				if (mddev->curr_resync > 2) {
7090 					status_resync(seq, mddev);
7091 					seq_printf(seq, "\n      ");
7092 				} else if (mddev->curr_resync >= 1)
7093 					seq_printf(seq, "\tresync=DELAYED\n      ");
7094 				else if (mddev->recovery_cp < MaxSector)
7095 					seq_printf(seq, "\tresync=PENDING\n      ");
7096 			}
7097 		} else
7098 			seq_printf(seq, "\n       ");
7099 
7100 		bitmap_status(seq, mddev->bitmap);
7101 
7102 		seq_printf(seq, "\n");
7103 	}
7104 	mddev_unlock(mddev);
7105 
7106 	return 0;
7107 }
7108 
7109 static const struct seq_operations md_seq_ops = {
7110 	.start  = md_seq_start,
7111 	.next   = md_seq_next,
7112 	.stop   = md_seq_stop,
7113 	.show   = md_seq_show,
7114 };
7115 
7116 static int md_seq_open(struct inode *inode, struct file *file)
7117 {
7118 	struct seq_file *seq;
7119 	int error;
7120 
7121 	error = seq_open(file, &md_seq_ops);
7122 	if (error)
7123 		return error;
7124 
7125 	seq = file->private_data;
7126 	seq->poll_event = atomic_read(&md_event_count);
7127 	return error;
7128 }
7129 
7130 static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
7131 {
7132 	struct seq_file *seq = filp->private_data;
7133 	int mask;
7134 
7135 	poll_wait(filp, &md_event_waiters, wait);
7136 
7137 	/* always allow read */
7138 	mask = POLLIN | POLLRDNORM;
7139 
7140 	if (seq->poll_event != atomic_read(&md_event_count))
7141 		mask |= POLLERR | POLLPRI;
7142 	return mask;
7143 }
7144 
7145 static const struct file_operations md_seq_fops = {
7146 	.owner		= THIS_MODULE,
7147 	.open           = md_seq_open,
7148 	.read           = seq_read,
7149 	.llseek         = seq_lseek,
7150 	.release	= seq_release_private,
7151 	.poll		= mdstat_poll,
7152 };
7153 
7154 int register_md_personality(struct md_personality *p)
7155 {
7156 	spin_lock(&pers_lock);
7157 	list_add_tail(&p->list, &pers_list);
7158 	printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
7159 	spin_unlock(&pers_lock);
7160 	return 0;
7161 }
7162 
7163 int unregister_md_personality(struct md_personality *p)
7164 {
7165 	printk(KERN_INFO "md: %s personality unregistered\n", p->name);
7166 	spin_lock(&pers_lock);
7167 	list_del_init(&p->list);
7168 	spin_unlock(&pers_lock);
7169 	return 0;
7170 }
7171 
7172 static int is_mddev_idle(struct mddev *mddev, int init)
7173 {
7174 	struct md_rdev * rdev;
7175 	int idle;
7176 	int curr_events;
7177 
7178 	idle = 1;
7179 	rcu_read_lock();
7180 	rdev_for_each_rcu(rdev, mddev) {
7181 		struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
7182 		curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
7183 			      (int)part_stat_read(&disk->part0, sectors[1]) -
7184 			      atomic_read(&disk->sync_io);
7185 		/* sync IO will cause sync_io to increase before the disk_stats
7186 		 * as sync_io is counted when a request starts, and
7187 		 * disk_stats is counted when it completes.
7188 		 * So resync activity will cause curr_events to be smaller than
7189 		 * when there was no such activity.
7190 		 * non-sync IO will cause disk_stat to increase without
7191 		 * increasing sync_io so curr_events will (eventually)
7192 		 * be larger than it was before.  Once it becomes
7193 		 * substantially larger, the test below will cause
7194 		 * the array to appear non-idle, and resync will slow
7195 		 * down.
7196 		 * If there is a lot of outstanding resync activity when
7197 		 * we set last_event to curr_events, then all that activity
7198 		 * completing might cause the array to appear non-idle
7199 		 * and resync will be slowed down even though there might
7200 		 * not have been non-resync activity.  This will only
7201 		 * happen once though.  'last_events' will soon reflect
7202 		 * the state where there is little or no outstanding
7203 		 * resync requests, and further resync activity will
7204 		 * always make curr_events less than last_events.
7205 		 *
7206 		 */
7207 		if (init || curr_events - rdev->last_events > 64) {
7208 			rdev->last_events = curr_events;
7209 			idle = 0;
7210 		}
7211 	}
7212 	rcu_read_unlock();
7213 	return idle;
7214 }
7215 
7216 void md_done_sync(struct mddev *mddev, int blocks, int ok)
7217 {
7218 	/* another "blocks" (512byte) blocks have been synced */
7219 	atomic_sub(blocks, &mddev->recovery_active);
7220 	wake_up(&mddev->recovery_wait);
7221 	if (!ok) {
7222 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7223 		set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
7224 		md_wakeup_thread(mddev->thread);
7225 		// stop recovery, signal do_sync ....
7226 	}
7227 }
7228 
7229 
7230 /* md_write_start(mddev, bi)
7231  * If we need to update some array metadata (e.g. 'active' flag
7232  * in superblock) before writing, schedule a superblock update
7233  * and wait for it to complete.
7234  */
7235 void md_write_start(struct mddev *mddev, struct bio *bi)
7236 {
7237 	int did_change = 0;
7238 	if (bio_data_dir(bi) != WRITE)
7239 		return;
7240 
7241 	BUG_ON(mddev->ro == 1);
7242 	if (mddev->ro == 2) {
7243 		/* need to switch to read/write */
7244 		mddev->ro = 0;
7245 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7246 		md_wakeup_thread(mddev->thread);
7247 		md_wakeup_thread(mddev->sync_thread);
7248 		did_change = 1;
7249 	}
7250 	atomic_inc(&mddev->writes_pending);
7251 	if (mddev->safemode == 1)
7252 		mddev->safemode = 0;
7253 	if (mddev->in_sync) {
7254 		spin_lock_irq(&mddev->write_lock);
7255 		if (mddev->in_sync) {
7256 			mddev->in_sync = 0;
7257 			set_bit(MD_CHANGE_CLEAN, &mddev->flags);
7258 			set_bit(MD_CHANGE_PENDING, &mddev->flags);
7259 			md_wakeup_thread(mddev->thread);
7260 			did_change = 1;
7261 		}
7262 		spin_unlock_irq(&mddev->write_lock);
7263 	}
7264 	if (did_change)
7265 		sysfs_notify_dirent_safe(mddev->sysfs_state);
7266 	wait_event(mddev->sb_wait,
7267 		   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
7268 }
7269 
7270 void md_write_end(struct mddev *mddev)
7271 {
7272 	if (atomic_dec_and_test(&mddev->writes_pending)) {
7273 		if (mddev->safemode == 2)
7274 			md_wakeup_thread(mddev->thread);
7275 		else if (mddev->safemode_delay)
7276 			mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
7277 	}
7278 }
7279 
7280 /* md_allow_write(mddev)
7281  * Calling this ensures that the array is marked 'active' so that writes
7282  * may proceed without blocking.  It is important to call this before
7283  * attempting a GFP_KERNEL allocation while holding the mddev lock.
7284  * Must be called with mddev_lock held.
7285  *
7286  * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
7287  * is dropped, so return -EAGAIN after notifying userspace.
7288  */
7289 int md_allow_write(struct mddev *mddev)
7290 {
7291 	if (!mddev->pers)
7292 		return 0;
7293 	if (mddev->ro)
7294 		return 0;
7295 	if (!mddev->pers->sync_request)
7296 		return 0;
7297 
7298 	spin_lock_irq(&mddev->write_lock);
7299 	if (mddev->in_sync) {
7300 		mddev->in_sync = 0;
7301 		set_bit(MD_CHANGE_CLEAN, &mddev->flags);
7302 		set_bit(MD_CHANGE_PENDING, &mddev->flags);
7303 		if (mddev->safemode_delay &&
7304 		    mddev->safemode == 0)
7305 			mddev->safemode = 1;
7306 		spin_unlock_irq(&mddev->write_lock);
7307 		md_update_sb(mddev, 0);
7308 		sysfs_notify_dirent_safe(mddev->sysfs_state);
7309 	} else
7310 		spin_unlock_irq(&mddev->write_lock);
7311 
7312 	if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
7313 		return -EAGAIN;
7314 	else
7315 		return 0;
7316 }
7317 EXPORT_SYMBOL_GPL(md_allow_write);
7318 
/*
 * Progress-tracking parameters used by md_do_sync():
 *  SYNC_MARKS       - number of (time, sector count) samples kept in the
 *                     mark[] / mark_cnt[] ring
 *  SYNC_MARK_STEP   - the ring is stepped once the newest sample is at
 *                     least this many jiffies old
 *  UPDATE_FREQUENCY - for anything but a reshape, curr_resync_completed is
 *                     refreshed at least this often (in jiffies)
 */
7319 #define SYNC_MARKS	10
7320 #define	SYNC_MARK_STEP	(3*HZ)
7321 #define UPDATE_FREQUENCY (5*60*HZ)
/*
 * md_do_sync() - body of the "resync" thread started by md_check_recovery().
 * @thread: the md thread this function runs in; thread->mddev is the array
 *
 * Performs a resync, data-check, reshape or recovery (which one is decided
 * by the MD_RECOVERY_* bits in mddev->recovery) by repeatedly calling
 * mddev->pers->sync_request() from the chosen start sector up to
 * max_sectors.  Before starting it waits for conflicting syncs on arrays
 * that share physical units; while running it throttles itself using
 * speed_min()/speed_max() and is_mddev_idle().
 *
 * On every exit path past the initial checks it resets curr_resync to 0,
 * sets MD_RECOVERY_DONE and wakes mddev->thread.
 */
7322 void md_do_sync(struct md_thread *thread)
7323 {
7324 	struct mddev *mddev = thread->mddev;
7325 	struct mddev *mddev2;
7326 	unsigned int currspeed = 0,
7327 		 window;
7328 	sector_t max_sectors,j, io_sectors;
7329 	unsigned long mark[SYNC_MARKS];
7330 	unsigned long update_time;
7331 	sector_t mark_cnt[SYNC_MARKS];
7332 	int last_mark,m;
7333 	struct list_head *tmp;
7334 	sector_t last_check;
7335 	int skipped = 0;
7336 	struct md_rdev *rdev;
7337 	char *desc;
7338 	struct blk_plug plug;
7339 
7340 	/* just in case the thread restarts... */
7341 	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
7342 		return;
7343 	if (mddev->ro) /* never try to sync a read-only array */
7344 		return;
7345 
	/* 'desc' is only used in log messages */
7346 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
7347 		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
7348 			desc = "data-check";
7349 		else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
7350 			desc = "requested-resync";
7351 		else
7352 			desc = "resync";
7353 	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
7354 		desc = "reshape";
7355 	else
7356 		desc = "recovery";
7357 
7358 	/* we overload curr_resync somewhat here.
7359 	 * 0 == not engaged in resync at all
7360 	 * 2 == checking that there is no conflict with another sync
7361 	 * 1 == like 2, but have yielded to allow conflicting resync to
7362 	 *		commence
7363 	 * other == active in resync - this many blocks
7364 	 *
7365 	 * Before starting a resync we must have set curr_resync to
7366 	 * 2, and then checked that every "conflicting" array has curr_resync
7367 	 * less than ours.  When we find one that is the same or higher
7368 	 * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
7369 	 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
7370 	 * This will mean we have to start checking from the beginning again.
7371 	 *
7372 	 */
7373 
7374 	do {
7375 		mddev->curr_resync = 2;
7376 
7377 	try_again:
7378 		if (kthread_should_stop())
7379 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7380 
		/* interrupted before any I/O was issued: skip the wind-down */
7381 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7382 			goto skip;
7383 		for_each_mddev(mddev2, tmp) {
7384 			if (mddev2 == mddev)
7385 				continue;
7386 			if (!mddev->parallel_resync
7387 			&&  mddev2->curr_resync
7388 			&&  match_mddev_units(mddev, mddev2)) {
7389 				DEFINE_WAIT(wq);
7390 				if (mddev < mddev2 && mddev->curr_resync == 2) {
7391 					/* arbitrarily yield */
7392 					mddev->curr_resync = 1;
7393 					wake_up(&resync_wait);
7394 				}
7395 				if (mddev > mddev2 && mddev->curr_resync == 1)
7396 					/* no need to wait here, we can wait the next
7397 					 * time 'round when curr_resync == 2
7398 					 */
7399 					continue;
7400 				/* We need to wait 'interruptible' so as not to
7401 				 * contribute to the load average, and not to
7402 				 * be caught by 'softlockup'
7403 				 */
7404 				prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
7405 				if (!kthread_should_stop() &&
7406 				    mddev2->curr_resync >= mddev->curr_resync) {
7407 					printk(KERN_INFO "md: delaying %s of %s"
7408 					       " until %s has finished (they"
7409 					       " share one or more physical units)\n",
7410 					       desc, mdname(mddev), mdname(mddev2));
					/* NOTE(review): presumably balances the reference
					 * for_each_mddev() holds on mddev2, as the loop is
					 * left early via the goto below - confirm against
					 * the macro definition.
					 */
7411 					mddev_put(mddev2);
7412 					if (signal_pending(current))
7413 						flush_signals(current);
7414 					schedule();
7415 					finish_wait(&resync_wait, &wq);
7416 					goto try_again;
7417 				}
7418 				finish_wait(&resync_wait, &wq);
7419 			}
7420 		}
7421 	} while (mddev->curr_resync < 2);
7422 
	/* choose the start sector 'j' and the end 'max_sectors' for this
	 * kind of sync
	 */
7423 	j = 0;
7424 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
7425 		/* resync follows the size requested by the personality,
7426 		 * which defaults to physical size, but can be virtual size
7427 		 */
7428 		max_sectors = mddev->resync_max_sectors;
7429 		atomic64_set(&mddev->resync_mismatches, 0);
7430 		/* we don't use the checkpoint if there's a bitmap */
7431 		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
7432 			j = mddev->resync_min;
7433 		else if (!mddev->bitmap)
7434 			j = mddev->recovery_cp;
7435 
7436 	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
7437 		max_sectors = mddev->resync_max_sectors;
7438 	else {
7439 		/* recovery follows the physical size of devices */
7440 		max_sectors = mddev->dev_sectors;
		/* start from the smallest recovery_offset of any active,
		 * non-faulty device that is not yet In_sync
		 */
7441 		j = MaxSector;
7442 		rcu_read_lock();
7443 		rdev_for_each_rcu(rdev, mddev)
7444 			if (rdev->raid_disk >= 0 &&
7445 			    !test_bit(Faulty, &rdev->flags) &&
7446 			    !test_bit(In_sync, &rdev->flags) &&
7447 			    rdev->recovery_offset < j)
7448 				j = rdev->recovery_offset;
7449 		rcu_read_unlock();
7450 	}
7451 
7452 	printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
7453 	printk(KERN_INFO "md: minimum _guaranteed_  speed:"
7454 		" %d KB/sec/disk.\n", speed_min(mddev));
7455 	printk(KERN_INFO "md: using maximum available idle IO bandwidth "
7456 	       "(but not more than %d KB/sec) for %s.\n",
7457 	       speed_max(mddev), desc);
7458 
7459 	is_mddev_idle(mddev, 1); /* this initializes IO event counters */
7460 
	/* seed every progress sample with "now, nothing done yet" */
7461 	io_sectors = 0;
7462 	for (m = 0; m < SYNC_MARKS; m++) {
7463 		mark[m] = jiffies;
7464 		mark_cnt[m] = io_sectors;
7465 	}
7466 	last_mark = 0;
7467 	mddev->resync_mark = mark[last_mark];
7468 	mddev->resync_mark_cnt = mark_cnt[last_mark];
7469 
7470 	/*
7471 	 * Tune reconstruction:
7472 	 */
	/* 'window' is in sectors: the speed check below runs once per
	 * 'window' sectors of real I/O
	 */
7473 	window = 32*(PAGE_SIZE/512);
7474 	printk(KERN_INFO "md: using %dk window, over a total of %lluk.\n",
7475 		window/2, (unsigned long long)max_sectors/2);
7476 
7477 	atomic_set(&mddev->recovery_active, 0);
7478 	last_check = 0;
7479 
	/* curr_resync values 0..2 are reserved (see above), so anything
	 * not past 2 is published as 3
	 */
7480 	if (j>2) {
7481 		printk(KERN_INFO
7482 		       "md: resuming %s of %s from checkpoint.\n",
7483 		       desc, mdname(mddev));
7484 		mddev->curr_resync = j;
7485 	} else
7486 		mddev->curr_resync = 3; /* no longer delayed */
7487 	mddev->curr_resync_completed = j;
7488 	sysfs_notify(&mddev->kobj, NULL, "sync_completed");
7489 	md_new_event(mddev);
7490 	update_time = jiffies;
7491 
7492 	blk_start_plug(&plug);
7493 	while (j < max_sectors) {
7494 		sector_t sectors;
7495 
7496 		skipped = 0;
7497 
7498 		if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
7499 		    ((mddev->curr_resync > mddev->curr_resync_completed &&
7500 		      (mddev->curr_resync - mddev->curr_resync_completed)
7501 		      > (max_sectors >> 4)) ||
7502 		     time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
7503 		     (j - mddev->curr_resync_completed)*2
7504 		     >= mddev->resync_max - mddev->curr_resync_completed
7505 			    )) {
7506 			/* time to update curr_resync_completed */
			/* wait for all issued sync I/O to drain first */
7507 			wait_event(mddev->recovery_wait,
7508 				   atomic_read(&mddev->recovery_active) == 0);
7509 			mddev->curr_resync_completed = j;
7510 			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
7511 			    j > mddev->recovery_cp)
7512 				mddev->recovery_cp = j;
7513 			update_time = jiffies;
7514 			set_bit(MD_CHANGE_CLEAN, &mddev->flags);
7515 			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
7516 		}
7517 
7518 		while (j >= mddev->resync_max && !kthread_should_stop()) {
7519 			/* As this condition is controlled by user-space,
7520 			 * we can block indefinitely, so use '_interruptible'
7521 			 * to avoid triggering warnings.
7522 			 */
7523 			flush_signals(current); /* just in case */
7524 			wait_event_interruptible(mddev->recovery_wait,
7525 						 mddev->resync_max > j
7526 						 || kthread_should_stop());
7527 		}
7528 
7529 		if (kthread_should_stop())
7530 			goto interrupted;
7531 
		/* the last argument is non-zero while we are below the
		 * guaranteed minimum speed
		 */
7532 		sectors = mddev->pers->sync_request(mddev, j, &skipped,
7533 						  currspeed < speed_min(mddev));
7534 		if (sectors == 0) {
7535 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7536 			goto out;
7537 		}
7538 
7539 		if (!skipped) { /* actual IO requested */
7540 			io_sectors += sectors;
7541 			atomic_add(sectors, &mddev->recovery_active);
7542 		}
7543 
7544 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7545 			break;
7546 
7547 		j += sectors;
7548 		if (j > 2)
7549 			mddev->curr_resync = j;
7550 		mddev->curr_mark_cnt = io_sectors;
7551 		if (last_check == 0)
7552 			/* this is the earliest that rebuild will be
7553 			 * visible in /proc/mdstat
7554 			 */
7555 			md_new_event(mddev);
7556 
		/* only run the speed check once per 'window' sectors of I/O,
		 * and not at all on the final pass
		 */
7557 		if (last_check + window > io_sectors || j == max_sectors)
7558 			continue;
7559 
7560 		last_check = io_sectors;
7561 	repeat:
7562 		if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
7563 			/* step marks */
7564 			int next = (last_mark+1) % SYNC_MARKS;
7565 
			/* publish the oldest sample, then overwrite its slot
			 * with the current time and completed sector count
			 */
7566 			mddev->resync_mark = mark[next];
7567 			mddev->resync_mark_cnt = mark_cnt[next];
7568 			mark[next] = jiffies;
7569 			mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
7570 			last_mark = next;
7571 		}
7572 
7573 
7574 		if (kthread_should_stop())
7575 			goto interrupted;
7576 
7577 
7578 		/*
7579 		 * this loop exits only when we are slower than
7580 		 * the 'hard' speed limit, or the system was IO-idle for
7581 		 * a jiffy.
7582 		 * the system might be non-idle CPU-wise, but we only care
7583 		 * about not overloading the IO subsystem. (things like an
7584 		 * e2fsck being done on the RAID array should execute fast)
7585 		 */
7586 		cond_resched();
7587 
		/* sectors/2 over elapsed seconds; the "+1"s keep both the
		 * divisor and the result non-zero
		 */
7588 		currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
7589 			/((jiffies-mddev->resync_mark)/HZ +1) +1;
7590 
7591 		if (currspeed > speed_min(mddev)) {
7592 			if ((currspeed > speed_max(mddev)) ||
7593 					!is_mddev_idle(mddev, 0)) {
7594 				msleep(500);
7595 				goto repeat;
7596 			}
7597 		}
7598 	}
7599 	printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc);
7600 	/*
7601 	 * this also signals 'finished resyncing' to md_stop
7602 	 */
7603  out:
7604 	blk_finish_plug(&plug);
7605 	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
7606 
7607 	/* tell personality that we are finished */
7608 	mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
7609 
	/* record how far we got, unless this was only a check or we never
	 * got past the "delayed" states of curr_resync
	 */
7610 	if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
7611 	    mddev->curr_resync > 2) {
7612 		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
7613 			if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
7614 				if (mddev->curr_resync >= mddev->recovery_cp) {
7615 					printk(KERN_INFO
7616 					       "md: checkpointing %s of %s.\n",
7617 					       desc, mdname(mddev));
					/* after an I/O error only trust what was
					 * known to be complete
					 */
7618 					if (test_bit(MD_RECOVERY_ERROR,
7619 						&mddev->recovery))
7620 						mddev->recovery_cp =
7621 							mddev->curr_resync_completed;
7622 					else
7623 						mddev->recovery_cp =
7624 							mddev->curr_resync;
7625 				}
7626 			} else
7627 				mddev->recovery_cp = MaxSector;
7628 		} else {
7629 			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7630 				mddev->curr_resync = MaxSector;
			/* advance recovery_offset of every device still
			 * being rebuilt (only when delta_disks >= 0)
			 */
7631 			rcu_read_lock();
7632 			rdev_for_each_rcu(rdev, mddev)
7633 				if (rdev->raid_disk >= 0 &&
7634 				    mddev->delta_disks >= 0 &&
7635 				    !test_bit(Faulty, &rdev->flags) &&
7636 				    !test_bit(In_sync, &rdev->flags) &&
7637 				    rdev->recovery_offset < mddev->curr_resync)
7638 					rdev->recovery_offset = mddev->curr_resync;
7639 			rcu_read_unlock();
7640 		}
7641 	}
7642  skip:
7643 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
7644 
7645 	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
7646 		/* We completed so min/max setting can be forgotten if used. */
7647 		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
7648 			mddev->resync_min = 0;
7649 		mddev->resync_max = MaxSector;
7650 	} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
7651 		mddev->resync_min = mddev->curr_resync_completed;
	/* no longer syncing: release any array waiting in the conflict
	 * check above, and hand over to mddev->thread
	 */
7652 	mddev->curr_resync = 0;
7653 	wake_up(&resync_wait);
7654 	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
7655 	md_wakeup_thread(mddev->thread);
7656 	return;
7657 
7658  interrupted:
7659 	/*
7660 	 * kthread_should_stop() was true: mark the sync interrupted and
7661 	 * take the normal wind-down path.
7662 	 */
7663 	printk(KERN_INFO
7664 	       "md: md_do_sync() got signal ... exiting\n");
7665 	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7666 	goto out;
7667 
7668 }
7669 EXPORT_SYMBOL_GPL(md_do_sync);
7669 
7670 static int remove_and_add_spares(struct mddev *mddev,
7671 				 struct md_rdev *this)
7672 {
7673 	struct md_rdev *rdev;
7674 	int spares = 0;
7675 	int removed = 0;
7676 
7677 	rdev_for_each(rdev, mddev)
7678 		if ((this == NULL || rdev == this) &&
7679 		    rdev->raid_disk >= 0 &&
7680 		    !test_bit(Blocked, &rdev->flags) &&
7681 		    (test_bit(Faulty, &rdev->flags) ||
7682 		     ! test_bit(In_sync, &rdev->flags)) &&
7683 		    atomic_read(&rdev->nr_pending)==0) {
7684 			if (mddev->pers->hot_remove_disk(
7685 				    mddev, rdev) == 0) {
7686 				sysfs_unlink_rdev(mddev, rdev);
7687 				rdev->raid_disk = -1;
7688 				removed++;
7689 			}
7690 		}
7691 	if (removed && mddev->kobj.sd)
7692 		sysfs_notify(&mddev->kobj, NULL, "degraded");
7693 
7694 	if (this)
7695 		goto no_add;
7696 
7697 	rdev_for_each(rdev, mddev) {
7698 		if (rdev->raid_disk >= 0 &&
7699 		    !test_bit(In_sync, &rdev->flags) &&
7700 		    !test_bit(Faulty, &rdev->flags))
7701 			spares++;
7702 		if (rdev->raid_disk >= 0)
7703 			continue;
7704 		if (test_bit(Faulty, &rdev->flags))
7705 			continue;
7706 		if (mddev->ro &&
7707 		    rdev->saved_raid_disk < 0)
7708 			continue;
7709 
7710 		rdev->recovery_offset = 0;
7711 		if (rdev->saved_raid_disk >= 0 && mddev->in_sync) {
7712 			spin_lock_irq(&mddev->write_lock);
7713 			if (mddev->in_sync)
7714 				/* OK, this device, which is in_sync,
7715 				 * will definitely be noticed before
7716 				 * the next write, so recovery isn't
7717 				 * needed.
7718 				 */
7719 				rdev->recovery_offset = mddev->recovery_cp;
7720 			spin_unlock_irq(&mddev->write_lock);
7721 		}
7722 		if (mddev->ro && rdev->recovery_offset != MaxSector)
7723 			/* not safe to add this disk now */
7724 			continue;
7725 		if (mddev->pers->
7726 		    hot_add_disk(mddev, rdev) == 0) {
7727 			if (sysfs_link_rdev(mddev, rdev))
7728 				/* failure here is OK */;
7729 			spares++;
7730 			md_new_event(mddev);
7731 			set_bit(MD_CHANGE_DEVS, &mddev->flags);
7732 		}
7733 	}
7734 no_add:
7735 	if (removed)
7736 		set_bit(MD_CHANGE_DEVS, &mddev->flags);
7737 	return spares;
7738 }
7739 
7740 /*
7741  * This routine is regularly called by all per-raid-array threads to
7742  * deal with generic issues like resync and super-block update.
7743  * Raid personalities that don't have a thread (linear/raid0) do not
7744  * need this as they never do any recovery or update the superblock.
7745  *
7746  * It does not do any resync itself, but rather "forks" off other threads
7747  * to do that as needed.
7748  * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
7749  * "->recovery" and create a thread at ->sync_thread.
7750  * When the thread finishes it sets MD_RECOVERY_DONE
7751  * and wakes up this thread which will reap the thread and finish up.
7752  * This thread also removes any faulty devices (with nr_pending == 0).
7753  *
7754  * The overall approach is:
7755  *  1/ if the superblock needs updating, update it.
7756  *  2/ If a recovery thread is running, don't do anything else.
7757  *  3/ If recovery has finished, clean up, possibly marking spares active.
7758  *  4/ If there are any faulty devices, remove them.
7759  *  5/ If array is degraded, try to add spares devices
7760  *  6/ If array has spares or is not in-sync, start a resync thread.
7761  */
7762 void md_check_recovery(struct mddev *mddev)
7763 {
7764 	if (mddev->suspended)
7765 		return;
7766 
7767 	if (mddev->bitmap)
7768 		bitmap_daemon_work(mddev);
7769 
	/* a signal to this thread requests "immediate safe mode" for
	 * internally-managed arrays that support resync
	 */
7770 	if (signal_pending(current)) {
7771 		if (mddev->pers->sync_request && !mddev->external) {
7772 			printk(KERN_INFO "md: %s in immediate safe mode\n",
7773 			       mdname(mddev));
7774 			mddev->safemode = 2;
7775 		}
7776 		flush_signals(current);
7777 	}
7778 
7779 	if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
7780 		return;
	/* Cheap unlocked test: bail out unless a superblock flag other than
	 * MD_CHANGE_PENDING is set, recovery is needed or has finished, or
	 * safemode wants the array marked clean.
	 */
7781 	if ( ! (
7782 		(mddev->flags & ~ (1<<MD_CHANGE_PENDING)) ||
7783 		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
7784 		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
7785 		(mddev->external == 0 && mddev->safemode == 1) ||
7786 		(mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
7787 		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
7788 		))
7789 		return;
7790 
	/* if the lock cannot be taken right now, nothing is done in this call */
7791 	if (mddev_trylock(mddev)) {
7792 		int spares = 0;
7793 
7794 		if (mddev->ro) {
7795 			/* On a read-only array we can:
7796 			 * - remove failed devices
7797 			 * - add already-in_sync devices if the array itself
7798 			 *   is in-sync.
7799 			 * As we only add devices that are already in-sync,
7800 			 * we can activate the spares immediately.
7801 			 */
7802 			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7803 			remove_and_add_spares(mddev, NULL);
7804 			mddev->pers->spare_active(mddev);
7805 			goto unlock;
7806 		}
7807 
7808 		if (!mddev->external) {
7809 			int did_change = 0;
7810 			spin_lock_irq(&mddev->write_lock);
			/* safemode is active, no writes are pending and there
			 * is nothing left to resync: mark the array clean
			 */
7811 			if (mddev->safemode &&
7812 			    !atomic_read(&mddev->writes_pending) &&
7813 			    !mddev->in_sync &&
7814 			    mddev->recovery_cp == MaxSector) {
7815 				mddev->in_sync = 1;
7816 				did_change = 1;
7817 				set_bit(MD_CHANGE_CLEAN, &mddev->flags);
7818 			}
7819 			if (mddev->safemode == 1)
7820 				mddev->safemode = 0;
7821 			spin_unlock_irq(&mddev->write_lock);
7822 			if (did_change)
7823 				sysfs_notify_dirent_safe(mddev->sysfs_state);
7824 		}
7825 
		/* step 1/: write out the superblock if any change flag is set */
7826 		if (mddev->flags)
7827 			md_update_sb(mddev, 0);
7828 
		/* step 2/ */
7829 		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
7830 		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
7831 			/* resync/recovery still happening */
7832 			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7833 			goto unlock;
7834 		}
		/* step 3/: a sync thread exists but is no longer running */
7835 		if (mddev->sync_thread) {
7836 			md_reap_sync_thread(mddev);
7837 			goto unlock;
7838 		}
7839 		/* Set RUNNING before clearing NEEDED to avoid
7840 		 * any transients in the value of "sync_action".
7841 		 */
7842 		mddev->curr_resync_completed = 0;
7843 		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7844 		/* Clear some bits that don't mean anything, but
7845 		 * might be left set
7846 		 */
7847 		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
7848 		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
7849 
		/* RUNNING is cleared again at 'unlock' if no thread was started */
7850 		if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
7851 		    test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
7852 			goto unlock;
7853 		/* no recovery is running.
7854 		 * remove any failed drives, then
7855 		 * add spares if possible.
7856 		 * Spares are also removed and re-added, to allow
7857 		 * the personality to fail the re-add.
7858 		 */
7859 
		/* Decide what kind of sync to run, in priority order:
		 * an unfinished reshape, recovery onto spares, resync from
		 * the checkpoint, or an already-requested sync.
		 */
7860 		if (mddev->reshape_position != MaxSector) {
7861 			if (mddev->pers->check_reshape == NULL ||
7862 			    mddev->pers->check_reshape(mddev) != 0)
7863 				/* Cannot proceed */
7864 				goto unlock;
7865 			set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7866 			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
7867 		} else if ((spares = remove_and_add_spares(mddev, NULL))) {
7868 			clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7869 			clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7870 			clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
7871 			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
7872 		} else if (mddev->recovery_cp < MaxSector) {
7873 			set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7874 			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
7875 		} else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
7876 			/* nothing to be done ... */
7877 			goto unlock;
7878 
7879 		if (mddev->pers->sync_request) {
7880 			if (spares) {
7881 				/* We are adding a device or devices to an array
7882 				 * which has the bitmap stored on all devices.
7883 				 * So make sure all bitmap pages get written
7884 				 */
7885 				bitmap_write_all(mddev->bitmap);
7886 			}
7887 			mddev->sync_thread = md_register_thread(md_do_sync,
7888 								mddev,
7889 								"resync");
7890 			if (!mddev->sync_thread) {
7891 				printk(KERN_ERR "%s: could not start resync"
7892 					" thread...\n",
7893 					mdname(mddev));
7894 				/* leave the spares where they are, it shouldn't hurt */
7895 				clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7896 				clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7897 				clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7898 				clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
7899 				clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7900 			} else
7901 				md_wakeup_thread(mddev->sync_thread);
7902 			sysfs_notify_dirent_safe(mddev->sysfs_action);
7903 			md_new_event(mddev);
7904 		}
7905 	unlock:
		/* no sync thread exists: undo RUNNING, and notify
		 * sysfs_action if RECOVER had been set
		 */
7906 		if (!mddev->sync_thread) {
7907 			clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7908 			if (test_and_clear_bit(MD_RECOVERY_RECOVER,
7909 					       &mddev->recovery))
7910 				if (mddev->sysfs_action)
7911 					sysfs_notify_dirent_safe(mddev->sysfs_action);
7912 		}
7913 		mddev_unlock(mddev);
7914 	}
7915 }
7916 
7917 void md_reap_sync_thread(struct mddev *mddev)
7918 {
7919 	struct md_rdev *rdev;
7920 
7921 	/* resync has finished, collect result */
7922 	md_unregister_thread(&mddev->sync_thread);
7923 	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
7924 	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
7925 		/* success...*/
7926 		/* activate any spares */
7927 		if (mddev->pers->spare_active(mddev)) {
7928 			sysfs_notify(&mddev->kobj, NULL,
7929 				     "degraded");
7930 			set_bit(MD_CHANGE_DEVS, &mddev->flags);
7931 		}
7932 	}
7933 	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
7934 	    mddev->pers->finish_reshape)
7935 		mddev->pers->finish_reshape(mddev);
7936 
7937 	/* If array is no-longer degraded, then any saved_raid_disk
7938 	 * information must be scrapped.  Also if any device is now
7939 	 * In_sync we must scrape the saved_raid_disk for that device
7940 	 * do the superblock for an incrementally recovered device
7941 	 * written out.
7942 	 */
7943 	rdev_for_each(rdev, mddev)
7944 		if (!mddev->degraded ||
7945 		    test_bit(In_sync, &rdev->flags))
7946 			rdev->saved_raid_disk = -1;
7947 
7948 	md_update_sb(mddev, 1);
7949 	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7950 	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7951 	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7952 	clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
7953 	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7954 	/* flag recovery needed just to double check */
7955 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7956 	sysfs_notify_dirent_safe(mddev->sysfs_action);
7957 	md_new_event(mddev);
7958 	if (mddev->event_work.func)
7959 		queue_work(md_misc_wq, &mddev->event_work);
7960 }
7961 
7962 void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
7963 {
7964 	sysfs_notify_dirent_safe(rdev->sysfs_state);
7965 	wait_event_timeout(rdev->blocked_wait,
7966 			   !test_bit(Blocked, &rdev->flags) &&
7967 			   !test_bit(BlockedBadBlocks, &rdev->flags),
7968 			   msecs_to_jiffies(5000));
7969 	rdev_dec_pending(rdev, mddev);
7970 }
7971 EXPORT_SYMBOL(md_wait_for_blocked_rdev);
7972 
7973 void md_finish_reshape(struct mddev *mddev)
7974 {
7975 	/* called be personality module when reshape completes. */
7976 	struct md_rdev *rdev;
7977 
7978 	rdev_for_each(rdev, mddev) {
7979 		if (rdev->data_offset > rdev->new_data_offset)
7980 			rdev->sectors += rdev->data_offset - rdev->new_data_offset;
7981 		else
7982 			rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
7983 		rdev->data_offset = rdev->new_data_offset;
7984 	}
7985 }
7986 EXPORT_SYMBOL(md_finish_reshape);
7987 
7988 /* Bad block management.
7989  * We can record which blocks on each device are 'bad' and so just
7990  * fail those blocks, or that stripe, rather than the whole device.
7991  * Entries in the bad-block table are 64bits wide.  This comprises:
7992  * Length of bad-range, in sectors: 0-511 for lengths 1-512
7993  * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
7994  *  A 'shift' can be set so that larger blocks are tracked and
7995  *  consequently larger devices can be covered.
7996  * 'Acknowledged' flag - 1 bit. - the most significant bit.
7997  *
7998  * Locking of the bad-block table uses a seqlock so md_is_badblock
7999  * might need to retry if it is very unlucky.
8000  * We will sometimes want to check for bad blocks in a bi_end_io function,
8001  * so we use the write_seqlock_irq variant.
8002  *
8003  * When looking for a bad block we specify a range and want to
8004  * know if any block in the range is bad.  So we binary-search
8005  * to the last range that starts at-or-before the given endpoint,
8006  * (or "before the sector after the target range")
8007  * then see if it ends after the given start.
8008  * We return
8009  *  0 if there are no known bad blocks in the range
8010  *  1 if there are known bad block which are all acknowledged
8011  * -1 if there are bad blocks which have not yet been acknowledged in metadata.
8012  * plus the start/length of the first bad section we overlap.
8013  */
8014 int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
8015 		   sector_t *first_bad, int *bad_sectors)
8016 {
8017 	int hi;
8018 	int lo;
8019 	u64 *p = bb->page;
8020 	int rv;
8021 	sector_t target = s + sectors;
8022 	unsigned seq;
8023 
8024 	if (bb->shift > 0) {
8025 		/* round the start down, and the end up */
8026 		s >>= bb->shift;
8027 		target += (1<<bb->shift) - 1;
8028 		target >>= bb->shift;
8029 		sectors = target - s;
8030 	}
8031 	/* 'target' is now the first block after the bad range */
8032 
8033 retry:
8034 	seq = read_seqbegin(&bb->lock);
8035 	lo = 0;
8036 	rv = 0;
8037 	hi = bb->count;
8038 
8039 	/* Binary search between lo and hi for 'target'
8040 	 * i.e. for the last range that starts before 'target'
8041 	 */
8042 	/* INVARIANT: ranges before 'lo' and at-or-after 'hi'
8043 	 * are known not to be the last range before target.
8044 	 * VARIANT: hi-lo is the number of possible
8045 	 * ranges, and decreases until it reaches 1
8046 	 */
8047 	while (hi - lo > 1) {
8048 		int mid = (lo + hi) / 2;
8049 		sector_t a = BB_OFFSET(p[mid]);
8050 		if (a < target)
8051 			/* This could still be the one, earlier ranges
8052 			 * could not. */
8053 			lo = mid;
8054 		else
8055 			/* This and later ranges are definitely out. */
8056 			hi = mid;
8057 	}
8058 	/* 'lo' might be the last that started before target, but 'hi' isn't */
8059 	if (hi > lo) {
8060 		/* need to check all range that end after 's' to see if
8061 		 * any are unacknowledged.
8062 		 */
8063 		while (lo >= 0 &&
8064 		       BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
8065 			if (BB_OFFSET(p[lo]) < target) {
8066 				/* starts before the end, and finishes after
8067 				 * the start, so they must overlap
8068 				 */
8069 				if (rv != -1 && BB_ACK(p[lo]))
8070 					rv = 1;
8071 				else
8072 					rv = -1;
8073 				*first_bad = BB_OFFSET(p[lo]);
8074 				*bad_sectors = BB_LEN(p[lo]);
8075 			}
8076 			lo--;
8077 		}
8078 	}
8079 
8080 	if (read_seqretry(&bb->lock, seq))
8081 		goto retry;
8082 
8083 	return rv;
8084 }
8085 EXPORT_SYMBOL_GPL(md_is_badblock);
8086 
8087 /*
8088  * Add a range of bad blocks to the table.
8089  * This might extend the table, or might contract it
8090  * if two adjacent ranges can be merged.
8091  * We binary-search to find the 'insertion' point, then
8092  * decide how best to handle it.
8093  */
8094 static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
8095 			    int acknowledged)
8096 {
8097 	u64 *p;
8098 	int lo, hi;
8099 	int rv = 1;
8100 
8101 	if (bb->shift < 0)
8102 		/* badblocks are disabled */
8103 		return 0;
8104 
8105 	if (bb->shift) {
8106 		/* round the start down, and the end up */
8107 		sector_t next = s + sectors;
8108 		s >>= bb->shift;
8109 		next += (1<<bb->shift) - 1;
8110 		next >>= bb->shift;
8111 		sectors = next - s;
8112 	}
8113 
8114 	write_seqlock_irq(&bb->lock);
8115 
8116 	p = bb->page;
8117 	lo = 0;
8118 	hi = bb->count;
8119 	/* Find the last range that starts at-or-before 's' */
8120 	while (hi - lo > 1) {
8121 		int mid = (lo + hi) / 2;
8122 		sector_t a = BB_OFFSET(p[mid]);
8123 		if (a <= s)
8124 			lo = mid;
8125 		else
8126 			hi = mid;
8127 	}
8128 	if (hi > lo && BB_OFFSET(p[lo]) > s)
8129 		hi = lo;
8130 
8131 	if (hi > lo) {
8132 		/* we found a range that might merge with the start
8133 		 * of our new range
8134 		 */
8135 		sector_t a = BB_OFFSET(p[lo]);
8136 		sector_t e = a + BB_LEN(p[lo]);
8137 		int ack = BB_ACK(p[lo]);
8138 		if (e >= s) {
8139 			/* Yes, we can merge with a previous range */
8140 			if (s == a && s + sectors >= e)
8141 				/* new range covers old */
8142 				ack = acknowledged;
8143 			else
8144 				ack = ack && acknowledged;
8145 
8146 			if (e < s + sectors)
8147 				e = s + sectors;
8148 			if (e - a <= BB_MAX_LEN) {
8149 				p[lo] = BB_MAKE(a, e-a, ack);
8150 				s = e;
8151 			} else {
8152 				/* does not all fit in one range,
8153 				 * make p[lo] maximal
8154 				 */
8155 				if (BB_LEN(p[lo]) != BB_MAX_LEN)
8156 					p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
8157 				s = a + BB_MAX_LEN;
8158 			}
8159 			sectors = e - s;
8160 		}
8161 	}
8162 	if (sectors && hi < bb->count) {
8163 		/* 'hi' points to the first range that starts after 's'.
8164 		 * Maybe we can merge with the start of that range */
8165 		sector_t a = BB_OFFSET(p[hi]);
8166 		sector_t e = a + BB_LEN(p[hi]);
8167 		int ack = BB_ACK(p[hi]);
8168 		if (a <= s + sectors) {
8169 			/* merging is possible */
8170 			if (e <= s + sectors) {
8171 				/* full overlap */
8172 				e = s + sectors;
8173 				ack = acknowledged;
8174 			} else
8175 				ack = ack && acknowledged;
8176 
8177 			a = s;
8178 			if (e - a <= BB_MAX_LEN) {
8179 				p[hi] = BB_MAKE(a, e-a, ack);
8180 				s = e;
8181 			} else {
8182 				p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
8183 				s = a + BB_MAX_LEN;
8184 			}
8185 			sectors = e - s;
8186 			lo = hi;
8187 			hi++;
8188 		}
8189 	}
8190 	if (sectors == 0 && hi < bb->count) {
8191 		/* we might be able to combine lo and hi */
8192 		/* Note: 's' is at the end of 'lo' */
8193 		sector_t a = BB_OFFSET(p[hi]);
8194 		int lolen = BB_LEN(p[lo]);
8195 		int hilen = BB_LEN(p[hi]);
8196 		int newlen = lolen + hilen - (s - a);
8197 		if (s >= a && newlen < BB_MAX_LEN) {
8198 			/* yes, we can combine them */
8199 			int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
8200 			p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
8201 			memmove(p + hi, p + hi + 1,
8202 				(bb->count - hi - 1) * 8);
8203 			bb->count--;
8204 		}
8205 	}
8206 	while (sectors) {
8207 		/* didn't merge (it all).
8208 		 * Need to add a range just before 'hi' */
8209 		if (bb->count >= MD_MAX_BADBLOCKS) {
8210 			/* No room for more */
8211 			rv = 0;
8212 			break;
8213 		} else {
8214 			int this_sectors = sectors;
8215 			memmove(p + hi + 1, p + hi,
8216 				(bb->count - hi) * 8);
8217 			bb->count++;
8218 
8219 			if (this_sectors > BB_MAX_LEN)
8220 				this_sectors = BB_MAX_LEN;
8221 			p[hi] = BB_MAKE(s, this_sectors, acknowledged);
8222 			sectors -= this_sectors;
8223 			s += this_sectors;
8224 		}
8225 	}
8226 
8227 	bb->changed = 1;
8228 	if (!acknowledged)
8229 		bb->unacked_exist = 1;
8230 	write_sequnlock_irq(&bb->lock);
8231 
8232 	return rv;
8233 }
8234 
8235 int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
8236 		       int is_new)
8237 {
8238 	int rv;
8239 	if (is_new)
8240 		s += rdev->new_data_offset;
8241 	else
8242 		s += rdev->data_offset;
8243 	rv = md_set_badblocks(&rdev->badblocks,
8244 			      s, sectors, 0);
8245 	if (rv) {
8246 		/* Make sure they get written out promptly */
8247 		sysfs_notify_dirent_safe(rdev->sysfs_state);
8248 		set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
8249 		md_wakeup_thread(rdev->mddev->thread);
8250 	}
8251 	return rv;
8252 }
8253 EXPORT_SYMBOL_GPL(rdev_set_badblocks);
8254 
8255 /*
8256  * Remove a range of bad blocks from the table.
8257  * This may involve extending the table if we spilt a region,
8258  * but it must not fail.  So if the table becomes full, we just
8259  * drop the remove request.
8260  */
8261 static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors)
8262 {
8263 	u64 *p;
8264 	int lo, hi;
8265 	sector_t target = s + sectors;
8266 	int rv = 0;
8267 
8268 	if (bb->shift > 0) {
8269 		/* When clearing we round the start up and the end down.
8270 		 * This should not matter as the shift should align with
8271 		 * the block size and no rounding should ever be needed.
8272 		 * However it is better the think a block is bad when it
8273 		 * isn't than to think a block is not bad when it is.
8274 		 */
8275 		s += (1<<bb->shift) - 1;
8276 		s >>= bb->shift;
8277 		target >>= bb->shift;
8278 		sectors = target - s;
8279 	}
8280 
8281 	write_seqlock_irq(&bb->lock);
8282 
8283 	p = bb->page;
8284 	lo = 0;
8285 	hi = bb->count;
8286 	/* Find the last range that starts before 'target' */
8287 	while (hi - lo > 1) {
8288 		int mid = (lo + hi) / 2;
8289 		sector_t a = BB_OFFSET(p[mid]);
8290 		if (a < target)
8291 			lo = mid;
8292 		else
8293 			hi = mid;
8294 	}
8295 	if (hi > lo) {
8296 		/* p[lo] is the last range that could overlap the
8297 		 * current range.  Earlier ranges could also overlap,
8298 		 * but only this one can overlap the end of the range.
8299 		 */
8300 		if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
8301 			/* Partial overlap, leave the tail of this range */
8302 			int ack = BB_ACK(p[lo]);
8303 			sector_t a = BB_OFFSET(p[lo]);
8304 			sector_t end = a + BB_LEN(p[lo]);
8305 
8306 			if (a < s) {
8307 				/* we need to split this range */
8308 				if (bb->count >= MD_MAX_BADBLOCKS) {
8309 					rv = 0;
8310 					goto out;
8311 				}
8312 				memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
8313 				bb->count++;
8314 				p[lo] = BB_MAKE(a, s-a, ack);
8315 				lo++;
8316 			}
8317 			p[lo] = BB_MAKE(target, end - target, ack);
8318 			/* there is no longer an overlap */
8319 			hi = lo;
8320 			lo--;
8321 		}
8322 		while (lo >= 0 &&
8323 		       BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
8324 			/* This range does overlap */
8325 			if (BB_OFFSET(p[lo]) < s) {
8326 				/* Keep the early parts of this range. */
8327 				int ack = BB_ACK(p[lo]);
8328 				sector_t start = BB_OFFSET(p[lo]);
8329 				p[lo] = BB_MAKE(start, s - start, ack);
8330 				/* now low doesn't overlap, so.. */
8331 				break;
8332 			}
8333 			lo--;
8334 		}
8335 		/* 'lo' is strictly before, 'hi' is strictly after,
8336 		 * anything between needs to be discarded
8337 		 */
8338 		if (hi - lo > 1) {
8339 			memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
8340 			bb->count -= (hi - lo - 1);
8341 		}
8342 	}
8343 
8344 	bb->changed = 1;
8345 out:
8346 	write_sequnlock_irq(&bb->lock);
8347 	return rv;
8348 }
8349 
8350 int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
8351 			 int is_new)
8352 {
8353 	if (is_new)
8354 		s += rdev->new_data_offset;
8355 	else
8356 		s += rdev->data_offset;
8357 	return md_clear_badblocks(&rdev->badblocks,
8358 				  s, sectors);
8359 }
8360 EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
8361 
8362 /*
8363  * Acknowledge all bad blocks in a list.
8364  * This only succeeds if ->changed is clear.  It is used by
8365  * in-kernel metadata updates
8366  */
8367 void md_ack_all_badblocks(struct badblocks *bb)
8368 {
8369 	if (bb->page == NULL || bb->changed)
8370 		/* no point even trying */
8371 		return;
8372 	write_seqlock_irq(&bb->lock);
8373 
8374 	if (bb->changed == 0 && bb->unacked_exist) {
8375 		u64 *p = bb->page;
8376 		int i;
8377 		for (i = 0; i < bb->count ; i++) {
8378 			if (!BB_ACK(p[i])) {
8379 				sector_t start = BB_OFFSET(p[i]);
8380 				int len = BB_LEN(p[i]);
8381 				p[i] = BB_MAKE(start, len, 1);
8382 			}
8383 		}
8384 		bb->unacked_exist = 0;
8385 	}
8386 	write_sequnlock_irq(&bb->lock);
8387 }
8388 EXPORT_SYMBOL_GPL(md_ack_all_badblocks);
8389 
8390 /* sysfs access to bad-blocks list.
8391  * We present two files.
8392  * 'bad-blocks' lists sector numbers and lengths of ranges that
8393  *    are recorded as bad.  The list is truncated to fit within
8394  *    the one-page limit of sysfs.
8395  *    Writing "sector length" to this file adds an acknowledged
8396  *    bad block list.
8397  * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
8398  *    been acknowledged.  Writing to this file adds bad blocks
8399  *    without acknowledging them.  This is largely for testing.
8400  */
8401 
8402 static ssize_t
8403 badblocks_show(struct badblocks *bb, char *page, int unack)
8404 {
8405 	size_t len;
8406 	int i;
8407 	u64 *p = bb->page;
8408 	unsigned seq;
8409 
8410 	if (bb->shift < 0)
8411 		return 0;
8412 
8413 retry:
8414 	seq = read_seqbegin(&bb->lock);
8415 
8416 	len = 0;
8417 	i = 0;
8418 
8419 	while (len < PAGE_SIZE && i < bb->count) {
8420 		sector_t s = BB_OFFSET(p[i]);
8421 		unsigned int length = BB_LEN(p[i]);
8422 		int ack = BB_ACK(p[i]);
8423 		i++;
8424 
8425 		if (unack && ack)
8426 			continue;
8427 
8428 		len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
8429 				(unsigned long long)s << bb->shift,
8430 				length << bb->shift);
8431 	}
8432 	if (unack && len == 0)
8433 		bb->unacked_exist = 0;
8434 
8435 	if (read_seqretry(&bb->lock, seq))
8436 		goto retry;
8437 
8438 	return len;
8439 }
8440 
8441 #define DO_DEBUG 1
8442 
8443 static ssize_t
8444 badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack)
8445 {
8446 	unsigned long long sector;
8447 	int length;
8448 	char newline;
8449 #ifdef DO_DEBUG
8450 	/* Allow clearing via sysfs *only* for testing/debugging.
8451 	 * Normally only a successful write may clear a badblock
8452 	 */
8453 	int clear = 0;
8454 	if (page[0] == '-') {
8455 		clear = 1;
8456 		page++;
8457 	}
8458 #endif /* DO_DEBUG */
8459 
8460 	switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
8461 	case 3:
8462 		if (newline != '\n')
8463 			return -EINVAL;
8464 	case 2:
8465 		if (length <= 0)
8466 			return -EINVAL;
8467 		break;
8468 	default:
8469 		return -EINVAL;
8470 	}
8471 
8472 #ifdef DO_DEBUG
8473 	if (clear) {
8474 		md_clear_badblocks(bb, sector, length);
8475 		return len;
8476 	}
8477 #endif /* DO_DEBUG */
8478 	if (md_set_badblocks(bb, sector, length, !unack))
8479 		return len;
8480 	else
8481 		return -ENOSPC;
8482 }
8483 
8484 static int md_notify_reboot(struct notifier_block *this,
8485 			    unsigned long code, void *x)
8486 {
8487 	struct list_head *tmp;
8488 	struct mddev *mddev;
8489 	int need_delay = 0;
8490 
8491 	for_each_mddev(mddev, tmp) {
8492 		if (mddev_trylock(mddev)) {
8493 			if (mddev->pers)
8494 				__md_stop_writes(mddev);
8495 			mddev->safemode = 2;
8496 			mddev_unlock(mddev);
8497 		}
8498 		need_delay = 1;
8499 	}
8500 	/*
8501 	 * certain more exotic SCSI devices are known to be
8502 	 * volatile wrt too early system reboots. While the
8503 	 * right place to handle this issue is the given
8504 	 * driver, we do want to have a safe RAID driver ...
8505 	 */
8506 	if (need_delay)
8507 		mdelay(1000*1);
8508 
8509 	return NOTIFY_DONE;
8510 }
8511 
8512 static struct notifier_block md_notifier = {
8513 	.notifier_call	= md_notify_reboot,
8514 	.next		= NULL,
8515 	.priority	= INT_MAX, /* before any real devices */
8516 };
8517 
8518 static void md_geninit(void)
8519 {
8520 	pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
8521 
8522 	proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
8523 }
8524 
8525 static int __init md_init(void)
8526 {
8527 	int ret = -ENOMEM;
8528 
8529 	md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
8530 	if (!md_wq)
8531 		goto err_wq;
8532 
8533 	md_misc_wq = alloc_workqueue("md_misc", 0, 0);
8534 	if (!md_misc_wq)
8535 		goto err_misc_wq;
8536 
8537 	if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
8538 		goto err_md;
8539 
8540 	if ((ret = register_blkdev(0, "mdp")) < 0)
8541 		goto err_mdp;
8542 	mdp_major = ret;
8543 
8544 	blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE,
8545 			    md_probe, NULL, NULL);
8546 	blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
8547 			    md_probe, NULL, NULL);
8548 
8549 	register_reboot_notifier(&md_notifier);
8550 	raid_table_header = register_sysctl_table(raid_root_table);
8551 
8552 	md_geninit();
8553 	return 0;
8554 
8555 err_mdp:
8556 	unregister_blkdev(MD_MAJOR, "md");
8557 err_md:
8558 	destroy_workqueue(md_misc_wq);
8559 err_misc_wq:
8560 	destroy_workqueue(md_wq);
8561 err_wq:
8562 	return ret;
8563 }
8564 
8565 #ifndef MODULE
8566 
8567 /*
8568  * Searches all registered partitions for autorun RAID arrays
8569  * at boot time.
8570  */
8571 
8572 static LIST_HEAD(all_detected_devices);
8573 struct detected_devices_node {
8574 	struct list_head list;
8575 	dev_t dev;
8576 };
8577 
8578 void md_autodetect_dev(dev_t dev)
8579 {
8580 	struct detected_devices_node *node_detected_dev;
8581 
8582 	node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
8583 	if (node_detected_dev) {
8584 		node_detected_dev->dev = dev;
8585 		list_add_tail(&node_detected_dev->list, &all_detected_devices);
8586 	} else {
8587 		printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed"
8588 			", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev));
8589 	}
8590 }
8591 
8592 
8593 static void autostart_arrays(int part)
8594 {
8595 	struct md_rdev *rdev;
8596 	struct detected_devices_node *node_detected_dev;
8597 	dev_t dev;
8598 	int i_scanned, i_passed;
8599 
8600 	i_scanned = 0;
8601 	i_passed = 0;
8602 
8603 	printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
8604 
8605 	while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
8606 		i_scanned++;
8607 		node_detected_dev = list_entry(all_detected_devices.next,
8608 					struct detected_devices_node, list);
8609 		list_del(&node_detected_dev->list);
8610 		dev = node_detected_dev->dev;
8611 		kfree(node_detected_dev);
8612 		rdev = md_import_device(dev,0, 90);
8613 		if (IS_ERR(rdev))
8614 			continue;
8615 
8616 		if (test_bit(Faulty, &rdev->flags)) {
8617 			MD_BUG();
8618 			continue;
8619 		}
8620 		set_bit(AutoDetected, &rdev->flags);
8621 		list_add(&rdev->same_set, &pending_raid_disks);
8622 		i_passed++;
8623 	}
8624 
8625 	printk(KERN_INFO "md: Scanned %d and added %d devices.\n",
8626 						i_scanned, i_passed);
8627 
8628 	autorun_devices(part);
8629 }
8630 
8631 #endif /* !MODULE */
8632 
8633 static __exit void md_exit(void)
8634 {
8635 	struct mddev *mddev;
8636 	struct list_head *tmp;
8637 
8638 	blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS);
8639 	blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
8640 
8641 	unregister_blkdev(MD_MAJOR,"md");
8642 	unregister_blkdev(mdp_major, "mdp");
8643 	unregister_reboot_notifier(&md_notifier);
8644 	unregister_sysctl_table(raid_table_header);
8645 	remove_proc_entry("mdstat", NULL);
8646 	for_each_mddev(mddev, tmp) {
8647 		export_array(mddev);
8648 		mddev->hold_active = 0;
8649 	}
8650 	destroy_workqueue(md_misc_wq);
8651 	destroy_workqueue(md_wq);
8652 }
8653 
8654 subsys_initcall(md_init);
8655 module_exit(md_exit)
8656 
8657 static int get_ro(char *buffer, struct kernel_param *kp)
8658 {
8659 	return sprintf(buffer, "%d", start_readonly);
8660 }
8661 static int set_ro(const char *val, struct kernel_param *kp)
8662 {
8663 	char *e;
8664 	int num = simple_strtoul(val, &e, 10);
8665 	if (*val && (*e == '\0' || *e == '\n')) {
8666 		start_readonly = num;
8667 		return 0;
8668 	}
8669 	return -EINVAL;
8670 }
8671 
8672 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
8673 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
8674 
8675 module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
8676 
8677 EXPORT_SYMBOL(register_md_personality);
8678 EXPORT_SYMBOL(unregister_md_personality);
8679 EXPORT_SYMBOL(md_error);
8680 EXPORT_SYMBOL(md_done_sync);
8681 EXPORT_SYMBOL(md_write_start);
8682 EXPORT_SYMBOL(md_write_end);
8683 EXPORT_SYMBOL(md_register_thread);
8684 EXPORT_SYMBOL(md_unregister_thread);
8685 EXPORT_SYMBOL(md_wakeup_thread);
8686 EXPORT_SYMBOL(md_check_recovery);
8687 EXPORT_SYMBOL(md_reap_sync_thread);
8688 MODULE_LICENSE("GPL");
8689 MODULE_DESCRIPTION("MD RAID framework");
8690 MODULE_ALIAS("md");
8691 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
8692