xref: /linux/drivers/md/md.c (revision ed3174d93c342b8b2eeba6bbd124707d55304a7b)
1 /*
2    md.c : Multiple Devices driver for Linux
3 	  Copyright (C) 1998, 1999, 2000 Ingo Molnar
4 
5      completely rewritten, based on the MD driver code from Marc Zyngier
6 
7    Changes:
8 
9    - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
10    - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
11    - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
12    - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
13    - kmod support by: Cyrus Durgin
14    - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
15    - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
16 
17    - lots of fixes and improvements to the RAID1/RAID5 and generic
18      RAID code (such as request based resynchronization):
19 
20      Neil Brown <neilb@cse.unsw.edu.au>.
21 
22    - persistent bitmap code
23      Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
24 
25    This program is free software; you can redistribute it and/or modify
26    it under the terms of the GNU General Public License as published by
27    the Free Software Foundation; either version 2, or (at your option)
28    any later version.
29 
30    You should have received a copy of the GNU General Public License
31    (for example /usr/src/linux/COPYING); if not, write to the Free
32    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
33 */
34 
35 #include <linux/module.h>
36 #include <linux/kernel.h>
37 #include <linux/kthread.h>
38 #include <linux/linkage.h>
39 #include <linux/raid/md.h>
40 #include <linux/raid/bitmap.h>
41 #include <linux/sysctl.h>
42 #include <linux/buffer_head.h> /* for invalidate_bdev */
43 #include <linux/poll.h>
44 #include <linux/mutex.h>
45 #include <linux/ctype.h>
46 #include <linux/freezer.h>
47 
48 #include <linux/init.h>
49 
50 #include <linux/file.h>
51 
52 #ifdef CONFIG_KMOD
53 #include <linux/kmod.h>
54 #endif
55 
56 #include <asm/unaligned.h>
57 
58 #define MAJOR_NR MD_MAJOR
59 #define MD_DRIVER
60 
61 /* 63 partitions with the alternate major number (mdp) */
62 #define MdpMinorShift 6
63 
64 #define DEBUG 0
65 #define dprintk(x...) ((void)(DEBUG && printk(x)))
66 
67 
68 #ifndef MODULE
69 static void autostart_arrays (int part);
70 #endif
71 
72 static LIST_HEAD(pers_list);
73 static DEFINE_SPINLOCK(pers_lock);
74 
75 static void md_print_devices(void);
76 
77 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
78 
79 /*
80  * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
81  * is 1000 KB/sec, so the extra system load does not show up that much.
82  * Increase it if you want to have more _guaranteed_ speed. Note that
83  * the RAID driver will use the maximum available bandwidth if the IO
84  * subsystem is idle. There is also an 'absolute maximum' reconstruction
85  * speed limit - in case reconstruction slows down your system despite
86  * idle IO detection.
87  *
 88  * You can change it via /proc/sys/dev/raid/speed_limit_min and _max,
 89  * or via /sys/block/mdX/md/sync_speed_{min,max}.
90  */
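/*
 * Illustrative sketch (editor's addition, not part of the driver): the
 * per-array sysfs knobs override the global sysctl values, so one way to
 * throttle or unthrottle a single array from user space, assuming an
 * array named md0, would be:
 *
 *   echo 50000  > /proc/sys/dev/raid/speed_limit_min    (global minimum)
 *   echo 100000 > /sys/block/md0/md/sync_speed_max      (this array only)
 *
 * A value of 0 in sync_speed_{min,max} falls back to the system-wide
 * limits, which is what speed_min()/speed_max() below implement.
 */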
91 
92 static int sysctl_speed_limit_min = 1000;
93 static int sysctl_speed_limit_max = 200000;
94 static inline int speed_min(mddev_t *mddev)
95 {
96 	return mddev->sync_speed_min ?
97 		mddev->sync_speed_min : sysctl_speed_limit_min;
98 }
99 
100 static inline int speed_max(mddev_t *mddev)
101 {
102 	return mddev->sync_speed_max ?
103 		mddev->sync_speed_max : sysctl_speed_limit_max;
104 }
105 
106 static struct ctl_table_header *raid_table_header;
107 
108 static ctl_table raid_table[] = {
109 	{
110 		.ctl_name	= DEV_RAID_SPEED_LIMIT_MIN,
111 		.procname	= "speed_limit_min",
112 		.data		= &sysctl_speed_limit_min,
113 		.maxlen		= sizeof(int),
114 		.mode		= S_IRUGO|S_IWUSR,
115 		.proc_handler	= &proc_dointvec,
116 	},
117 	{
118 		.ctl_name	= DEV_RAID_SPEED_LIMIT_MAX,
119 		.procname	= "speed_limit_max",
120 		.data		= &sysctl_speed_limit_max,
121 		.maxlen		= sizeof(int),
122 		.mode		= S_IRUGO|S_IWUSR,
123 		.proc_handler	= &proc_dointvec,
124 	},
125 	{ .ctl_name = 0 }
126 };
127 
128 static ctl_table raid_dir_table[] = {
129 	{
130 		.ctl_name	= DEV_RAID,
131 		.procname	= "raid",
132 		.maxlen		= 0,
133 		.mode		= S_IRUGO|S_IXUGO,
134 		.child		= raid_table,
135 	},
136 	{ .ctl_name = 0 }
137 };
138 
139 static ctl_table raid_root_table[] = {
140 	{
141 		.ctl_name	= CTL_DEV,
142 		.procname	= "dev",
143 		.maxlen		= 0,
144 		.mode		= 0555,
145 		.child		= raid_dir_table,
146 	},
147 	{ .ctl_name = 0 }
148 };
149 
150 static struct block_device_operations md_fops;
151 
152 static int start_readonly;
153 
154 /*
155  * We have a system wide 'event count' that is incremented
156  * on any 'interesting' event, and readers of /proc/mdstat
157  * can use 'poll' or 'select' to find out when the event
158  * count increases.
159  *
160  * Events are:
161  *  start array, stop array, error, add device, remove device,
162  *  start build, activate spare
163  */
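/*
 * Illustrative sketch (editor's addition): a user-space monitor such as
 * mdadm --monitor would typically watch for these events roughly like:
 *
 *   int fd = open("/proc/mdstat", O_RDONLY);
 *   struct pollfd pfd = { .fd = fd, .events = POLLPRI };
 *   poll(&pfd, 1, -1);      // wakes when md_event_count changes
 *   // re-read /proc/mdstat to see what happened
 *
 * The exact revents bits are a property of the mdstat poll implementation
 * elsewhere in md.c, not of this counter itself.
 */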
164 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
165 static atomic_t md_event_count;
166 void md_new_event(mddev_t *mddev)
167 {
168 	atomic_inc(&md_event_count);
169 	wake_up(&md_event_waiters);
170 	sysfs_notify(&mddev->kobj, NULL, "sync_action");
171 }
172 EXPORT_SYMBOL_GPL(md_new_event);
173 
174 /* Alternate version that can be called from interrupts
175  * when calling sysfs_notify isn't needed.
176  */
177 static void md_new_event_inintr(mddev_t *mddev)
178 {
179 	atomic_inc(&md_event_count);
180 	wake_up(&md_event_waiters);
181 }
182 
183 /*
 184  * Enables iteration over all existing md arrays.
 185  * all_mddevs_lock protects this list.
186  */
187 static LIST_HEAD(all_mddevs);
188 static DEFINE_SPINLOCK(all_mddevs_lock);
189 
190 
191 /*
192  * iterates through all used mddevs in the system.
193  * We take care to grab the all_mddevs_lock whenever navigating
194  * the list, and to always hold a refcount when unlocked.
 195  * Any code which breaks out of this loop still owns
 196  * a reference to the current mddev and must mddev_put() it.
197  */
198 #define for_each_mddev(mddev,tmp)					\
199 									\
200 	for (({ spin_lock(&all_mddevs_lock); 				\
201 		tmp = all_mddevs.next;					\
202 		mddev = NULL;});					\
203 	     ({ if (tmp != &all_mddevs)					\
204 			mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
205 		spin_unlock(&all_mddevs_lock);				\
206 		if (mddev) mddev_put(mddev);				\
207 		mddev = list_entry(tmp, mddev_t, all_mddevs);		\
208 		tmp != &all_mddevs;});					\
209 	     ({ spin_lock(&all_mddevs_lock);				\
210 		tmp = tmp->next;})					\
211 		)
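/*
 * Illustrative sketch (editor's addition): a typical walk over all arrays
 * using the macro above, much as md_print_devices() later in this file does:
 *
 *   mddev_t *mddev;
 *   struct list_head *tmp;
 *
 *   for_each_mddev(mddev, tmp)
 *           printk("%s\n", mdname(mddev));
 *
 * Falling off the end of the loop drops the reference automatically; code
 * that breaks out early still holds one and must call mddev_put(mddev).
 */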
212 
213 
214 static int md_fail_request (struct request_queue *q, struct bio *bio)
215 {
216 	bio_io_error(bio);
217 	return 0;
218 }
219 
220 static inline mddev_t *mddev_get(mddev_t *mddev)
221 {
222 	atomic_inc(&mddev->active);
223 	return mddev;
224 }
225 
226 static void mddev_put(mddev_t *mddev)
227 {
228 	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
229 		return;
230 	if (!mddev->raid_disks && list_empty(&mddev->disks)) {
231 		list_del(&mddev->all_mddevs);
232 		spin_unlock(&all_mddevs_lock);
233 		blk_cleanup_queue(mddev->queue);
234 		kobject_put(&mddev->kobj);
235 	} else
236 		spin_unlock(&all_mddevs_lock);
237 }
238 
239 static mddev_t * mddev_find(dev_t unit)
240 {
241 	mddev_t *mddev, *new = NULL;
242 
243  retry:
244 	spin_lock(&all_mddevs_lock);
245 	list_for_each_entry(mddev, &all_mddevs, all_mddevs)
246 		if (mddev->unit == unit) {
247 			mddev_get(mddev);
248 			spin_unlock(&all_mddevs_lock);
249 			kfree(new);
250 			return mddev;
251 		}
252 
253 	if (new) {
254 		list_add(&new->all_mddevs, &all_mddevs);
255 		spin_unlock(&all_mddevs_lock);
256 		return new;
257 	}
258 	spin_unlock(&all_mddevs_lock);
259 
260 	new = kzalloc(sizeof(*new), GFP_KERNEL);
261 	if (!new)
262 		return NULL;
263 
264 	new->unit = unit;
265 	if (MAJOR(unit) == MD_MAJOR)
266 		new->md_minor = MINOR(unit);
267 	else
268 		new->md_minor = MINOR(unit) >> MdpMinorShift;
269 
270 	mutex_init(&new->reconfig_mutex);
271 	INIT_LIST_HEAD(&new->disks);
272 	INIT_LIST_HEAD(&new->all_mddevs);
273 	init_timer(&new->safemode_timer);
274 	atomic_set(&new->active, 1);
275 	spin_lock_init(&new->write_lock);
276 	init_waitqueue_head(&new->sb_wait);
277 	new->reshape_position = MaxSector;
278 	new->resync_max = MaxSector;
279 
280 	new->queue = blk_alloc_queue(GFP_KERNEL);
281 	if (!new->queue) {
282 		kfree(new);
283 		return NULL;
284 	}
285 	set_bit(QUEUE_FLAG_CLUSTER, &new->queue->queue_flags);
286 
287 	blk_queue_make_request(new->queue, md_fail_request);
288 
289 	goto retry;
290 }
291 
292 static inline int mddev_lock(mddev_t * mddev)
293 {
294 	return mutex_lock_interruptible(&mddev->reconfig_mutex);
295 }
296 
297 static inline int mddev_trylock(mddev_t * mddev)
298 {
299 	return mutex_trylock(&mddev->reconfig_mutex);
300 }
301 
302 static inline void mddev_unlock(mddev_t * mddev)
303 {
304 	mutex_unlock(&mddev->reconfig_mutex);
305 
306 	md_wakeup_thread(mddev->thread);
307 }
308 
309 static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
310 {
311 	mdk_rdev_t * rdev;
312 	struct list_head *tmp;
313 
314 	rdev_for_each(rdev, tmp, mddev) {
315 		if (rdev->desc_nr == nr)
316 			return rdev;
317 	}
318 	return NULL;
319 }
320 
321 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
322 {
323 	struct list_head *tmp;
324 	mdk_rdev_t *rdev;
325 
326 	rdev_for_each(rdev, tmp, mddev) {
327 		if (rdev->bdev->bd_dev == dev)
328 			return rdev;
329 	}
330 	return NULL;
331 }
332 
333 static struct mdk_personality *find_pers(int level, char *clevel)
334 {
335 	struct mdk_personality *pers;
336 	list_for_each_entry(pers, &pers_list, list) {
337 		if (level != LEVEL_NONE && pers->level == level)
338 			return pers;
339 		if (strcmp(pers->name, clevel)==0)
340 			return pers;
341 	}
342 	return NULL;
343 }
344 
345 static inline sector_t calc_dev_sboffset(struct block_device *bdev)
346 {
347 	sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
348 	return MD_NEW_SIZE_BLOCKS(size);
349 }
350 
351 static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
352 {
353 	sector_t size;
354 
355 	size = rdev->sb_offset;
356 
357 	if (chunk_size)
358 		size &= ~((sector_t)chunk_size/1024 - 1);
359 	return size;
360 }
361 
362 static int alloc_disk_sb(mdk_rdev_t * rdev)
363 {
364 	if (rdev->sb_page)
365 		MD_BUG();
366 
367 	rdev->sb_page = alloc_page(GFP_KERNEL);
368 	if (!rdev->sb_page) {
369 		printk(KERN_ALERT "md: out of memory.\n");
370 		return -EINVAL;
371 	}
372 
373 	return 0;
374 }
375 
376 static void free_disk_sb(mdk_rdev_t * rdev)
377 {
378 	if (rdev->sb_page) {
379 		put_page(rdev->sb_page);
380 		rdev->sb_loaded = 0;
381 		rdev->sb_page = NULL;
382 		rdev->sb_offset = 0;
383 		rdev->size = 0;
384 	}
385 }
386 
387 
388 static void super_written(struct bio *bio, int error)
389 {
390 	mdk_rdev_t *rdev = bio->bi_private;
391 	mddev_t *mddev = rdev->mddev;
392 
393 	if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
394 		printk("md: super_written gets error=%d, uptodate=%d\n",
395 		       error, test_bit(BIO_UPTODATE, &bio->bi_flags));
396 		WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
397 		md_error(mddev, rdev);
398 	}
399 
400 	if (atomic_dec_and_test(&mddev->pending_writes))
401 		wake_up(&mddev->sb_wait);
402 	bio_put(bio);
403 }
404 
405 static void super_written_barrier(struct bio *bio, int error)
406 {
407 	struct bio *bio2 = bio->bi_private;
408 	mdk_rdev_t *rdev = bio2->bi_private;
409 	mddev_t *mddev = rdev->mddev;
410 
411 	if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
412 	    error == -EOPNOTSUPP) {
413 		unsigned long flags;
414 		/* barriers don't appear to be supported :-( */
415 		set_bit(BarriersNotsupp, &rdev->flags);
416 		mddev->barriers_work = 0;
417 		spin_lock_irqsave(&mddev->write_lock, flags);
418 		bio2->bi_next = mddev->biolist;
419 		mddev->biolist = bio2;
420 		spin_unlock_irqrestore(&mddev->write_lock, flags);
421 		wake_up(&mddev->sb_wait);
422 		bio_put(bio);
423 	} else {
424 		bio_put(bio2);
425 		bio->bi_private = rdev;
426 		super_written(bio, error);
427 	}
428 }
429 
430 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
431 		   sector_t sector, int size, struct page *page)
432 {
	/* Write the first 'size' bytes of 'page' to 'sector' of rdev.
434 	 * Increment mddev->pending_writes before returning
435 	 * and decrement it on completion, waking up sb_wait
436 	 * if zero is reached.
	 * If an error occurs, call md_error.
438 	 *
439 	 * As we might need to resubmit the request if BIO_RW_BARRIER
440 	 * causes ENOTSUPP, we allocate a spare bio...
441 	 */
442 	struct bio *bio = bio_alloc(GFP_NOIO, 1);
443 	int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC);
444 
445 	bio->bi_bdev = rdev->bdev;
446 	bio->bi_sector = sector;
447 	bio_add_page(bio, page, size, 0);
448 	bio->bi_private = rdev;
449 	bio->bi_end_io = super_written;
450 	bio->bi_rw = rw;
451 
452 	atomic_inc(&mddev->pending_writes);
453 	if (!test_bit(BarriersNotsupp, &rdev->flags)) {
454 		struct bio *rbio;
455 		rw |= (1<<BIO_RW_BARRIER);
456 		rbio = bio_clone(bio, GFP_NOIO);
457 		rbio->bi_private = bio;
458 		rbio->bi_end_io = super_written_barrier;
459 		submit_bio(rw, rbio);
460 	} else
461 		submit_bio(rw, bio);
462 }
463 
464 void md_super_wait(mddev_t *mddev)
465 {
 466 	/* Wait for all superblock writes that were scheduled to complete.
 467 	 * If any had to be retried (due to BARRIER problems), retry them.
468 	 */
469 	DEFINE_WAIT(wq);
470 	for(;;) {
471 		prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
472 		if (atomic_read(&mddev->pending_writes)==0)
473 			break;
474 		while (mddev->biolist) {
475 			struct bio *bio;
476 			spin_lock_irq(&mddev->write_lock);
477 			bio = mddev->biolist;
478 			mddev->biolist = bio->bi_next ;
479 			bio->bi_next = NULL;
480 			spin_unlock_irq(&mddev->write_lock);
481 			submit_bio(bio->bi_rw, bio);
482 		}
483 		schedule();
484 	}
485 	finish_wait(&mddev->sb_wait, &wq);
486 }
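/*
 * Illustrative sketch (editor's addition): callers pair the two routines
 * above to push superblocks out and then block until every write (including
 * any barrier-retry resubmission) has finished, e.g. as md_update_sb() does
 * further down:
 *
 *   md_super_write(mddev, rdev, rdev->sb_offset << 1,
 *                  rdev->sb_size, rdev->sb_page);
 *   ...
 *   md_super_wait(mddev);
 */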
487 
488 static void bi_complete(struct bio *bio, int error)
489 {
490 	complete((struct completion*)bio->bi_private);
491 }
492 
493 int sync_page_io(struct block_device *bdev, sector_t sector, int size,
494 		   struct page *page, int rw)
495 {
496 	struct bio *bio = bio_alloc(GFP_NOIO, 1);
497 	struct completion event;
498 	int ret;
499 
500 	rw |= (1 << BIO_RW_SYNC);
501 
502 	bio->bi_bdev = bdev;
503 	bio->bi_sector = sector;
504 	bio_add_page(bio, page, size, 0);
505 	init_completion(&event);
506 	bio->bi_private = &event;
507 	bio->bi_end_io = bi_complete;
508 	submit_bio(rw, bio);
509 	wait_for_completion(&event);
510 
511 	ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
512 	bio_put(bio);
513 	return ret;
514 }
515 EXPORT_SYMBOL_GPL(sync_page_io);
516 
517 static int read_disk_sb(mdk_rdev_t * rdev, int size)
518 {
519 	char b[BDEVNAME_SIZE];
520 	if (!rdev->sb_page) {
521 		MD_BUG();
522 		return -EINVAL;
523 	}
524 	if (rdev->sb_loaded)
525 		return 0;
526 
527 
528 	if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ))
529 		goto fail;
530 	rdev->sb_loaded = 1;
531 	return 0;
532 
533 fail:
534 	printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
535 		bdevname(rdev->bdev,b));
536 	return -EINVAL;
537 }
538 
539 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
540 {
541 	if (	(sb1->set_uuid0 == sb2->set_uuid0) &&
542 		(sb1->set_uuid1 == sb2->set_uuid1) &&
543 		(sb1->set_uuid2 == sb2->set_uuid2) &&
544 		(sb1->set_uuid3 == sb2->set_uuid3))
545 
546 		return 1;
547 
548 	return 0;
549 }
550 
551 
552 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
553 {
554 	int ret;
555 	mdp_super_t *tmp1, *tmp2;
556 
557 	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
558 	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
559 
560 	if (!tmp1 || !tmp2) {
561 		ret = 0;
562 		printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
563 		goto abort;
564 	}
565 
566 	*tmp1 = *sb1;
567 	*tmp2 = *sb2;
568 
569 	/*
570 	 * nr_disks is not constant
571 	 */
572 	tmp1->nr_disks = 0;
573 	tmp2->nr_disks = 0;
574 
575 	if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
576 		ret = 0;
577 	else
578 		ret = 1;
579 
580 abort:
581 	kfree(tmp1);
582 	kfree(tmp2);
583 	return ret;
584 }
585 
586 
587 static u32 md_csum_fold(u32 csum)
588 {
589 	csum = (csum & 0xffff) + (csum >> 16);
590 	return (csum & 0xffff) + (csum >> 16);
591 }
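/*
 * Worked example (editor's addition): folding 0x0001ffff gives
 * 0xffff + 0x0001 = 0x10000 on the first pass and 0x0000 + 0x0001 = 0x0001
 * on the second, so md_csum_fold(0x0001ffff) == 0x0001; the result always
 * fits in 16 bits.
 */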
592 
593 static unsigned int calc_sb_csum(mdp_super_t * sb)
594 {
595 	u64 newcsum = 0;
596 	u32 *sb32 = (u32*)sb;
597 	int i;
598 	unsigned int disk_csum, csum;
599 
600 	disk_csum = sb->sb_csum;
601 	sb->sb_csum = 0;
602 
603 	for (i = 0; i < MD_SB_BYTES/4 ; i++)
604 		newcsum += sb32[i];
605 	csum = (newcsum & 0xffffffff) + (newcsum>>32);
606 
607 
608 #ifdef CONFIG_ALPHA
609 	/* This used to use csum_partial, which was wrong for several
610 	 * reasons including that different results are returned on
611 	 * different architectures.  It isn't critical that we get exactly
612 	 * the same return value as before (we always csum_fold before
613 	 * testing, and that removes any differences).  However as we
614 	 * know that csum_partial always returned a 16bit value on
615 	 * alphas, do a fold to maximise conformity to previous behaviour.
616 	 */
617 	sb->sb_csum = md_csum_fold(disk_csum);
618 #else
619 	sb->sb_csum = disk_csum;
620 #endif
621 	return csum;
622 }
623 
624 
625 /*
626  * Handle superblock details.
627  * We want to be able to handle multiple superblock formats
628  * so we have a common interface to them all, and an array of
629  * different handlers.
630  * We rely on user-space to write the initial superblock, and support
631  * reading and updating of superblocks.
632  * Interface methods are:
633  *   int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
634  *      loads and validates a superblock on dev.
635  *      if refdev != NULL, compare superblocks on both devices
636  *    Return:
637  *      0 - dev has a superblock that is compatible with refdev
638  *      1 - dev has a superblock that is compatible and newer than refdev
639  *          so dev should be used as the refdev in future
640  *     -EINVAL superblock incompatible or invalid
641  *     -othererror e.g. -EIO
642  *
643  *   int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
644  *      Verify that dev is acceptable into mddev.
645  *       The first time, mddev->raid_disks will be 0, and data from
646  *       dev should be merged in.  Subsequent calls check that dev
647  *       is new enough.  Return 0 or -EINVAL
648  *
649  *   void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
650  *     Update the superblock for rdev with data in mddev
651  *     This does not write to disc.
652  *
653  */
654 
655 struct super_type  {
656 	char 		*name;
657 	struct module	*owner;
658 	int		(*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
659 	int		(*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
660 	void		(*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
661 };
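/*
 * Illustrative sketch (editor's addition): array assembly elsewhere in this
 * file drives the interface described above roughly as follows, keeping the
 * freshest device found so far as the reference:
 *
 *   err = super_types[ver].load_super(rdev, freshest, minor_version);
 *   if (err == 1)
 *           freshest = rdev;           // rdev is newer, use it as refdev
 *   ...
 *   err = super_types[ver].validate_super(mddev, rdev);
 *   ...
 *   super_types[ver].sync_super(mddev, rdev);  // refresh in-memory sb
 *
 * 'ver' here stands in for mddev->major_version / the detected format index.
 */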
662 
663 /*
664  * load_super for 0.90.0
665  */
666 static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
667 {
668 	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
669 	mdp_super_t *sb;
670 	int ret;
671 	sector_t sb_offset;
672 
673 	/*
 674 	 * Calculate the position of the superblock;
 675 	 * it's at the end of the disk.
 676 	 *
 677 	 * It also happens to be a multiple of 4KB.
678 	 */
679 	sb_offset = calc_dev_sboffset(rdev->bdev);
680 	rdev->sb_offset = sb_offset;
681 
682 	ret = read_disk_sb(rdev, MD_SB_BYTES);
683 	if (ret) return ret;
684 
685 	ret = -EINVAL;
686 
687 	bdevname(rdev->bdev, b);
688 	sb = (mdp_super_t*)page_address(rdev->sb_page);
689 
690 	if (sb->md_magic != MD_SB_MAGIC) {
691 		printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
692 		       b);
693 		goto abort;
694 	}
695 
696 	if (sb->major_version != 0 ||
697 	    sb->minor_version < 90 ||
698 	    sb->minor_version > 91) {
699 		printk(KERN_WARNING "Bad version number %d.%d on %s\n",
700 			sb->major_version, sb->minor_version,
701 			b);
702 		goto abort;
703 	}
704 
705 	if (sb->raid_disks <= 0)
706 		goto abort;
707 
708 	if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
709 		printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
710 			b);
711 		goto abort;
712 	}
713 
714 	rdev->preferred_minor = sb->md_minor;
715 	rdev->data_offset = 0;
716 	rdev->sb_size = MD_SB_BYTES;
717 
718 	if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) {
719 		if (sb->level != 1 && sb->level != 4
720 		    && sb->level != 5 && sb->level != 6
721 		    && sb->level != 10) {
722 			/* FIXME use a better test */
723 			printk(KERN_WARNING
724 			       "md: bitmaps not supported for this level.\n");
725 			goto abort;
726 		}
727 	}
728 
729 	if (sb->level == LEVEL_MULTIPATH)
730 		rdev->desc_nr = -1;
731 	else
732 		rdev->desc_nr = sb->this_disk.number;
733 
734 	if (refdev == 0)
735 		ret = 1;
736 	else {
737 		__u64 ev1, ev2;
738 		mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
739 		if (!uuid_equal(refsb, sb)) {
740 			printk(KERN_WARNING "md: %s has different UUID to %s\n",
741 				b, bdevname(refdev->bdev,b2));
742 			goto abort;
743 		}
744 		if (!sb_equal(refsb, sb)) {
745 			printk(KERN_WARNING "md: %s has same UUID"
746 			       " but different superblock to %s\n",
747 			       b, bdevname(refdev->bdev, b2));
748 			goto abort;
749 		}
750 		ev1 = md_event(sb);
751 		ev2 = md_event(refsb);
752 		if (ev1 > ev2)
753 			ret = 1;
754 		else
755 			ret = 0;
756 	}
757 	rdev->size = calc_dev_size(rdev, sb->chunk_size);
758 
759 	if (rdev->size < sb->size && sb->level > 1)
760 		/* "this cannot possibly happen" ... */
761 		ret = -EINVAL;
762 
763  abort:
764 	return ret;
765 }
766 
767 /*
768  * validate_super for 0.90.0
769  */
770 static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
771 {
772 	mdp_disk_t *desc;
773 	mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
774 	__u64 ev1 = md_event(sb);
775 
776 	rdev->raid_disk = -1;
777 	clear_bit(Faulty, &rdev->flags);
778 	clear_bit(In_sync, &rdev->flags);
779 	clear_bit(WriteMostly, &rdev->flags);
780 	clear_bit(BarriersNotsupp, &rdev->flags);
781 
782 	if (mddev->raid_disks == 0) {
783 		mddev->major_version = 0;
784 		mddev->minor_version = sb->minor_version;
785 		mddev->patch_version = sb->patch_version;
786 		mddev->external = 0;
787 		mddev->chunk_size = sb->chunk_size;
788 		mddev->ctime = sb->ctime;
789 		mddev->utime = sb->utime;
790 		mddev->level = sb->level;
791 		mddev->clevel[0] = 0;
792 		mddev->layout = sb->layout;
793 		mddev->raid_disks = sb->raid_disks;
794 		mddev->size = sb->size;
795 		mddev->events = ev1;
796 		mddev->bitmap_offset = 0;
797 		mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
798 
799 		if (mddev->minor_version >= 91) {
800 			mddev->reshape_position = sb->reshape_position;
801 			mddev->delta_disks = sb->delta_disks;
802 			mddev->new_level = sb->new_level;
803 			mddev->new_layout = sb->new_layout;
804 			mddev->new_chunk = sb->new_chunk;
805 		} else {
806 			mddev->reshape_position = MaxSector;
807 			mddev->delta_disks = 0;
808 			mddev->new_level = mddev->level;
809 			mddev->new_layout = mddev->layout;
810 			mddev->new_chunk = mddev->chunk_size;
811 		}
812 
813 		if (sb->state & (1<<MD_SB_CLEAN))
814 			mddev->recovery_cp = MaxSector;
815 		else {
816 			if (sb->events_hi == sb->cp_events_hi &&
817 				sb->events_lo == sb->cp_events_lo) {
818 				mddev->recovery_cp = sb->recovery_cp;
819 			} else
820 				mddev->recovery_cp = 0;
821 		}
822 
823 		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
824 		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
825 		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
826 		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
827 
828 		mddev->max_disks = MD_SB_DISKS;
829 
830 		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
831 		    mddev->bitmap_file == NULL)
832 			mddev->bitmap_offset = mddev->default_bitmap_offset;
833 
834 	} else if (mddev->pers == NULL) {
835 		/* Insist on good event counter while assembling */
836 		++ev1;
837 		if (ev1 < mddev->events)
838 			return -EINVAL;
839 	} else if (mddev->bitmap) {
840 		/* if adding to array with a bitmap, then we can accept an
841 		 * older device ... but not too old.
842 		 */
843 		if (ev1 < mddev->bitmap->events_cleared)
844 			return 0;
845 	} else {
846 		if (ev1 < mddev->events)
847 			/* just a hot-add of a new device, leave raid_disk at -1 */
848 			return 0;
849 	}
850 
851 	if (mddev->level != LEVEL_MULTIPATH) {
852 		desc = sb->disks + rdev->desc_nr;
853 
854 		if (desc->state & (1<<MD_DISK_FAULTY))
855 			set_bit(Faulty, &rdev->flags);
856 		else if (desc->state & (1<<MD_DISK_SYNC) /* &&
857 			    desc->raid_disk < mddev->raid_disks */) {
858 			set_bit(In_sync, &rdev->flags);
859 			rdev->raid_disk = desc->raid_disk;
860 		}
861 		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
862 			set_bit(WriteMostly, &rdev->flags);
863 	} else /* MULTIPATH are always insync */
864 		set_bit(In_sync, &rdev->flags);
865 	return 0;
866 }
867 
868 /*
869  * sync_super for 0.90.0
870  */
871 static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
872 {
873 	mdp_super_t *sb;
874 	struct list_head *tmp;
875 	mdk_rdev_t *rdev2;
876 	int next_spare = mddev->raid_disks;
877 
878 
879 	/* make rdev->sb match mddev data..
880 	 *
881 	 * 1/ zero out disks
882 	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
883 	 * 3/ any empty disks < next_spare become removed
884 	 *
885 	 * disks[0] gets initialised to REMOVED because
886 	 * we cannot be sure from other fields if it has
887 	 * been initialised or not.
888 	 */
889 	int i;
890 	int active=0, working=0,failed=0,spare=0,nr_disks=0;
891 
892 	rdev->sb_size = MD_SB_BYTES;
893 
894 	sb = (mdp_super_t*)page_address(rdev->sb_page);
895 
896 	memset(sb, 0, sizeof(*sb));
897 
898 	sb->md_magic = MD_SB_MAGIC;
899 	sb->major_version = mddev->major_version;
900 	sb->patch_version = mddev->patch_version;
901 	sb->gvalid_words  = 0; /* ignored */
902 	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
903 	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
904 	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
905 	memcpy(&sb->set_uuid3, mddev->uuid+12,4);
906 
907 	sb->ctime = mddev->ctime;
908 	sb->level = mddev->level;
909 	sb->size  = mddev->size;
910 	sb->raid_disks = mddev->raid_disks;
911 	sb->md_minor = mddev->md_minor;
912 	sb->not_persistent = 0;
913 	sb->utime = mddev->utime;
914 	sb->state = 0;
915 	sb->events_hi = (mddev->events>>32);
916 	sb->events_lo = (u32)mddev->events;
917 
918 	if (mddev->reshape_position == MaxSector)
919 		sb->minor_version = 90;
920 	else {
921 		sb->minor_version = 91;
922 		sb->reshape_position = mddev->reshape_position;
923 		sb->new_level = mddev->new_level;
924 		sb->delta_disks = mddev->delta_disks;
925 		sb->new_layout = mddev->new_layout;
926 		sb->new_chunk = mddev->new_chunk;
927 	}
928 	mddev->minor_version = sb->minor_version;
929 	if (mddev->in_sync)
930 	{
931 		sb->recovery_cp = mddev->recovery_cp;
932 		sb->cp_events_hi = (mddev->events>>32);
933 		sb->cp_events_lo = (u32)mddev->events;
934 		if (mddev->recovery_cp == MaxSector)
935 			sb->state = (1<< MD_SB_CLEAN);
936 	} else
937 		sb->recovery_cp = 0;
938 
939 	sb->layout = mddev->layout;
940 	sb->chunk_size = mddev->chunk_size;
941 
942 	if (mddev->bitmap && mddev->bitmap_file == NULL)
943 		sb->state |= (1<<MD_SB_BITMAP_PRESENT);
944 
945 	sb->disks[0].state = (1<<MD_DISK_REMOVED);
946 	rdev_for_each(rdev2, tmp, mddev) {
947 		mdp_disk_t *d;
948 		int desc_nr;
949 		if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
950 		    && !test_bit(Faulty, &rdev2->flags))
951 			desc_nr = rdev2->raid_disk;
952 		else
953 			desc_nr = next_spare++;
954 		rdev2->desc_nr = desc_nr;
955 		d = &sb->disks[rdev2->desc_nr];
956 		nr_disks++;
957 		d->number = rdev2->desc_nr;
958 		d->major = MAJOR(rdev2->bdev->bd_dev);
959 		d->minor = MINOR(rdev2->bdev->bd_dev);
960 		if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
961 		    && !test_bit(Faulty, &rdev2->flags))
962 			d->raid_disk = rdev2->raid_disk;
963 		else
964 			d->raid_disk = rdev2->desc_nr; /* compatibility */
965 		if (test_bit(Faulty, &rdev2->flags))
966 			d->state = (1<<MD_DISK_FAULTY);
967 		else if (test_bit(In_sync, &rdev2->flags)) {
968 			d->state = (1<<MD_DISK_ACTIVE);
969 			d->state |= (1<<MD_DISK_SYNC);
970 			active++;
971 			working++;
972 		} else {
973 			d->state = 0;
974 			spare++;
975 			working++;
976 		}
977 		if (test_bit(WriteMostly, &rdev2->flags))
978 			d->state |= (1<<MD_DISK_WRITEMOSTLY);
979 	}
980 	/* now set the "removed" and "faulty" bits on any missing devices */
981 	for (i=0 ; i < mddev->raid_disks ; i++) {
982 		mdp_disk_t *d = &sb->disks[i];
983 		if (d->state == 0 && d->number == 0) {
984 			d->number = i;
985 			d->raid_disk = i;
986 			d->state = (1<<MD_DISK_REMOVED);
987 			d->state |= (1<<MD_DISK_FAULTY);
988 			failed++;
989 		}
990 	}
991 	sb->nr_disks = nr_disks;
992 	sb->active_disks = active;
993 	sb->working_disks = working;
994 	sb->failed_disks = failed;
995 	sb->spare_disks = spare;
996 
997 	sb->this_disk = sb->disks[rdev->desc_nr];
998 	sb->sb_csum = calc_sb_csum(sb);
999 }
1000 
1001 /*
1002  * version 1 superblock
1003  */
1004 
1005 static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
1006 {
1007 	__le32 disk_csum;
1008 	u32 csum;
1009 	unsigned long long newcsum;
1010 	int size = 256 + le32_to_cpu(sb->max_dev)*2;
1011 	__le32 *isuper = (__le32*)sb;
1012 	int i;
1013 
1014 	disk_csum = sb->sb_csum;
1015 	sb->sb_csum = 0;
1016 	newcsum = 0;
1017 	for (i=0; size>=4; size -= 4 )
1018 		newcsum += le32_to_cpu(*isuper++);
1019 
1020 	if (size == 2)
1021 		newcsum += le16_to_cpu(*(__le16*) isuper);
1022 
1023 	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1024 	sb->sb_csum = disk_csum;
1025 	return cpu_to_le32(csum);
1026 }
1027 
1028 static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1029 {
1030 	struct mdp_superblock_1 *sb;
1031 	int ret;
1032 	sector_t sb_offset;
1033 	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1034 	int bmask;
1035 
1036 	/*
1037 	 * Calculate the position of the superblock.
1038 	 * It is always aligned to a 4K boundary and
 1039 	 * depending on minor_version, it can be:
1040 	 * 0: At least 8K, but less than 12K, from end of device
1041 	 * 1: At start of device
1042 	 * 2: 4K from start of device.
1043 	 */
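	/*
	 * Worked example (editor's addition): for minor_version 0 on a device
	 * of 1000005 512-byte sectors, the code below computes
	 * 1000005 - 16 = 999989, rounds down to an 8-sector (4K) boundary to
	 * get 999984, and divides by 2, giving sb_offset = 499992 in 1K units,
	 * i.e. between 8K and 12K from the end of the device, 4K aligned.
	 */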
1044 	switch(minor_version) {
1045 	case 0:
1046 		sb_offset = rdev->bdev->bd_inode->i_size >> 9;
1047 		sb_offset -= 8*2;
1048 		sb_offset &= ~(sector_t)(4*2-1);
1049 		/* convert from sectors to K */
1050 		sb_offset /= 2;
1051 		break;
1052 	case 1:
1053 		sb_offset = 0;
1054 		break;
1055 	case 2:
1056 		sb_offset = 4;
1057 		break;
1058 	default:
1059 		return -EINVAL;
1060 	}
1061 	rdev->sb_offset = sb_offset;
1062 
 1063 	/* The superblock is rarely larger than 1K, but it can be larger,
 1064 	 * and it is safe to read 4K, so we do that.
1065 	 */
1066 	ret = read_disk_sb(rdev, 4096);
1067 	if (ret) return ret;
1068 
1069 
1070 	sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1071 
1072 	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1073 	    sb->major_version != cpu_to_le32(1) ||
1074 	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1075 	    le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
1076 	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1077 		return -EINVAL;
1078 
1079 	if (calc_sb_1_csum(sb) != sb->sb_csum) {
1080 		printk("md: invalid superblock checksum on %s\n",
1081 			bdevname(rdev->bdev,b));
1082 		return -EINVAL;
1083 	}
1084 	if (le64_to_cpu(sb->data_size) < 10) {
1085 		printk("md: data_size too small on %s\n",
1086 		       bdevname(rdev->bdev,b));
1087 		return -EINVAL;
1088 	}
1089 	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) {
1090 		if (sb->level != cpu_to_le32(1) &&
1091 		    sb->level != cpu_to_le32(4) &&
1092 		    sb->level != cpu_to_le32(5) &&
1093 		    sb->level != cpu_to_le32(6) &&
1094 		    sb->level != cpu_to_le32(10)) {
1095 			printk(KERN_WARNING
1096 			       "md: bitmaps not supported for this level.\n");
1097 			return -EINVAL;
1098 		}
1099 	}
1100 
1101 	rdev->preferred_minor = 0xffff;
1102 	rdev->data_offset = le64_to_cpu(sb->data_offset);
1103 	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1104 
1105 	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1106 	bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1;
1107 	if (rdev->sb_size & bmask)
 1108 		rdev->sb_size = (rdev->sb_size | bmask)+1;
1109 
1110 	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1111 		rdev->desc_nr = -1;
1112 	else
1113 		rdev->desc_nr = le32_to_cpu(sb->dev_number);
1114 
1115 	if (refdev == 0)
1116 		ret = 1;
1117 	else {
1118 		__u64 ev1, ev2;
1119 		struct mdp_superblock_1 *refsb =
1120 			(struct mdp_superblock_1*)page_address(refdev->sb_page);
1121 
1122 		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1123 		    sb->level != refsb->level ||
1124 		    sb->layout != refsb->layout ||
1125 		    sb->chunksize != refsb->chunksize) {
1126 			printk(KERN_WARNING "md: %s has strangely different"
1127 				" superblock to %s\n",
1128 				bdevname(rdev->bdev,b),
1129 				bdevname(refdev->bdev,b2));
1130 			return -EINVAL;
1131 		}
1132 		ev1 = le64_to_cpu(sb->events);
1133 		ev2 = le64_to_cpu(refsb->events);
1134 
1135 		if (ev1 > ev2)
1136 			ret = 1;
1137 		else
1138 			ret = 0;
1139 	}
1140 	if (minor_version)
1141 		rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
1142 	else
1143 		rdev->size = rdev->sb_offset;
1144 	if (rdev->size < le64_to_cpu(sb->data_size)/2)
1145 		return -EINVAL;
1146 	rdev->size = le64_to_cpu(sb->data_size)/2;
1147 	if (le32_to_cpu(sb->chunksize))
1148 		rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
1149 
1150 	if (le64_to_cpu(sb->size) > rdev->size*2)
1151 		return -EINVAL;
1152 	return ret;
1153 }
1154 
1155 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1156 {
1157 	struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1158 	__u64 ev1 = le64_to_cpu(sb->events);
1159 
1160 	rdev->raid_disk = -1;
1161 	clear_bit(Faulty, &rdev->flags);
1162 	clear_bit(In_sync, &rdev->flags);
1163 	clear_bit(WriteMostly, &rdev->flags);
1164 	clear_bit(BarriersNotsupp, &rdev->flags);
1165 
1166 	if (mddev->raid_disks == 0) {
1167 		mddev->major_version = 1;
1168 		mddev->patch_version = 0;
1169 		mddev->external = 0;
1170 		mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
1171 		mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
1172 		mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
1173 		mddev->level = le32_to_cpu(sb->level);
1174 		mddev->clevel[0] = 0;
1175 		mddev->layout = le32_to_cpu(sb->layout);
1176 		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1177 		mddev->size = le64_to_cpu(sb->size)/2;
1178 		mddev->events = ev1;
1179 		mddev->bitmap_offset = 0;
1180 		mddev->default_bitmap_offset = 1024 >> 9;
1181 
1182 		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1183 		memcpy(mddev->uuid, sb->set_uuid, 16);
1184 
1185 		mddev->max_disks =  (4096-256)/2;
1186 
1187 		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1188 		    mddev->bitmap_file == NULL )
1189 			mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
1190 
1191 		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1192 			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1193 			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1194 			mddev->new_level = le32_to_cpu(sb->new_level);
1195 			mddev->new_layout = le32_to_cpu(sb->new_layout);
1196 			mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9;
1197 		} else {
1198 			mddev->reshape_position = MaxSector;
1199 			mddev->delta_disks = 0;
1200 			mddev->new_level = mddev->level;
1201 			mddev->new_layout = mddev->layout;
1202 			mddev->new_chunk = mddev->chunk_size;
1203 		}
1204 
1205 	} else if (mddev->pers == NULL) {
 1206 		/* Insist on good event counter while assembling */
1207 		++ev1;
1208 		if (ev1 < mddev->events)
1209 			return -EINVAL;
1210 	} else if (mddev->bitmap) {
1211 		/* If adding to array with a bitmap, then we can accept an
1212 		 * older device, but not too old.
1213 		 */
1214 		if (ev1 < mddev->bitmap->events_cleared)
1215 			return 0;
1216 	} else {
1217 		if (ev1 < mddev->events)
1218 			/* just a hot-add of a new device, leave raid_disk at -1 */
1219 			return 0;
1220 	}
1221 	if (mddev->level != LEVEL_MULTIPATH) {
1222 		int role;
1223 		role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1224 		switch(role) {
1225 		case 0xffff: /* spare */
1226 			break;
1227 		case 0xfffe: /* faulty */
1228 			set_bit(Faulty, &rdev->flags);
1229 			break;
1230 		default:
1231 			if ((le32_to_cpu(sb->feature_map) &
1232 			     MD_FEATURE_RECOVERY_OFFSET))
1233 				rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1234 			else
1235 				set_bit(In_sync, &rdev->flags);
1236 			rdev->raid_disk = role;
1237 			break;
1238 		}
1239 		if (sb->devflags & WriteMostly1)
1240 			set_bit(WriteMostly, &rdev->flags);
1241 	} else /* MULTIPATH are always insync */
1242 		set_bit(In_sync, &rdev->flags);
1243 
1244 	return 0;
1245 }
1246 
1247 static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1248 {
1249 	struct mdp_superblock_1 *sb;
1250 	struct list_head *tmp;
1251 	mdk_rdev_t *rdev2;
1252 	int max_dev, i;
1253 	/* make rdev->sb match mddev and rdev data. */
1254 
1255 	sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1256 
1257 	sb->feature_map = 0;
1258 	sb->pad0 = 0;
1259 	sb->recovery_offset = cpu_to_le64(0);
1260 	memset(sb->pad1, 0, sizeof(sb->pad1));
1261 	memset(sb->pad2, 0, sizeof(sb->pad2));
1262 	memset(sb->pad3, 0, sizeof(sb->pad3));
1263 
1264 	sb->utime = cpu_to_le64((__u64)mddev->utime);
1265 	sb->events = cpu_to_le64(mddev->events);
1266 	if (mddev->in_sync)
1267 		sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1268 	else
1269 		sb->resync_offset = cpu_to_le64(0);
1270 
1271 	sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
1272 
1273 	sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1274 	sb->size = cpu_to_le64(mddev->size<<1);
1275 
1276 	if (mddev->bitmap && mddev->bitmap_file == NULL) {
1277 		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
1278 		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1279 	}
1280 
1281 	if (rdev->raid_disk >= 0 &&
1282 	    !test_bit(In_sync, &rdev->flags) &&
1283 	    rdev->recovery_offset > 0) {
1284 		sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1285 		sb->recovery_offset = cpu_to_le64(rdev->recovery_offset);
1286 	}
1287 
1288 	if (mddev->reshape_position != MaxSector) {
1289 		sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1290 		sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1291 		sb->new_layout = cpu_to_le32(mddev->new_layout);
1292 		sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1293 		sb->new_level = cpu_to_le32(mddev->new_level);
1294 		sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9);
1295 	}
1296 
1297 	max_dev = 0;
1298 	rdev_for_each(rdev2, tmp, mddev)
1299 		if (rdev2->desc_nr+1 > max_dev)
1300 			max_dev = rdev2->desc_nr+1;
1301 
1302 	if (max_dev > le32_to_cpu(sb->max_dev))
1303 		sb->max_dev = cpu_to_le32(max_dev);
1304 	for (i=0; i<max_dev;i++)
1305 		sb->dev_roles[i] = cpu_to_le16(0xfffe);
1306 
1307 	rdev_for_each(rdev2, tmp, mddev) {
1308 		i = rdev2->desc_nr;
1309 		if (test_bit(Faulty, &rdev2->flags))
1310 			sb->dev_roles[i] = cpu_to_le16(0xfffe);
1311 		else if (test_bit(In_sync, &rdev2->flags))
1312 			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1313 		else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0)
1314 			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1315 		else
1316 			sb->dev_roles[i] = cpu_to_le16(0xffff);
1317 	}
1318 
1319 	sb->sb_csum = calc_sb_1_csum(sb);
1320 }
1321 
1322 
1323 static struct super_type super_types[] = {
1324 	[0] = {
1325 		.name	= "0.90.0",
1326 		.owner	= THIS_MODULE,
1327 		.load_super	= super_90_load,
1328 		.validate_super	= super_90_validate,
1329 		.sync_super	= super_90_sync,
1330 	},
1331 	[1] = {
1332 		.name	= "md-1",
1333 		.owner	= THIS_MODULE,
1334 		.load_super	= super_1_load,
1335 		.validate_super	= super_1_validate,
1336 		.sync_super	= super_1_sync,
1337 	},
1338 };
1339 
1340 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
1341 {
1342 	struct list_head *tmp, *tmp2;
1343 	mdk_rdev_t *rdev, *rdev2;
1344 
1345 	rdev_for_each(rdev, tmp, mddev1)
1346 		rdev_for_each(rdev2, tmp2, mddev2)
1347 			if (rdev->bdev->bd_contains ==
1348 			    rdev2->bdev->bd_contains)
1349 				return 1;
1350 
1351 	return 0;
1352 }
1353 
1354 static LIST_HEAD(pending_raid_disks);
1355 
1356 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1357 {
1358 	char b[BDEVNAME_SIZE];
1359 	struct kobject *ko;
1360 	char *s;
1361 	int err;
1362 
1363 	if (rdev->mddev) {
1364 		MD_BUG();
1365 		return -EINVAL;
1366 	}
1367 	/* make sure rdev->size exceeds mddev->size */
1368 	if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) {
1369 		if (mddev->pers) {
1370 			/* Cannot change size, so fail
1371 			 * If mddev->level <= 0, then we don't care
1372 			 * about aligning sizes (e.g. linear)
1373 			 */
1374 			if (mddev->level > 0)
1375 				return -ENOSPC;
1376 		} else
1377 			mddev->size = rdev->size;
1378 	}
1379 
1380 	/* Verify rdev->desc_nr is unique.
1381 	 * If it is -1, assign a free number, else
1382 	 * check number is not in use
1383 	 */
1384 	if (rdev->desc_nr < 0) {
1385 		int choice = 0;
1386 		if (mddev->pers) choice = mddev->raid_disks;
1387 		while (find_rdev_nr(mddev, choice))
1388 			choice++;
1389 		rdev->desc_nr = choice;
1390 	} else {
1391 		if (find_rdev_nr(mddev, rdev->desc_nr))
1392 			return -EBUSY;
1393 	}
1394 	bdevname(rdev->bdev,b);
1395 	while ( (s=strchr(b, '/')) != NULL)
1396 		*s = '!';
1397 
1398 	rdev->mddev = mddev;
1399 	printk(KERN_INFO "md: bind<%s>\n", b);
1400 
1401 	if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
1402 		goto fail;
1403 
1404 	if (rdev->bdev->bd_part)
1405 		ko = &rdev->bdev->bd_part->dev.kobj;
1406 	else
1407 		ko = &rdev->bdev->bd_disk->dev.kobj;
1408 	if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) {
1409 		kobject_del(&rdev->kobj);
1410 		goto fail;
1411 	}
1412 	list_add(&rdev->same_set, &mddev->disks);
1413 	bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
1414 	return 0;
1415 
1416  fail:
1417 	printk(KERN_WARNING "md: failed to register dev-%s for %s\n",
1418 	       b, mdname(mddev));
1419 	return err;
1420 }
1421 
1422 static void md_delayed_delete(struct work_struct *ws)
1423 {
1424 	mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work);
1425 	kobject_del(&rdev->kobj);
1426 	kobject_put(&rdev->kobj);
1427 }
1428 
1429 static void unbind_rdev_from_array(mdk_rdev_t * rdev)
1430 {
1431 	char b[BDEVNAME_SIZE];
1432 	if (!rdev->mddev) {
1433 		MD_BUG();
1434 		return;
1435 	}
1436 	bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
1437 	list_del_init(&rdev->same_set);
1438 	printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
1439 	rdev->mddev = NULL;
1440 	sysfs_remove_link(&rdev->kobj, "block");
1441 
1442 	/* We need to delay this, otherwise we can deadlock when
 1443 	 * writing 'remove' to "dev/state".
1444 	 */
1445 	INIT_WORK(&rdev->del_work, md_delayed_delete);
1446 	kobject_get(&rdev->kobj);
1447 	schedule_work(&rdev->del_work);
1448 }
1449 
1450 /*
1451  * prevent the device from being mounted, repartitioned or
1452  * otherwise reused by a RAID array (or any other kernel
1453  * subsystem), by bd_claiming the device.
1454  */
1455 static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared)
1456 {
1457 	int err = 0;
1458 	struct block_device *bdev;
1459 	char b[BDEVNAME_SIZE];
1460 
1461 	bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
1462 	if (IS_ERR(bdev)) {
1463 		printk(KERN_ERR "md: could not open %s.\n",
1464 			__bdevname(dev, b));
1465 		return PTR_ERR(bdev);
1466 	}
1467 	err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev);
1468 	if (err) {
1469 		printk(KERN_ERR "md: could not bd_claim %s.\n",
1470 			bdevname(bdev, b));
1471 		blkdev_put(bdev);
1472 		return err;
1473 	}
1474 	if (!shared)
1475 		set_bit(AllReserved, &rdev->flags);
1476 	rdev->bdev = bdev;
1477 	return err;
1478 }
1479 
1480 static void unlock_rdev(mdk_rdev_t *rdev)
1481 {
1482 	struct block_device *bdev = rdev->bdev;
1483 	rdev->bdev = NULL;
1484 	if (!bdev)
1485 		MD_BUG();
1486 	bd_release(bdev);
1487 	blkdev_put(bdev);
1488 }
1489 
1490 void md_autodetect_dev(dev_t dev);
1491 
1492 static void export_rdev(mdk_rdev_t * rdev)
1493 {
1494 	char b[BDEVNAME_SIZE];
1495 	printk(KERN_INFO "md: export_rdev(%s)\n",
1496 		bdevname(rdev->bdev,b));
1497 	if (rdev->mddev)
1498 		MD_BUG();
1499 	free_disk_sb(rdev);
1500 	list_del_init(&rdev->same_set);
1501 #ifndef MODULE
1502 	md_autodetect_dev(rdev->bdev->bd_dev);
1503 #endif
1504 	unlock_rdev(rdev);
1505 	kobject_put(&rdev->kobj);
1506 }
1507 
1508 static void kick_rdev_from_array(mdk_rdev_t * rdev)
1509 {
1510 	unbind_rdev_from_array(rdev);
1511 	export_rdev(rdev);
1512 }
1513 
1514 static void export_array(mddev_t *mddev)
1515 {
1516 	struct list_head *tmp;
1517 	mdk_rdev_t *rdev;
1518 
1519 	rdev_for_each(rdev, tmp, mddev) {
1520 		if (!rdev->mddev) {
1521 			MD_BUG();
1522 			continue;
1523 		}
1524 		kick_rdev_from_array(rdev);
1525 	}
1526 	if (!list_empty(&mddev->disks))
1527 		MD_BUG();
1528 	mddev->raid_disks = 0;
1529 	mddev->major_version = 0;
1530 }
1531 
1532 static void print_desc(mdp_disk_t *desc)
1533 {
1534 	printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
1535 		desc->major,desc->minor,desc->raid_disk,desc->state);
1536 }
1537 
1538 static void print_sb(mdp_super_t *sb)
1539 {
1540 	int i;
1541 
1542 	printk(KERN_INFO
1543 		"md:  SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
1544 		sb->major_version, sb->minor_version, sb->patch_version,
1545 		sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
1546 		sb->ctime);
1547 	printk(KERN_INFO "md:     L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
1548 		sb->level, sb->size, sb->nr_disks, sb->raid_disks,
1549 		sb->md_minor, sb->layout, sb->chunk_size);
1550 	printk(KERN_INFO "md:     UT:%08x ST:%d AD:%d WD:%d"
1551 		" FD:%d SD:%d CSUM:%08x E:%08lx\n",
1552 		sb->utime, sb->state, sb->active_disks, sb->working_disks,
1553 		sb->failed_disks, sb->spare_disks,
1554 		sb->sb_csum, (unsigned long)sb->events_lo);
1555 
1556 	printk(KERN_INFO);
1557 	for (i = 0; i < MD_SB_DISKS; i++) {
1558 		mdp_disk_t *desc;
1559 
1560 		desc = sb->disks + i;
1561 		if (desc->number || desc->major || desc->minor ||
1562 		    desc->raid_disk || (desc->state && (desc->state != 4))) {
1563 			printk("     D %2d: ", i);
1564 			print_desc(desc);
1565 		}
1566 	}
1567 	printk(KERN_INFO "md:     THIS: ");
1568 	print_desc(&sb->this_disk);
1569 
1570 }
1571 
1572 static void print_rdev(mdk_rdev_t *rdev)
1573 {
1574 	char b[BDEVNAME_SIZE];
1575 	printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
1576 		bdevname(rdev->bdev,b), (unsigned long long)rdev->size,
1577 	        test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
1578 	        rdev->desc_nr);
1579 	if (rdev->sb_loaded) {
1580 		printk(KERN_INFO "md: rdev superblock:\n");
1581 		print_sb((mdp_super_t*)page_address(rdev->sb_page));
1582 	} else
1583 		printk(KERN_INFO "md: no rdev superblock!\n");
1584 }
1585 
1586 static void md_print_devices(void)
1587 {
1588 	struct list_head *tmp, *tmp2;
1589 	mdk_rdev_t *rdev;
1590 	mddev_t *mddev;
1591 	char b[BDEVNAME_SIZE];
1592 
1593 	printk("\n");
1594 	printk("md:	**********************************\n");
1595 	printk("md:	* <COMPLETE RAID STATE PRINTOUT> *\n");
1596 	printk("md:	**********************************\n");
1597 	for_each_mddev(mddev, tmp) {
1598 
1599 		if (mddev->bitmap)
1600 			bitmap_print_sb(mddev->bitmap);
1601 		else
1602 			printk("%s: ", mdname(mddev));
1603 		rdev_for_each(rdev, tmp2, mddev)
1604 			printk("<%s>", bdevname(rdev->bdev,b));
1605 		printk("\n");
1606 
1607 		rdev_for_each(rdev, tmp2, mddev)
1608 			print_rdev(rdev);
1609 	}
1610 	printk("md:	**********************************\n");
1611 	printk("\n");
1612 }
1613 
1614 
1615 static void sync_sbs(mddev_t * mddev, int nospares)
1616 {
1617 	/* Update each superblock (in-memory image), but
1618 	 * if we are allowed to, skip spares which already
1619 	 * have the right event counter, or have one earlier
1620 	 * (which would mean they aren't being marked as dirty
1621 	 * with the rest of the array)
1622 	 */
1623 	mdk_rdev_t *rdev;
1624 	struct list_head *tmp;
1625 
1626 	rdev_for_each(rdev, tmp, mddev) {
1627 		if (rdev->sb_events == mddev->events ||
1628 		    (nospares &&
1629 		     rdev->raid_disk < 0 &&
1630 		     (rdev->sb_events&1)==0 &&
1631 		     rdev->sb_events+1 == mddev->events)) {
1632 			/* Don't update this superblock */
1633 			rdev->sb_loaded = 2;
1634 		} else {
1635 			super_types[mddev->major_version].
1636 				sync_super(mddev, rdev);
1637 			rdev->sb_loaded = 1;
1638 		}
1639 	}
1640 }
1641 
1642 static void md_update_sb(mddev_t * mddev, int force_change)
1643 {
1644 	struct list_head *tmp;
1645 	mdk_rdev_t *rdev;
1646 	int sync_req;
1647 	int nospares = 0;
1648 
1649 repeat:
1650 	spin_lock_irq(&mddev->write_lock);
1651 
1652 	set_bit(MD_CHANGE_PENDING, &mddev->flags);
1653 	if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
1654 		force_change = 1;
1655 	if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
 1656 		/* just a clean <-> dirty transition, so possibly leave spares alone,
 1657 		 * though if 'events' isn't the right even/odd, we will have to
 1658 		 * update the spares after all
1659 		 */
1660 		nospares = 1;
1661 	if (force_change)
1662 		nospares = 0;
1663 	if (mddev->degraded)
1664 		/* If the array is degraded, then skipping spares is both
1665 		 * dangerous and fairly pointless.
1666 		 * Dangerous because a device that was removed from the array
 1667 		 * might have an event_count that still looks up-to-date,
1668 		 * so it can be re-added without a resync.
1669 		 * Pointless because if there are any spares to skip,
1670 		 * then a recovery will happen and soon that array won't
1671 		 * be degraded any more and the spare can go back to sleep then.
1672 		 */
1673 		nospares = 0;
1674 
1675 	sync_req = mddev->in_sync;
1676 	mddev->utime = get_seconds();
1677 
1678 	/* If this is just a dirty<->clean transition, and the array is clean
1679 	 * and 'events' is odd, we can roll back to the previous clean state */
1680 	if (nospares
1681 	    && (mddev->in_sync && mddev->recovery_cp == MaxSector)
1682 	    && (mddev->events & 1)
1683 	    && mddev->events != 1)
1684 		mddev->events--;
1685 	else {
1686 		/* otherwise we have to go forward and ... */
1687 		mddev->events ++;
1688 		if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
1689 			/* .. if the array isn't clean, insist on an odd 'events' */
1690 			if ((mddev->events&1)==0) {
1691 				mddev->events++;
1692 				nospares = 0;
1693 			}
1694 		} else {
1695 			/* otherwise insist on an even 'events' (for clean states) */
1696 			if ((mddev->events&1)) {
1697 				mddev->events++;
1698 				nospares = 0;
1699 			}
1700 		}
1701 	}
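	/*
	 * Worked example (editor's addition, assuming the even=clean/odd=dirty
	 * convention above): an array sitting clean at events==12 that goes
	 * dirty is bumped to 13; spare superblocks left at 12 are skipped by
	 * sync_sbs().  When it returns to clean, the branch above rolls 13
	 * back to 12, so those spares are still up to date and need no rewrite.
	 */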
1702 
1703 	if (!mddev->events) {
1704 		/*
1705 		 * oops, this 64-bit counter should never wrap.
1706 		 * Either we are in around ~1 trillion A.C., assuming
1707 		 * 1 reboot per second, or we have a bug:
1708 		 */
1709 		MD_BUG();
1710 		mddev->events --;
1711 	}
1712 
1713 	/*
1714 	 * do not write anything to disk if using
1715 	 * nonpersistent superblocks
1716 	 */
1717 	if (!mddev->persistent) {
1718 		if (!mddev->external)
1719 			clear_bit(MD_CHANGE_PENDING, &mddev->flags);
1720 
1721 		spin_unlock_irq(&mddev->write_lock);
1722 		wake_up(&mddev->sb_wait);
1723 		return;
1724 	}
1725 	sync_sbs(mddev, nospares);
1726 	spin_unlock_irq(&mddev->write_lock);
1727 
1728 	dprintk(KERN_INFO
1729 		"md: updating %s RAID superblock on device (in sync %d)\n",
1730 		mdname(mddev),mddev->in_sync);
1731 
1732 	bitmap_update_sb(mddev->bitmap);
1733 	rdev_for_each(rdev, tmp, mddev) {
1734 		char b[BDEVNAME_SIZE];
1735 		dprintk(KERN_INFO "md: ");
1736 		if (rdev->sb_loaded != 1)
1737 			continue; /* no noise on spare devices */
1738 		if (test_bit(Faulty, &rdev->flags))
1739 			dprintk("(skipping faulty ");
1740 
1741 		dprintk("%s ", bdevname(rdev->bdev,b));
1742 		if (!test_bit(Faulty, &rdev->flags)) {
1743 			md_super_write(mddev,rdev,
1744 				       rdev->sb_offset<<1, rdev->sb_size,
1745 				       rdev->sb_page);
1746 			dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
1747 				bdevname(rdev->bdev,b),
1748 				(unsigned long long)rdev->sb_offset);
1749 			rdev->sb_events = mddev->events;
1750 
1751 		} else
1752 			dprintk(")\n");
1753 		if (mddev->level == LEVEL_MULTIPATH)
1754 			/* only need to write one superblock... */
1755 			break;
1756 	}
1757 	md_super_wait(mddev);
1758 	/* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
1759 
1760 	spin_lock_irq(&mddev->write_lock);
1761 	if (mddev->in_sync != sync_req ||
1762 	    test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
1763 		/* have to write it out again */
1764 		spin_unlock_irq(&mddev->write_lock);
1765 		goto repeat;
1766 	}
1767 	clear_bit(MD_CHANGE_PENDING, &mddev->flags);
1768 	spin_unlock_irq(&mddev->write_lock);
1769 	wake_up(&mddev->sb_wait);
1770 
1771 }
1772 
 1773 /* Words written to sysfs files may, or may not, be \n terminated.
 1774  * We want to accept either case. For this we use cmd_match.
1775  */
1776 static int cmd_match(const char *cmd, const char *str)
1777 {
1778 	/* See if cmd, written into a sysfs file, matches
1779 	 * str.  They must either be the same, or cmd can
1780 	 * have a trailing newline
1781 	 */
1782 	while (*cmd && *str && *cmd == *str) {
1783 		cmd++;
1784 		str++;
1785 	}
1786 	if (*cmd == '\n')
1787 		cmd++;
1788 	if (*str || *cmd)
1789 		return 0;
1790 	return 1;
1791 }
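/*
 * Illustrative examples (editor's addition):
 *
 *   cmd_match("remove\n", "remove")  -> 1   (trailing newline accepted)
 *   cmd_match("remove",   "remove")  -> 1
 *   cmd_match("removed",  "remove")  -> 0   (extra characters reject)
 */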
1792 
1793 struct rdev_sysfs_entry {
1794 	struct attribute attr;
1795 	ssize_t (*show)(mdk_rdev_t *, char *);
1796 	ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
1797 };
1798 
1799 static ssize_t
1800 state_show(mdk_rdev_t *rdev, char *page)
1801 {
1802 	char *sep = "";
1803 	size_t len = 0;
1804 
1805 	if (test_bit(Faulty, &rdev->flags)) {
1806 		len+= sprintf(page+len, "%sfaulty",sep);
1807 		sep = ",";
1808 	}
1809 	if (test_bit(In_sync, &rdev->flags)) {
1810 		len += sprintf(page+len, "%sin_sync",sep);
1811 		sep = ",";
1812 	}
1813 	if (test_bit(WriteMostly, &rdev->flags)) {
1814 		len += sprintf(page+len, "%swrite_mostly",sep);
1815 		sep = ",";
1816 	}
1817 	if (!test_bit(Faulty, &rdev->flags) &&
1818 	    !test_bit(In_sync, &rdev->flags)) {
1819 		len += sprintf(page+len, "%sspare", sep);
1820 		sep = ",";
1821 	}
1822 	return len+sprintf(page+len, "\n");
1823 }
1824 
1825 static ssize_t
1826 state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1827 {
1828 	/* can write
	 *  faulty  - simulates an error
1830 	 *  remove  - disconnects the device
1831 	 *  writemostly - sets write_mostly
1832 	 *  -writemostly - clears write_mostly
1833 	 */
1834 	int err = -EINVAL;
1835 	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
1836 		md_error(rdev->mddev, rdev);
1837 		err = 0;
1838 	} else if (cmd_match(buf, "remove")) {
1839 		if (rdev->raid_disk >= 0)
1840 			err = -EBUSY;
1841 		else {
1842 			mddev_t *mddev = rdev->mddev;
1843 			kick_rdev_from_array(rdev);
1844 			if (mddev->pers)
1845 				md_update_sb(mddev, 1);
1846 			md_new_event(mddev);
1847 			err = 0;
1848 		}
1849 	} else if (cmd_match(buf, "writemostly")) {
1850 		set_bit(WriteMostly, &rdev->flags);
1851 		err = 0;
1852 	} else if (cmd_match(buf, "-writemostly")) {
1853 		clear_bit(WriteMostly, &rdev->flags);
1854 		err = 0;
1855 	}
1856 	return err ? err : len;
1857 }
1858 static struct rdev_sysfs_entry rdev_state =
1859 __ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
1860 
1861 static ssize_t
1862 super_show(mdk_rdev_t *rdev, char *page)
1863 {
1864 	if (rdev->sb_loaded && rdev->sb_size) {
1865 		memcpy(page, page_address(rdev->sb_page), rdev->sb_size);
1866 		return rdev->sb_size;
1867 	} else
1868 		return 0;
1869 }
1870 static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super);
1871 
1872 static ssize_t
1873 errors_show(mdk_rdev_t *rdev, char *page)
1874 {
1875 	return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
1876 }
1877 
1878 static ssize_t
1879 errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1880 {
1881 	char *e;
1882 	unsigned long n = simple_strtoul(buf, &e, 10);
1883 	if (*buf && (*e == 0 || *e == '\n')) {
1884 		atomic_set(&rdev->corrected_errors, n);
1885 		return len;
1886 	}
1887 	return -EINVAL;
1888 }
1889 static struct rdev_sysfs_entry rdev_errors =
1890 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
1891 
1892 static ssize_t
1893 slot_show(mdk_rdev_t *rdev, char *page)
1894 {
1895 	if (rdev->raid_disk < 0)
1896 		return sprintf(page, "none\n");
1897 	else
1898 		return sprintf(page, "%d\n", rdev->raid_disk);
1899 }
1900 
1901 static ssize_t
1902 slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1903 {
1904 	char *e;
1905 	int err;
1906 	char nm[20];
1907 	int slot = simple_strtoul(buf, &e, 10);
1908 	if (strncmp(buf, "none", 4)==0)
1909 		slot = -1;
1910 	else if (e==buf || (*e && *e!= '\n'))
1911 		return -EINVAL;
1912 	if (rdev->mddev->pers) {
1913 		/* Setting 'slot' on an active array requires also
1914 		 * updating the 'rd%d' link, and communicating
1915 		 * with the personality via ->hot_*_disk.
1916 		 * For now we only support removing
1917 		 * failed/spare devices.  This normally happens automatically,
1918 		 * but not when the metadata is externally managed.
1919 		 */
1920 		if (slot != -1)
1921 			return -EBUSY;
1922 		if (rdev->raid_disk == -1)
1923 			return -EEXIST;
1924 		/* personality does all needed checks */
1925 		if (rdev->mddev->pers->hot_add_disk == NULL)
1926 			return -EINVAL;
1927 		err = rdev->mddev->pers->
1928 			hot_remove_disk(rdev->mddev, rdev->raid_disk);
1929 		if (err)
1930 			return err;
1931 		sprintf(nm, "rd%d", rdev->raid_disk);
1932 		sysfs_remove_link(&rdev->mddev->kobj, nm);
1933 		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
1934 		md_wakeup_thread(rdev->mddev->thread);
1935 	} else {
1936 		if (slot >= rdev->mddev->raid_disks)
1937 			return -ENOSPC;
1938 		rdev->raid_disk = slot;
1939 		/* assume it is working */
1940 		clear_bit(Faulty, &rdev->flags);
1941 		clear_bit(WriteMostly, &rdev->flags);
1942 		set_bit(In_sync, &rdev->flags);
1943 	}
1944 	return len;
1945 }
1946 
1947 
1948 static struct rdev_sysfs_entry rdev_slot =
1949 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
1950 
1951 static ssize_t
1952 offset_show(mdk_rdev_t *rdev, char *page)
1953 {
1954 	return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
1955 }
1956 
1957 static ssize_t
1958 offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1959 {
1960 	char *e;
1961 	unsigned long long offset = simple_strtoull(buf, &e, 10);
1962 	if (e==buf || (*e && *e != '\n'))
1963 		return -EINVAL;
1964 	if (rdev->mddev->pers)
1965 		return -EBUSY;
1966 	if (rdev->size && rdev->mddev->external)
1967 		/* Must set offset before size, so overlap checks
1968 		 * can be sane */
1969 		return -EBUSY;
1970 	rdev->data_offset = offset;
1971 	return len;
1972 }
1973 
1974 static struct rdev_sysfs_entry rdev_offset =
1975 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
1976 
1977 static ssize_t
1978 rdev_size_show(mdk_rdev_t *rdev, char *page)
1979 {
1980 	return sprintf(page, "%llu\n", (unsigned long long)rdev->size);
1981 }
1982 
1983 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
1984 {
1985 	/* check if two start/length pairs overlap */
1986 	if (s1+l1 <= s2)
1987 		return 0;
1988 	if (s2+l2 <= s1)
1989 		return 0;
1990 	return 1;
1991 }
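
/*
 * For example, overlaps(0, 100, 50, 100) returns 1, while
 * overlaps(0, 100, 100, 50) returns 0: ranges that merely touch do not overlap.
 */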
1992 
1993 static ssize_t
1994 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1995 {
1996 	char *e;
1997 	unsigned long long size = simple_strtoull(buf, &e, 10);
1998 	unsigned long long oldsize = rdev->size;
1999 	if (e==buf || (*e && *e != '\n'))
2000 		return -EINVAL;
2001 	if (rdev->mddev->pers)
2002 		return -EBUSY;
2003 	rdev->size = size;
2004 	if (size > oldsize && rdev->mddev->external) {
2005 		/* need to check that all other rdevs with the same ->bdev
2006 		 * do not overlap.  We need to unlock the mddev to avoid
2007 		 * a deadlock.  We have already changed rdev->size, and if
2008 		 * we have to change it back, we will have the lock again.
2009 		 */
2010 		mddev_t *mddev;
2011 		int overlap = 0;
2012 		struct list_head *tmp, *tmp2;
2013 
2014 		mddev_unlock(rdev->mddev);
2015 		for_each_mddev(mddev, tmp) {
2016 			mdk_rdev_t *rdev2;
2017 
2018 			mddev_lock(mddev);
2019 			rdev_for_each(rdev2, tmp2, mddev)
2020 				if (test_bit(AllReserved, &rdev2->flags) ||
2021 				    (rdev->bdev == rdev2->bdev &&
2022 				     rdev != rdev2 &&
2023 				     overlaps(rdev->data_offset, rdev->size,
2024 					    rdev2->data_offset, rdev2->size))) {
2025 					overlap = 1;
2026 					break;
2027 				}
2028 			mddev_unlock(mddev);
2029 			if (overlap) {
2030 				mddev_put(mddev);
2031 				break;
2032 			}
2033 		}
2034 		mddev_lock(rdev->mddev);
2035 		if (overlap) {
2036 			/* Someone else could have slipped in a size
2037 			 * change here, but doing so is just silly.
2038 			 * We put oldsize back because we *know* it is
2039 			 * safe, and trust userspace not to race with
2040 			 * itself
2041 			 */
2042 			rdev->size = oldsize;
2043 			return -EBUSY;
2044 		}
2045 	}
2046 	if (size < rdev->mddev->size || rdev->mddev->size == 0)
2047 		rdev->mddev->size = size;
2048 	return len;
2049 }
2050 
2051 static struct rdev_sysfs_entry rdev_size =
2052 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
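
/*
 * The overlap check in rdev_size_store() matters mainly for externally
 * managed metadata, where several arrays may carve their data out of the
 * same block device at different data_offsets; growing one component must
 * not run into a neighbour's region.
 */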
2053 
2054 static struct attribute *rdev_default_attrs[] = {
2055 	&rdev_state.attr,
2056 	&rdev_super.attr,
2057 	&rdev_errors.attr,
2058 	&rdev_slot.attr,
2059 	&rdev_offset.attr,
2060 	&rdev_size.attr,
2061 	NULL,
2062 };
2063 static ssize_t
2064 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
2065 {
2066 	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
2067 	mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
2068 
2069 	if (!entry->show)
2070 		return -EIO;
2071 	return entry->show(rdev, page);
2072 }
2073 
2074 static ssize_t
2075 rdev_attr_store(struct kobject *kobj, struct attribute *attr,
2076 	      const char *page, size_t length)
2077 {
2078 	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
2079 	mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
2080 	int rv;
2081 
2082 	if (!entry->store)
2083 		return -EIO;
2084 	if (!capable(CAP_SYS_ADMIN))
2085 		return -EACCES;
2086 	rv = mddev_lock(rdev->mddev);
2087 	if (!rv) {
2088 		rv = entry->store(rdev, page, length);
2089 		mddev_unlock(rdev->mddev);
2090 	}
2091 	return rv;
2092 }
2093 
2094 static void rdev_free(struct kobject *ko)
2095 {
2096 	mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj);
2097 	kfree(rdev);
2098 }
2099 static struct sysfs_ops rdev_sysfs_ops = {
2100 	.show		= rdev_attr_show,
2101 	.store		= rdev_attr_store,
2102 };
2103 static struct kobj_type rdev_ktype = {
2104 	.release	= rdev_free,
2105 	.sysfs_ops	= &rdev_sysfs_ops,
2106 	.default_attrs	= rdev_default_attrs,
2107 };
2108 
2109 /*
2110  * Import a device. If 'super_format' >= 0, then sanity check the superblock
2111  *
2112  * mark the device faulty if:
2113  *
2114  *   - the device is nonexistent (zero size)
2115  *   - the device has no valid superblock
2116  *
2117  * a faulty rdev _never_ has rdev->sb set.
2118  */
2119 static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
2120 {
2121 	char b[BDEVNAME_SIZE];
2122 	int err;
2123 	mdk_rdev_t *rdev;
2124 	sector_t size;
2125 
2126 	rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
2127 	if (!rdev) {
2128 		printk(KERN_ERR "md: could not alloc mem for new device!\n");
2129 		return ERR_PTR(-ENOMEM);
2130 	}
2131 
2132 	if ((err = alloc_disk_sb(rdev)))
2133 		goto abort_free;
2134 
2135 	err = lock_rdev(rdev, newdev, super_format == -2);
2136 	if (err)
2137 		goto abort_free;
2138 
2139 	kobject_init(&rdev->kobj, &rdev_ktype);
2140 
2141 	rdev->desc_nr = -1;
2142 	rdev->saved_raid_disk = -1;
2143 	rdev->raid_disk = -1;
2144 	rdev->flags = 0;
2145 	rdev->data_offset = 0;
2146 	rdev->sb_events = 0;
2147 	atomic_set(&rdev->nr_pending, 0);
2148 	atomic_set(&rdev->read_errors, 0);
2149 	atomic_set(&rdev->corrected_errors, 0);
2150 
2151 	size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
2152 	if (!size) {
2153 		printk(KERN_WARNING
2154 			"md: %s has zero or unknown size, marking faulty!\n",
2155 			bdevname(rdev->bdev,b));
2156 		err = -EINVAL;
2157 		goto abort_free;
2158 	}
2159 
2160 	if (super_format >= 0) {
2161 		err = super_types[super_format].
2162 			load_super(rdev, NULL, super_minor);
2163 		if (err == -EINVAL) {
2164 			printk(KERN_WARNING
2165 				"md: %s does not have a valid v%d.%d "
2166 			       "superblock, not importing!\n",
2167 				bdevname(rdev->bdev,b),
2168 			       super_format, super_minor);
2169 			goto abort_free;
2170 		}
2171 		if (err < 0) {
2172 			printk(KERN_WARNING
2173 				"md: could not read %s's sb, not importing!\n",
2174 				bdevname(rdev->bdev,b));
2175 			goto abort_free;
2176 		}
2177 	}
2178 	INIT_LIST_HEAD(&rdev->same_set);
2179 
2180 	return rdev;
2181 
2182 abort_free:
2183 	if (rdev->sb_page) {
2184 		if (rdev->bdev)
2185 			unlock_rdev(rdev);
2186 		free_disk_sb(rdev);
2187 	}
2188 	kfree(rdev);
2189 	return ERR_PTR(err);
2190 }
2191 
2192 /*
2193  * Check a full RAID array for plausibility
2194  */
2195 
2196 
2197 static void analyze_sbs(mddev_t * mddev)
2198 {
2199 	int i;
2200 	struct list_head *tmp;
2201 	mdk_rdev_t *rdev, *freshest;
2202 	char b[BDEVNAME_SIZE];
2203 
2204 	freshest = NULL;
2205 	rdev_for_each(rdev, tmp, mddev)
2206 		switch (super_types[mddev->major_version].
2207 			load_super(rdev, freshest, mddev->minor_version)) {
2208 		case 1:
2209 			freshest = rdev;
2210 			break;
2211 		case 0:
2212 			break;
2213 		default:
2214 			printk(KERN_ERR
2215 				"md: fatal superblock inconsistency in %s"
2216 				" -- removing from array\n",
2217 				bdevname(rdev->bdev,b));
2218 			kick_rdev_from_array(rdev);
2219 		}
2220 
2221 
2222 	super_types[mddev->major_version].
2223 		validate_super(mddev, freshest);
2224 
2225 	i = 0;
2226 	rdev_for_each(rdev, tmp, mddev) {
2227 		if (rdev != freshest)
2228 			if (super_types[mddev->major_version].
2229 			    validate_super(mddev, rdev)) {
2230 				printk(KERN_WARNING "md: kicking non-fresh %s"
2231 					" from array!\n",
2232 					bdevname(rdev->bdev,b));
2233 				kick_rdev_from_array(rdev);
2234 				continue;
2235 			}
2236 		if (mddev->level == LEVEL_MULTIPATH) {
2237 			rdev->desc_nr = i++;
2238 			rdev->raid_disk = rdev->desc_nr;
2239 			set_bit(In_sync, &rdev->flags);
2240 		} else if (rdev->raid_disk >= mddev->raid_disks) {
2241 			rdev->raid_disk = -1;
2242 			clear_bit(In_sync, &rdev->flags);
2243 		}
2244 	}
2245 
2246 
2247 
2248 	if (mddev->recovery_cp != MaxSector &&
2249 	    mddev->level >= 1)
2250 		printk(KERN_ERR "md: %s: raid array is not clean"
2251 		       " -- starting background reconstruction\n",
2252 		       mdname(mddev));
2253 
2254 }
2255 
2256 static ssize_t
2257 safe_delay_show(mddev_t *mddev, char *page)
2258 {
2259 	int msec = (mddev->safemode_delay*1000)/HZ;
2260 	return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
2261 }
2262 static ssize_t
2263 safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
2264 {
2265 	int scale=1;
2266 	int dot=0;
2267 	int i;
2268 	unsigned long msec;
2269 	char buf[30];
2270 	char *e;
2271 	/* remove a period, and count digits after it */
2272 	if (len >= sizeof(buf))
2273 		return -EINVAL;
2274 	memcpy(buf, cbuf, len);	/* copy all 'len' bytes; strlcpy(..., len) would drop the last one */
2275 	buf[len] = 0;
2276 	for (i=0; i<len; i++) {
2277 		if (dot) {
2278 			if (isdigit(buf[i])) {
2279 				buf[i-1] = buf[i];
2280 				scale *= 10;
2281 			}
2282 			buf[i] = 0;
2283 		} else if (buf[i] == '.') {
2284 			dot=1;
2285 			buf[i] = 0;
2286 		}
2287 	}
2288 	msec = simple_strtoul(buf, &e, 10);
2289 	if (e == buf || (*e && *e != '\n'))
2290 		return -EINVAL;
2291 	msec = (msec * 1000) / scale;
2292 	if (msec == 0)
2293 		mddev->safemode_delay = 0;
2294 	else {
2295 		mddev->safemode_delay = (msec*HZ)/1000;
2296 		if (mddev->safemode_delay == 0)
2297 			mddev->safemode_delay = 1;
2298 	}
2299 	return len;
2300 }
2301 static struct md_sysfs_entry md_safe_delay =
2302 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
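
/*
 * The value is decimal seconds; for example
 *
 *   echo 0.200 > /sys/block/md0/md/safe_mode_delay
 *
 * stores roughly 200 milliseconds worth of jiffies in ->safemode_delay
 * (rounded up to at least one jiffy).
 */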
2303 
2304 static ssize_t
2305 level_show(mddev_t *mddev, char *page)
2306 {
2307 	struct mdk_personality *p = mddev->pers;
2308 	if (p)
2309 		return sprintf(page, "%s\n", p->name);
2310 	else if (mddev->clevel[0])
2311 		return sprintf(page, "%s\n", mddev->clevel);
2312 	else if (mddev->level != LEVEL_NONE)
2313 		return sprintf(page, "%d\n", mddev->level);
2314 	else
2315 		return 0;
2316 }
2317 
2318 static ssize_t
2319 level_store(mddev_t *mddev, const char *buf, size_t len)
2320 {
2321 	ssize_t rv = len;
2322 	if (mddev->pers)
2323 		return -EBUSY;
2324 	if (len == 0)
2325 		return 0;
2326 	if (len >= sizeof(mddev->clevel))
2327 		return -ENOSPC;
2328 	strncpy(mddev->clevel, buf, len);
2329 	if (mddev->clevel[len-1] == '\n')
2330 		len--;
2331 	mddev->clevel[len] = 0;
2332 	mddev->level = LEVEL_NONE;
2333 	return rv;
2334 }
2335 
2336 static struct md_sysfs_entry md_level =
2337 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
2338 
2339 
2340 static ssize_t
2341 layout_show(mddev_t *mddev, char *page)
2342 {
2343 	/* just a number, not meaningful for all levels */
2344 	if (mddev->reshape_position != MaxSector &&
2345 	    mddev->layout != mddev->new_layout)
2346 		return sprintf(page, "%d (%d)\n",
2347 			       mddev->new_layout, mddev->layout);
2348 	return sprintf(page, "%d\n", mddev->layout);
2349 }
2350 
2351 static ssize_t
2352 layout_store(mddev_t *mddev, const char *buf, size_t len)
2353 {
2354 	char *e;
2355 	unsigned long n = simple_strtoul(buf, &e, 10);
2356 
2357 	if (!*buf || (*e && *e != '\n'))
2358 		return -EINVAL;
2359 
2360 	if (mddev->pers)
2361 		return -EBUSY;
2362 	if (mddev->reshape_position != MaxSector)
2363 		mddev->new_layout = n;
2364 	else
2365 		mddev->layout = n;
2366 	return len;
2367 }
2368 static struct md_sysfs_entry md_layout =
2369 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
2370 
2371 
2372 static ssize_t
2373 raid_disks_show(mddev_t *mddev, char *page)
2374 {
2375 	if (mddev->raid_disks == 0)
2376 		return 0;
2377 	if (mddev->reshape_position != MaxSector &&
2378 	    mddev->delta_disks != 0)
2379 		return sprintf(page, "%d (%d)\n", mddev->raid_disks,
2380 			       mddev->raid_disks - mddev->delta_disks);
2381 	return sprintf(page, "%d\n", mddev->raid_disks);
2382 }
2383 
2384 static int update_raid_disks(mddev_t *mddev, int raid_disks);
2385 
2386 static ssize_t
2387 raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
2388 {
2389 	char *e;
2390 	int rv = 0;
2391 	unsigned long n = simple_strtoul(buf, &e, 10);
2392 
2393 	if (!*buf || (*e && *e != '\n'))
2394 		return -EINVAL;
2395 
2396 	if (mddev->pers)
2397 		rv = update_raid_disks(mddev, n);
2398 	else if (mddev->reshape_position != MaxSector) {
2399 		int olddisks = mddev->raid_disks - mddev->delta_disks;
2400 		mddev->delta_disks = n - olddisks;
2401 		mddev->raid_disks = n;
2402 	} else
2403 		mddev->raid_disks = n;
2404 	return rv ? rv : len;
2405 }
2406 static struct md_sysfs_entry md_raid_disks =
2407 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
2408 
2409 static ssize_t
2410 chunk_size_show(mddev_t *mddev, char *page)
2411 {
2412 	if (mddev->reshape_position != MaxSector &&
2413 	    mddev->chunk_size != mddev->new_chunk)
2414 		return sprintf(page, "%d (%d)\n", mddev->new_chunk,
2415 			       mddev->chunk_size);
2416 	return sprintf(page, "%d\n", mddev->chunk_size);
2417 }
2418 
2419 static ssize_t
2420 chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
2421 {
2422 	/* can only set chunk_size if array is not yet active */
2423 	char *e;
2424 	unsigned long n = simple_strtoul(buf, &e, 10);
2425 
2426 	if (!*buf || (*e && *e != '\n'))
2427 		return -EINVAL;
2428 
2429 	if (mddev->pers)
2430 		return -EBUSY;
2431 	else if (mddev->reshape_position != MaxSector)
2432 		mddev->new_chunk = n;
2433 	else
2434 		mddev->chunk_size = n;
2435 	return len;
2436 }
2437 static struct md_sysfs_entry md_chunk_size =
2438 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
2439 
2440 static ssize_t
2441 resync_start_show(mddev_t *mddev, char *page)
2442 {
2443 	return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
2444 }
2445 
2446 static ssize_t
2447 resync_start_store(mddev_t *mddev, const char *buf, size_t len)
2448 {
2449 	/* can only set resync_start if array is not yet active */
2450 	char *e;
2451 	unsigned long long n = simple_strtoull(buf, &e, 10);
2452 
2453 	if (mddev->pers)
2454 		return -EBUSY;
2455 	if (!*buf || (*e && *e != '\n'))
2456 		return -EINVAL;
2457 
2458 	mddev->recovery_cp = n;
2459 	return len;
2460 }
2461 static struct md_sysfs_entry md_resync_start =
2462 __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
2463 
2464 /*
2465  * The array state can be:
2466  *
2467  * clear
2468  *     No devices, no size, no level
2469  *     Equivalent to STOP_ARRAY ioctl
2470  * inactive
2471  *     May have some settings, but array is not active
2472  *        all IO results in error
2473  *     When written, doesn't tear down array, but just stops it
2474  * suspended (not supported yet)
2475  *     All IO requests will block. The array can be reconfigured.
2476  *     Writing this, if accepted, will block until the array is quiescent
2477  * readonly
2478  *     no resync can happen.  no superblocks get written.
2479  *     write requests fail
2480  * read-auto
2481  *     like readonly, but behaves like 'clean' on a write request.
2482  *
2483  * clean - no pending writes, but otherwise active.
2484  *     When written to inactive array, starts without resync
2485  *     If a write request arrives then
2486  *       if metadata is known, mark 'dirty' and switch to 'active'.
2487  *       if not known, block and switch to write-pending
2488  *     If written to an active array that has pending writes, then fails.
2489  * active
2490  *     fully active: IO and resync can be happening.
2491  *     When written to inactive array, starts with resync
2492  *
2493  * write-pending
2494  *     clean, but writes are blocked waiting for 'active' to be written.
2495  *
2496  * active-idle
2497  *     like active, but no writes have been seen for a while (safe_mode_delay).
2498  *
2499  */
2500 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
2501 		   write_pending, active_idle, bad_word};
2502 static char *array_states[] = {
2503 	"clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
2504 	"write-pending", "active-idle", NULL };
2505 
2506 static int match_word(const char *word, char **list)
2507 {
2508 	int n;
2509 	for (n=0; list[n]; n++)
2510 		if (cmd_match(word, list[n]))
2511 			break;
2512 	return n;
2513 }
2514 
2515 static ssize_t
2516 array_state_show(mddev_t *mddev, char *page)
2517 {
2518 	enum array_state st = inactive;
2519 
2520 	if (mddev->pers)
2521 		switch(mddev->ro) {
2522 		case 1:
2523 			st = readonly;
2524 			break;
2525 		case 2:
2526 			st = read_auto;
2527 			break;
2528 		case 0:
2529 			if (mddev->in_sync)
2530 				st = clean;
2531 			else if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
2532 				st = write_pending;
2533 			else if (mddev->safemode)
2534 				st = active_idle;
2535 			else
2536 				st = active;
2537 		}
2538 	else {
2539 		if (list_empty(&mddev->disks) &&
2540 		    mddev->raid_disks == 0 &&
2541 		    mddev->size == 0)
2542 			st = clear;
2543 		else
2544 			st = inactive;
2545 	}
2546 	return sprintf(page, "%s\n", array_states[st]);
2547 }
2548 
2549 static int do_md_stop(mddev_t * mddev, int ro);
2550 static int do_md_run(mddev_t * mddev);
2551 static int restart_array(mddev_t *mddev);
2552 
2553 static ssize_t
2554 array_state_store(mddev_t *mddev, const char *buf, size_t len)
2555 {
2556 	int err = -EINVAL;
2557 	enum array_state st = match_word(buf, array_states);
2558 	switch(st) {
2559 	case bad_word:
2560 		break;
2561 	case clear:
2562 		/* stopping an active array */
2563 		if (atomic_read(&mddev->active) > 1)
2564 			return -EBUSY;
2565 		err = do_md_stop(mddev, 0);
2566 		break;
2567 	case inactive:
2568 		/* stopping an active array */
2569 		if (mddev->pers) {
2570 			if (atomic_read(&mddev->active) > 1)
2571 				return -EBUSY;
2572 			err = do_md_stop(mddev, 2);
2573 		} else
2574 			err = 0; /* already inactive */
2575 		break;
2576 	case suspended:
2577 		break; /* not supported yet */
2578 	case readonly:
2579 		if (mddev->pers)
2580 			err = do_md_stop(mddev, 1);
2581 		else {
2582 			mddev->ro = 1;
2583 			err = do_md_run(mddev);
2584 		}
2585 		break;
2586 	case read_auto:
2587 		/* stopping an active array */
2588 		if (mddev->pers) {
2589 			err = do_md_stop(mddev, 1);
2590 			if (err == 0)
2591 				mddev->ro = 2; /* FIXME mark devices writable */
2592 		} else {
2593 			mddev->ro = 2;
2594 			err = do_md_run(mddev);
2595 		}
2596 		break;
2597 	case clean:
2598 		if (mddev->pers) {
2599 			restart_array(mddev);
2600 			spin_lock_irq(&mddev->write_lock);
2601 			if (atomic_read(&mddev->writes_pending) == 0) {
2602 				if (mddev->in_sync == 0) {
2603 					mddev->in_sync = 1;
2604 					if (mddev->persistent)
2605 						set_bit(MD_CHANGE_CLEAN,
2606 							&mddev->flags);
2607 				}
2608 				err = 0;
2609 			} else
2610 				err = -EBUSY;
2611 			spin_unlock_irq(&mddev->write_lock);
2612 		} else {
2613 			mddev->ro = 0;
2614 			mddev->recovery_cp = MaxSector;
2615 			err = do_md_run(mddev);
2616 		}
2617 		break;
2618 	case active:
2619 		if (mddev->pers) {
2620 			restart_array(mddev);
2621 			if (mddev->external)
2622 				clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
2623 			wake_up(&mddev->sb_wait);
2624 			err = 0;
2625 		} else {
2626 			mddev->ro = 0;
2627 			err = do_md_run(mddev);
2628 		}
2629 		break;
2630 	case write_pending:
2631 	case active_idle:
2632 		/* these cannot be set */
2633 		break;
2634 	}
2635 	if (err)
2636 		return err;
2637 	else
2638 		return len;
2639 }
2640 static struct md_sysfs_entry md_array_state =
2641 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
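
/*
 * For example,
 *
 *   echo readonly > /sys/block/md0/md/array_state
 *
 * switches a running array to read-only (no resync, no superblock writes),
 * and writing "active" makes it fully read-write again.
 */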
2642 
2643 static ssize_t
2644 null_show(mddev_t *mddev, char *page)
2645 {
2646 	return -EINVAL;
2647 }
2648 
2649 static ssize_t
2650 new_dev_store(mddev_t *mddev, const char *buf, size_t len)
2651 {
2652 	/* buf must be "%d:%d" (optionally followed by a newline), giving major and minor numbers */
2653 	/* The new device is added to the array.
2654 	 * If the array has a persistent superblock, we read the
2655 	 * superblock to initialise info and check validity.
2656 	 * Otherwise, only checking done is that in bind_rdev_to_array,
2657 	 * which mainly checks size.
2658 	 */
2659 	char *e;
2660 	int major = simple_strtoul(buf, &e, 10);
2661 	int minor;
2662 	dev_t dev;
2663 	mdk_rdev_t *rdev;
2664 	int err;
2665 
2666 	if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
2667 		return -EINVAL;
2668 	minor = simple_strtoul(e+1, &e, 10);
2669 	if (*e && *e != '\n')
2670 		return -EINVAL;
2671 	dev = MKDEV(major, minor);
2672 	if (major != MAJOR(dev) ||
2673 	    minor != MINOR(dev))
2674 		return -EOVERFLOW;
2675 
2676 
2677 	if (mddev->persistent) {
2678 		rdev = md_import_device(dev, mddev->major_version,
2679 					mddev->minor_version);
2680 		if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
2681 			mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
2682 						       mdk_rdev_t, same_set);
2683 			err = super_types[mddev->major_version]
2684 				.load_super(rdev, rdev0, mddev->minor_version);
2685 			if (err < 0)
2686 				goto out;
2687 		}
2688 	} else if (mddev->external)
2689 		rdev = md_import_device(dev, -2, -1);
2690 	else
2691 		rdev = md_import_device(dev, -1, -1);
2692 
2693 	if (IS_ERR(rdev))
2694 		return PTR_ERR(rdev);
2695 	err = bind_rdev_to_array(rdev, mddev);
2696  out:
2697 	if (err)
2698 		export_rdev(rdev);
2699 	return err ? err : len;
2700 }
2701 
2702 static struct md_sysfs_entry md_new_device =
2703 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
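
/*
 * For example, assuming 8:17 is the major:minor of sdb1,
 *
 *   echo 8:17 > /sys/block/md0/md/new_dev
 *
 * imports sdb1 and binds it to md0.
 */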
2704 
2705 static ssize_t
2706 bitmap_store(mddev_t *mddev, const char *buf, size_t len)
2707 {
2708 	char *end;
2709 	unsigned long chunk, end_chunk;
2710 
2711 	if (!mddev->bitmap)
2712 		goto out;
2713 	/* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
2714 	while (*buf) {
2715 		chunk = end_chunk = simple_strtoul(buf, &end, 0);
2716 		if (buf == end) break;
2717 		if (*end == '-') { /* range */
2718 			buf = end + 1;
2719 			end_chunk = simple_strtoul(buf, &end, 0);
2720 			if (buf == end) break;
2721 		}
2722 		if (*end && !isspace(*end)) break;
2723 		bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
2724 		buf = end;
2725 		while (isspace(*buf)) buf++;
2726 	}
2727 	bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
2728 out:
2729 	return len;
2730 }
2731 
2732 static struct md_sysfs_entry md_bitmap =
2733 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
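
/*
 * For example,
 *
 *   echo "100-200 305" > /sys/block/md0/md/bitmap_set_bits
 *
 * marks bitmap chunks 100 through 200 and chunk 305 dirty, so those
 * regions will be resynced.
 */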
2734 
2735 static ssize_t
2736 size_show(mddev_t *mddev, char *page)
2737 {
2738 	return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
2739 }
2740 
2741 static int update_size(mddev_t *mddev, unsigned long size);
2742 
2743 static ssize_t
2744 size_store(mddev_t *mddev, const char *buf, size_t len)
2745 {
2746 	/* If array is inactive, we can reduce the component size, but
2747 	 * not increase it (except from 0).
2748 	 * If array is active, we can try an on-line resize
2749 	 */
2750 	char *e;
2751 	int err = 0;
2752 	unsigned long long size = simple_strtoull(buf, &e, 10);
2753 	if (!*buf || *buf == '\n' ||
2754 	    (*e && *e != '\n'))
2755 		return -EINVAL;
2756 
2757 	if (mddev->pers) {
2758 		err = update_size(mddev, size);
2759 		md_update_sb(mddev, 1);
2760 	} else {
2761 		if (mddev->size == 0 ||
2762 		    mddev->size > size)
2763 			mddev->size = size;
2764 		else
2765 			err = -ENOSPC;
2766 	}
2767 	return err ? err : len;
2768 }
2769 
2770 static struct md_sysfs_entry md_size =
2771 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
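
/*
 * component_size is in 1K blocks (the same unit as mddev->size); e.g.
 *
 *   echo 524288 > /sys/block/md0/md/component_size
 *
 * uses 512MB of each member device.  On an inactive array the size can
 * only be reduced (or set from 0); on an active array update_size()
 * attempts an on-line resize.
 */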
2772 
2773 
2774 /* Metadata version.
2775  * This is one of
2776  *   'none' for arrays with no metadata (good luck...)
2777  *   'external' for arrays with externally managed metadata,
2778  * or N.M for internally known formats
2779  */
2780 static ssize_t
2781 metadata_show(mddev_t *mddev, char *page)
2782 {
2783 	if (mddev->persistent)
2784 		return sprintf(page, "%d.%d\n",
2785 			       mddev->major_version, mddev->minor_version);
2786 	else if (mddev->external)
2787 		return sprintf(page, "external:%s\n", mddev->metadata_type);
2788 	else
2789 		return sprintf(page, "none\n");
2790 }
2791 
2792 static ssize_t
2793 metadata_store(mddev_t *mddev, const char *buf, size_t len)
2794 {
2795 	int major, minor;
2796 	char *e;
2797 	if (!list_empty(&mddev->disks))
2798 		return -EBUSY;
2799 
2800 	if (cmd_match(buf, "none")) {
2801 		mddev->persistent = 0;
2802 		mddev->external = 0;
2803 		mddev->major_version = 0;
2804 		mddev->minor_version = 90;
2805 		return len;
2806 	}
2807 	if (strncmp(buf, "external:", 9) == 0) {
2808 		size_t namelen = len-9;
2809 		if (namelen >= sizeof(mddev->metadata_type))
2810 			namelen = sizeof(mddev->metadata_type)-1;
2811 		strncpy(mddev->metadata_type, buf+9, namelen);
2812 		mddev->metadata_type[namelen] = 0;
2813 		if (namelen && mddev->metadata_type[namelen-1] == '\n')
2814 			mddev->metadata_type[--namelen] = 0;
2815 		mddev->persistent = 0;
2816 		mddev->external = 1;
2817 		mddev->major_version = 0;
2818 		mddev->minor_version = 90;
2819 		return len;
2820 	}
2821 	major = simple_strtoul(buf, &e, 10);
2822 	if (e==buf || *e != '.')
2823 		return -EINVAL;
2824 	buf = e+1;
2825 	minor = simple_strtoul(buf, &e, 10);
2826 	if (e==buf || (*e && *e != '\n') )
2827 		return -EINVAL;
2828 	if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
2829 		return -ENOENT;
2830 	mddev->major_version = major;
2831 	mddev->minor_version = minor;
2832 	mddev->persistent = 1;
2833 	mddev->external = 0;
2834 	return len;
2835 }
2836 
2837 static struct md_sysfs_entry md_metadata =
2838 __ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
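
/*
 * Examples (only accepted while the array has no member devices):
 *
 *   echo 1.2 > /sys/block/md0/md/metadata_version           # in-kernel v1.2
 *   echo external:foo > /sys/block/md0/md/metadata_version  # externally managed
 *   echo none > /sys/block/md0/md/metadata_version          # non-persistent
 */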
2839 
2840 static ssize_t
2841 action_show(mddev_t *mddev, char *page)
2842 {
2843 	char *type = "idle";
2844 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
2845 	    (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
2846 		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2847 			type = "reshape";
2848 		else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2849 			if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
2850 				type = "resync";
2851 			else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
2852 				type = "check";
2853 			else
2854 				type = "repair";
2855 		} else
2856 			type = "recover";
2857 	}
2858 	return sprintf(page, "%s\n", type);
2859 }
2860 
2861 static ssize_t
2862 action_store(mddev_t *mddev, const char *page, size_t len)
2863 {
2864 	if (!mddev->pers || !mddev->pers->sync_request)
2865 		return -EINVAL;
2866 
2867 	if (cmd_match(page, "idle")) {
2868 		if (mddev->sync_thread) {
2869 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2870 			md_unregister_thread(mddev->sync_thread);
2871 			mddev->sync_thread = NULL;
2872 			mddev->recovery = 0;
2873 		}
2874 	} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
2875 		   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
2876 		return -EBUSY;
2877 	else if (cmd_match(page, "resync") || cmd_match(page, "recover"))
2878 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2879 	else if (cmd_match(page, "reshape")) {
2880 		int err;
2881 		if (mddev->pers->start_reshape == NULL)
2882 			return -EINVAL;
2883 		err = mddev->pers->start_reshape(mddev);
2884 		if (err)
2885 			return err;
2886 	} else {
2887 		if (cmd_match(page, "check"))
2888 			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
2889 		else if (!cmd_match(page, "repair"))
2890 			return -EINVAL;
2891 		set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
2892 		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
2893 	}
2894 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2895 	md_wakeup_thread(mddev->thread);
2896 	return len;
2897 }
2898 
2899 static ssize_t
2900 mismatch_cnt_show(mddev_t *mddev, char *page)
2901 {
2902 	return sprintf(page, "%llu\n",
2903 		       (unsigned long long) mddev->resync_mismatches);
2904 }
2905 
2906 static struct md_sysfs_entry md_scan_mode =
2907 __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
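
/*
 * For example,
 *
 *   echo check > /sys/block/md0/md/sync_action
 *
 * starts a read-only consistency scan (results show up in mismatch_cnt),
 * "repair" additionally corrects inconsistencies, and "idle" interrupts a
 * running resync/recovery.
 */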
2908 
2909 
2910 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
2911 
2912 static ssize_t
2913 sync_min_show(mddev_t *mddev, char *page)
2914 {
2915 	return sprintf(page, "%d (%s)\n", speed_min(mddev),
2916 		       mddev->sync_speed_min ? "local": "system");
2917 }
2918 
2919 static ssize_t
2920 sync_min_store(mddev_t *mddev, const char *buf, size_t len)
2921 {
2922 	int min;
2923 	char *e;
2924 	if (strncmp(buf, "system", 6)==0) {
2925 		mddev->sync_speed_min = 0;
2926 		return len;
2927 	}
2928 	min = simple_strtoul(buf, &e, 10);
2929 	if (buf == e || (*e && *e != '\n') || min <= 0)
2930 		return -EINVAL;
2931 	mddev->sync_speed_min = min;
2932 	return len;
2933 }
2934 
2935 static struct md_sysfs_entry md_sync_min =
2936 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
2937 
2938 static ssize_t
2939 sync_max_show(mddev_t *mddev, char *page)
2940 {
2941 	return sprintf(page, "%d (%s)\n", speed_max(mddev),
2942 		       mddev->sync_speed_max ? "local": "system");
2943 }
2944 
2945 static ssize_t
2946 sync_max_store(mddev_t *mddev, const char *buf, size_t len)
2947 {
2948 	int max;
2949 	char *e;
2950 	if (strncmp(buf, "system", 6)==0) {
2951 		mddev->sync_speed_max = 0;
2952 		return len;
2953 	}
2954 	max = simple_strtoul(buf, &e, 10);
2955 	if (buf == e || (*e && *e != '\n') || max <= 0)
2956 		return -EINVAL;
2957 	mddev->sync_speed_max = max;
2958 	return len;
2959 }
2960 
2961 static struct md_sysfs_entry md_sync_max =
2962 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
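
/*
 * For example, "echo 50000 > /sys/block/md0/md/sync_speed_min" requests at
 * least 50000 KB/sec of resync bandwidth for this array, while "echo system"
 * falls back to the global speed_limit_min sysctl.
 */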
2963 
2964 static ssize_t
2965 degraded_show(mddev_t *mddev, char *page)
2966 {
2967 	return sprintf(page, "%d\n", mddev->degraded);
2968 }
2969 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
2970 
2971 static ssize_t
2972 sync_speed_show(mddev_t *mddev, char *page)
2973 {
2974 	unsigned long resync, dt, db;
2975 	resync = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active));
2976 	dt = ((jiffies - mddev->resync_mark) / HZ);
2977 	if (!dt) dt++;
2978 	db = resync - (mddev->resync_mark_cnt);
2979 	return sprintf(page, "%ld\n", db/dt/2); /* K/sec */
2980 }
2981 
2982 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
2983 
2984 static ssize_t
2985 sync_completed_show(mddev_t *mddev, char *page)
2986 {
2987 	unsigned long max_blocks, resync;
2988 
2989 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2990 		max_blocks = mddev->resync_max_sectors;
2991 	else
2992 		max_blocks = mddev->size << 1;
2993 
2994 	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
2995 	return sprintf(page, "%lu / %lu\n", resync, max_blocks);
2996 }
2997 
2998 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
2999 
3000 static ssize_t
3001 max_sync_show(mddev_t *mddev, char *page)
3002 {
3003 	if (mddev->resync_max == MaxSector)
3004 		return sprintf(page, "max\n");
3005 	else
3006 		return sprintf(page, "%llu\n",
3007 			       (unsigned long long)mddev->resync_max);
3008 }
3009 static ssize_t
3010 max_sync_store(mddev_t *mddev, const char *buf, size_t len)
3011 {
3012 	if (strncmp(buf, "max", 3) == 0)
3013 		mddev->resync_max = MaxSector;
3014 	else {
3015 		char *ep;
3016 		unsigned long long max = simple_strtoull(buf, &ep, 10);
3017 		if (ep == buf || (*ep != 0 && *ep != '\n'))
3018 			return -EINVAL;
3019 		if (max < mddev->resync_max &&
3020 		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3021 			return -EBUSY;
3022 
3023 		/* Must be a multiple of chunk_size */
3024 		if (mddev->chunk_size) {
3025 			if (max & (sector_t)((mddev->chunk_size>>9)-1))
3026 				return -EINVAL;
3027 		}
3028 		mddev->resync_max = max;
3029 	}
3030 	wake_up(&mddev->recovery_wait);
3031 	return len;
3032 }
3033 
3034 static struct md_sysfs_entry md_max_sync =
3035 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
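
/*
 * For example,
 *
 *   echo 1048576 > /sys/block/md0/md/sync_max
 *
 * lets resync proceed only up to sector 1048576; the value must be a
 * multiple of the chunk size (in sectors) when one is set.
 */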
3036 
3037 static ssize_t
3038 suspend_lo_show(mddev_t *mddev, char *page)
3039 {
3040 	return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
3041 }
3042 
3043 static ssize_t
3044 suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
3045 {
3046 	char *e;
3047 	unsigned long long new = simple_strtoull(buf, &e, 10);
3048 
3049 	if (mddev->pers->quiesce == NULL)
3050 		return -EINVAL;
3051 	if (buf == e || (*e && *e != '\n'))
3052 		return -EINVAL;
3053 	if (new >= mddev->suspend_hi ||
3054 	    (new > mddev->suspend_lo && new < mddev->suspend_hi)) {
3055 		mddev->suspend_lo = new;
3056 		mddev->pers->quiesce(mddev, 2);
3057 		return len;
3058 	} else
3059 		return -EINVAL;
3060 }
3061 static struct md_sysfs_entry md_suspend_lo =
3062 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
3063 
3064 
3065 static ssize_t
3066 suspend_hi_show(mddev_t *mddev, char *page)
3067 {
3068 	return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
3069 }
3070 
3071 static ssize_t
3072 suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
3073 {
3074 	char *e;
3075 	unsigned long long new = simple_strtoull(buf, &e, 10);
3076 
3077 	if (mddev->pers->quiesce == NULL)
3078 		return -EINVAL;
3079 	if (buf == e || (*e && *e != '\n'))
3080 		return -EINVAL;
3081 	if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) ||
3082 	    (new > mddev->suspend_lo && new > mddev->suspend_hi)) {
3083 		mddev->suspend_hi = new;
3084 		mddev->pers->quiesce(mddev, 1);
3085 		mddev->pers->quiesce(mddev, 0);
3086 		return len;
3087 	} else
3088 		return -EINVAL;
3089 }
3090 static struct md_sysfs_entry md_suspend_hi =
3091 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
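
/*
 * suspend_lo and suspend_hi describe a sector range in which the
 * personality (via ->quiesce) is expected to hold off new I/O; raising
 * suspend_hi and then suspend_lo moves such a quiesced window across the
 * array.
 */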
3092 
3093 static ssize_t
3094 reshape_position_show(mddev_t *mddev, char *page)
3095 {
3096 	if (mddev->reshape_position != MaxSector)
3097 		return sprintf(page, "%llu\n",
3098 			       (unsigned long long)mddev->reshape_position);
3099 	strcpy(page, "none\n");
3100 	return 5;
3101 }
3102 
3103 static ssize_t
3104 reshape_position_store(mddev_t *mddev, const char *buf, size_t len)
3105 {
3106 	char *e;
3107 	unsigned long long new = simple_strtoull(buf, &e, 10);
3108 	if (mddev->pers)
3109 		return -EBUSY;
3110 	if (buf == e || (*e && *e != '\n'))
3111 		return -EINVAL;
3112 	mddev->reshape_position = new;
3113 	mddev->delta_disks = 0;
3114 	mddev->new_level = mddev->level;
3115 	mddev->new_layout = mddev->layout;
3116 	mddev->new_chunk = mddev->chunk_size;
3117 	return len;
3118 }
3119 
3120 static struct md_sysfs_entry md_reshape_position =
3121 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
3122        reshape_position_store);
3123 
3124 
3125 static struct attribute *md_default_attrs[] = {
3126 	&md_level.attr,
3127 	&md_layout.attr,
3128 	&md_raid_disks.attr,
3129 	&md_chunk_size.attr,
3130 	&md_size.attr,
3131 	&md_resync_start.attr,
3132 	&md_metadata.attr,
3133 	&md_new_device.attr,
3134 	&md_safe_delay.attr,
3135 	&md_array_state.attr,
3136 	&md_reshape_position.attr,
3137 	NULL,
3138 };
3139 
3140 static struct attribute *md_redundancy_attrs[] = {
3141 	&md_scan_mode.attr,
3142 	&md_mismatches.attr,
3143 	&md_sync_min.attr,
3144 	&md_sync_max.attr,
3145 	&md_sync_speed.attr,
3146 	&md_sync_completed.attr,
3147 	&md_max_sync.attr,
3148 	&md_suspend_lo.attr,
3149 	&md_suspend_hi.attr,
3150 	&md_bitmap.attr,
3151 	&md_degraded.attr,
3152 	NULL,
3153 };
3154 static struct attribute_group md_redundancy_group = {
3155 	.name = NULL,
3156 	.attrs = md_redundancy_attrs,
3157 };
3158 
3159 
3160 static ssize_t
3161 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3162 {
3163 	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
3164 	mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
3165 	ssize_t rv;
3166 
3167 	if (!entry->show)
3168 		return -EIO;
3169 	rv = mddev_lock(mddev);
3170 	if (!rv) {
3171 		rv = entry->show(mddev, page);
3172 		mddev_unlock(mddev);
3173 	}
3174 	return rv;
3175 }
3176 
3177 static ssize_t
3178 md_attr_store(struct kobject *kobj, struct attribute *attr,
3179 	      const char *page, size_t length)
3180 {
3181 	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
3182 	mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
3183 	ssize_t rv;
3184 
3185 	if (!entry->store)
3186 		return -EIO;
3187 	if (!capable(CAP_SYS_ADMIN))
3188 		return -EACCES;
3189 	rv = mddev_lock(mddev);
3190 	if (!rv) {
3191 		rv = entry->store(mddev, page, length);
3192 		mddev_unlock(mddev);
3193 	}
3194 	return rv;
3195 }
3196 
3197 static void md_free(struct kobject *ko)
3198 {
3199 	mddev_t *mddev = container_of(ko, mddev_t, kobj);
3200 	kfree(mddev);
3201 }
3202 
3203 static struct sysfs_ops md_sysfs_ops = {
3204 	.show	= md_attr_show,
3205 	.store	= md_attr_store,
3206 };
3207 static struct kobj_type md_ktype = {
3208 	.release	= md_free,
3209 	.sysfs_ops	= &md_sysfs_ops,
3210 	.default_attrs	= md_default_attrs,
3211 };
3212 
3213 int mdp_major = 0;
3214 
3215 static struct kobject *md_probe(dev_t dev, int *part, void *data)
3216 {
3217 	static DEFINE_MUTEX(disks_mutex);
3218 	mddev_t *mddev = mddev_find(dev);
3219 	struct gendisk *disk;
3220 	int partitioned = (MAJOR(dev) != MD_MAJOR);
3221 	int shift = partitioned ? MdpMinorShift : 0;
3222 	int unit = MINOR(dev) >> shift;
3223 	int error;
3224 
3225 	if (!mddev)
3226 		return NULL;
3227 
3228 	mutex_lock(&disks_mutex);
3229 	if (mddev->gendisk) {
3230 		mutex_unlock(&disks_mutex);
3231 		mddev_put(mddev);
3232 		return NULL;
3233 	}
3234 	disk = alloc_disk(1 << shift);
3235 	if (!disk) {
3236 		mutex_unlock(&disks_mutex);
3237 		mddev_put(mddev);
3238 		return NULL;
3239 	}
3240 	disk->major = MAJOR(dev);
3241 	disk->first_minor = unit << shift;
3242 	if (partitioned)
3243 		sprintf(disk->disk_name, "md_d%d", unit);
3244 	else
3245 		sprintf(disk->disk_name, "md%d", unit);
3246 	disk->fops = &md_fops;
3247 	disk->private_data = mddev;
3248 	disk->queue = mddev->queue;
3249 	add_disk(disk);
3250 	mddev->gendisk = disk;
3251 	mutex_unlock(&disks_mutex);
3252 	error = kobject_init_and_add(&mddev->kobj, &md_ktype, &disk->dev.kobj,
3253 				     "%s", "md");
3254 	if (error)
3255 		printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
3256 		       disk->disk_name);
3257 	else
3258 		kobject_uevent(&mddev->kobj, KOBJ_ADD);
3259 	return NULL;
3260 }
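
/*
 * For example, opening /dev/md3 (MD_MAJOR, minor 3) probes the
 * non-partitionable array "md3", while a node using mdp_major with minor 64
 * maps, via MdpMinorShift, to the partitionable array "md_d1".
 */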
3261 
3262 static void md_safemode_timeout(unsigned long data)
3263 {
3264 	mddev_t *mddev = (mddev_t *) data;
3265 
3266 	mddev->safemode = 1;
3267 	md_wakeup_thread(mddev->thread);
3268 }
3269 
3270 static int start_dirty_degraded;
3271 
3272 static int do_md_run(mddev_t * mddev)
3273 {
3274 	int err;
3275 	int chunk_size;
3276 	struct list_head *tmp;
3277 	mdk_rdev_t *rdev;
3278 	struct gendisk *disk;
3279 	struct mdk_personality *pers;
3280 	char b[BDEVNAME_SIZE];
3281 
3282 	if (list_empty(&mddev->disks))
3283 		/* cannot run an array with no devices.. */
3284 		return -EINVAL;
3285 
3286 	if (mddev->pers)
3287 		return -EBUSY;
3288 
3289 	/*
3290 	 * Analyze all RAID superblock(s)
3291 	 */
3292 	if (!mddev->raid_disks) {
3293 		if (!mddev->persistent)
3294 			return -EINVAL;
3295 		analyze_sbs(mddev);
3296 	}
3297 
3298 	chunk_size = mddev->chunk_size;
3299 
3300 	if (chunk_size) {
3301 		if (chunk_size > MAX_CHUNK_SIZE) {
3302 			printk(KERN_ERR "too big chunk_size: %d > %d\n",
3303 				chunk_size, MAX_CHUNK_SIZE);
3304 			return -EINVAL;
3305 		}
3306 		/*
3307 		 * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE
3308 		 */
3309 		if ( (1 << ffz(~chunk_size)) != chunk_size) {
3310 			printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size);
3311 			return -EINVAL;
3312 		}
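		/* ffz(~chunk_size) is the index of the lowest set bit, so the
		 * test above only passes when chunk_size is an exact power of
		 * two, e.g. 64k is accepted while 96k is rejected. */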
3313 		if (chunk_size < PAGE_SIZE) {
3314 			printk(KERN_ERR "too small chunk_size: %d < %ld\n",
3315 				chunk_size, PAGE_SIZE);
3316 			return -EINVAL;
3317 		}
3318 
3319 		/* devices must have minimum size of one chunk */
3320 		rdev_for_each(rdev, tmp, mddev) {
3321 			if (test_bit(Faulty, &rdev->flags))
3322 				continue;
3323 			if (rdev->size < chunk_size / 1024) {
3324 				printk(KERN_WARNING
3325 					"md: Dev %s smaller than chunk_size:"
3326 					" %lluk < %dk\n",
3327 					bdevname(rdev->bdev,b),
3328 					(unsigned long long)rdev->size,
3329 					chunk_size / 1024);
3330 				return -EINVAL;
3331 			}
3332 		}
3333 	}
3334 
3335 #ifdef CONFIG_KMOD
3336 	if (mddev->level != LEVEL_NONE)
3337 		request_module("md-level-%d", mddev->level);
3338 	else if (mddev->clevel[0])
3339 		request_module("md-%s", mddev->clevel);
3340 #endif
3341 
3342 	/*
3343 	 * Drop all container device buffers, from now on
3344 	 * the only valid external interface is through the md
3345 	 * device.
3346 	 */
3347 	rdev_for_each(rdev, tmp, mddev) {
3348 		if (test_bit(Faulty, &rdev->flags))
3349 			continue;
3350 		sync_blockdev(rdev->bdev);
3351 		invalidate_bdev(rdev->bdev);
3352 
3353 		/* perform some consistency tests on the device.
3354 		 * We don't want the data to overlap the metadata.
3355 		 * Internal bitmap issues are handled elsewhere.
3356 		 */
3357 		if (rdev->data_offset < rdev->sb_offset) {
3358 			if (mddev->size &&
3359 			    rdev->data_offset + mddev->size*2
3360 			    > rdev->sb_offset*2) {
3361 				printk("md: %s: data overlaps metadata\n",
3362 				       mdname(mddev));
3363 				return -EINVAL;
3364 			}
3365 		} else {
3366 			if (rdev->sb_offset*2 + rdev->sb_size/512
3367 			    > rdev->data_offset) {
3368 				printk("md: %s: metadata overlaps data\n",
3369 				       mdname(mddev));
3370 				return -EINVAL;
3371 			}
3372 		}
3373 	}
3374 
3375 	md_probe(mddev->unit, NULL, NULL);
3376 	disk = mddev->gendisk;
3377 	if (!disk)
3378 		return -ENOMEM;
3379 
3380 	spin_lock(&pers_lock);
3381 	pers = find_pers(mddev->level, mddev->clevel);
3382 	if (!pers || !try_module_get(pers->owner)) {
3383 		spin_unlock(&pers_lock);
3384 		if (mddev->level != LEVEL_NONE)
3385 			printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
3386 			       mddev->level);
3387 		else
3388 			printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
3389 			       mddev->clevel);
3390 		return -EINVAL;
3391 	}
3392 	mddev->pers = pers;
3393 	spin_unlock(&pers_lock);
3394 	mddev->level = pers->level;
3395 	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
3396 
3397 	if (mddev->reshape_position != MaxSector &&
3398 	    pers->start_reshape == NULL) {
3399 		/* This personality cannot handle reshaping... */
3400 		mddev->pers = NULL;
3401 		module_put(pers->owner);
3402 		return -EINVAL;
3403 	}
3404 
3405 	if (pers->sync_request) {
3406 		/* Warn if this is a potentially silly
3407 		 * configuration.
3408 		 */
3409 		char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
3410 		mdk_rdev_t *rdev2;
3411 		struct list_head *tmp2;
3412 		int warned = 0;
3413 		rdev_for_each(rdev, tmp, mddev) {
3414 			rdev_for_each(rdev2, tmp2, mddev) {
3415 				if (rdev < rdev2 &&
3416 				    rdev->bdev->bd_contains ==
3417 				    rdev2->bdev->bd_contains) {
3418 					printk(KERN_WARNING
3419 					       "%s: WARNING: %s appears to be"
3420 					       " on the same physical disk as"
3421 					       " %s.\n",
3422 					       mdname(mddev),
3423 					       bdevname(rdev->bdev,b),
3424 					       bdevname(rdev2->bdev,b2));
3425 					warned = 1;
3426 				}
3427 			}
3428 		}
3429 		if (warned)
3430 			printk(KERN_WARNING
3431 			       "True protection against single-disk"
3432 			       " failure might be compromised.\n");
3433 	}
3434 
3435 	mddev->recovery = 0;
3436 	mddev->resync_max_sectors = mddev->size << 1; /* may be overridden by personality */
3437 	mddev->barriers_work = 1;
3438 	mddev->ok_start_degraded = start_dirty_degraded;
3439 
3440 	if (start_readonly)
3441 		mddev->ro = 2; /* read-only, but switch on first write */
3442 
3443 	err = mddev->pers->run(mddev);
3444 	if (!err && mddev->pers->sync_request) {
3445 		err = bitmap_create(mddev);
3446 		if (err) {
3447 			printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
3448 			       mdname(mddev), err);
3449 			mddev->pers->stop(mddev);
3450 		}
3451 	}
3452 	if (err) {
3453 		printk(KERN_ERR "md: pers->run() failed ...\n");
3454 		module_put(mddev->pers->owner);
3455 		mddev->pers = NULL;
3456 		bitmap_destroy(mddev);
3457 		return err;
3458 	}
3459 	if (mddev->pers->sync_request) {
3460 		if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
3461 			printk(KERN_WARNING
3462 			       "md: cannot register extra attributes for %s\n",
3463 			       mdname(mddev));
3464 	} else if (mddev->ro == 2) /* auto-readonly not meaningful */
3465 		mddev->ro = 0;
3466 
3467  	atomic_set(&mddev->writes_pending,0);
3468 	mddev->safemode = 0;
3469 	mddev->safemode_timer.function = md_safemode_timeout;
3470 	mddev->safemode_timer.data = (unsigned long) mddev;
3471 	mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
3472 	mddev->in_sync = 1;
3473 
3474 	rdev_for_each(rdev, tmp, mddev)
3475 		if (rdev->raid_disk >= 0) {
3476 			char nm[20];
3477 			sprintf(nm, "rd%d", rdev->raid_disk);
3478 			if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
3479 				printk("md: cannot register %s for %s\n",
3480 				       nm, mdname(mddev));
3481 		}
3482 
3483 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3484 
3485 	if (mddev->flags)
3486 		md_update_sb(mddev, 0);
3487 
3488 	set_capacity(disk, mddev->array_size<<1);
3489 
3490 	/* If we call blk_queue_make_request here, it will
3491 	 * re-initialise max_sectors etc which may have been
3492 	 * refined inside ->run().  So just set the bits we need to set.
3493 	 * Most initialisation happened when we called
3494 	 * blk_queue_make_request(..., md_fail_request)
3495 	 * earlier.
3496 	 */
3497 	mddev->queue->queuedata = mddev;
3498 	mddev->queue->make_request_fn = mddev->pers->make_request;
3499 
3500 	/* If there is a partially-recovered drive we need to
3501 	 * start recovery here.  If we leave it to md_check_recovery,
3502 	 * it will remove the drives and not do the right thing
3503 	 */
3504 	if (mddev->degraded && !mddev->sync_thread) {
3505 		struct list_head *rtmp;
3506 		int spares = 0;
3507 		rdev_for_each(rdev, rtmp, mddev)
3508 			if (rdev->raid_disk >= 0 &&
3509 			    !test_bit(In_sync, &rdev->flags) &&
3510 			    !test_bit(Faulty, &rdev->flags))
3511 				/* complete an interrupted recovery */
3512 				spares++;
3513 		if (spares && mddev->pers->sync_request) {
3514 			mddev->recovery = 0;
3515 			set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3516 			mddev->sync_thread = md_register_thread(md_do_sync,
3517 								mddev,
3518 								"%s_resync");
3519 			if (!mddev->sync_thread) {
3520 				printk(KERN_ERR "%s: could not start resync"
3521 				       " thread...\n",
3522 				       mdname(mddev));
3523 				/* leave the spares where they are, it shouldn't hurt */
3524 				mddev->recovery = 0;
3525 			}
3526 		}
3527 	}
3528 	md_wakeup_thread(mddev->thread);
3529 	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
3530 
3531 	mddev->changed = 1;
3532 	md_new_event(mddev);
3533 	kobject_uevent(&mddev->gendisk->dev.kobj, KOBJ_CHANGE);
3534 	return 0;
3535 }
3536 
3537 static int restart_array(mddev_t *mddev)
3538 {
3539 	struct gendisk *disk = mddev->gendisk;
3540 	int err;
3541 
3542 	/*
3543 	 * Complain if it has no devices
3544 	 */
3545 	err = -ENXIO;
3546 	if (list_empty(&mddev->disks))
3547 		goto out;
3548 
3549 	if (mddev->pers) {
3550 		err = -EBUSY;
3551 		if (!mddev->ro)
3552 			goto out;
3553 
3554 		mddev->safemode = 0;
3555 		mddev->ro = 0;
3556 		set_disk_ro(disk, 0);
3557 
3558 		printk(KERN_INFO "md: %s switched to read-write mode.\n",
3559 			mdname(mddev));
3560 		/*
3561 		 * Kick recovery or resync if necessary
3562 		 */
3563 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3564 		md_wakeup_thread(mddev->thread);
3565 		md_wakeup_thread(mddev->sync_thread);
3566 		err = 0;
3567 	} else
3568 		err = -EINVAL;
3569 
3570 out:
3571 	return err;
3572 }
3573 
3574 /* similar to deny_write_access, but accounts for our holding a reference
3575  * to the file ourselves */
3576 static int deny_bitmap_write_access(struct file * file)
3577 {
3578 	struct inode *inode = file->f_mapping->host;
3579 
3580 	spin_lock(&inode->i_lock);
3581 	if (atomic_read(&inode->i_writecount) > 1) {
3582 		spin_unlock(&inode->i_lock);
3583 		return -ETXTBSY;
3584 	}
3585 	atomic_set(&inode->i_writecount, -1);
3586 	spin_unlock(&inode->i_lock);
3587 
3588 	return 0;
3589 }
3590 
3591 static void restore_bitmap_write_access(struct file *file)
3592 {
3593 	struct inode *inode = file->f_mapping->host;
3594 
3595 	spin_lock(&inode->i_lock);
3596 	atomic_set(&inode->i_writecount, 1);
3597 	spin_unlock(&inode->i_lock);
3598 }
3599 
3600 /* mode:
3601  *   0 - completely stop and disassemble array
3602  *   1 - switch to readonly
3603  *   2 - stop but do not disassemble array
3604  */
3605 static int do_md_stop(mddev_t * mddev, int mode)
3606 {
3607 	int err = 0;
3608 	struct gendisk *disk = mddev->gendisk;
3609 
3610 	if (mddev->pers) {
3611 		if (atomic_read(&mddev->active)>2) {
3612 			printk("md: %s still in use.\n",mdname(mddev));
3613 			return -EBUSY;
3614 		}
3615 
3616 		if (mddev->sync_thread) {
3617 			set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3618 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
3619 			md_unregister_thread(mddev->sync_thread);
3620 			mddev->sync_thread = NULL;
3621 		}
3622 
3623 		del_timer_sync(&mddev->safemode_timer);
3624 
3625 		invalidate_partition(disk, 0);
3626 
3627 		switch(mode) {
3628 		case 1: /* readonly */
3629 			err  = -ENXIO;
3630 			if (mddev->ro==1)
3631 				goto out;
3632 			mddev->ro = 1;
3633 			break;
3634 		case 0: /* disassemble */
3635 		case 2: /* stop */
3636 			bitmap_flush(mddev);
3637 			md_super_wait(mddev);
3638 			if (mddev->ro)
3639 				set_disk_ro(disk, 0);
3640 			blk_queue_make_request(mddev->queue, md_fail_request);
3641 			mddev->pers->stop(mddev);
3642 			mddev->queue->merge_bvec_fn = NULL;
3643 			mddev->queue->unplug_fn = NULL;
3644 			mddev->queue->backing_dev_info.congested_fn = NULL;
3645 			if (mddev->pers->sync_request)
3646 				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
3647 
3648 			module_put(mddev->pers->owner);
3649 			mddev->pers = NULL;
3650 
3651 			set_capacity(disk, 0);
3652 			mddev->changed = 1;
3653 
3654 			if (mddev->ro)
3655 				mddev->ro = 0;
3656 		}
3657 		if (!mddev->in_sync || mddev->flags) {
3658 			/* mark array as shutdown cleanly */
3659 			mddev->in_sync = 1;
3660 			md_update_sb(mddev, 1);
3661 		}
3662 		if (mode == 1)
3663 			set_disk_ro(disk, 1);
3664 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3665 	}
3666 
3667 	/*
3668 	 * Free resources if final stop
3669 	 */
3670 	if (mode == 0) {
3671 		mdk_rdev_t *rdev;
3672 		struct list_head *tmp;
3673 
3674 		printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
3675 
3676 		bitmap_destroy(mddev);
3677 		if (mddev->bitmap_file) {
3678 			restore_bitmap_write_access(mddev->bitmap_file);
3679 			fput(mddev->bitmap_file);
3680 			mddev->bitmap_file = NULL;
3681 		}
3682 		mddev->bitmap_offset = 0;
3683 
3684 		rdev_for_each(rdev, tmp, mddev)
3685 			if (rdev->raid_disk >= 0) {
3686 				char nm[20];
3687 				sprintf(nm, "rd%d", rdev->raid_disk);
3688 				sysfs_remove_link(&mddev->kobj, nm);
3689 			}
3690 
3691 		/* make sure all md_delayed_delete calls have finished */
3692 		flush_scheduled_work();
3693 
3694 		export_array(mddev);
3695 
3696 		mddev->array_size = 0;
3697 		mddev->size = 0;
3698 		mddev->raid_disks = 0;
3699 		mddev->recovery_cp = 0;
3700 		mddev->resync_max = MaxSector;
3701 		mddev->reshape_position = MaxSector;
3702 		mddev->external = 0;
3703 		mddev->persistent = 0;
3704 
3705 	} else if (mddev->pers)
3706 		printk(KERN_INFO "md: %s switched to read-only mode.\n",
3707 			mdname(mddev));
3708 	err = 0;
3709 	md_new_event(mddev);
3710 out:
3711 	return err;
3712 }
3713 
3714 #ifndef MODULE
3715 static void autorun_array(mddev_t *mddev)
3716 {
3717 	mdk_rdev_t *rdev;
3718 	struct list_head *tmp;
3719 	int err;
3720 
3721 	if (list_empty(&mddev->disks))
3722 		return;
3723 
3724 	printk(KERN_INFO "md: running: ");
3725 
3726 	rdev_for_each(rdev, tmp, mddev) {
3727 		char b[BDEVNAME_SIZE];
3728 		printk("<%s>", bdevname(rdev->bdev,b));
3729 	}
3730 	printk("\n");
3731 
3732 	err = do_md_run (mddev);
3733 	if (err) {
3734 		printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
3735 		do_md_stop (mddev, 0);
3736 	}
3737 }
3738 
3739 /*
3740  * let's try to run arrays based on all disks that have arrived
3741  * until now. (those are in pending_raid_disks)
3742  *
3743  * the method: pick the first pending disk, collect all disks with
3744  * the same UUID, remove all from the pending list and put them into
3745  * the 'same_array' list. Then order this list based on superblock
3746  * update time (freshest comes first), kick out 'old' disks and
3747  * compare superblocks. If everything's fine then run it.
3748  *
3749  * If "unit" is allocated, then bump its reference count
3750  */
3751 static void autorun_devices(int part)
3752 {
3753 	struct list_head *tmp;
3754 	mdk_rdev_t *rdev0, *rdev;
3755 	mddev_t *mddev;
3756 	char b[BDEVNAME_SIZE];
3757 
3758 	printk(KERN_INFO "md: autorun ...\n");
3759 	while (!list_empty(&pending_raid_disks)) {
3760 		int unit;
3761 		dev_t dev;
3762 		LIST_HEAD(candidates);
3763 		rdev0 = list_entry(pending_raid_disks.next,
3764 					 mdk_rdev_t, same_set);
3765 
3766 		printk(KERN_INFO "md: considering %s ...\n",
3767 			bdevname(rdev0->bdev,b));
3768 		INIT_LIST_HEAD(&candidates);
3769 		rdev_for_each_list(rdev, tmp, pending_raid_disks)
3770 			if (super_90_load(rdev, rdev0, 0) >= 0) {
3771 				printk(KERN_INFO "md:  adding %s ...\n",
3772 					bdevname(rdev->bdev,b));
3773 				list_move(&rdev->same_set, &candidates);
3774 			}
3775 		/*
3776 		 * now we have a set of devices, with all of them having
3777 		 * mostly sane superblocks. It's time to allocate the
3778 		 * mddev.
3779 		 */
3780 		if (part) {
3781 			dev = MKDEV(mdp_major,
3782 				    rdev0->preferred_minor << MdpMinorShift);
3783 			unit = MINOR(dev) >> MdpMinorShift;
3784 		} else {
3785 			dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
3786 			unit = MINOR(dev);
3787 		}
3788 		if (rdev0->preferred_minor != unit) {
3789 			printk(KERN_INFO "md: unit number in %s is bad: %d\n",
3790 			       bdevname(rdev0->bdev, b), rdev0->preferred_minor);
3791 			break;
3792 		}
3793 
3794 		md_probe(dev, NULL, NULL);
3795 		mddev = mddev_find(dev);
3796 		if (!mddev) {
3797 			printk(KERN_ERR
3798 				"md: cannot allocate memory for md drive.\n");
3799 			break;
3800 		}
3801 		if (mddev_lock(mddev))
3802 			printk(KERN_WARNING "md: %s locked, cannot run\n",
3803 			       mdname(mddev));
3804 		else if (mddev->raid_disks || mddev->major_version
3805 			 || !list_empty(&mddev->disks)) {
3806 			printk(KERN_WARNING
3807 				"md: %s already running, cannot run %s\n",
3808 				mdname(mddev), bdevname(rdev0->bdev,b));
3809 			mddev_unlock(mddev);
3810 		} else {
3811 			printk(KERN_INFO "md: created %s\n", mdname(mddev));
3812 			mddev->persistent = 1;
3813 			rdev_for_each_list(rdev, tmp, candidates) {
3814 				list_del_init(&rdev->same_set);
3815 				if (bind_rdev_to_array(rdev, mddev))
3816 					export_rdev(rdev);
3817 			}
3818 			autorun_array(mddev);
3819 			mddev_unlock(mddev);
3820 		}
3821 		/* on success, candidates will be empty; on error
3822 		 * it won't be...
3823 		 */
3824 		rdev_for_each_list(rdev, tmp, candidates)
3825 			export_rdev(rdev);
3826 		mddev_put(mddev);
3827 	}
3828 	printk(KERN_INFO "md: ... autorun DONE.\n");
3829 }
3830 #endif /* !MODULE */
3831 
3832 static int get_version(void __user * arg)
3833 {
3834 	mdu_version_t ver;
3835 
3836 	ver.major = MD_MAJOR_VERSION;
3837 	ver.minor = MD_MINOR_VERSION;
3838 	ver.patchlevel = MD_PATCHLEVEL_VERSION;
3839 
3840 	if (copy_to_user(arg, &ver, sizeof(ver)))
3841 		return -EFAULT;
3842 
3843 	return 0;
3844 }
3845 
3846 static int get_array_info(mddev_t * mddev, void __user * arg)
3847 {
3848 	mdu_array_info_t info;
3849 	int nr,working,active,failed,spare;
3850 	mdk_rdev_t *rdev;
3851 	struct list_head *tmp;
3852 
3853 	nr=working=active=failed=spare=0;
3854 	rdev_for_each(rdev, tmp, mddev) {
3855 		nr++;
3856 		if (test_bit(Faulty, &rdev->flags))
3857 			failed++;
3858 		else {
3859 			working++;
3860 			if (test_bit(In_sync, &rdev->flags))
3861 				active++;
3862 			else
3863 				spare++;
3864 		}
3865 	}
3866 
3867 	info.major_version = mddev->major_version;
3868 	info.minor_version = mddev->minor_version;
3869 	info.patch_version = MD_PATCHLEVEL_VERSION;
3870 	info.ctime         = mddev->ctime;
3871 	info.level         = mddev->level;
3872 	info.size          = mddev->size;
3873 	if (info.size != mddev->size) /* overflow */
3874 		info.size = -1;
3875 	info.nr_disks      = nr;
3876 	info.raid_disks    = mddev->raid_disks;
3877 	info.md_minor      = mddev->md_minor;
3878 	info.not_persistent= !mddev->persistent;
3879 
3880 	info.utime         = mddev->utime;
3881 	info.state         = 0;
3882 	if (mddev->in_sync)
3883 		info.state = (1<<MD_SB_CLEAN);
3884 	if (mddev->bitmap && mddev->bitmap_offset)
3885 		info.state |= (1<<MD_SB_BITMAP_PRESENT);
3886 	info.active_disks  = active;
3887 	info.working_disks = working;
3888 	info.failed_disks  = failed;
3889 	info.spare_disks   = spare;
3890 
3891 	info.layout        = mddev->layout;
3892 	info.chunk_size    = mddev->chunk_size;
3893 
3894 	if (copy_to_user(arg, &info, sizeof(info)))
3895 		return -EFAULT;
3896 
3897 	return 0;
3898 }
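
/*
 * For illustration, the user-space side of this call: GET_ARRAY_INFO
 * hands back the mdu_array_info_t filled in above.  A minimal sketch,
 * assuming the exported <linux/raid/md_u.h> header and an existing
 * /dev/md0 node (both names illustrative):
 *
 *	#include <stdio.h>
 *	#include <fcntl.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/raid/md_u.h>
 *
 *	int main(void)
 *	{
 *		mdu_array_info_t info;
 *		int fd = open("/dev/md0", O_RDONLY);
 *
 *		if (fd < 0 || ioctl(fd, GET_ARRAY_INFO, &info) < 0) {
 *			perror("GET_ARRAY_INFO");
 *			return 1;
 *		}
 *		printf("level %d: %d/%d disks active, chunk %dk\n",
 *		       info.level, info.active_disks, info.raid_disks,
 *		       info.chunk_size / 1024);
 *		return 0;
 *	}
 */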
3899 
3900 static int get_bitmap_file(mddev_t * mddev, void __user * arg)
3901 {
3902 	mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
3903 	char *ptr, *buf = NULL;
3904 	int err = -ENOMEM;
3905 
3906 	md_allow_write(mddev);
3907 
3908 	file = kmalloc(sizeof(*file), GFP_KERNEL);
3909 	if (!file)
3910 		goto out;
3911 
3912 	/* bitmap disabled, zero the first byte and copy out */
3913 	if (!mddev->bitmap || !mddev->bitmap->file) {
3914 		file->pathname[0] = '\0';
3915 		goto copy_out;
3916 	}
3917 
3918 	buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
3919 	if (!buf)
3920 		goto out;
3921 
3922 	ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname));
3923 	if (!ptr)
3924 		goto out;
3925 
3926 	strcpy(file->pathname, ptr);
3927 
3928 copy_out:
3929 	err = 0;
3930 	if (copy_to_user(arg, file, sizeof(*file)))
3931 		err = -EFAULT;
3932 out:
3933 	kfree(buf);
3934 	kfree(file);
3935 	return err;
3936 }
3937 
3938 static int get_disk_info(mddev_t * mddev, void __user * arg)
3939 {
3940 	mdu_disk_info_t info;
3941 	unsigned int nr;
3942 	mdk_rdev_t *rdev;
3943 
3944 	if (copy_from_user(&info, arg, sizeof(info)))
3945 		return -EFAULT;
3946 
3947 	nr = info.number;
3948 
3949 	rdev = find_rdev_nr(mddev, nr);
3950 	if (rdev) {
3951 		info.major = MAJOR(rdev->bdev->bd_dev);
3952 		info.minor = MINOR(rdev->bdev->bd_dev);
3953 		info.raid_disk = rdev->raid_disk;
3954 		info.state = 0;
3955 		if (test_bit(Faulty, &rdev->flags))
3956 			info.state |= (1<<MD_DISK_FAULTY);
3957 		else if (test_bit(In_sync, &rdev->flags)) {
3958 			info.state |= (1<<MD_DISK_ACTIVE);
3959 			info.state |= (1<<MD_DISK_SYNC);
3960 		}
3961 		if (test_bit(WriteMostly, &rdev->flags))
3962 			info.state |= (1<<MD_DISK_WRITEMOSTLY);
3963 	} else {
3964 		info.major = info.minor = 0;
3965 		info.raid_disk = -1;
3966 		info.state = (1<<MD_DISK_REMOVED);
3967 	}
3968 
3969 	if (copy_to_user(arg, &info, sizeof(info)))
3970 		return -EFAULT;
3971 
3972 	return 0;
3973 }
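
/*
 * GET_DISK_INFO is keyed on info.number, so enumerating the whole disk
 * table from user space is just a loop over slot numbers.  A sketch
 * (fd is an open array device as in the example after get_array_info();
 * MD_SB_DISKS and the MD_DISK_* bits come from <linux/raid/md_p.h>):
 *
 *	mdu_disk_info_t disk;
 *	int nr;
 *
 *	for (nr = 0; nr < MD_SB_DISKS; nr++) {
 *		disk.number = nr;
 *		if (ioctl(fd, GET_DISK_INFO, &disk) < 0)
 *			continue;
 *		if (!(disk.state & (1 << MD_DISK_REMOVED)))
 *			printf("slot %d: dev %d:%d state %#x\n", nr,
 *			       disk.major, disk.minor, disk.state);
 *	}
 */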
3974 
3975 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
3976 {
3977 	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
3978 	mdk_rdev_t *rdev;
3979 	dev_t dev = MKDEV(info->major,info->minor);
3980 
3981 	if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
3982 		return -EOVERFLOW;
3983 
3984 	if (!mddev->raid_disks) {
3985 		int err;
3986 		/* expecting a device which has a superblock */
3987 		rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
3988 		if (IS_ERR(rdev)) {
3989 			printk(KERN_WARNING
3990 				"md: md_import_device returned %ld\n",
3991 				PTR_ERR(rdev));
3992 			return PTR_ERR(rdev);
3993 		}
3994 		if (!list_empty(&mddev->disks)) {
3995 			mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
3996 							mdk_rdev_t, same_set);
3997 			int err = super_types[mddev->major_version]
3998 				.load_super(rdev, rdev0, mddev->minor_version);
3999 			if (err < 0) {
4000 				printk(KERN_WARNING
4001 					"md: %s has different UUID to %s\n",
4002 					bdevname(rdev->bdev,b),
4003 					bdevname(rdev0->bdev,b2));
4004 				export_rdev(rdev);
4005 				return -EINVAL;
4006 			}
4007 		}
4008 		err = bind_rdev_to_array(rdev, mddev);
4009 		if (err)
4010 			export_rdev(rdev);
4011 		return err;
4012 	}
4013 
4014 	/*
4015 	 * add_new_disk can be used once the array is assembled
4016 	 * to add "hot spares".  They must already have a superblock
4017 	 * written.
4018 	 */
4019 	if (mddev->pers) {
4020 		int err;
4021 		if (!mddev->pers->hot_add_disk) {
4022 			printk(KERN_WARNING
4023 				"%s: personality does not support diskops!\n",
4024 			       mdname(mddev));
4025 			return -EINVAL;
4026 		}
4027 		if (mddev->persistent)
4028 			rdev = md_import_device(dev, mddev->major_version,
4029 						mddev->minor_version);
4030 		else
4031 			rdev = md_import_device(dev, -1, -1);
4032 		if (IS_ERR(rdev)) {
4033 			printk(KERN_WARNING
4034 				"md: md_import_device returned %ld\n",
4035 				PTR_ERR(rdev));
4036 			return PTR_ERR(rdev);
4037 		}
4038 		/* set save_raid_disk if appropriate */
4039 		if (!mddev->persistent) {
4040 			if (info->state & (1<<MD_DISK_SYNC)  &&
4041 			    info->raid_disk < mddev->raid_disks)
4042 				rdev->raid_disk = info->raid_disk;
4043 			else
4044 				rdev->raid_disk = -1;
4045 		} else
4046 			super_types[mddev->major_version].
4047 				validate_super(mddev, rdev);
4048 		rdev->saved_raid_disk = rdev->raid_disk;
4049 
4050 		clear_bit(In_sync, &rdev->flags); /* just to be sure */
4051 		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
4052 			set_bit(WriteMostly, &rdev->flags);
4053 
4054 		rdev->raid_disk = -1;
4055 		err = bind_rdev_to_array(rdev, mddev);
4056 		if (!err && !mddev->pers->hot_remove_disk) {
4057 			/* If there is hot_add_disk but no hot_remove_disk
4058 			 * then added disks for geometry changes,
4059 			 * then added disks are for geometry changes,
4060 			 * and should be activated immediately.
4061 			super_types[mddev->major_version].
4062 				validate_super(mddev, rdev);
4063 			err = mddev->pers->hot_add_disk(mddev, rdev);
4064 			if (err)
4065 				unbind_rdev_from_array(rdev);
4066 		}
4067 		if (err)
4068 			export_rdev(rdev);
4069 
4070 		md_update_sb(mddev, 1);
4071 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4072 		md_wakeup_thread(mddev->thread);
4073 		return err;
4074 	}
4075 
4076 	/* otherwise, add_new_disk is only allowed
4077 	 * for major_version==0 superblocks
4078 	 */
4079 	if (mddev->major_version != 0) {
4080 		printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
4081 		       mdname(mddev));
4082 		return -EINVAL;
4083 	}
4084 
4085 	if (!(info->state & (1<<MD_DISK_FAULTY))) {
4086 		int err;
4087 		rdev = md_import_device (dev, -1, 0);
4088 		if (IS_ERR(rdev)) {
4089 			printk(KERN_WARNING
4090 				"md: error, md_import_device() returned %ld\n",
4091 				PTR_ERR(rdev));
4092 			return PTR_ERR(rdev);
4093 		}
4094 		rdev->desc_nr = info->number;
4095 		if (info->raid_disk < mddev->raid_disks)
4096 			rdev->raid_disk = info->raid_disk;
4097 		else
4098 			rdev->raid_disk = -1;
4099 
4100 		if (rdev->raid_disk < mddev->raid_disks)
4101 			if (info->state & (1<<MD_DISK_SYNC))
4102 				set_bit(In_sync, &rdev->flags);
4103 
4104 		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
4105 			set_bit(WriteMostly, &rdev->flags);
4106 
4107 		if (!mddev->persistent) {
4108 			printk(KERN_INFO "md: nonpersistent superblock ...\n");
4109 			rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
4110 		} else
4111 			rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
4112 		rdev->size = calc_dev_size(rdev, mddev->chunk_size);
4113 
4114 		err = bind_rdev_to_array(rdev, mddev);
4115 		if (err) {
4116 			export_rdev(rdev);
4117 			return err;
4118 		}
4119 	}
4120 
4121 	return 0;
4122 }
4123 
4124 static int hot_remove_disk(mddev_t * mddev, dev_t dev)
4125 {
4126 	char b[BDEVNAME_SIZE];
4127 	mdk_rdev_t *rdev;
4128 
4129 	if (!mddev->pers)
4130 		return -ENODEV;
4131 
4132 	rdev = find_rdev(mddev, dev);
4133 	if (!rdev)
4134 		return -ENXIO;
4135 
4136 	if (rdev->raid_disk >= 0)
4137 		goto busy;
4138 
4139 	kick_rdev_from_array(rdev);
4140 	md_update_sb(mddev, 1);
4141 	md_new_event(mddev);
4142 
4143 	return 0;
4144 busy:
4145 	printk(KERN_WARNING "md: cannot remove active disk %s from %s ... \n",
4146 		bdevname(rdev->bdev,b), mdname(mddev));
4147 	return -EBUSY;
4148 }
4149 
4150 static int hot_add_disk(mddev_t * mddev, dev_t dev)
4151 {
4152 	char b[BDEVNAME_SIZE];
4153 	int err;
4154 	unsigned int size;
4155 	mdk_rdev_t *rdev;
4156 
4157 	if (!mddev->pers)
4158 		return -ENODEV;
4159 
4160 	if (mddev->major_version != 0) {
4161 		printk(KERN_WARNING "%s: HOT_ADD may only be used with"
4162 			" version-0 superblocks.\n",
4163 			mdname(mddev));
4164 		return -EINVAL;
4165 	}
4166 	if (!mddev->pers->hot_add_disk) {
4167 		printk(KERN_WARNING
4168 			"%s: personality does not support diskops!\n",
4169 			mdname(mddev));
4170 		return -EINVAL;
4171 	}
4172 
4173 	rdev = md_import_device (dev, -1, 0);
4174 	if (IS_ERR(rdev)) {
4175 		printk(KERN_WARNING
4176 			"md: error, md_import_device() returned %ld\n",
4177 			PTR_ERR(rdev));
4178 		return -EINVAL;
4179 	}
4180 
4181 	if (mddev->persistent)
4182 		rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
4183 	else
4184 		rdev->sb_offset =
4185 			rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
4186 
4187 	size = calc_dev_size(rdev, mddev->chunk_size);
4188 	rdev->size = size;
4189 
4190 	if (test_bit(Faulty, &rdev->flags)) {
4191 		printk(KERN_WARNING
4192 			"md: can not hot-add faulty %s disk to %s!\n",
4193 			bdevname(rdev->bdev,b), mdname(mddev));
4194 		err = -EINVAL;
4195 		goto abort_export;
4196 	}
4197 	clear_bit(In_sync, &rdev->flags);
4198 	rdev->desc_nr = -1;
4199 	rdev->saved_raid_disk = -1;
4200 	err = bind_rdev_to_array(rdev, mddev);
4201 	if (err)
4202 		goto abort_export;
4203 
4204 	/*
4205 	 * The rest had better be atomic; disk failures can be
4206 	 * noticed in interrupt context ...
4207 	 */
4208 
4209 	if (rdev->desc_nr == mddev->max_disks) {
4210 		printk(KERN_WARNING "%s: can not hot-add to full array!\n",
4211 			mdname(mddev));
4212 		err = -EBUSY;
4213 		goto abort_unbind_export;
4214 	}
4215 
4216 	rdev->raid_disk = -1;
4217 
4218 	md_update_sb(mddev, 1);
4219 
4220 	/*
4221 	 * Kick recovery, maybe this spare has to be added to the
4222 	 * array immediately.
4223 	 */
4224 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4225 	md_wakeup_thread(mddev->thread);
4226 	md_new_event(mddev);
4227 	return 0;
4228 
4229 abort_unbind_export:
4230 	unbind_rdev_from_array(rdev);
4231 
4232 abort_export:
4233 	export_rdev(rdev);
4234 	return err;
4235 }
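
/*
 * Both hot_add_disk() and hot_remove_disk() take the component's device
 * number directly as the ioctl argument and decode it with
 * new_decode_dev().  A rough user-space sketch (array and component
 * paths are illustrative):
 *
 *	struct stat st;
 *	int fd = open("/dev/md0", O_RDONLY);
 *
 *	stat("/dev/sdc1", &st);
 *	if (ioctl(fd, HOT_ADD_DISK, (unsigned long)st.st_rdev) < 0)
 *		perror("HOT_ADD_DISK");
 *
 * and correspondingly HOT_REMOVE_DISK with the same argument once the
 * disk has been failed and no longer holds an active slot.
 */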
4236 
4237 static int set_bitmap_file(mddev_t *mddev, int fd)
4238 {
4239 	int err;
4240 
4241 	if (mddev->pers) {
4242 		if (!mddev->pers->quiesce)
4243 			return -EBUSY;
4244 		if (mddev->recovery || mddev->sync_thread)
4245 			return -EBUSY;
4246 		/* we should be able to change the bitmap.. */
4247 	}
4248 
4249 
4250 	if (fd >= 0) {
4251 		if (mddev->bitmap)
4252 			return -EEXIST; /* cannot add when bitmap is present */
4253 		mddev->bitmap_file = fget(fd);
4254 
4255 		if (mddev->bitmap_file == NULL) {
4256 			printk(KERN_ERR "%s: error: failed to get bitmap file\n",
4257 			       mdname(mddev));
4258 			return -EBADF;
4259 		}
4260 
4261 		err = deny_bitmap_write_access(mddev->bitmap_file);
4262 		if (err) {
4263 			printk(KERN_ERR "%s: error: bitmap file is already in use\n",
4264 			       mdname(mddev));
4265 			fput(mddev->bitmap_file);
4266 			mddev->bitmap_file = NULL;
4267 			return err;
4268 		}
4269 		mddev->bitmap_offset = 0; /* file overrides offset */
4270 	} else if (mddev->bitmap == NULL)
4271 		return -ENOENT; /* cannot remove what isn't there */
4272 	err = 0;
4273 	if (mddev->pers) {
4274 		mddev->pers->quiesce(mddev, 1);
4275 		if (fd >= 0)
4276 			err = bitmap_create(mddev);
4277 		if (fd < 0 || err) {
4278 			bitmap_destroy(mddev);
4279 			fd = -1; /* make sure to put the file */
4280 		}
4281 		mddev->pers->quiesce(mddev, 0);
4282 	}
4283 	if (fd < 0) {
4284 		if (mddev->bitmap_file) {
4285 			restore_bitmap_write_access(mddev->bitmap_file);
4286 			fput(mddev->bitmap_file);
4287 		}
4288 		mddev->bitmap_file = NULL;
4289 	}
4290 
4291 	return err;
4292 }
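
/*
 * From user space the SET_BITMAP_FILE argument is simply an open file
 * descriptor for a bitmap file that the management tool has already
 * created and formatted, or -1 to drop a file-backed bitmap again.
 * A sketch (paths illustrative, md_fd is the open array device):
 *
 *	int bfd = open("/var/lib/md/md0-bitmap", O_RDWR);
 *
 *	if (bfd < 0 || ioctl(md_fd, SET_BITMAP_FILE, bfd) < 0)
 *		perror("SET_BITMAP_FILE");
 *
 * and later ioctl(md_fd, SET_BITMAP_FILE, -1) removes it again.
 */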
4293 
4294 /*
4295  * set_array_info is used in two different ways.
4296  * The original usage is when creating a new array.
4297  * In this usage, raid_disks is > 0 and, together with
4298  *  level, size, not_persistent, layout and chunksize, determines the
4299  *  shape of the array.
4300  *  This will always create an array with a type-0.90.0 superblock.
4301  * The newer usage is when assembling an array.
4302  *  In this case raid_disks will be 0, and the major_version field is
4303  *  used to determine which style of superblock is to be found on the devices.
4304  *  The minor and patch _version numbers are also kept in case the
4305  *  super_block handler wishes to interpret them.
4306  */
4307 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
4308 {
4309 
4310 	if (info->raid_disks == 0) {
4311 		/* just setting version number for superblock loading */
4312 		if (info->major_version < 0 ||
4313 		    info->major_version >= ARRAY_SIZE(super_types) ||
4314 		    super_types[info->major_version].name == NULL) {
4315 			/* maybe try to auto-load a module? */
4316 			printk(KERN_INFO
4317 				"md: superblock version %d not known\n",
4318 				info->major_version);
4319 			return -EINVAL;
4320 		}
4321 		mddev->major_version = info->major_version;
4322 		mddev->minor_version = info->minor_version;
4323 		mddev->patch_version = info->patch_version;
4324 		mddev->persistent = !info->not_persistent;
4325 		return 0;
4326 	}
4327 	mddev->major_version = MD_MAJOR_VERSION;
4328 	mddev->minor_version = MD_MINOR_VERSION;
4329 	mddev->patch_version = MD_PATCHLEVEL_VERSION;
4330 	mddev->ctime         = get_seconds();
4331 
4332 	mddev->level         = info->level;
4333 	mddev->clevel[0]     = 0;
4334 	mddev->size          = info->size;
4335 	mddev->raid_disks    = info->raid_disks;
4336 	/* don't set md_minor, it is determined by which /dev/md* was
4337 	 * opened
4338 	 */
4339 	if (info->state & (1<<MD_SB_CLEAN))
4340 		mddev->recovery_cp = MaxSector;
4341 	else
4342 		mddev->recovery_cp = 0;
4343 	mddev->persistent    = ! info->not_persistent;
4344 	mddev->external	     = 0;
4345 
4346 	mddev->layout        = info->layout;
4347 	mddev->chunk_size    = info->chunk_size;
4348 
4349 	mddev->max_disks     = MD_SB_DISKS;
4350 
4351 	if (mddev->persistent)
4352 		mddev->flags         = 0;
4353 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
4354 
4355 	mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
4356 	mddev->bitmap_offset = 0;
4357 
4358 	mddev->reshape_position = MaxSector;
4359 
4360 	/*
4361 	 * Generate a 128 bit UUID
4362 	 */
4363 	get_random_bytes(mddev->uuid, 16);
4364 
4365 	mddev->new_level = mddev->level;
4366 	mddev->new_chunk = mddev->chunk_size;
4367 	mddev->new_layout = mddev->layout;
4368 	mddev->delta_disks = 0;
4369 
4370 	return 0;
4371 }
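
/*
 * Putting SET_ARRAY_INFO, ADD_NEW_DISK and RUN_ARRAY together, creating
 * a fresh v0.90 array from user space looks roughly like the sketch
 * below (illustrative values, no error handling; a real tool such as
 * mdadm fills in considerably more of the two structures):
 *
 *	mdu_array_info_t array = {
 *		.level      = 1,
 *		.raid_disks = 2,
 *		.size       = 1048576,
 *	};
 *	mdu_disk_info_t disk = {
 *		.number    = 0,
 *		.raid_disk = 0,
 *		.major     = 8,
 *		.minor     = 1,
 *		.state     = (1 << MD_DISK_ACTIVE) | (1 << MD_DISK_SYNC),
 *	};
 *	int fd = open("/dev/md0", O_RDONLY);
 *
 *	ioctl(fd, SET_ARRAY_INFO, &array);
 *	ioctl(fd, ADD_NEW_DISK, &disk);
 *	ioctl(fd, RUN_ARRAY, 0);
 *
 * .size is in KB per component; ADD_NEW_DISK is issued once per
 * component with number/raid_disk/major/minor adjusted each time.
 */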
4372 
4373 static int update_size(mddev_t *mddev, unsigned long size)
4374 {
4375 	mdk_rdev_t * rdev;
4376 	int rv;
4377 	struct list_head *tmp;
4378 	int fit = (size == 0);
4379 
4380 	if (mddev->pers->resize == NULL)
4381 		return -EINVAL;
4382 	/* The "size" is the amount of each device that is used.
4383 	 * This can only make sense for arrays with redundancy.
4384 	 * linear and raid0 always use whatever space is available.
4385 	 * We can only consider changing the size if no resync
4386 	 * or reconstruction is happening, and if the new size
4387 	 * is acceptable. It must fit before the sb_offset or,
4388 	 * if that is <data_offset, it must fit before the
4389 	 * size of each device.
4390 	 * If size is zero, we find the largest size that fits.
4391 	 */
4392 	if (mddev->sync_thread)
4393 		return -EBUSY;
4394 	rdev_for_each(rdev, tmp, mddev) {
4395 		sector_t avail;
4396 		avail = rdev->size * 2;
4397 
4398 		if (fit && (size == 0 || size > avail/2))
4399 			size = avail/2;
4400 		if (avail < ((sector_t)size << 1))
4401 			return -ENOSPC;
4402 	}
4403 	rv = mddev->pers->resize(mddev, (sector_t)size *2);
4404 	if (!rv) {
4405 		struct block_device *bdev;
4406 
4407 		bdev = bdget_disk(mddev->gendisk, 0);
4408 		if (bdev) {
4409 			mutex_lock(&bdev->bd_inode->i_mutex);
4410 			i_size_write(bdev->bd_inode, (loff_t)mddev->array_size << 10);
4411 			mutex_unlock(&bdev->bd_inode->i_mutex);
4412 			bdput(bdev);
4413 		}
4414 	}
4415 	return rv;
4416 }
4417 
4418 static int update_raid_disks(mddev_t *mddev, int raid_disks)
4419 {
4420 	int rv;
4421 	/* change the number of raid disks */
4422 	if (mddev->pers->check_reshape == NULL)
4423 		return -EINVAL;
4424 	if (raid_disks <= 0 ||
4425 	    raid_disks >= mddev->max_disks)
4426 		return -EINVAL;
4427 	if (mddev->sync_thread || mddev->reshape_position != MaxSector)
4428 		return -EBUSY;
4429 	mddev->delta_disks = raid_disks - mddev->raid_disks;
4430 
4431 	rv = mddev->pers->check_reshape(mddev);
4432 	return rv;
4433 }
4434 
4435 
4436 /*
4437  * update_array_info is used to change the configuration of an
4438  * on-line array.
4439  * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size
4440  * fields in the info are checked against the array.
4441  * Any differences that cannot be handled will cause an error.
4442  * Normally, only one change can be managed at a time.
4443  */
4444 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
4445 {
4446 	int rv = 0;
4447 	int cnt = 0;
4448 	int state = 0;
4449 
4450 	/* calculate expected state, ignoring low bits */
4451 	if (mddev->bitmap && mddev->bitmap_offset)
4452 		state |= (1 << MD_SB_BITMAP_PRESENT);
4453 
4454 	if (mddev->major_version != info->major_version ||
4455 	    mddev->minor_version != info->minor_version ||
4456 /*	    mddev->patch_version != info->patch_version || */
4457 	    mddev->ctime         != info->ctime         ||
4458 	    mddev->level         != info->level         ||
4459 /*	    mddev->layout        != info->layout        || */
4460 	    !mddev->persistent	 != info->not_persistent||
4461 	    mddev->chunk_size    != info->chunk_size    ||
4462 	    /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
4463 	    ((state^info->state) & 0xfffffe00)
4464 		)
4465 		return -EINVAL;
4466 	/* Check there is only one change */
4467 	if (info->size >= 0 && mddev->size != info->size) cnt++;
4468 	if (mddev->raid_disks != info->raid_disks) cnt++;
4469 	if (mddev->layout != info->layout) cnt++;
4470 	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++;
4471 	if (cnt == 0) return 0;
4472 	if (cnt > 1) return -EINVAL;
4473 
4474 	if (mddev->layout != info->layout) {
4475 		/* Change layout
4476 		 * we don't need to do anything at the md level, the
4477 		 * personality will take care of it all.
4478 		 */
4479 		if (mddev->pers->reconfig == NULL)
4480 			return -EINVAL;
4481 		else
4482 			return mddev->pers->reconfig(mddev, info->layout, -1);
4483 	}
4484 	if (info->size >= 0 && mddev->size != info->size)
4485 		rv = update_size(mddev, info->size);
4486 
4487 	if (mddev->raid_disks    != info->raid_disks)
4488 		rv = update_raid_disks(mddev, info->raid_disks);
4489 
4490 	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
4491 		if (mddev->pers->quiesce == NULL)
4492 			return -EINVAL;
4493 		if (mddev->recovery || mddev->sync_thread)
4494 			return -EBUSY;
4495 		if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
4496 			/* add the bitmap */
4497 			if (mddev->bitmap)
4498 				return -EEXIST;
4499 			if (mddev->default_bitmap_offset == 0)
4500 				return -EINVAL;
4501 			mddev->bitmap_offset = mddev->default_bitmap_offset;
4502 			mddev->pers->quiesce(mddev, 1);
4503 			rv = bitmap_create(mddev);
4504 			if (rv)
4505 				bitmap_destroy(mddev);
4506 			mddev->pers->quiesce(mddev, 0);
4507 		} else {
4508 			/* remove the bitmap */
4509 			if (!mddev->bitmap)
4510 				return -ENOENT;
4511 			if (mddev->bitmap->file)
4512 				return -EINVAL;
4513 			mddev->pers->quiesce(mddev, 1);
4514 			bitmap_destroy(mddev);
4515 			mddev->pers->quiesce(mddev, 0);
4516 			mddev->bitmap_offset = 0;
4517 		}
4518 	}
4519 	md_update_sb(mddev, 1);
4520 	return rv;
4521 }
4522 
4523 static int set_disk_faulty(mddev_t *mddev, dev_t dev)
4524 {
4525 	mdk_rdev_t *rdev;
4526 
4527 	if (mddev->pers == NULL)
4528 		return -ENODEV;
4529 
4530 	rdev = find_rdev(mddev, dev);
4531 	if (!rdev)
4532 		return -ENODEV;
4533 
4534 	md_error(mddev, rdev);
4535 	return 0;
4536 }
4537 
4538 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
4539 {
4540 	mddev_t *mddev = bdev->bd_disk->private_data;
4541 
4542 	geo->heads = 2;
4543 	geo->sectors = 4;
4544 	geo->cylinders = get_capacity(mddev->gendisk) / 8;
4545 	return 0;
4546 }
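
/*
 * With the fake geometry above a "cylinder" is heads * sectors = 8
 * sectors, i.e. 4KB.  For example a 1TiB array:
 *
 *	cylinders = capacity / 8 = 2147483648 / 8 = 268435456
 *
 * which is harmless but, as the comment further down in md_ioctl()
 * notes, not something DOS tools enjoy.
 */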
4547 
4548 static int md_ioctl(struct inode *inode, struct file *file,
4549 			unsigned int cmd, unsigned long arg)
4550 {
4551 	int err = 0;
4552 	void __user *argp = (void __user *)arg;
4553 	mddev_t *mddev = NULL;
4554 
4555 	if (!capable(CAP_SYS_ADMIN))
4556 		return -EACCES;
4557 
4558 	/*
4559 	 * Commands dealing with the RAID driver but not any
4560 	 * particular array:
4561 	 */
4562 	switch (cmd)
4563 	{
4564 		case RAID_VERSION:
4565 			err = get_version(argp);
4566 			goto done;
4567 
4568 		case PRINT_RAID_DEBUG:
4569 			err = 0;
4570 			md_print_devices();
4571 			goto done;
4572 
4573 #ifndef MODULE
4574 		case RAID_AUTORUN:
4575 			err = 0;
4576 			autostart_arrays(arg);
4577 			goto done;
4578 #endif
4579 		default:;
4580 	}
4581 
4582 	/*
4583 	 * Commands creating/starting a new array:
4584 	 */
4585 
4586 	mddev = inode->i_bdev->bd_disk->private_data;
4587 
4588 	if (!mddev) {
4589 		BUG();
4590 		goto abort;
4591 	}
4592 
4593 	err = mddev_lock(mddev);
4594 	if (err) {
4595 		printk(KERN_INFO
4596 			"md: ioctl lock interrupted, reason %d, cmd %d\n",
4597 			err, cmd);
4598 		goto abort;
4599 	}
4600 
4601 	switch (cmd)
4602 	{
4603 		case SET_ARRAY_INFO:
4604 			{
4605 				mdu_array_info_t info;
4606 				if (!arg)
4607 					memset(&info, 0, sizeof(info));
4608 				else if (copy_from_user(&info, argp, sizeof(info))) {
4609 					err = -EFAULT;
4610 					goto abort_unlock;
4611 				}
4612 				if (mddev->pers) {
4613 					err = update_array_info(mddev, &info);
4614 					if (err) {
4615 						printk(KERN_WARNING "md: couldn't update"
4616 						       " array info. %d\n", err);
4617 						goto abort_unlock;
4618 					}
4619 					goto done_unlock;
4620 				}
4621 				if (!list_empty(&mddev->disks)) {
4622 					printk(KERN_WARNING
4623 					       "md: array %s already has disks!\n",
4624 					       mdname(mddev));
4625 					err = -EBUSY;
4626 					goto abort_unlock;
4627 				}
4628 				if (mddev->raid_disks) {
4629 					printk(KERN_WARNING
4630 					       "md: array %s already initialised!\n",
4631 					       mdname(mddev));
4632 					err = -EBUSY;
4633 					goto abort_unlock;
4634 				}
4635 				err = set_array_info(mddev, &info);
4636 				if (err) {
4637 					printk(KERN_WARNING "md: couldn't set"
4638 					       " array info. %d\n", err);
4639 					goto abort_unlock;
4640 				}
4641 			}
4642 			goto done_unlock;
4643 
4644 		default:;
4645 	}
4646 
4647 	/*
4648 	 * Commands querying/configuring an existing array:
4649 	 */
4650 	/* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
4651 	 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
4652 	if ((!mddev->raid_disks && !mddev->external)
4653 	    && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
4654 	    && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
4655 	    && cmd != GET_BITMAP_FILE) {
4656 		err = -ENODEV;
4657 		goto abort_unlock;
4658 	}
4659 
4660 	/*
4661 	 * Commands even a read-only array can execute:
4662 	 */
4663 	switch (cmd)
4664 	{
4665 		case GET_ARRAY_INFO:
4666 			err = get_array_info(mddev, argp);
4667 			goto done_unlock;
4668 
4669 		case GET_BITMAP_FILE:
4670 			err = get_bitmap_file(mddev, argp);
4671 			goto done_unlock;
4672 
4673 		case GET_DISK_INFO:
4674 			err = get_disk_info(mddev, argp);
4675 			goto done_unlock;
4676 
4677 		case RESTART_ARRAY_RW:
4678 			err = restart_array(mddev);
4679 			goto done_unlock;
4680 
4681 		case STOP_ARRAY:
4682 			err = do_md_stop (mddev, 0);
4683 			goto done_unlock;
4684 
4685 		case STOP_ARRAY_RO:
4686 			err = do_md_stop (mddev, 1);
4687 			goto done_unlock;
4688 
4689 	/*
4690 	 * We have a problem here : there is no easy way to give a CHS
4691 	 * virtual geometry. We currently pretend that we have 2 heads and
4692 	 * 4 sectors (with a BIG number of cylinders...). This drives
4693 	 * dosfs just mad... ;-)
4694 	 */
4695 	}
4696 
4697 	/*
4698 	 * The remaining ioctls are changing the state of the
4699 	 * superblock, so we do not allow them on read-only arrays.
4700 	 * However non-MD ioctls (e.g. get-size) will still come through
4701 	 * here and hit the 'default' below, so only disallow
4702 	 * 'md' ioctls, and switch to rw mode if started auto-readonly.
4703 	 */
4704 	if (_IOC_TYPE(cmd) == MD_MAJOR &&
4705 	    mddev->ro && mddev->pers) {
4706 		if (mddev->ro == 2) {
4707 			mddev->ro = 0;
4708 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4709 			md_wakeup_thread(mddev->thread);
4710 
4711 		} else {
4712 			err = -EROFS;
4713 			goto abort_unlock;
4714 		}
4715 	}
4716 
4717 	switch (cmd)
4718 	{
4719 		case ADD_NEW_DISK:
4720 		{
4721 			mdu_disk_info_t info;
4722 			if (copy_from_user(&info, argp, sizeof(info)))
4723 				err = -EFAULT;
4724 			else
4725 				err = add_new_disk(mddev, &info);
4726 			goto done_unlock;
4727 		}
4728 
4729 		case HOT_REMOVE_DISK:
4730 			err = hot_remove_disk(mddev, new_decode_dev(arg));
4731 			goto done_unlock;
4732 
4733 		case HOT_ADD_DISK:
4734 			err = hot_add_disk(mddev, new_decode_dev(arg));
4735 			goto done_unlock;
4736 
4737 		case SET_DISK_FAULTY:
4738 			err = set_disk_faulty(mddev, new_decode_dev(arg));
4739 			goto done_unlock;
4740 
4741 		case RUN_ARRAY:
4742 			err = do_md_run (mddev);
4743 			goto done_unlock;
4744 
4745 		case SET_BITMAP_FILE:
4746 			err = set_bitmap_file(mddev, (int)arg);
4747 			goto done_unlock;
4748 
4749 		default:
4750 			err = -EINVAL;
4751 			goto abort_unlock;
4752 	}
4753 
4754 done_unlock:
4755 abort_unlock:
4756 	mddev_unlock(mddev);
4757 
4758 	return err;
4759 done:
4760 	if (err)
4761 		MD_BUG();
4762 abort:
4763 	return err;
4764 }
4765 
4766 static int md_open(struct inode *inode, struct file *file)
4767 {
4768 	/*
4769 	 * Succeed if we can lock the mddev, which confirms that
4770 	 * it isn't being stopped right now.
4771 	 */
4772 	mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
4773 	int err;
4774 
4775 	if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1)))
4776 		goto out;
4777 
4778 	err = 0;
4779 	mddev_get(mddev);
4780 	mddev_unlock(mddev);
4781 
4782 	check_disk_change(inode->i_bdev);
4783  out:
4784 	return err;
4785 }
4786 
4787 static int md_release(struct inode *inode, struct file * file)
4788 {
4789  	mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
4790 
4791 	BUG_ON(!mddev);
4792 	mddev_put(mddev);
4793 
4794 	return 0;
4795 }
4796 
4797 static int md_media_changed(struct gendisk *disk)
4798 {
4799 	mddev_t *mddev = disk->private_data;
4800 
4801 	return mddev->changed;
4802 }
4803 
4804 static int md_revalidate(struct gendisk *disk)
4805 {
4806 	mddev_t *mddev = disk->private_data;
4807 
4808 	mddev->changed = 0;
4809 	return 0;
4810 }
4811 static struct block_device_operations md_fops =
4812 {
4813 	.owner		= THIS_MODULE,
4814 	.open		= md_open,
4815 	.release	= md_release,
4816 	.ioctl		= md_ioctl,
4817 	.getgeo		= md_getgeo,
4818 	.media_changed	= md_media_changed,
4819 	.revalidate_disk= md_revalidate,
4820 };
4821 
4822 static int md_thread(void * arg)
4823 {
4824 	mdk_thread_t *thread = arg;
4825 
4826 	/*
4827 	 * md_thread is a 'system-thread', its priority should be very
4828 	 * high. We avoid resource deadlocks individually in each
4829 	 * raid personality. (RAID5 does preallocation) We also use RR and
4830 	 * the very same RT priority as kswapd, thus we will never get
4831 	 * into a priority inversion deadlock.
4832 	 *
4833 	 * we definitely have to have equal or higher priority than
4834 	 * bdflush, otherwise bdflush will deadlock if there are too
4835 	 * many dirty RAID5 blocks.
4836 	 */
4837 
4838 	allow_signal(SIGKILL);
4839 	while (!kthread_should_stop()) {
4840 
4841 		/* We need to wait INTERRUPTIBLE so that
4842 		 * we don't add to the load-average.
4843 		 * That means we need to be sure no signals are
4844 		 * pending
4845 		 */
4846 		if (signal_pending(current))
4847 			flush_signals(current);
4848 
4849 		wait_event_interruptible_timeout
4850 			(thread->wqueue,
4851 			 test_bit(THREAD_WAKEUP, &thread->flags)
4852 			 || kthread_should_stop(),
4853 			 thread->timeout);
4854 
4855 		clear_bit(THREAD_WAKEUP, &thread->flags);
4856 
4857 		thread->run(thread->mddev);
4858 	}
4859 
4860 	return 0;
4861 }
4862 
4863 void md_wakeup_thread(mdk_thread_t *thread)
4864 {
4865 	if (thread) {
4866 		dprintk("md: waking up MD thread %s.\n", thread->tsk->comm);
4867 		set_bit(THREAD_WAKEUP, &thread->flags);
4868 		wake_up(&thread->wqueue);
4869 	}
4870 }
4871 
4872 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
4873 				 const char *name)
4874 {
4875 	mdk_thread_t *thread;
4876 
4877 	thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL);
4878 	if (!thread)
4879 		return NULL;
4880 
4881 	init_waitqueue_head(&thread->wqueue);
4882 
4883 	thread->run = run;
4884 	thread->mddev = mddev;
4885 	thread->timeout = MAX_SCHEDULE_TIMEOUT;
4886 	thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev));
4887 	if (IS_ERR(thread->tsk)) {
4888 		kfree(thread);
4889 		return NULL;
4890 	}
4891 	return thread;
4892 }
4893 
4894 void md_unregister_thread(mdk_thread_t *thread)
4895 {
4896 	dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
4897 
4898 	kthread_stop(thread->tsk);
4899 	kfree(thread);
4900 }
4901 
4902 void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
4903 {
4904 	if (!mddev) {
4905 		MD_BUG();
4906 		return;
4907 	}
4908 
4909 	if (!rdev || test_bit(Faulty, &rdev->flags))
4910 		return;
4911 /*
4912 	dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
4913 		mdname(mddev),
4914 		MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
4915 		__builtin_return_address(0),__builtin_return_address(1),
4916 		__builtin_return_address(2),__builtin_return_address(3));
4917 */
4918 	if (!mddev->pers)
4919 		return;
4920 	if (!mddev->pers->error_handler)
4921 		return;
4922 	mddev->pers->error_handler(mddev,rdev);
4923 	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4924 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4925 	md_wakeup_thread(mddev->thread);
4926 	md_new_event_inintr(mddev);
4927 }
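
/*
 * md_error() is the hook the personalities call from their I/O
 * completion paths when a member device misbehaves; a raid1-flavoured
 * sketch (not the literal code of any personality):
 *
 *	if (!uptodate)
 *		md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
 *
 * The SET_DISK_FAULTY ioctl handled above ends up here as well, which
 * makes it convenient for exercising the failure paths by hand.
 */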
4928 
4929 /* seq_file implementation /proc/mdstat */
4930 
4931 static void status_unused(struct seq_file *seq)
4932 {
4933 	int i = 0;
4934 	mdk_rdev_t *rdev;
4935 	struct list_head *tmp;
4936 
4937 	seq_printf(seq, "unused devices: ");
4938 
4939 	rdev_for_each_list(rdev, tmp, pending_raid_disks) {
4940 		char b[BDEVNAME_SIZE];
4941 		i++;
4942 		seq_printf(seq, "%s ",
4943 			      bdevname(rdev->bdev,b));
4944 	}
4945 	if (!i)
4946 		seq_printf(seq, "<none>");
4947 
4948 	seq_printf(seq, "\n");
4949 }
4950 
4951 
4952 static void status_resync(struct seq_file *seq, mddev_t * mddev)
4953 {
4954 	sector_t max_blocks, resync, res;
4955 	unsigned long dt, db, rt;
4956 	int scale;
4957 	unsigned int per_milli;
4958 
4959 	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
4960 
4961 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
4962 		max_blocks = mddev->resync_max_sectors >> 1;
4963 	else
4964 		max_blocks = mddev->size;
4965 
4966 	/*
4967 	 * Should not happen.
4968 	 */
4969 	if (!max_blocks) {
4970 		MD_BUG();
4971 		return;
4972 	}
4973 	/* Pick 'scale' such that (resync>>scale)*1000 will fit
4974 	 * in a sector_t, and (max_blocks>>scale) will fit in a
4975 	 * u32, as those are the requirements for sector_div.
4976 	 * Thus 'scale' must be at least 10
4977 	 */
4978 	scale = 10;
4979 	if (sizeof(sector_t) > sizeof(unsigned long)) {
4980 		while ( max_blocks/2 > (1ULL<<(scale+32)))
4981 			scale++;
4982 	}
4983 	res = (resync>>scale)*1000;
4984 	sector_div(res, (u32)((max_blocks>>scale)+1));
4985 
4986 	per_milli = res;
4987 	{
4988 		int i, x = per_milli/50, y = 20-x;
4989 		seq_printf(seq, "[");
4990 		for (i = 0; i < x; i++)
4991 			seq_printf(seq, "=");
4992 		seq_printf(seq, ">");
4993 		for (i = 0; i < y; i++)
4994 			seq_printf(seq, ".");
4995 		seq_printf(seq, "] ");
4996 	}
4997 	seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
4998 		   (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
4999 		    "reshape" :
5000 		    (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
5001 		     "check" :
5002 		     (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
5003 		      "resync" : "recovery"))),
5004 		   per_milli/10, per_milli % 10,
5005 		   (unsigned long long) resync,
5006 		   (unsigned long long) max_blocks);
5007 
5008 	/*
5009 	 * We do not want to overflow, so the order of operands and
5010 	 * the * 100 / 100 trick are important. We do a +1 to be
5011 	 * safe against division by zero. We only estimate anyway.
5012 	 *
5013 	 * dt: time from mark until now
5014 	 * db: blocks written from mark until now
5015 	 * rt: remaining time
5016 	 */
5017 	dt = ((jiffies - mddev->resync_mark) / HZ);
5018 	if (!dt) dt++;
5019 	db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
5020 		- mddev->resync_mark_cnt;
5021 	rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100;
5022 
5023 	seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
5024 
5025 	seq_printf(seq, " speed=%ldK/sec", db/2/dt);
5026 }
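
/*
 * A worked example of the arithmetic above, with scale = 10: suppose
 * max_blocks = 1048576 (a 1GB resync) and resync = 262144 blocks, then
 *
 *	per_milli = ((262144 >> 10) * 1000) / ((1048576 >> 10) + 1)
 *	          = 256000 / 1025 = 249
 *
 * so the bar gets 249/50 = 4 '=' characters and the text reads " 24.9%".
 * If the current mark is dt = 10 seconds old and db = 20000 sectors were
 * synced since, then speed = db/2/dt = 1000K/sec and
 *
 *	rt = (10 * ((1048576 - 262144) / (20000/2/100 + 1))) / 100
 *	   = (10 * (786432 / 101)) / 100 = 778 seconds
 *
 * which is printed as "finish=12.9min".
 */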
5027 
5028 static void *md_seq_start(struct seq_file *seq, loff_t *pos)
5029 {
5030 	struct list_head *tmp;
5031 	loff_t l = *pos;
5032 	mddev_t *mddev;
5033 
5034 	if (l >= 0x10000)
5035 		return NULL;
5036 	if (!l--)
5037 		/* header */
5038 		return (void*)1;
5039 
5040 	spin_lock(&all_mddevs_lock);
5041 	list_for_each(tmp,&all_mddevs)
5042 		if (!l--) {
5043 			mddev = list_entry(tmp, mddev_t, all_mddevs);
5044 			mddev_get(mddev);
5045 			spin_unlock(&all_mddevs_lock);
5046 			return mddev;
5047 		}
5048 	spin_unlock(&all_mddevs_lock);
5049 	if (!l--)
5050 		return (void*)2;/* tail */
5051 	return NULL;
5052 }
5053 
5054 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
5055 {
5056 	struct list_head *tmp;
5057 	mddev_t *next_mddev, *mddev = v;
5058 
5059 	++*pos;
5060 	if (v == (void*)2)
5061 		return NULL;
5062 
5063 	spin_lock(&all_mddevs_lock);
5064 	if (v == (void*)1)
5065 		tmp = all_mddevs.next;
5066 	else
5067 		tmp = mddev->all_mddevs.next;
5068 	if (tmp != &all_mddevs)
5069 		next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
5070 	else {
5071 		next_mddev = (void*)2;
5072 		*pos = 0x10000;
5073 	}
5074 	spin_unlock(&all_mddevs_lock);
5075 
5076 	if (v != (void*)1)
5077 		mddev_put(mddev);
5078 	return next_mddev;
5079 
5080 }
5081 
5082 static void md_seq_stop(struct seq_file *seq, void *v)
5083 {
5084 	mddev_t *mddev = v;
5085 
5086 	if (mddev && v != (void*)1 && v != (void*)2)
5087 		mddev_put(mddev);
5088 }
5089 
5090 struct mdstat_info {
5091 	int event;
5092 };
5093 
5094 static int md_seq_show(struct seq_file *seq, void *v)
5095 {
5096 	mddev_t *mddev = v;
5097 	sector_t size;
5098 	struct list_head *tmp2;
5099 	mdk_rdev_t *rdev;
5100 	struct mdstat_info *mi = seq->private;
5101 	struct bitmap *bitmap;
5102 
5103 	if (v == (void*)1) {
5104 		struct mdk_personality *pers;
5105 		seq_printf(seq, "Personalities : ");
5106 		spin_lock(&pers_lock);
5107 		list_for_each_entry(pers, &pers_list, list)
5108 			seq_printf(seq, "[%s] ", pers->name);
5109 
5110 		spin_unlock(&pers_lock);
5111 		seq_printf(seq, "\n");
5112 		mi->event = atomic_read(&md_event_count);
5113 		return 0;
5114 	}
5115 	if (v == (void*)2) {
5116 		status_unused(seq);
5117 		return 0;
5118 	}
5119 
5120 	if (mddev_lock(mddev) < 0)
5121 		return -EINTR;
5122 
5123 	if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
5124 		seq_printf(seq, "%s : %sactive", mdname(mddev),
5125 						mddev->pers ? "" : "in");
5126 		if (mddev->pers) {
5127 			if (mddev->ro==1)
5128 				seq_printf(seq, " (read-only)");
5129 			if (mddev->ro==2)
5130 				seq_printf(seq, "(auto-read-only)");
5131 			seq_printf(seq, " %s", mddev->pers->name);
5132 		}
5133 
5134 		size = 0;
5135 		rdev_for_each(rdev, tmp2, mddev) {
5136 			char b[BDEVNAME_SIZE];
5137 			seq_printf(seq, " %s[%d]",
5138 				bdevname(rdev->bdev,b), rdev->desc_nr);
5139 			if (test_bit(WriteMostly, &rdev->flags))
5140 				seq_printf(seq, "(W)");
5141 			if (test_bit(Faulty, &rdev->flags)) {
5142 				seq_printf(seq, "(F)");
5143 				continue;
5144 			} else if (rdev->raid_disk < 0)
5145 				seq_printf(seq, "(S)"); /* spare */
5146 			size += rdev->size;
5147 		}
5148 
5149 		if (!list_empty(&mddev->disks)) {
5150 			if (mddev->pers)
5151 				seq_printf(seq, "\n      %llu blocks",
5152 					(unsigned long long)mddev->array_size);
5153 			else
5154 				seq_printf(seq, "\n      %llu blocks",
5155 					(unsigned long long)size);
5156 		}
5157 		if (mddev->persistent) {
5158 			if (mddev->major_version != 0 ||
5159 			    mddev->minor_version != 90) {
5160 				seq_printf(seq," super %d.%d",
5161 					   mddev->major_version,
5162 					   mddev->minor_version);
5163 			}
5164 		} else if (mddev->external)
5165 			seq_printf(seq, " super external:%s",
5166 				   mddev->metadata_type);
5167 		else
5168 			seq_printf(seq, " super non-persistent");
5169 
5170 		if (mddev->pers) {
5171 			mddev->pers->status (seq, mddev);
5172 			seq_printf(seq, "\n      ");
5173 			if (mddev->pers->sync_request) {
5174 				if (mddev->curr_resync > 2) {
5175 					status_resync (seq, mddev);
5176 					seq_printf(seq, "\n      ");
5177 				} else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
5178 					seq_printf(seq, "\tresync=DELAYED\n      ");
5179 				else if (mddev->recovery_cp < MaxSector)
5180 					seq_printf(seq, "\tresync=PENDING\n      ");
5181 			}
5182 		} else
5183 			seq_printf(seq, "\n       ");
5184 
5185 		if ((bitmap = mddev->bitmap)) {
5186 			unsigned long chunk_kb;
5187 			unsigned long flags;
5188 			spin_lock_irqsave(&bitmap->lock, flags);
5189 			chunk_kb = bitmap->chunksize >> 10;
5190 			seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
5191 				"%lu%s chunk",
5192 				bitmap->pages - bitmap->missing_pages,
5193 				bitmap->pages,
5194 				(bitmap->pages - bitmap->missing_pages)
5195 					<< (PAGE_SHIFT - 10),
5196 				chunk_kb ? chunk_kb : bitmap->chunksize,
5197 				chunk_kb ? "KB" : "B");
5198 			if (bitmap->file) {
5199 				seq_printf(seq, ", file: ");
5200 				seq_path(seq, bitmap->file->f_path.mnt,
5201 					 bitmap->file->f_path.dentry," \t\n");
5202 			}
5203 
5204 			seq_printf(seq, "\n");
5205 			spin_unlock_irqrestore(&bitmap->lock, flags);
5206 		}
5207 
5208 		seq_printf(seq, "\n");
5209 	}
5210 	mddev_unlock(mddev);
5211 
5212 	return 0;
5213 }
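
/*
 * Taken together the seq_printf() calls above produce the familiar
 * /proc/mdstat layout.  An illustrative (hand-written, not captured)
 * example for a two-disk raid1 with an internal bitmap, mid-resync:
 *
 *	Personalities : [raid1]
 *	md0 : active raid1 sdb1[1] sda1[0]
 *	      1048576 blocks [2/2] [UU]
 *	      [====>................]  resync =24.9% (262144/1048576) finish=12.9min speed=1000K/sec
 *	      bitmap: 2/8 pages [8KB], 64KB chunk
 *
 *	unused devices: <none>
 *
 * The "[2/2] [UU]" fragment comes from the personality's ->status()
 * hook, so its exact shape depends on the raid level.
 */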
5214 
5215 static struct seq_operations md_seq_ops = {
5216 	.start  = md_seq_start,
5217 	.next   = md_seq_next,
5218 	.stop   = md_seq_stop,
5219 	.show   = md_seq_show,
5220 };
5221 
5222 static int md_seq_open(struct inode *inode, struct file *file)
5223 {
5224 	int error;
5225 	struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL);
5226 	if (mi == NULL)
5227 		return -ENOMEM;
5228 
5229 	error = seq_open(file, &md_seq_ops);
5230 	if (error)
5231 		kfree(mi);
5232 	else {
5233 		struct seq_file *p = file->private_data;
5234 		p->private = mi;
5235 		mi->event = atomic_read(&md_event_count);
5236 	}
5237 	return error;
5238 }
5239 
5240 static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
5241 {
5242 	struct seq_file *m = filp->private_data;
5243 	struct mdstat_info *mi = m->private;
5244 	int mask;
5245 
5246 	poll_wait(filp, &md_event_waiters, wait);
5247 
5248 	/* always allow read */
5249 	mask = POLLIN | POLLRDNORM;
5250 
5251 	if (mi->event != atomic_read(&md_event_count))
5252 		mask |= POLLERR | POLLPRI;
5253 	return mask;
5254 }
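
/*
 * The POLLERR|POLLPRI bits above are what let a monitoring program
 * sleep on /proc/mdstat and wake only when md_new_event() bumps
 * md_event_count.  A minimal user-space sketch of that pattern:
 *
 *	struct pollfd pfd = { .events = POLLPRI };
 *	char buf[4096];
 *
 *	pfd.fd = open("/proc/mdstat", O_RDONLY);
 *	for (;;) {
 *		read(pfd.fd, buf, sizeof(buf));
 *		lseek(pfd.fd, 0, SEEK_SET);
 *		poll(&pfd, 1, -1);
 *	}
 *
 * Re-reading before each poll() is what refreshes mi->event, so the
 * next event is seen as a fresh POLLPRI.
 */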
5255 
5256 static const struct file_operations md_seq_fops = {
5257 	.owner		= THIS_MODULE,
5258 	.open           = md_seq_open,
5259 	.read           = seq_read,
5260 	.llseek         = seq_lseek,
5261 	.release	= seq_release_private,
5262 	.poll		= mdstat_poll,
5263 };
5264 
5265 int register_md_personality(struct mdk_personality *p)
5266 {
5267 	spin_lock(&pers_lock);
5268 	list_add_tail(&p->list, &pers_list);
5269 	printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
5270 	spin_unlock(&pers_lock);
5271 	return 0;
5272 }
5273 
5274 int unregister_md_personality(struct mdk_personality *p)
5275 {
5276 	printk(KERN_INFO "md: %s personality unregistered\n", p->name);
5277 	spin_lock(&pers_lock);
5278 	list_del_init(&p->list);
5279 	spin_unlock(&pers_lock);
5280 	return 0;
5281 }
5282 
5283 static int is_mddev_idle(mddev_t *mddev)
5284 {
5285 	mdk_rdev_t * rdev;
5286 	struct list_head *tmp;
5287 	int idle;
5288 	long curr_events;
5289 
5290 	idle = 1;
5291 	rdev_for_each(rdev, tmp, mddev) {
5292 		struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
5293 		curr_events = disk_stat_read(disk, sectors[0]) +
5294 				disk_stat_read(disk, sectors[1]) -
5295 				atomic_read(&disk->sync_io);
5296 		/* sync IO will cause sync_io to increase before the disk_stats
5297 		 * as sync_io is counted when a request starts, and
5298 		 * disk_stats is counted when it completes.
5299 		 * So resync activity will cause curr_events to be smaller than
5300 		 * when there was no such activity.
5301 		 * non-sync IO will cause disk_stats to increase without
5302 		 * increasing sync_io so curr_events will (eventually)
5303 		 * be larger than it was before.  Once it becomes
5304 		 * substantially larger, the test below will cause
5305 		 * the array to appear non-idle, and resync will slow
5306 		 * down.
5307 		 * If there is a lot of outstanding resync activity when
5308 		 * we set last_events to curr_events, then all that activity
5309 		 * completing might cause the array to appear non-idle
5310 		 * and resync will be slowed down even though there might
5311 		 * not have been non-resync activity.  This will only
5312 		 * happen once though.  'last_events' will soon reflect
5313 		 * the state where there is little or no outstanding
5314 		 * resync requests, and further resync activity will
5315 		 * always make curr_events less than last_events.
5316 		 *
5317 		 */
5318 		if (curr_events - rdev->last_events > 4096) {
5319 			rdev->last_events = curr_events;
5320 			idle = 0;
5321 		}
5322 	}
5323 	return idle;
5324 }
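
/*
 * In other words curr_events approximates "sectors of non-resync I/O so
 * far".  Example: if, since the previous call, a member disk serviced
 * 50000 sectors of resync reads plus 10000 sectors of application I/O,
 * curr_events advances by roughly 10000 > 4096, the rdev is recorded as
 * busy and md_do_sync() throttles itself back towards speed_min().
 */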
5325 
5326 void md_done_sync(mddev_t *mddev, int blocks, int ok)
5327 {
5328 	/* another "blocks" (512byte) blocks have been synced */
5329 	atomic_sub(blocks, &mddev->recovery_active);
5330 	wake_up(&mddev->recovery_wait);
5331 	if (!ok) {
5332 		set_bit(MD_RECOVERY_ERR, &mddev->recovery);
5333 		md_wakeup_thread(mddev->thread);
5334 		// stop recovery, signal do_sync ....
5335 	}
5336 }
5337 
5338 
5339 /* md_write_start(mddev, bi)
5340  * If we need to update some array metadata (e.g. 'active' flag
5341  * in superblock) before writing, schedule a superblock update
5342  * and wait for it to complete.
5343  */
5344 void md_write_start(mddev_t *mddev, struct bio *bi)
5345 {
5346 	if (bio_data_dir(bi) != WRITE)
5347 		return;
5348 
5349 	BUG_ON(mddev->ro == 1);
5350 	if (mddev->ro == 2) {
5351 		/* need to switch to read/write */
5352 		mddev->ro = 0;
5353 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5354 		md_wakeup_thread(mddev->thread);
5355 	}
5356 	atomic_inc(&mddev->writes_pending);
5357 	if (mddev->in_sync) {
5358 		spin_lock_irq(&mddev->write_lock);
5359 		if (mddev->in_sync) {
5360 			mddev->in_sync = 0;
5361 			set_bit(MD_CHANGE_CLEAN, &mddev->flags);
5362 			md_wakeup_thread(mddev->thread);
5363 		}
5364 		spin_unlock_irq(&mddev->write_lock);
5365 	}
5366 	wait_event(mddev->sb_wait, mddev->flags==0);
5367 }
5368 
5369 void md_write_end(mddev_t *mddev)
5370 {
5371 	if (atomic_dec_and_test(&mddev->writes_pending)) {
5372 		if (mddev->safemode == 2)
5373 			md_wakeup_thread(mddev->thread);
5374 		else if (mddev->safemode_delay)
5375 			mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
5376 	}
5377 }
5378 
5379 /* md_allow_write(mddev)
5380  * Calling this ensures that the array is marked 'active' so that writes
5381  * may proceed without blocking.  It is important to call this before
5382  * attempting a GFP_KERNEL allocation while holding the mddev lock.
5383  * Must be called with mddev_lock held.
5384  */
5385 void md_allow_write(mddev_t *mddev)
5386 {
5387 	if (!mddev->pers)
5388 		return;
5389 	if (mddev->ro)
5390 		return;
5391 
5392 	spin_lock_irq(&mddev->write_lock);
5393 	if (mddev->in_sync) {
5394 		mddev->in_sync = 0;
5395 		set_bit(MD_CHANGE_CLEAN, &mddev->flags);
5396 		if (mddev->safemode_delay &&
5397 		    mddev->safemode == 0)
5398 			mddev->safemode = 1;
5399 		spin_unlock_irq(&mddev->write_lock);
5400 		md_update_sb(mddev, 0);
5401 	} else
5402 		spin_unlock_irq(&mddev->write_lock);
5403 }
5404 EXPORT_SYMBOL_GPL(md_allow_write);
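
/*
 * The expected calling pattern in a personality, as a sketch (not
 * lifted from any particular raid module), with the mddev lock held:
 *
 *	md_allow_write(mddev);
 *	new = kzalloc(sizeof(*new), GFP_KERNEL);
 *
 * Without the md_allow_write() call, the GFP_KERNEL allocation could
 * block on writeback to this same array; those writes would sit in
 * md_write_start() waiting for a superblock update that cannot happen
 * while we hold the mddev lock.
 */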
5405 
5406 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
5407 
5408 #define SYNC_MARKS	10
5409 #define	SYNC_MARK_STEP	(3*HZ)
5410 void md_do_sync(mddev_t *mddev)
5411 {
5412 	mddev_t *mddev2;
5413 	unsigned int currspeed = 0,
5414 		 window;
5415 	sector_t max_sectors,j, io_sectors;
5416 	unsigned long mark[SYNC_MARKS];
5417 	sector_t mark_cnt[SYNC_MARKS];
5418 	int last_mark,m;
5419 	struct list_head *tmp;
5420 	sector_t last_check;
5421 	int skipped = 0;
5422 	struct list_head *rtmp;
5423 	mdk_rdev_t *rdev;
5424 	char *desc;
5425 
5426 	/* just in case the thread restarts... */
5427 	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
5428 		return;
5429 	if (mddev->ro) /* never try to sync a read-only array */
5430 		return;
5431 
5432 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
5433 		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
5434 			desc = "data-check";
5435 		else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
5436 			desc = "requested-resync";
5437 		else
5438 			desc = "resync";
5439 	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
5440 		desc = "reshape";
5441 	else
5442 		desc = "recovery";
5443 
5444 	/* we overload curr_resync somewhat here.
5445 	 * 0 == not engaged in resync at all
5446 	 * 2 == checking that there is no conflict with another sync
5447 	 * 1 == like 2, but have yielded to allow conflicting resync to
5448 	 *		commense
5449 	 *		commence
5450 	 *
5451 	 * Before starting a resync we must have set curr_resync to
5452 	 * 2, and then checked that every "conflicting" array has curr_resync
5453 	 * less than ours.  When we find one that is the same or higher
5454 	 * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
5455 	 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
5456 	 * This will mean we have to start checking from the beginning again.
5457 	 *
5458 	 */
5459 
5460 	do {
5461 		mddev->curr_resync = 2;
5462 
5463 	try_again:
5464 		if (kthread_should_stop()) {
5465 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5466 			goto skip;
5467 		}
5468 		for_each_mddev(mddev2, tmp) {
5469 			if (mddev2 == mddev)
5470 				continue;
5471 			if (mddev2->curr_resync &&
5472 			    match_mddev_units(mddev,mddev2)) {
5473 				DEFINE_WAIT(wq);
5474 				if (mddev < mddev2 && mddev->curr_resync == 2) {
5475 					/* arbitrarily yield */
5476 					mddev->curr_resync = 1;
5477 					wake_up(&resync_wait);
5478 				}
5479 				if (mddev > mddev2 && mddev->curr_resync == 1)
5480 					/* no need to wait here, we can wait the next
5481 					 * time 'round when curr_resync == 2
5482 					 */
5483 					continue;
5484 				prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE);
5485 				if (!kthread_should_stop() &&
5486 				    mddev2->curr_resync >= mddev->curr_resync) {
5487 					printk(KERN_INFO "md: delaying %s of %s"
5488 					       " until %s has finished (they"
5489 					       " share one or more physical units)\n",
5490 					       desc, mdname(mddev), mdname(mddev2));
5491 					mddev_put(mddev2);
5492 					schedule();
5493 					finish_wait(&resync_wait, &wq);
5494 					goto try_again;
5495 				}
5496 				finish_wait(&resync_wait, &wq);
5497 			}
5498 		}
5499 	} while (mddev->curr_resync < 2);
5500 
5501 	j = 0;
5502 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
5503 		/* resync follows the size requested by the personality,
5504 		 * which defaults to physical size, but can be virtual size
5505 		 */
5506 		max_sectors = mddev->resync_max_sectors;
5507 		mddev->resync_mismatches = 0;
5508 		/* we don't use the checkpoint if there's a bitmap */
5509 		if (!mddev->bitmap &&
5510 		    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
5511 			j = mddev->recovery_cp;
5512 	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
5513 		max_sectors = mddev->size << 1;
5514 	else {
5515 		/* recovery follows the physical size of devices */
5516 		max_sectors = mddev->size << 1;
5517 		j = MaxSector;
5518 		rdev_for_each(rdev, rtmp, mddev)
5519 			if (rdev->raid_disk >= 0 &&
5520 			    !test_bit(Faulty, &rdev->flags) &&
5521 			    !test_bit(In_sync, &rdev->flags) &&
5522 			    rdev->recovery_offset < j)
5523 				j = rdev->recovery_offset;
5524 	}
5525 
5526 	printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
5527 	printk(KERN_INFO "md: minimum _guaranteed_  speed:"
5528 		" %d KB/sec/disk.\n", speed_min(mddev));
5529 	printk(KERN_INFO "md: using maximum available idle IO bandwidth "
5530 	       "(but not more than %d KB/sec) for %s.\n",
5531 	       speed_max(mddev), desc);
5532 
5533 	is_mddev_idle(mddev); /* this also initializes IO event counters */
5534 
5535 	io_sectors = 0;
5536 	for (m = 0; m < SYNC_MARKS; m++) {
5537 		mark[m] = jiffies;
5538 		mark_cnt[m] = io_sectors;
5539 	}
5540 	last_mark = 0;
5541 	mddev->resync_mark = mark[last_mark];
5542 	mddev->resync_mark_cnt = mark_cnt[last_mark];
5543 
5544 	/*
5545 	 * Tune reconstruction:
5546 	 */
5547 	window = 32*(PAGE_SIZE/512);
5548 	printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n",
5549 		window/2,(unsigned long long) max_sectors/2);
5550 
5551 	atomic_set(&mddev->recovery_active, 0);
5552 	init_waitqueue_head(&mddev->recovery_wait);
5553 	last_check = 0;
5554 
5555 	if (j>2) {
5556 		printk(KERN_INFO
5557 		       "md: resuming %s of %s from checkpoint.\n",
5558 		       desc, mdname(mddev));
5559 		mddev->curr_resync = j;
5560 	}
5561 
5562 	while (j < max_sectors) {
5563 		sector_t sectors;
5564 
5565 		skipped = 0;
5566 		if (j >= mddev->resync_max) {
5567 			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5568 			wait_event(mddev->recovery_wait,
5569 				   mddev->resync_max > j
5570 				   || kthread_should_stop());
5571 		}
5572 		if (kthread_should_stop())
5573 			goto interrupted;
5574 		sectors = mddev->pers->sync_request(mddev, j, &skipped,
5575 						  currspeed < speed_min(mddev));
5576 		if (sectors == 0) {
5577 			set_bit(MD_RECOVERY_ERR, &mddev->recovery);
5578 			goto out;
5579 		}
5580 
5581 		if (!skipped) { /* actual IO requested */
5582 			io_sectors += sectors;
5583 			atomic_add(sectors, &mddev->recovery_active);
5584 		}
5585 
5586 		j += sectors;
5587 		if (j>1) mddev->curr_resync = j;
5588 		mddev->curr_mark_cnt = io_sectors;
5589 		if (last_check == 0)
5590 			/* this is the earliest that the rebuild will be
5591 			 * visible in /proc/mdstat
5592 			 */
5593 			md_new_event(mddev);
5594 
5595 		if (last_check + window > io_sectors || j == max_sectors)
5596 			continue;
5597 
5598 		last_check = io_sectors;
5599 
5600 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
5601 		    test_bit(MD_RECOVERY_ERR, &mddev->recovery))
5602 			break;
5603 
5604 	repeat:
5605 		if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
5606 			/* step marks */
5607 			int next = (last_mark+1) % SYNC_MARKS;
5608 
5609 			mddev->resync_mark = mark[next];
5610 			mddev->resync_mark_cnt = mark_cnt[next];
5611 			mark[next] = jiffies;
5612 			mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
5613 			last_mark = next;
5614 		}
5615 
5616 
5617 		if (kthread_should_stop())
5618 			goto interrupted;
5619 
5620 
5621 		/*
5622 		 * this loop exits only if we are either slower than
5623 		 * the 'hard' speed limit, or the system was IO-idle for
5624 		 * a jiffy.
5625 		 * the system might be non-idle CPU-wise, but we only care
5626 		 * about not overloading the IO subsystem. (things like an
5627 		 * e2fsck being done on the RAID array should execute fast)
5628 		 */
5629 		blk_unplug(mddev->queue);
5630 		cond_resched();
5631 
5632 		currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
5633 			/((jiffies-mddev->resync_mark)/HZ +1) +1;
5634 
5635 		if (currspeed > speed_min(mddev)) {
5636 			if ((currspeed > speed_max(mddev)) ||
5637 					!is_mddev_idle(mddev)) {
5638 				msleep(500);
5639 				goto repeat;
5640 			}
5641 		}
5642 	}
5643 	printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc);
5644 	/*
5645 	 * this also signals 'finished resyncing' to md_stop
5646 	 */
5647  out:
5648 	blk_unplug(mddev->queue);
5649 
5650 	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
5651 
5652 	/* tell personality that we are finished */
5653 	mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
5654 
5655 	if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
5656 	    !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
5657 	    mddev->curr_resync > 2) {
5658 		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
5659 			if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
5660 				if (mddev->curr_resync >= mddev->recovery_cp) {
5661 					printk(KERN_INFO
5662 					       "md: checkpointing %s of %s.\n",
5663 					       desc, mdname(mddev));
5664 					mddev->recovery_cp = mddev->curr_resync;
5665 				}
5666 			} else
5667 				mddev->recovery_cp = MaxSector;
5668 		} else {
5669 			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
5670 				mddev->curr_resync = MaxSector;
5671 			rdev_for_each(rdev, rtmp, mddev)
5672 				if (rdev->raid_disk >= 0 &&
5673 				    !test_bit(Faulty, &rdev->flags) &&
5674 				    !test_bit(In_sync, &rdev->flags) &&
5675 				    rdev->recovery_offset < mddev->curr_resync)
5676 					rdev->recovery_offset = mddev->curr_resync;
5677 		}
5678 	}
5679 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
5680 
5681  skip:
5682 	mddev->curr_resync = 0;
5683 	mddev->resync_max = MaxSector;
5684 	sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5685 	wake_up(&resync_wait);
5686 	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
5687 	md_wakeup_thread(mddev->thread);
5688 	return;
5689 
5690  interrupted:
5691 	/*
5692 	 * we were asked to stop (kthread_should_stop()), so exit.
5693 	 */
5694 	printk(KERN_INFO
5695 	       "md: md_do_sync() got signal ... exiting\n");
5696 	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5697 	goto out;
5698 
5699 }
5700 EXPORT_SYMBOL_GPL(md_do_sync);
5701 
5702 
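/*
 * remove_and_add_spares(): for metadata we manage ourselves, detach any
 * faulty or out-of-sync device that has no pending I/O (and drop its "rd%d"
 * sysfs link), then, if the array is degraded, offer the remaining unused
 * devices back to the personality with ->hot_add_disk().  Returns the number
 * of spares actually added, so the caller knows whether starting a recovery
 * pass is worthwhile.
 */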
5703 static int remove_and_add_spares(mddev_t *mddev)
5704 {
5705 	mdk_rdev_t *rdev;
5706 	struct list_head *rtmp;
5707 	int spares = 0;
5708 
5709 	rdev_for_each(rdev, rtmp, mddev)
5710 		if (rdev->raid_disk >= 0 &&
5711 		    !mddev->external &&
5712 		    (test_bit(Faulty, &rdev->flags) ||
5713 		     ! test_bit(In_sync, &rdev->flags)) &&
5714 		    atomic_read(&rdev->nr_pending)==0) {
5715 			if (mddev->pers->hot_remove_disk(
5716 				    mddev, rdev->raid_disk)==0) {
5717 				char nm[20];
5718 				sprintf(nm,"rd%d", rdev->raid_disk);
5719 				sysfs_remove_link(&mddev->kobj, nm);
5720 				rdev->raid_disk = -1;
5721 			}
5722 		}
5723 
5724 	if (mddev->degraded) {
5725 		rdev_for_each(rdev, rtmp, mddev)
5726 			if (rdev->raid_disk < 0
5727 			    && !test_bit(Faulty, &rdev->flags)) {
5728 				rdev->recovery_offset = 0;
5729 				if (mddev->pers->hot_add_disk(mddev,rdev)) {
5730 					char nm[20];
5731 					sprintf(nm, "rd%d", rdev->raid_disk);
5732 					if (sysfs_create_link(&mddev->kobj,
5733 							      &rdev->kobj, nm))
5734 						printk(KERN_WARNING
5735 						       "md: cannot register "
5736 						       "%s for %s\n",
5737 						       nm, mdname(mddev));
5738 					spares++;
5739 					md_new_event(mddev);
5740 				} else
5741 					break;
5742 			}
5743 	}
5744 	return spares;
5745 }
5746 /*
5747  * This routine is regularly called by all per-raid-array threads to
5748  * deal with generic issues like resync and super-block update.
5749  * Raid personalities that don't have a thread (linear/raid0) do not
5750  * need this as they never do any recovery or update the superblock.
5751  *
5752  * It does not do any resync itself, but rather "forks" off other threads
5753  * to do that as needed.
5754  * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
5755  * "->recovery" and create a thread at ->sync_thread.
5756  * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR)
5757  * and wakes up this thread, which will reap the sync thread and finish up.
5758  * This thread also removes any faulty devices (with nr_pending == 0).
5759  *
5760  * The overall approach is:
5761  *  1/ if the superblock needs updating, update it.
5762  *  2/ If a recovery thread is running, don't do anything else.
5763  *  3/ If recovery has finished, clean up, possibly marking spares active.
5764  *  4/ If there are any faulty devices, remove them.
5765  *  5/ If the array is degraded, try to add spare devices
5766  *  6/ If the array has spares or is not in-sync, start a resync thread.
5767  */
5768 void md_check_recovery(mddev_t *mddev)
5769 {
5770 	mdk_rdev_t *rdev;
5771 	struct list_head *rtmp;
5772 
5773 
5774 	if (mddev->bitmap)
5775 		bitmap_daemon_work(mddev->bitmap);
5776 
5777 	if (mddev->ro)
5778 		return;
5779 
5780 	if (signal_pending(current)) {
5781 		if (mddev->pers->sync_request) {
5782 			printk(KERN_INFO "md: %s in immediate safe mode\n",
5783 			       mdname(mddev));
5784 			mddev->safemode = 2;
5785 		}
5786 		flush_signals(current);
5787 	}
5788 
5789 	if (!(
5790 		(mddev->flags && !mddev->external) ||
5791 		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
5792 		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
5793 		(mddev->safemode == 1) ||
5794 		(mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
5795 		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
5796 		))
5797 		return;
5798 
5799 	if (mddev_trylock(mddev)) {
5800 		int spares = 0;
5801 
5802 		spin_lock_irq(&mddev->write_lock);
5803 		if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
5804 		    !mddev->in_sync && mddev->recovery_cp == MaxSector) {
5805 			mddev->in_sync = 1;
5806 			if (mddev->persistent)
5807 				set_bit(MD_CHANGE_CLEAN, &mddev->flags);
5808 		}
5809 		if (mddev->safemode == 1)
5810 			mddev->safemode = 0;
5811 		spin_unlock_irq(&mddev->write_lock);
5812 
5813 		if (mddev->flags)
5814 			md_update_sb(mddev, 0);
5815 
5816 
5817 		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
5818 		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
5819 			/* resync/recovery still happening */
5820 			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5821 			goto unlock;
5822 		}
5823 		if (mddev->sync_thread) {
5824 			/* resync has finished, collect result */
5825 			md_unregister_thread(mddev->sync_thread);
5826 			mddev->sync_thread = NULL;
5827 			if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
5828 			    !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
5829 				/* success...*/
5830 				/* activate any spares */
5831 				mddev->pers->spare_active(mddev);
5832 			}
5833 			md_update_sb(mddev, 1);
5834 
5835 			/* if the array is no longer degraded, then any saved_raid_disk
5836 			 * information must be scrapped
5837 			 */
5838 			if (!mddev->degraded)
5839 				rdev_for_each(rdev, rtmp, mddev)
5840 					rdev->saved_raid_disk = -1;
5841 
5842 			mddev->recovery = 0;
5843 			/* flag recovery needed just to double check */
5844 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5845 			md_new_event(mddev);
5846 			goto unlock;
5847 		}
5848 		/* Clear some bits that don't mean anything, but
5849 		 * might be left set
5850 		 */
5851 		clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5852 		clear_bit(MD_RECOVERY_ERR, &mddev->recovery);
5853 		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
5854 		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
5855 
5856 		if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
5857 			goto unlock;
5858 		/* no recovery is running.
5859 		 * remove any failed drives, then
5860 		 * add spares if possible.
5861 		 * Spares are also removed and re-added, to allow
5862 		 * the personality to fail the re-add.
5863 		 */
5864 
5865 		if (mddev->reshape_position != MaxSector) {
5866 			if (mddev->pers->check_reshape(mddev) != 0)
5867 				/* Cannot proceed */
5868 				goto unlock;
5869 			set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
5870 		} else if ((spares = remove_and_add_spares(mddev))) {
5871 			clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5872 			clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
5873 		} else if (mddev->recovery_cp < MaxSector) {
5874 			set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5875 		} else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
5876 			/* nothing to be done ... */
5877 			goto unlock;
5878 
5879 		if (mddev->pers->sync_request) {
5880 			set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
5881 			if (spares && mddev->bitmap && ! mddev->bitmap->file) {
5882 				/* We are adding a device or devices to an array
5883 				 * which has the bitmap stored on all devices.
5884 				 * So make sure all bitmap pages get written
5885 				 */
5886 				bitmap_write_all(mddev->bitmap);
5887 			}
5888 			mddev->sync_thread = md_register_thread(md_do_sync,
5889 								mddev,
5890 								"%s_resync");
5891 			if (!mddev->sync_thread) {
5892 				printk(KERN_ERR "%s: could not start resync"
5893 					" thread...\n",
5894 					mdname(mddev));
5895 				/* leave the spares where they are, it shouldn't hurt */
5896 				mddev->recovery = 0;
5897 			} else
5898 				md_wakeup_thread(mddev->sync_thread);
5899 			md_new_event(mddev);
5900 		}
5901 	unlock:
5902 		mddev_unlock(mddev);
5903 	}
5904 }
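/*
 * For illustration only: other parts of the driver (error handlers, hot-add
 * paths, etc.) request the work described above roughly like this, by
 * flagging the array and waking its per-array thread, which then calls back
 * into md_check_recovery():
 *
 *	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 *	md_wakeup_thread(mddev->thread);
 */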
5905 
5906 static int md_notify_reboot(struct notifier_block *this,
5907 			    unsigned long code, void *x)
5908 {
5909 	struct list_head *tmp;
5910 	mddev_t *mddev;
5911 
5912 	if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
5913 
5914 		printk(KERN_INFO "md: stopping all md devices.\n");
5915 
5916 		for_each_mddev(mddev, tmp)
5917 			if (mddev_trylock(mddev)) {
5918 				do_md_stop(mddev, 1);
5919 				mddev_unlock(mddev);
5920 			}
5921 		/*
5922 		 * certain more exotic SCSI devices are known to lose data
5923 		 * if the system reboots too soon after I/O. While the
5924 		 * right place to handle this is the individual driver, we
5925 		 * do want the RAID driver itself to be on the safe side ...
5926 		 */
5927 		mdelay(1000*1);
5928 	}
5929 	return NOTIFY_DONE;
5930 }
5931 
5932 static struct notifier_block md_notifier = {
5933 	.notifier_call	= md_notify_reboot,
5934 	.next		= NULL,
5935 	.priority	= INT_MAX, /* before any real devices */
5936 };
5937 
5938 static void md_geninit(void)
5939 {
5940 	struct proc_dir_entry *p;
5941 
5942 	dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
5943 
5944 	p = create_proc_entry("mdstat", S_IRUGO, NULL);
5945 	if (p)
5946 		p->proc_fops = &md_seq_fops;
5947 }
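/*
 * The "mdstat" entry registered above is the usual quick status view; for
 * example, "cat /proc/mdstat" prints (roughly) one stanza per array such as
 *
 *	md0 : active raid1 sdb1[1] sda1[0]
 *	      1048512 blocks [2/2] [UU]
 *
 * (illustrative output only; the exact fields come from md_seq_show()).
 */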
5948 
5949 static int __init md_init(void)
5950 {
5951 	if (register_blkdev(MAJOR_NR, "md"))
5952 		return -1;
5953 	if ((mdp_major = register_blkdev(0, "mdp")) <= 0) {
5954 		unregister_blkdev(MAJOR_NR, "md");
5955 		return -1;
5956 	}
5957 	blk_register_region(MKDEV(MAJOR_NR, 0), 1UL<<MINORBITS, THIS_MODULE,
5958 			    md_probe, NULL, NULL);
5959 	blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
5960 			    md_probe, NULL, NULL);
5961 
5962 	register_reboot_notifier(&md_notifier);
5963 	raid_table_header = register_sysctl_table(raid_root_table);
5964 
5965 	md_geninit();
5966 	return 0;
5967 }
5968 
5969 
5970 #ifndef MODULE
5971 
5972 /*
5973  * Searches all registered partitions for autorun RAID arrays
5974  * at boot time.
5975  */
5976 
5977 static LIST_HEAD(all_detected_devices);
5978 struct detected_devices_node {
5979 	struct list_head list;
5980 	dev_t dev;
5981 };
5982 
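/*
 * md_autodetect_dev() is typically called from the boot-time partition
 * scanning code for partitions of type 0xfd (Linux RAID autodetect); it only
 * queues the device here, and autostart_arrays() drains the queue later.
 */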
5983 void md_autodetect_dev(dev_t dev)
5984 {
5985 	struct detected_devices_node *node_detected_dev;
5986 
5987 	node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
5988 	if (node_detected_dev) {
5989 		node_detected_dev->dev = dev;
5990 		list_add_tail(&node_detected_dev->list, &all_detected_devices);
5991 	} else {
5992 		printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed"
5993 			", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev));
5994 	}
5995 }
5996 
5997 
5998 static void autostart_arrays(int part)
5999 {
6000 	mdk_rdev_t *rdev;
6001 	struct detected_devices_node *node_detected_dev;
6002 	dev_t dev;
6003 	int i_scanned, i_passed;
6004 
6005 	i_scanned = 0;
6006 	i_passed = 0;
6007 
6008 	printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
6009 
6010 	while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
6011 		i_scanned++;
6012 		node_detected_dev = list_entry(all_detected_devices.next,
6013 					struct detected_devices_node, list);
6014 		list_del(&node_detected_dev->list);
6015 		dev = node_detected_dev->dev;
6016 		kfree(node_detected_dev);
6017 		rdev = md_import_device(dev, 0, 90);
6018 		if (IS_ERR(rdev))
6019 			continue;
6020 
6021 		if (test_bit(Faulty, &rdev->flags)) {
6022 			MD_BUG();
6023 			continue;
6024 		}
6025 		list_add(&rdev->same_set, &pending_raid_disks);
6026 		i_passed++;
6027 	}
6028 
6029 	printk(KERN_INFO "md: Scanned %d and added %d devices.\n",
6030 						i_scanned, i_passed);
6031 
6032 	autorun_devices(part);
6033 }
6034 
6035 #endif /* !MODULE */
6036 
6037 static __exit void md_exit(void)
6038 {
6039 	mddev_t *mddev;
6040 	struct list_head *tmp;
6041 
6042 	blk_unregister_region(MKDEV(MAJOR_NR,0), 1U << MINORBITS);
6043 	blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
6044 
6045 	unregister_blkdev(MAJOR_NR,"md");
6046 	unregister_blkdev(mdp_major, "mdp");
6047 	unregister_reboot_notifier(&md_notifier);
6048 	unregister_sysctl_table(raid_table_header);
6049 	remove_proc_entry("mdstat", NULL);
6050 	for_each_mddev(mddev, tmp) {
6051 		struct gendisk *disk = mddev->gendisk;
6052 		if (!disk)
6053 			continue;
6054 		export_array(mddev);
6055 		del_gendisk(disk);
6056 		put_disk(disk);
6057 		mddev->gendisk = NULL;
6058 		mddev_put(mddev);
6059 	}
6060 }
6061 
6062 subsys_initcall(md_init);
6063 module_exit(md_exit)
6064 
6065 static int get_ro(char *buffer, struct kernel_param *kp)
6066 {
6067 	return sprintf(buffer, "%d", start_readonly);
6068 }
6069 static int set_ro(const char *val, struct kernel_param *kp)
6070 {
6071 	char *e;
6072 	int num = simple_strtoul(val, &e, 10);
6073 	if (*val && (*e == '\0' || *e == '\n')) {
6074 		start_readonly = num;
6075 		return 0;
6076 	}
6077 	return -EINVAL;
6078 }
6079 
6080 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
6081 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
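/*
 * Both parameters can be set in the usual ways, e.g. (assuming the driver is
 * built as the md-mod module) "md_mod.start_ro=1" on the kernel command
 * line, or at runtime through /sys/module/md_mod/parameters/.
 */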
6082 
6083 
6084 EXPORT_SYMBOL(register_md_personality);
6085 EXPORT_SYMBOL(unregister_md_personality);
6086 EXPORT_SYMBOL(md_error);
6087 EXPORT_SYMBOL(md_done_sync);
6088 EXPORT_SYMBOL(md_write_start);
6089 EXPORT_SYMBOL(md_write_end);
6090 EXPORT_SYMBOL(md_register_thread);
6091 EXPORT_SYMBOL(md_unregister_thread);
6092 EXPORT_SYMBOL(md_wakeup_thread);
6093 EXPORT_SYMBOL(md_check_recovery);
6094 MODULE_LICENSE("GPL");
6095 MODULE_ALIAS("md");
6096 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
6097