xref: /linux/drivers/md/md.c (revision 5e8d780d745c1619aba81fe7166c5a4b5cad2b84)
1 /*
2    md.c : Multiple Devices driver for Linux
3 	  Copyright (C) 1998, 1999, 2000 Ingo Molnar
4 
5      completely rewritten, based on the MD driver code from Marc Zyngier
6 
7    Changes:
8 
9    - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
10    - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
11    - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
12    - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
13    - kmod support by: Cyrus Durgin
14    - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
15    - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
16 
17    - lots of fixes and improvements to the RAID1/RAID5 and generic
18      RAID code (such as request based resynchronization):
19 
20      Neil Brown <neilb@cse.unsw.edu.au>.
21 
22    - persistent bitmap code
23      Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
24 
25    This program is free software; you can redistribute it and/or modify
26    it under the terms of the GNU General Public License as published by
27    the Free Software Foundation; either version 2, or (at your option)
28    any later version.
29 
30    You should have received a copy of the GNU General Public License
31    (for example /usr/src/linux/COPYING); if not, write to the Free
32    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
33 */
34 
35 #include <linux/module.h>
36 #include <linux/config.h>
37 #include <linux/kthread.h>
38 #include <linux/linkage.h>
39 #include <linux/raid/md.h>
40 #include <linux/raid/bitmap.h>
41 #include <linux/sysctl.h>
42 #include <linux/buffer_head.h> /* for invalidate_bdev */
43 #include <linux/suspend.h>
44 #include <linux/poll.h>
45 #include <linux/mutex.h>
46 #include <linux/ctype.h>
47 
48 #include <linux/init.h>
49 
50 #include <linux/file.h>
51 
52 #ifdef CONFIG_KMOD
53 #include <linux/kmod.h>
54 #endif
55 
56 #include <asm/unaligned.h>
57 
58 #define MAJOR_NR MD_MAJOR
59 #define MD_DRIVER
60 
61 /* 63 partitions with the alternate major number (mdp) */
62 #define MdpMinorShift 6
63 
64 #define DEBUG 0
65 #define dprintk(x...) ((void)(DEBUG && printk(x)))
66 
67 
68 #ifndef MODULE
69 static void autostart_arrays (int part);
70 #endif
71 
72 static LIST_HEAD(pers_list);
73 static DEFINE_SPINLOCK(pers_lock);
74 
75 static void md_print_devices(void);
76 
77 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
78 
79 /*
80  * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
81  * is 1000 KB/sec, so the extra system load does not show up that much.
82  * Increase it if you want to have more _guaranteed_ speed. Note that
83  * the RAID driver will use the maximum available bandwidth if the IO
84  * subsystem is idle. There is also an 'absolute maximum' reconstruction
85  * speed limit - in case reconstruction slows down your system despite
86  * idle IO detection.
87  *
88  * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
89  * or /sys/block/mdX/md/sync_speed_{min,max}
90  */
91 
92 static int sysctl_speed_limit_min = 1000;
93 static int sysctl_speed_limit_max = 200000;
94 static inline int speed_min(mddev_t *mddev)
95 {
96 	return mddev->sync_speed_min ?
97 		mddev->sync_speed_min : sysctl_speed_limit_min;
98 }
99 
100 static inline int speed_max(mddev_t *mddev)
101 {
102 	return mddev->sync_speed_max ?
103 		mddev->sync_speed_max : sysctl_speed_limit_max;
104 }
105 
106 static struct ctl_table_header *raid_table_header;
107 
108 static ctl_table raid_table[] = {
109 	{
110 		.ctl_name	= DEV_RAID_SPEED_LIMIT_MIN,
111 		.procname	= "speed_limit_min",
112 		.data		= &sysctl_speed_limit_min,
113 		.maxlen		= sizeof(int),
114 		.mode		= 0644,
115 		.proc_handler	= &proc_dointvec,
116 	},
117 	{
118 		.ctl_name	= DEV_RAID_SPEED_LIMIT_MAX,
119 		.procname	= "speed_limit_max",
120 		.data		= &sysctl_speed_limit_max,
121 		.maxlen		= sizeof(int),
122 		.mode		= 0644,
123 		.proc_handler	= &proc_dointvec,
124 	},
125 	{ .ctl_name = 0 }
126 };
127 
128 static ctl_table raid_dir_table[] = {
129 	{
130 		.ctl_name	= DEV_RAID,
131 		.procname	= "raid",
132 		.maxlen		= 0,
133 		.mode		= 0555,
134 		.child		= raid_table,
135 	},
136 	{ .ctl_name = 0 }
137 };
138 
139 static ctl_table raid_root_table[] = {
140 	{
141 		.ctl_name	= CTL_DEV,
142 		.procname	= "dev",
143 		.maxlen		= 0,
144 		.mode		= 0555,
145 		.child		= raid_dir_table,
146 	},
147 	{ .ctl_name = 0 }
148 };
149 
150 static struct block_device_operations md_fops;
151 
152 static int start_readonly;
153 
154 /*
155  * We have a system wide 'event count' that is incremented
156  * on any 'interesting' event, and readers of /proc/mdstat
157  * can use 'poll' or 'select' to find out when the event
158  * count increases.
159  *
160  * Events are:
161  *  start array, stop array, error, add device, remove device,
162  *  start build, activate spare
163  */
164 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
165 static atomic_t md_event_count;
166 void md_new_event(mddev_t *mddev)
167 {
168 	atomic_inc(&md_event_count);
169 	wake_up(&md_event_waiters);
170 	sysfs_notify(&mddev->kobj, NULL, "sync_action");
171 }
172 EXPORT_SYMBOL_GPL(md_new_event);
173 
174 /* Alternate version that can be called from interrupts
175  * when calling sysfs_notify isn't needed.
176  */
177 static void md_new_event_inintr(mddev_t *mddev)
178 {
179 	atomic_inc(&md_event_count);
180 	wake_up(&md_event_waiters);
181 }
182 
183 /*
184  * Enables to iterate over all existing md arrays
185  * all_mddevs_lock protects this list.
186  */
187 static LIST_HEAD(all_mddevs);
188 static DEFINE_SPINLOCK(all_mddevs_lock);
189 
190 
191 /*
192  * iterates through all used mddevs in the system.
193  * We take care to grab the all_mddevs_lock whenever navigating
194  * the list, and to always hold a refcount when unlocked.
195  * Any code which breaks out of this loop while own
196  * a reference to the current mddev and must mddev_put it.
197  */
198 #define ITERATE_MDDEV(mddev,tmp)					\
199 									\
200 	for (({ spin_lock(&all_mddevs_lock); 				\
201 		tmp = all_mddevs.next;					\
202 		mddev = NULL;});					\
203 	     ({ if (tmp != &all_mddevs)					\
204 			mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
205 		spin_unlock(&all_mddevs_lock);				\
206 		if (mddev) mddev_put(mddev);				\
207 		mddev = list_entry(tmp, mddev_t, all_mddevs);		\
208 		tmp != &all_mddevs;});					\
209 	     ({ spin_lock(&all_mddevs_lock);				\
210 		tmp = tmp->next;})					\
211 		)
212 
213 
214 static int md_fail_request (request_queue_t *q, struct bio *bio)
215 {
216 	bio_io_error(bio, bio->bi_size);
217 	return 0;
218 }
219 
220 static inline mddev_t *mddev_get(mddev_t *mddev)
221 {
222 	atomic_inc(&mddev->active);
223 	return mddev;
224 }
225 
226 static void mddev_put(mddev_t *mddev)
227 {
228 	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
229 		return;
230 	if (!mddev->raid_disks && list_empty(&mddev->disks)) {
231 		list_del(&mddev->all_mddevs);
232 		spin_unlock(&all_mddevs_lock);
233 		blk_cleanup_queue(mddev->queue);
234 		kobject_unregister(&mddev->kobj);
235 	} else
236 		spin_unlock(&all_mddevs_lock);
237 }
238 
239 static mddev_t * mddev_find(dev_t unit)
240 {
241 	mddev_t *mddev, *new = NULL;
242 
243  retry:
244 	spin_lock(&all_mddevs_lock);
245 	list_for_each_entry(mddev, &all_mddevs, all_mddevs)
246 		if (mddev->unit == unit) {
247 			mddev_get(mddev);
248 			spin_unlock(&all_mddevs_lock);
249 			kfree(new);
250 			return mddev;
251 		}
252 
253 	if (new) {
254 		list_add(&new->all_mddevs, &all_mddevs);
255 		spin_unlock(&all_mddevs_lock);
256 		return new;
257 	}
258 	spin_unlock(&all_mddevs_lock);
259 
260 	new = kzalloc(sizeof(*new), GFP_KERNEL);
261 	if (!new)
262 		return NULL;
263 
264 	new->unit = unit;
265 	if (MAJOR(unit) == MD_MAJOR)
266 		new->md_minor = MINOR(unit);
267 	else
268 		new->md_minor = MINOR(unit) >> MdpMinorShift;
269 
270 	mutex_init(&new->reconfig_mutex);
271 	INIT_LIST_HEAD(&new->disks);
272 	INIT_LIST_HEAD(&new->all_mddevs);
273 	init_timer(&new->safemode_timer);
274 	atomic_set(&new->active, 1);
275 	spin_lock_init(&new->write_lock);
276 	init_waitqueue_head(&new->sb_wait);
277 
278 	new->queue = blk_alloc_queue(GFP_KERNEL);
279 	if (!new->queue) {
280 		kfree(new);
281 		return NULL;
282 	}
283 	set_bit(QUEUE_FLAG_CLUSTER, &new->queue->queue_flags);
284 
285 	blk_queue_make_request(new->queue, md_fail_request);
286 
287 	goto retry;
288 }
289 
290 static inline int mddev_lock(mddev_t * mddev)
291 {
292 	return mutex_lock_interruptible(&mddev->reconfig_mutex);
293 }
294 
295 static inline int mddev_trylock(mddev_t * mddev)
296 {
297 	return mutex_trylock(&mddev->reconfig_mutex);
298 }
299 
300 static inline void mddev_unlock(mddev_t * mddev)
301 {
302 	mutex_unlock(&mddev->reconfig_mutex);
303 
304 	md_wakeup_thread(mddev->thread);
305 }
306 
307 static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
308 {
309 	mdk_rdev_t * rdev;
310 	struct list_head *tmp;
311 
312 	ITERATE_RDEV(mddev,rdev,tmp) {
313 		if (rdev->desc_nr == nr)
314 			return rdev;
315 	}
316 	return NULL;
317 }
318 
319 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
320 {
321 	struct list_head *tmp;
322 	mdk_rdev_t *rdev;
323 
324 	ITERATE_RDEV(mddev,rdev,tmp) {
325 		if (rdev->bdev->bd_dev == dev)
326 			return rdev;
327 	}
328 	return NULL;
329 }
330 
331 static struct mdk_personality *find_pers(int level, char *clevel)
332 {
333 	struct mdk_personality *pers;
334 	list_for_each_entry(pers, &pers_list, list) {
335 		if (level != LEVEL_NONE && pers->level == level)
336 			return pers;
337 		if (strcmp(pers->name, clevel)==0)
338 			return pers;
339 	}
340 	return NULL;
341 }
342 
343 static inline sector_t calc_dev_sboffset(struct block_device *bdev)
344 {
345 	sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
346 	return MD_NEW_SIZE_BLOCKS(size);
347 }
348 
349 static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
350 {
351 	sector_t size;
352 
353 	size = rdev->sb_offset;
354 
355 	if (chunk_size)
356 		size &= ~((sector_t)chunk_size/1024 - 1);
357 	return size;
358 }
359 
360 static int alloc_disk_sb(mdk_rdev_t * rdev)
361 {
362 	if (rdev->sb_page)
363 		MD_BUG();
364 
365 	rdev->sb_page = alloc_page(GFP_KERNEL);
366 	if (!rdev->sb_page) {
367 		printk(KERN_ALERT "md: out of memory.\n");
368 		return -EINVAL;
369 	}
370 
371 	return 0;
372 }
373 
374 static void free_disk_sb(mdk_rdev_t * rdev)
375 {
376 	if (rdev->sb_page) {
377 		put_page(rdev->sb_page);
378 		rdev->sb_loaded = 0;
379 		rdev->sb_page = NULL;
380 		rdev->sb_offset = 0;
381 		rdev->size = 0;
382 	}
383 }
384 
385 
386 static int super_written(struct bio *bio, unsigned int bytes_done, int error)
387 {
388 	mdk_rdev_t *rdev = bio->bi_private;
389 	mddev_t *mddev = rdev->mddev;
390 	if (bio->bi_size)
391 		return 1;
392 
393 	if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags))
394 		md_error(mddev, rdev);
395 
396 	if (atomic_dec_and_test(&mddev->pending_writes))
397 		wake_up(&mddev->sb_wait);
398 	bio_put(bio);
399 	return 0;
400 }
401 
402 static int super_written_barrier(struct bio *bio, unsigned int bytes_done, int error)
403 {
404 	struct bio *bio2 = bio->bi_private;
405 	mdk_rdev_t *rdev = bio2->bi_private;
406 	mddev_t *mddev = rdev->mddev;
407 	if (bio->bi_size)
408 		return 1;
409 
410 	if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
411 	    error == -EOPNOTSUPP) {
412 		unsigned long flags;
413 		/* barriers don't appear to be supported :-( */
414 		set_bit(BarriersNotsupp, &rdev->flags);
415 		mddev->barriers_work = 0;
416 		spin_lock_irqsave(&mddev->write_lock, flags);
417 		bio2->bi_next = mddev->biolist;
418 		mddev->biolist = bio2;
419 		spin_unlock_irqrestore(&mddev->write_lock, flags);
420 		wake_up(&mddev->sb_wait);
421 		bio_put(bio);
422 		return 0;
423 	}
424 	bio_put(bio2);
425 	bio->bi_private = rdev;
426 	return super_written(bio, bytes_done, error);
427 }
428 
429 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
430 		   sector_t sector, int size, struct page *page)
431 {
432 	/* write first size bytes of page to sector of rdev
433 	 * Increment mddev->pending_writes before returning
434 	 * and decrement it on completion, waking up sb_wait
435 	 * if zero is reached.
436 	 * If an error occurred, call md_error
437 	 *
438 	 * As we might need to resubmit the request if BIO_RW_BARRIER
439 	 * causes ENOTSUPP, we allocate a spare bio...
440 	 */
441 	struct bio *bio = bio_alloc(GFP_NOIO, 1);
442 	int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC);
443 
444 	bio->bi_bdev = rdev->bdev;
445 	bio->bi_sector = sector;
446 	bio_add_page(bio, page, size, 0);
447 	bio->bi_private = rdev;
448 	bio->bi_end_io = super_written;
449 	bio->bi_rw = rw;
450 
451 	atomic_inc(&mddev->pending_writes);
452 	if (!test_bit(BarriersNotsupp, &rdev->flags)) {
453 		struct bio *rbio;
454 		rw |= (1<<BIO_RW_BARRIER);
455 		rbio = bio_clone(bio, GFP_NOIO);
456 		rbio->bi_private = bio;
457 		rbio->bi_end_io = super_written_barrier;
458 		submit_bio(rw, rbio);
459 	} else
460 		submit_bio(rw, bio);
461 }
462 
463 void md_super_wait(mddev_t *mddev)
464 {
465 	/* wait for all superblock writes that were scheduled to complete.
466 	 * if any had to be retried (due to BARRIER problems), retry them
467 	 */
468 	DEFINE_WAIT(wq);
469 	for(;;) {
470 		prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
471 		if (atomic_read(&mddev->pending_writes)==0)
472 			break;
473 		while (mddev->biolist) {
474 			struct bio *bio;
475 			spin_lock_irq(&mddev->write_lock);
476 			bio = mddev->biolist;
477 			mddev->biolist = bio->bi_next ;
478 			bio->bi_next = NULL;
479 			spin_unlock_irq(&mddev->write_lock);
480 			submit_bio(bio->bi_rw, bio);
481 		}
482 		schedule();
483 	}
484 	finish_wait(&mddev->sb_wait, &wq);
485 }
486 
487 static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
488 {
489 	if (bio->bi_size)
490 		return 1;
491 
492 	complete((struct completion*)bio->bi_private);
493 	return 0;
494 }
495 
496 int sync_page_io(struct block_device *bdev, sector_t sector, int size,
497 		   struct page *page, int rw)
498 {
499 	struct bio *bio = bio_alloc(GFP_NOIO, 1);
500 	struct completion event;
501 	int ret;
502 
503 	rw |= (1 << BIO_RW_SYNC);
504 
505 	bio->bi_bdev = bdev;
506 	bio->bi_sector = sector;
507 	bio_add_page(bio, page, size, 0);
508 	init_completion(&event);
509 	bio->bi_private = &event;
510 	bio->bi_end_io = bi_complete;
511 	submit_bio(rw, bio);
512 	wait_for_completion(&event);
513 
514 	ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
515 	bio_put(bio);
516 	return ret;
517 }
518 EXPORT_SYMBOL_GPL(sync_page_io);
519 
520 static int read_disk_sb(mdk_rdev_t * rdev, int size)
521 {
522 	char b[BDEVNAME_SIZE];
523 	if (!rdev->sb_page) {
524 		MD_BUG();
525 		return -EINVAL;
526 	}
527 	if (rdev->sb_loaded)
528 		return 0;
529 
530 
531 	if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ))
532 		goto fail;
533 	rdev->sb_loaded = 1;
534 	return 0;
535 
536 fail:
537 	printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
538 		bdevname(rdev->bdev,b));
539 	return -EINVAL;
540 }
541 
542 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
543 {
544 	if (	(sb1->set_uuid0 == sb2->set_uuid0) &&
545 		(sb1->set_uuid1 == sb2->set_uuid1) &&
546 		(sb1->set_uuid2 == sb2->set_uuid2) &&
547 		(sb1->set_uuid3 == sb2->set_uuid3))
548 
549 		return 1;
550 
551 	return 0;
552 }
553 
554 
555 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
556 {
557 	int ret;
558 	mdp_super_t *tmp1, *tmp2;
559 
560 	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
561 	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
562 
563 	if (!tmp1 || !tmp2) {
564 		ret = 0;
565 		printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
566 		goto abort;
567 	}
568 
569 	*tmp1 = *sb1;
570 	*tmp2 = *sb2;
571 
572 	/*
573 	 * nr_disks is not constant
574 	 */
575 	tmp1->nr_disks = 0;
576 	tmp2->nr_disks = 0;
577 
578 	if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
579 		ret = 0;
580 	else
581 		ret = 1;
582 
583 abort:
584 	kfree(tmp1);
585 	kfree(tmp2);
586 	return ret;
587 }
588 
589 static unsigned int calc_sb_csum(mdp_super_t * sb)
590 {
591 	unsigned int disk_csum, csum;
592 
593 	disk_csum = sb->sb_csum;
594 	sb->sb_csum = 0;
595 	csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
596 	sb->sb_csum = disk_csum;
597 	return csum;
598 }
599 
600 
601 /*
602  * Handle superblock details.
603  * We want to be able to handle multiple superblock formats
604  * so we have a common interface to them all, and an array of
605  * different handlers.
606  * We rely on user-space to write the initial superblock, and support
607  * reading and updating of superblocks.
608  * Interface methods are:
609  *   int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
610  *      loads and validates a superblock on dev.
611  *      if refdev != NULL, compare superblocks on both devices
612  *    Return:
613  *      0 - dev has a superblock that is compatible with refdev
614  *      1 - dev has a superblock that is compatible and newer than refdev
615  *          so dev should be used as the refdev in future
616  *     -EINVAL superblock incompatible or invalid
617  *     -othererror e.g. -EIO
618  *
619  *   int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
620  *      Verify that dev is acceptable into mddev.
621  *       The first time, mddev->raid_disks will be 0, and data from
622  *       dev should be merged in.  Subsequent calls check that dev
623  *       is new enough.  Return 0 or -EINVAL
624  *
625  *   void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
626  *     Update the superblock for rdev with data in mddev
627  *     This does not write to disc.
628  *
629  */
630 
631 struct super_type  {
632 	char 		*name;
633 	struct module	*owner;
634 	int		(*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
635 	int		(*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
636 	void		(*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
637 };
638 
639 /*
640  * load_super for 0.90.0
641  */
642 static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
643 {
644 	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
645 	mdp_super_t *sb;
646 	int ret;
647 	sector_t sb_offset;
648 
649 	/*
650 	 * Calculate the position of the superblock,
651 	 * it's at the end of the disk.
652 	 *
653 	 * It also happens to be a multiple of 4Kb.
654 	 */
655 	sb_offset = calc_dev_sboffset(rdev->bdev);
656 	rdev->sb_offset = sb_offset;
657 
658 	ret = read_disk_sb(rdev, MD_SB_BYTES);
659 	if (ret) return ret;
660 
661 	ret = -EINVAL;
662 
663 	bdevname(rdev->bdev, b);
664 	sb = (mdp_super_t*)page_address(rdev->sb_page);
665 
666 	if (sb->md_magic != MD_SB_MAGIC) {
667 		printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
668 		       b);
669 		goto abort;
670 	}
671 
672 	if (sb->major_version != 0 ||
673 	    sb->minor_version < 90 ||
674 	    sb->minor_version > 91) {
675 		printk(KERN_WARNING "Bad version number %d.%d on %s\n",
676 			sb->major_version, sb->minor_version,
677 			b);
678 		goto abort;
679 	}
680 
681 	if (sb->raid_disks <= 0)
682 		goto abort;
683 
684 	if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) {
685 		printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
686 			b);
687 		goto abort;
688 	}
689 
690 	rdev->preferred_minor = sb->md_minor;
691 	rdev->data_offset = 0;
692 	rdev->sb_size = MD_SB_BYTES;
693 
694 	if (sb->level == LEVEL_MULTIPATH)
695 		rdev->desc_nr = -1;
696 	else
697 		rdev->desc_nr = sb->this_disk.number;
698 
699 	if (refdev == 0)
700 		ret = 1;
701 	else {
702 		__u64 ev1, ev2;
703 		mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
704 		if (!uuid_equal(refsb, sb)) {
705 			printk(KERN_WARNING "md: %s has different UUID to %s\n",
706 				b, bdevname(refdev->bdev,b2));
707 			goto abort;
708 		}
709 		if (!sb_equal(refsb, sb)) {
710 			printk(KERN_WARNING "md: %s has same UUID"
711 			       " but different superblock to %s\n",
712 			       b, bdevname(refdev->bdev, b2));
713 			goto abort;
714 		}
715 		ev1 = md_event(sb);
716 		ev2 = md_event(refsb);
717 		if (ev1 > ev2)
718 			ret = 1;
719 		else
720 			ret = 0;
721 	}
722 	rdev->size = calc_dev_size(rdev, sb->chunk_size);
723 
724 	if (rdev->size < sb->size && sb->level > 1)
725 		/* "this cannot possibly happen" ... */
726 		ret = -EINVAL;
727 
728  abort:
729 	return ret;
730 }
731 
732 /*
733  * validate_super for 0.90.0
734  */
735 static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
736 {
737 	mdp_disk_t *desc;
738 	mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
739 	__u64 ev1 = md_event(sb);
740 
741 	rdev->raid_disk = -1;
742 	rdev->flags = 0;
743 	if (mddev->raid_disks == 0) {
744 		mddev->major_version = 0;
745 		mddev->minor_version = sb->minor_version;
746 		mddev->patch_version = sb->patch_version;
747 		mddev->persistent = ! sb->not_persistent;
748 		mddev->chunk_size = sb->chunk_size;
749 		mddev->ctime = sb->ctime;
750 		mddev->utime = sb->utime;
751 		mddev->level = sb->level;
752 		mddev->clevel[0] = 0;
753 		mddev->layout = sb->layout;
754 		mddev->raid_disks = sb->raid_disks;
755 		mddev->size = sb->size;
756 		mddev->events = ev1;
757 		mddev->bitmap_offset = 0;
758 		mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
759 
760 		if (mddev->minor_version >= 91) {
761 			mddev->reshape_position = sb->reshape_position;
762 			mddev->delta_disks = sb->delta_disks;
763 			mddev->new_level = sb->new_level;
764 			mddev->new_layout = sb->new_layout;
765 			mddev->new_chunk = sb->new_chunk;
766 		} else {
767 			mddev->reshape_position = MaxSector;
768 			mddev->delta_disks = 0;
769 			mddev->new_level = mddev->level;
770 			mddev->new_layout = mddev->layout;
771 			mddev->new_chunk = mddev->chunk_size;
772 		}
773 
774 		if (sb->state & (1<<MD_SB_CLEAN))
775 			mddev->recovery_cp = MaxSector;
776 		else {
777 			if (sb->events_hi == sb->cp_events_hi &&
778 				sb->events_lo == sb->cp_events_lo) {
779 				mddev->recovery_cp = sb->recovery_cp;
780 			} else
781 				mddev->recovery_cp = 0;
782 		}
783 
784 		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
785 		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
786 		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
787 		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
788 
789 		mddev->max_disks = MD_SB_DISKS;
790 
791 		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
792 		    mddev->bitmap_file == NULL) {
793 			if (mddev->level != 1 && mddev->level != 4
794 			    && mddev->level != 5 && mddev->level != 6
795 			    && mddev->level != 10) {
796 				/* FIXME use a better test */
797 				printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
798 				return -EINVAL;
799 			}
800 			mddev->bitmap_offset = mddev->default_bitmap_offset;
801 		}
802 
803 	} else if (mddev->pers == NULL) {
804 		/* Insist on good event counter while assembling */
805 		++ev1;
806 		if (ev1 < mddev->events)
807 			return -EINVAL;
808 	} else if (mddev->bitmap) {
809 		/* if adding to array with a bitmap, then we can accept an
810 		 * older device ... but not too old.
811 		 */
812 		if (ev1 < mddev->bitmap->events_cleared)
813 			return 0;
814 	} else {
815 		if (ev1 < mddev->events)
816 			/* just a hot-add of a new device, leave raid_disk at -1 */
817 			return 0;
818 	}
819 
820 	if (mddev->level != LEVEL_MULTIPATH) {
821 		desc = sb->disks + rdev->desc_nr;
822 
823 		if (desc->state & (1<<MD_DISK_FAULTY))
824 			set_bit(Faulty, &rdev->flags);
825 		else if (desc->state & (1<<MD_DISK_SYNC) /* &&
826 			    desc->raid_disk < mddev->raid_disks */) {
827 			set_bit(In_sync, &rdev->flags);
828 			rdev->raid_disk = desc->raid_disk;
829 		}
830 		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
831 			set_bit(WriteMostly, &rdev->flags);
832 	} else /* MULTIPATH are always insync */
833 		set_bit(In_sync, &rdev->flags);
834 	return 0;
835 }
836 
837 /*
838  * sync_super for 0.90.0
839  */
840 static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
841 {
842 	mdp_super_t *sb;
843 	struct list_head *tmp;
844 	mdk_rdev_t *rdev2;
845 	int next_spare = mddev->raid_disks;
846 
847 
848 	/* make rdev->sb match mddev data..
849 	 *
850 	 * 1/ zero out disks
851 	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
852 	 * 3/ any empty disks < next_spare become removed
853 	 *
854 	 * disks[0] gets initialised to REMOVED because
855 	 * we cannot be sure from other fields if it has
856 	 * been initialised or not.
857 	 */
858 	int i;
859 	int active=0, working=0,failed=0,spare=0,nr_disks=0;
860 
861 	rdev->sb_size = MD_SB_BYTES;
862 
863 	sb = (mdp_super_t*)page_address(rdev->sb_page);
864 
865 	memset(sb, 0, sizeof(*sb));
866 
867 	sb->md_magic = MD_SB_MAGIC;
868 	sb->major_version = mddev->major_version;
869 	sb->patch_version = mddev->patch_version;
870 	sb->gvalid_words  = 0; /* ignored */
871 	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
872 	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
873 	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
874 	memcpy(&sb->set_uuid3, mddev->uuid+12,4);
875 
876 	sb->ctime = mddev->ctime;
877 	sb->level = mddev->level;
878 	sb->size  = mddev->size;
879 	sb->raid_disks = mddev->raid_disks;
880 	sb->md_minor = mddev->md_minor;
881 	sb->not_persistent = !mddev->persistent;
882 	sb->utime = mddev->utime;
883 	sb->state = 0;
884 	sb->events_hi = (mddev->events>>32);
885 	sb->events_lo = (u32)mddev->events;
886 
887 	if (mddev->reshape_position == MaxSector)
888 		sb->minor_version = 90;
889 	else {
890 		sb->minor_version = 91;
891 		sb->reshape_position = mddev->reshape_position;
892 		sb->new_level = mddev->new_level;
893 		sb->delta_disks = mddev->delta_disks;
894 		sb->new_layout = mddev->new_layout;
895 		sb->new_chunk = mddev->new_chunk;
896 	}
897 	mddev->minor_version = sb->minor_version;
898 	if (mddev->in_sync)
899 	{
900 		sb->recovery_cp = mddev->recovery_cp;
901 		sb->cp_events_hi = (mddev->events>>32);
902 		sb->cp_events_lo = (u32)mddev->events;
903 		if (mddev->recovery_cp == MaxSector)
904 			sb->state = (1<< MD_SB_CLEAN);
905 	} else
906 		sb->recovery_cp = 0;
907 
908 	sb->layout = mddev->layout;
909 	sb->chunk_size = mddev->chunk_size;
910 
911 	if (mddev->bitmap && mddev->bitmap_file == NULL)
912 		sb->state |= (1<<MD_SB_BITMAP_PRESENT);
913 
914 	sb->disks[0].state = (1<<MD_DISK_REMOVED);
915 	ITERATE_RDEV(mddev,rdev2,tmp) {
916 		mdp_disk_t *d;
917 		int desc_nr;
918 		if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
919 		    && !test_bit(Faulty, &rdev2->flags))
920 			desc_nr = rdev2->raid_disk;
921 		else
922 			desc_nr = next_spare++;
923 		rdev2->desc_nr = desc_nr;
924 		d = &sb->disks[rdev2->desc_nr];
925 		nr_disks++;
926 		d->number = rdev2->desc_nr;
927 		d->major = MAJOR(rdev2->bdev->bd_dev);
928 		d->minor = MINOR(rdev2->bdev->bd_dev);
929 		if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
930 		    && !test_bit(Faulty, &rdev2->flags))
931 			d->raid_disk = rdev2->raid_disk;
932 		else
933 			d->raid_disk = rdev2->desc_nr; /* compatibility */
934 		if (test_bit(Faulty, &rdev2->flags))
935 			d->state = (1<<MD_DISK_FAULTY);
936 		else if (test_bit(In_sync, &rdev2->flags)) {
937 			d->state = (1<<MD_DISK_ACTIVE);
938 			d->state |= (1<<MD_DISK_SYNC);
939 			active++;
940 			working++;
941 		} else {
942 			d->state = 0;
943 			spare++;
944 			working++;
945 		}
946 		if (test_bit(WriteMostly, &rdev2->flags))
947 			d->state |= (1<<MD_DISK_WRITEMOSTLY);
948 	}
949 	/* now set the "removed" and "faulty" bits on any missing devices */
950 	for (i=0 ; i < mddev->raid_disks ; i++) {
951 		mdp_disk_t *d = &sb->disks[i];
952 		if (d->state == 0 && d->number == 0) {
953 			d->number = i;
954 			d->raid_disk = i;
955 			d->state = (1<<MD_DISK_REMOVED);
956 			d->state |= (1<<MD_DISK_FAULTY);
957 			failed++;
958 		}
959 	}
960 	sb->nr_disks = nr_disks;
961 	sb->active_disks = active;
962 	sb->working_disks = working;
963 	sb->failed_disks = failed;
964 	sb->spare_disks = spare;
965 
966 	sb->this_disk = sb->disks[rdev->desc_nr];
967 	sb->sb_csum = calc_sb_csum(sb);
968 }
969 
970 /*
971  * version 1 superblock
972  */
973 
974 static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
975 {
976 	unsigned int disk_csum, csum;
977 	unsigned long long newcsum;
978 	int size = 256 + le32_to_cpu(sb->max_dev)*2;
979 	unsigned int *isuper = (unsigned int*)sb;
980 	int i;
981 
982 	disk_csum = sb->sb_csum;
983 	sb->sb_csum = 0;
984 	newcsum = 0;
985 	for (i=0; size>=4; size -= 4 )
986 		newcsum += le32_to_cpu(*isuper++);
987 
988 	if (size == 2)
989 		newcsum += le16_to_cpu(*(unsigned short*) isuper);
990 
991 	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
992 	sb->sb_csum = disk_csum;
993 	return cpu_to_le32(csum);
994 }
995 
996 static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
997 {
998 	struct mdp_superblock_1 *sb;
999 	int ret;
1000 	sector_t sb_offset;
1001 	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1002 	int bmask;
1003 
1004 	/*
1005 	 * Calculate the position of the superblock.
1006 	 * It is always aligned to a 4K boundary and
1007 	 * depeding on minor_version, it can be:
1008 	 * 0: At least 8K, but less than 12K, from end of device
1009 	 * 1: At start of device
1010 	 * 2: 4K from start of device.
1011 	 */
1012 	switch(minor_version) {
1013 	case 0:
1014 		sb_offset = rdev->bdev->bd_inode->i_size >> 9;
1015 		sb_offset -= 8*2;
1016 		sb_offset &= ~(sector_t)(4*2-1);
1017 		/* convert from sectors to K */
1018 		sb_offset /= 2;
1019 		break;
1020 	case 1:
1021 		sb_offset = 0;
1022 		break;
1023 	case 2:
1024 		sb_offset = 4;
1025 		break;
1026 	default:
1027 		return -EINVAL;
1028 	}
1029 	rdev->sb_offset = sb_offset;
1030 
1031 	/* superblock is rarely larger than 1K, but it can be larger,
1032 	 * and it is safe to read 4k, so we do that
1033 	 */
1034 	ret = read_disk_sb(rdev, 4096);
1035 	if (ret) return ret;
1036 
1037 
1038 	sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1039 
1040 	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1041 	    sb->major_version != cpu_to_le32(1) ||
1042 	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1043 	    le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
1044 	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1045 		return -EINVAL;
1046 
1047 	if (calc_sb_1_csum(sb) != sb->sb_csum) {
1048 		printk("md: invalid superblock checksum on %s\n",
1049 			bdevname(rdev->bdev,b));
1050 		return -EINVAL;
1051 	}
1052 	if (le64_to_cpu(sb->data_size) < 10) {
1053 		printk("md: data_size too small on %s\n",
1054 		       bdevname(rdev->bdev,b));
1055 		return -EINVAL;
1056 	}
1057 	rdev->preferred_minor = 0xffff;
1058 	rdev->data_offset = le64_to_cpu(sb->data_offset);
1059 	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1060 
1061 	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1062 	bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1;
1063 	if (rdev->sb_size & bmask)
1064 		rdev-> sb_size = (rdev->sb_size | bmask)+1;
1065 
1066 	if (refdev == 0)
1067 		ret = 1;
1068 	else {
1069 		__u64 ev1, ev2;
1070 		struct mdp_superblock_1 *refsb =
1071 			(struct mdp_superblock_1*)page_address(refdev->sb_page);
1072 
1073 		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1074 		    sb->level != refsb->level ||
1075 		    sb->layout != refsb->layout ||
1076 		    sb->chunksize != refsb->chunksize) {
1077 			printk(KERN_WARNING "md: %s has strangely different"
1078 				" superblock to %s\n",
1079 				bdevname(rdev->bdev,b),
1080 				bdevname(refdev->bdev,b2));
1081 			return -EINVAL;
1082 		}
1083 		ev1 = le64_to_cpu(sb->events);
1084 		ev2 = le64_to_cpu(refsb->events);
1085 
1086 		if (ev1 > ev2)
1087 			ret = 1;
1088 		else
1089 			ret = 0;
1090 	}
1091 	if (minor_version)
1092 		rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
1093 	else
1094 		rdev->size = rdev->sb_offset;
1095 	if (rdev->size < le64_to_cpu(sb->data_size)/2)
1096 		return -EINVAL;
1097 	rdev->size = le64_to_cpu(sb->data_size)/2;
1098 	if (le32_to_cpu(sb->chunksize))
1099 		rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
1100 
1101 	if (le32_to_cpu(sb->size) > rdev->size*2)
1102 		return -EINVAL;
1103 	return ret;
1104 }
1105 
1106 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1107 {
1108 	struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1109 	__u64 ev1 = le64_to_cpu(sb->events);
1110 
1111 	rdev->raid_disk = -1;
1112 	rdev->flags = 0;
1113 	if (mddev->raid_disks == 0) {
1114 		mddev->major_version = 1;
1115 		mddev->patch_version = 0;
1116 		mddev->persistent = 1;
1117 		mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
1118 		mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
1119 		mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
1120 		mddev->level = le32_to_cpu(sb->level);
1121 		mddev->clevel[0] = 0;
1122 		mddev->layout = le32_to_cpu(sb->layout);
1123 		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1124 		mddev->size = le64_to_cpu(sb->size)/2;
1125 		mddev->events = ev1;
1126 		mddev->bitmap_offset = 0;
1127 		mddev->default_bitmap_offset = 1024 >> 9;
1128 
1129 		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1130 		memcpy(mddev->uuid, sb->set_uuid, 16);
1131 
1132 		mddev->max_disks =  (4096-256)/2;
1133 
1134 		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1135 		    mddev->bitmap_file == NULL ) {
1136 			if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
1137 			    && mddev->level != 10) {
1138 				printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
1139 				return -EINVAL;
1140 			}
1141 			mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
1142 		}
1143 		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1144 			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1145 			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1146 			mddev->new_level = le32_to_cpu(sb->new_level);
1147 			mddev->new_layout = le32_to_cpu(sb->new_layout);
1148 			mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9;
1149 		} else {
1150 			mddev->reshape_position = MaxSector;
1151 			mddev->delta_disks = 0;
1152 			mddev->new_level = mddev->level;
1153 			mddev->new_layout = mddev->layout;
1154 			mddev->new_chunk = mddev->chunk_size;
1155 		}
1156 
1157 	} else if (mddev->pers == NULL) {
1158 		/* Insist of good event counter while assembling */
1159 		++ev1;
1160 		if (ev1 < mddev->events)
1161 			return -EINVAL;
1162 	} else if (mddev->bitmap) {
1163 		/* If adding to array with a bitmap, then we can accept an
1164 		 * older device, but not too old.
1165 		 */
1166 		if (ev1 < mddev->bitmap->events_cleared)
1167 			return 0;
1168 	} else {
1169 		if (ev1 < mddev->events)
1170 			/* just a hot-add of a new device, leave raid_disk at -1 */
1171 			return 0;
1172 	}
1173 	if (mddev->level != LEVEL_MULTIPATH) {
1174 		int role;
1175 		rdev->desc_nr = le32_to_cpu(sb->dev_number);
1176 		role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1177 		switch(role) {
1178 		case 0xffff: /* spare */
1179 			break;
1180 		case 0xfffe: /* faulty */
1181 			set_bit(Faulty, &rdev->flags);
1182 			break;
1183 		default:
1184 			if ((le32_to_cpu(sb->feature_map) &
1185 			     MD_FEATURE_RECOVERY_OFFSET))
1186 				rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1187 			else
1188 				set_bit(In_sync, &rdev->flags);
1189 			rdev->raid_disk = role;
1190 			break;
1191 		}
1192 		if (sb->devflags & WriteMostly1)
1193 			set_bit(WriteMostly, &rdev->flags);
1194 	} else /* MULTIPATH are always insync */
1195 		set_bit(In_sync, &rdev->flags);
1196 
1197 	return 0;
1198 }
1199 
1200 static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1201 {
1202 	struct mdp_superblock_1 *sb;
1203 	struct list_head *tmp;
1204 	mdk_rdev_t *rdev2;
1205 	int max_dev, i;
1206 	/* make rdev->sb match mddev and rdev data. */
1207 
1208 	sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1209 
1210 	sb->feature_map = 0;
1211 	sb->pad0 = 0;
1212 	sb->recovery_offset = cpu_to_le64(0);
1213 	memset(sb->pad1, 0, sizeof(sb->pad1));
1214 	memset(sb->pad2, 0, sizeof(sb->pad2));
1215 	memset(sb->pad3, 0, sizeof(sb->pad3));
1216 
1217 	sb->utime = cpu_to_le64((__u64)mddev->utime);
1218 	sb->events = cpu_to_le64(mddev->events);
1219 	if (mddev->in_sync)
1220 		sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1221 	else
1222 		sb->resync_offset = cpu_to_le64(0);
1223 
1224 	sb->cnt_corrected_read = atomic_read(&rdev->corrected_errors);
1225 
1226 	sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1227 	sb->size = cpu_to_le64(mddev->size<<1);
1228 
1229 	if (mddev->bitmap && mddev->bitmap_file == NULL) {
1230 		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
1231 		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1232 	}
1233 
1234 	if (rdev->raid_disk >= 0 &&
1235 	    !test_bit(In_sync, &rdev->flags) &&
1236 	    rdev->recovery_offset > 0) {
1237 		sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1238 		sb->recovery_offset = cpu_to_le64(rdev->recovery_offset);
1239 	}
1240 
1241 	if (mddev->reshape_position != MaxSector) {
1242 		sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1243 		sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1244 		sb->new_layout = cpu_to_le32(mddev->new_layout);
1245 		sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1246 		sb->new_level = cpu_to_le32(mddev->new_level);
1247 		sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9);
1248 	}
1249 
1250 	max_dev = 0;
1251 	ITERATE_RDEV(mddev,rdev2,tmp)
1252 		if (rdev2->desc_nr+1 > max_dev)
1253 			max_dev = rdev2->desc_nr+1;
1254 
1255 	sb->max_dev = cpu_to_le32(max_dev);
1256 	for (i=0; i<max_dev;i++)
1257 		sb->dev_roles[i] = cpu_to_le16(0xfffe);
1258 
1259 	ITERATE_RDEV(mddev,rdev2,tmp) {
1260 		i = rdev2->desc_nr;
1261 		if (test_bit(Faulty, &rdev2->flags))
1262 			sb->dev_roles[i] = cpu_to_le16(0xfffe);
1263 		else if (test_bit(In_sync, &rdev2->flags))
1264 			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1265 		else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0)
1266 			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1267 		else
1268 			sb->dev_roles[i] = cpu_to_le16(0xffff);
1269 	}
1270 
1271 	sb->sb_csum = calc_sb_1_csum(sb);
1272 }
1273 
1274 
1275 static struct super_type super_types[] = {
1276 	[0] = {
1277 		.name	= "0.90.0",
1278 		.owner	= THIS_MODULE,
1279 		.load_super	= super_90_load,
1280 		.validate_super	= super_90_validate,
1281 		.sync_super	= super_90_sync,
1282 	},
1283 	[1] = {
1284 		.name	= "md-1",
1285 		.owner	= THIS_MODULE,
1286 		.load_super	= super_1_load,
1287 		.validate_super	= super_1_validate,
1288 		.sync_super	= super_1_sync,
1289 	},
1290 };
1291 
1292 static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev)
1293 {
1294 	struct list_head *tmp;
1295 	mdk_rdev_t *rdev;
1296 
1297 	ITERATE_RDEV(mddev,rdev,tmp)
1298 		if (rdev->bdev->bd_contains == dev->bdev->bd_contains)
1299 			return rdev;
1300 
1301 	return NULL;
1302 }
1303 
1304 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
1305 {
1306 	struct list_head *tmp;
1307 	mdk_rdev_t *rdev;
1308 
1309 	ITERATE_RDEV(mddev1,rdev,tmp)
1310 		if (match_dev_unit(mddev2, rdev))
1311 			return 1;
1312 
1313 	return 0;
1314 }
1315 
1316 static LIST_HEAD(pending_raid_disks);
1317 
1318 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1319 {
1320 	mdk_rdev_t *same_pdev;
1321 	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1322 	struct kobject *ko;
1323 	char *s;
1324 
1325 	if (rdev->mddev) {
1326 		MD_BUG();
1327 		return -EINVAL;
1328 	}
1329 	/* make sure rdev->size exceeds mddev->size */
1330 	if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) {
1331 		if (mddev->pers)
1332 			/* Cannot change size, so fail */
1333 			return -ENOSPC;
1334 		else
1335 			mddev->size = rdev->size;
1336 	}
1337 	same_pdev = match_dev_unit(mddev, rdev);
1338 	if (same_pdev)
1339 		printk(KERN_WARNING
1340 			"%s: WARNING: %s appears to be on the same physical"
1341 	 		" disk as %s. True\n     protection against single-disk"
1342 			" failure might be compromised.\n",
1343 			mdname(mddev), bdevname(rdev->bdev,b),
1344 			bdevname(same_pdev->bdev,b2));
1345 
1346 	/* Verify rdev->desc_nr is unique.
1347 	 * If it is -1, assign a free number, else
1348 	 * check number is not in use
1349 	 */
1350 	if (rdev->desc_nr < 0) {
1351 		int choice = 0;
1352 		if (mddev->pers) choice = mddev->raid_disks;
1353 		while (find_rdev_nr(mddev, choice))
1354 			choice++;
1355 		rdev->desc_nr = choice;
1356 	} else {
1357 		if (find_rdev_nr(mddev, rdev->desc_nr))
1358 			return -EBUSY;
1359 	}
1360 	bdevname(rdev->bdev,b);
1361 	if (kobject_set_name(&rdev->kobj, "dev-%s", b) < 0)
1362 		return -ENOMEM;
1363 	while ( (s=strchr(rdev->kobj.k_name, '/')) != NULL)
1364 		*s = '!';
1365 
1366 	list_add(&rdev->same_set, &mddev->disks);
1367 	rdev->mddev = mddev;
1368 	printk(KERN_INFO "md: bind<%s>\n", b);
1369 
1370 	rdev->kobj.parent = &mddev->kobj;
1371 	kobject_add(&rdev->kobj);
1372 
1373 	if (rdev->bdev->bd_part)
1374 		ko = &rdev->bdev->bd_part->kobj;
1375 	else
1376 		ko = &rdev->bdev->bd_disk->kobj;
1377 	sysfs_create_link(&rdev->kobj, ko, "block");
1378 	bd_claim_by_disk(rdev->bdev, rdev, mddev->gendisk);
1379 	return 0;
1380 }
1381 
1382 static void unbind_rdev_from_array(mdk_rdev_t * rdev)
1383 {
1384 	char b[BDEVNAME_SIZE];
1385 	if (!rdev->mddev) {
1386 		MD_BUG();
1387 		return;
1388 	}
1389 	bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
1390 	list_del_init(&rdev->same_set);
1391 	printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
1392 	rdev->mddev = NULL;
1393 	sysfs_remove_link(&rdev->kobj, "block");
1394 	kobject_del(&rdev->kobj);
1395 }
1396 
1397 /*
1398  * prevent the device from being mounted, repartitioned or
1399  * otherwise reused by a RAID array (or any other kernel
1400  * subsystem), by bd_claiming the device.
1401  */
1402 static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
1403 {
1404 	int err = 0;
1405 	struct block_device *bdev;
1406 	char b[BDEVNAME_SIZE];
1407 
1408 	bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
1409 	if (IS_ERR(bdev)) {
1410 		printk(KERN_ERR "md: could not open %s.\n",
1411 			__bdevname(dev, b));
1412 		return PTR_ERR(bdev);
1413 	}
1414 	err = bd_claim(bdev, rdev);
1415 	if (err) {
1416 		printk(KERN_ERR "md: could not bd_claim %s.\n",
1417 			bdevname(bdev, b));
1418 		blkdev_put(bdev);
1419 		return err;
1420 	}
1421 	rdev->bdev = bdev;
1422 	return err;
1423 }
1424 
1425 static void unlock_rdev(mdk_rdev_t *rdev)
1426 {
1427 	struct block_device *bdev = rdev->bdev;
1428 	rdev->bdev = NULL;
1429 	if (!bdev)
1430 		MD_BUG();
1431 	bd_release(bdev);
1432 	blkdev_put(bdev);
1433 }
1434 
1435 void md_autodetect_dev(dev_t dev);
1436 
1437 static void export_rdev(mdk_rdev_t * rdev)
1438 {
1439 	char b[BDEVNAME_SIZE];
1440 	printk(KERN_INFO "md: export_rdev(%s)\n",
1441 		bdevname(rdev->bdev,b));
1442 	if (rdev->mddev)
1443 		MD_BUG();
1444 	free_disk_sb(rdev);
1445 	list_del_init(&rdev->same_set);
1446 #ifndef MODULE
1447 	md_autodetect_dev(rdev->bdev->bd_dev);
1448 #endif
1449 	unlock_rdev(rdev);
1450 	kobject_put(&rdev->kobj);
1451 }
1452 
1453 static void kick_rdev_from_array(mdk_rdev_t * rdev)
1454 {
1455 	unbind_rdev_from_array(rdev);
1456 	export_rdev(rdev);
1457 }
1458 
1459 static void export_array(mddev_t *mddev)
1460 {
1461 	struct list_head *tmp;
1462 	mdk_rdev_t *rdev;
1463 
1464 	ITERATE_RDEV(mddev,rdev,tmp) {
1465 		if (!rdev->mddev) {
1466 			MD_BUG();
1467 			continue;
1468 		}
1469 		kick_rdev_from_array(rdev);
1470 	}
1471 	if (!list_empty(&mddev->disks))
1472 		MD_BUG();
1473 	mddev->raid_disks = 0;
1474 	mddev->major_version = 0;
1475 }
1476 
1477 static void print_desc(mdp_disk_t *desc)
1478 {
1479 	printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
1480 		desc->major,desc->minor,desc->raid_disk,desc->state);
1481 }
1482 
1483 static void print_sb(mdp_super_t *sb)
1484 {
1485 	int i;
1486 
1487 	printk(KERN_INFO
1488 		"md:  SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
1489 		sb->major_version, sb->minor_version, sb->patch_version,
1490 		sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
1491 		sb->ctime);
1492 	printk(KERN_INFO "md:     L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
1493 		sb->level, sb->size, sb->nr_disks, sb->raid_disks,
1494 		sb->md_minor, sb->layout, sb->chunk_size);
1495 	printk(KERN_INFO "md:     UT:%08x ST:%d AD:%d WD:%d"
1496 		" FD:%d SD:%d CSUM:%08x E:%08lx\n",
1497 		sb->utime, sb->state, sb->active_disks, sb->working_disks,
1498 		sb->failed_disks, sb->spare_disks,
1499 		sb->sb_csum, (unsigned long)sb->events_lo);
1500 
1501 	printk(KERN_INFO);
1502 	for (i = 0; i < MD_SB_DISKS; i++) {
1503 		mdp_disk_t *desc;
1504 
1505 		desc = sb->disks + i;
1506 		if (desc->number || desc->major || desc->minor ||
1507 		    desc->raid_disk || (desc->state && (desc->state != 4))) {
1508 			printk("     D %2d: ", i);
1509 			print_desc(desc);
1510 		}
1511 	}
1512 	printk(KERN_INFO "md:     THIS: ");
1513 	print_desc(&sb->this_disk);
1514 
1515 }
1516 
1517 static void print_rdev(mdk_rdev_t *rdev)
1518 {
1519 	char b[BDEVNAME_SIZE];
1520 	printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
1521 		bdevname(rdev->bdev,b), (unsigned long long)rdev->size,
1522 	        test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
1523 	        rdev->desc_nr);
1524 	if (rdev->sb_loaded) {
1525 		printk(KERN_INFO "md: rdev superblock:\n");
1526 		print_sb((mdp_super_t*)page_address(rdev->sb_page));
1527 	} else
1528 		printk(KERN_INFO "md: no rdev superblock!\n");
1529 }
1530 
1531 static void md_print_devices(void)
1532 {
1533 	struct list_head *tmp, *tmp2;
1534 	mdk_rdev_t *rdev;
1535 	mddev_t *mddev;
1536 	char b[BDEVNAME_SIZE];
1537 
1538 	printk("\n");
1539 	printk("md:	**********************************\n");
1540 	printk("md:	* <COMPLETE RAID STATE PRINTOUT> *\n");
1541 	printk("md:	**********************************\n");
1542 	ITERATE_MDDEV(mddev,tmp) {
1543 
1544 		if (mddev->bitmap)
1545 			bitmap_print_sb(mddev->bitmap);
1546 		else
1547 			printk("%s: ", mdname(mddev));
1548 		ITERATE_RDEV(mddev,rdev,tmp2)
1549 			printk("<%s>", bdevname(rdev->bdev,b));
1550 		printk("\n");
1551 
1552 		ITERATE_RDEV(mddev,rdev,tmp2)
1553 			print_rdev(rdev);
1554 	}
1555 	printk("md:	**********************************\n");
1556 	printk("\n");
1557 }
1558 
1559 
1560 static void sync_sbs(mddev_t * mddev, int nospares)
1561 {
1562 	/* Update each superblock (in-memory image), but
1563 	 * if we are allowed to, skip spares which already
1564 	 * have the right event counter, or have one earlier
1565 	 * (which would mean they aren't being marked as dirty
1566 	 * with the rest of the array)
1567 	 */
1568 	mdk_rdev_t *rdev;
1569 	struct list_head *tmp;
1570 
1571 	ITERATE_RDEV(mddev,rdev,tmp) {
1572 		if (rdev->sb_events == mddev->events ||
1573 		    (nospares &&
1574 		     rdev->raid_disk < 0 &&
1575 		     (rdev->sb_events&1)==0 &&
1576 		     rdev->sb_events+1 == mddev->events)) {
1577 			/* Don't update this superblock */
1578 			rdev->sb_loaded = 2;
1579 		} else {
1580 			super_types[mddev->major_version].
1581 				sync_super(mddev, rdev);
1582 			rdev->sb_loaded = 1;
1583 		}
1584 	}
1585 }
1586 
1587 void md_update_sb(mddev_t * mddev)
1588 {
1589 	int err;
1590 	struct list_head *tmp;
1591 	mdk_rdev_t *rdev;
1592 	int sync_req;
1593 	int nospares = 0;
1594 
1595 repeat:
1596 	spin_lock_irq(&mddev->write_lock);
1597 	sync_req = mddev->in_sync;
1598 	mddev->utime = get_seconds();
1599 	if (mddev->sb_dirty == 3)
1600 		/* just a clean<-> dirty transition, possibly leave spares alone,
1601 		 * though if events isn't the right even/odd, we will have to do
1602 		 * spares after all
1603 		 */
1604 		nospares = 1;
1605 
1606 	/* If this is just a dirty<->clean transition, and the array is clean
1607 	 * and 'events' is odd, we can roll back to the previous clean state */
1608 	if (mddev->sb_dirty == 3
1609 	    && (mddev->in_sync && mddev->recovery_cp == MaxSector)
1610 	    && (mddev->events & 1))
1611 		mddev->events--;
1612 	else {
1613 		/* otherwise we have to go forward and ... */
1614 		mddev->events ++;
1615 		if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
1616 			/* .. if the array isn't clean, insist on an odd 'events' */
1617 			if ((mddev->events&1)==0) {
1618 				mddev->events++;
1619 				nospares = 0;
1620 			}
1621 		} else {
1622 			/* otherwise insist on an even 'events' (for clean states) */
1623 			if ((mddev->events&1)) {
1624 				mddev->events++;
1625 				nospares = 0;
1626 			}
1627 		}
1628 	}
1629 
1630 	if (!mddev->events) {
1631 		/*
1632 		 * oops, this 64-bit counter should never wrap.
1633 		 * Either we are in around ~1 trillion A.C., assuming
1634 		 * 1 reboot per second, or we have a bug:
1635 		 */
1636 		MD_BUG();
1637 		mddev->events --;
1638 	}
1639 	mddev->sb_dirty = 2;
1640 	sync_sbs(mddev, nospares);
1641 
1642 	/*
1643 	 * do not write anything to disk if using
1644 	 * nonpersistent superblocks
1645 	 */
1646 	if (!mddev->persistent) {
1647 		mddev->sb_dirty = 0;
1648 		spin_unlock_irq(&mddev->write_lock);
1649 		wake_up(&mddev->sb_wait);
1650 		return;
1651 	}
1652 	spin_unlock_irq(&mddev->write_lock);
1653 
1654 	dprintk(KERN_INFO
1655 		"md: updating %s RAID superblock on device (in sync %d)\n",
1656 		mdname(mddev),mddev->in_sync);
1657 
1658 	err = bitmap_update_sb(mddev->bitmap);
1659 	ITERATE_RDEV(mddev,rdev,tmp) {
1660 		char b[BDEVNAME_SIZE];
1661 		dprintk(KERN_INFO "md: ");
1662 		if (rdev->sb_loaded != 1)
1663 			continue; /* no noise on spare devices */
1664 		if (test_bit(Faulty, &rdev->flags))
1665 			dprintk("(skipping faulty ");
1666 
1667 		dprintk("%s ", bdevname(rdev->bdev,b));
1668 		if (!test_bit(Faulty, &rdev->flags)) {
1669 			md_super_write(mddev,rdev,
1670 				       rdev->sb_offset<<1, rdev->sb_size,
1671 				       rdev->sb_page);
1672 			dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
1673 				bdevname(rdev->bdev,b),
1674 				(unsigned long long)rdev->sb_offset);
1675 			rdev->sb_events = mddev->events;
1676 
1677 		} else
1678 			dprintk(")\n");
1679 		if (mddev->level == LEVEL_MULTIPATH)
1680 			/* only need to write one superblock... */
1681 			break;
1682 	}
1683 	md_super_wait(mddev);
1684 	/* if there was a failure, sb_dirty was set to 1, and we re-write super */
1685 
1686 	spin_lock_irq(&mddev->write_lock);
1687 	if (mddev->in_sync != sync_req|| mddev->sb_dirty == 1) {
1688 		/* have to write it out again */
1689 		spin_unlock_irq(&mddev->write_lock);
1690 		goto repeat;
1691 	}
1692 	mddev->sb_dirty = 0;
1693 	spin_unlock_irq(&mddev->write_lock);
1694 	wake_up(&mddev->sb_wait);
1695 
1696 }
1697 EXPORT_SYMBOL_GPL(md_update_sb);
1698 
1699 /* words written to sysfs files may, or my not, be \n terminated.
1700  * We want to accept with case. For this we use cmd_match.
1701  */
1702 static int cmd_match(const char *cmd, const char *str)
1703 {
1704 	/* See if cmd, written into a sysfs file, matches
1705 	 * str.  They must either be the same, or cmd can
1706 	 * have a trailing newline
1707 	 */
1708 	while (*cmd && *str && *cmd == *str) {
1709 		cmd++;
1710 		str++;
1711 	}
1712 	if (*cmd == '\n')
1713 		cmd++;
1714 	if (*str || *cmd)
1715 		return 0;
1716 	return 1;
1717 }
1718 
1719 struct rdev_sysfs_entry {
1720 	struct attribute attr;
1721 	ssize_t (*show)(mdk_rdev_t *, char *);
1722 	ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
1723 };
1724 
1725 static ssize_t
1726 state_show(mdk_rdev_t *rdev, char *page)
1727 {
1728 	char *sep = "";
1729 	int len=0;
1730 
1731 	if (test_bit(Faulty, &rdev->flags)) {
1732 		len+= sprintf(page+len, "%sfaulty",sep);
1733 		sep = ",";
1734 	}
1735 	if (test_bit(In_sync, &rdev->flags)) {
1736 		len += sprintf(page+len, "%sin_sync",sep);
1737 		sep = ",";
1738 	}
1739 	if (test_bit(WriteMostly, &rdev->flags)) {
1740 		len += sprintf(page+len, "%swrite_mostly",sep);
1741 		sep = ",";
1742 	}
1743 	if (!test_bit(Faulty, &rdev->flags) &&
1744 	    !test_bit(In_sync, &rdev->flags)) {
1745 		len += sprintf(page+len, "%sspare", sep);
1746 		sep = ",";
1747 	}
1748 	return len+sprintf(page+len, "\n");
1749 }
1750 
1751 static ssize_t
1752 state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1753 {
1754 	/* can write
1755 	 *  faulty  - simulates and error
1756 	 *  remove  - disconnects the device
1757 	 *  writemostly - sets write_mostly
1758 	 *  -writemostly - clears write_mostly
1759 	 */
1760 	int err = -EINVAL;
1761 	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
1762 		md_error(rdev->mddev, rdev);
1763 		err = 0;
1764 	} else if (cmd_match(buf, "remove")) {
1765 		if (rdev->raid_disk >= 0)
1766 			err = -EBUSY;
1767 		else {
1768 			mddev_t *mddev = rdev->mddev;
1769 			kick_rdev_from_array(rdev);
1770 			md_update_sb(mddev);
1771 			md_new_event(mddev);
1772 			err = 0;
1773 		}
1774 	} else if (cmd_match(buf, "writemostly")) {
1775 		set_bit(WriteMostly, &rdev->flags);
1776 		err = 0;
1777 	} else if (cmd_match(buf, "-writemostly")) {
1778 		clear_bit(WriteMostly, &rdev->flags);
1779 		err = 0;
1780 	}
1781 	return err ? err : len;
1782 }
1783 static struct rdev_sysfs_entry
1784 rdev_state = __ATTR(state, 0644, state_show, state_store);
1785 
1786 static ssize_t
1787 super_show(mdk_rdev_t *rdev, char *page)
1788 {
1789 	if (rdev->sb_loaded && rdev->sb_size) {
1790 		memcpy(page, page_address(rdev->sb_page), rdev->sb_size);
1791 		return rdev->sb_size;
1792 	} else
1793 		return 0;
1794 }
1795 static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super);
1796 
1797 static ssize_t
1798 errors_show(mdk_rdev_t *rdev, char *page)
1799 {
1800 	return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
1801 }
1802 
1803 static ssize_t
1804 errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1805 {
1806 	char *e;
1807 	unsigned long n = simple_strtoul(buf, &e, 10);
1808 	if (*buf && (*e == 0 || *e == '\n')) {
1809 		atomic_set(&rdev->corrected_errors, n);
1810 		return len;
1811 	}
1812 	return -EINVAL;
1813 }
1814 static struct rdev_sysfs_entry rdev_errors =
1815 __ATTR(errors, 0644, errors_show, errors_store);
1816 
1817 static ssize_t
1818 slot_show(mdk_rdev_t *rdev, char *page)
1819 {
1820 	if (rdev->raid_disk < 0)
1821 		return sprintf(page, "none\n");
1822 	else
1823 		return sprintf(page, "%d\n", rdev->raid_disk);
1824 }
1825 
1826 static ssize_t
1827 slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1828 {
1829 	char *e;
1830 	int slot = simple_strtoul(buf, &e, 10);
1831 	if (strncmp(buf, "none", 4)==0)
1832 		slot = -1;
1833 	else if (e==buf || (*e && *e!= '\n'))
1834 		return -EINVAL;
1835 	if (rdev->mddev->pers)
1836 		/* Cannot set slot in active array (yet) */
1837 		return -EBUSY;
1838 	if (slot >= rdev->mddev->raid_disks)
1839 		return -ENOSPC;
1840 	rdev->raid_disk = slot;
1841 	/* assume it is working */
1842 	rdev->flags = 0;
1843 	set_bit(In_sync, &rdev->flags);
1844 	return len;
1845 }
1846 
1847 
1848 static struct rdev_sysfs_entry rdev_slot =
1849 __ATTR(slot, 0644, slot_show, slot_store);
1850 
1851 static ssize_t
1852 offset_show(mdk_rdev_t *rdev, char *page)
1853 {
1854 	return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
1855 }
1856 
1857 static ssize_t
1858 offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1859 {
1860 	char *e;
1861 	unsigned long long offset = simple_strtoull(buf, &e, 10);
1862 	if (e==buf || (*e && *e != '\n'))
1863 		return -EINVAL;
1864 	if (rdev->mddev->pers)
1865 		return -EBUSY;
1866 	rdev->data_offset = offset;
1867 	return len;
1868 }
1869 
1870 static struct rdev_sysfs_entry rdev_offset =
1871 __ATTR(offset, 0644, offset_show, offset_store);
1872 
1873 static ssize_t
1874 rdev_size_show(mdk_rdev_t *rdev, char *page)
1875 {
1876 	return sprintf(page, "%llu\n", (unsigned long long)rdev->size);
1877 }
1878 
1879 static ssize_t
1880 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1881 {
1882 	char *e;
1883 	unsigned long long size = simple_strtoull(buf, &e, 10);
1884 	if (e==buf || (*e && *e != '\n'))
1885 		return -EINVAL;
1886 	if (rdev->mddev->pers)
1887 		return -EBUSY;
1888 	rdev->size = size;
1889 	if (size < rdev->mddev->size || rdev->mddev->size == 0)
1890 		rdev->mddev->size = size;
1891 	return len;
1892 }
1893 
1894 static struct rdev_sysfs_entry rdev_size =
1895 __ATTR(size, 0644, rdev_size_show, rdev_size_store);
1896 
1897 static struct attribute *rdev_default_attrs[] = {
1898 	&rdev_state.attr,
1899 	&rdev_super.attr,
1900 	&rdev_errors.attr,
1901 	&rdev_slot.attr,
1902 	&rdev_offset.attr,
1903 	&rdev_size.attr,
1904 	NULL,
1905 };
1906 static ssize_t
1907 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
1908 {
1909 	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
1910 	mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
1911 
1912 	if (!entry->show)
1913 		return -EIO;
1914 	return entry->show(rdev, page);
1915 }
1916 
1917 static ssize_t
1918 rdev_attr_store(struct kobject *kobj, struct attribute *attr,
1919 	      const char *page, size_t length)
1920 {
1921 	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
1922 	mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
1923 
1924 	if (!entry->store)
1925 		return -EIO;
1926 	return entry->store(rdev, page, length);
1927 }
1928 
1929 static void rdev_free(struct kobject *ko)
1930 {
1931 	mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj);
1932 	kfree(rdev);
1933 }
1934 static struct sysfs_ops rdev_sysfs_ops = {
1935 	.show		= rdev_attr_show,
1936 	.store		= rdev_attr_store,
1937 };
1938 static struct kobj_type rdev_ktype = {
1939 	.release	= rdev_free,
1940 	.sysfs_ops	= &rdev_sysfs_ops,
1941 	.default_attrs	= rdev_default_attrs,
1942 };
1943 
1944 /*
1945  * Import a device. If 'super_format' >= 0, then sanity check the superblock
1946  *
1947  * mark the device faulty if:
1948  *
1949  *   - the device is nonexistent (zero size)
1950  *   - the device has no valid superblock
1951  *
1952  * a faulty rdev _never_ has rdev->sb set.
1953  */
1954 static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
1955 {
1956 	char b[BDEVNAME_SIZE];
1957 	int err;
1958 	mdk_rdev_t *rdev;
1959 	sector_t size;
1960 
1961 	rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
1962 	if (!rdev) {
1963 		printk(KERN_ERR "md: could not alloc mem for new device!\n");
1964 		return ERR_PTR(-ENOMEM);
1965 	}
1966 
1967 	if ((err = alloc_disk_sb(rdev)))
1968 		goto abort_free;
1969 
1970 	err = lock_rdev(rdev, newdev);
1971 	if (err)
1972 		goto abort_free;
1973 
1974 	rdev->kobj.parent = NULL;
1975 	rdev->kobj.ktype = &rdev_ktype;
1976 	kobject_init(&rdev->kobj);
1977 
1978 	rdev->desc_nr = -1;
1979 	rdev->flags = 0;
1980 	rdev->data_offset = 0;
1981 	rdev->sb_events = 0;
1982 	atomic_set(&rdev->nr_pending, 0);
1983 	atomic_set(&rdev->read_errors, 0);
1984 	atomic_set(&rdev->corrected_errors, 0);
1985 
1986 	size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
1987 	if (!size) {
1988 		printk(KERN_WARNING
1989 			"md: %s has zero or unknown size, marking faulty!\n",
1990 			bdevname(rdev->bdev,b));
1991 		err = -EINVAL;
1992 		goto abort_free;
1993 	}
1994 
1995 	if (super_format >= 0) {
1996 		err = super_types[super_format].
1997 			load_super(rdev, NULL, super_minor);
1998 		if (err == -EINVAL) {
1999 			printk(KERN_WARNING
2000 				"md: %s has invalid sb, not importing!\n",
2001 				bdevname(rdev->bdev,b));
2002 			goto abort_free;
2003 		}
2004 		if (err < 0) {
2005 			printk(KERN_WARNING
2006 				"md: could not read %s's sb, not importing!\n",
2007 				bdevname(rdev->bdev,b));
2008 			goto abort_free;
2009 		}
2010 	}
2011 	INIT_LIST_HEAD(&rdev->same_set);
2012 
2013 	return rdev;
2014 
2015 abort_free:
2016 	if (rdev->sb_page) {
2017 		if (rdev->bdev)
2018 			unlock_rdev(rdev);
2019 		free_disk_sb(rdev);
2020 	}
2021 	kfree(rdev);
2022 	return ERR_PTR(err);
2023 }
2024 
2025 /*
2026  * Check a full RAID array for plausibility
2027  */
2028 
2029 
2030 static void analyze_sbs(mddev_t * mddev)
2031 {
2032 	int i;
2033 	struct list_head *tmp;
2034 	mdk_rdev_t *rdev, *freshest;
2035 	char b[BDEVNAME_SIZE];
2036 
2037 	freshest = NULL;
2038 	ITERATE_RDEV(mddev,rdev,tmp)
2039 		switch (super_types[mddev->major_version].
2040 			load_super(rdev, freshest, mddev->minor_version)) {
2041 		case 1:
2042 			freshest = rdev;
2043 			break;
2044 		case 0:
2045 			break;
2046 		default:
2047 			printk( KERN_ERR \
2048 				"md: fatal superblock inconsistency in %s"
2049 				" -- removing from array\n",
2050 				bdevname(rdev->bdev,b));
2051 			kick_rdev_from_array(rdev);
2052 		}
2053 
2054 
2055 	super_types[mddev->major_version].
2056 		validate_super(mddev, freshest);
2057 
2058 	i = 0;
2059 	ITERATE_RDEV(mddev,rdev,tmp) {
2060 		if (rdev != freshest)
2061 			if (super_types[mddev->major_version].
2062 			    validate_super(mddev, rdev)) {
2063 				printk(KERN_WARNING "md: kicking non-fresh %s"
2064 					" from array!\n",
2065 					bdevname(rdev->bdev,b));
2066 				kick_rdev_from_array(rdev);
2067 				continue;
2068 			}
2069 		if (mddev->level == LEVEL_MULTIPATH) {
2070 			rdev->desc_nr = i++;
2071 			rdev->raid_disk = rdev->desc_nr;
2072 			set_bit(In_sync, &rdev->flags);
2073 		}
2074 	}
2075 
2076 
2077 
2078 	if (mddev->recovery_cp != MaxSector &&
2079 	    mddev->level >= 1)
2080 		printk(KERN_ERR "md: %s: raid array is not clean"
2081 		       " -- starting background reconstruction\n",
2082 		       mdname(mddev));
2083 
2084 }
2085 
2086 static ssize_t
2087 safe_delay_show(mddev_t *mddev, char *page)
2088 {
2089 	int msec = (mddev->safemode_delay*1000)/HZ;
2090 	return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
2091 }
2092 static ssize_t
2093 safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
2094 {
2095 	int scale=1;
2096 	int dot=0;
2097 	int i;
2098 	unsigned long msec;
2099 	char buf[30];
2100 	char *e;
2101 	/* remove a period, and count digits after it */
2102 	if (len >= sizeof(buf))
2103 		return -EINVAL;
2104 	strlcpy(buf, cbuf, len);
2105 	buf[len] = 0;
2106 	for (i=0; i<len; i++) {
2107 		if (dot) {
2108 			if (isdigit(buf[i])) {
2109 				buf[i-1] = buf[i];
2110 				scale *= 10;
2111 			}
2112 			buf[i] = 0;
2113 		} else if (buf[i] == '.') {
2114 			dot=1;
2115 			buf[i] = 0;
2116 		}
2117 	}
2118 	msec = simple_strtoul(buf, &e, 10);
2119 	if (e == buf || (*e && *e != '\n'))
2120 		return -EINVAL;
2121 	msec = (msec * 1000) / scale;
2122 	if (msec == 0)
2123 		mddev->safemode_delay = 0;
2124 	else {
2125 		mddev->safemode_delay = (msec*HZ)/1000;
2126 		if (mddev->safemode_delay == 0)
2127 			mddev->safemode_delay = 1;
2128 	}
2129 	return len;
2130 }
2131 static struct md_sysfs_entry md_safe_delay =
2132 __ATTR(safe_mode_delay, 0644,safe_delay_show, safe_delay_store);
2133 
2134 static ssize_t
2135 level_show(mddev_t *mddev, char *page)
2136 {
2137 	struct mdk_personality *p = mddev->pers;
2138 	if (p)
2139 		return sprintf(page, "%s\n", p->name);
2140 	else if (mddev->clevel[0])
2141 		return sprintf(page, "%s\n", mddev->clevel);
2142 	else if (mddev->level != LEVEL_NONE)
2143 		return sprintf(page, "%d\n", mddev->level);
2144 	else
2145 		return 0;
2146 }
2147 
2148 static ssize_t
2149 level_store(mddev_t *mddev, const char *buf, size_t len)
2150 {
2151 	int rv = len;
2152 	if (mddev->pers)
2153 		return -EBUSY;
2154 	if (len == 0)
2155 		return 0;
2156 	if (len >= sizeof(mddev->clevel))
2157 		return -ENOSPC;
2158 	strncpy(mddev->clevel, buf, len);
2159 	if (mddev->clevel[len-1] == '\n')
2160 		len--;
2161 	mddev->clevel[len] = 0;
2162 	mddev->level = LEVEL_NONE;
2163 	return rv;
2164 }
2165 
2166 static struct md_sysfs_entry md_level =
2167 __ATTR(level, 0644, level_show, level_store);
2168 
2169 
2170 static ssize_t
2171 layout_show(mddev_t *mddev, char *page)
2172 {
2173 	/* just a number, not meaningful for all levels */
2174 	return sprintf(page, "%d\n", mddev->layout);
2175 }
2176 
2177 static ssize_t
2178 layout_store(mddev_t *mddev, const char *buf, size_t len)
2179 {
2180 	char *e;
2181 	unsigned long n = simple_strtoul(buf, &e, 10);
2182 	if (mddev->pers)
2183 		return -EBUSY;
2184 
2185 	if (!*buf || (*e && *e != '\n'))
2186 		return -EINVAL;
2187 
2188 	mddev->layout = n;
2189 	return len;
2190 }
2191 static struct md_sysfs_entry md_layout =
2192 __ATTR(layout, 0655, layout_show, layout_store);
2193 
2194 
2195 static ssize_t
2196 raid_disks_show(mddev_t *mddev, char *page)
2197 {
2198 	if (mddev->raid_disks == 0)
2199 		return 0;
2200 	return sprintf(page, "%d\n", mddev->raid_disks);
2201 }
2202 
2203 static int update_raid_disks(mddev_t *mddev, int raid_disks);
2204 
2205 static ssize_t
2206 raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
2207 {
2208 	/* can only set raid_disks if array is not yet active */
2209 	char *e;
2210 	int rv = 0;
2211 	unsigned long n = simple_strtoul(buf, &e, 10);
2212 
2213 	if (!*buf || (*e && *e != '\n'))
2214 		return -EINVAL;
2215 
2216 	if (mddev->pers)
2217 		rv = update_raid_disks(mddev, n);
2218 	else
2219 		mddev->raid_disks = n;
2220 	return rv ? rv : len;
2221 }
2222 static struct md_sysfs_entry md_raid_disks =
2223 __ATTR(raid_disks, 0644, raid_disks_show, raid_disks_store);
2224 
2225 static ssize_t
2226 chunk_size_show(mddev_t *mddev, char *page)
2227 {
2228 	return sprintf(page, "%d\n", mddev->chunk_size);
2229 }
2230 
2231 static ssize_t
2232 chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
2233 {
2234 	/* can only set chunk_size if array is not yet active */
2235 	char *e;
2236 	unsigned long n = simple_strtoul(buf, &e, 10);
2237 
2238 	if (mddev->pers)
2239 		return -EBUSY;
2240 	if (!*buf || (*e && *e != '\n'))
2241 		return -EINVAL;
2242 
2243 	mddev->chunk_size = n;
2244 	return len;
2245 }
2246 static struct md_sysfs_entry md_chunk_size =
2247 __ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store);
2248 
2249 static ssize_t
2250 resync_start_show(mddev_t *mddev, char *page)
2251 {
2252 	return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
2253 }
2254 
2255 static ssize_t
2256 resync_start_store(mddev_t *mddev, const char *buf, size_t len)
2257 {
2258 	/* can only set chunk_size if array is not yet active */
2259 	char *e;
2260 	unsigned long long n = simple_strtoull(buf, &e, 10);
2261 
2262 	if (mddev->pers)
2263 		return -EBUSY;
2264 	if (!*buf || (*e && *e != '\n'))
2265 		return -EINVAL;
2266 
2267 	mddev->recovery_cp = n;
2268 	return len;
2269 }
2270 static struct md_sysfs_entry md_resync_start =
2271 __ATTR(resync_start, 0644, resync_start_show, resync_start_store);
2272 
2273 /*
2274  * The array state can be:
2275  *
2276  * clear
2277  *     No devices, no size, no level
2278  *     Equivalent to STOP_ARRAY ioctl
2279  * inactive
2280  *     May have some settings, but array is not active
2281  *        all IO results in error
2282  *     When written, doesn't tear down array, but just stops it
2283  * suspended (not supported yet)
2284  *     All IO requests will block. The array can be reconfigured.
2285  *     Writing this, if accepted, will block until array is quiessent
2286  * readonly
2287  *     no resync can happen.  no superblocks get written.
2288  *     write requests fail
2289  * read-auto
2290  *     like readonly, but behaves like 'clean' on a write request.
2291  *
2292  * clean - no pending writes, but otherwise active.
2293  *     When written to inactive array, starts without resync
2294  *     If a write request arrives then
2295  *       if metadata is known, mark 'dirty' and switch to 'active'.
2296  *       if not known, block and switch to write-pending
2297  *     If written to an active array that has pending writes, then fails.
2298  * active
2299  *     fully active: IO and resync can be happening.
2300  *     When written to inactive array, starts with resync
2301  *
2302  * write-pending
2303  *     clean, but writes are blocked waiting for 'active' to be written.
2304  *
2305  * active-idle
2306  *     like active, but no writes have been seen for a while (100msec).
2307  *
2308  */
2309 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
2310 		   write_pending, active_idle, bad_word};
2311 static char *array_states[] = {
2312 	"clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
2313 	"write-pending", "active-idle", NULL };
2314 
2315 static int match_word(const char *word, char **list)
2316 {
2317 	int n;
2318 	for (n=0; list[n]; n++)
2319 		if (cmd_match(word, list[n]))
2320 			break;
2321 	return n;
2322 }
2323 
2324 static ssize_t
2325 array_state_show(mddev_t *mddev, char *page)
2326 {
2327 	enum array_state st = inactive;
2328 
2329 	if (mddev->pers)
2330 		switch(mddev->ro) {
2331 		case 1:
2332 			st = readonly;
2333 			break;
2334 		case 2:
2335 			st = read_auto;
2336 			break;
2337 		case 0:
2338 			if (mddev->in_sync)
2339 				st = clean;
2340 			else if (mddev->safemode)
2341 				st = active_idle;
2342 			else
2343 				st = active;
2344 		}
2345 	else {
2346 		if (list_empty(&mddev->disks) &&
2347 		    mddev->raid_disks == 0 &&
2348 		    mddev->size == 0)
2349 			st = clear;
2350 		else
2351 			st = inactive;
2352 	}
2353 	return sprintf(page, "%s\n", array_states[st]);
2354 }
2355 
2356 static int do_md_stop(mddev_t * mddev, int ro);
2357 static int do_md_run(mddev_t * mddev);
2358 static int restart_array(mddev_t *mddev);
2359 
2360 static ssize_t
2361 array_state_store(mddev_t *mddev, const char *buf, size_t len)
2362 {
2363 	int err = -EINVAL;
2364 	enum array_state st = match_word(buf, array_states);
2365 	switch(st) {
2366 	case bad_word:
2367 		break;
2368 	case clear:
2369 		/* stopping an active array */
2370 		if (mddev->pers) {
2371 			if (atomic_read(&mddev->active) > 1)
2372 				return -EBUSY;
2373 			err = do_md_stop(mddev, 0);
2374 		}
2375 		break;
2376 	case inactive:
2377 		/* stopping an active array */
2378 		if (mddev->pers) {
2379 			if (atomic_read(&mddev->active) > 1)
2380 				return -EBUSY;
2381 			err = do_md_stop(mddev, 2);
2382 		}
2383 		break;
2384 	case suspended:
2385 		break; /* not supported yet */
2386 	case readonly:
2387 		if (mddev->pers)
2388 			err = do_md_stop(mddev, 1);
2389 		else {
2390 			mddev->ro = 1;
2391 			err = do_md_run(mddev);
2392 		}
2393 		break;
2394 	case read_auto:
2395 		/* stopping an active array */
2396 		if (mddev->pers) {
2397 			err = do_md_stop(mddev, 1);
2398 			if (err == 0)
2399 				mddev->ro = 2; /* FIXME mark devices writable */
2400 		} else {
2401 			mddev->ro = 2;
2402 			err = do_md_run(mddev);
2403 		}
2404 		break;
2405 	case clean:
2406 		if (mddev->pers) {
2407 			restart_array(mddev);
2408 			spin_lock_irq(&mddev->write_lock);
2409 			if (atomic_read(&mddev->writes_pending) == 0) {
2410 				mddev->in_sync = 1;
2411 				mddev->sb_dirty = 1;
2412 			}
2413 			spin_unlock_irq(&mddev->write_lock);
2414 		} else {
2415 			mddev->ro = 0;
2416 			mddev->recovery_cp = MaxSector;
2417 			err = do_md_run(mddev);
2418 		}
2419 		break;
2420 	case active:
2421 		if (mddev->pers) {
2422 			restart_array(mddev);
2423 			mddev->sb_dirty = 0;
2424 			wake_up(&mddev->sb_wait);
2425 			err = 0;
2426 		} else {
2427 			mddev->ro = 0;
2428 			err = do_md_run(mddev);
2429 		}
2430 		break;
2431 	case write_pending:
2432 	case active_idle:
2433 		/* these cannot be set */
2434 		break;
2435 	}
2436 	if (err)
2437 		return err;
2438 	else
2439 		return len;
2440 }
2441 static struct md_sysfs_entry md_array_state = __ATTR(array_state, 0644, array_state_show, array_state_store);
2442 
2443 static ssize_t
2444 null_show(mddev_t *mddev, char *page)
2445 {
2446 	return -EINVAL;
2447 }
2448 
2449 static ssize_t
2450 new_dev_store(mddev_t *mddev, const char *buf, size_t len)
2451 {
2452 	/* buf must be %d:%d\n? giving major and minor numbers */
2453 	/* The new device is added to the array.
2454 	 * If the array has a persistent superblock, we read the
2455 	 * superblock to initialise info and check validity.
2456 	 * Otherwise, only checking done is that in bind_rdev_to_array,
2457 	 * which mainly checks size.
2458 	 */
2459 	char *e;
2460 	int major = simple_strtoul(buf, &e, 10);
2461 	int minor;
2462 	dev_t dev;
2463 	mdk_rdev_t *rdev;
2464 	int err;
2465 
2466 	if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
2467 		return -EINVAL;
2468 	minor = simple_strtoul(e+1, &e, 10);
2469 	if (*e && *e != '\n')
2470 		return -EINVAL;
2471 	dev = MKDEV(major, minor);
2472 	if (major != MAJOR(dev) ||
2473 	    minor != MINOR(dev))
2474 		return -EOVERFLOW;
2475 
2476 
2477 	if (mddev->persistent) {
2478 		rdev = md_import_device(dev, mddev->major_version,
2479 					mddev->minor_version);
2480 		if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
2481 			mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
2482 						       mdk_rdev_t, same_set);
2483 			err = super_types[mddev->major_version]
2484 				.load_super(rdev, rdev0, mddev->minor_version);
2485 			if (err < 0)
2486 				goto out;
2487 		}
2488 	} else
2489 		rdev = md_import_device(dev, -1, -1);
2490 
2491 	if (IS_ERR(rdev))
2492 		return PTR_ERR(rdev);
2493 	err = bind_rdev_to_array(rdev, mddev);
2494  out:
2495 	if (err)
2496 		export_rdev(rdev);
2497 	return err ? err : len;
2498 }
2499 
2500 static struct md_sysfs_entry md_new_device =
2501 __ATTR(new_dev, 0200, null_show, new_dev_store);
2502 
2503 static ssize_t
2504 size_show(mddev_t *mddev, char *page)
2505 {
2506 	return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
2507 }
2508 
2509 static int update_size(mddev_t *mddev, unsigned long size);
2510 
2511 static ssize_t
2512 size_store(mddev_t *mddev, const char *buf, size_t len)
2513 {
2514 	/* If array is inactive, we can reduce the component size, but
2515 	 * not increase it (except from 0).
2516 	 * If array is active, we can try an on-line resize
2517 	 */
2518 	char *e;
2519 	int err = 0;
2520 	unsigned long long size = simple_strtoull(buf, &e, 10);
2521 	if (!*buf || *buf == '\n' ||
2522 	    (*e && *e != '\n'))
2523 		return -EINVAL;
2524 
2525 	if (mddev->pers) {
2526 		err = update_size(mddev, size);
2527 		md_update_sb(mddev);
2528 	} else {
2529 		if (mddev->size == 0 ||
2530 		    mddev->size > size)
2531 			mddev->size = size;
2532 		else
2533 			err = -ENOSPC;
2534 	}
2535 	return err ? err : len;
2536 }
2537 
2538 static struct md_sysfs_entry md_size =
2539 __ATTR(component_size, 0644, size_show, size_store);
2540 
2541 
2542 /* Metdata version.
2543  * This is either 'none' for arrays with externally managed metadata,
2544  * or N.M for internally known formats
2545  */
2546 static ssize_t
2547 metadata_show(mddev_t *mddev, char *page)
2548 {
2549 	if (mddev->persistent)
2550 		return sprintf(page, "%d.%d\n",
2551 			       mddev->major_version, mddev->minor_version);
2552 	else
2553 		return sprintf(page, "none\n");
2554 }
2555 
2556 static ssize_t
2557 metadata_store(mddev_t *mddev, const char *buf, size_t len)
2558 {
2559 	int major, minor;
2560 	char *e;
2561 	if (!list_empty(&mddev->disks))
2562 		return -EBUSY;
2563 
2564 	if (cmd_match(buf, "none")) {
2565 		mddev->persistent = 0;
2566 		mddev->major_version = 0;
2567 		mddev->minor_version = 90;
2568 		return len;
2569 	}
2570 	major = simple_strtoul(buf, &e, 10);
2571 	if (e==buf || *e != '.')
2572 		return -EINVAL;
2573 	buf = e+1;
2574 	minor = simple_strtoul(buf, &e, 10);
2575 	if (e==buf || *e != '\n')
2576 		return -EINVAL;
2577 	if (major >= sizeof(super_types)/sizeof(super_types[0]) ||
2578 	    super_types[major].name == NULL)
2579 		return -ENOENT;
2580 	mddev->major_version = major;
2581 	mddev->minor_version = minor;
2582 	mddev->persistent = 1;
2583 	return len;
2584 }
2585 
2586 static struct md_sysfs_entry md_metadata =
2587 __ATTR(metadata_version, 0644, metadata_show, metadata_store);
2588 
2589 static ssize_t
2590 action_show(mddev_t *mddev, char *page)
2591 {
2592 	char *type = "idle";
2593 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
2594 	    test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) {
2595 		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2596 			type = "reshape";
2597 		else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2598 			if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
2599 				type = "resync";
2600 			else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
2601 				type = "check";
2602 			else
2603 				type = "repair";
2604 		} else
2605 			type = "recover";
2606 	}
2607 	return sprintf(page, "%s\n", type);
2608 }
2609 
2610 static ssize_t
2611 action_store(mddev_t *mddev, const char *page, size_t len)
2612 {
2613 	if (!mddev->pers || !mddev->pers->sync_request)
2614 		return -EINVAL;
2615 
2616 	if (cmd_match(page, "idle")) {
2617 		if (mddev->sync_thread) {
2618 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2619 			md_unregister_thread(mddev->sync_thread);
2620 			mddev->sync_thread = NULL;
2621 			mddev->recovery = 0;
2622 		}
2623 	} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
2624 		   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
2625 		return -EBUSY;
2626 	else if (cmd_match(page, "resync") || cmd_match(page, "recover"))
2627 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2628 	else if (cmd_match(page, "reshape")) {
2629 		int err;
2630 		if (mddev->pers->start_reshape == NULL)
2631 			return -EINVAL;
2632 		err = mddev->pers->start_reshape(mddev);
2633 		if (err)
2634 			return err;
2635 	} else {
2636 		if (cmd_match(page, "check"))
2637 			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
2638 		else if (!cmd_match(page, "repair"))
2639 			return -EINVAL;
2640 		set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
2641 		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
2642 	}
2643 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2644 	md_wakeup_thread(mddev->thread);
2645 	return len;
2646 }
2647 
2648 static ssize_t
2649 mismatch_cnt_show(mddev_t *mddev, char *page)
2650 {
2651 	return sprintf(page, "%llu\n",
2652 		       (unsigned long long) mddev->resync_mismatches);
2653 }
2654 
2655 static struct md_sysfs_entry
2656 md_scan_mode = __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
2657 
2658 
2659 static struct md_sysfs_entry
2660 md_mismatches = __ATTR_RO(mismatch_cnt);
2661 
2662 static ssize_t
2663 sync_min_show(mddev_t *mddev, char *page)
2664 {
2665 	return sprintf(page, "%d (%s)\n", speed_min(mddev),
2666 		       mddev->sync_speed_min ? "local": "system");
2667 }
2668 
2669 static ssize_t
2670 sync_min_store(mddev_t *mddev, const char *buf, size_t len)
2671 {
2672 	int min;
2673 	char *e;
2674 	if (strncmp(buf, "system", 6)==0) {
2675 		mddev->sync_speed_min = 0;
2676 		return len;
2677 	}
2678 	min = simple_strtoul(buf, &e, 10);
2679 	if (buf == e || (*e && *e != '\n') || min <= 0)
2680 		return -EINVAL;
2681 	mddev->sync_speed_min = min;
2682 	return len;
2683 }
2684 
2685 static struct md_sysfs_entry md_sync_min =
2686 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
2687 
2688 static ssize_t
2689 sync_max_show(mddev_t *mddev, char *page)
2690 {
2691 	return sprintf(page, "%d (%s)\n", speed_max(mddev),
2692 		       mddev->sync_speed_max ? "local": "system");
2693 }
2694 
2695 static ssize_t
2696 sync_max_store(mddev_t *mddev, const char *buf, size_t len)
2697 {
2698 	int max;
2699 	char *e;
2700 	if (strncmp(buf, "system", 6)==0) {
2701 		mddev->sync_speed_max = 0;
2702 		return len;
2703 	}
2704 	max = simple_strtoul(buf, &e, 10);
2705 	if (buf == e || (*e && *e != '\n') || max <= 0)
2706 		return -EINVAL;
2707 	mddev->sync_speed_max = max;
2708 	return len;
2709 }
2710 
2711 static struct md_sysfs_entry md_sync_max =
2712 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
2713 
2714 
2715 static ssize_t
2716 sync_speed_show(mddev_t *mddev, char *page)
2717 {
2718 	unsigned long resync, dt, db;
2719 	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
2720 	dt = ((jiffies - mddev->resync_mark) / HZ);
2721 	if (!dt) dt++;
2722 	db = resync - (mddev->resync_mark_cnt);
2723 	return sprintf(page, "%ld\n", db/dt/2); /* K/sec */
2724 }
2725 
2726 static struct md_sysfs_entry
2727 md_sync_speed = __ATTR_RO(sync_speed);
2728 
2729 static ssize_t
2730 sync_completed_show(mddev_t *mddev, char *page)
2731 {
2732 	unsigned long max_blocks, resync;
2733 
2734 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2735 		max_blocks = mddev->resync_max_sectors;
2736 	else
2737 		max_blocks = mddev->size << 1;
2738 
2739 	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
2740 	return sprintf(page, "%lu / %lu\n", resync, max_blocks);
2741 }
2742 
2743 static struct md_sysfs_entry
2744 md_sync_completed = __ATTR_RO(sync_completed);
2745 
2746 static ssize_t
2747 suspend_lo_show(mddev_t *mddev, char *page)
2748 {
2749 	return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
2750 }
2751 
2752 static ssize_t
2753 suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
2754 {
2755 	char *e;
2756 	unsigned long long new = simple_strtoull(buf, &e, 10);
2757 
2758 	if (mddev->pers->quiesce == NULL)
2759 		return -EINVAL;
2760 	if (buf == e || (*e && *e != '\n'))
2761 		return -EINVAL;
2762 	if (new >= mddev->suspend_hi ||
2763 	    (new > mddev->suspend_lo && new < mddev->suspend_hi)) {
2764 		mddev->suspend_lo = new;
2765 		mddev->pers->quiesce(mddev, 2);
2766 		return len;
2767 	} else
2768 		return -EINVAL;
2769 }
2770 static struct md_sysfs_entry md_suspend_lo =
2771 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
2772 
2773 
2774 static ssize_t
2775 suspend_hi_show(mddev_t *mddev, char *page)
2776 {
2777 	return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
2778 }
2779 
2780 static ssize_t
2781 suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
2782 {
2783 	char *e;
2784 	unsigned long long new = simple_strtoull(buf, &e, 10);
2785 
2786 	if (mddev->pers->quiesce == NULL)
2787 		return -EINVAL;
2788 	if (buf == e || (*e && *e != '\n'))
2789 		return -EINVAL;
2790 	if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) ||
2791 	    (new > mddev->suspend_lo && new > mddev->suspend_hi)) {
2792 		mddev->suspend_hi = new;
2793 		mddev->pers->quiesce(mddev, 1);
2794 		mddev->pers->quiesce(mddev, 0);
2795 		return len;
2796 	} else
2797 		return -EINVAL;
2798 }
2799 static struct md_sysfs_entry md_suspend_hi =
2800 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
2801 
2802 
2803 static struct attribute *md_default_attrs[] = {
2804 	&md_level.attr,
2805 	&md_layout.attr,
2806 	&md_raid_disks.attr,
2807 	&md_chunk_size.attr,
2808 	&md_size.attr,
2809 	&md_resync_start.attr,
2810 	&md_metadata.attr,
2811 	&md_new_device.attr,
2812 	&md_safe_delay.attr,
2813 	&md_array_state.attr,
2814 	NULL,
2815 };
2816 
2817 static struct attribute *md_redundancy_attrs[] = {
2818 	&md_scan_mode.attr,
2819 	&md_mismatches.attr,
2820 	&md_sync_min.attr,
2821 	&md_sync_max.attr,
2822 	&md_sync_speed.attr,
2823 	&md_sync_completed.attr,
2824 	&md_suspend_lo.attr,
2825 	&md_suspend_hi.attr,
2826 	NULL,
2827 };
2828 static struct attribute_group md_redundancy_group = {
2829 	.name = NULL,
2830 	.attrs = md_redundancy_attrs,
2831 };
2832 
2833 
2834 static ssize_t
2835 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
2836 {
2837 	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
2838 	mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
2839 	ssize_t rv;
2840 
2841 	if (!entry->show)
2842 		return -EIO;
2843 	rv = mddev_lock(mddev);
2844 	if (!rv) {
2845 		rv = entry->show(mddev, page);
2846 		mddev_unlock(mddev);
2847 	}
2848 	return rv;
2849 }
2850 
2851 static ssize_t
2852 md_attr_store(struct kobject *kobj, struct attribute *attr,
2853 	      const char *page, size_t length)
2854 {
2855 	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
2856 	mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
2857 	ssize_t rv;
2858 
2859 	if (!entry->store)
2860 		return -EIO;
2861 	rv = mddev_lock(mddev);
2862 	if (!rv) {
2863 		rv = entry->store(mddev, page, length);
2864 		mddev_unlock(mddev);
2865 	}
2866 	return rv;
2867 }
2868 
2869 static void md_free(struct kobject *ko)
2870 {
2871 	mddev_t *mddev = container_of(ko, mddev_t, kobj);
2872 	kfree(mddev);
2873 }
2874 
2875 static struct sysfs_ops md_sysfs_ops = {
2876 	.show	= md_attr_show,
2877 	.store	= md_attr_store,
2878 };
2879 static struct kobj_type md_ktype = {
2880 	.release	= md_free,
2881 	.sysfs_ops	= &md_sysfs_ops,
2882 	.default_attrs	= md_default_attrs,
2883 };
2884 
2885 int mdp_major = 0;
2886 
2887 static struct kobject *md_probe(dev_t dev, int *part, void *data)
2888 {
2889 	static DEFINE_MUTEX(disks_mutex);
2890 	mddev_t *mddev = mddev_find(dev);
2891 	struct gendisk *disk;
2892 	int partitioned = (MAJOR(dev) != MD_MAJOR);
2893 	int shift = partitioned ? MdpMinorShift : 0;
2894 	int unit = MINOR(dev) >> shift;
2895 
2896 	if (!mddev)
2897 		return NULL;
2898 
2899 	mutex_lock(&disks_mutex);
2900 	if (mddev->gendisk) {
2901 		mutex_unlock(&disks_mutex);
2902 		mddev_put(mddev);
2903 		return NULL;
2904 	}
2905 	disk = alloc_disk(1 << shift);
2906 	if (!disk) {
2907 		mutex_unlock(&disks_mutex);
2908 		mddev_put(mddev);
2909 		return NULL;
2910 	}
2911 	disk->major = MAJOR(dev);
2912 	disk->first_minor = unit << shift;
2913 	if (partitioned)
2914 		sprintf(disk->disk_name, "md_d%d", unit);
2915 	else
2916 		sprintf(disk->disk_name, "md%d", unit);
2917 	disk->fops = &md_fops;
2918 	disk->private_data = mddev;
2919 	disk->queue = mddev->queue;
2920 	add_disk(disk);
2921 	mddev->gendisk = disk;
2922 	mutex_unlock(&disks_mutex);
2923 	mddev->kobj.parent = &disk->kobj;
2924 	mddev->kobj.k_name = NULL;
2925 	snprintf(mddev->kobj.name, KOBJ_NAME_LEN, "%s", "md");
2926 	mddev->kobj.ktype = &md_ktype;
2927 	kobject_register(&mddev->kobj);
2928 	return NULL;
2929 }
2930 
2931 static void md_safemode_timeout(unsigned long data)
2932 {
2933 	mddev_t *mddev = (mddev_t *) data;
2934 
2935 	mddev->safemode = 1;
2936 	md_wakeup_thread(mddev->thread);
2937 }
2938 
2939 static int start_dirty_degraded;
2940 
2941 static int do_md_run(mddev_t * mddev)
2942 {
2943 	int err;
2944 	int chunk_size;
2945 	struct list_head *tmp;
2946 	mdk_rdev_t *rdev;
2947 	struct gendisk *disk;
2948 	struct mdk_personality *pers;
2949 	char b[BDEVNAME_SIZE];
2950 
2951 	if (list_empty(&mddev->disks))
2952 		/* cannot run an array with no devices.. */
2953 		return -EINVAL;
2954 
2955 	if (mddev->pers)
2956 		return -EBUSY;
2957 
2958 	/*
2959 	 * Analyze all RAID superblock(s)
2960 	 */
2961 	if (!mddev->raid_disks)
2962 		analyze_sbs(mddev);
2963 
2964 	chunk_size = mddev->chunk_size;
2965 
2966 	if (chunk_size) {
2967 		if (chunk_size > MAX_CHUNK_SIZE) {
2968 			printk(KERN_ERR "too big chunk_size: %d > %d\n",
2969 				chunk_size, MAX_CHUNK_SIZE);
2970 			return -EINVAL;
2971 		}
2972 		/*
2973 		 * chunk-size has to be a power of 2 and multiples of PAGE_SIZE
2974 		 */
2975 		if ( (1 << ffz(~chunk_size)) != chunk_size) {
2976 			printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size);
2977 			return -EINVAL;
2978 		}
2979 		if (chunk_size < PAGE_SIZE) {
2980 			printk(KERN_ERR "too small chunk_size: %d < %ld\n",
2981 				chunk_size, PAGE_SIZE);
2982 			return -EINVAL;
2983 		}
2984 
2985 		/* devices must have minimum size of one chunk */
2986 		ITERATE_RDEV(mddev,rdev,tmp) {
2987 			if (test_bit(Faulty, &rdev->flags))
2988 				continue;
2989 			if (rdev->size < chunk_size / 1024) {
2990 				printk(KERN_WARNING
2991 					"md: Dev %s smaller than chunk_size:"
2992 					" %lluk < %dk\n",
2993 					bdevname(rdev->bdev,b),
2994 					(unsigned long long)rdev->size,
2995 					chunk_size / 1024);
2996 				return -EINVAL;
2997 			}
2998 		}
2999 	}
3000 
3001 #ifdef CONFIG_KMOD
3002 	if (mddev->level != LEVEL_NONE)
3003 		request_module("md-level-%d", mddev->level);
3004 	else if (mddev->clevel[0])
3005 		request_module("md-%s", mddev->clevel);
3006 #endif
3007 
3008 	/*
3009 	 * Drop all container device buffers, from now on
3010 	 * the only valid external interface is through the md
3011 	 * device.
3012 	 * Also find largest hardsector size
3013 	 */
3014 	ITERATE_RDEV(mddev,rdev,tmp) {
3015 		if (test_bit(Faulty, &rdev->flags))
3016 			continue;
3017 		sync_blockdev(rdev->bdev);
3018 		invalidate_bdev(rdev->bdev, 0);
3019 	}
3020 
3021 	md_probe(mddev->unit, NULL, NULL);
3022 	disk = mddev->gendisk;
3023 	if (!disk)
3024 		return -ENOMEM;
3025 
3026 	spin_lock(&pers_lock);
3027 	pers = find_pers(mddev->level, mddev->clevel);
3028 	if (!pers || !try_module_get(pers->owner)) {
3029 		spin_unlock(&pers_lock);
3030 		if (mddev->level != LEVEL_NONE)
3031 			printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
3032 			       mddev->level);
3033 		else
3034 			printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
3035 			       mddev->clevel);
3036 		return -EINVAL;
3037 	}
3038 	mddev->pers = pers;
3039 	spin_unlock(&pers_lock);
3040 	mddev->level = pers->level;
3041 	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
3042 
3043 	if (mddev->reshape_position != MaxSector &&
3044 	    pers->start_reshape == NULL) {
3045 		/* This personality cannot handle reshaping... */
3046 		mddev->pers = NULL;
3047 		module_put(pers->owner);
3048 		return -EINVAL;
3049 	}
3050 
3051 	mddev->recovery = 0;
3052 	mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
3053 	mddev->barriers_work = 1;
3054 	mddev->ok_start_degraded = start_dirty_degraded;
3055 
3056 	if (start_readonly)
3057 		mddev->ro = 2; /* read-only, but switch on first write */
3058 
3059 	err = mddev->pers->run(mddev);
3060 	if (!err && mddev->pers->sync_request) {
3061 		err = bitmap_create(mddev);
3062 		if (err) {
3063 			printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
3064 			       mdname(mddev), err);
3065 			mddev->pers->stop(mddev);
3066 		}
3067 	}
3068 	if (err) {
3069 		printk(KERN_ERR "md: pers->run() failed ...\n");
3070 		module_put(mddev->pers->owner);
3071 		mddev->pers = NULL;
3072 		bitmap_destroy(mddev);
3073 		return err;
3074 	}
3075 	if (mddev->pers->sync_request)
3076 		sysfs_create_group(&mddev->kobj, &md_redundancy_group);
3077 	else if (mddev->ro == 2) /* auto-readonly not meaningful */
3078 		mddev->ro = 0;
3079 
3080  	atomic_set(&mddev->writes_pending,0);
3081 	mddev->safemode = 0;
3082 	mddev->safemode_timer.function = md_safemode_timeout;
3083 	mddev->safemode_timer.data = (unsigned long) mddev;
3084 	mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
3085 	mddev->in_sync = 1;
3086 
3087 	ITERATE_RDEV(mddev,rdev,tmp)
3088 		if (rdev->raid_disk >= 0) {
3089 			char nm[20];
3090 			sprintf(nm, "rd%d", rdev->raid_disk);
3091 			sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
3092 		}
3093 
3094 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3095 	md_wakeup_thread(mddev->thread);
3096 
3097 	if (mddev->sb_dirty)
3098 		md_update_sb(mddev);
3099 
3100 	set_capacity(disk, mddev->array_size<<1);
3101 
3102 	/* If we call blk_queue_make_request here, it will
3103 	 * re-initialise max_sectors etc which may have been
3104 	 * refined inside -> run.  So just set the bits we need to set.
3105 	 * Most initialisation happended when we called
3106 	 * blk_queue_make_request(..., md_fail_request)
3107 	 * earlier.
3108 	 */
3109 	mddev->queue->queuedata = mddev;
3110 	mddev->queue->make_request_fn = mddev->pers->make_request;
3111 
3112 	/* If there is a partially-recovered drive we need to
3113 	 * start recovery here.  If we leave it to md_check_recovery,
3114 	 * it will remove the drives and not do the right thing
3115 	 */
3116 	if (mddev->degraded) {
3117 		struct list_head *rtmp;
3118 		int spares = 0;
3119 		ITERATE_RDEV(mddev,rdev,rtmp)
3120 			if (rdev->raid_disk >= 0 &&
3121 			    !test_bit(In_sync, &rdev->flags) &&
3122 			    !test_bit(Faulty, &rdev->flags))
3123 				/* complete an interrupted recovery */
3124 				spares++;
3125 		if (spares && mddev->pers->sync_request) {
3126 			mddev->recovery = 0;
3127 			set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3128 			mddev->sync_thread = md_register_thread(md_do_sync,
3129 								mddev,
3130 								"%s_resync");
3131 			if (!mddev->sync_thread) {
3132 				printk(KERN_ERR "%s: could not start resync"
3133 				       " thread...\n",
3134 				       mdname(mddev));
3135 				/* leave the spares where they are, it shouldn't hurt */
3136 				mddev->recovery = 0;
3137 			} else
3138 				md_wakeup_thread(mddev->sync_thread);
3139 		}
3140 	}
3141 
3142 	mddev->changed = 1;
3143 	md_new_event(mddev);
3144 	return 0;
3145 }
3146 
3147 static int restart_array(mddev_t *mddev)
3148 {
3149 	struct gendisk *disk = mddev->gendisk;
3150 	int err;
3151 
3152 	/*
3153 	 * Complain if it has no devices
3154 	 */
3155 	err = -ENXIO;
3156 	if (list_empty(&mddev->disks))
3157 		goto out;
3158 
3159 	if (mddev->pers) {
3160 		err = -EBUSY;
3161 		if (!mddev->ro)
3162 			goto out;
3163 
3164 		mddev->safemode = 0;
3165 		mddev->ro = 0;
3166 		set_disk_ro(disk, 0);
3167 
3168 		printk(KERN_INFO "md: %s switched to read-write mode.\n",
3169 			mdname(mddev));
3170 		/*
3171 		 * Kick recovery or resync if necessary
3172 		 */
3173 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3174 		md_wakeup_thread(mddev->thread);
3175 		md_wakeup_thread(mddev->sync_thread);
3176 		err = 0;
3177 	} else
3178 		err = -EINVAL;
3179 
3180 out:
3181 	return err;
3182 }
3183 
3184 /* similar to deny_write_access, but accounts for our holding a reference
3185  * to the file ourselves */
3186 static int deny_bitmap_write_access(struct file * file)
3187 {
3188 	struct inode *inode = file->f_mapping->host;
3189 
3190 	spin_lock(&inode->i_lock);
3191 	if (atomic_read(&inode->i_writecount) > 1) {
3192 		spin_unlock(&inode->i_lock);
3193 		return -ETXTBSY;
3194 	}
3195 	atomic_set(&inode->i_writecount, -1);
3196 	spin_unlock(&inode->i_lock);
3197 
3198 	return 0;
3199 }
3200 
3201 static void restore_bitmap_write_access(struct file *file)
3202 {
3203 	struct inode *inode = file->f_mapping->host;
3204 
3205 	spin_lock(&inode->i_lock);
3206 	atomic_set(&inode->i_writecount, 1);
3207 	spin_unlock(&inode->i_lock);
3208 }
3209 
3210 /* mode:
3211  *   0 - completely stop and dis-assemble array
3212  *   1 - switch to readonly
3213  *   2 - stop but do not disassemble array
3214  */
3215 static int do_md_stop(mddev_t * mddev, int mode)
3216 {
3217 	int err = 0;
3218 	struct gendisk *disk = mddev->gendisk;
3219 
3220 	if (mddev->pers) {
3221 		if (atomic_read(&mddev->active)>2) {
3222 			printk("md: %s still in use.\n",mdname(mddev));
3223 			return -EBUSY;
3224 		}
3225 
3226 		if (mddev->sync_thread) {
3227 			set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3228 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
3229 			md_unregister_thread(mddev->sync_thread);
3230 			mddev->sync_thread = NULL;
3231 		}
3232 
3233 		del_timer_sync(&mddev->safemode_timer);
3234 
3235 		invalidate_partition(disk, 0);
3236 
3237 		switch(mode) {
3238 		case 1: /* readonly */
3239 			err  = -ENXIO;
3240 			if (mddev->ro==1)
3241 				goto out;
3242 			mddev->ro = 1;
3243 			break;
3244 		case 0: /* disassemble */
3245 		case 2: /* stop */
3246 			bitmap_flush(mddev);
3247 			md_super_wait(mddev);
3248 			if (mddev->ro)
3249 				set_disk_ro(disk, 0);
3250 			blk_queue_make_request(mddev->queue, md_fail_request);
3251 			mddev->pers->stop(mddev);
3252 			if (mddev->pers->sync_request)
3253 				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
3254 
3255 			module_put(mddev->pers->owner);
3256 			mddev->pers = NULL;
3257 			if (mddev->ro)
3258 				mddev->ro = 0;
3259 		}
3260 		if (!mddev->in_sync || mddev->sb_dirty) {
3261 			/* mark array as shutdown cleanly */
3262 			mddev->in_sync = 1;
3263 			md_update_sb(mddev);
3264 		}
3265 		if (mode == 1)
3266 			set_disk_ro(disk, 1);
3267 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3268 	}
3269 
3270 	/*
3271 	 * Free resources if final stop
3272 	 */
3273 	if (mode == 0) {
3274 		mdk_rdev_t *rdev;
3275 		struct list_head *tmp;
3276 		struct gendisk *disk;
3277 		printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
3278 
3279 		bitmap_destroy(mddev);
3280 		if (mddev->bitmap_file) {
3281 			restore_bitmap_write_access(mddev->bitmap_file);
3282 			fput(mddev->bitmap_file);
3283 			mddev->bitmap_file = NULL;
3284 		}
3285 		mddev->bitmap_offset = 0;
3286 
3287 		ITERATE_RDEV(mddev,rdev,tmp)
3288 			if (rdev->raid_disk >= 0) {
3289 				char nm[20];
3290 				sprintf(nm, "rd%d", rdev->raid_disk);
3291 				sysfs_remove_link(&mddev->kobj, nm);
3292 			}
3293 
3294 		export_array(mddev);
3295 
3296 		mddev->array_size = 0;
3297 		mddev->size = 0;
3298 		mddev->raid_disks = 0;
3299 		mddev->recovery_cp = 0;
3300 
3301 		disk = mddev->gendisk;
3302 		if (disk)
3303 			set_capacity(disk, 0);
3304 		mddev->changed = 1;
3305 	} else if (mddev->pers)
3306 		printk(KERN_INFO "md: %s switched to read-only mode.\n",
3307 			mdname(mddev));
3308 	err = 0;
3309 	md_new_event(mddev);
3310 out:
3311 	return err;
3312 }
3313 
3314 static void autorun_array(mddev_t *mddev)
3315 {
3316 	mdk_rdev_t *rdev;
3317 	struct list_head *tmp;
3318 	int err;
3319 
3320 	if (list_empty(&mddev->disks))
3321 		return;
3322 
3323 	printk(KERN_INFO "md: running: ");
3324 
3325 	ITERATE_RDEV(mddev,rdev,tmp) {
3326 		char b[BDEVNAME_SIZE];
3327 		printk("<%s>", bdevname(rdev->bdev,b));
3328 	}
3329 	printk("\n");
3330 
3331 	err = do_md_run (mddev);
3332 	if (err) {
3333 		printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
3334 		do_md_stop (mddev, 0);
3335 	}
3336 }
3337 
3338 /*
3339  * lets try to run arrays based on all disks that have arrived
3340  * until now. (those are in pending_raid_disks)
3341  *
3342  * the method: pick the first pending disk, collect all disks with
3343  * the same UUID, remove all from the pending list and put them into
3344  * the 'same_array' list. Then order this list based on superblock
3345  * update time (freshest comes first), kick out 'old' disks and
3346  * compare superblocks. If everything's fine then run it.
3347  *
3348  * If "unit" is allocated, then bump its reference count
3349  */
3350 static void autorun_devices(int part)
3351 {
3352 	struct list_head *tmp;
3353 	mdk_rdev_t *rdev0, *rdev;
3354 	mddev_t *mddev;
3355 	char b[BDEVNAME_SIZE];
3356 
3357 	printk(KERN_INFO "md: autorun ...\n");
3358 	while (!list_empty(&pending_raid_disks)) {
3359 		dev_t dev;
3360 		LIST_HEAD(candidates);
3361 		rdev0 = list_entry(pending_raid_disks.next,
3362 					 mdk_rdev_t, same_set);
3363 
3364 		printk(KERN_INFO "md: considering %s ...\n",
3365 			bdevname(rdev0->bdev,b));
3366 		INIT_LIST_HEAD(&candidates);
3367 		ITERATE_RDEV_PENDING(rdev,tmp)
3368 			if (super_90_load(rdev, rdev0, 0) >= 0) {
3369 				printk(KERN_INFO "md:  adding %s ...\n",
3370 					bdevname(rdev->bdev,b));
3371 				list_move(&rdev->same_set, &candidates);
3372 			}
3373 		/*
3374 		 * now we have a set of devices, with all of them having
3375 		 * mostly sane superblocks. It's time to allocate the
3376 		 * mddev.
3377 		 */
3378 		if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) {
3379 			printk(KERN_INFO "md: unit number in %s is bad: %d\n",
3380 			       bdevname(rdev0->bdev, b), rdev0->preferred_minor);
3381 			break;
3382 		}
3383 		if (part)
3384 			dev = MKDEV(mdp_major,
3385 				    rdev0->preferred_minor << MdpMinorShift);
3386 		else
3387 			dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
3388 
3389 		md_probe(dev, NULL, NULL);
3390 		mddev = mddev_find(dev);
3391 		if (!mddev) {
3392 			printk(KERN_ERR
3393 				"md: cannot allocate memory for md drive.\n");
3394 			break;
3395 		}
3396 		if (mddev_lock(mddev))
3397 			printk(KERN_WARNING "md: %s locked, cannot run\n",
3398 			       mdname(mddev));
3399 		else if (mddev->raid_disks || mddev->major_version
3400 			 || !list_empty(&mddev->disks)) {
3401 			printk(KERN_WARNING
3402 				"md: %s already running, cannot run %s\n",
3403 				mdname(mddev), bdevname(rdev0->bdev,b));
3404 			mddev_unlock(mddev);
3405 		} else {
3406 			printk(KERN_INFO "md: created %s\n", mdname(mddev));
3407 			ITERATE_RDEV_GENERIC(candidates,rdev,tmp) {
3408 				list_del_init(&rdev->same_set);
3409 				if (bind_rdev_to_array(rdev, mddev))
3410 					export_rdev(rdev);
3411 			}
3412 			autorun_array(mddev);
3413 			mddev_unlock(mddev);
3414 		}
3415 		/* on success, candidates will be empty, on error
3416 		 * it won't...
3417 		 */
3418 		ITERATE_RDEV_GENERIC(candidates,rdev,tmp)
3419 			export_rdev(rdev);
3420 		mddev_put(mddev);
3421 	}
3422 	printk(KERN_INFO "md: ... autorun DONE.\n");
3423 }
3424 
3425 /*
3426  * import RAID devices based on one partition
3427  * if possible, the array gets run as well.
3428  */
3429 
3430 static int autostart_array(dev_t startdev)
3431 {
3432 	char b[BDEVNAME_SIZE];
3433 	int err = -EINVAL, i;
3434 	mdp_super_t *sb = NULL;
3435 	mdk_rdev_t *start_rdev = NULL, *rdev;
3436 
3437 	start_rdev = md_import_device(startdev, 0, 0);
3438 	if (IS_ERR(start_rdev))
3439 		return err;
3440 
3441 
3442 	/* NOTE: this can only work for 0.90.0 superblocks */
3443 	sb = (mdp_super_t*)page_address(start_rdev->sb_page);
3444 	if (sb->major_version != 0 ||
3445 	    sb->minor_version != 90 ) {
3446 		printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n");
3447 		export_rdev(start_rdev);
3448 		return err;
3449 	}
3450 
3451 	if (test_bit(Faulty, &start_rdev->flags)) {
3452 		printk(KERN_WARNING
3453 			"md: can not autostart based on faulty %s!\n",
3454 			bdevname(start_rdev->bdev,b));
3455 		export_rdev(start_rdev);
3456 		return err;
3457 	}
3458 	list_add(&start_rdev->same_set, &pending_raid_disks);
3459 
3460 	for (i = 0; i < MD_SB_DISKS; i++) {
3461 		mdp_disk_t *desc = sb->disks + i;
3462 		dev_t dev = MKDEV(desc->major, desc->minor);
3463 
3464 		if (!dev)
3465 			continue;
3466 		if (dev == startdev)
3467 			continue;
3468 		if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor)
3469 			continue;
3470 		rdev = md_import_device(dev, 0, 0);
3471 		if (IS_ERR(rdev))
3472 			continue;
3473 
3474 		list_add(&rdev->same_set, &pending_raid_disks);
3475 	}
3476 
3477 	/*
3478 	 * possibly return codes
3479 	 */
3480 	autorun_devices(0);
3481 	return 0;
3482 
3483 }
3484 
3485 
3486 static int get_version(void __user * arg)
3487 {
3488 	mdu_version_t ver;
3489 
3490 	ver.major = MD_MAJOR_VERSION;
3491 	ver.minor = MD_MINOR_VERSION;
3492 	ver.patchlevel = MD_PATCHLEVEL_VERSION;
3493 
3494 	if (copy_to_user(arg, &ver, sizeof(ver)))
3495 		return -EFAULT;
3496 
3497 	return 0;
3498 }
3499 
3500 static int get_array_info(mddev_t * mddev, void __user * arg)
3501 {
3502 	mdu_array_info_t info;
3503 	int nr,working,active,failed,spare;
3504 	mdk_rdev_t *rdev;
3505 	struct list_head *tmp;
3506 
3507 	nr=working=active=failed=spare=0;
3508 	ITERATE_RDEV(mddev,rdev,tmp) {
3509 		nr++;
3510 		if (test_bit(Faulty, &rdev->flags))
3511 			failed++;
3512 		else {
3513 			working++;
3514 			if (test_bit(In_sync, &rdev->flags))
3515 				active++;
3516 			else
3517 				spare++;
3518 		}
3519 	}
3520 
3521 	info.major_version = mddev->major_version;
3522 	info.minor_version = mddev->minor_version;
3523 	info.patch_version = MD_PATCHLEVEL_VERSION;
3524 	info.ctime         = mddev->ctime;
3525 	info.level         = mddev->level;
3526 	info.size          = mddev->size;
3527 	if (info.size != mddev->size) /* overflow */
3528 		info.size = -1;
3529 	info.nr_disks      = nr;
3530 	info.raid_disks    = mddev->raid_disks;
3531 	info.md_minor      = mddev->md_minor;
3532 	info.not_persistent= !mddev->persistent;
3533 
3534 	info.utime         = mddev->utime;
3535 	info.state         = 0;
3536 	if (mddev->in_sync)
3537 		info.state = (1<<MD_SB_CLEAN);
3538 	if (mddev->bitmap && mddev->bitmap_offset)
3539 		info.state = (1<<MD_SB_BITMAP_PRESENT);
3540 	info.active_disks  = active;
3541 	info.working_disks = working;
3542 	info.failed_disks  = failed;
3543 	info.spare_disks   = spare;
3544 
3545 	info.layout        = mddev->layout;
3546 	info.chunk_size    = mddev->chunk_size;
3547 
3548 	if (copy_to_user(arg, &info, sizeof(info)))
3549 		return -EFAULT;
3550 
3551 	return 0;
3552 }
3553 
3554 static int get_bitmap_file(mddev_t * mddev, void __user * arg)
3555 {
3556 	mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
3557 	char *ptr, *buf = NULL;
3558 	int err = -ENOMEM;
3559 
3560 	file = kmalloc(sizeof(*file), GFP_KERNEL);
3561 	if (!file)
3562 		goto out;
3563 
3564 	/* bitmap disabled, zero the first byte and copy out */
3565 	if (!mddev->bitmap || !mddev->bitmap->file) {
3566 		file->pathname[0] = '\0';
3567 		goto copy_out;
3568 	}
3569 
3570 	buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
3571 	if (!buf)
3572 		goto out;
3573 
3574 	ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname));
3575 	if (!ptr)
3576 		goto out;
3577 
3578 	strcpy(file->pathname, ptr);
3579 
3580 copy_out:
3581 	err = 0;
3582 	if (copy_to_user(arg, file, sizeof(*file)))
3583 		err = -EFAULT;
3584 out:
3585 	kfree(buf);
3586 	kfree(file);
3587 	return err;
3588 }
3589 
3590 static int get_disk_info(mddev_t * mddev, void __user * arg)
3591 {
3592 	mdu_disk_info_t info;
3593 	unsigned int nr;
3594 	mdk_rdev_t *rdev;
3595 
3596 	if (copy_from_user(&info, arg, sizeof(info)))
3597 		return -EFAULT;
3598 
3599 	nr = info.number;
3600 
3601 	rdev = find_rdev_nr(mddev, nr);
3602 	if (rdev) {
3603 		info.major = MAJOR(rdev->bdev->bd_dev);
3604 		info.minor = MINOR(rdev->bdev->bd_dev);
3605 		info.raid_disk = rdev->raid_disk;
3606 		info.state = 0;
3607 		if (test_bit(Faulty, &rdev->flags))
3608 			info.state |= (1<<MD_DISK_FAULTY);
3609 		else if (test_bit(In_sync, &rdev->flags)) {
3610 			info.state |= (1<<MD_DISK_ACTIVE);
3611 			info.state |= (1<<MD_DISK_SYNC);
3612 		}
3613 		if (test_bit(WriteMostly, &rdev->flags))
3614 			info.state |= (1<<MD_DISK_WRITEMOSTLY);
3615 	} else {
3616 		info.major = info.minor = 0;
3617 		info.raid_disk = -1;
3618 		info.state = (1<<MD_DISK_REMOVED);
3619 	}
3620 
3621 	if (copy_to_user(arg, &info, sizeof(info)))
3622 		return -EFAULT;
3623 
3624 	return 0;
3625 }
3626 
3627 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
3628 {
3629 	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
3630 	mdk_rdev_t *rdev;
3631 	dev_t dev = MKDEV(info->major,info->minor);
3632 
3633 	if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
3634 		return -EOVERFLOW;
3635 
3636 	if (!mddev->raid_disks) {
3637 		int err;
3638 		/* expecting a device which has a superblock */
3639 		rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
3640 		if (IS_ERR(rdev)) {
3641 			printk(KERN_WARNING
3642 				"md: md_import_device returned %ld\n",
3643 				PTR_ERR(rdev));
3644 			return PTR_ERR(rdev);
3645 		}
3646 		if (!list_empty(&mddev->disks)) {
3647 			mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
3648 							mdk_rdev_t, same_set);
3649 			int err = super_types[mddev->major_version]
3650 				.load_super(rdev, rdev0, mddev->minor_version);
3651 			if (err < 0) {
3652 				printk(KERN_WARNING
3653 					"md: %s has different UUID to %s\n",
3654 					bdevname(rdev->bdev,b),
3655 					bdevname(rdev0->bdev,b2));
3656 				export_rdev(rdev);
3657 				return -EINVAL;
3658 			}
3659 		}
3660 		err = bind_rdev_to_array(rdev, mddev);
3661 		if (err)
3662 			export_rdev(rdev);
3663 		return err;
3664 	}
3665 
3666 	/*
3667 	 * add_new_disk can be used once the array is assembled
3668 	 * to add "hot spares".  They must already have a superblock
3669 	 * written
3670 	 */
3671 	if (mddev->pers) {
3672 		int err;
3673 		if (!mddev->pers->hot_add_disk) {
3674 			printk(KERN_WARNING
3675 				"%s: personality does not support diskops!\n",
3676 			       mdname(mddev));
3677 			return -EINVAL;
3678 		}
3679 		if (mddev->persistent)
3680 			rdev = md_import_device(dev, mddev->major_version,
3681 						mddev->minor_version);
3682 		else
3683 			rdev = md_import_device(dev, -1, -1);
3684 		if (IS_ERR(rdev)) {
3685 			printk(KERN_WARNING
3686 				"md: md_import_device returned %ld\n",
3687 				PTR_ERR(rdev));
3688 			return PTR_ERR(rdev);
3689 		}
3690 		/* set save_raid_disk if appropriate */
3691 		if (!mddev->persistent) {
3692 			if (info->state & (1<<MD_DISK_SYNC)  &&
3693 			    info->raid_disk < mddev->raid_disks)
3694 				rdev->raid_disk = info->raid_disk;
3695 			else
3696 				rdev->raid_disk = -1;
3697 		} else
3698 			super_types[mddev->major_version].
3699 				validate_super(mddev, rdev);
3700 		rdev->saved_raid_disk = rdev->raid_disk;
3701 
3702 		clear_bit(In_sync, &rdev->flags); /* just to be sure */
3703 		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
3704 			set_bit(WriteMostly, &rdev->flags);
3705 
3706 		rdev->raid_disk = -1;
3707 		err = bind_rdev_to_array(rdev, mddev);
3708 		if (!err && !mddev->pers->hot_remove_disk) {
3709 			/* If there is hot_add_disk but no hot_remove_disk
3710 			 * then added disks for geometry changes,
3711 			 * and should be added immediately.
3712 			 */
3713 			super_types[mddev->major_version].
3714 				validate_super(mddev, rdev);
3715 			err = mddev->pers->hot_add_disk(mddev, rdev);
3716 			if (err)
3717 				unbind_rdev_from_array(rdev);
3718 		}
3719 		if (err)
3720 			export_rdev(rdev);
3721 
3722 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3723 		md_wakeup_thread(mddev->thread);
3724 		return err;
3725 	}
3726 
3727 	/* otherwise, add_new_disk is only allowed
3728 	 * for major_version==0 superblocks
3729 	 */
3730 	if (mddev->major_version != 0) {
3731 		printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
3732 		       mdname(mddev));
3733 		return -EINVAL;
3734 	}
3735 
3736 	if (!(info->state & (1<<MD_DISK_FAULTY))) {
3737 		int err;
3738 		rdev = md_import_device (dev, -1, 0);
3739 		if (IS_ERR(rdev)) {
3740 			printk(KERN_WARNING
3741 				"md: error, md_import_device() returned %ld\n",
3742 				PTR_ERR(rdev));
3743 			return PTR_ERR(rdev);
3744 		}
3745 		rdev->desc_nr = info->number;
3746 		if (info->raid_disk < mddev->raid_disks)
3747 			rdev->raid_disk = info->raid_disk;
3748 		else
3749 			rdev->raid_disk = -1;
3750 
3751 		rdev->flags = 0;
3752 
3753 		if (rdev->raid_disk < mddev->raid_disks)
3754 			if (info->state & (1<<MD_DISK_SYNC))
3755 				set_bit(In_sync, &rdev->flags);
3756 
3757 		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
3758 			set_bit(WriteMostly, &rdev->flags);
3759 
3760 		if (!mddev->persistent) {
3761 			printk(KERN_INFO "md: nonpersistent superblock ...\n");
3762 			rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
3763 		} else
3764 			rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
3765 		rdev->size = calc_dev_size(rdev, mddev->chunk_size);
3766 
3767 		err = bind_rdev_to_array(rdev, mddev);
3768 		if (err) {
3769 			export_rdev(rdev);
3770 			return err;
3771 		}
3772 	}
3773 
3774 	return 0;
3775 }
3776 
3777 static int hot_remove_disk(mddev_t * mddev, dev_t dev)
3778 {
3779 	char b[BDEVNAME_SIZE];
3780 	mdk_rdev_t *rdev;
3781 
3782 	if (!mddev->pers)
3783 		return -ENODEV;
3784 
3785 	rdev = find_rdev(mddev, dev);
3786 	if (!rdev)
3787 		return -ENXIO;
3788 
3789 	if (rdev->raid_disk >= 0)
3790 		goto busy;
3791 
3792 	kick_rdev_from_array(rdev);
3793 	md_update_sb(mddev);
3794 	md_new_event(mddev);
3795 
3796 	return 0;
3797 busy:
3798 	printk(KERN_WARNING "md: cannot remove active disk %s from %s ... \n",
3799 		bdevname(rdev->bdev,b), mdname(mddev));
3800 	return -EBUSY;
3801 }
3802 
3803 static int hot_add_disk(mddev_t * mddev, dev_t dev)
3804 {
3805 	char b[BDEVNAME_SIZE];
3806 	int err;
3807 	unsigned int size;
3808 	mdk_rdev_t *rdev;
3809 
3810 	if (!mddev->pers)
3811 		return -ENODEV;
3812 
3813 	if (mddev->major_version != 0) {
3814 		printk(KERN_WARNING "%s: HOT_ADD may only be used with"
3815 			" version-0 superblocks.\n",
3816 			mdname(mddev));
3817 		return -EINVAL;
3818 	}
3819 	if (!mddev->pers->hot_add_disk) {
3820 		printk(KERN_WARNING
3821 			"%s: personality does not support diskops!\n",
3822 			mdname(mddev));
3823 		return -EINVAL;
3824 	}
3825 
3826 	rdev = md_import_device (dev, -1, 0);
3827 	if (IS_ERR(rdev)) {
3828 		printk(KERN_WARNING
3829 			"md: error, md_import_device() returned %ld\n",
3830 			PTR_ERR(rdev));
3831 		return -EINVAL;
3832 	}
3833 
3834 	if (mddev->persistent)
3835 		rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
3836 	else
3837 		rdev->sb_offset =
3838 			rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
3839 
3840 	size = calc_dev_size(rdev, mddev->chunk_size);
3841 	rdev->size = size;
3842 
3843 	if (test_bit(Faulty, &rdev->flags)) {
3844 		printk(KERN_WARNING
3845 			"md: can not hot-add faulty %s disk to %s!\n",
3846 			bdevname(rdev->bdev,b), mdname(mddev));
3847 		err = -EINVAL;
3848 		goto abort_export;
3849 	}
3850 	clear_bit(In_sync, &rdev->flags);
3851 	rdev->desc_nr = -1;
3852 	err = bind_rdev_to_array(rdev, mddev);
3853 	if (err)
3854 		goto abort_export;
3855 
3856 	/*
3857 	 * The rest should better be atomic, we can have disk failures
3858 	 * noticed in interrupt contexts ...
3859 	 */
3860 
3861 	if (rdev->desc_nr == mddev->max_disks) {
3862 		printk(KERN_WARNING "%s: can not hot-add to full array!\n",
3863 			mdname(mddev));
3864 		err = -EBUSY;
3865 		goto abort_unbind_export;
3866 	}
3867 
3868 	rdev->raid_disk = -1;
3869 
3870 	md_update_sb(mddev);
3871 
3872 	/*
3873 	 * Kick recovery, maybe this spare has to be added to the
3874 	 * array immediately.
3875 	 */
3876 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3877 	md_wakeup_thread(mddev->thread);
3878 	md_new_event(mddev);
3879 	return 0;
3880 
3881 abort_unbind_export:
3882 	unbind_rdev_from_array(rdev);
3883 
3884 abort_export:
3885 	export_rdev(rdev);
3886 	return err;
3887 }
3888 
3889 static int set_bitmap_file(mddev_t *mddev, int fd)
3890 {
3891 	int err;
3892 
3893 	if (mddev->pers) {
3894 		if (!mddev->pers->quiesce)
3895 			return -EBUSY;
3896 		if (mddev->recovery || mddev->sync_thread)
3897 			return -EBUSY;
3898 		/* we should be able to change the bitmap.. */
3899 	}
3900 
3901 
3902 	if (fd >= 0) {
3903 		if (mddev->bitmap)
3904 			return -EEXIST; /* cannot add when bitmap is present */
3905 		mddev->bitmap_file = fget(fd);
3906 
3907 		if (mddev->bitmap_file == NULL) {
3908 			printk(KERN_ERR "%s: error: failed to get bitmap file\n",
3909 			       mdname(mddev));
3910 			return -EBADF;
3911 		}
3912 
3913 		err = deny_bitmap_write_access(mddev->bitmap_file);
3914 		if (err) {
3915 			printk(KERN_ERR "%s: error: bitmap file is already in use\n",
3916 			       mdname(mddev));
3917 			fput(mddev->bitmap_file);
3918 			mddev->bitmap_file = NULL;
3919 			return err;
3920 		}
3921 		mddev->bitmap_offset = 0; /* file overrides offset */
3922 	} else if (mddev->bitmap == NULL)
3923 		return -ENOENT; /* cannot remove what isn't there */
3924 	err = 0;
3925 	if (mddev->pers) {
3926 		mddev->pers->quiesce(mddev, 1);
3927 		if (fd >= 0)
3928 			err = bitmap_create(mddev);
3929 		if (fd < 0 || err) {
3930 			bitmap_destroy(mddev);
3931 			fd = -1; /* make sure to put the file */
3932 		}
3933 		mddev->pers->quiesce(mddev, 0);
3934 	}
3935 	if (fd < 0) {
3936 		if (mddev->bitmap_file) {
3937 			restore_bitmap_write_access(mddev->bitmap_file);
3938 			fput(mddev->bitmap_file);
3939 		}
3940 		mddev->bitmap_file = NULL;
3941 	}
3942 
3943 	return err;
3944 }
3945 
3946 /*
3947  * set_array_info is used two different ways
3948  * The original usage is when creating a new array.
3949  * In this usage, raid_disks is > 0 and it together with
3950  *  level, size, not_persistent,layout,chunksize determine the
3951  *  shape of the array.
3952  *  This will always create an array with a type-0.90.0 superblock.
3953  * The newer usage is when assembling an array.
3954  *  In this case raid_disks will be 0, and the major_version field is
3955  *  use to determine which style super-blocks are to be found on the devices.
3956  *  The minor and patch _version numbers are also kept incase the
3957  *  super_block handler wishes to interpret them.
3958  */
3959 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
3960 {
3961 
3962 	if (info->raid_disks == 0) {
3963 		/* just setting version number for superblock loading */
3964 		if (info->major_version < 0 ||
3965 		    info->major_version >= sizeof(super_types)/sizeof(super_types[0]) ||
3966 		    super_types[info->major_version].name == NULL) {
3967 			/* maybe try to auto-load a module? */
3968 			printk(KERN_INFO
3969 				"md: superblock version %d not known\n",
3970 				info->major_version);
3971 			return -EINVAL;
3972 		}
3973 		mddev->major_version = info->major_version;
3974 		mddev->minor_version = info->minor_version;
3975 		mddev->patch_version = info->patch_version;
3976 		return 0;
3977 	}
3978 	mddev->major_version = MD_MAJOR_VERSION;
3979 	mddev->minor_version = MD_MINOR_VERSION;
3980 	mddev->patch_version = MD_PATCHLEVEL_VERSION;
3981 	mddev->ctime         = get_seconds();
3982 
3983 	mddev->level         = info->level;
3984 	mddev->clevel[0]     = 0;
3985 	mddev->size          = info->size;
3986 	mddev->raid_disks    = info->raid_disks;
3987 	/* don't set md_minor, it is determined by which /dev/md* was
3988 	 * openned
3989 	 */
3990 	if (info->state & (1<<MD_SB_CLEAN))
3991 		mddev->recovery_cp = MaxSector;
3992 	else
3993 		mddev->recovery_cp = 0;
3994 	mddev->persistent    = ! info->not_persistent;
3995 
3996 	mddev->layout        = info->layout;
3997 	mddev->chunk_size    = info->chunk_size;
3998 
3999 	mddev->max_disks     = MD_SB_DISKS;
4000 
4001 	mddev->sb_dirty      = 1;
4002 
4003 	mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
4004 	mddev->bitmap_offset = 0;
4005 
4006 	mddev->reshape_position = MaxSector;
4007 
4008 	/*
4009 	 * Generate a 128 bit UUID
4010 	 */
4011 	get_random_bytes(mddev->uuid, 16);
4012 
4013 	mddev->new_level = mddev->level;
4014 	mddev->new_chunk = mddev->chunk_size;
4015 	mddev->new_layout = mddev->layout;
4016 	mddev->delta_disks = 0;
4017 
4018 	return 0;
4019 }
4020 
4021 static int update_size(mddev_t *mddev, unsigned long size)
4022 {
4023 	mdk_rdev_t * rdev;
4024 	int rv;
4025 	struct list_head *tmp;
4026 	int fit = (size == 0);
4027 
4028 	if (mddev->pers->resize == NULL)
4029 		return -EINVAL;
4030 	/* The "size" is the amount of each device that is used.
4031 	 * This can only make sense for arrays with redundancy.
4032 	 * linear and raid0 always use whatever space is available
4033 	 * We can only consider changing the size if no resync
4034 	 * or reconstruction is happening, and if the new size
4035 	 * is acceptable. It must fit before the sb_offset or,
4036 	 * if that is <data_offset, it must fit before the
4037 	 * size of each device.
4038 	 * If size is zero, we find the largest size that fits.
4039 	 */
4040 	if (mddev->sync_thread)
4041 		return -EBUSY;
4042 	ITERATE_RDEV(mddev,rdev,tmp) {
4043 		sector_t avail;
4044 		if (rdev->sb_offset > rdev->data_offset)
4045 			avail = (rdev->sb_offset*2) - rdev->data_offset;
4046 		else
4047 			avail = get_capacity(rdev->bdev->bd_disk)
4048 				- rdev->data_offset;
4049 		if (fit && (size == 0 || size > avail/2))
4050 			size = avail/2;
4051 		if (avail < ((sector_t)size << 1))
4052 			return -ENOSPC;
4053 	}
4054 	rv = mddev->pers->resize(mddev, (sector_t)size *2);
4055 	if (!rv) {
4056 		struct block_device *bdev;
4057 
4058 		bdev = bdget_disk(mddev->gendisk, 0);
4059 		if (bdev) {
4060 			mutex_lock(&bdev->bd_inode->i_mutex);
4061 			i_size_write(bdev->bd_inode, (loff_t)mddev->array_size << 10);
4062 			mutex_unlock(&bdev->bd_inode->i_mutex);
4063 			bdput(bdev);
4064 		}
4065 	}
4066 	return rv;
4067 }
4068 
4069 static int update_raid_disks(mddev_t *mddev, int raid_disks)
4070 {
4071 	int rv;
4072 	/* change the number of raid disks */
4073 	if (mddev->pers->check_reshape == NULL)
4074 		return -EINVAL;
4075 	if (raid_disks <= 0 ||
4076 	    raid_disks >= mddev->max_disks)
4077 		return -EINVAL;
4078 	if (mddev->sync_thread || mddev->reshape_position != MaxSector)
4079 		return -EBUSY;
4080 	mddev->delta_disks = raid_disks - mddev->raid_disks;
4081 
4082 	rv = mddev->pers->check_reshape(mddev);
4083 	return rv;
4084 }
4085 
4086 
4087 /*
4088  * update_array_info is used to change the configuration of an
4089  * on-line array.
4090  * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size
4091  * fields in the info are checked against the array.
4092  * Any differences that cannot be handled will cause an error.
4093  * Normally, only one change can be managed at a time.
4094  */
4095 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
4096 {
4097 	int rv = 0;
4098 	int cnt = 0;
4099 	int state = 0;
4100 
4101 	/* calculate expected state,ignoring low bits */
4102 	if (mddev->bitmap && mddev->bitmap_offset)
4103 		state |= (1 << MD_SB_BITMAP_PRESENT);
4104 
4105 	if (mddev->major_version != info->major_version ||
4106 	    mddev->minor_version != info->minor_version ||
4107 /*	    mddev->patch_version != info->patch_version || */
4108 	    mddev->ctime         != info->ctime         ||
4109 	    mddev->level         != info->level         ||
4110 /*	    mddev->layout        != info->layout        || */
4111 	    !mddev->persistent	 != info->not_persistent||
4112 	    mddev->chunk_size    != info->chunk_size    ||
4113 	    /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
4114 	    ((state^info->state) & 0xfffffe00)
4115 		)
4116 		return -EINVAL;
4117 	/* Check there is only one change */
4118 	if (info->size >= 0 && mddev->size != info->size) cnt++;
4119 	if (mddev->raid_disks != info->raid_disks) cnt++;
4120 	if (mddev->layout != info->layout) cnt++;
4121 	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++;
4122 	if (cnt == 0) return 0;
4123 	if (cnt > 1) return -EINVAL;
4124 
4125 	if (mddev->layout != info->layout) {
4126 		/* Change layout
4127 		 * we don't need to do anything at the md level, the
4128 		 * personality will take care of it all.
4129 		 */
4130 		if (mddev->pers->reconfig == NULL)
4131 			return -EINVAL;
4132 		else
4133 			return mddev->pers->reconfig(mddev, info->layout, -1);
4134 	}
4135 	if (info->size >= 0 && mddev->size != info->size)
4136 		rv = update_size(mddev, info->size);
4137 
4138 	if (mddev->raid_disks    != info->raid_disks)
4139 		rv = update_raid_disks(mddev, info->raid_disks);
4140 
4141 	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
4142 		if (mddev->pers->quiesce == NULL)
4143 			return -EINVAL;
4144 		if (mddev->recovery || mddev->sync_thread)
4145 			return -EBUSY;
4146 		if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
4147 			/* add the bitmap */
4148 			if (mddev->bitmap)
4149 				return -EEXIST;
4150 			if (mddev->default_bitmap_offset == 0)
4151 				return -EINVAL;
4152 			mddev->bitmap_offset = mddev->default_bitmap_offset;
4153 			mddev->pers->quiesce(mddev, 1);
4154 			rv = bitmap_create(mddev);
4155 			if (rv)
4156 				bitmap_destroy(mddev);
4157 			mddev->pers->quiesce(mddev, 0);
4158 		} else {
4159 			/* remove the bitmap */
4160 			if (!mddev->bitmap)
4161 				return -ENOENT;
4162 			if (mddev->bitmap->file)
4163 				return -EINVAL;
4164 			mddev->pers->quiesce(mddev, 1);
4165 			bitmap_destroy(mddev);
4166 			mddev->pers->quiesce(mddev, 0);
4167 			mddev->bitmap_offset = 0;
4168 		}
4169 	}
4170 	md_update_sb(mddev);
4171 	return rv;
4172 }
4173 
4174 static int set_disk_faulty(mddev_t *mddev, dev_t dev)
4175 {
4176 	mdk_rdev_t *rdev;
4177 
4178 	if (mddev->pers == NULL)
4179 		return -ENODEV;
4180 
4181 	rdev = find_rdev(mddev, dev);
4182 	if (!rdev)
4183 		return -ENODEV;
4184 
4185 	md_error(mddev, rdev);
4186 	return 0;
4187 }
4188 
4189 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
4190 {
4191 	mddev_t *mddev = bdev->bd_disk->private_data;
4192 
4193 	geo->heads = 2;
4194 	geo->sectors = 4;
4195 	geo->cylinders = get_capacity(mddev->gendisk) / 8;
4196 	return 0;
4197 }
4198 
4199 static int md_ioctl(struct inode *inode, struct file *file,
4200 			unsigned int cmd, unsigned long arg)
4201 {
4202 	int err = 0;
4203 	void __user *argp = (void __user *)arg;
4204 	mddev_t *mddev = NULL;
4205 
4206 	if (!capable(CAP_SYS_ADMIN))
4207 		return -EACCES;
4208 
4209 	/*
4210 	 * Commands dealing with the RAID driver but not any
4211 	 * particular array:
4212 	 */
4213 	switch (cmd)
4214 	{
4215 		case RAID_VERSION:
4216 			err = get_version(argp);
4217 			goto done;
4218 
4219 		case PRINT_RAID_DEBUG:
4220 			err = 0;
4221 			md_print_devices();
4222 			goto done;
4223 
4224 #ifndef MODULE
4225 		case RAID_AUTORUN:
4226 			err = 0;
4227 			autostart_arrays(arg);
4228 			goto done;
4229 #endif
4230 		default:;
4231 	}
4232 
4233 	/*
4234 	 * Commands creating/starting a new array:
4235 	 */
4236 
4237 	mddev = inode->i_bdev->bd_disk->private_data;
4238 
4239 	if (!mddev) {
4240 		BUG();
4241 		goto abort;
4242 	}
4243 
4244 
4245 	if (cmd == START_ARRAY) {
4246 		/* START_ARRAY doesn't need to lock the array as autostart_array
4247 		 * does the locking, and it could even be a different array
4248 		 */
4249 		static int cnt = 3;
4250 		if (cnt > 0 ) {
4251 			printk(KERN_WARNING
4252 			       "md: %s(pid %d) used deprecated START_ARRAY ioctl. "
4253 			       "This will not be supported beyond July 2006\n",
4254 			       current->comm, current->pid);
4255 			cnt--;
4256 		}
4257 		err = autostart_array(new_decode_dev(arg));
4258 		if (err) {
4259 			printk(KERN_WARNING "md: autostart failed!\n");
4260 			goto abort;
4261 		}
4262 		goto done;
4263 	}
4264 
4265 	err = mddev_lock(mddev);
4266 	if (err) {
4267 		printk(KERN_INFO
4268 			"md: ioctl lock interrupted, reason %d, cmd %d\n",
4269 			err, cmd);
4270 		goto abort;
4271 	}
4272 
4273 	switch (cmd)
4274 	{
4275 		case SET_ARRAY_INFO:
4276 			{
4277 				mdu_array_info_t info;
4278 				if (!arg)
4279 					memset(&info, 0, sizeof(info));
4280 				else if (copy_from_user(&info, argp, sizeof(info))) {
4281 					err = -EFAULT;
4282 					goto abort_unlock;
4283 				}
4284 				if (mddev->pers) {
4285 					err = update_array_info(mddev, &info);
4286 					if (err) {
4287 						printk(KERN_WARNING "md: couldn't update"
4288 						       " array info. %d\n", err);
4289 						goto abort_unlock;
4290 					}
4291 					goto done_unlock;
4292 				}
4293 				if (!list_empty(&mddev->disks)) {
4294 					printk(KERN_WARNING
4295 					       "md: array %s already has disks!\n",
4296 					       mdname(mddev));
4297 					err = -EBUSY;
4298 					goto abort_unlock;
4299 				}
4300 				if (mddev->raid_disks) {
4301 					printk(KERN_WARNING
4302 					       "md: array %s already initialised!\n",
4303 					       mdname(mddev));
4304 					err = -EBUSY;
4305 					goto abort_unlock;
4306 				}
4307 				err = set_array_info(mddev, &info);
4308 				if (err) {
4309 					printk(KERN_WARNING "md: couldn't set"
4310 					       " array info. %d\n", err);
4311 					goto abort_unlock;
4312 				}
4313 			}
4314 			goto done_unlock;
4315 
4316 		default:;
4317 	}
4318 
4319 	/*
4320 	 * Commands querying/configuring an existing array:
4321 	 */
4322 	/* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
4323 	 * RUN_ARRAY, and SET_BITMAP_FILE are allowed */
4324 	if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
4325 			&& cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE) {
4326 		err = -ENODEV;
4327 		goto abort_unlock;
4328 	}
4329 
4330 	/*
4331 	 * Commands even a read-only array can execute:
4332 	 */
4333 	switch (cmd)
4334 	{
4335 		case GET_ARRAY_INFO:
4336 			err = get_array_info(mddev, argp);
4337 			goto done_unlock;
4338 
4339 		case GET_BITMAP_FILE:
4340 			err = get_bitmap_file(mddev, argp);
4341 			goto done_unlock;
4342 
4343 		case GET_DISK_INFO:
4344 			err = get_disk_info(mddev, argp);
4345 			goto done_unlock;
4346 
4347 		case RESTART_ARRAY_RW:
4348 			err = restart_array(mddev);
4349 			goto done_unlock;
4350 
4351 		case STOP_ARRAY:
4352 			err = do_md_stop (mddev, 0);
4353 			goto done_unlock;
4354 
4355 		case STOP_ARRAY_RO:
4356 			err = do_md_stop (mddev, 1);
4357 			goto done_unlock;
4358 
4359 	/*
4360 	 * We have a problem here : there is no easy way to give a CHS
4361 	 * virtual geometry. We currently pretend that we have a 2 heads
4362 	 * 4 sectors (with a BIG number of cylinders...). This drives
4363 	 * dosfs just mad... ;-)
4364 	 */
4365 	}
4366 
4367 	/*
4368 	 * The remaining ioctls are changing the state of the
4369 	 * superblock, so we do not allow them on read-only arrays.
4370 	 * However non-MD ioctls (e.g. get-size) will still come through
4371 	 * here and hit the 'default' below, so only disallow
4372 	 * 'md' ioctls, and switch to rw mode if started auto-readonly.
4373 	 */
4374 	if (_IOC_TYPE(cmd) == MD_MAJOR &&
4375 	    mddev->ro && mddev->pers) {
4376 		if (mddev->ro == 2) {
4377 			mddev->ro = 0;
4378 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4379 		md_wakeup_thread(mddev->thread);
4380 
4381 		} else {
4382 			err = -EROFS;
4383 			goto abort_unlock;
4384 		}
4385 	}
4386 
4387 	switch (cmd)
4388 	{
4389 		case ADD_NEW_DISK:
4390 		{
4391 			mdu_disk_info_t info;
4392 			if (copy_from_user(&info, argp, sizeof(info)))
4393 				err = -EFAULT;
4394 			else
4395 				err = add_new_disk(mddev, &info);
4396 			goto done_unlock;
4397 		}
4398 
4399 		case HOT_REMOVE_DISK:
4400 			err = hot_remove_disk(mddev, new_decode_dev(arg));
4401 			goto done_unlock;
4402 
4403 		case HOT_ADD_DISK:
4404 			err = hot_add_disk(mddev, new_decode_dev(arg));
4405 			goto done_unlock;
4406 
4407 		case SET_DISK_FAULTY:
4408 			err = set_disk_faulty(mddev, new_decode_dev(arg));
4409 			goto done_unlock;
4410 
4411 		case RUN_ARRAY:
4412 			err = do_md_run (mddev);
4413 			goto done_unlock;
4414 
4415 		case SET_BITMAP_FILE:
4416 			err = set_bitmap_file(mddev, (int)arg);
4417 			goto done_unlock;
4418 
4419 		default:
4420 			err = -EINVAL;
4421 			goto abort_unlock;
4422 	}
4423 
4424 done_unlock:
4425 abort_unlock:
4426 	mddev_unlock(mddev);
4427 
4428 	return err;
4429 done:
4430 	if (err)
4431 		MD_BUG();
4432 abort:
4433 	return err;
4434 }
4435 
4436 static int md_open(struct inode *inode, struct file *file)
4437 {
4438 	/*
4439 	 * Succeed if we can lock the mddev, which confirms that
4440 	 * it isn't being stopped right now.
4441 	 */
4442 	mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
4443 	int err;
4444 
4445 	if ((err = mddev_lock(mddev)))
4446 		goto out;
4447 
4448 	err = 0;
4449 	mddev_get(mddev);
4450 	mddev_unlock(mddev);
4451 
4452 	check_disk_change(inode->i_bdev);
4453  out:
4454 	return err;
4455 }
4456 
4457 static int md_release(struct inode *inode, struct file * file)
4458 {
4459  	mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
4460 
4461 	if (!mddev)
4462 		BUG();
4463 	mddev_put(mddev);
4464 
4465 	return 0;
4466 }
4467 
4468 static int md_media_changed(struct gendisk *disk)
4469 {
4470 	mddev_t *mddev = disk->private_data;
4471 
4472 	return mddev->changed;
4473 }
4474 
4475 static int md_revalidate(struct gendisk *disk)
4476 {
4477 	mddev_t *mddev = disk->private_data;
4478 
4479 	mddev->changed = 0;
4480 	return 0;
4481 }
4482 static struct block_device_operations md_fops =
4483 {
4484 	.owner		= THIS_MODULE,
4485 	.open		= md_open,
4486 	.release	= md_release,
4487 	.ioctl		= md_ioctl,
4488 	.getgeo		= md_getgeo,
4489 	.media_changed	= md_media_changed,
4490 	.revalidate_disk= md_revalidate,
4491 };
4492 
4493 static int md_thread(void * arg)
4494 {
4495 	mdk_thread_t *thread = arg;
4496 
4497 	/*
4498 	 * md_thread is a 'system-thread', it's priority should be very
4499 	 * high. We avoid resource deadlocks individually in each
4500 	 * raid personality. (RAID5 does preallocation) We also use RR and
4501 	 * the very same RT priority as kswapd, thus we will never get
4502 	 * into a priority inversion deadlock.
4503 	 *
4504 	 * we definitely have to have equal or higher priority than
4505 	 * bdflush, otherwise bdflush will deadlock if there are too
4506 	 * many dirty RAID5 blocks.
4507 	 */
4508 
4509 	allow_signal(SIGKILL);
4510 	while (!kthread_should_stop()) {
4511 
4512 		/* We need to wait INTERRUPTIBLE so that
4513 		 * we don't add to the load-average.
4514 		 * That means we need to be sure no signals are
4515 		 * pending
4516 		 */
4517 		if (signal_pending(current))
4518 			flush_signals(current);
4519 
4520 		wait_event_interruptible_timeout
4521 			(thread->wqueue,
4522 			 test_bit(THREAD_WAKEUP, &thread->flags)
4523 			 || kthread_should_stop(),
4524 			 thread->timeout);
4525 		try_to_freeze();
4526 
4527 		clear_bit(THREAD_WAKEUP, &thread->flags);
4528 
4529 		thread->run(thread->mddev);
4530 	}
4531 
4532 	return 0;
4533 }
4534 
4535 void md_wakeup_thread(mdk_thread_t *thread)
4536 {
4537 	if (thread) {
4538 		dprintk("md: waking up MD thread %s.\n", thread->tsk->comm);
4539 		set_bit(THREAD_WAKEUP, &thread->flags);
4540 		wake_up(&thread->wqueue);
4541 	}
4542 }
4543 
4544 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
4545 				 const char *name)
4546 {
4547 	mdk_thread_t *thread;
4548 
4549 	thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL);
4550 	if (!thread)
4551 		return NULL;
4552 
4553 	init_waitqueue_head(&thread->wqueue);
4554 
4555 	thread->run = run;
4556 	thread->mddev = mddev;
4557 	thread->timeout = MAX_SCHEDULE_TIMEOUT;
4558 	thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev));
4559 	if (IS_ERR(thread->tsk)) {
4560 		kfree(thread);
4561 		return NULL;
4562 	}
4563 	return thread;
4564 }
4565 
4566 void md_unregister_thread(mdk_thread_t *thread)
4567 {
4568 	dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);
4569 
4570 	kthread_stop(thread->tsk);
4571 	kfree(thread);
4572 }
4573 
4574 void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
4575 {
4576 	if (!mddev) {
4577 		MD_BUG();
4578 		return;
4579 	}
4580 
4581 	if (!rdev || test_bit(Faulty, &rdev->flags))
4582 		return;
4583 /*
4584 	dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
4585 		mdname(mddev),
4586 		MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
4587 		__builtin_return_address(0),__builtin_return_address(1),
4588 		__builtin_return_address(2),__builtin_return_address(3));
4589 */
4590 	if (!mddev->pers->error_handler)
4591 		return;
4592 	mddev->pers->error_handler(mddev,rdev);
4593 	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4594 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4595 	md_wakeup_thread(mddev->thread);
4596 	md_new_event_inintr(mddev);
4597 }
4598 
4599 /* seq_file implementation /proc/mdstat */
4600 
4601 static void status_unused(struct seq_file *seq)
4602 {
4603 	int i = 0;
4604 	mdk_rdev_t *rdev;
4605 	struct list_head *tmp;
4606 
4607 	seq_printf(seq, "unused devices: ");
4608 
4609 	ITERATE_RDEV_PENDING(rdev,tmp) {
4610 		char b[BDEVNAME_SIZE];
4611 		i++;
4612 		seq_printf(seq, "%s ",
4613 			      bdevname(rdev->bdev,b));
4614 	}
4615 	if (!i)
4616 		seq_printf(seq, "<none>");
4617 
4618 	seq_printf(seq, "\n");
4619 }
4620 
4621 
4622 static void status_resync(struct seq_file *seq, mddev_t * mddev)
4623 {
4624 	sector_t max_blocks, resync, res;
4625 	unsigned long dt, db, rt;
4626 	int scale;
4627 	unsigned int per_milli;
4628 
4629 	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
4630 
4631 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
4632 		max_blocks = mddev->resync_max_sectors >> 1;
4633 	else
4634 		max_blocks = mddev->size;
4635 
4636 	/*
4637 	 * Should not happen.
4638 	 */
4639 	if (!max_blocks) {
4640 		MD_BUG();
4641 		return;
4642 	}
4643 	/* Pick 'scale' such that (resync>>scale)*1000 will fit
4644 	 * in a sector_t, and (max_blocks>>scale) will fit in a
4645 	 * u32, as those are the requirements for sector_div.
4646 	 * Thus 'scale' must be at least 10
4647 	 */
4648 	scale = 10;
4649 	if (sizeof(sector_t) > sizeof(unsigned long)) {
4650 		while ( max_blocks/2 > (1ULL<<(scale+32)))
4651 			scale++;
4652 	}
4653 	res = (resync>>scale)*1000;
4654 	sector_div(res, (u32)((max_blocks>>scale)+1));
4655 
4656 	per_milli = res;
4657 	{
4658 		int i, x = per_milli/50, y = 20-x;
4659 		seq_printf(seq, "[");
4660 		for (i = 0; i < x; i++)
4661 			seq_printf(seq, "=");
4662 		seq_printf(seq, ">");
4663 		for (i = 0; i < y; i++)
4664 			seq_printf(seq, ".");
4665 		seq_printf(seq, "] ");
4666 	}
4667 	seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
4668 		   (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
4669 		    "reshape" :
4670 		      (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
4671 		       "resync" : "recovery")),
4672 		      per_milli/10, per_milli % 10,
4673 		   (unsigned long long) resync,
4674 		   (unsigned long long) max_blocks);
4675 
4676 	/*
4677 	 * We do not want to overflow, so the order of operands and
4678 	 * the * 100 / 100 trick are important. We do a +1 to be
4679 	 * safe against division by zero. We only estimate anyway.
4680 	 *
4681 	 * dt: time from mark until now
4682 	 * db: blocks written from mark until now
4683 	 * rt: remaining time
4684 	 */
4685 	dt = ((jiffies - mddev->resync_mark) / HZ);
4686 	if (!dt) dt++;
4687 	db = resync - (mddev->resync_mark_cnt/2);
4688 	rt = (dt * ((unsigned long)(max_blocks-resync) / (db/100+1)))/100;
4689 
4690 	seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
4691 
4692 	seq_printf(seq, " speed=%ldK/sec", db/dt);
4693 }
4694 
4695 static void *md_seq_start(struct seq_file *seq, loff_t *pos)
4696 {
4697 	struct list_head *tmp;
4698 	loff_t l = *pos;
4699 	mddev_t *mddev;
4700 
4701 	if (l >= 0x10000)
4702 		return NULL;
4703 	if (!l--)
4704 		/* header */
4705 		return (void*)1;
4706 
4707 	spin_lock(&all_mddevs_lock);
4708 	list_for_each(tmp,&all_mddevs)
4709 		if (!l--) {
4710 			mddev = list_entry(tmp, mddev_t, all_mddevs);
4711 			mddev_get(mddev);
4712 			spin_unlock(&all_mddevs_lock);
4713 			return mddev;
4714 		}
4715 	spin_unlock(&all_mddevs_lock);
4716 	if (!l--)
4717 		return (void*)2;/* tail */
4718 	return NULL;
4719 }
4720 
4721 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4722 {
4723 	struct list_head *tmp;
4724 	mddev_t *next_mddev, *mddev = v;
4725 
4726 	++*pos;
4727 	if (v == (void*)2)
4728 		return NULL;
4729 
4730 	spin_lock(&all_mddevs_lock);
4731 	if (v == (void*)1)
4732 		tmp = all_mddevs.next;
4733 	else
4734 		tmp = mddev->all_mddevs.next;
4735 	if (tmp != &all_mddevs)
4736 		next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
4737 	else {
4738 		next_mddev = (void*)2;
4739 		*pos = 0x10000;
4740 	}
4741 	spin_unlock(&all_mddevs_lock);
4742 
4743 	if (v != (void*)1)
4744 		mddev_put(mddev);
4745 	return next_mddev;
4746 
4747 }
4748 
4749 static void md_seq_stop(struct seq_file *seq, void *v)
4750 {
4751 	mddev_t *mddev = v;
4752 
4753 	if (mddev && v != (void*)1 && v != (void*)2)
4754 		mddev_put(mddev);
4755 }
4756 
4757 struct mdstat_info {
4758 	int event;
4759 };
4760 
4761 static int md_seq_show(struct seq_file *seq, void *v)
4762 {
4763 	mddev_t *mddev = v;
4764 	sector_t size;
4765 	struct list_head *tmp2;
4766 	mdk_rdev_t *rdev;
4767 	struct mdstat_info *mi = seq->private;
4768 	struct bitmap *bitmap;
4769 
4770 	if (v == (void*)1) {
4771 		struct mdk_personality *pers;
4772 		seq_printf(seq, "Personalities : ");
4773 		spin_lock(&pers_lock);
4774 		list_for_each_entry(pers, &pers_list, list)
4775 			seq_printf(seq, "[%s] ", pers->name);
4776 
4777 		spin_unlock(&pers_lock);
4778 		seq_printf(seq, "\n");
4779 		mi->event = atomic_read(&md_event_count);
4780 		return 0;
4781 	}
4782 	if (v == (void*)2) {
4783 		status_unused(seq);
4784 		return 0;
4785 	}
4786 
4787 	if (mddev_lock(mddev) < 0)
4788 		return -EINTR;
4789 
4790 	if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
4791 		seq_printf(seq, "%s : %sactive", mdname(mddev),
4792 						mddev->pers ? "" : "in");
4793 		if (mddev->pers) {
4794 			if (mddev->ro==1)
4795 				seq_printf(seq, " (read-only)");
4796 			if (mddev->ro==2)
4797 				seq_printf(seq, "(auto-read-only)");
4798 			seq_printf(seq, " %s", mddev->pers->name);
4799 		}
4800 
4801 		size = 0;
4802 		ITERATE_RDEV(mddev,rdev,tmp2) {
4803 			char b[BDEVNAME_SIZE];
4804 			seq_printf(seq, " %s[%d]",
4805 				bdevname(rdev->bdev,b), rdev->desc_nr);
4806 			if (test_bit(WriteMostly, &rdev->flags))
4807 				seq_printf(seq, "(W)");
4808 			if (test_bit(Faulty, &rdev->flags)) {
4809 				seq_printf(seq, "(F)");
4810 				continue;
4811 			} else if (rdev->raid_disk < 0)
4812 				seq_printf(seq, "(S)"); /* spare */
4813 			size += rdev->size;
4814 		}
4815 
4816 		if (!list_empty(&mddev->disks)) {
4817 			if (mddev->pers)
4818 				seq_printf(seq, "\n      %llu blocks",
4819 					(unsigned long long)mddev->array_size);
4820 			else
4821 				seq_printf(seq, "\n      %llu blocks",
4822 					(unsigned long long)size);
4823 		}
4824 		if (mddev->persistent) {
4825 			if (mddev->major_version != 0 ||
4826 			    mddev->minor_version != 90) {
4827 				seq_printf(seq," super %d.%d",
4828 					   mddev->major_version,
4829 					   mddev->minor_version);
4830 			}
4831 		} else
4832 			seq_printf(seq, " super non-persistent");
4833 
4834 		if (mddev->pers) {
4835 			mddev->pers->status (seq, mddev);
4836 	 		seq_printf(seq, "\n      ");
4837 			if (mddev->pers->sync_request) {
4838 				if (mddev->curr_resync > 2) {
4839 					status_resync (seq, mddev);
4840 					seq_printf(seq, "\n      ");
4841 				} else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
4842 					seq_printf(seq, "\tresync=DELAYED\n      ");
4843 				else if (mddev->recovery_cp < MaxSector)
4844 					seq_printf(seq, "\tresync=PENDING\n      ");
4845 			}
4846 		} else
4847 			seq_printf(seq, "\n       ");
4848 
4849 		if ((bitmap = mddev->bitmap)) {
4850 			unsigned long chunk_kb;
4851 			unsigned long flags;
4852 			spin_lock_irqsave(&bitmap->lock, flags);
4853 			chunk_kb = bitmap->chunksize >> 10;
4854 			seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
4855 				"%lu%s chunk",
4856 				bitmap->pages - bitmap->missing_pages,
4857 				bitmap->pages,
4858 				(bitmap->pages - bitmap->missing_pages)
4859 					<< (PAGE_SHIFT - 10),
4860 				chunk_kb ? chunk_kb : bitmap->chunksize,
4861 				chunk_kb ? "KB" : "B");
4862 			if (bitmap->file) {
4863 				seq_printf(seq, ", file: ");
4864 				seq_path(seq, bitmap->file->f_vfsmnt,
4865 					 bitmap->file->f_dentry," \t\n");
4866 			}
4867 
4868 			seq_printf(seq, "\n");
4869 			spin_unlock_irqrestore(&bitmap->lock, flags);
4870 		}
4871 
4872 		seq_printf(seq, "\n");
4873 	}
4874 	mddev_unlock(mddev);
4875 
4876 	return 0;
4877 }
4878 
4879 static struct seq_operations md_seq_ops = {
4880 	.start  = md_seq_start,
4881 	.next   = md_seq_next,
4882 	.stop   = md_seq_stop,
4883 	.show   = md_seq_show,
4884 };
4885 
4886 static int md_seq_open(struct inode *inode, struct file *file)
4887 {
4888 	int error;
4889 	struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL);
4890 	if (mi == NULL)
4891 		return -ENOMEM;
4892 
4893 	error = seq_open(file, &md_seq_ops);
4894 	if (error)
4895 		kfree(mi);
4896 	else {
4897 		struct seq_file *p = file->private_data;
4898 		p->private = mi;
4899 		mi->event = atomic_read(&md_event_count);
4900 	}
4901 	return error;
4902 }
4903 
4904 static int md_seq_release(struct inode *inode, struct file *file)
4905 {
4906 	struct seq_file *m = file->private_data;
4907 	struct mdstat_info *mi = m->private;
4908 	m->private = NULL;
4909 	kfree(mi);
4910 	return seq_release(inode, file);
4911 }
4912 
4913 static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
4914 {
4915 	struct seq_file *m = filp->private_data;
4916 	struct mdstat_info *mi = m->private;
4917 	int mask;
4918 
4919 	poll_wait(filp, &md_event_waiters, wait);
4920 
4921 	/* always allow read */
4922 	mask = POLLIN | POLLRDNORM;
4923 
4924 	if (mi->event != atomic_read(&md_event_count))
4925 		mask |= POLLERR | POLLPRI;
4926 	return mask;
4927 }
4928 
4929 static struct file_operations md_seq_fops = {
4930 	.open           = md_seq_open,
4931 	.read           = seq_read,
4932 	.llseek         = seq_lseek,
4933 	.release	= md_seq_release,
4934 	.poll		= mdstat_poll,
4935 };
4936 
4937 int register_md_personality(struct mdk_personality *p)
4938 {
4939 	spin_lock(&pers_lock);
4940 	list_add_tail(&p->list, &pers_list);
4941 	printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
4942 	spin_unlock(&pers_lock);
4943 	return 0;
4944 }
4945 
4946 int unregister_md_personality(struct mdk_personality *p)
4947 {
4948 	printk(KERN_INFO "md: %s personality unregistered\n", p->name);
4949 	spin_lock(&pers_lock);
4950 	list_del_init(&p->list);
4951 	spin_unlock(&pers_lock);
4952 	return 0;
4953 }
4954 
4955 static int is_mddev_idle(mddev_t *mddev)
4956 {
4957 	mdk_rdev_t * rdev;
4958 	struct list_head *tmp;
4959 	int idle;
4960 	unsigned long curr_events;
4961 
4962 	idle = 1;
4963 	ITERATE_RDEV(mddev,rdev,tmp) {
4964 		struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
4965 		curr_events = disk_stat_read(disk, sectors[0]) +
4966 				disk_stat_read(disk, sectors[1]) -
4967 				atomic_read(&disk->sync_io);
4968 		/* The difference between curr_events and last_events
4969 		 * will be affected by any new non-sync IO (making
4970 		 * curr_events bigger) and any difference in the amount of
4971 		 * in-flight syncio (making current_events bigger or smaller)
4972 		 * The amount in-flight is currently limited to
4973 		 * 32*64K in raid1/10 and 256*PAGE_SIZE in raid5/6
4974 		 * which is at most 4096 sectors.
4975 		 * These numbers are fairly fragile and should be made
4976 		 * more robust, probably by enforcing the
4977 		 * 'window size' that md_do_sync sort-of uses.
4978 		 *
4979 		 * Note: the following is an unsigned comparison.
4980 		 */
4981 		if ((curr_events - rdev->last_events + 4096) > 8192) {
4982 			rdev->last_events = curr_events;
4983 			idle = 0;
4984 		}
4985 	}
4986 	return idle;
4987 }
4988 
4989 void md_done_sync(mddev_t *mddev, int blocks, int ok)
4990 {
4991 	/* another "blocks" (512byte) blocks have been synced */
4992 	atomic_sub(blocks, &mddev->recovery_active);
4993 	wake_up(&mddev->recovery_wait);
4994 	if (!ok) {
4995 		set_bit(MD_RECOVERY_ERR, &mddev->recovery);
4996 		md_wakeup_thread(mddev->thread);
4997 		// stop recovery, signal do_sync ....
4998 	}
4999 }
5000 
5001 
5002 /* md_write_start(mddev, bi)
5003  * If we need to update some array metadata (e.g. 'active' flag
5004  * in superblock) before writing, schedule a superblock update
5005  * and wait for it to complete.
5006  */
5007 void md_write_start(mddev_t *mddev, struct bio *bi)
5008 {
5009 	if (bio_data_dir(bi) != WRITE)
5010 		return;
5011 
5012 	BUG_ON(mddev->ro == 1);
5013 	if (mddev->ro == 2) {
5014 		/* need to switch to read/write */
5015 		mddev->ro = 0;
5016 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5017 		md_wakeup_thread(mddev->thread);
5018 	}
5019 	atomic_inc(&mddev->writes_pending);
5020 	if (mddev->in_sync) {
5021 		spin_lock_irq(&mddev->write_lock);
5022 		if (mddev->in_sync) {
5023 			mddev->in_sync = 0;
5024 			mddev->sb_dirty = 3;
5025 			md_wakeup_thread(mddev->thread);
5026 		}
5027 		spin_unlock_irq(&mddev->write_lock);
5028 	}
5029 	wait_event(mddev->sb_wait, mddev->sb_dirty==0);
5030 }
5031 
5032 void md_write_end(mddev_t *mddev)
5033 {
5034 	if (atomic_dec_and_test(&mddev->writes_pending)) {
5035 		if (mddev->safemode == 2)
5036 			md_wakeup_thread(mddev->thread);
5037 		else if (mddev->safemode_delay)
5038 			mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
5039 	}
5040 }
5041 
5042 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
5043 
5044 #define SYNC_MARKS	10
5045 #define	SYNC_MARK_STEP	(3*HZ)
5046 void md_do_sync(mddev_t *mddev)
5047 {
5048 	mddev_t *mddev2;
5049 	unsigned int currspeed = 0,
5050 		 window;
5051 	sector_t max_sectors,j, io_sectors;
5052 	unsigned long mark[SYNC_MARKS];
5053 	sector_t mark_cnt[SYNC_MARKS];
5054 	int last_mark,m;
5055 	struct list_head *tmp;
5056 	sector_t last_check;
5057 	int skipped = 0;
5058 	struct list_head *rtmp;
5059 	mdk_rdev_t *rdev;
5060 
5061 	/* just incase thread restarts... */
5062 	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
5063 		return;
5064 	if (mddev->ro) /* never try to sync a read-only array */
5065 		return;
5066 
5067 	/* we overload curr_resync somewhat here.
5068 	 * 0 == not engaged in resync at all
5069 	 * 2 == checking that there is no conflict with another sync
5070 	 * 1 == like 2, but have yielded to allow conflicting resync to
5071 	 *		commense
5072 	 * other == active in resync - this many blocks
5073 	 *
5074 	 * Before starting a resync we must have set curr_resync to
5075 	 * 2, and then checked that every "conflicting" array has curr_resync
5076 	 * less than ours.  When we find one that is the same or higher
5077 	 * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
5078 	 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
5079 	 * This will mean we have to start checking from the beginning again.
5080 	 *
5081 	 */
5082 
5083 	do {
5084 		mddev->curr_resync = 2;
5085 
5086 	try_again:
5087 		if (kthread_should_stop()) {
5088 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5089 			goto skip;
5090 		}
5091 		ITERATE_MDDEV(mddev2,tmp) {
5092 			if (mddev2 == mddev)
5093 				continue;
5094 			if (mddev2->curr_resync &&
5095 			    match_mddev_units(mddev,mddev2)) {
5096 				DEFINE_WAIT(wq);
5097 				if (mddev < mddev2 && mddev->curr_resync == 2) {
5098 					/* arbitrarily yield */
5099 					mddev->curr_resync = 1;
5100 					wake_up(&resync_wait);
5101 				}
5102 				if (mddev > mddev2 && mddev->curr_resync == 1)
5103 					/* no need to wait here, we can wait the next
5104 					 * time 'round when curr_resync == 2
5105 					 */
5106 					continue;
5107 				prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE);
5108 				if (!kthread_should_stop() &&
5109 				    mddev2->curr_resync >= mddev->curr_resync) {
5110 					printk(KERN_INFO "md: delaying resync of %s"
5111 					       " until %s has finished resync (they"
5112 					       " share one or more physical units)\n",
5113 					       mdname(mddev), mdname(mddev2));
5114 					mddev_put(mddev2);
5115 					schedule();
5116 					finish_wait(&resync_wait, &wq);
5117 					goto try_again;
5118 				}
5119 				finish_wait(&resync_wait, &wq);
5120 			}
5121 		}
5122 	} while (mddev->curr_resync < 2);
5123 
5124 	j = 0;
5125 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
5126 		/* resync follows the size requested by the personality,
5127 		 * which defaults to physical size, but can be virtual size
5128 		 */
5129 		max_sectors = mddev->resync_max_sectors;
5130 		mddev->resync_mismatches = 0;
5131 		/* we don't use the checkpoint if there's a bitmap */
5132 		if (!mddev->bitmap &&
5133 		    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
5134 			j = mddev->recovery_cp;
5135 	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
5136 		max_sectors = mddev->size << 1;
5137 	else {
5138 		/* recovery follows the physical size of devices */
5139 		max_sectors = mddev->size << 1;
5140 		j = MaxSector;
5141 		ITERATE_RDEV(mddev,rdev,rtmp)
5142 			if (rdev->raid_disk >= 0 &&
5143 			    !test_bit(Faulty, &rdev->flags) &&
5144 			    !test_bit(In_sync, &rdev->flags) &&
5145 			    rdev->recovery_offset < j)
5146 				j = rdev->recovery_offset;
5147 	}
5148 
5149 	printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
5150 	printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
5151 		" %d KB/sec/disc.\n", speed_min(mddev));
5152 	printk(KERN_INFO "md: using maximum available idle IO bandwidth "
5153 	       "(but not more than %d KB/sec) for reconstruction.\n",
5154 	       speed_max(mddev));
5155 
5156 	is_mddev_idle(mddev); /* this also initializes IO event counters */
5157 
5158 	io_sectors = 0;
5159 	for (m = 0; m < SYNC_MARKS; m++) {
5160 		mark[m] = jiffies;
5161 		mark_cnt[m] = io_sectors;
5162 	}
5163 	last_mark = 0;
5164 	mddev->resync_mark = mark[last_mark];
5165 	mddev->resync_mark_cnt = mark_cnt[last_mark];
5166 
5167 	/*
5168 	 * Tune reconstruction:
5169 	 */
5170 	window = 32*(PAGE_SIZE/512);
5171 	printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n",
5172 		window/2,(unsigned long long) max_sectors/2);
5173 
5174 	atomic_set(&mddev->recovery_active, 0);
5175 	init_waitqueue_head(&mddev->recovery_wait);
5176 	last_check = 0;
5177 
5178 	if (j>2) {
5179 		printk(KERN_INFO
5180 			"md: resuming recovery of %s from checkpoint.\n",
5181 			mdname(mddev));
5182 		mddev->curr_resync = j;
5183 	}
5184 
5185 	while (j < max_sectors) {
5186 		sector_t sectors;
5187 
5188 		skipped = 0;
5189 		sectors = mddev->pers->sync_request(mddev, j, &skipped,
5190 					    currspeed < speed_min(mddev));
5191 		if (sectors == 0) {
5192 			set_bit(MD_RECOVERY_ERR, &mddev->recovery);
5193 			goto out;
5194 		}
5195 
5196 		if (!skipped) { /* actual IO requested */
5197 			io_sectors += sectors;
5198 			atomic_add(sectors, &mddev->recovery_active);
5199 		}
5200 
5201 		j += sectors;
5202 		if (j>1) mddev->curr_resync = j;
5203 		if (last_check == 0)
5204 			/* this is the earliers that rebuilt will be
5205 			 * visible in /proc/mdstat
5206 			 */
5207 			md_new_event(mddev);
5208 
5209 		if (last_check + window > io_sectors || j == max_sectors)
5210 			continue;
5211 
5212 		last_check = io_sectors;
5213 
5214 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
5215 		    test_bit(MD_RECOVERY_ERR, &mddev->recovery))
5216 			break;
5217 
5218 	repeat:
5219 		if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
5220 			/* step marks */
5221 			int next = (last_mark+1) % SYNC_MARKS;
5222 
5223 			mddev->resync_mark = mark[next];
5224 			mddev->resync_mark_cnt = mark_cnt[next];
5225 			mark[next] = jiffies;
5226 			mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
5227 			last_mark = next;
5228 		}
5229 
5230 
5231 		if (kthread_should_stop()) {
5232 			/*
5233 			 * got a signal, exit.
5234 			 */
5235 			printk(KERN_INFO
5236 				"md: md_do_sync() got signal ... exiting\n");
5237 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5238 			goto out;
5239 		}
5240 
5241 		/*
5242 		 * this loop exits only if either when we are slower than
5243 		 * the 'hard' speed limit, or the system was IO-idle for
5244 		 * a jiffy.
5245 		 * the system might be non-idle CPU-wise, but we only care
5246 		 * about not overloading the IO subsystem. (things like an
5247 		 * e2fsck being done on the RAID array should execute fast)
5248 		 */
5249 		mddev->queue->unplug_fn(mddev->queue);
5250 		cond_resched();
5251 
5252 		currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
5253 			/((jiffies-mddev->resync_mark)/HZ +1) +1;
5254 
5255 		if (currspeed > speed_min(mddev)) {
5256 			if ((currspeed > speed_max(mddev)) ||
5257 					!is_mddev_idle(mddev)) {
5258 				msleep(500);
5259 				goto repeat;
5260 			}
5261 		}
5262 	}
5263 	printk(KERN_INFO "md: %s: sync done.\n",mdname(mddev));
5264 	/*
5265 	 * this also signals 'finished resyncing' to md_stop
5266 	 */
5267  out:
5268 	mddev->queue->unplug_fn(mddev->queue);
5269 
5270 	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
5271 
5272 	/* tell personality that we are finished */
5273 	mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
5274 
5275 	if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
5276 	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
5277 	    !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
5278 	    mddev->curr_resync > 2) {
5279 		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
5280 			if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
5281 				if (mddev->curr_resync >= mddev->recovery_cp) {
5282 					printk(KERN_INFO
5283 					       "md: checkpointing recovery of %s.\n",
5284 					       mdname(mddev));
5285 					mddev->recovery_cp = mddev->curr_resync;
5286 				}
5287 			} else
5288 				mddev->recovery_cp = MaxSector;
5289 		} else {
5290 			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
5291 				mddev->curr_resync = MaxSector;
5292 			ITERATE_RDEV(mddev,rdev,rtmp)
5293 				if (rdev->raid_disk >= 0 &&
5294 				    !test_bit(Faulty, &rdev->flags) &&
5295 				    !test_bit(In_sync, &rdev->flags) &&
5296 				    rdev->recovery_offset < mddev->curr_resync)
5297 					rdev->recovery_offset = mddev->curr_resync;
5298 			mddev->sb_dirty = 1;
5299 		}
5300 	}
5301 
5302  skip:
5303 	mddev->curr_resync = 0;
5304 	wake_up(&resync_wait);
5305 	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
5306 	md_wakeup_thread(mddev->thread);
5307 }
5308 EXPORT_SYMBOL_GPL(md_do_sync);
5309 
5310 
5311 /*
5312  * This routine is regularly called by all per-raid-array threads to
5313  * deal with generic issues like resync and super-block update.
5314  * Raid personalities that don't have a thread (linear/raid0) do not
5315  * need this as they never do any recovery or update the superblock.
5316  *
5317  * It does not do any resync itself, but rather "forks" off other threads
5318  * to do that as needed.
5319  * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
5320  * "->recovery" and create a thread at ->sync_thread.
5321  * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR)
5322  * and wakeups up this thread which will reap the thread and finish up.
5323  * This thread also removes any faulty devices (with nr_pending == 0).
5324  *
5325  * The overall approach is:
5326  *  1/ if the superblock needs updating, update it.
5327  *  2/ If a recovery thread is running, don't do anything else.
5328  *  3/ If recovery has finished, clean up, possibly marking spares active.
5329  *  4/ If there are any faulty devices, remove them.
5330  *  5/ If array is degraded, try to add spares devices
5331  *  6/ If array has spares or is not in-sync, start a resync thread.
5332  */
5333 void md_check_recovery(mddev_t *mddev)
5334 {
5335 	mdk_rdev_t *rdev;
5336 	struct list_head *rtmp;
5337 
5338 
5339 	if (mddev->bitmap)
5340 		bitmap_daemon_work(mddev->bitmap);
5341 
5342 	if (mddev->ro)
5343 		return;
5344 
5345 	if (signal_pending(current)) {
5346 		if (mddev->pers->sync_request) {
5347 			printk(KERN_INFO "md: %s in immediate safe mode\n",
5348 			       mdname(mddev));
5349 			mddev->safemode = 2;
5350 		}
5351 		flush_signals(current);
5352 	}
5353 
5354 	if ( ! (
5355 		mddev->sb_dirty ||
5356 		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
5357 		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
5358 		(mddev->safemode == 1) ||
5359 		(mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
5360 		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
5361 		))
5362 		return;
5363 
5364 	if (mddev_trylock(mddev)) {
5365 		int spares =0;
5366 
5367 		spin_lock_irq(&mddev->write_lock);
5368 		if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
5369 		    !mddev->in_sync && mddev->recovery_cp == MaxSector) {
5370 			mddev->in_sync = 1;
5371 			mddev->sb_dirty = 3;
5372 		}
5373 		if (mddev->safemode == 1)
5374 			mddev->safemode = 0;
5375 		spin_unlock_irq(&mddev->write_lock);
5376 
5377 		if (mddev->sb_dirty)
5378 			md_update_sb(mddev);
5379 
5380 
5381 		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
5382 		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
5383 			/* resync/recovery still happening */
5384 			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5385 			goto unlock;
5386 		}
5387 		if (mddev->sync_thread) {
5388 			/* resync has finished, collect result */
5389 			md_unregister_thread(mddev->sync_thread);
5390 			mddev->sync_thread = NULL;
5391 			if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
5392 			    !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
5393 				/* success...*/
5394 				/* activate any spares */
5395 				mddev->pers->spare_active(mddev);
5396 			}
5397 			md_update_sb(mddev);
5398 
5399 			/* if array is no-longer degraded, then any saved_raid_disk
5400 			 * information must be scrapped
5401 			 */
5402 			if (!mddev->degraded)
5403 				ITERATE_RDEV(mddev,rdev,rtmp)
5404 					rdev->saved_raid_disk = -1;
5405 
5406 			mddev->recovery = 0;
5407 			/* flag recovery needed just to double check */
5408 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5409 			md_new_event(mddev);
5410 			goto unlock;
5411 		}
5412 		/* Clear some bits that don't mean anything, but
5413 		 * might be left set
5414 		 */
5415 		clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5416 		clear_bit(MD_RECOVERY_ERR, &mddev->recovery);
5417 		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
5418 		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
5419 
5420 		if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
5421 			goto unlock;
5422 		/* no recovery is running.
5423 		 * remove any failed drives, then
5424 		 * add spares if possible.
5425 		 * Spare are also removed and re-added, to allow
5426 		 * the personality to fail the re-add.
5427 		 */
5428 		ITERATE_RDEV(mddev,rdev,rtmp)
5429 			if (rdev->raid_disk >= 0 &&
5430 			    (test_bit(Faulty, &rdev->flags) || ! test_bit(In_sync, &rdev->flags)) &&
5431 			    atomic_read(&rdev->nr_pending)==0) {
5432 				if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0) {
5433 					char nm[20];
5434 					sprintf(nm,"rd%d", rdev->raid_disk);
5435 					sysfs_remove_link(&mddev->kobj, nm);
5436 					rdev->raid_disk = -1;
5437 				}
5438 			}
5439 
5440 		if (mddev->degraded) {
5441 			ITERATE_RDEV(mddev,rdev,rtmp)
5442 				if (rdev->raid_disk < 0
5443 				    && !test_bit(Faulty, &rdev->flags)) {
5444 					rdev->recovery_offset = 0;
5445 					if (mddev->pers->hot_add_disk(mddev,rdev)) {
5446 						char nm[20];
5447 						sprintf(nm, "rd%d", rdev->raid_disk);
5448 						sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
5449 						spares++;
5450 						md_new_event(mddev);
5451 					} else
5452 						break;
5453 				}
5454 		}
5455 
5456 		if (spares) {
5457 			clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5458 			clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
5459 		} else if (mddev->recovery_cp < MaxSector) {
5460 			set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5461 		} else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
5462 			/* nothing to be done ... */
5463 			goto unlock;
5464 
5465 		if (mddev->pers->sync_request) {
5466 			set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
5467 			if (spares && mddev->bitmap && ! mddev->bitmap->file) {
5468 				/* We are adding a device or devices to an array
5469 				 * which has the bitmap stored on all devices.
5470 				 * So make sure all bitmap pages get written
5471 				 */
5472 				bitmap_write_all(mddev->bitmap);
5473 			}
5474 			mddev->sync_thread = md_register_thread(md_do_sync,
5475 								mddev,
5476 								"%s_resync");
5477 			if (!mddev->sync_thread) {
5478 				printk(KERN_ERR "%s: could not start resync"
5479 					" thread...\n",
5480 					mdname(mddev));
5481 				/* leave the spares where they are, it shouldn't hurt */
5482 				mddev->recovery = 0;
5483 			} else
5484 				md_wakeup_thread(mddev->sync_thread);
5485 			md_new_event(mddev);
5486 		}
5487 	unlock:
5488 		mddev_unlock(mddev);
5489 	}
5490 }
5491 
5492 static int md_notify_reboot(struct notifier_block *this,
5493 			    unsigned long code, void *x)
5494 {
5495 	struct list_head *tmp;
5496 	mddev_t *mddev;
5497 
5498 	if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
5499 
5500 		printk(KERN_INFO "md: stopping all md devices.\n");
5501 
5502 		ITERATE_MDDEV(mddev,tmp)
5503 			if (mddev_trylock(mddev)) {
5504 				do_md_stop (mddev, 1);
5505 				mddev_unlock(mddev);
5506 			}
5507 		/*
5508 		 * certain more exotic SCSI devices are known to be
5509 		 * volatile wrt too early system reboots. While the
5510 		 * right place to handle this issue is the given
5511 		 * driver, we do want to have a safe RAID driver ...
5512 		 */
5513 		mdelay(1000*1);
5514 	}
5515 	return NOTIFY_DONE;
5516 }
5517 
5518 static struct notifier_block md_notifier = {
5519 	.notifier_call	= md_notify_reboot,
5520 	.next		= NULL,
5521 	.priority	= INT_MAX, /* before any real devices */
5522 };
5523 
5524 static void md_geninit(void)
5525 {
5526 	struct proc_dir_entry *p;
5527 
5528 	dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
5529 
5530 	p = create_proc_entry("mdstat", S_IRUGO, NULL);
5531 	if (p)
5532 		p->proc_fops = &md_seq_fops;
5533 }
5534 
5535 static int __init md_init(void)
5536 {
5537 	printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d,"
5538 			" MD_SB_DISKS=%d\n",
5539 			MD_MAJOR_VERSION, MD_MINOR_VERSION,
5540 			MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
5541 	printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR_HI,
5542 			BITMAP_MINOR);
5543 
5544 	if (register_blkdev(MAJOR_NR, "md"))
5545 		return -1;
5546 	if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
5547 		unregister_blkdev(MAJOR_NR, "md");
5548 		return -1;
5549 	}
5550 	blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE,
5551 				md_probe, NULL, NULL);
5552 	blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE,
5553 			    md_probe, NULL, NULL);
5554 
5555 	register_reboot_notifier(&md_notifier);
5556 	raid_table_header = register_sysctl_table(raid_root_table, 1);
5557 
5558 	md_geninit();
5559 	return (0);
5560 }
5561 
5562 
5563 #ifndef MODULE
5564 
5565 /*
5566  * Searches all registered partitions for autorun RAID arrays
5567  * at boot time.
5568  */
5569 static dev_t detected_devices[128];
5570 static int dev_cnt;
5571 
5572 void md_autodetect_dev(dev_t dev)
5573 {
5574 	if (dev_cnt >= 0 && dev_cnt < 127)
5575 		detected_devices[dev_cnt++] = dev;
5576 }
5577 
5578 
5579 static void autostart_arrays(int part)
5580 {
5581 	mdk_rdev_t *rdev;
5582 	int i;
5583 
5584 	printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
5585 
5586 	for (i = 0; i < dev_cnt; i++) {
5587 		dev_t dev = detected_devices[i];
5588 
5589 		rdev = md_import_device(dev,0, 0);
5590 		if (IS_ERR(rdev))
5591 			continue;
5592 
5593 		if (test_bit(Faulty, &rdev->flags)) {
5594 			MD_BUG();
5595 			continue;
5596 		}
5597 		list_add(&rdev->same_set, &pending_raid_disks);
5598 	}
5599 	dev_cnt = 0;
5600 
5601 	autorun_devices(part);
5602 }
5603 
5604 #endif
5605 
5606 static __exit void md_exit(void)
5607 {
5608 	mddev_t *mddev;
5609 	struct list_head *tmp;
5610 
5611 	blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS);
5612 	blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift);
5613 
5614 	unregister_blkdev(MAJOR_NR,"md");
5615 	unregister_blkdev(mdp_major, "mdp");
5616 	unregister_reboot_notifier(&md_notifier);
5617 	unregister_sysctl_table(raid_table_header);
5618 	remove_proc_entry("mdstat", NULL);
5619 	ITERATE_MDDEV(mddev,tmp) {
5620 		struct gendisk *disk = mddev->gendisk;
5621 		if (!disk)
5622 			continue;
5623 		export_array(mddev);
5624 		del_gendisk(disk);
5625 		put_disk(disk);
5626 		mddev->gendisk = NULL;
5627 		mddev_put(mddev);
5628 	}
5629 }
5630 
5631 module_init(md_init)
5632 module_exit(md_exit)
5633 
5634 static int get_ro(char *buffer, struct kernel_param *kp)
5635 {
5636 	return sprintf(buffer, "%d", start_readonly);
5637 }
5638 static int set_ro(const char *val, struct kernel_param *kp)
5639 {
5640 	char *e;
5641 	int num = simple_strtoul(val, &e, 10);
5642 	if (*val && (*e == '\0' || *e == '\n')) {
5643 		start_readonly = num;
5644 		return 0;
5645 	}
5646 	return -EINVAL;
5647 }
5648 
5649 module_param_call(start_ro, set_ro, get_ro, NULL, 0600);
5650 module_param(start_dirty_degraded, int, 0644);
5651 
5652 
5653 EXPORT_SYMBOL(register_md_personality);
5654 EXPORT_SYMBOL(unregister_md_personality);
5655 EXPORT_SYMBOL(md_error);
5656 EXPORT_SYMBOL(md_done_sync);
5657 EXPORT_SYMBOL(md_write_start);
5658 EXPORT_SYMBOL(md_write_end);
5659 EXPORT_SYMBOL(md_register_thread);
5660 EXPORT_SYMBOL(md_unregister_thread);
5661 EXPORT_SYMBOL(md_wakeup_thread);
5662 EXPORT_SYMBOL(md_check_recovery);
5663 MODULE_LICENSE("GPL");
5664 MODULE_ALIAS("md");
5665 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
5666