xref: /linux/drivers/md/md.c (revision 7025bec9125b0a02edcaf22c2dce753bf2c95480)
1 /*
2    md.c : Multiple Devices driver for Linux
3 	  Copyright (C) 1998, 1999, 2000 Ingo Molnar
4 
5      completely rewritten, based on the MD driver code from Marc Zyngier
6 
7    Changes:
8 
9    - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
10    - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
11    - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
12    - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
13    - kmod support by: Cyrus Durgin
14    - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
15    - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
16 
17    - lots of fixes and improvements to the RAID1/RAID5 and generic
18      RAID code (such as request based resynchronization):
19 
20      Neil Brown <neilb@cse.unsw.edu.au>.
21 
22    - persistent bitmap code
23      Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
24 
25    This program is free software; you can redistribute it and/or modify
26    it under the terms of the GNU General Public License as published by
27    the Free Software Foundation; either version 2, or (at your option)
28    any later version.
29 
30    You should have received a copy of the GNU General Public License
31    (for example /usr/src/linux/COPYING); if not, write to the Free
32    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
33 */
34 
35 #include <linux/kthread.h>
36 #include <linux/blkdev.h>
37 #include <linux/sysctl.h>
38 #include <linux/seq_file.h>
39 #include <linux/buffer_head.h> /* for invalidate_bdev */
40 #include <linux/poll.h>
41 #include <linux/ctype.h>
42 #include <linux/string.h>
43 #include <linux/hdreg.h>
44 #include <linux/proc_fs.h>
45 #include <linux/random.h>
46 #include <linux/reboot.h>
47 #include <linux/file.h>
48 #include <linux/compat.h>
49 #include <linux/delay.h>
50 #include <linux/raid/md_p.h>
51 #include <linux/raid/md_u.h>
52 #include "md.h"
53 #include "bitmap.h"
54 
55 #define DEBUG 0
56 #define dprintk(x...) ((void)(DEBUG && printk(x)))
57 
58 
59 #ifndef MODULE
60 static void autostart_arrays(int part);
61 #endif
62 
63 static LIST_HEAD(pers_list);
64 static DEFINE_SPINLOCK(pers_lock);
65 
66 static void md_print_devices(void);
67 
68 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
69 
70 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
71 
72 /*
73  * Default number of read corrections we'll attempt on an rdev
74  * before ejecting it from the array. We divide the read error
75  * count by 2 for every hour elapsed between read errors.
76  */
77 #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
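
/*
 * Illustrative arithmetic (added example, not from the original source):
 * with the default limit of 20, an rdev whose corrected-error count had
 * reached 24 and whose next read error arrives three error-free hours
 * later is charged 24/2/2/2 + 1 = 4, well below the ejection threshold.
 */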
78 /*
79  * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
80  * is 1000 KB/sec, so the extra system load does not show up that much.
81  * Increase it if you want to have more _guaranteed_ speed. Note that
82  * the RAID driver will use the maximum available bandwidth if the IO
83  * subsystem is idle. There is also an 'absolute maximum' reconstruction
84  * speed limit - in case reconstruction slows down your system despite
85  * idle IO detection.
86  *
87  * You can change it via /proc/sys/dev/raid/speed_limit_min and _max,
88  * or via /sys/block/mdX/md/sync_speed_{min,max}.
89  */
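
/*
 * Illustrative tuning session (hypothetical shell commands, not part of
 * this file); the per-array sysfs values override the global sysctls
 * whenever they are non-zero, as speed_min()/speed_max() below show:
 *
 *	echo 5000   > /proc/sys/dev/raid/speed_limit_min
 *	echo 100000 > /sys/block/md0/md/sync_speed_max
 */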
90 
91 static int sysctl_speed_limit_min = 1000;
92 static int sysctl_speed_limit_max = 200000;
93 static inline int speed_min(mddev_t *mddev)
94 {
95 	return mddev->sync_speed_min ?
96 		mddev->sync_speed_min : sysctl_speed_limit_min;
97 }
98 
99 static inline int speed_max(mddev_t *mddev)
100 {
101 	return mddev->sync_speed_max ?
102 		mddev->sync_speed_max : sysctl_speed_limit_max;
103 }
104 
105 static struct ctl_table_header *raid_table_header;
106 
107 static ctl_table raid_table[] = {
108 	{
109 		.procname	= "speed_limit_min",
110 		.data		= &sysctl_speed_limit_min,
111 		.maxlen		= sizeof(int),
112 		.mode		= S_IRUGO|S_IWUSR,
113 		.proc_handler	= proc_dointvec,
114 	},
115 	{
116 		.procname	= "speed_limit_max",
117 		.data		= &sysctl_speed_limit_max,
118 		.maxlen		= sizeof(int),
119 		.mode		= S_IRUGO|S_IWUSR,
120 		.proc_handler	= proc_dointvec,
121 	},
122 	{ }
123 };
124 
125 static ctl_table raid_dir_table[] = {
126 	{
127 		.procname	= "raid",
128 		.maxlen		= 0,
129 		.mode		= S_IRUGO|S_IXUGO,
130 		.child		= raid_table,
131 	},
132 	{ }
133 };
134 
135 static ctl_table raid_root_table[] = {
136 	{
137 		.procname	= "dev",
138 		.maxlen		= 0,
139 		.mode		= 0555,
140 		.child		= raid_dir_table,
141 	},
142 	{  }
143 };
144 
145 static const struct block_device_operations md_fops;
146 
147 static int start_readonly;
148 
149 /*
150  * We have a system wide 'event count' that is incremented
151  * on any 'interesting' event, and readers of /proc/mdstat
152  * can use 'poll' or 'select' to find out when the event
153  * count increases.
154  *
155  * Events are:
156  *  start array, stop array, error, add device, remove device,
157  *  start build, activate spare
158  */
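
/*
 * Illustrative userspace sketch (an assumption about typical use, not
 * part of this driver): a monitor can learn that md_event_count moved by
 * polling /proc/mdstat for an exceptional condition:
 *
 *	int fd = open("/proc/mdstat", O_RDONLY);
 *	struct pollfd pfd = { .fd = fd, .events = POLLPRI };
 *	poll(&pfd, 1, -1);
 *	lseek(fd, 0, SEEK_SET);
 *
 * and then read() the file again to see the new state.
 */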
159 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
160 static atomic_t md_event_count;
161 void md_new_event(mddev_t *mddev)
162 {
163 	atomic_inc(&md_event_count);
164 	wake_up(&md_event_waiters);
165 }
166 EXPORT_SYMBOL_GPL(md_new_event);
167 
168 /* Alternate version that can be called from interrupts
169  * when calling sysfs_notify isn't needed.
170  */
171 static void md_new_event_inintr(mddev_t *mddev)
172 {
173 	atomic_inc(&md_event_count);
174 	wake_up(&md_event_waiters);
175 }
176 
177 /*
178  * Enables iteration over all existing md arrays.
179  * all_mddevs_lock protects this list.
180  */
181 static LIST_HEAD(all_mddevs);
182 static DEFINE_SPINLOCK(all_mddevs_lock);
183 
184 
185 /*
186  * iterates through all used mddevs in the system.
187  * We take care to grab the all_mddevs_lock whenever navigating
188  * the list, and to always hold a refcount when unlocked.
189  * Any code which breaks out of this loop still holds a reference
190  * to the current mddev and must mddev_put it.
191  */
192 #define for_each_mddev(mddev,tmp)					\
193 									\
194 	for (({ spin_lock(&all_mddevs_lock); 				\
195 		tmp = all_mddevs.next;					\
196 		mddev = NULL;});					\
197 	     ({ if (tmp != &all_mddevs)					\
198 			mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
199 		spin_unlock(&all_mddevs_lock);				\
200 		if (mddev) mddev_put(mddev);				\
201 		mddev = list_entry(tmp, mddev_t, all_mddevs);		\
202 		tmp != &all_mddevs;});					\
203 	     ({ spin_lock(&all_mddevs_lock);				\
204 		tmp = tmp->next;})					\
205 		)
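
/*
 * Typical (illustrative) use of the iterator above; "tmp" is the
 * struct list_head cursor the macro expects:
 *
 *	mddev_t *mddev;
 *	struct list_head *tmp;
 *
 *	for_each_mddev(mddev, tmp) {
 *		... inspect or modify mddev ...
 *	}
 *
 * A caller that breaks out early still holds the reference the macro
 * took and must drop it with mddev_put(mddev).
 */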
206 
207 
208 /* Rather than calling directly into the personality make_request function,
209  * IO requests come here first so that we can check if the device is
210  * being suspended pending a reconfiguration.
211  * We hold a refcount over the call to ->make_request.  By the time that
212  * call has finished, the bio has been linked into some internal structure
213  * and so is visible to ->quiesce(), so we don't need the refcount any more.
214  */
215 static int md_make_request(struct request_queue *q, struct bio *bio)
216 {
217 	mddev_t *mddev = q->queuedata;
218 	int rv;
219 	if (mddev == NULL || mddev->pers == NULL) {
220 		bio_io_error(bio);
221 		return 0;
222 	}
223 	rcu_read_lock();
224 	if (mddev->suspended || mddev->barrier) {
225 		DEFINE_WAIT(__wait);
226 		for (;;) {
227 			prepare_to_wait(&mddev->sb_wait, &__wait,
228 					TASK_UNINTERRUPTIBLE);
229 			if (!mddev->suspended && !mddev->barrier)
230 				break;
231 			rcu_read_unlock();
232 			schedule();
233 			rcu_read_lock();
234 		}
235 		finish_wait(&mddev->sb_wait, &__wait);
236 	}
237 	atomic_inc(&mddev->active_io);
238 	rcu_read_unlock();
239 	rv = mddev->pers->make_request(q, bio);
240 	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
241 		wake_up(&mddev->sb_wait);
242 
243 	return rv;
244 }
245 
246 static void mddev_suspend(mddev_t *mddev)
247 {
248 	BUG_ON(mddev->suspended);
249 	mddev->suspended = 1;
250 	synchronize_rcu();
251 	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
252 	mddev->pers->quiesce(mddev, 1);
253 	md_unregister_thread(mddev->thread);
254 	mddev->thread = NULL;
255 	/* we now know that no code is executing in the personality module,
256 	 * except possibly the tail end of a ->bi_end_io function, but that
257 	 * is certain to complete before the module has a chance to get
258 	 * unloaded
259 	 */
260 }
261 
262 static void mddev_resume(mddev_t *mddev)
263 {
264 	mddev->suspended = 0;
265 	wake_up(&mddev->sb_wait);
266 	mddev->pers->quiesce(mddev, 0);
267 }
268 
269 int mddev_congested(mddev_t *mddev, int bits)
270 {
271 	if (mddev->barrier)
272 		return 1;
273 	return mddev->suspended;
274 }
275 EXPORT_SYMBOL(mddev_congested);
276 
277 /*
278  * Generic barrier handling for md
279  */
280 
281 #define POST_REQUEST_BARRIER ((void*)1)
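
/*
 * Summary of the flow implemented below (comment added for readability):
 * md_barrier_request() stores the incoming bio in mddev->barrier and has
 * submit_barriers() send an empty WRITE_BARRIER bio to every active rdev.
 * When the last of those completes, md_end_barrier() schedules
 * md_submit_barrier(), which submits the data part of the original bio
 * (barrier flag cleared) and then repeats the barrier pass with
 * mddev->barrier set to POST_REQUEST_BARRIER; once that pass drains,
 * mddev->barrier is cleared and waiters on sb_wait are woken.
 */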
282 
283 static void md_end_barrier(struct bio *bio, int err)
284 {
285 	mdk_rdev_t *rdev = bio->bi_private;
286 	mddev_t *mddev = rdev->mddev;
287 	if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER)
288 		set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags);
289 
290 	rdev_dec_pending(rdev, mddev);
291 
292 	if (atomic_dec_and_test(&mddev->flush_pending)) {
293 		if (mddev->barrier == POST_REQUEST_BARRIER) {
294 			/* This was a post-request barrier */
295 			mddev->barrier = NULL;
296 			wake_up(&mddev->sb_wait);
297 		} else
298 			/* The pre-request barrier has finished */
299 			schedule_work(&mddev->barrier_work);
300 	}
301 	bio_put(bio);
302 }
303 
304 static void submit_barriers(mddev_t *mddev)
305 {
306 	mdk_rdev_t *rdev;
307 
308 	rcu_read_lock();
309 	list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
310 		if (rdev->raid_disk >= 0 &&
311 		    !test_bit(Faulty, &rdev->flags)) {
312 			/* Take two references, one is dropped
313 			 * when request finishes, one after
314 			 * we reclaim rcu_read_lock
315 			 */
316 			struct bio *bi;
317 			atomic_inc(&rdev->nr_pending);
318 			atomic_inc(&rdev->nr_pending);
319 			rcu_read_unlock();
320 			bi = bio_alloc(GFP_KERNEL, 0);
321 			bi->bi_end_io = md_end_barrier;
322 			bi->bi_private = rdev;
323 			bi->bi_bdev = rdev->bdev;
324 			atomic_inc(&mddev->flush_pending);
325 			submit_bio(WRITE_BARRIER, bi);
326 			rcu_read_lock();
327 			rdev_dec_pending(rdev, mddev);
328 		}
329 	rcu_read_unlock();
330 }
331 
332 static void md_submit_barrier(struct work_struct *ws)
333 {
334 	mddev_t *mddev = container_of(ws, mddev_t, barrier_work);
335 	struct bio *bio = mddev->barrier;
336 
337 	atomic_set(&mddev->flush_pending, 1);
338 
339 	if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
340 		bio_endio(bio, -EOPNOTSUPP);
341 	else if (bio->bi_size == 0)
342 		/* an empty barrier - all done */
343 		bio_endio(bio, 0);
344 	else {
345 		bio->bi_rw &= ~(1<<BIO_RW_BARRIER);
346 		if (mddev->pers->make_request(mddev->queue, bio))
347 			generic_make_request(bio);
348 		mddev->barrier = POST_REQUEST_BARRIER;
349 		submit_barriers(mddev);
350 	}
351 	if (atomic_dec_and_test(&mddev->flush_pending)) {
352 		mddev->barrier = NULL;
353 		wake_up(&mddev->sb_wait);
354 	}
355 }
356 
357 void md_barrier_request(mddev_t *mddev, struct bio *bio)
358 {
359 	spin_lock_irq(&mddev->write_lock);
360 	wait_event_lock_irq(mddev->sb_wait,
361 			    !mddev->barrier,
362 			    mddev->write_lock, /*nothing*/);
363 	mddev->barrier = bio;
364 	spin_unlock_irq(&mddev->write_lock);
365 
366 	atomic_set(&mddev->flush_pending, 1);
367 	INIT_WORK(&mddev->barrier_work, md_submit_barrier);
368 
369 	submit_barriers(mddev);
370 
371 	if (atomic_dec_and_test(&mddev->flush_pending))
372 		schedule_work(&mddev->barrier_work);
373 }
374 EXPORT_SYMBOL(md_barrier_request);
375 
376 static inline mddev_t *mddev_get(mddev_t *mddev)
377 {
378 	atomic_inc(&mddev->active);
379 	return mddev;
380 }
381 
382 static void mddev_delayed_delete(struct work_struct *ws);
383 
384 static void mddev_put(mddev_t *mddev)
385 {
386 	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
387 		return;
388 	if (!mddev->raid_disks && list_empty(&mddev->disks) &&
389 	    !mddev->hold_active) {
390 		list_del(&mddev->all_mddevs);
391 		if (mddev->gendisk) {
392 			/* we did a probe so need to clean up.
393 			 * Call schedule_work inside the spinlock
394 			 * so that flush_scheduled_work() after
395 			 * mddev_find will succeed in waiting for the
396 			 * work to be done.
397 			 */
398 			INIT_WORK(&mddev->del_work, mddev_delayed_delete);
399 			schedule_work(&mddev->del_work);
400 		} else
401 			kfree(mddev);
402 	}
403 	spin_unlock(&all_mddevs_lock);
404 }
405 
406 static mddev_t * mddev_find(dev_t unit)
407 {
408 	mddev_t *mddev, *new = NULL;
409 
410  retry:
411 	spin_lock(&all_mddevs_lock);
412 
413 	if (unit) {
414 		list_for_each_entry(mddev, &all_mddevs, all_mddevs)
415 			if (mddev->unit == unit) {
416 				mddev_get(mddev);
417 				spin_unlock(&all_mddevs_lock);
418 				kfree(new);
419 				return mddev;
420 			}
421 
422 		if (new) {
423 			list_add(&new->all_mddevs, &all_mddevs);
424 			spin_unlock(&all_mddevs_lock);
425 			new->hold_active = UNTIL_IOCTL;
426 			return new;
427 		}
428 	} else if (new) {
429 		/* find an unused unit number */
430 		static int next_minor = 512;
431 		int start = next_minor;
432 		int is_free = 0;
433 		int dev = 0;
434 		while (!is_free) {
435 			dev = MKDEV(MD_MAJOR, next_minor);
436 			next_minor++;
437 			if (next_minor > MINORMASK)
438 				next_minor = 0;
439 			if (next_minor == start) {
440 				/* Oh dear, all in use. */
441 				spin_unlock(&all_mddevs_lock);
442 				kfree(new);
443 				return NULL;
444 			}
445 
446 			is_free = 1;
447 			list_for_each_entry(mddev, &all_mddevs, all_mddevs)
448 				if (mddev->unit == dev) {
449 					is_free = 0;
450 					break;
451 				}
452 		}
453 		new->unit = dev;
454 		new->md_minor = MINOR(dev);
455 		new->hold_active = UNTIL_STOP;
456 		list_add(&new->all_mddevs, &all_mddevs);
457 		spin_unlock(&all_mddevs_lock);
458 		return new;
459 	}
460 	spin_unlock(&all_mddevs_lock);
461 
462 	new = kzalloc(sizeof(*new), GFP_KERNEL);
463 	if (!new)
464 		return NULL;
465 
466 	new->unit = unit;
467 	if (MAJOR(unit) == MD_MAJOR)
468 		new->md_minor = MINOR(unit);
469 	else
470 		new->md_minor = MINOR(unit) >> MdpMinorShift;
471 
472 	mutex_init(&new->open_mutex);
473 	mutex_init(&new->reconfig_mutex);
474 	mutex_init(&new->bitmap_info.mutex);
475 	INIT_LIST_HEAD(&new->disks);
476 	INIT_LIST_HEAD(&new->all_mddevs);
477 	init_timer(&new->safemode_timer);
478 	atomic_set(&new->active, 1);
479 	atomic_set(&new->openers, 0);
480 	atomic_set(&new->active_io, 0);
481 	spin_lock_init(&new->write_lock);
482 	atomic_set(&new->flush_pending, 0);
483 	init_waitqueue_head(&new->sb_wait);
484 	init_waitqueue_head(&new->recovery_wait);
485 	new->reshape_position = MaxSector;
486 	new->resync_min = 0;
487 	new->resync_max = MaxSector;
488 	new->level = LEVEL_NONE;
489 
490 	goto retry;
491 }
492 
493 static inline int mddev_lock(mddev_t * mddev)
494 {
495 	return mutex_lock_interruptible(&mddev->reconfig_mutex);
496 }
497 
498 static inline int mddev_is_locked(mddev_t *mddev)
499 {
500 	return mutex_is_locked(&mddev->reconfig_mutex);
501 }
502 
503 static inline int mddev_trylock(mddev_t * mddev)
504 {
505 	return mutex_trylock(&mddev->reconfig_mutex);
506 }
507 
508 static inline void mddev_unlock(mddev_t * mddev)
509 {
510 	mutex_unlock(&mddev->reconfig_mutex);
511 
512 	md_wakeup_thread(mddev->thread);
513 }
514 
515 static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
516 {
517 	mdk_rdev_t *rdev;
518 
519 	list_for_each_entry(rdev, &mddev->disks, same_set)
520 		if (rdev->desc_nr == nr)
521 			return rdev;
522 
523 	return NULL;
524 }
525 
526 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
527 {
528 	mdk_rdev_t *rdev;
529 
530 	list_for_each_entry(rdev, &mddev->disks, same_set)
531 		if (rdev->bdev->bd_dev == dev)
532 			return rdev;
533 
534 	return NULL;
535 }
536 
537 static struct mdk_personality *find_pers(int level, char *clevel)
538 {
539 	struct mdk_personality *pers;
540 	list_for_each_entry(pers, &pers_list, list) {
541 		if (level != LEVEL_NONE && pers->level == level)
542 			return pers;
543 		if (strcmp(pers->name, clevel)==0)
544 			return pers;
545 	}
546 	return NULL;
547 }
548 
549 /* return the offset of the super block in 512-byte sectors */
550 static inline sector_t calc_dev_sboffset(struct block_device *bdev)
551 {
552 	sector_t num_sectors = bdev->bd_inode->i_size / 512;
553 	return MD_NEW_SIZE_SECTORS(num_sectors);
554 }
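
/*
 * Worked example (illustrative, assuming the usual MD_RESERVED_SECTORS
 * value of 128, i.e. 64KiB): a 1,000,000-sector device rounds down to
 * the 128-sector boundary 999,936, and 999,936 - 128 = 999,808, so the
 * 0.90 superblock occupies the last 64KiB-aligned 64KiB of the device.
 */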
555 
556 static int alloc_disk_sb(mdk_rdev_t * rdev)
557 {
558 	if (rdev->sb_page)
559 		MD_BUG();
560 
561 	rdev->sb_page = alloc_page(GFP_KERNEL);
562 	if (!rdev->sb_page) {
563 		printk(KERN_ALERT "md: out of memory.\n");
564 		return -ENOMEM;
565 	}
566 
567 	return 0;
568 }
569 
570 static void free_disk_sb(mdk_rdev_t * rdev)
571 {
572 	if (rdev->sb_page) {
573 		put_page(rdev->sb_page);
574 		rdev->sb_loaded = 0;
575 		rdev->sb_page = NULL;
576 		rdev->sb_start = 0;
577 		rdev->sectors = 0;
578 	}
579 }
580 
581 
582 static void super_written(struct bio *bio, int error)
583 {
584 	mdk_rdev_t *rdev = bio->bi_private;
585 	mddev_t *mddev = rdev->mddev;
586 
587 	if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
588 		printk("md: super_written gets error=%d, uptodate=%d\n",
589 		       error, test_bit(BIO_UPTODATE, &bio->bi_flags));
590 		WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
591 		md_error(mddev, rdev);
592 	}
593 
594 	if (atomic_dec_and_test(&mddev->pending_writes))
595 		wake_up(&mddev->sb_wait);
596 	bio_put(bio);
597 }
598 
599 static void super_written_barrier(struct bio *bio, int error)
600 {
601 	struct bio *bio2 = bio->bi_private;
602 	mdk_rdev_t *rdev = bio2->bi_private;
603 	mddev_t *mddev = rdev->mddev;
604 
605 	if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
606 	    error == -EOPNOTSUPP) {
607 		unsigned long flags;
608 		/* barriers don't appear to be supported :-( */
609 		set_bit(BarriersNotsupp, &rdev->flags);
610 		mddev->barriers_work = 0;
611 		spin_lock_irqsave(&mddev->write_lock, flags);
612 		bio2->bi_next = mddev->biolist;
613 		mddev->biolist = bio2;
614 		spin_unlock_irqrestore(&mddev->write_lock, flags);
615 		wake_up(&mddev->sb_wait);
616 		bio_put(bio);
617 	} else {
618 		bio_put(bio2);
619 		bio->bi_private = rdev;
620 		super_written(bio, error);
621 	}
622 }
623 
624 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
625 		   sector_t sector, int size, struct page *page)
626 {
627 	/* write first size bytes of page to sector of rdev
628 	 * Increment mddev->pending_writes before returning
629 	 * and decrement it on completion, waking up sb_wait
630 	 * if zero is reached.
631 	 * If an error occurred, call md_error
632 	 *
633 	 * As we might need to resubmit the request if BIO_RW_BARRIER
634 	 * causes ENOTSUPP, we allocate a spare bio...
635 	 */
636 	struct bio *bio = bio_alloc(GFP_NOIO, 1);
637 	int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG);
638 
639 	bio->bi_bdev = rdev->bdev;
640 	bio->bi_sector = sector;
641 	bio_add_page(bio, page, size, 0);
642 	bio->bi_private = rdev;
643 	bio->bi_end_io = super_written;
644 	bio->bi_rw = rw;
645 
646 	atomic_inc(&mddev->pending_writes);
647 	if (!test_bit(BarriersNotsupp, &rdev->flags)) {
648 		struct bio *rbio;
649 		rw |= (1<<BIO_RW_BARRIER);
650 		rbio = bio_clone(bio, GFP_NOIO);
651 		rbio->bi_private = bio;
652 		rbio->bi_end_io = super_written_barrier;
653 		submit_bio(rw, rbio);
654 	} else
655 		submit_bio(rw, bio);
656 }
657 
658 void md_super_wait(mddev_t *mddev)
659 {
660 	/* wait for all superblock writes that were scheduled to complete.
661 	 * if any had to be retried (due to BARRIER problems), retry them
662 	 */
663 	DEFINE_WAIT(wq);
664 	for(;;) {
665 		prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
666 		if (atomic_read(&mddev->pending_writes)==0)
667 			break;
668 		while (mddev->biolist) {
669 			struct bio *bio;
670 			spin_lock_irq(&mddev->write_lock);
671 			bio = mddev->biolist;
672 			mddev->biolist = bio->bi_next ;
673 			bio->bi_next = NULL;
674 			spin_unlock_irq(&mddev->write_lock);
675 			submit_bio(bio->bi_rw, bio);
676 		}
677 		schedule();
678 	}
679 	finish_wait(&mddev->sb_wait, &wq);
680 }
681 
682 static void bi_complete(struct bio *bio, int error)
683 {
684 	complete((struct completion*)bio->bi_private);
685 }
686 
687 int sync_page_io(struct block_device *bdev, sector_t sector, int size,
688 		   struct page *page, int rw)
689 {
690 	struct bio *bio = bio_alloc(GFP_NOIO, 1);
691 	struct completion event;
692 	int ret;
693 
694 	rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
695 
696 	bio->bi_bdev = bdev;
697 	bio->bi_sector = sector;
698 	bio_add_page(bio, page, size, 0);
699 	init_completion(&event);
700 	bio->bi_private = &event;
701 	bio->bi_end_io = bi_complete;
702 	submit_bio(rw, bio);
703 	wait_for_completion(&event);
704 
705 	ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
706 	bio_put(bio);
707 	return ret;
708 }
709 EXPORT_SYMBOL_GPL(sync_page_io);
710 
711 static int read_disk_sb(mdk_rdev_t * rdev, int size)
712 {
713 	char b[BDEVNAME_SIZE];
714 	if (!rdev->sb_page) {
715 		MD_BUG();
716 		return -EINVAL;
717 	}
718 	if (rdev->sb_loaded)
719 		return 0;
720 
721 
722 	if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ))
723 		goto fail;
724 	rdev->sb_loaded = 1;
725 	return 0;
726 
727 fail:
728 	printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
729 		bdevname(rdev->bdev,b));
730 	return -EINVAL;
731 }
732 
733 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
734 {
735 	return 	sb1->set_uuid0 == sb2->set_uuid0 &&
736 		sb1->set_uuid1 == sb2->set_uuid1 &&
737 		sb1->set_uuid2 == sb2->set_uuid2 &&
738 		sb1->set_uuid3 == sb2->set_uuid3;
739 }
740 
741 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
742 {
743 	int ret;
744 	mdp_super_t *tmp1, *tmp2;
745 
746 	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
747 	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
748 
749 	if (!tmp1 || !tmp2) {
750 		ret = 0;
751 		printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
752 		goto abort;
753 	}
754 
755 	*tmp1 = *sb1;
756 	*tmp2 = *sb2;
757 
758 	/*
759 	 * nr_disks is not constant
760 	 */
761 	tmp1->nr_disks = 0;
762 	tmp2->nr_disks = 0;
763 
764 	ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
765 abort:
766 	kfree(tmp1);
767 	kfree(tmp2);
768 	return ret;
769 }
770 
771 
772 static u32 md_csum_fold(u32 csum)
773 {
774 	csum = (csum & 0xffff) + (csum >> 16);
775 	return (csum & 0xffff) + (csum >> 16);
776 }
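
/*
 * Illustrative fold (added example): md_csum_fold(0x12345678) computes
 * 0x5678 + 0x1234 = 0x68ac on the first pass and stays 0x68ac on the
 * second, i.e. the 32-bit value collapses to 16 bits with end-around carry.
 */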
777 
778 static unsigned int calc_sb_csum(mdp_super_t * sb)
779 {
780 	u64 newcsum = 0;
781 	u32 *sb32 = (u32*)sb;
782 	int i;
783 	unsigned int disk_csum, csum;
784 
785 	disk_csum = sb->sb_csum;
786 	sb->sb_csum = 0;
787 
788 	for (i = 0; i < MD_SB_BYTES/4 ; i++)
789 		newcsum += sb32[i];
790 	csum = (newcsum & 0xffffffff) + (newcsum>>32);
791 
792 
793 #ifdef CONFIG_ALPHA
794 	/* This used to use csum_partial, which was wrong for several
795 	 * reasons including that different results are returned on
796 	 * different architectures.  It isn't critical that we get exactly
797 	 * the same return value as before (we always csum_fold before
798 	 * testing, and that removes any differences).  However as we
799 	 * know that csum_partial always returned a 16bit value on
800 	 * alphas, do a fold to maximise conformity to previous behaviour.
801 	 */
802 	sb->sb_csum = md_csum_fold(disk_csum);
803 #else
804 	sb->sb_csum = disk_csum;
805 #endif
806 	return csum;
807 }
808 
809 
810 /*
811  * Handle superblock details.
812  * We want to be able to handle multiple superblock formats
813  * so we have a common interface to them all, and an array of
814  * different handlers.
815  * We rely on user-space to write the initial superblock, and support
816  * reading and updating of superblocks.
817  * Interface methods are:
818  *   int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
819  *      loads and validates a superblock on dev.
820  *      if refdev != NULL, compare superblocks on both devices
821  *    Return:
822  *      0 - dev has a superblock that is compatible with refdev
823  *      1 - dev has a superblock that is compatible and newer than refdev
824  *          so dev should be used as the refdev in future
825  *     -EINVAL superblock incompatible or invalid
826  *     -othererror e.g. -EIO
827  *
828  *   int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
829  *      Verify that dev is acceptable into mddev.
830  *       The first time, mddev->raid_disks will be 0, and data from
831  *       dev should be merged in.  Subsequent calls check that dev
832  *       is new enough.  Return 0 or -EINVAL
833  *
834  *   void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
835  *     Update the superblock for rdev with data in mddev
836  *     This does not write to disc.
837  *
838  */
839 
840 struct super_type  {
841 	char		    *name;
842 	struct module	    *owner;
843 	int		    (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev,
844 					  int minor_version);
845 	int		    (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
846 	void		    (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
847 	unsigned long long  (*rdev_size_change)(mdk_rdev_t *rdev,
848 						sector_t num_sectors);
849 };
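
/*
 * Illustrative dispatch (simplified sketch of how callers use the
 * super_types[] table defined further down):
 *
 *	err = super_types[mddev->major_version].
 *		load_super(rdev, refdev, mddev->minor_version);
 *	if (err >= 0)
 *		err = super_types[mddev->major_version].
 *			validate_super(mddev, rdev);
 */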
850 
851 /*
852  * Check that the given mddev has no bitmap.
853  *
854  * This function is called from the run method of all personalities that do not
855  * support bitmaps. It prints an error message and returns non-zero if mddev
856  * has a bitmap. Otherwise, it returns 0.
857  *
858  */
859 int md_check_no_bitmap(mddev_t *mddev)
860 {
861 	if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
862 		return 0;
863 	printk(KERN_ERR "%s: bitmaps are not supported for %s\n",
864 		mdname(mddev), mddev->pers->name);
865 	return 1;
866 }
867 EXPORT_SYMBOL(md_check_no_bitmap);
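
/*
 * Illustrative caller (an assumption about a typical personality, e.g.
 * raid0): its run() method starts with
 *
 *	if (md_check_no_bitmap(mddev))
 *		return -EINVAL;
 *
 * so arrays that carry a bitmap are refused before the personality
 * touches them.
 */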
868 
869 /*
870  * load_super for 0.90.0
871  */
872 static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
873 {
874 	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
875 	mdp_super_t *sb;
876 	int ret;
877 
878 	/*
879 	 * Calculate the position of the superblock (512byte sectors),
880 	 * it's at the end of the disk.
881 	 *
882 	 * It also happens to be a multiple of 4Kb.
883 	 */
884 	rdev->sb_start = calc_dev_sboffset(rdev->bdev);
885 
886 	ret = read_disk_sb(rdev, MD_SB_BYTES);
887 	if (ret) return ret;
888 
889 	ret = -EINVAL;
890 
891 	bdevname(rdev->bdev, b);
892 	sb = (mdp_super_t*)page_address(rdev->sb_page);
893 
894 	if (sb->md_magic != MD_SB_MAGIC) {
895 		printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
896 		       b);
897 		goto abort;
898 	}
899 
900 	if (sb->major_version != 0 ||
901 	    sb->minor_version < 90 ||
902 	    sb->minor_version > 91) {
903 		printk(KERN_WARNING "Bad version number %d.%d on %s\n",
904 			sb->major_version, sb->minor_version,
905 			b);
906 		goto abort;
907 	}
908 
909 	if (sb->raid_disks <= 0)
910 		goto abort;
911 
912 	if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
913 		printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
914 			b);
915 		goto abort;
916 	}
917 
918 	rdev->preferred_minor = sb->md_minor;
919 	rdev->data_offset = 0;
920 	rdev->sb_size = MD_SB_BYTES;
921 
922 	if (sb->level == LEVEL_MULTIPATH)
923 		rdev->desc_nr = -1;
924 	else
925 		rdev->desc_nr = sb->this_disk.number;
926 
927 	if (!refdev) {
928 		ret = 1;
929 	} else {
930 		__u64 ev1, ev2;
931 		mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
932 		if (!uuid_equal(refsb, sb)) {
933 			printk(KERN_WARNING "md: %s has different UUID to %s\n",
934 				b, bdevname(refdev->bdev,b2));
935 			goto abort;
936 		}
937 		if (!sb_equal(refsb, sb)) {
938 			printk(KERN_WARNING "md: %s has same UUID"
939 			       " but different superblock to %s\n",
940 			       b, bdevname(refdev->bdev, b2));
941 			goto abort;
942 		}
943 		ev1 = md_event(sb);
944 		ev2 = md_event(refsb);
945 		if (ev1 > ev2)
946 			ret = 1;
947 		else
948 			ret = 0;
949 	}
950 	rdev->sectors = rdev->sb_start;
951 
952 	if (rdev->sectors < sb->size * 2 && sb->level > 1)
953 		/* "this cannot possibly happen" ... */
954 		ret = -EINVAL;
955 
956  abort:
957 	return ret;
958 }
959 
960 /*
961  * validate_super for 0.90.0
962  */
963 static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
964 {
965 	mdp_disk_t *desc;
966 	mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
967 	__u64 ev1 = md_event(sb);
968 
969 	rdev->raid_disk = -1;
970 	clear_bit(Faulty, &rdev->flags);
971 	clear_bit(In_sync, &rdev->flags);
972 	clear_bit(WriteMostly, &rdev->flags);
973 	clear_bit(BarriersNotsupp, &rdev->flags);
974 
975 	if (mddev->raid_disks == 0) {
976 		mddev->major_version = 0;
977 		mddev->minor_version = sb->minor_version;
978 		mddev->patch_version = sb->patch_version;
979 		mddev->external = 0;
980 		mddev->chunk_sectors = sb->chunk_size >> 9;
981 		mddev->ctime = sb->ctime;
982 		mddev->utime = sb->utime;
983 		mddev->level = sb->level;
984 		mddev->clevel[0] = 0;
985 		mddev->layout = sb->layout;
986 		mddev->raid_disks = sb->raid_disks;
987 		mddev->dev_sectors = sb->size * 2;
988 		mddev->events = ev1;
989 		mddev->bitmap_info.offset = 0;
990 		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
991 
992 		if (mddev->minor_version >= 91) {
993 			mddev->reshape_position = sb->reshape_position;
994 			mddev->delta_disks = sb->delta_disks;
995 			mddev->new_level = sb->new_level;
996 			mddev->new_layout = sb->new_layout;
997 			mddev->new_chunk_sectors = sb->new_chunk >> 9;
998 		} else {
999 			mddev->reshape_position = MaxSector;
1000 			mddev->delta_disks = 0;
1001 			mddev->new_level = mddev->level;
1002 			mddev->new_layout = mddev->layout;
1003 			mddev->new_chunk_sectors = mddev->chunk_sectors;
1004 		}
1005 
1006 		if (sb->state & (1<<MD_SB_CLEAN))
1007 			mddev->recovery_cp = MaxSector;
1008 		else {
1009 			if (sb->events_hi == sb->cp_events_hi &&
1010 				sb->events_lo == sb->cp_events_lo) {
1011 				mddev->recovery_cp = sb->recovery_cp;
1012 			} else
1013 				mddev->recovery_cp = 0;
1014 		}
1015 
1016 		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
1017 		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
1018 		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
1019 		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
1020 
1021 		mddev->max_disks = MD_SB_DISKS;
1022 
1023 		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
1024 		    mddev->bitmap_info.file == NULL)
1025 			mddev->bitmap_info.offset =
1026 				mddev->bitmap_info.default_offset;
1027 
1028 	} else if (mddev->pers == NULL) {
1029 		/* Insist on good event counter while assembling */
1030 		++ev1;
1031 		if (ev1 < mddev->events)
1032 			return -EINVAL;
1033 	} else if (mddev->bitmap) {
1034 		/* if adding to array with a bitmap, then we can accept an
1035 		 * older device ... but not too old.
1036 		 */
1037 		if (ev1 < mddev->bitmap->events_cleared)
1038 			return 0;
1039 	} else {
1040 		if (ev1 < mddev->events)
1041 			/* just a hot-add of a new device, leave raid_disk at -1 */
1042 			return 0;
1043 	}
1044 
1045 	if (mddev->level != LEVEL_MULTIPATH) {
1046 		desc = sb->disks + rdev->desc_nr;
1047 
1048 		if (desc->state & (1<<MD_DISK_FAULTY))
1049 			set_bit(Faulty, &rdev->flags);
1050 		else if (desc->state & (1<<MD_DISK_SYNC) /* &&
1051 			    desc->raid_disk < mddev->raid_disks */) {
1052 			set_bit(In_sync, &rdev->flags);
1053 			rdev->raid_disk = desc->raid_disk;
1054 		} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
1055 			/* active but not in sync implies recovery up to
1056 			 * reshape position.  We don't know exactly where
1057 			 * that is, so set to zero for now */
1058 			if (mddev->minor_version >= 91) {
1059 				rdev->recovery_offset = 0;
1060 				rdev->raid_disk = desc->raid_disk;
1061 			}
1062 		}
1063 		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
1064 			set_bit(WriteMostly, &rdev->flags);
1065 	} else /* MULTIPATH are always insync */
1066 		set_bit(In_sync, &rdev->flags);
1067 	return 0;
1068 }
1069 
1070 /*
1071  * sync_super for 0.90.0
1072  */
1073 static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1074 {
1075 	mdp_super_t *sb;
1076 	mdk_rdev_t *rdev2;
1077 	int next_spare = mddev->raid_disks;
1078 
1079 
1080 	/* make rdev->sb match mddev data..
1081 	 *
1082 	 * 1/ zero out disks
1083 	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
1084 	 * 3/ any empty disks < next_spare become removed
1085 	 *
1086 	 * disks[0] gets initialised to REMOVED because
1087 	 * we cannot be sure from other fields if it has
1088 	 * been initialised or not.
1089 	 */
1090 	int i;
1091 	int active=0, working=0,failed=0,spare=0,nr_disks=0;
1092 
1093 	rdev->sb_size = MD_SB_BYTES;
1094 
1095 	sb = (mdp_super_t*)page_address(rdev->sb_page);
1096 
1097 	memset(sb, 0, sizeof(*sb));
1098 
1099 	sb->md_magic = MD_SB_MAGIC;
1100 	sb->major_version = mddev->major_version;
1101 	sb->patch_version = mddev->patch_version;
1102 	sb->gvalid_words  = 0; /* ignored */
1103 	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
1104 	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
1105 	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
1106 	memcpy(&sb->set_uuid3, mddev->uuid+12,4);
1107 
1108 	sb->ctime = mddev->ctime;
1109 	sb->level = mddev->level;
1110 	sb->size = mddev->dev_sectors / 2;
1111 	sb->raid_disks = mddev->raid_disks;
1112 	sb->md_minor = mddev->md_minor;
1113 	sb->not_persistent = 0;
1114 	sb->utime = mddev->utime;
1115 	sb->state = 0;
1116 	sb->events_hi = (mddev->events>>32);
1117 	sb->events_lo = (u32)mddev->events;
1118 
1119 	if (mddev->reshape_position == MaxSector)
1120 		sb->minor_version = 90;
1121 	else {
1122 		sb->minor_version = 91;
1123 		sb->reshape_position = mddev->reshape_position;
1124 		sb->new_level = mddev->new_level;
1125 		sb->delta_disks = mddev->delta_disks;
1126 		sb->new_layout = mddev->new_layout;
1127 		sb->new_chunk = mddev->new_chunk_sectors << 9;
1128 	}
1129 	mddev->minor_version = sb->minor_version;
1130 	if (mddev->in_sync)
1131 	{
1132 		sb->recovery_cp = mddev->recovery_cp;
1133 		sb->cp_events_hi = (mddev->events>>32);
1134 		sb->cp_events_lo = (u32)mddev->events;
1135 		if (mddev->recovery_cp == MaxSector)
1136 			sb->state = (1<< MD_SB_CLEAN);
1137 	} else
1138 		sb->recovery_cp = 0;
1139 
1140 	sb->layout = mddev->layout;
1141 	sb->chunk_size = mddev->chunk_sectors << 9;
1142 
1143 	if (mddev->bitmap && mddev->bitmap_info.file == NULL)
1144 		sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1145 
1146 	sb->disks[0].state = (1<<MD_DISK_REMOVED);
1147 	list_for_each_entry(rdev2, &mddev->disks, same_set) {
1148 		mdp_disk_t *d;
1149 		int desc_nr;
1150 		int is_active = test_bit(In_sync, &rdev2->flags);
1151 
1152 		if (rdev2->raid_disk >= 0 &&
1153 		    sb->minor_version >= 91)
1154 			/* we have nowhere to store the recovery_offset,
1155 			 * but if it is not below the reshape_position,
1156 			 * we can piggy-back on that.
1157 			 */
1158 			is_active = 1;
1159 		if (rdev2->raid_disk < 0 ||
1160 		    test_bit(Faulty, &rdev2->flags))
1161 			is_active = 0;
1162 		if (is_active)
1163 			desc_nr = rdev2->raid_disk;
1164 		else
1165 			desc_nr = next_spare++;
1166 		rdev2->desc_nr = desc_nr;
1167 		d = &sb->disks[rdev2->desc_nr];
1168 		nr_disks++;
1169 		d->number = rdev2->desc_nr;
1170 		d->major = MAJOR(rdev2->bdev->bd_dev);
1171 		d->minor = MINOR(rdev2->bdev->bd_dev);
1172 		if (is_active)
1173 			d->raid_disk = rdev2->raid_disk;
1174 		else
1175 			d->raid_disk = rdev2->desc_nr; /* compatibility */
1176 		if (test_bit(Faulty, &rdev2->flags))
1177 			d->state = (1<<MD_DISK_FAULTY);
1178 		else if (is_active) {
1179 			d->state = (1<<MD_DISK_ACTIVE);
1180 			if (test_bit(In_sync, &rdev2->flags))
1181 				d->state |= (1<<MD_DISK_SYNC);
1182 			active++;
1183 			working++;
1184 		} else {
1185 			d->state = 0;
1186 			spare++;
1187 			working++;
1188 		}
1189 		if (test_bit(WriteMostly, &rdev2->flags))
1190 			d->state |= (1<<MD_DISK_WRITEMOSTLY);
1191 	}
1192 	/* now set the "removed" and "faulty" bits on any missing devices */
1193 	for (i=0 ; i < mddev->raid_disks ; i++) {
1194 		mdp_disk_t *d = &sb->disks[i];
1195 		if (d->state == 0 && d->number == 0) {
1196 			d->number = i;
1197 			d->raid_disk = i;
1198 			d->state = (1<<MD_DISK_REMOVED);
1199 			d->state |= (1<<MD_DISK_FAULTY);
1200 			failed++;
1201 		}
1202 	}
1203 	sb->nr_disks = nr_disks;
1204 	sb->active_disks = active;
1205 	sb->working_disks = working;
1206 	sb->failed_disks = failed;
1207 	sb->spare_disks = spare;
1208 
1209 	sb->this_disk = sb->disks[rdev->desc_nr];
1210 	sb->sb_csum = calc_sb_csum(sb);
1211 }
1212 
1213 /*
1214  * rdev_size_change for 0.90.0
1215  */
1216 static unsigned long long
1217 super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1218 {
1219 	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1220 		return 0; /* component must fit device */
1221 	if (rdev->mddev->bitmap_info.offset)
1222 		return 0; /* can't move bitmap */
1223 	rdev->sb_start = calc_dev_sboffset(rdev->bdev);
1224 	if (!num_sectors || num_sectors > rdev->sb_start)
1225 		num_sectors = rdev->sb_start;
1226 	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1227 		       rdev->sb_page);
1228 	md_super_wait(rdev->mddev);
1229 	return num_sectors / 2; /* kB for sysfs */
1230 }
1231 
1232 
1233 /*
1234  * version 1 superblock
1235  */
1236 
1237 static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
1238 {
1239 	__le32 disk_csum;
1240 	u32 csum;
1241 	unsigned long long newcsum;
1242 	int size = 256 + le32_to_cpu(sb->max_dev)*2;
1243 	__le32 *isuper = (__le32*)sb;
1244 	int i;
1245 
1246 	disk_csum = sb->sb_csum;
1247 	sb->sb_csum = 0;
1248 	newcsum = 0;
1249 	for (i=0; size>=4; size -= 4 )
1250 		newcsum += le32_to_cpu(*isuper++);
1251 
1252 	if (size == 2)
1253 		newcsum += le16_to_cpu(*(__le16*) isuper);
1254 
1255 	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1256 	sb->sb_csum = disk_csum;
1257 	return cpu_to_le32(csum);
1258 }
1259 
1260 static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1261 {
1262 	struct mdp_superblock_1 *sb;
1263 	int ret;
1264 	sector_t sb_start;
1265 	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1266 	int bmask;
1267 
1268 	/*
1269 	 * Calculate the position of the superblock in 512byte sectors.
1270 	 * It is always aligned to a 4K boundary and
1271 	 * depending on minor_version, it can be:
1272 	 * 0: At least 8K, but less than 12K, from end of device
1273 	 * 1: At start of device
1274 	 * 2: 4K from start of device.
1275 	 */
1276 	switch(minor_version) {
1277 	case 0:
1278 		sb_start = rdev->bdev->bd_inode->i_size >> 9;
1279 		sb_start -= 8*2;
1280 		sb_start &= ~(sector_t)(4*2-1);
1281 		break;
1282 	case 1:
1283 		sb_start = 0;
1284 		break;
1285 	case 2:
1286 		sb_start = 8;
1287 		break;
1288 	default:
1289 		return -EINVAL;
1290 	}
1291 	rdev->sb_start = sb_start;
1292 
1293 	/* superblock is rarely larger than 1K, but it can be larger,
1294 	 * and it is safe to read 4k, so we do that
1295 	 */
1296 	ret = read_disk_sb(rdev, 4096);
1297 	if (ret) return ret;
1298 
1299 
1300 	sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1301 
1302 	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1303 	    sb->major_version != cpu_to_le32(1) ||
1304 	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1305 	    le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1306 	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1307 		return -EINVAL;
1308 
1309 	if (calc_sb_1_csum(sb) != sb->sb_csum) {
1310 		printk("md: invalid superblock checksum on %s\n",
1311 			bdevname(rdev->bdev,b));
1312 		return -EINVAL;
1313 	}
1314 	if (le64_to_cpu(sb->data_size) < 10) {
1315 		printk("md: data_size too small on %s\n",
1316 		       bdevname(rdev->bdev,b));
1317 		return -EINVAL;
1318 	}
1319 
1320 	rdev->preferred_minor = 0xffff;
1321 	rdev->data_offset = le64_to_cpu(sb->data_offset);
1322 	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1323 
1324 	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1325 	bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1326 	if (rdev->sb_size & bmask)
1327 		rdev->sb_size = (rdev->sb_size | bmask) + 1;
1328 
1329 	if (minor_version
1330 	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
1331 		return -EINVAL;
1332 
1333 	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1334 		rdev->desc_nr = -1;
1335 	else
1336 		rdev->desc_nr = le32_to_cpu(sb->dev_number);
1337 
1338 	if (!refdev) {
1339 		ret = 1;
1340 	} else {
1341 		__u64 ev1, ev2;
1342 		struct mdp_superblock_1 *refsb =
1343 			(struct mdp_superblock_1*)page_address(refdev->sb_page);
1344 
1345 		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1346 		    sb->level != refsb->level ||
1347 		    sb->layout != refsb->layout ||
1348 		    sb->chunksize != refsb->chunksize) {
1349 			printk(KERN_WARNING "md: %s has strangely different"
1350 				" superblock to %s\n",
1351 				bdevname(rdev->bdev,b),
1352 				bdevname(refdev->bdev,b2));
1353 			return -EINVAL;
1354 		}
1355 		ev1 = le64_to_cpu(sb->events);
1356 		ev2 = le64_to_cpu(refsb->events);
1357 
1358 		if (ev1 > ev2)
1359 			ret = 1;
1360 		else
1361 			ret = 0;
1362 	}
1363 	if (minor_version)
1364 		rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) -
1365 			le64_to_cpu(sb->data_offset);
1366 	else
1367 		rdev->sectors = rdev->sb_start;
1368 	if (rdev->sectors < le64_to_cpu(sb->data_size))
1369 		return -EINVAL;
1370 	rdev->sectors = le64_to_cpu(sb->data_size);
1371 	if (le64_to_cpu(sb->size) > rdev->sectors)
1372 		return -EINVAL;
1373 	return ret;
1374 }
1375 
1376 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1377 {
1378 	struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1379 	__u64 ev1 = le64_to_cpu(sb->events);
1380 
1381 	rdev->raid_disk = -1;
1382 	clear_bit(Faulty, &rdev->flags);
1383 	clear_bit(In_sync, &rdev->flags);
1384 	clear_bit(WriteMostly, &rdev->flags);
1385 	clear_bit(BarriersNotsupp, &rdev->flags);
1386 
1387 	if (mddev->raid_disks == 0) {
1388 		mddev->major_version = 1;
1389 		mddev->patch_version = 0;
1390 		mddev->external = 0;
1391 		mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1392 		mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
1393 		mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
1394 		mddev->level = le32_to_cpu(sb->level);
1395 		mddev->clevel[0] = 0;
1396 		mddev->layout = le32_to_cpu(sb->layout);
1397 		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1398 		mddev->dev_sectors = le64_to_cpu(sb->size);
1399 		mddev->events = ev1;
1400 		mddev->bitmap_info.offset = 0;
1401 		mddev->bitmap_info.default_offset = 1024 >> 9;
1402 
1403 		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1404 		memcpy(mddev->uuid, sb->set_uuid, 16);
1405 
1406 		mddev->max_disks =  (4096-256)/2;
1407 
1408 		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1409 		    mddev->bitmap_info.file == NULL )
1410 			mddev->bitmap_info.offset =
1411 				(__s32)le32_to_cpu(sb->bitmap_offset);
1412 
1413 		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1414 			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1415 			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1416 			mddev->new_level = le32_to_cpu(sb->new_level);
1417 			mddev->new_layout = le32_to_cpu(sb->new_layout);
1418 			mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1419 		} else {
1420 			mddev->reshape_position = MaxSector;
1421 			mddev->delta_disks = 0;
1422 			mddev->new_level = mddev->level;
1423 			mddev->new_layout = mddev->layout;
1424 			mddev->new_chunk_sectors = mddev->chunk_sectors;
1425 		}
1426 
1427 	} else if (mddev->pers == NULL) {
1428 		/* Insist on a good event counter while assembling */
1429 		++ev1;
1430 		if (ev1 < mddev->events)
1431 			return -EINVAL;
1432 	} else if (mddev->bitmap) {
1433 		/* If adding to array with a bitmap, then we can accept an
1434 		 * older device, but not too old.
1435 		 */
1436 		if (ev1 < mddev->bitmap->events_cleared)
1437 			return 0;
1438 	} else {
1439 		if (ev1 < mddev->events)
1440 			/* just a hot-add of a new device, leave raid_disk at -1 */
1441 			return 0;
1442 	}
1443 	if (mddev->level != LEVEL_MULTIPATH) {
1444 		int role;
1445 		if (rdev->desc_nr < 0 ||
1446 		    rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1447 			role = 0xffff;
1448 			rdev->desc_nr = -1;
1449 		} else
1450 			role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1451 		switch(role) {
1452 		case 0xffff: /* spare */
1453 			break;
1454 		case 0xfffe: /* faulty */
1455 			set_bit(Faulty, &rdev->flags);
1456 			break;
1457 		default:
1458 			if ((le32_to_cpu(sb->feature_map) &
1459 			     MD_FEATURE_RECOVERY_OFFSET))
1460 				rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1461 			else
1462 				set_bit(In_sync, &rdev->flags);
1463 			rdev->raid_disk = role;
1464 			break;
1465 		}
1466 		if (sb->devflags & WriteMostly1)
1467 			set_bit(WriteMostly, &rdev->flags);
1468 	} else /* MULTIPATH are always insync */
1469 		set_bit(In_sync, &rdev->flags);
1470 
1471 	return 0;
1472 }
1473 
1474 static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1475 {
1476 	struct mdp_superblock_1 *sb;
1477 	mdk_rdev_t *rdev2;
1478 	int max_dev, i;
1479 	/* make rdev->sb match mddev and rdev data. */
1480 
1481 	sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1482 
1483 	sb->feature_map = 0;
1484 	sb->pad0 = 0;
1485 	sb->recovery_offset = cpu_to_le64(0);
1486 	memset(sb->pad1, 0, sizeof(sb->pad1));
1487 	memset(sb->pad2, 0, sizeof(sb->pad2));
1488 	memset(sb->pad3, 0, sizeof(sb->pad3));
1489 
1490 	sb->utime = cpu_to_le64((__u64)mddev->utime);
1491 	sb->events = cpu_to_le64(mddev->events);
1492 	if (mddev->in_sync)
1493 		sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1494 	else
1495 		sb->resync_offset = cpu_to_le64(0);
1496 
1497 	sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
1498 
1499 	sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1500 	sb->size = cpu_to_le64(mddev->dev_sectors);
1501 	sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
1502 	sb->level = cpu_to_le32(mddev->level);
1503 	sb->layout = cpu_to_le32(mddev->layout);
1504 
1505 	if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
1506 		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
1507 		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1508 	}
1509 
1510 	if (rdev->raid_disk >= 0 &&
1511 	    !test_bit(In_sync, &rdev->flags)) {
1512 		sb->feature_map |=
1513 			cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1514 		sb->recovery_offset =
1515 			cpu_to_le64(rdev->recovery_offset);
1516 	}
1517 
1518 	if (mddev->reshape_position != MaxSector) {
1519 		sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1520 		sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1521 		sb->new_layout = cpu_to_le32(mddev->new_layout);
1522 		sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1523 		sb->new_level = cpu_to_le32(mddev->new_level);
1524 		sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
1525 	}
1526 
1527 	max_dev = 0;
1528 	list_for_each_entry(rdev2, &mddev->disks, same_set)
1529 		if (rdev2->desc_nr+1 > max_dev)
1530 			max_dev = rdev2->desc_nr+1;
1531 
1532 	if (max_dev > le32_to_cpu(sb->max_dev)) {
1533 		int bmask;
1534 		sb->max_dev = cpu_to_le32(max_dev);
1535 		rdev->sb_size = max_dev * 2 + 256;
1536 		bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1537 		if (rdev->sb_size & bmask)
1538 			rdev->sb_size = (rdev->sb_size | bmask) + 1;
1539 	}
1540 	for (i=0; i<max_dev;i++)
1541 		sb->dev_roles[i] = cpu_to_le16(0xfffe);
1542 
1543 	list_for_each_entry(rdev2, &mddev->disks, same_set) {
1544 		i = rdev2->desc_nr;
1545 		if (test_bit(Faulty, &rdev2->flags))
1546 			sb->dev_roles[i] = cpu_to_le16(0xfffe);
1547 		else if (test_bit(In_sync, &rdev2->flags))
1548 			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1549 		else if (rdev2->raid_disk >= 0)
1550 			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1551 		else
1552 			sb->dev_roles[i] = cpu_to_le16(0xffff);
1553 	}
1554 
1555 	sb->sb_csum = calc_sb_1_csum(sb);
1556 }
1557 
1558 static unsigned long long
1559 super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1560 {
1561 	struct mdp_superblock_1 *sb;
1562 	sector_t max_sectors;
1563 	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1564 		return 0; /* component must fit device */
1565 	if (rdev->sb_start < rdev->data_offset) {
1566 		/* minor versions 1 and 2; superblock before data */
1567 		max_sectors = rdev->bdev->bd_inode->i_size >> 9;
1568 		max_sectors -= rdev->data_offset;
1569 		if (!num_sectors || num_sectors > max_sectors)
1570 			num_sectors = max_sectors;
1571 	} else if (rdev->mddev->bitmap_info.offset) {
1572 		/* minor version 0 with bitmap we can't move */
1573 		return 0;
1574 	} else {
1575 		/* minor version 0; superblock after data */
1576 		sector_t sb_start;
1577 		sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
1578 		sb_start &= ~(sector_t)(4*2 - 1);
1579 		max_sectors = rdev->sectors + sb_start - rdev->sb_start;
1580 		if (!num_sectors || num_sectors > max_sectors)
1581 			num_sectors = max_sectors;
1582 		rdev->sb_start = sb_start;
1583 	}
1584 	sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page);
1585 	sb->data_size = cpu_to_le64(num_sectors);
1586 	sb->super_offset = rdev->sb_start;
1587 	sb->sb_csum = calc_sb_1_csum(sb);
1588 	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1589 		       rdev->sb_page);
1590 	md_super_wait(rdev->mddev);
1591 	return num_sectors / 2; /* kB for sysfs */
1592 }
1593 
1594 static struct super_type super_types[] = {
1595 	[0] = {
1596 		.name	= "0.90.0",
1597 		.owner	= THIS_MODULE,
1598 		.load_super	    = super_90_load,
1599 		.validate_super	    = super_90_validate,
1600 		.sync_super	    = super_90_sync,
1601 		.rdev_size_change   = super_90_rdev_size_change,
1602 	},
1603 	[1] = {
1604 		.name	= "md-1",
1605 		.owner	= THIS_MODULE,
1606 		.load_super	    = super_1_load,
1607 		.validate_super	    = super_1_validate,
1608 		.sync_super	    = super_1_sync,
1609 		.rdev_size_change   = super_1_rdev_size_change,
1610 	},
1611 };
1612 
1613 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
1614 {
1615 	mdk_rdev_t *rdev, *rdev2;
1616 
1617 	rcu_read_lock();
1618 	rdev_for_each_rcu(rdev, mddev1)
1619 		rdev_for_each_rcu(rdev2, mddev2)
1620 			if (rdev->bdev->bd_contains ==
1621 			    rdev2->bdev->bd_contains) {
1622 				rcu_read_unlock();
1623 				return 1;
1624 			}
1625 	rcu_read_unlock();
1626 	return 0;
1627 }
1628 
1629 static LIST_HEAD(pending_raid_disks);
1630 
1631 /*
1632  * Try to register data integrity profile for an mddev
1633  *
1634  * This is called when an array is started and after a disk has been kicked
1635  * from the array. It only succeeds if all working and active component devices
1636  * are integrity capable with matching profiles.
1637  */
1638 int md_integrity_register(mddev_t *mddev)
1639 {
1640 	mdk_rdev_t *rdev, *reference = NULL;
1641 
1642 	if (list_empty(&mddev->disks))
1643 		return 0; /* nothing to do */
1644 	if (blk_get_integrity(mddev->gendisk))
1645 		return 0; /* already registered */
1646 	list_for_each_entry(rdev, &mddev->disks, same_set) {
1647 		/* skip spares and non-functional disks */
1648 		if (test_bit(Faulty, &rdev->flags))
1649 			continue;
1650 		if (rdev->raid_disk < 0)
1651 			continue;
1652 		/*
1653 		 * If at least one rdev is not integrity capable, we cannot
1654 		 * enable data integrity for the md device.
1655 		 */
1656 		if (!bdev_get_integrity(rdev->bdev))
1657 			return -EINVAL;
1658 		if (!reference) {
1659 			/* Use the first rdev as the reference */
1660 			reference = rdev;
1661 			continue;
1662 		}
1663 		/* does this rdev's profile match the reference profile? */
1664 		if (blk_integrity_compare(reference->bdev->bd_disk,
1665 				rdev->bdev->bd_disk) < 0)
1666 			return -EINVAL;
1667 	}
1668 	/*
1669 	 * All component devices are integrity capable and have matching
1670 	 * profiles, register the common profile for the md device.
1671 	 */
1672 	if (blk_integrity_register(mddev->gendisk,
1673 			bdev_get_integrity(reference->bdev)) != 0) {
1674 		printk(KERN_ERR "md: failed to register integrity for %s\n",
1675 			mdname(mddev));
1676 		return -EINVAL;
1677 	}
1678 	printk(KERN_NOTICE "md: data integrity on %s enabled\n",
1679 		mdname(mddev));
1680 	return 0;
1681 }
1682 EXPORT_SYMBOL(md_integrity_register);
1683 
1684 /* Disable data integrity if non-capable/non-matching disk is being added */
1685 void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
1686 {
1687 	struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev);
1688 	struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk);
1689 
1690 	if (!bi_mddev) /* nothing to do */
1691 		return;
1692 	if (rdev->raid_disk < 0) /* skip spares */
1693 		return;
1694 	if (bi_rdev && blk_integrity_compare(mddev->gendisk,
1695 					     rdev->bdev->bd_disk) >= 0)
1696 		return;
1697 	printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev));
1698 	blk_integrity_unregister(mddev->gendisk);
1699 }
1700 EXPORT_SYMBOL(md_integrity_add_rdev);
1701 
1702 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1703 {
1704 	char b[BDEVNAME_SIZE];
1705 	struct kobject *ko;
1706 	char *s;
1707 	int err;
1708 
1709 	if (rdev->mddev) {
1710 		MD_BUG();
1711 		return -EINVAL;
1712 	}
1713 
1714 	/* prevent duplicates */
1715 	if (find_rdev(mddev, rdev->bdev->bd_dev))
1716 		return -EEXIST;
1717 
1718 	/* make sure rdev->sectors exceeds mddev->dev_sectors */
1719 	if (rdev->sectors && (mddev->dev_sectors == 0 ||
1720 			rdev->sectors < mddev->dev_sectors)) {
1721 		if (mddev->pers) {
1722 			/* Cannot change size, so fail
1723 			 * If mddev->level <= 0, then we don't care
1724 			 * about aligning sizes (e.g. linear)
1725 			 */
1726 			if (mddev->level > 0)
1727 				return -ENOSPC;
1728 		} else
1729 			mddev->dev_sectors = rdev->sectors;
1730 	}
1731 
1732 	/* Verify rdev->desc_nr is unique.
1733 	 * If it is -1, assign a free number, else
1734 	 * check number is not in use
1735 	 */
1736 	if (rdev->desc_nr < 0) {
1737 		int choice = 0;
1738 		if (mddev->pers) choice = mddev->raid_disks;
1739 		while (find_rdev_nr(mddev, choice))
1740 			choice++;
1741 		rdev->desc_nr = choice;
1742 	} else {
1743 		if (find_rdev_nr(mddev, rdev->desc_nr))
1744 			return -EBUSY;
1745 	}
1746 	if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
1747 		printk(KERN_WARNING "md: %s: array is limited to %d devices\n",
1748 		       mdname(mddev), mddev->max_disks);
1749 		return -EBUSY;
1750 	}
1751 	bdevname(rdev->bdev,b);
1752 	while ( (s=strchr(b, '/')) != NULL)
1753 		*s = '!';
1754 
1755 	rdev->mddev = mddev;
1756 	printk(KERN_INFO "md: bind<%s>\n", b);
1757 
1758 	if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
1759 		goto fail;
1760 
1761 	ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
1762 	if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) {
1763 		kobject_del(&rdev->kobj);
1764 		goto fail;
1765 	}
1766 	rdev->sysfs_state = sysfs_get_dirent(rdev->kobj.sd, "state");
1767 
1768 	list_add_rcu(&rdev->same_set, &mddev->disks);
1769 	bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
1770 
1771 	/* May as well allow recovery to be retried once */
1772 	mddev->recovery_disabled = 0;
1773 
1774 	return 0;
1775 
1776  fail:
1777 	printk(KERN_WARNING "md: failed to register dev-%s for %s\n",
1778 	       b, mdname(mddev));
1779 	return err;
1780 }
1781 
1782 static void md_delayed_delete(struct work_struct *ws)
1783 {
1784 	mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work);
1785 	kobject_del(&rdev->kobj);
1786 	kobject_put(&rdev->kobj);
1787 }
1788 
1789 static void unbind_rdev_from_array(mdk_rdev_t * rdev)
1790 {
1791 	char b[BDEVNAME_SIZE];
1792 	if (!rdev->mddev) {
1793 		MD_BUG();
1794 		return;
1795 	}
1796 	bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
1797 	list_del_rcu(&rdev->same_set);
1798 	printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
1799 	rdev->mddev = NULL;
1800 	sysfs_remove_link(&rdev->kobj, "block");
1801 	sysfs_put(rdev->sysfs_state);
1802 	rdev->sysfs_state = NULL;
1803 	/* We need to delay this, otherwise we can deadlock when
1804 	 * writing 'remove' to "dev/state".  We also need
1805 	 * to delay it due to rcu usage.
1806 	 */
1807 	synchronize_rcu();
1808 	INIT_WORK(&rdev->del_work, md_delayed_delete);
1809 	kobject_get(&rdev->kobj);
1810 	schedule_work(&rdev->del_work);
1811 }
1812 
1813 /*
1814  * prevent the device from being mounted, repartitioned or
1815  * otherwise reused by a RAID array (or any other kernel
1816  * subsystem), by bd_claiming the device.
1817  */
1818 static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared)
1819 {
1820 	int err = 0;
1821 	struct block_device *bdev;
1822 	char b[BDEVNAME_SIZE];
1823 
1824 	bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
1825 	if (IS_ERR(bdev)) {
1826 		printk(KERN_ERR "md: could not open %s.\n",
1827 			__bdevname(dev, b));
1828 		return PTR_ERR(bdev);
1829 	}
1830 	err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev);
1831 	if (err) {
1832 		printk(KERN_ERR "md: could not bd_claim %s.\n",
1833 			bdevname(bdev, b));
1834 		blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
1835 		return err;
1836 	}
1837 	if (!shared)
1838 		set_bit(AllReserved, &rdev->flags);
1839 	rdev->bdev = bdev;
1840 	return err;
1841 }
1842 
1843 static void unlock_rdev(mdk_rdev_t *rdev)
1844 {
1845 	struct block_device *bdev = rdev->bdev;
1846 	rdev->bdev = NULL;
1847 	if (!bdev)
1848 		MD_BUG();
1849 	bd_release(bdev);
1850 	blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
1851 }
1852 
1853 void md_autodetect_dev(dev_t dev);
1854 
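/*
 * Final teardown of an rdev that is no longer bound to any array: free the
 * cached superblock page, hand the device back to md's autodetect handling
 * if it was autodetected originally, release the underlying block device
 * and drop the last kobject reference.
 */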
1855 static void export_rdev(mdk_rdev_t * rdev)
1856 {
1857 	char b[BDEVNAME_SIZE];
1858 	printk(KERN_INFO "md: export_rdev(%s)\n",
1859 		bdevname(rdev->bdev,b));
1860 	if (rdev->mddev)
1861 		MD_BUG();
1862 	free_disk_sb(rdev);
1863 #ifndef MODULE
1864 	if (test_bit(AutoDetected, &rdev->flags))
1865 		md_autodetect_dev(rdev->bdev->bd_dev);
1866 #endif
1867 	unlock_rdev(rdev);
1868 	kobject_put(&rdev->kobj);
1869 }
1870 
1871 static void kick_rdev_from_array(mdk_rdev_t * rdev)
1872 {
1873 	unbind_rdev_from_array(rdev);
1874 	export_rdev(rdev);
1875 }
1876 
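/*
 * Detach and release every rdev in the array, leaving the mddev with an
 * empty disk list and no configured raid_disks or superblock version.
 */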
1877 static void export_array(mddev_t *mddev)
1878 {
1879 	mdk_rdev_t *rdev, *tmp;
1880 
1881 	rdev_for_each(rdev, tmp, mddev) {
1882 		if (!rdev->mddev) {
1883 			MD_BUG();
1884 			continue;
1885 		}
1886 		kick_rdev_from_array(rdev);
1887 	}
1888 	if (!list_empty(&mddev->disks))
1889 		MD_BUG();
1890 	mddev->raid_disks = 0;
1891 	mddev->major_version = 0;
1892 }
1893 
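/*
 * The print_* helpers below dump superblock and rdev state to the kernel
 * log; they are used by md_print_devices() to print the complete raid
 * state.
 */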
1894 static void print_desc(mdp_disk_t *desc)
1895 {
1896 	printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
1897 		desc->major,desc->minor,desc->raid_disk,desc->state);
1898 }
1899 
1900 static void print_sb_90(mdp_super_t *sb)
1901 {
1902 	int i;
1903 
1904 	printk(KERN_INFO
1905 		"md:  SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
1906 		sb->major_version, sb->minor_version, sb->patch_version,
1907 		sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
1908 		sb->ctime);
1909 	printk(KERN_INFO "md:     L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
1910 		sb->level, sb->size, sb->nr_disks, sb->raid_disks,
1911 		sb->md_minor, sb->layout, sb->chunk_size);
1912 	printk(KERN_INFO "md:     UT:%08x ST:%d AD:%d WD:%d"
1913 		" FD:%d SD:%d CSUM:%08x E:%08lx\n",
1914 		sb->utime, sb->state, sb->active_disks, sb->working_disks,
1915 		sb->failed_disks, sb->spare_disks,
1916 		sb->sb_csum, (unsigned long)sb->events_lo);
1917 
1918 	printk(KERN_INFO);
1919 	for (i = 0; i < MD_SB_DISKS; i++) {
1920 		mdp_disk_t *desc;
1921 
1922 		desc = sb->disks + i;
1923 		if (desc->number || desc->major || desc->minor ||
1924 		    desc->raid_disk || (desc->state && (desc->state != 4))) {
1925 			printk("     D %2d: ", i);
1926 			print_desc(desc);
1927 		}
1928 	}
1929 	printk(KERN_INFO "md:     THIS: ");
1930 	print_desc(&sb->this_disk);
1931 }
1932 
1933 static void print_sb_1(struct mdp_superblock_1 *sb)
1934 {
1935 	__u8 *uuid;
1936 
1937 	uuid = sb->set_uuid;
1938 	printk(KERN_INFO
1939 	       "md:  SB: (V:%u) (F:0x%08x) Array-ID:<%pU>\n"
1940 	       "md:    Name: \"%s\" CT:%llu\n",
1941 		le32_to_cpu(sb->major_version),
1942 		le32_to_cpu(sb->feature_map),
1943 		uuid,
1944 		sb->set_name,
1945 		(unsigned long long)le64_to_cpu(sb->ctime)
1946 		       & MD_SUPERBLOCK_1_TIME_SEC_MASK);
1947 
1948 	uuid = sb->device_uuid;
1949 	printk(KERN_INFO
1950 	       "md:       L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu"
1951 			" RO:%llu\n"
1952 	       "md:     Dev:%08x UUID: %pU\n"
1953 	       "md:       (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n"
1954 	       "md:         (MaxDev:%u) \n",
1955 		le32_to_cpu(sb->level),
1956 		(unsigned long long)le64_to_cpu(sb->size),
1957 		le32_to_cpu(sb->raid_disks),
1958 		le32_to_cpu(sb->layout),
1959 		le32_to_cpu(sb->chunksize),
1960 		(unsigned long long)le64_to_cpu(sb->data_offset),
1961 		(unsigned long long)le64_to_cpu(sb->data_size),
1962 		(unsigned long long)le64_to_cpu(sb->super_offset),
1963 		(unsigned long long)le64_to_cpu(sb->recovery_offset),
1964 		le32_to_cpu(sb->dev_number),
1965 		uuid,
1966 		sb->devflags,
1967 		(unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK,
1968 		(unsigned long long)le64_to_cpu(sb->events),
1969 		(unsigned long long)le64_to_cpu(sb->resync_offset),
1970 		le32_to_cpu(sb->sb_csum),
1971 		le32_to_cpu(sb->max_dev)
1972 		);
1973 }
1974 
1975 static void print_rdev(mdk_rdev_t *rdev, int major_version)
1976 {
1977 	char b[BDEVNAME_SIZE];
1978 	printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n",
1979 		bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors,
1980 	        test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
1981 	        rdev->desc_nr);
1982 	if (rdev->sb_loaded) {
1983 		printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version);
1984 		switch (major_version) {
1985 		case 0:
1986 			print_sb_90((mdp_super_t*)page_address(rdev->sb_page));
1987 			break;
1988 		case 1:
1989 			print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page));
1990 			break;
1991 		}
1992 	} else
1993 		printk(KERN_INFO "md: no rdev superblock!\n");
1994 }
1995 
1996 static void md_print_devices(void)
1997 {
1998 	struct list_head *tmp;
1999 	mdk_rdev_t *rdev;
2000 	mddev_t *mddev;
2001 	char b[BDEVNAME_SIZE];
2002 
2003 	printk("\n");
2004 	printk("md:	**********************************\n");
2005 	printk("md:	* <COMPLETE RAID STATE PRINTOUT> *\n");
2006 	printk("md:	**********************************\n");
2007 	for_each_mddev(mddev, tmp) {
2008 
2009 		if (mddev->bitmap)
2010 			bitmap_print_sb(mddev->bitmap);
2011 		else
2012 			printk("%s: ", mdname(mddev));
2013 		list_for_each_entry(rdev, &mddev->disks, same_set)
2014 			printk("<%s>", bdevname(rdev->bdev,b));
2015 		printk("\n");
2016 
2017 		list_for_each_entry(rdev, &mddev->disks, same_set)
2018 			print_rdev(rdev, mddev->major_version);
2019 	}
2020 	printk("md:	**********************************\n");
2021 	printk("\n");
2022 }
2023 
2024 
2025 static void sync_sbs(mddev_t * mddev, int nospares)
2026 {
2027 	/* Update each superblock (in-memory image), but
2028 	 * if we are allowed to, skip spares which already
2029 	 * have the right event counter, or have one earlier
2030 	 * (which would mean they aren't being marked as dirty
2031 	 * with the rest of the array)
2032 	 */
2033 	mdk_rdev_t *rdev;
2034 
2035 	/* First make sure individual recovery_offsets are correct */
2036 	list_for_each_entry(rdev, &mddev->disks, same_set) {
2037 		if (rdev->raid_disk >= 0 &&
2038 		    !test_bit(In_sync, &rdev->flags) &&
2039 		    mddev->curr_resync_completed > rdev->recovery_offset)
2040 				rdev->recovery_offset = mddev->curr_resync_completed;
2041 
2042 	}
2043 	list_for_each_entry(rdev, &mddev->disks, same_set) {
2044 		if (rdev->sb_events == mddev->events ||
2045 		    (nospares &&
2046 		     rdev->raid_disk < 0 &&
2047 		     (rdev->sb_events&1)==0 &&
2048 		     rdev->sb_events+1 == mddev->events)) {
2049 			/* Don't update this superblock */
2050 			rdev->sb_loaded = 2;
2051 		} else {
2052 			super_types[mddev->major_version].
2053 				sync_super(mddev, rdev);
2054 			rdev->sb_loaded = 1;
2055 		}
2056 	}
2057 }
2058 
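/*
 * Write the array metadata out to every member device.  The event count is
 * bumped (or, for a pure clean<->dirty transition, rolled back) before the
 * in-memory superblocks are refreshed by sync_sbs() and written with
 * md_super_write(); if further changes arrive while the writes are in
 * flight, the whole sequence is repeated.
 */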
2059 static void md_update_sb(mddev_t * mddev, int force_change)
2060 {
2061 	mdk_rdev_t *rdev;
2062 	int sync_req;
2063 	int nospares = 0;
2064 
2065 	mddev->utime = get_seconds();
2066 	if (mddev->external)
2067 		return;
2068 repeat:
2069 	spin_lock_irq(&mddev->write_lock);
2070 
2071 	set_bit(MD_CHANGE_PENDING, &mddev->flags);
2072 	if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
2073 		force_change = 1;
2074 	if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
2075 		/* just a clean <-> dirty transition, so possibly leave spares alone,
2076 		 * though if the events count isn't the right even/odd value, we will
2077 		 * have to update the spares after all
2078 		 */
2079 		nospares = 1;
2080 	if (force_change)
2081 		nospares = 0;
2082 	if (mddev->degraded)
2083 		/* If the array is degraded, then skipping spares is both
2084 		 * dangerous and fairly pointless.
2085 		 * Dangerous because a device that was removed from the array
2086 		 * might have an event_count that still looks up-to-date,
2087 		 * so it can be re-added without a resync.
2088 		 * Pointless because if there are any spares to skip,
2089 		 * then a recovery will happen and soon that array won't
2090 		 * be degraded any more and the spare can go back to sleep.
2091 		 */
2092 		nospares = 0;
2093 
2094 	sync_req = mddev->in_sync;
2095 
2096 	/* If this is just a dirty<->clean transition, and the array is clean
2097 	 * and 'events' is odd, we can roll back to the previous clean state */
2098 	if (nospares
2099 	    && (mddev->in_sync && mddev->recovery_cp == MaxSector)
2100 	    && (mddev->events & 1)
2101 	    && mddev->events != 1)
2102 		mddev->events--;
2103 	else {
2104 		/* otherwise we have to go forward and ... */
2105 		mddev->events ++;
2106 		if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
2107 			/* .. if the array isn't clean, an 'even' event must also go
2108 			 * to spares. */
2109 			if ((mddev->events&1)==0)
2110 				nospares = 0;
2111 		} else {
2112 			/* otherwise an 'odd' event must go to spares */
2113 			if ((mddev->events&1))
2114 				nospares = 0;
2115 		}
2116 	}
2117 
2118 	if (!mddev->events) {
2119 		/*
2120 		 * oops, this 64-bit counter should never wrap.
2121 		 * Either we are in around ~1 trillion A.C., assuming
2122 		 * Either we are somewhere around the year 1 trillion A.D., assuming
2123 		 */
2124 		MD_BUG();
2125 		mddev->events --;
2126 	}
2127 
2128 	/*
2129 	 * do not write anything to disk if using
2130 	 * nonpersistent superblocks
2131 	 */
2132 	if (!mddev->persistent) {
2133 		if (!mddev->external)
2134 			clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2135 
2136 		spin_unlock_irq(&mddev->write_lock);
2137 		wake_up(&mddev->sb_wait);
2138 		return;
2139 	}
2140 	sync_sbs(mddev, nospares);
2141 	spin_unlock_irq(&mddev->write_lock);
2142 
2143 	dprintk(KERN_INFO
2144 		"md: updating %s RAID superblock on device (in sync %d)\n",
2145 		mdname(mddev),mddev->in_sync);
2146 
2147 	bitmap_update_sb(mddev->bitmap);
2148 	list_for_each_entry(rdev, &mddev->disks, same_set) {
2149 		char b[BDEVNAME_SIZE];
2150 		dprintk(KERN_INFO "md: ");
2151 		if (rdev->sb_loaded != 1)
2152 			continue; /* no noise on spare devices */
2153 		if (test_bit(Faulty, &rdev->flags))
2154 			dprintk("(skipping faulty ");
2155 
2156 		dprintk("%s ", bdevname(rdev->bdev,b));
2157 		if (!test_bit(Faulty, &rdev->flags)) {
2158 			md_super_write(mddev,rdev,
2159 				       rdev->sb_start, rdev->sb_size,
2160 				       rdev->sb_page);
2161 			dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
2162 				bdevname(rdev->bdev,b),
2163 				(unsigned long long)rdev->sb_start);
2164 			rdev->sb_events = mddev->events;
2165 
2166 		} else
2167 			dprintk(")\n");
2168 		if (mddev->level == LEVEL_MULTIPATH)
2169 			/* only need to write one superblock... */
2170 			break;
2171 	}
2172 	md_super_wait(mddev);
2173 	/* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
2174 
2175 	spin_lock_irq(&mddev->write_lock);
2176 	if (mddev->in_sync != sync_req ||
2177 	    test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
2178 		/* have to write it out again */
2179 		spin_unlock_irq(&mddev->write_lock);
2180 		goto repeat;
2181 	}
2182 	clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2183 	spin_unlock_irq(&mddev->write_lock);
2184 	wake_up(&mddev->sb_wait);
2185 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2186 		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2187 
2188 }
2189 
2190 /* words written to sysfs files may, or may not, be \n terminated.
2191  * We want to accept either case. For this we use cmd_match.
2192  */
2193 static int cmd_match(const char *cmd, const char *str)
2194 {
2195 	/* See if cmd, written into a sysfs file, matches
2196 	 * str.  They must either be the same, or cmd can
2197 	 * have a trailing newline
2198 	 */
2199 	while (*cmd && *str && *cmd == *str) {
2200 		cmd++;
2201 		str++;
2202 	}
2203 	if (*cmd == '\n')
2204 		cmd++;
2205 	if (*str || *cmd)
2206 		return 0;
2207 	return 1;
2208 }
2209 
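/*
 * Per-rdev sysfs attributes: each entry wraps show/store methods that take
 * the mdk_rdev_t directly; rdev_attr_show()/rdev_attr_store() below do the
 * locking and dispatch.
 */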
2210 struct rdev_sysfs_entry {
2211 	struct attribute attr;
2212 	ssize_t (*show)(mdk_rdev_t *, char *);
2213 	ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
2214 };
2215 
2216 static ssize_t
2217 state_show(mdk_rdev_t *rdev, char *page)
2218 {
2219 	char *sep = "";
2220 	size_t len = 0;
2221 
2222 	if (test_bit(Faulty, &rdev->flags)) {
2223 		len+= sprintf(page+len, "%sfaulty",sep);
2224 		sep = ",";
2225 	}
2226 	if (test_bit(In_sync, &rdev->flags)) {
2227 		len += sprintf(page+len, "%sin_sync",sep);
2228 		sep = ",";
2229 	}
2230 	if (test_bit(WriteMostly, &rdev->flags)) {
2231 		len += sprintf(page+len, "%swrite_mostly",sep);
2232 		sep = ",";
2233 	}
2234 	if (test_bit(Blocked, &rdev->flags)) {
2235 		len += sprintf(page+len, "%sblocked", sep);
2236 		sep = ",";
2237 	}
2238 	if (!test_bit(Faulty, &rdev->flags) &&
2239 	    !test_bit(In_sync, &rdev->flags)) {
2240 		len += sprintf(page+len, "%sspare", sep);
2241 		sep = ",";
2242 	}
2243 	return len+sprintf(page+len, "\n");
2244 }
2245 
2246 static ssize_t
2247 state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2248 {
2249 	/* can write
2250 	 *  faulty  - simulates an error
2251 	 *  remove  - disconnects the device
2252 	 *  writemostly - sets write_mostly
2253 	 *  -writemostly - clears write_mostly
2254 	 *  blocked - sets the Blocked flag
2255 	 *  -blocked - clears the Blocked flag
2256 	 *  insync - sets In_sync provided the device isn't active
2257 	 */
2258 	int err = -EINVAL;
2259 	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
2260 		md_error(rdev->mddev, rdev);
2261 		err = 0;
2262 	} else if (cmd_match(buf, "remove")) {
2263 		if (rdev->raid_disk >= 0)
2264 			err = -EBUSY;
2265 		else {
2266 			mddev_t *mddev = rdev->mddev;
2267 			kick_rdev_from_array(rdev);
2268 			if (mddev->pers)
2269 				md_update_sb(mddev, 1);
2270 			md_new_event(mddev);
2271 			err = 0;
2272 		}
2273 	} else if (cmd_match(buf, "writemostly")) {
2274 		set_bit(WriteMostly, &rdev->flags);
2275 		err = 0;
2276 	} else if (cmd_match(buf, "-writemostly")) {
2277 		clear_bit(WriteMostly, &rdev->flags);
2278 		err = 0;
2279 	} else if (cmd_match(buf, "blocked")) {
2280 		set_bit(Blocked, &rdev->flags);
2281 		err = 0;
2282 	} else if (cmd_match(buf, "-blocked")) {
2283 		clear_bit(Blocked, &rdev->flags);
2284 		wake_up(&rdev->blocked_wait);
2285 		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2286 		md_wakeup_thread(rdev->mddev->thread);
2287 
2288 		err = 0;
2289 	} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2290 		set_bit(In_sync, &rdev->flags);
2291 		err = 0;
2292 	}
2293 	if (!err && rdev->sysfs_state)
2294 		sysfs_notify_dirent(rdev->sysfs_state);
2295 	return err ? err : len;
2296 }
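/*
 * Illustrative use of the "state" attribute (device names here are
 * hypothetical): the file appears as /sys/block/md0/md/dev-sdb1/state, so
 * for example
 *     echo faulty > /sys/block/md0/md/dev-sdb1/state
 * simulates a failure, and writing "remove" detaches a device that is not
 * currently occupying a raid_disk slot.
 */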
2297 static struct rdev_sysfs_entry rdev_state =
2298 __ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
2299 
2300 static ssize_t
2301 errors_show(mdk_rdev_t *rdev, char *page)
2302 {
2303 	return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
2304 }
2305 
2306 static ssize_t
2307 errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2308 {
2309 	char *e;
2310 	unsigned long n = simple_strtoul(buf, &e, 10);
2311 	if (*buf && (*e == 0 || *e == '\n')) {
2312 		atomic_set(&rdev->corrected_errors, n);
2313 		return len;
2314 	}
2315 	return -EINVAL;
2316 }
2317 static struct rdev_sysfs_entry rdev_errors =
2318 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
2319 
2320 static ssize_t
2321 slot_show(mdk_rdev_t *rdev, char *page)
2322 {
2323 	if (rdev->raid_disk < 0)
2324 		return sprintf(page, "none\n");
2325 	else
2326 		return sprintf(page, "%d\n", rdev->raid_disk);
2327 }
2328 
2329 static ssize_t
2330 slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2331 {
2332 	char *e;
2333 	int err;
2334 	char nm[20];
2335 	int slot = simple_strtoul(buf, &e, 10);
2336 	if (strncmp(buf, "none", 4)==0)
2337 		slot = -1;
2338 	else if (e==buf || (*e && *e!= '\n'))
2339 		return -EINVAL;
2340 	if (rdev->mddev->pers && slot == -1) {
2341 		/* Setting 'slot' on an active array requires also
2342 		 * updating the 'rd%d' link, and communicating
2343 		 * with the personality with ->hot_*_disk.
2344 		 * For now we only support removing
2345 		 * failed/spare devices.  This normally happens automatically,
2346 		 * but not when the metadata is externally managed.
2347 		 */
2348 		if (rdev->raid_disk == -1)
2349 			return -EEXIST;
2350 		/* personality does all needed checks */
2351 		if (rdev->mddev->pers->hot_add_disk == NULL)
2352 			return -EINVAL;
2353 		err = rdev->mddev->pers->
2354 			hot_remove_disk(rdev->mddev, rdev->raid_disk);
2355 		if (err)
2356 			return err;
2357 		sprintf(nm, "rd%d", rdev->raid_disk);
2358 		sysfs_remove_link(&rdev->mddev->kobj, nm);
2359 		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2360 		md_wakeup_thread(rdev->mddev->thread);
2361 	} else if (rdev->mddev->pers) {
2362 		mdk_rdev_t *rdev2;
2363 		/* Activating a spare .. or possibly reactivating
2364 		 * if we ever get bitmaps working here.
2365 		 */
2366 
2367 		if (rdev->raid_disk != -1)
2368 			return -EBUSY;
2369 
2370 		if (rdev->mddev->pers->hot_add_disk == NULL)
2371 			return -EINVAL;
2372 
2373 		list_for_each_entry(rdev2, &rdev->mddev->disks, same_set)
2374 			if (rdev2->raid_disk == slot)
2375 				return -EEXIST;
2376 
2377 		rdev->raid_disk = slot;
2378 		if (test_bit(In_sync, &rdev->flags))
2379 			rdev->saved_raid_disk = slot;
2380 		else
2381 			rdev->saved_raid_disk = -1;
2382 		err = rdev->mddev->pers->
2383 			hot_add_disk(rdev->mddev, rdev);
2384 		if (err) {
2385 			rdev->raid_disk = -1;
2386 			return err;
2387 		} else
2388 			sysfs_notify_dirent(rdev->sysfs_state);
2389 		sprintf(nm, "rd%d", rdev->raid_disk);
2390 		if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
2391 			printk(KERN_WARNING
2392 			       "md: cannot register "
2393 			       "%s for %s\n",
2394 			       nm, mdname(rdev->mddev));
2395 
2396 		/* don't wakeup anyone, leave that to userspace. */
2397 	} else {
2398 		if (slot >= rdev->mddev->raid_disks)
2399 			return -ENOSPC;
2400 		rdev->raid_disk = slot;
2401 		/* assume it is working */
2402 		clear_bit(Faulty, &rdev->flags);
2403 		clear_bit(WriteMostly, &rdev->flags);
2404 		set_bit(In_sync, &rdev->flags);
2405 		sysfs_notify_dirent(rdev->sysfs_state);
2406 	}
2407 	return len;
2408 }
2409 
2410 
2411 static struct rdev_sysfs_entry rdev_slot =
2412 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
2413 
2414 static ssize_t
2415 offset_show(mdk_rdev_t *rdev, char *page)
2416 {
2417 	return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
2418 }
2419 
2420 static ssize_t
2421 offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2422 {
2423 	char *e;
2424 	unsigned long long offset = simple_strtoull(buf, &e, 10);
2425 	if (e==buf || (*e && *e != '\n'))
2426 		return -EINVAL;
2427 	if (rdev->mddev->pers && rdev->raid_disk >= 0)
2428 		return -EBUSY;
2429 	if (rdev->sectors && rdev->mddev->external)
2430 		/* Must set offset before size, so overlap checks
2431 		 * can be sane */
2432 		return -EBUSY;
2433 	rdev->data_offset = offset;
2434 	return len;
2435 }
2436 
2437 static struct rdev_sysfs_entry rdev_offset =
2438 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
2439 
2440 static ssize_t
2441 rdev_size_show(mdk_rdev_t *rdev, char *page)
2442 {
2443 	return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
2444 }
2445 
2446 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
2447 {
2448 	/* check if two start/length pairs overlap */
2449 	if (s1+l1 <= s2)
2450 		return 0;
2451 	if (s2+l2 <= s1)
2452 		return 0;
2453 	return 1;
2454 }
2455 
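/*
 * Parse a size given in 1K blocks (as used by the sysfs size attributes)
 * and convert it to 512-byte sectors, rejecting values that would overflow
 * either the parse or the sector_t conversion; e.g. "1024" yields 2048
 * sectors.
 */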
2456 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
2457 {
2458 	unsigned long long blocks;
2459 	sector_t new;
2460 
2461 	if (strict_strtoull(buf, 10, &blocks) < 0)
2462 		return -EINVAL;
2463 
2464 	if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
2465 		return -EINVAL; /* sector conversion overflow */
2466 
2467 	new = blocks * 2;
2468 	if (new != blocks * 2)
2469 		return -EINVAL; /* unsigned long long to sector_t overflow */
2470 
2471 	*sectors = new;
2472 	return 0;
2473 }
2474 
2475 static ssize_t
2476 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2477 {
2478 	mddev_t *my_mddev = rdev->mddev;
2479 	sector_t oldsectors = rdev->sectors;
2480 	sector_t sectors;
2481 
2482 	if (strict_blocks_to_sectors(buf, &sectors) < 0)
2483 		return -EINVAL;
2484 	if (my_mddev->pers && rdev->raid_disk >= 0) {
2485 		if (my_mddev->persistent) {
2486 			sectors = super_types[my_mddev->major_version].
2487 				rdev_size_change(rdev, sectors);
2488 			if (!sectors)
2489 				return -EBUSY;
2490 		} else if (!sectors)
2491 			sectors = (rdev->bdev->bd_inode->i_size >> 9) -
2492 				rdev->data_offset;
2493 	}
2494 	if (sectors < my_mddev->dev_sectors)
2495 		return -EINVAL; /* component must fit device */
2496 
2497 	rdev->sectors = sectors;
2498 	if (sectors > oldsectors && my_mddev->external) {
2499 		/* need to check that all other rdevs with the same ->bdev
2500 		 * do not overlap.  We need to unlock the mddev to avoid
2501 		 * a deadlock.  We have already changed rdev->sectors, and if
2502 		 * we have to change it back, we will have the lock again.
2503 		 */
2504 		mddev_t *mddev;
2505 		int overlap = 0;
2506 		struct list_head *tmp;
2507 
2508 		mddev_unlock(my_mddev);
2509 		for_each_mddev(mddev, tmp) {
2510 			mdk_rdev_t *rdev2;
2511 
2512 			mddev_lock(mddev);
2513 			list_for_each_entry(rdev2, &mddev->disks, same_set)
2514 				if (test_bit(AllReserved, &rdev2->flags) ||
2515 				    (rdev->bdev == rdev2->bdev &&
2516 				     rdev != rdev2 &&
2517 				     overlaps(rdev->data_offset, rdev->sectors,
2518 					      rdev2->data_offset,
2519 					      rdev2->sectors))) {
2520 					overlap = 1;
2521 					break;
2522 				}
2523 			mddev_unlock(mddev);
2524 			if (overlap) {
2525 				mddev_put(mddev);
2526 				break;
2527 			}
2528 		}
2529 		mddev_lock(my_mddev);
2530 		if (overlap) {
2531 			/* Someone else could have slipped in a size
2532 			 * change here, but doing so is just silly.
2533 			 * We put oldsectors back because we *know* it is
2534 			 * safe, and trust userspace not to race with
2535 			 * itself
2536 			 */
2537 			rdev->sectors = oldsectors;
2538 			return -EBUSY;
2539 		}
2540 	}
2541 	return len;
2542 }
2543 
2544 static struct rdev_sysfs_entry rdev_size =
2545 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
2546 
2547 
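/*
 * "recovery_start" exposes rdev->recovery_offset: it reads back as "none"
 * for a fully in-sync device, and writing a sector count (or "none") sets
 * how far recovery has progressed, which is only allowed while the device
 * is not active in a running array.
 */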
2548 static ssize_t recovery_start_show(mdk_rdev_t *rdev, char *page)
2549 {
2550 	unsigned long long recovery_start = rdev->recovery_offset;
2551 
2552 	if (test_bit(In_sync, &rdev->flags) ||
2553 	    recovery_start == MaxSector)
2554 		return sprintf(page, "none\n");
2555 
2556 	return sprintf(page, "%llu\n", recovery_start);
2557 }
2558 
2559 static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2560 {
2561 	unsigned long long recovery_start;
2562 
2563 	if (cmd_match(buf, "none"))
2564 		recovery_start = MaxSector;
2565 	else if (strict_strtoull(buf, 10, &recovery_start))
2566 		return -EINVAL;
2567 
2568 	if (rdev->mddev->pers &&
2569 	    rdev->raid_disk >= 0)
2570 		return -EBUSY;
2571 
2572 	rdev->recovery_offset = recovery_start;
2573 	if (recovery_start == MaxSector)
2574 		set_bit(In_sync, &rdev->flags);
2575 	else
2576 		clear_bit(In_sync, &rdev->flags);
2577 	return len;
2578 }
2579 
2580 static struct rdev_sysfs_entry rdev_recovery_start =
2581 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
2582 
2583 static struct attribute *rdev_default_attrs[] = {
2584 	&rdev_state.attr,
2585 	&rdev_errors.attr,
2586 	&rdev_slot.attr,
2587 	&rdev_offset.attr,
2588 	&rdev_size.attr,
2589 	&rdev_recovery_start.attr,
2590 	NULL,
2591 };
2592 static ssize_t
2593 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
2594 {
2595 	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
2596 	mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
2597 	mddev_t *mddev = rdev->mddev;
2598 	ssize_t rv;
2599 
2600 	if (!entry->show)
2601 		return -EIO;
2602 
2603 	rv = mddev ? mddev_lock(mddev) : -EBUSY;
2604 	if (!rv) {
2605 		if (rdev->mddev == NULL)
2606 			rv = -EBUSY;
2607 		else
2608 			rv = entry->show(rdev, page);
2609 		mddev_unlock(mddev);
2610 	}
2611 	return rv;
2612 }
2613 
2614 static ssize_t
2615 rdev_attr_store(struct kobject *kobj, struct attribute *attr,
2616 	      const char *page, size_t length)
2617 {
2618 	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
2619 	mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
2620 	ssize_t rv;
2621 	mddev_t *mddev = rdev->mddev;
2622 
2623 	if (!entry->store)
2624 		return -EIO;
2625 	if (!capable(CAP_SYS_ADMIN))
2626 		return -EACCES;
2627 	rv = mddev ? mddev_lock(mddev): -EBUSY;
2628 	if (!rv) {
2629 		if (rdev->mddev == NULL)
2630 			rv = -EBUSY;
2631 		else
2632 			rv = entry->store(rdev, page, length);
2633 		mddev_unlock(mddev);
2634 	}
2635 	return rv;
2636 }
2637 
2638 static void rdev_free(struct kobject *ko)
2639 {
2640 	mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj);
2641 	kfree(rdev);
2642 }
2643 static struct sysfs_ops rdev_sysfs_ops = {
2644 	.show		= rdev_attr_show,
2645 	.store		= rdev_attr_store,
2646 };
2647 static struct kobj_type rdev_ktype = {
2648 	.release	= rdev_free,
2649 	.sysfs_ops	= &rdev_sysfs_ops,
2650 	.default_attrs	= rdev_default_attrs,
2651 };
2652 
2653 /*
2654  * Import a device. If 'super_format' >= 0, then sanity check the superblock
2655  *
2656  * mark the device faulty if:
2657  *
2658  *   - the device is nonexistent (zero size)
2659  *   - the device has no valid superblock
2660  *
2661  * a faulty rdev _never_ has rdev->sb set.
2662  */
2663 static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
2664 {
2665 	char b[BDEVNAME_SIZE];
2666 	int err;
2667 	mdk_rdev_t *rdev;
2668 	sector_t size;
2669 
2670 	rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
2671 	if (!rdev) {
2672 		printk(KERN_ERR "md: could not alloc mem for new device!\n");
2673 		return ERR_PTR(-ENOMEM);
2674 	}
2675 
2676 	if ((err = alloc_disk_sb(rdev)))
2677 		goto abort_free;
2678 
2679 	err = lock_rdev(rdev, newdev, super_format == -2);
2680 	if (err)
2681 		goto abort_free;
2682 
2683 	kobject_init(&rdev->kobj, &rdev_ktype);
2684 
2685 	rdev->desc_nr = -1;
2686 	rdev->saved_raid_disk = -1;
2687 	rdev->raid_disk = -1;
2688 	rdev->flags = 0;
2689 	rdev->data_offset = 0;
2690 	rdev->sb_events = 0;
2691 	rdev->last_read_error.tv_sec  = 0;
2692 	rdev->last_read_error.tv_nsec = 0;
2693 	atomic_set(&rdev->nr_pending, 0);
2694 	atomic_set(&rdev->read_errors, 0);
2695 	atomic_set(&rdev->corrected_errors, 0);
2696 
2697 	size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
2698 	if (!size) {
2699 		printk(KERN_WARNING
2700 			"md: %s has zero or unknown size, marking faulty!\n",
2701 			bdevname(rdev->bdev,b));
2702 		err = -EINVAL;
2703 		goto abort_free;
2704 	}
2705 
2706 	if (super_format >= 0) {
2707 		err = super_types[super_format].
2708 			load_super(rdev, NULL, super_minor);
2709 		if (err == -EINVAL) {
2710 			printk(KERN_WARNING
2711 				"md: %s does not have a valid v%d.%d "
2712 			       "superblock, not importing!\n",
2713 				bdevname(rdev->bdev,b),
2714 			       super_format, super_minor);
2715 			goto abort_free;
2716 		}
2717 		if (err < 0) {
2718 			printk(KERN_WARNING
2719 				"md: could not read %s's sb, not importing!\n",
2720 				bdevname(rdev->bdev,b));
2721 			goto abort_free;
2722 		}
2723 	}
2724 
2725 	INIT_LIST_HEAD(&rdev->same_set);
2726 	init_waitqueue_head(&rdev->blocked_wait);
2727 
2728 	return rdev;
2729 
2730 abort_free:
2731 	if (rdev->sb_page) {
2732 		if (rdev->bdev)
2733 			unlock_rdev(rdev);
2734 		free_disk_sb(rdev);
2735 	}
2736 	kfree(rdev);
2737 	return ERR_PTR(err);
2738 }
2739 
2740 /*
2741  * Check a full RAID array for plausibility
2742  */
2743 
2744 
2745 static void analyze_sbs(mddev_t * mddev)
2746 {
2747 	int i;
2748 	mdk_rdev_t *rdev, *freshest, *tmp;
2749 	char b[BDEVNAME_SIZE];
2750 
2751 	freshest = NULL;
2752 	rdev_for_each(rdev, tmp, mddev)
2753 		switch (super_types[mddev->major_version].
2754 			load_super(rdev, freshest, mddev->minor_version)) {
2755 		case 1:
2756 			freshest = rdev;
2757 			break;
2758 		case 0:
2759 			break;
2760 		default:
2761 			printk(KERN_ERR
2762 				"md: fatal superblock inconsistency in %s"
2763 				" -- removing from array\n",
2764 				bdevname(rdev->bdev,b));
2765 			kick_rdev_from_array(rdev);
2766 		}
2767 
2768 
2769 	super_types[mddev->major_version].
2770 		validate_super(mddev, freshest);
2771 
2772 	i = 0;
2773 	rdev_for_each(rdev, tmp, mddev) {
2774 		if (rdev->desc_nr >= mddev->max_disks ||
2775 		    i > mddev->max_disks) {
2776 			printk(KERN_WARNING
2777 			       "md: %s: %s: only %d devices permitted\n",
2778 			       mdname(mddev), bdevname(rdev->bdev, b),
2779 			       mddev->max_disks);
2780 			kick_rdev_from_array(rdev);
2781 			continue;
2782 		}
2783 		if (rdev != freshest)
2784 			if (super_types[mddev->major_version].
2785 			    validate_super(mddev, rdev)) {
2786 				printk(KERN_WARNING "md: kicking non-fresh %s"
2787 					" from array!\n",
2788 					bdevname(rdev->bdev,b));
2789 				kick_rdev_from_array(rdev);
2790 				continue;
2791 			}
2792 		if (mddev->level == LEVEL_MULTIPATH) {
2793 			rdev->desc_nr = i++;
2794 			rdev->raid_disk = rdev->desc_nr;
2795 			set_bit(In_sync, &rdev->flags);
2796 		} else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) {
2797 			rdev->raid_disk = -1;
2798 			clear_bit(In_sync, &rdev->flags);
2799 		}
2800 	}
2801 }
2802 
2803 /* Read a fixed-point number.
2804  * Numbers in sysfs attributes should be in "standard" units where
2805  * possible, so time should be in seconds.
2806  * However we internally use a much smaller unit such as
2807  * milliseconds or jiffies.
2808  * This function takes a decimal number with a possible fractional
2809  * component, and produces an integer which is the result of
2810  * multiplying that number by 10^'scale', all without any
2811  * floating-point arithmetic.
2812  */
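/* For example, strict_strtoul_scaled("1.37", &res, 3) stores 1370 in *res. */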
2813 int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
2814 {
2815 	unsigned long result = 0;
2816 	long decimals = -1;
2817 	while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
2818 		if (*cp == '.')
2819 			decimals = 0;
2820 		else if (decimals < scale) {
2821 			unsigned int value;
2822 			value = *cp - '0';
2823 			result = result * 10 + value;
2824 			if (decimals >= 0)
2825 				decimals++;
2826 		}
2827 		cp++;
2828 	}
2829 	if (*cp == '\n')
2830 		cp++;
2831 	if (*cp)
2832 		return -EINVAL;
2833 	if (decimals < 0)
2834 		decimals = 0;
2835 	while (decimals < scale) {
2836 		result *= 10;
2837 		decimals ++;
2838 	}
2839 	*res = result;
2840 	return 0;
2841 }
2842 
2843 
2844 static void md_safemode_timeout(unsigned long data);
2845 
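/*
 * "safe_mode_delay" is presented in seconds with millisecond resolution
 * (e.g. "0.200") but stored internally in jiffies; shrinking the delay
 * fires md_safemode_timeout() immediately so the shorter value takes effect
 * right away.
 */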
2846 static ssize_t
2847 safe_delay_show(mddev_t *mddev, char *page)
2848 {
2849 	int msec = (mddev->safemode_delay*1000)/HZ;
2850 	return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
2851 }
2852 static ssize_t
2853 safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
2854 {
2855 	unsigned long msec;
2856 
2857 	if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
2858 		return -EINVAL;
2859 	if (msec == 0)
2860 		mddev->safemode_delay = 0;
2861 	else {
2862 		unsigned long old_delay = mddev->safemode_delay;
2863 		mddev->safemode_delay = (msec*HZ)/1000;
2864 		if (mddev->safemode_delay == 0)
2865 			mddev->safemode_delay = 1;
2866 		if (mddev->safemode_delay < old_delay)
2867 			md_safemode_timeout((unsigned long)mddev);
2868 	}
2869 	return len;
2870 }
2871 static struct md_sysfs_entry md_safe_delay =
2872 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
2873 
2874 static ssize_t
2875 level_show(mddev_t *mddev, char *page)
2876 {
2877 	struct mdk_personality *p = mddev->pers;
2878 	if (p)
2879 		return sprintf(page, "%s\n", p->name);
2880 	else if (mddev->clevel[0])
2881 		return sprintf(page, "%s\n", mddev->clevel);
2882 	else if (mddev->level != LEVEL_NONE)
2883 		return sprintf(page, "%d\n", mddev->level);
2884 	else
2885 		return 0;
2886 }
2887 
2888 static ssize_t
2889 level_store(mddev_t *mddev, const char *buf, size_t len)
2890 {
2891 	char level[16];
2892 	ssize_t rv = len;
2893 	struct mdk_personality *pers;
2894 	void *priv;
2895 	mdk_rdev_t *rdev;
2896 
2897 	if (mddev->pers == NULL) {
2898 		if (len == 0)
2899 			return 0;
2900 		if (len >= sizeof(mddev->clevel))
2901 			return -ENOSPC;
2902 		strncpy(mddev->clevel, buf, len);
2903 		if (mddev->clevel[len-1] == '\n')
2904 			len--;
2905 		mddev->clevel[len] = 0;
2906 		mddev->level = LEVEL_NONE;
2907 		return rv;
2908 	}
2909 
2910 	/* request to change the personality.  Need to ensure:
2911 	 *  - array is not engaged in resync/recovery/reshape
2912 	 *  - old personality can be suspended
2913 	 *  - new personality can take over the array (->takeover).
2914 	 */
2915 
2916 	if (mddev->sync_thread || mddev->reshape_position != MaxSector)
2917 		return -EBUSY;
2918 
2919 	if (!mddev->pers->quiesce) {
2920 		printk(KERN_WARNING "md: %s: %s does not support online personality change\n",
2921 		       mdname(mddev), mddev->pers->name);
2922 		return -EINVAL;
2923 	}
2924 
2925 	/* Now find the new personality */
2926 	if (len == 0 || len >= sizeof(level))
2927 		return -EINVAL;
2928 	strncpy(level, buf, len);
2929 	if (level[len-1] == '\n')
2930 		len--;
2931 	level[len] = 0;
2932 
2933 	request_module("md-%s", level);
2934 	spin_lock(&pers_lock);
2935 	pers = find_pers(LEVEL_NONE, level);
2936 	if (!pers || !try_module_get(pers->owner)) {
2937 		spin_unlock(&pers_lock);
2938 		printk(KERN_WARNING "md: personality %s not loaded\n", level);
2939 		return -EINVAL;
2940 	}
2941 	spin_unlock(&pers_lock);
2942 
2943 	if (pers == mddev->pers) {
2944 		/* Nothing to do! */
2945 		module_put(pers->owner);
2946 		return rv;
2947 	}
2948 	if (!pers->takeover) {
2949 		module_put(pers->owner);
2950 		printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
2951 		       mdname(mddev), level);
2952 		return -EINVAL;
2953 	}
2954 
2955 	/* ->takeover must set new_* and/or delta_disks
2956 	 * if it succeeds, and may set them when it fails.
2957 	 */
2958 	priv = pers->takeover(mddev);
2959 	if (IS_ERR(priv)) {
2960 		mddev->new_level = mddev->level;
2961 		mddev->new_layout = mddev->layout;
2962 		mddev->new_chunk_sectors = mddev->chunk_sectors;
2963 		mddev->raid_disks -= mddev->delta_disks;
2964 		mddev->delta_disks = 0;
2965 		module_put(pers->owner);
2966 		printk(KERN_WARNING "md: %s: %s would not accept array\n",
2967 		       mdname(mddev), level);
2968 		return PTR_ERR(priv);
2969 	}
2970 
2971 	/* Looks like we have a winner */
2972 	mddev_suspend(mddev);
2973 	mddev->pers->stop(mddev);
2974 	module_put(mddev->pers->owner);
2975 	/* Invalidate devices that are now superfluous */
2976 	list_for_each_entry(rdev, &mddev->disks, same_set)
2977 		if (rdev->raid_disk >= mddev->raid_disks) {
2978 			rdev->raid_disk = -1;
2979 			clear_bit(In_sync, &rdev->flags);
2980 		}
2981 	mddev->pers = pers;
2982 	mddev->private = priv;
2983 	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
2984 	mddev->level = mddev->new_level;
2985 	mddev->layout = mddev->new_layout;
2986 	mddev->chunk_sectors = mddev->new_chunk_sectors;
2987 	mddev->delta_disks = 0;
2988 	pers->run(mddev);
2989 	mddev_resume(mddev);
2990 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
2991 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2992 	md_wakeup_thread(mddev->thread);
2993 	return rv;
2994 }
2995 
2996 static struct md_sysfs_entry md_level =
2997 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
2998 
2999 
3000 static ssize_t
3001 layout_show(mddev_t *mddev, char *page)
3002 {
3003 	/* just a number, not meaningful for all levels */
3004 	if (mddev->reshape_position != MaxSector &&
3005 	    mddev->layout != mddev->new_layout)
3006 		return sprintf(page, "%d (%d)\n",
3007 			       mddev->new_layout, mddev->layout);
3008 	return sprintf(page, "%d\n", mddev->layout);
3009 }
3010 
3011 static ssize_t
3012 layout_store(mddev_t *mddev, const char *buf, size_t len)
3013 {
3014 	char *e;
3015 	unsigned long n = simple_strtoul(buf, &e, 10);
3016 
3017 	if (!*buf || (*e && *e != '\n'))
3018 		return -EINVAL;
3019 
3020 	if (mddev->pers) {
3021 		int err;
3022 		if (mddev->pers->check_reshape == NULL)
3023 			return -EBUSY;
3024 		mddev->new_layout = n;
3025 		err = mddev->pers->check_reshape(mddev);
3026 		if (err) {
3027 			mddev->new_layout = mddev->layout;
3028 			return err;
3029 		}
3030 	} else {
3031 		mddev->new_layout = n;
3032 		if (mddev->reshape_position == MaxSector)
3033 			mddev->layout = n;
3034 	}
3035 	return len;
3036 }
3037 static struct md_sysfs_entry md_layout =
3038 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
3039 
3040 
3041 static ssize_t
3042 raid_disks_show(mddev_t *mddev, char *page)
3043 {
3044 	if (mddev->raid_disks == 0)
3045 		return 0;
3046 	if (mddev->reshape_position != MaxSector &&
3047 	    mddev->delta_disks != 0)
3048 		return sprintf(page, "%d (%d)\n", mddev->raid_disks,
3049 			       mddev->raid_disks - mddev->delta_disks);
3050 	return sprintf(page, "%d\n", mddev->raid_disks);
3051 }
3052 
3053 static int update_raid_disks(mddev_t *mddev, int raid_disks);
3054 
3055 static ssize_t
3056 raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
3057 {
3058 	char *e;
3059 	int rv = 0;
3060 	unsigned long n = simple_strtoul(buf, &e, 10);
3061 
3062 	if (!*buf || (*e && *e != '\n'))
3063 		return -EINVAL;
3064 
3065 	if (mddev->pers)
3066 		rv = update_raid_disks(mddev, n);
3067 	else if (mddev->reshape_position != MaxSector) {
3068 		int olddisks = mddev->raid_disks - mddev->delta_disks;
3069 		mddev->delta_disks = n - olddisks;
3070 		mddev->raid_disks = n;
3071 	} else
3072 		mddev->raid_disks = n;
3073 	return rv ? rv : len;
3074 }
3075 static struct md_sysfs_entry md_raid_disks =
3076 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
3077 
3078 static ssize_t
3079 chunk_size_show(mddev_t *mddev, char *page)
3080 {
3081 	if (mddev->reshape_position != MaxSector &&
3082 	    mddev->chunk_sectors != mddev->new_chunk_sectors)
3083 		return sprintf(page, "%d (%d)\n",
3084 			       mddev->new_chunk_sectors << 9,
3085 			       mddev->chunk_sectors << 9);
3086 	return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
3087 }
3088 
3089 static ssize_t
3090 chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
3091 {
3092 	char *e;
3093 	unsigned long n = simple_strtoul(buf, &e, 10);
3094 
3095 	if (!*buf || (*e && *e != '\n'))
3096 		return -EINVAL;
3097 
3098 	if (mddev->pers) {
3099 		int err;
3100 		if (mddev->pers->check_reshape == NULL)
3101 			return -EBUSY;
3102 		mddev->new_chunk_sectors = n >> 9;
3103 		err = mddev->pers->check_reshape(mddev);
3104 		if (err) {
3105 			mddev->new_chunk_sectors = mddev->chunk_sectors;
3106 			return err;
3107 		}
3108 	} else {
3109 		mddev->new_chunk_sectors = n >> 9;
3110 		if (mddev->reshape_position == MaxSector)
3111 			mddev->chunk_sectors = n >> 9;
3112 	}
3113 	return len;
3114 }
3115 static struct md_sysfs_entry md_chunk_size =
3116 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
3117 
3118 static ssize_t
3119 resync_start_show(mddev_t *mddev, char *page)
3120 {
3121 	if (mddev->recovery_cp == MaxSector)
3122 		return sprintf(page, "none\n");
3123 	return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
3124 }
3125 
3126 static ssize_t
3127 resync_start_store(mddev_t *mddev, const char *buf, size_t len)
3128 {
3129 	char *e;
3130 	unsigned long long n = simple_strtoull(buf, &e, 10);
3131 
3132 	if (mddev->pers)
3133 		return -EBUSY;
3134 	if (cmd_match(buf, "none"))
3135 		n = MaxSector;
3136 	else if (!*buf || (*e && *e != '\n'))
3137 		return -EINVAL;
3138 
3139 	mddev->recovery_cp = n;
3140 	return len;
3141 }
3142 static struct md_sysfs_entry md_resync_start =
3143 __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
3144 
3145 /*
3146  * The array state can be:
3147  *
3148  * clear
3149  *     No devices, no size, no level
3150  *     Equivalent to STOP_ARRAY ioctl
3151  * inactive
3152  *     May have some settings, but array is not active
3153  *        all IO results in error
3154  *     When written, doesn't tear down array, but just stops it
3155  * suspended (not supported yet)
3156  *     All IO requests will block. The array can be reconfigured.
3157  *     Writing this, if accepted, will block until array is quiescent
3158  * readonly
3159  *     no resync can happen.  no superblocks get written.
3160  *     write requests fail
3161  * read-auto
3162  *     like readonly, but behaves like 'clean' on a write request.
3163  *
3164  * clean - no pending writes, but otherwise active.
3165  *     When written to inactive array, starts without resync
3166  *     If a write request arrives then
3167  *       if metadata is known, mark 'dirty' and switch to 'active'.
3168  *       if not known, block and switch to write-pending
3169  *     If written to an active array that has pending writes, then fails.
3170  * active
3171  *     fully active: IO and resync can be happening.
3172  *     When written to inactive array, starts with resync
3173  *
3174  * write-pending
3175  *     clean, but writes are blocked waiting for 'active' to be written.
3176  *
3177  * active-idle
3178  *     like active, but no writes have been seen for a while (100msec).
3179  *
3180  */
3181 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
3182 		   write_pending, active_idle, bad_word};
3183 static char *array_states[] = {
3184 	"clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
3185 	"write-pending", "active-idle", NULL };
3186 
3187 static int match_word(const char *word, char **list)
3188 {
3189 	int n;
3190 	for (n=0; list[n]; n++)
3191 		if (cmd_match(word, list[n]))
3192 			break;
3193 	return n;
3194 }
3195 
3196 static ssize_t
3197 array_state_show(mddev_t *mddev, char *page)
3198 {
3199 	enum array_state st = inactive;
3200 
3201 	if (mddev->pers)
3202 		switch(mddev->ro) {
3203 		case 1:
3204 			st = readonly;
3205 			break;
3206 		case 2:
3207 			st = read_auto;
3208 			break;
3209 		case 0:
3210 			if (mddev->in_sync)
3211 				st = clean;
3212 			else if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
3213 				st = write_pending;
3214 			else if (mddev->safemode)
3215 				st = active_idle;
3216 			else
3217 				st = active;
3218 		}
3219 	else {
3220 		if (list_empty(&mddev->disks) &&
3221 		    mddev->raid_disks == 0 &&
3222 		    mddev->dev_sectors == 0)
3223 			st = clear;
3224 		else
3225 			st = inactive;
3226 	}
3227 	return sprintf(page, "%s\n", array_states[st]);
3228 }
3229 
3230 static int do_md_stop(mddev_t * mddev, int ro, int is_open);
3231 static int do_md_run(mddev_t * mddev);
3232 static int restart_array(mddev_t *mddev);
3233 
3234 static ssize_t
3235 array_state_store(mddev_t *mddev, const char *buf, size_t len)
3236 {
3237 	int err = -EINVAL;
3238 	enum array_state st = match_word(buf, array_states);
3239 	switch(st) {
3240 	case bad_word:
3241 		break;
3242 	case clear:
3243 		/* stopping an active array */
3244 		if (atomic_read(&mddev->openers) > 0)
3245 			return -EBUSY;
3246 		err = do_md_stop(mddev, 0, 0);
3247 		break;
3248 	case inactive:
3249 		/* stopping an active array */
3250 		if (mddev->pers) {
3251 			if (atomic_read(&mddev->openers) > 0)
3252 				return -EBUSY;
3253 			err = do_md_stop(mddev, 2, 0);
3254 		} else
3255 			err = 0; /* already inactive */
3256 		break;
3257 	case suspended:
3258 		break; /* not supported yet */
3259 	case readonly:
3260 		if (mddev->pers)
3261 			err = do_md_stop(mddev, 1, 0);
3262 		else {
3263 			mddev->ro = 1;
3264 			set_disk_ro(mddev->gendisk, 1);
3265 			err = do_md_run(mddev);
3266 		}
3267 		break;
3268 	case read_auto:
3269 		if (mddev->pers) {
3270 			if (mddev->ro == 0)
3271 				err = do_md_stop(mddev, 1, 0);
3272 			else if (mddev->ro == 1)
3273 				err = restart_array(mddev);
3274 			if (err == 0) {
3275 				mddev->ro = 2;
3276 				set_disk_ro(mddev->gendisk, 0);
3277 			}
3278 		} else {
3279 			mddev->ro = 2;
3280 			err = do_md_run(mddev);
3281 		}
3282 		break;
3283 	case clean:
3284 		if (mddev->pers) {
3285 			restart_array(mddev);
3286 			spin_lock_irq(&mddev->write_lock);
3287 			if (atomic_read(&mddev->writes_pending) == 0) {
3288 				if (mddev->in_sync == 0) {
3289 					mddev->in_sync = 1;
3290 					if (mddev->safemode == 1)
3291 						mddev->safemode = 0;
3292 					if (mddev->persistent)
3293 						set_bit(MD_CHANGE_CLEAN,
3294 							&mddev->flags);
3295 				}
3296 				err = 0;
3297 			} else
3298 				err = -EBUSY;
3299 			spin_unlock_irq(&mddev->write_lock);
3300 		} else
3301 			err = -EINVAL;
3302 		break;
3303 	case active:
3304 		if (mddev->pers) {
3305 			restart_array(mddev);
3306 			if (mddev->external)
3307 				clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
3308 			wake_up(&mddev->sb_wait);
3309 			err = 0;
3310 		} else {
3311 			mddev->ro = 0;
3312 			set_disk_ro(mddev->gendisk, 0);
3313 			err = do_md_run(mddev);
3314 		}
3315 		break;
3316 	case write_pending:
3317 	case active_idle:
3318 		/* these cannot be set */
3319 		break;
3320 	}
3321 	if (err)
3322 		return err;
3323 	else {
3324 		sysfs_notify_dirent(mddev->sysfs_state);
3325 		return len;
3326 	}
3327 }
3328 static struct md_sysfs_entry md_array_state =
3329 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
3330 
3331 static ssize_t
3332 max_corrected_read_errors_show(mddev_t *mddev, char *page) {
3333 	return sprintf(page, "%d\n",
3334 		       atomic_read(&mddev->max_corr_read_errors));
3335 }
3336 
3337 static ssize_t
3338 max_corrected_read_errors_store(mddev_t *mddev, const char *buf, size_t len)
3339 {
3340 	char *e;
3341 	unsigned long n = simple_strtoul(buf, &e, 10);
3342 
3343 	if (*buf && (*e == 0 || *e == '\n')) {
3344 		atomic_set(&mddev->max_corr_read_errors, n);
3345 		return len;
3346 	}
3347 	return -EINVAL;
3348 }
3349 
3350 static struct md_sysfs_entry max_corr_read_errors =
3351 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
3352 	max_corrected_read_errors_store);
3353 
3354 static ssize_t
3355 null_show(mddev_t *mddev, char *page)
3356 {
3357 	return -EINVAL;
3358 }
3359 
3360 static ssize_t
3361 new_dev_store(mddev_t *mddev, const char *buf, size_t len)
3362 {
3363 	/* buf must be %d:%d\n? giving major and minor numbers */
3364 	/* The new device is added to the array.
3365 	 * If the array has a persistent superblock, we read the
3366 	 * superblock to initialise info and check validity.
3367 	 * Otherwise, the only checking done is that in bind_rdev_to_array,
3368 	 * which mainly checks size.
3369 	 */
3370 	char *e;
3371 	int major = simple_strtoul(buf, &e, 10);
3372 	int minor;
3373 	dev_t dev;
3374 	mdk_rdev_t *rdev;
3375 	int err;
3376 
3377 	if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
3378 		return -EINVAL;
3379 	minor = simple_strtoul(e+1, &e, 10);
3380 	if (*e && *e != '\n')
3381 		return -EINVAL;
3382 	dev = MKDEV(major, minor);
3383 	if (major != MAJOR(dev) ||
3384 	    minor != MINOR(dev))
3385 		return -EOVERFLOW;
3386 
3387 
3388 	if (mddev->persistent) {
3389 		rdev = md_import_device(dev, mddev->major_version,
3390 					mddev->minor_version);
3391 		if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
3392 			mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
3393 						       mdk_rdev_t, same_set);
3394 			err = super_types[mddev->major_version]
3395 				.load_super(rdev, rdev0, mddev->minor_version);
3396 			if (err < 0)
3397 				goto out;
3398 		}
3399 	} else if (mddev->external)
3400 		rdev = md_import_device(dev, -2, -1);
3401 	else
3402 		rdev = md_import_device(dev, -1, -1);
3403 
3404 	if (IS_ERR(rdev))
3405 		return PTR_ERR(rdev);
3406 	err = bind_rdev_to_array(rdev, mddev);
3407  out:
3408 	if (err)
3409 		export_rdev(rdev);
3410 	return err ? err : len;
3411 }
3412 
3413 static struct md_sysfs_entry md_new_device =
3414 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
3415 
3416 static ssize_t
3417 bitmap_store(mddev_t *mddev, const char *buf, size_t len)
3418 {
3419 	char *end;
3420 	unsigned long chunk, end_chunk;
3421 
3422 	if (!mddev->bitmap)
3423 		goto out;
3424 	/* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
3425 	while (*buf) {
3426 		chunk = end_chunk = simple_strtoul(buf, &end, 0);
3427 		if (buf == end) break;
3428 		if (*end == '-') { /* range */
3429 			buf = end + 1;
3430 			end_chunk = simple_strtoul(buf, &end, 0);
3431 			if (buf == end) break;
3432 		}
3433 		if (*end && !isspace(*end)) break;
3434 		bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
3435 		buf = skip_spaces(end);
3436 	}
3437 	bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
3438 out:
3439 	return len;
3440 }
3441 
3442 static struct md_sysfs_entry md_bitmap =
3443 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
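/*
 * Illustrative use of "bitmap_set_bits" (the chunk numbers are just
 * examples):
 *     echo "10 200-205" > bitmap_set_bits
 * marks chunk 10 and chunks 200 through 205 dirty via bitmap_dirty_bits()
 * and then flushes the bits to disk with bitmap_unplug().
 */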
3444 
3445 static ssize_t
3446 size_show(mddev_t *mddev, char *page)
3447 {
3448 	return sprintf(page, "%llu\n",
3449 		(unsigned long long)mddev->dev_sectors / 2);
3450 }
3451 
3452 static int update_size(mddev_t *mddev, sector_t num_sectors);
3453 
3454 static ssize_t
3455 size_store(mddev_t *mddev, const char *buf, size_t len)
3456 {
3457 	/* If array is inactive, we can reduce the component size, but
3458 	 * not increase it (except from 0).
3459 	 * If array is active, we can try an on-line resize
3460 	 */
3461 	sector_t sectors;
3462 	int err = strict_blocks_to_sectors(buf, &sectors);
3463 
3464 	if (err < 0)
3465 		return err;
3466 	if (mddev->pers) {
3467 		err = update_size(mddev, sectors);
3468 		md_update_sb(mddev, 1);
3469 	} else {
3470 		if (mddev->dev_sectors == 0 ||
3471 		    mddev->dev_sectors > sectors)
3472 			mddev->dev_sectors = sectors;
3473 		else
3474 			err = -ENOSPC;
3475 	}
3476 	return err ? err : len;
3477 }
3478 
3479 static struct md_sysfs_entry md_size =
3480 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
3481 
3482 
3483 /* Metadata version.
3484  * This is one of
3485  *   'none' for arrays with no metadata (good luck...)
3486  *   'external' for arrays with externally managed metadata,
3487  * or N.M for internally known formats
3488  */
3489 static ssize_t
3490 metadata_show(mddev_t *mddev, char *page)
3491 {
3492 	if (mddev->persistent)
3493 		return sprintf(page, "%d.%d\n",
3494 			       mddev->major_version, mddev->minor_version);
3495 	else if (mddev->external)
3496 		return sprintf(page, "external:%s\n", mddev->metadata_type);
3497 	else
3498 		return sprintf(page, "none\n");
3499 }
3500 
3501 static ssize_t
3502 metadata_store(mddev_t *mddev, const char *buf, size_t len)
3503 {
3504 	int major, minor;
3505 	char *e;
3506 	/* Changing the details of 'external' metadata is
3507 	 * always permitted.  Otherwise there must be
3508 	 * no devices attached to the array.
3509 	 */
3510 	if (mddev->external && strncmp(buf, "external:", 9) == 0)
3511 		;
3512 	else if (!list_empty(&mddev->disks))
3513 		return -EBUSY;
3514 
3515 	if (cmd_match(buf, "none")) {
3516 		mddev->persistent = 0;
3517 		mddev->external = 0;
3518 		mddev->major_version = 0;
3519 		mddev->minor_version = 90;
3520 		return len;
3521 	}
3522 	if (strncmp(buf, "external:", 9) == 0) {
3523 		size_t namelen = len-9;
3524 		if (namelen >= sizeof(mddev->metadata_type))
3525 			namelen = sizeof(mddev->metadata_type)-1;
3526 		strncpy(mddev->metadata_type, buf+9, namelen);
3527 		mddev->metadata_type[namelen] = 0;
3528 		if (namelen && mddev->metadata_type[namelen-1] == '\n')
3529 			mddev->metadata_type[--namelen] = 0;
3530 		mddev->persistent = 0;
3531 		mddev->external = 1;
3532 		mddev->major_version = 0;
3533 		mddev->minor_version = 90;
3534 		return len;
3535 	}
3536 	major = simple_strtoul(buf, &e, 10);
3537 	if (e==buf || *e != '.')
3538 		return -EINVAL;
3539 	buf = e+1;
3540 	minor = simple_strtoul(buf, &e, 10);
3541 	if (e==buf || (*e && *e != '\n') )
3542 		return -EINVAL;
3543 	if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
3544 		return -ENOENT;
3545 	mddev->major_version = major;
3546 	mddev->minor_version = minor;
3547 	mddev->persistent = 1;
3548 	mddev->external = 0;
3549 	return len;
3550 }
3551 
3552 static struct md_sysfs_entry md_metadata =
3553 __ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
3554 
3555 static ssize_t
3556 action_show(mddev_t *mddev, char *page)
3557 {
3558 	char *type = "idle";
3559 	if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
3560 		type = "frozen";
3561 	else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3562 	    (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
3563 		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
3564 			type = "reshape";
3565 		else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3566 			if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
3567 				type = "resync";
3568 			else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
3569 				type = "check";
3570 			else
3571 				type = "repair";
3572 		} else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
3573 			type = "recover";
3574 	}
3575 	return sprintf(page, "%s\n", type);
3576 }
3577 
3578 static ssize_t
3579 action_store(mddev_t *mddev, const char *page, size_t len)
3580 {
3581 	if (!mddev->pers || !mddev->pers->sync_request)
3582 		return -EINVAL;
3583 
3584 	if (cmd_match(page, "frozen"))
3585 		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3586 	else
3587 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3588 
3589 	if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
3590 		if (mddev->sync_thread) {
3591 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
3592 			md_unregister_thread(mddev->sync_thread);
3593 			mddev->sync_thread = NULL;
3594 			mddev->recovery = 0;
3595 		}
3596 	} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3597 		   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
3598 		return -EBUSY;
3599 	else if (cmd_match(page, "resync"))
3600 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3601 	else if (cmd_match(page, "recover")) {
3602 		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
3603 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3604 	} else if (cmd_match(page, "reshape")) {
3605 		int err;
3606 		if (mddev->pers->start_reshape == NULL)
3607 			return -EINVAL;
3608 		err = mddev->pers->start_reshape(mddev);
3609 		if (err)
3610 			return err;
3611 		sysfs_notify(&mddev->kobj, NULL, "degraded");
3612 	} else {
3613 		if (cmd_match(page, "check"))
3614 			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3615 		else if (!cmd_match(page, "repair"))
3616 			return -EINVAL;
3617 		set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
3618 		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3619 	}
3620 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3621 	md_wakeup_thread(mddev->thread);
3622 	sysfs_notify_dirent(mddev->sysfs_action);
3623 	return len;
3624 }
3625 
3626 static ssize_t
3627 mismatch_cnt_show(mddev_t *mddev, char *page)
3628 {
3629 	return sprintf(page, "%llu\n",
3630 		       (unsigned long long) mddev->resync_mismatches);
3631 }
3632 
3633 static struct md_sysfs_entry md_scan_mode =
3634 __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
3635 
3636 
3637 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
3638 
3639 static ssize_t
3640 sync_min_show(mddev_t *mddev, char *page)
3641 {
3642 	return sprintf(page, "%d (%s)\n", speed_min(mddev),
3643 		       mddev->sync_speed_min ? "local": "system");
3644 }
3645 
3646 static ssize_t
3647 sync_min_store(mddev_t *mddev, const char *buf, size_t len)
3648 {
3649 	int min;
3650 	char *e;
3651 	if (strncmp(buf, "system", 6)==0) {
3652 		mddev->sync_speed_min = 0;
3653 		return len;
3654 	}
3655 	min = simple_strtoul(buf, &e, 10);
3656 	if (buf == e || (*e && *e != '\n') || min <= 0)
3657 		return -EINVAL;
3658 	mddev->sync_speed_min = min;
3659 	return len;
3660 }
3661 
3662 static struct md_sysfs_entry md_sync_min =
3663 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
3664 
3665 static ssize_t
3666 sync_max_show(mddev_t *mddev, char *page)
3667 {
3668 	return sprintf(page, "%d (%s)\n", speed_max(mddev),
3669 		       mddev->sync_speed_max ? "local": "system");
3670 }
3671 
3672 static ssize_t
3673 sync_max_store(mddev_t *mddev, const char *buf, size_t len)
3674 {
3675 	int max;
3676 	char *e;
3677 	if (strncmp(buf, "system", 6)==0) {
3678 		mddev->sync_speed_max = 0;
3679 		return len;
3680 	}
3681 	max = simple_strtoul(buf, &e, 10);
3682 	if (buf == e || (*e && *e != '\n') || max <= 0)
3683 		return -EINVAL;
3684 	mddev->sync_speed_max = max;
3685 	return len;
3686 }
3687 
3688 static struct md_sysfs_entry md_sync_max =
3689 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
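
/*
 * Usage sketch: sync_speed_min/sync_speed_max override the system-wide
 * resync speed limits for this array only, in KB/sec; writing "system"
 * reverts to the global value (device name below is an example):
 *
 *	echo 50000  > /sys/block/md0/md/sync_speed_min
 *	echo system > /sys/block/md0/md/sync_speed_max
 */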
3690 
3691 static ssize_t
3692 degraded_show(mddev_t *mddev, char *page)
3693 {
3694 	return sprintf(page, "%d\n", mddev->degraded);
3695 }
3696 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
3697 
3698 static ssize_t
3699 sync_force_parallel_show(mddev_t *mddev, char *page)
3700 {
3701 	return sprintf(page, "%d\n", mddev->parallel_resync);
3702 }
3703 
3704 static ssize_t
3705 sync_force_parallel_store(mddev_t *mddev, const char *buf, size_t len)
3706 {
3707 	long n;
3708 
3709 	if (strict_strtol(buf, 10, &n))
3710 		return -EINVAL;
3711 
3712 	if (n != 0 && n != 1)
3713 		return -EINVAL;
3714 
3715 	mddev->parallel_resync = n;
3716 
3717 	if (mddev->sync_thread)
3718 		wake_up(&resync_wait);
3719 
3720 	return len;
3721 }
3722 
3723 /* force parallel resync, even with shared block devices */
3724 static struct md_sysfs_entry md_sync_force_parallel =
3725 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
3726        sync_force_parallel_show, sync_force_parallel_store);
3727 
3728 static ssize_t
3729 sync_speed_show(mddev_t *mddev, char *page)
3730 {
3731 	unsigned long resync, dt, db;
3732 	if (mddev->curr_resync == 0)
3733 		return sprintf(page, "none\n");
3734 	resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
3735 	dt = (jiffies - mddev->resync_mark) / HZ;
3736 	if (!dt) dt++;
3737 	db = resync - mddev->resync_mark_cnt;
3738 	return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
3739 }
3740 
3741 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
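
/*
 * Worked example for sync_speed_show(): if 4096000 sectors completed in the
 * 10 seconds since the last resync mark, db/dt/2 = 4096000/10/2 =
 * 204800 KB/sec; the final /2 converts 512-byte sectors to KiB.
 */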
3742 
3743 static ssize_t
3744 sync_completed_show(mddev_t *mddev, char *page)
3745 {
3746 	unsigned long max_sectors, resync;
3747 
3748 	if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3749 		return sprintf(page, "none\n");
3750 
3751 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
3752 		max_sectors = mddev->resync_max_sectors;
3753 	else
3754 		max_sectors = mddev->dev_sectors;
3755 
3756 	resync = mddev->curr_resync_completed;
3757 	return sprintf(page, "%lu / %lu\n", resync, max_sectors);
3758 }
3759 
3760 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
3761 
3762 static ssize_t
3763 min_sync_show(mddev_t *mddev, char *page)
3764 {
3765 	return sprintf(page, "%llu\n",
3766 		       (unsigned long long)mddev->resync_min);
3767 }
3768 static ssize_t
3769 min_sync_store(mddev_t *mddev, const char *buf, size_t len)
3770 {
3771 	unsigned long long min;
3772 	if (strict_strtoull(buf, 10, &min))
3773 		return -EINVAL;
3774 	if (min > mddev->resync_max)
3775 		return -EINVAL;
3776 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3777 		return -EBUSY;
3778 
3779 	/* Must be a multiple of chunk_size */
3780 	if (mddev->chunk_sectors) {
3781 		sector_t temp = min;
3782 		if (sector_div(temp, mddev->chunk_sectors))
3783 			return -EINVAL;
3784 	}
3785 	mddev->resync_min = min;
3786 
3787 	return len;
3788 }
3789 
3790 static struct md_sysfs_entry md_min_sync =
3791 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
3792 
3793 static ssize_t
3794 max_sync_show(mddev_t *mddev, char *page)
3795 {
3796 	if (mddev->resync_max == MaxSector)
3797 		return sprintf(page, "max\n");
3798 	else
3799 		return sprintf(page, "%llu\n",
3800 			       (unsigned long long)mddev->resync_max);
3801 }
3802 static ssize_t
3803 max_sync_store(mddev_t *mddev, const char *buf, size_t len)
3804 {
3805 	if (strncmp(buf, "max", 3) == 0)
3806 		mddev->resync_max = MaxSector;
3807 	else {
3808 		unsigned long long max;
3809 		if (strict_strtoull(buf, 10, &max))
3810 			return -EINVAL;
3811 		if (max < mddev->resync_min)
3812 			return -EINVAL;
3813 		if (max < mddev->resync_max &&
3814 		    mddev->ro == 0 &&
3815 		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3816 			return -EBUSY;
3817 
3818 		/* Must be a multiple of chunk_size */
3819 		if (mddev->chunk_sectors) {
3820 			sector_t temp = max;
3821 			if (sector_div(temp, mddev->chunk_sectors))
3822 				return -EINVAL;
3823 		}
3824 		mddev->resync_max = max;
3825 	}
3826 	wake_up(&mddev->recovery_wait);
3827 	return len;
3828 }
3829 
3830 static struct md_sysfs_entry md_max_sync =
3831 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
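
/*
 * Usage sketch: sync_min/sync_max bound the sector range that a requested
 * resync/check may cover (both must be multiples of the chunk size).  For
 * example, to scrub only the first GiB of an array with 512-sector chunks:
 *
 *	echo 0       > /sys/block/md0/md/sync_min
 *	echo 2097152 > /sys/block/md0/md/sync_max
 *	echo check   > /sys/block/md0/md/sync_action
 */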
3832 
3833 static ssize_t
3834 suspend_lo_show(mddev_t *mddev, char *page)
3835 {
3836 	return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
3837 }
3838 
3839 static ssize_t
3840 suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
3841 {
3842 	char *e;
3843 	unsigned long long new = simple_strtoull(buf, &e, 10);
3844 
3845 	if (mddev->pers == NULL ||
3846 	    mddev->pers->quiesce == NULL)
3847 		return -EINVAL;
3848 	if (buf == e || (*e && *e != '\n'))
3849 		return -EINVAL;
3850 	if (new >= mddev->suspend_hi ||
3851 	    (new > mddev->suspend_lo && new < mddev->suspend_hi)) {
3852 		mddev->suspend_lo = new;
3853 		mddev->pers->quiesce(mddev, 2);
3854 		return len;
3855 	} else
3856 		return -EINVAL;
3857 }
3858 static struct md_sysfs_entry md_suspend_lo =
3859 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
3860 
3861 
3862 static ssize_t
3863 suspend_hi_show(mddev_t *mddev, char *page)
3864 {
3865 	return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
3866 }
3867 
3868 static ssize_t
3869 suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
3870 {
3871 	char *e;
3872 	unsigned long long new = simple_strtoull(buf, &e, 10);
3873 
3874 	if (mddev->pers == NULL ||
3875 	    mddev->pers->quiesce == NULL)
3876 		return -EINVAL;
3877 	if (buf == e || (*e && *e != '\n'))
3878 		return -EINVAL;
3879 	if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) ||
3880 	    (new > mddev->suspend_lo && new > mddev->suspend_hi)) {
3881 		mddev->suspend_hi = new;
3882 		mddev->pers->quiesce(mddev, 1);
3883 		mddev->pers->quiesce(mddev, 0);
3884 		return len;
3885 	} else
3886 		return -EINVAL;
3887 }
3888 static struct md_sysfs_entry md_suspend_hi =
3889 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
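
/*
 * Usage sketch: suspend_lo/suspend_hi mark a half-open sector range
 * [suspend_lo, suspend_hi) in which personalities that honour it (via
 * ->quiesce) hold off request processing; userspace uses this around the
 * critical section of a reshape.  Starting from the default 0/0 state:
 *
 *	echo 0      > /sys/block/md0/md/suspend_lo
 *	echo 131072 > /sys/block/md0/md/suspend_hi	(suspend sectors 0..131071)
 */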
3890 
3891 static ssize_t
3892 reshape_position_show(mddev_t *mddev, char *page)
3893 {
3894 	if (mddev->reshape_position != MaxSector)
3895 		return sprintf(page, "%llu\n",
3896 			       (unsigned long long)mddev->reshape_position);
3897 	strcpy(page, "none\n");
3898 	return 5;
3899 }
3900 
3901 static ssize_t
3902 reshape_position_store(mddev_t *mddev, const char *buf, size_t len)
3903 {
3904 	char *e;
3905 	unsigned long long new = simple_strtoull(buf, &e, 10);
3906 	if (mddev->pers)
3907 		return -EBUSY;
3908 	if (buf == e || (*e && *e != '\n'))
3909 		return -EINVAL;
3910 	mddev->reshape_position = new;
3911 	mddev->delta_disks = 0;
3912 	mddev->new_level = mddev->level;
3913 	mddev->new_layout = mddev->layout;
3914 	mddev->new_chunk_sectors = mddev->chunk_sectors;
3915 	return len;
3916 }
3917 
3918 static struct md_sysfs_entry md_reshape_position =
3919 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
3920        reshape_position_store);
3921 
3922 static ssize_t
3923 array_size_show(mddev_t *mddev, char *page)
3924 {
3925 	if (mddev->external_size)
3926 		return sprintf(page, "%llu\n",
3927 			       (unsigned long long)mddev->array_sectors/2);
3928 	else
3929 		return sprintf(page, "default\n");
3930 }
3931 
3932 static ssize_t
3933 array_size_store(mddev_t *mddev, const char *buf, size_t len)
3934 {
3935 	sector_t sectors;
3936 
3937 	if (strncmp(buf, "default", 7) == 0) {
3938 		if (mddev->pers)
3939 			sectors = mddev->pers->size(mddev, 0, 0);
3940 		else
3941 			sectors = mddev->array_sectors;
3942 
3943 		mddev->external_size = 0;
3944 	} else {
3945 		if (strict_blocks_to_sectors(buf, &sectors) < 0)
3946 			return -EINVAL;
3947 		if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
3948 			return -E2BIG;
3949 
3950 		mddev->external_size = 1;
3951 	}
3952 
3953 	mddev->array_sectors = sectors;
3954 	set_capacity(mddev->gendisk, mddev->array_sectors);
3955 	if (mddev->pers)
3956 		revalidate_disk(mddev->gendisk);
3957 
3958 	return len;
3959 }
3960 
3961 static struct md_sysfs_entry md_array_size =
3962 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
3963        array_size_store);
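
/*
 * Usage sketch: array_size is read and written in KiB ("blocks"); internally
 * the value is kept in 512-byte sectors, hence the /2 in array_size_show():
 *
 *	cat /sys/block/md0/md/array_size		("default" or a size in KiB)
 *	echo 1048576 > /sys/block/md0/md/array_size	(clamp the array to 1 GiB)
 *	echo default > /sys/block/md0/md/array_size	(back to the personality's size)
 */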
3964 
3965 static struct attribute *md_default_attrs[] = {
3966 	&md_level.attr,
3967 	&md_layout.attr,
3968 	&md_raid_disks.attr,
3969 	&md_chunk_size.attr,
3970 	&md_size.attr,
3971 	&md_resync_start.attr,
3972 	&md_metadata.attr,
3973 	&md_new_device.attr,
3974 	&md_safe_delay.attr,
3975 	&md_array_state.attr,
3976 	&md_reshape_position.attr,
3977 	&md_array_size.attr,
3978 	&max_corr_read_errors.attr,
3979 	NULL,
3980 };
3981 
3982 static struct attribute *md_redundancy_attrs[] = {
3983 	&md_scan_mode.attr,
3984 	&md_mismatches.attr,
3985 	&md_sync_min.attr,
3986 	&md_sync_max.attr,
3987 	&md_sync_speed.attr,
3988 	&md_sync_force_parallel.attr,
3989 	&md_sync_completed.attr,
3990 	&md_min_sync.attr,
3991 	&md_max_sync.attr,
3992 	&md_suspend_lo.attr,
3993 	&md_suspend_hi.attr,
3994 	&md_bitmap.attr,
3995 	&md_degraded.attr,
3996 	NULL,
3997 };
3998 static struct attribute_group md_redundancy_group = {
3999 	.name = NULL,
4000 	.attrs = md_redundancy_attrs,
4001 };
4002 
4003 
4004 static ssize_t
4005 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
4006 {
4007 	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
4008 	mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
4009 	ssize_t rv;
4010 
4011 	if (!entry->show)
4012 		return -EIO;
4013 	rv = mddev_lock(mddev);
4014 	if (!rv) {
4015 		rv = entry->show(mddev, page);
4016 		mddev_unlock(mddev);
4017 	}
4018 	return rv;
4019 }
4020 
4021 static ssize_t
4022 md_attr_store(struct kobject *kobj, struct attribute *attr,
4023 	      const char *page, size_t length)
4024 {
4025 	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
4026 	mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
4027 	ssize_t rv;
4028 
4029 	if (!entry->store)
4030 		return -EIO;
4031 	if (!capable(CAP_SYS_ADMIN))
4032 		return -EACCES;
4033 	rv = mddev_lock(mddev);
4034 	if (mddev->hold_active == UNTIL_IOCTL)
4035 		mddev->hold_active = 0;
4036 	if (!rv) {
4037 		rv = entry->store(mddev, page, length);
4038 		mddev_unlock(mddev);
4039 	}
4040 	return rv;
4041 }
4042 
4043 static void md_free(struct kobject *ko)
4044 {
4045 	mddev_t *mddev = container_of(ko, mddev_t, kobj);
4046 
4047 	if (mddev->sysfs_state)
4048 		sysfs_put(mddev->sysfs_state);
4049 
4050 	if (mddev->gendisk) {
4051 		del_gendisk(mddev->gendisk);
4052 		put_disk(mddev->gendisk);
4053 	}
4054 	if (mddev->queue)
4055 		blk_cleanup_queue(mddev->queue);
4056 
4057 	kfree(mddev);
4058 }
4059 
4060 static struct sysfs_ops md_sysfs_ops = {
4061 	.show	= md_attr_show,
4062 	.store	= md_attr_store,
4063 };
4064 static struct kobj_type md_ktype = {
4065 	.release	= md_free,
4066 	.sysfs_ops	= &md_sysfs_ops,
4067 	.default_attrs	= md_default_attrs,
4068 };
4069 
4070 int mdp_major = 0;
4071 
4072 static void mddev_delayed_delete(struct work_struct *ws)
4073 {
4074 	mddev_t *mddev = container_of(ws, mddev_t, del_work);
4075 
4076 	if (mddev->private == &md_redundancy_group) {
4077 		sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
4078 		if (mddev->sysfs_action)
4079 			sysfs_put(mddev->sysfs_action);
4080 		mddev->sysfs_action = NULL;
4081 		mddev->private = NULL;
4082 	}
4083 	sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
4084 	kobject_del(&mddev->kobj);
4085 	kobject_put(&mddev->kobj);
4086 }
4087 
4088 static int md_alloc(dev_t dev, char *name)
4089 {
4090 	static DEFINE_MUTEX(disks_mutex);
4091 	mddev_t *mddev = mddev_find(dev);
4092 	struct gendisk *disk;
4093 	int partitioned;
4094 	int shift;
4095 	int unit;
4096 	int error;
4097 
4098 	if (!mddev)
4099 		return -ENODEV;
4100 
4101 	partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
4102 	shift = partitioned ? MdpMinorShift : 0;
4103 	unit = MINOR(mddev->unit) >> shift;
4104 
4105 	/* wait for any previous instance of this device
4106 	 * to be completely removed (mddev_delayed_delete).
4107 	 */
4108 	flush_scheduled_work();
4109 
4110 	mutex_lock(&disks_mutex);
4111 	error = -EEXIST;
4112 	if (mddev->gendisk)
4113 		goto abort;
4114 
4115 	if (name) {
4116 		/* Need to ensure that 'name' is not a duplicate.
4117 		 */
4118 		mddev_t *mddev2;
4119 		spin_lock(&all_mddevs_lock);
4120 
4121 		list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
4122 			if (mddev2->gendisk &&
4123 			    strcmp(mddev2->gendisk->disk_name, name) == 0) {
4124 				spin_unlock(&all_mddevs_lock);
4125 				goto abort;
4126 			}
4127 		spin_unlock(&all_mddevs_lock);
4128 	}
4129 
4130 	error = -ENOMEM;
4131 	mddev->queue = blk_alloc_queue(GFP_KERNEL);
4132 	if (!mddev->queue)
4133 		goto abort;
4134 	mddev->queue->queuedata = mddev;
4135 
4136 	/* Can be unlocked because the queue is new: no concurrency */
4137 	queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue);
4138 
4139 	blk_queue_make_request(mddev->queue, md_make_request);
4140 
4141 	disk = alloc_disk(1 << shift);
4142 	if (!disk) {
4143 		blk_cleanup_queue(mddev->queue);
4144 		mddev->queue = NULL;
4145 		goto abort;
4146 	}
4147 	disk->major = MAJOR(mddev->unit);
4148 	disk->first_minor = unit << shift;
4149 	if (name)
4150 		strcpy(disk->disk_name, name);
4151 	else if (partitioned)
4152 		sprintf(disk->disk_name, "md_d%d", unit);
4153 	else
4154 		sprintf(disk->disk_name, "md%d", unit);
4155 	disk->fops = &md_fops;
4156 	disk->private_data = mddev;
4157 	disk->queue = mddev->queue;
4158 	/* Allow extended partitions.  This makes the
4159 	 * 'mdp' device redundant, but we can't really
4160 	 * remove it now.
4161 	 */
4162 	disk->flags |= GENHD_FL_EXT_DEVT;
4163 	add_disk(disk);
4164 	mddev->gendisk = disk;
4165 	error = kobject_init_and_add(&mddev->kobj, &md_ktype,
4166 				     &disk_to_dev(disk)->kobj, "%s", "md");
4167 	if (error) {
4168 		/* This isn't possible, but as kobject_init_and_add is marked
4169 		 * __must_check, we must do something with the result
4170 		 */
4171 		printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
4172 		       disk->disk_name);
4173 		error = 0;
4174 	}
4175 	if (sysfs_create_group(&mddev->kobj, &md_bitmap_group))
4176 		printk(KERN_DEBUG "pointless warning\n");
4177  abort:
4178 	mutex_unlock(&disks_mutex);
4179 	if (!error) {
4180 		kobject_uevent(&mddev->kobj, KOBJ_ADD);
4181 		mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state");
4182 	}
4183 	mddev_put(mddev);
4184 	return error;
4185 }
4186 
4187 static struct kobject *md_probe(dev_t dev, int *part, void *data)
4188 {
4189 	md_alloc(dev, NULL);
4190 	return NULL;
4191 }
4192 
4193 static int add_named_array(const char *val, struct kernel_param *kp)
4194 {
4195 	/* val must be "md_*" where * is not all digits.
4196 	 * We allocate an array with a large free minor number, and
4197 	 * set the name to val.  val must not already be an active name.
4198 	 */
4199 	int len = strlen(val);
4200 	char buf[DISK_NAME_LEN];
4201 
4202 	while (len && val[len-1] == '\n')
4203 		len--;
4204 	if (len >= DISK_NAME_LEN)
4205 		return -E2BIG;
4206 	strlcpy(buf, val, len+1);
4207 	if (strncmp(buf, "md_", 3) != 0)
4208 		return -EINVAL;
4209 	return md_alloc(0, buf);
4210 }
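
/*
 * Usage sketch (assumption: add_named_array() is wired up as a writable
 * module parameter, conventionally called "new_array"): writing an "md_*"
 * name allocates a new, empty array with that name, e.g.
 *
 *	echo md_home > /sys/module/md_mod/parameters/new_array
 *
 * which creates /dev/md_home backed by an mddev using a high free minor.
 */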
4211 
4212 static void md_safemode_timeout(unsigned long data)
4213 {
4214 	mddev_t *mddev = (mddev_t *) data;
4215 
4216 	if (!atomic_read(&mddev->writes_pending)) {
4217 		mddev->safemode = 1;
4218 		if (mddev->external)
4219 			sysfs_notify_dirent(mddev->sysfs_state);
4220 	}
4221 	md_wakeup_thread(mddev->thread);
4222 }
4223 
4224 static int start_dirty_degraded;
4225 
4226 static int do_md_run(mddev_t * mddev)
4227 {
4228 	int err;
4229 	mdk_rdev_t *rdev;
4230 	struct gendisk *disk;
4231 	struct mdk_personality *pers;
4232 
4233 	if (list_empty(&mddev->disks))
4234 		/* cannot run an array with no devices.. */
4235 		return -EINVAL;
4236 
4237 	if (mddev->pers)
4238 		return -EBUSY;
4239 
4240 	/*
4241 	 * Analyze all RAID superblock(s)
4242 	 */
4243 	if (!mddev->raid_disks) {
4244 		if (!mddev->persistent)
4245 			return -EINVAL;
4246 		analyze_sbs(mddev);
4247 	}
4248 
4249 	if (mddev->level != LEVEL_NONE)
4250 		request_module("md-level-%d", mddev->level);
4251 	else if (mddev->clevel[0])
4252 		request_module("md-%s", mddev->clevel);
4253 
4254 	/*
4255 	 * Drop all container device buffers, from now on
4256 	 * the only valid external interface is through the md
4257 	 * device.
4258 	 */
4259 	list_for_each_entry(rdev, &mddev->disks, same_set) {
4260 		if (test_bit(Faulty, &rdev->flags))
4261 			continue;
4262 		sync_blockdev(rdev->bdev);
4263 		invalidate_bdev(rdev->bdev);
4264 
4265 		/* perform some consistency tests on the device.
4266 		 * We don't want the data to overlap the metadata;
4267 		 * internal bitmap issues have been handled elsewhere.
4268 		 */
4269 		if (rdev->data_offset < rdev->sb_start) {
4270 			if (mddev->dev_sectors &&
4271 			    rdev->data_offset + mddev->dev_sectors
4272 			    > rdev->sb_start) {
4273 				printk("md: %s: data overlaps metadata\n",
4274 				       mdname(mddev));
4275 				return -EINVAL;
4276 			}
4277 		} else {
4278 			if (rdev->sb_start + rdev->sb_size/512
4279 			    > rdev->data_offset) {
4280 				printk("md: %s: metadata overlaps data\n",
4281 				       mdname(mddev));
4282 				return -EINVAL;
4283 			}
4284 		}
4285 		sysfs_notify_dirent(rdev->sysfs_state);
4286 	}
4287 
4288 	md_probe(mddev->unit, NULL, NULL);
4289 	disk = mddev->gendisk;
4290 	if (!disk)
4291 		return -ENOMEM;
4292 
4293 	spin_lock(&pers_lock);
4294 	pers = find_pers(mddev->level, mddev->clevel);
4295 	if (!pers || !try_module_get(pers->owner)) {
4296 		spin_unlock(&pers_lock);
4297 		if (mddev->level != LEVEL_NONE)
4298 			printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
4299 			       mddev->level);
4300 		else
4301 			printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
4302 			       mddev->clevel);
4303 		return -EINVAL;
4304 	}
4305 	mddev->pers = pers;
4306 	spin_unlock(&pers_lock);
4307 	if (mddev->level != pers->level) {
4308 		mddev->level = pers->level;
4309 		mddev->new_level = pers->level;
4310 	}
4311 	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
4312 
4313 	if (mddev->reshape_position != MaxSector &&
4314 	    pers->start_reshape == NULL) {
4315 		/* This personality cannot handle reshaping... */
4316 		mddev->pers = NULL;
4317 		module_put(pers->owner);
4318 		return -EINVAL;
4319 	}
4320 
4321 	if (pers->sync_request) {
4322 		/* Warn if this is a potentially silly
4323 		 * configuration.
4324 		 */
4325 		char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
4326 		mdk_rdev_t *rdev2;
4327 		int warned = 0;
4328 
4329 		list_for_each_entry(rdev, &mddev->disks, same_set)
4330 			list_for_each_entry(rdev2, &mddev->disks, same_set) {
4331 				if (rdev < rdev2 &&
4332 				    rdev->bdev->bd_contains ==
4333 				    rdev2->bdev->bd_contains) {
4334 					printk(KERN_WARNING
4335 					       "%s: WARNING: %s appears to be"
4336 					       " on the same physical disk as"
4337 					       " %s.\n",
4338 					       mdname(mddev),
4339 					       bdevname(rdev->bdev,b),
4340 					       bdevname(rdev2->bdev,b2));
4341 					warned = 1;
4342 				}
4343 			}
4344 
4345 		if (warned)
4346 			printk(KERN_WARNING
4347 			       "True protection against single-disk"
4348 			       " failure might be compromised.\n");
4349 	}
4350 
4351 	mddev->recovery = 0;
4352 	/* may be overridden by personality */
4353 	mddev->resync_max_sectors = mddev->dev_sectors;
4354 
4355 	mddev->barriers_work = 1;
4356 	mddev->ok_start_degraded = start_dirty_degraded;
4357 
4358 	if (start_readonly)
4359 		mddev->ro = 2; /* read-only, but switch on first write */
4360 
4361 	err = mddev->pers->run(mddev);
4362 	if (err)
4363 		printk(KERN_ERR "md: pers->run() failed ...\n");
4364 	else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) {
4365 		WARN_ONCE(!mddev->external_size, "%s: default size too small,"
4366 			  " but 'external_size' not in effect?\n", __func__);
4367 		printk(KERN_ERR
4368 		       "md: invalid array_size %llu > default size %llu\n",
4369 		       (unsigned long long)mddev->array_sectors / 2,
4370 		       (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2);
4371 		err = -EINVAL;
4372 		mddev->pers->stop(mddev);
4373 	}
4374 	if (err == 0 && mddev->pers->sync_request) {
4375 		err = bitmap_create(mddev);
4376 		if (err) {
4377 			printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
4378 			       mdname(mddev), err);
4379 			mddev->pers->stop(mddev);
4380 		}
4381 	}
4382 	if (err) {
4383 		module_put(mddev->pers->owner);
4384 		mddev->pers = NULL;
4385 		bitmap_destroy(mddev);
4386 		return err;
4387 	}
4388 	if (mddev->pers->sync_request) {
4389 		if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
4390 			printk(KERN_WARNING
4391 			       "md: cannot register extra attributes for %s\n",
4392 			       mdname(mddev));
4393 		mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
4394 	} else if (mddev->ro == 2) /* auto-readonly not meaningful */
4395 		mddev->ro = 0;
4396 
4397  	atomic_set(&mddev->writes_pending,0);
4398 	atomic_set(&mddev->max_corr_read_errors,
4399 		   MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
4400 	mddev->safemode = 0;
4401 	mddev->safemode_timer.function = md_safemode_timeout;
4402 	mddev->safemode_timer.data = (unsigned long) mddev;
4403 	mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
4404 	mddev->in_sync = 1;
4405 
4406 	list_for_each_entry(rdev, &mddev->disks, same_set)
4407 		if (rdev->raid_disk >= 0) {
4408 			char nm[20];
4409 			sprintf(nm, "rd%d", rdev->raid_disk);
4410 			if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
4411 				printk("md: cannot register %s for %s\n",
4412 				       nm, mdname(mddev));
4413 		}
4414 
4415 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4416 
4417 	if (mddev->flags)
4418 		md_update_sb(mddev, 0);
4419 
4420 	set_capacity(disk, mddev->array_sectors);
4421 
4422 	/* If there is a partially-recovered drive we need to
4423 	 * start recovery here.  If we leave it to md_check_recovery,
4424 	 * it will remove the drives and not do the right thing
4425 	 */
4426 	if (mddev->degraded && !mddev->sync_thread) {
4427 		int spares = 0;
4428 		list_for_each_entry(rdev, &mddev->disks, same_set)
4429 			if (rdev->raid_disk >= 0 &&
4430 			    !test_bit(In_sync, &rdev->flags) &&
4431 			    !test_bit(Faulty, &rdev->flags))
4432 				/* complete an interrupted recovery */
4433 				spares++;
4434 		if (spares && mddev->pers->sync_request) {
4435 			mddev->recovery = 0;
4436 			set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4437 			mddev->sync_thread = md_register_thread(md_do_sync,
4438 								mddev,
4439 								"resync");
4440 			if (!mddev->sync_thread) {
4441 				printk(KERN_ERR "%s: could not start resync"
4442 				       " thread...\n",
4443 				       mdname(mddev));
4444 				/* leave the spares where they are, it shouldn't hurt */
4445 				mddev->recovery = 0;
4446 			}
4447 		}
4448 	}
4449 	md_wakeup_thread(mddev->thread);
4450 	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
4451 
4452 	revalidate_disk(mddev->gendisk);
4453 	mddev->changed = 1;
4454 	md_new_event(mddev);
4455 	sysfs_notify_dirent(mddev->sysfs_state);
4456 	if (mddev->sysfs_action)
4457 		sysfs_notify_dirent(mddev->sysfs_action);
4458 	sysfs_notify(&mddev->kobj, NULL, "degraded");
4459 	kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
4460 	return 0;
4461 }
4462 
4463 static int restart_array(mddev_t *mddev)
4464 {
4465 	struct gendisk *disk = mddev->gendisk;
4466 
4467 	/* Complain if it has no devices */
4468 	if (list_empty(&mddev->disks))
4469 		return -ENXIO;
4470 	if (!mddev->pers)
4471 		return -EINVAL;
4472 	if (!mddev->ro)
4473 		return -EBUSY;
4474 	mddev->safemode = 0;
4475 	mddev->ro = 0;
4476 	set_disk_ro(disk, 0);
4477 	printk(KERN_INFO "md: %s switched to read-write mode.\n",
4478 		mdname(mddev));
4479 	/* Kick recovery or resync if necessary */
4480 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4481 	md_wakeup_thread(mddev->thread);
4482 	md_wakeup_thread(mddev->sync_thread);
4483 	sysfs_notify_dirent(mddev->sysfs_state);
4484 	return 0;
4485 }
4486 
4487 /* similar to deny_write_access, but accounts for our holding a reference
4488  * to the file ourselves */
4489 static int deny_bitmap_write_access(struct file * file)
4490 {
4491 	struct inode *inode = file->f_mapping->host;
4492 
4493 	spin_lock(&inode->i_lock);
4494 	if (atomic_read(&inode->i_writecount) > 1) {
4495 		spin_unlock(&inode->i_lock);
4496 		return -ETXTBSY;
4497 	}
4498 	atomic_set(&inode->i_writecount, -1);
4499 	spin_unlock(&inode->i_lock);
4500 
4501 	return 0;
4502 }
4503 
4504 void restore_bitmap_write_access(struct file *file)
4505 {
4506 	struct inode *inode = file->f_mapping->host;
4507 
4508 	spin_lock(&inode->i_lock);
4509 	atomic_set(&inode->i_writecount, 1);
4510 	spin_unlock(&inode->i_lock);
4511 }
4512 
4513 /* mode:
4514  *   0 - completely stop and disassemble array
4515  *   1 - switch to readonly
4516  *   2 - stop but do not disassemble array
4517  */
4518 static int do_md_stop(mddev_t * mddev, int mode, int is_open)
4519 {
4520 	int err = 0;
4521 	struct gendisk *disk = mddev->gendisk;
4522 	mdk_rdev_t *rdev;
4523 
4524 	mutex_lock(&mddev->open_mutex);
4525 	if (atomic_read(&mddev->openers) > is_open) {
4526 		printk("md: %s still in use.\n",mdname(mddev));
4527 		err = -EBUSY;
4528 	} else if (mddev->pers) {
4529 
4530 		if (mddev->sync_thread) {
4531 			set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4532 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4533 			md_unregister_thread(mddev->sync_thread);
4534 			mddev->sync_thread = NULL;
4535 		}
4536 
4537 		del_timer_sync(&mddev->safemode_timer);
4538 
4539 		switch(mode) {
4540 		case 1: /* readonly */
4541 			err  = -ENXIO;
4542 			if (mddev->ro==1)
4543 				goto out;
4544 			mddev->ro = 1;
4545 			break;
4546 		case 0: /* disassemble */
4547 		case 2: /* stop */
4548 			bitmap_flush(mddev);
4549 			md_super_wait(mddev);
4550 			if (mddev->ro)
4551 				set_disk_ro(disk, 0);
4552 
4553 			mddev->pers->stop(mddev);
4554 			mddev->queue->merge_bvec_fn = NULL;
4555 			mddev->queue->unplug_fn = NULL;
4556 			mddev->queue->backing_dev_info.congested_fn = NULL;
4557 			module_put(mddev->pers->owner);
4558 			if (mddev->pers->sync_request)
4559 				mddev->private = &md_redundancy_group;
4560 			mddev->pers = NULL;
4561 			/* tell userspace to handle 'inactive' */
4562 			sysfs_notify_dirent(mddev->sysfs_state);
4563 
4564 			list_for_each_entry(rdev, &mddev->disks, same_set)
4565 				if (rdev->raid_disk >= 0) {
4566 					char nm[20];
4567 					sprintf(nm, "rd%d", rdev->raid_disk);
4568 					sysfs_remove_link(&mddev->kobj, nm);
4569 				}
4570 
4571 			set_capacity(disk, 0);
4572 			mddev->changed = 1;
4573 
4574 			if (mddev->ro)
4575 				mddev->ro = 0;
4576 		}
4577 		if (!mddev->in_sync || mddev->flags) {
4578 			/* mark array as shutdown cleanly */
4579 			mddev->in_sync = 1;
4580 			md_update_sb(mddev, 1);
4581 		}
4582 		if (mode == 1)
4583 			set_disk_ro(disk, 1);
4584 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4585 		err = 0;
4586 	}
4587 out:
4588 	mutex_unlock(&mddev->open_mutex);
4589 	if (err)
4590 		return err;
4591 	/*
4592 	 * Free resources if final stop
4593 	 */
4594 	if (mode == 0) {
4595 
4596 		printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
4597 
4598 		bitmap_destroy(mddev);
4599 		if (mddev->bitmap_info.file) {
4600 			restore_bitmap_write_access(mddev->bitmap_info.file);
4601 			fput(mddev->bitmap_info.file);
4602 			mddev->bitmap_info.file = NULL;
4603 		}
4604 		mddev->bitmap_info.offset = 0;
4605 
4606 		/* make sure all mddev_delayed_delete calls have finished */
4607 		flush_scheduled_work();
4608 
4609 		export_array(mddev);
4610 
4611 		mddev->array_sectors = 0;
4612 		mddev->external_size = 0;
4613 		mddev->dev_sectors = 0;
4614 		mddev->raid_disks = 0;
4615 		mddev->recovery_cp = 0;
4616 		mddev->resync_min = 0;
4617 		mddev->resync_max = MaxSector;
4618 		mddev->reshape_position = MaxSector;
4619 		mddev->external = 0;
4620 		mddev->persistent = 0;
4621 		mddev->level = LEVEL_NONE;
4622 		mddev->clevel[0] = 0;
4623 		mddev->flags = 0;
4624 		mddev->ro = 0;
4625 		mddev->metadata_type[0] = 0;
4626 		mddev->chunk_sectors = 0;
4627 		mddev->ctime = mddev->utime = 0;
4628 		mddev->layout = 0;
4629 		mddev->max_disks = 0;
4630 		mddev->events = 0;
4631 		mddev->delta_disks = 0;
4632 		mddev->new_level = LEVEL_NONE;
4633 		mddev->new_layout = 0;
4634 		mddev->new_chunk_sectors = 0;
4635 		mddev->curr_resync = 0;
4636 		mddev->resync_mismatches = 0;
4637 		mddev->suspend_lo = mddev->suspend_hi = 0;
4638 		mddev->sync_speed_min = mddev->sync_speed_max = 0;
4639 		mddev->recovery = 0;
4640 		mddev->in_sync = 0;
4641 		mddev->changed = 0;
4642 		mddev->degraded = 0;
4643 		mddev->barriers_work = 0;
4644 		mddev->safemode = 0;
4645 		mddev->bitmap_info.offset = 0;
4646 		mddev->bitmap_info.default_offset = 0;
4647 		mddev->bitmap_info.chunksize = 0;
4648 		mddev->bitmap_info.daemon_sleep = 0;
4649 		mddev->bitmap_info.max_write_behind = 0;
4650 		kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
4651 		if (mddev->hold_active == UNTIL_STOP)
4652 			mddev->hold_active = 0;
4653 
4654 	} else if (mddev->pers)
4655 		printk(KERN_INFO "md: %s switched to read-only mode.\n",
4656 			mdname(mddev));
4657 	err = 0;
4658 	blk_integrity_unregister(disk);
4659 	md_new_event(mddev);
4660 	sysfs_notify_dirent(mddev->sysfs_state);
4661 	return err;
4662 }
4663 
4664 #ifndef MODULE
4665 static void autorun_array(mddev_t *mddev)
4666 {
4667 	mdk_rdev_t *rdev;
4668 	int err;
4669 
4670 	if (list_empty(&mddev->disks))
4671 		return;
4672 
4673 	printk(KERN_INFO "md: running: ");
4674 
4675 	list_for_each_entry(rdev, &mddev->disks, same_set) {
4676 		char b[BDEVNAME_SIZE];
4677 		printk("<%s>", bdevname(rdev->bdev,b));
4678 	}
4679 	printk("\n");
4680 
4681 	err = do_md_run(mddev);
4682 	if (err) {
4683 		printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
4684 		do_md_stop(mddev, 0, 0);
4685 	}
4686 }
4687 
4688 /*
4689  * let's try to run arrays based on all disks that have arrived
4690  * until now. (those are in pending_raid_disks)
4691  *
4692  * the method: pick the first pending disk, collect all disks with
4693  * the same UUID, remove all from the pending list and put them into
4694  * the 'same_array' list. Then order this list based on superblock
4695  * update time (freshest comes first), kick out 'old' disks and
4696  * compare superblocks. If everything's fine then run it.
4697  *
4698  * If "unit" is allocated, then bump its reference count
4699  */
4700 static void autorun_devices(int part)
4701 {
4702 	mdk_rdev_t *rdev0, *rdev, *tmp;
4703 	mddev_t *mddev;
4704 	char b[BDEVNAME_SIZE];
4705 
4706 	printk(KERN_INFO "md: autorun ...\n");
4707 	while (!list_empty(&pending_raid_disks)) {
4708 		int unit;
4709 		dev_t dev;
4710 		LIST_HEAD(candidates);
4711 		rdev0 = list_entry(pending_raid_disks.next,
4712 					 mdk_rdev_t, same_set);
4713 
4714 		printk(KERN_INFO "md: considering %s ...\n",
4715 			bdevname(rdev0->bdev,b));
4716 		INIT_LIST_HEAD(&candidates);
4717 		rdev_for_each_list(rdev, tmp, &pending_raid_disks)
4718 			if (super_90_load(rdev, rdev0, 0) >= 0) {
4719 				printk(KERN_INFO "md:  adding %s ...\n",
4720 					bdevname(rdev->bdev,b));
4721 				list_move(&rdev->same_set, &candidates);
4722 			}
4723 		/*
4724 		 * now we have a set of devices, with all of them having
4725 		 * mostly sane superblocks. It's time to allocate the
4726 		 * mddev.
4727 		 */
4728 		if (part) {
4729 			dev = MKDEV(mdp_major,
4730 				    rdev0->preferred_minor << MdpMinorShift);
4731 			unit = MINOR(dev) >> MdpMinorShift;
4732 		} else {
4733 			dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
4734 			unit = MINOR(dev);
4735 		}
4736 		if (rdev0->preferred_minor != unit) {
4737 			printk(KERN_INFO "md: unit number in %s is bad: %d\n",
4738 			       bdevname(rdev0->bdev, b), rdev0->preferred_minor);
4739 			break;
4740 		}
4741 
4742 		md_probe(dev, NULL, NULL);
4743 		mddev = mddev_find(dev);
4744 		if (!mddev || !mddev->gendisk) {
4745 			if (mddev)
4746 				mddev_put(mddev);
4747 			printk(KERN_ERR
4748 				"md: cannot allocate memory for md drive.\n");
4749 			break;
4750 		}
4751 		if (mddev_lock(mddev))
4752 			printk(KERN_WARNING "md: %s locked, cannot run\n",
4753 			       mdname(mddev));
4754 		else if (mddev->raid_disks || mddev->major_version
4755 			 || !list_empty(&mddev->disks)) {
4756 			printk(KERN_WARNING
4757 				"md: %s already running, cannot run %s\n",
4758 				mdname(mddev), bdevname(rdev0->bdev,b));
4759 			mddev_unlock(mddev);
4760 		} else {
4761 			printk(KERN_INFO "md: created %s\n", mdname(mddev));
4762 			mddev->persistent = 1;
4763 			rdev_for_each_list(rdev, tmp, &candidates) {
4764 				list_del_init(&rdev->same_set);
4765 				if (bind_rdev_to_array(rdev, mddev))
4766 					export_rdev(rdev);
4767 			}
4768 			autorun_array(mddev);
4769 			mddev_unlock(mddev);
4770 		}
4771 		/* on success, candidates will be empty; on error
4772 		 * they won't be...
4773 		 */
4774 		rdev_for_each_list(rdev, tmp, &candidates) {
4775 			list_del_init(&rdev->same_set);
4776 			export_rdev(rdev);
4777 		}
4778 		mddev_put(mddev);
4779 	}
4780 	printk(KERN_INFO "md: ... autorun DONE.\n");
4781 }
4782 #endif /* !MODULE */
4783 
4784 static int get_version(void __user * arg)
4785 {
4786 	mdu_version_t ver;
4787 
4788 	ver.major = MD_MAJOR_VERSION;
4789 	ver.minor = MD_MINOR_VERSION;
4790 	ver.patchlevel = MD_PATCHLEVEL_VERSION;
4791 
4792 	if (copy_to_user(arg, &ver, sizeof(ver)))
4793 		return -EFAULT;
4794 
4795 	return 0;
4796 }
4797 
4798 static int get_array_info(mddev_t * mddev, void __user * arg)
4799 {
4800 	mdu_array_info_t info;
4801 	int nr,working,insync,failed,spare;
4802 	mdk_rdev_t *rdev;
4803 
4804 	nr=working=insync=failed=spare=0;
4805 	list_for_each_entry(rdev, &mddev->disks, same_set) {
4806 		nr++;
4807 		if (test_bit(Faulty, &rdev->flags))
4808 			failed++;
4809 		else {
4810 			working++;
4811 			if (test_bit(In_sync, &rdev->flags))
4812 				insync++;
4813 			else
4814 				spare++;
4815 		}
4816 	}
4817 
4818 	info.major_version = mddev->major_version;
4819 	info.minor_version = mddev->minor_version;
4820 	info.patch_version = MD_PATCHLEVEL_VERSION;
4821 	info.ctime         = mddev->ctime;
4822 	info.level         = mddev->level;
4823 	info.size          = mddev->dev_sectors / 2;
4824 	if (info.size != mddev->dev_sectors / 2) /* overflow */
4825 		info.size = -1;
4826 	info.nr_disks      = nr;
4827 	info.raid_disks    = mddev->raid_disks;
4828 	info.md_minor      = mddev->md_minor;
4829 	info.not_persistent= !mddev->persistent;
4830 
4831 	info.utime         = mddev->utime;
4832 	info.state         = 0;
4833 	if (mddev->in_sync)
4834 		info.state = (1<<MD_SB_CLEAN);
4835 	if (mddev->bitmap && mddev->bitmap_info.offset)
4836 		info.state |= (1<<MD_SB_BITMAP_PRESENT);
4837 	info.active_disks  = insync;
4838 	info.working_disks = working;
4839 	info.failed_disks  = failed;
4840 	info.spare_disks   = spare;
4841 
4842 	info.layout        = mddev->layout;
4843 	info.chunk_size    = mddev->chunk_sectors << 9;
4844 
4845 	if (copy_to_user(arg, &info, sizeof(info)))
4846 		return -EFAULT;
4847 
4848 	return 0;
4849 }
4850 
4851 static int get_bitmap_file(mddev_t * mddev, void __user * arg)
4852 {
4853 	mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
4854 	char *ptr, *buf = NULL;
4855 	int err = -ENOMEM;
4856 
4857 	if (md_allow_write(mddev))
4858 		file = kmalloc(sizeof(*file), GFP_NOIO);
4859 	else
4860 		file = kmalloc(sizeof(*file), GFP_KERNEL);
4861 
4862 	if (!file)
4863 		goto out;
4864 
4865 	/* bitmap disabled, zero the first byte and copy out */
4866 	if (!mddev->bitmap || !mddev->bitmap->file) {
4867 		file->pathname[0] = '\0';
4868 		goto copy_out;
4869 	}
4870 
4871 	buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
4872 	if (!buf)
4873 		goto out;
4874 
4875 	ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname));
4876 	if (IS_ERR(ptr))
4877 		goto out;
4878 
4879 	strcpy(file->pathname, ptr);
4880 
4881 copy_out:
4882 	err = 0;
4883 	if (copy_to_user(arg, file, sizeof(*file)))
4884 		err = -EFAULT;
4885 out:
4886 	kfree(buf);
4887 	kfree(file);
4888 	return err;
4889 }
4890 
4891 static int get_disk_info(mddev_t * mddev, void __user * arg)
4892 {
4893 	mdu_disk_info_t info;
4894 	mdk_rdev_t *rdev;
4895 
4896 	if (copy_from_user(&info, arg, sizeof(info)))
4897 		return -EFAULT;
4898 
4899 	rdev = find_rdev_nr(mddev, info.number);
4900 	if (rdev) {
4901 		info.major = MAJOR(rdev->bdev->bd_dev);
4902 		info.minor = MINOR(rdev->bdev->bd_dev);
4903 		info.raid_disk = rdev->raid_disk;
4904 		info.state = 0;
4905 		if (test_bit(Faulty, &rdev->flags))
4906 			info.state |= (1<<MD_DISK_FAULTY);
4907 		else if (test_bit(In_sync, &rdev->flags)) {
4908 			info.state |= (1<<MD_DISK_ACTIVE);
4909 			info.state |= (1<<MD_DISK_SYNC);
4910 		}
4911 		if (test_bit(WriteMostly, &rdev->flags))
4912 			info.state |= (1<<MD_DISK_WRITEMOSTLY);
4913 	} else {
4914 		info.major = info.minor = 0;
4915 		info.raid_disk = -1;
4916 		info.state = (1<<MD_DISK_REMOVED);
4917 	}
4918 
4919 	if (copy_to_user(arg, &info, sizeof(info)))
4920 		return -EFAULT;
4921 
4922 	return 0;
4923 }
4924 
4925 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
4926 {
4927 	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
4928 	mdk_rdev_t *rdev;
4929 	dev_t dev = MKDEV(info->major,info->minor);
4930 
4931 	if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
4932 		return -EOVERFLOW;
4933 
4934 	if (!mddev->raid_disks) {
4935 		int err;
4936 		/* expecting a device which has a superblock */
4937 		rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
4938 		if (IS_ERR(rdev)) {
4939 			printk(KERN_WARNING
4940 				"md: md_import_device returned %ld\n",
4941 				PTR_ERR(rdev));
4942 			return PTR_ERR(rdev);
4943 		}
4944 		if (!list_empty(&mddev->disks)) {
4945 			mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
4946 							mdk_rdev_t, same_set);
4947 			err = super_types[mddev->major_version]
4948 				.load_super(rdev, rdev0, mddev->minor_version);
4949 			if (err < 0) {
4950 				printk(KERN_WARNING
4951 					"md: %s has different UUID to %s\n",
4952 					bdevname(rdev->bdev,b),
4953 					bdevname(rdev0->bdev,b2));
4954 				export_rdev(rdev);
4955 				return -EINVAL;
4956 			}
4957 		}
4958 		err = bind_rdev_to_array(rdev, mddev);
4959 		if (err)
4960 			export_rdev(rdev);
4961 		return err;
4962 	}
4963 
4964 	/*
4965 	 * add_new_disk can be used once the array is assembled
4966 	 * to add "hot spares".  They must already have a superblock
4967 	 * written.
4968 	 */
4969 	if (mddev->pers) {
4970 		int err;
4971 		if (!mddev->pers->hot_add_disk) {
4972 			printk(KERN_WARNING
4973 				"%s: personality does not support diskops!\n",
4974 			       mdname(mddev));
4975 			return -EINVAL;
4976 		}
4977 		if (mddev->persistent)
4978 			rdev = md_import_device(dev, mddev->major_version,
4979 						mddev->minor_version);
4980 		else
4981 			rdev = md_import_device(dev, -1, -1);
4982 		if (IS_ERR(rdev)) {
4983 			printk(KERN_WARNING
4984 				"md: md_import_device returned %ld\n",
4985 				PTR_ERR(rdev));
4986 			return PTR_ERR(rdev);
4987 		}
4988 		/* set saved_raid_disk if appropriate */
4989 		if (!mddev->persistent) {
4990 			if (info->state & (1<<MD_DISK_SYNC)  &&
4991 			    info->raid_disk < mddev->raid_disks)
4992 				rdev->raid_disk = info->raid_disk;
4993 			else
4994 				rdev->raid_disk = -1;
4995 		} else
4996 			super_types[mddev->major_version].
4997 				validate_super(mddev, rdev);
4998 		rdev->saved_raid_disk = rdev->raid_disk;
4999 
5000 		clear_bit(In_sync, &rdev->flags); /* just to be sure */
5001 		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
5002 			set_bit(WriteMostly, &rdev->flags);
5003 		else
5004 			clear_bit(WriteMostly, &rdev->flags);
5005 
5006 		rdev->raid_disk = -1;
5007 		err = bind_rdev_to_array(rdev, mddev);
5008 		if (!err && !mddev->pers->hot_remove_disk) {
5009 			/* If there is hot_add_disk but no hot_remove_disk
5010 			 * then added disks for geometry changes,
5011 			 * and should be added immediately.
5012 			 */
5013 			super_types[mddev->major_version].
5014 				validate_super(mddev, rdev);
5015 			err = mddev->pers->hot_add_disk(mddev, rdev);
5016 			if (err)
5017 				unbind_rdev_from_array(rdev);
5018 		}
5019 		if (err)
5020 			export_rdev(rdev);
5021 		else
5022 			sysfs_notify_dirent(rdev->sysfs_state);
5023 
5024 		md_update_sb(mddev, 1);
5025 		if (mddev->degraded)
5026 			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5027 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5028 		md_wakeup_thread(mddev->thread);
5029 		return err;
5030 	}
5031 
5032 	/* otherwise, add_new_disk is only allowed
5033 	 * for major_version==0 superblocks
5034 	 */
5035 	if (mddev->major_version != 0) {
5036 		printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
5037 		       mdname(mddev));
5038 		return -EINVAL;
5039 	}
5040 
5041 	if (!(info->state & (1<<MD_DISK_FAULTY))) {
5042 		int err;
5043 		rdev = md_import_device(dev, -1, 0);
5044 		if (IS_ERR(rdev)) {
5045 			printk(KERN_WARNING
5046 				"md: error, md_import_device() returned %ld\n",
5047 				PTR_ERR(rdev));
5048 			return PTR_ERR(rdev);
5049 		}
5050 		rdev->desc_nr = info->number;
5051 		if (info->raid_disk < mddev->raid_disks)
5052 			rdev->raid_disk = info->raid_disk;
5053 		else
5054 			rdev->raid_disk = -1;
5055 
5056 		if (rdev->raid_disk < mddev->raid_disks)
5057 			if (info->state & (1<<MD_DISK_SYNC))
5058 				set_bit(In_sync, &rdev->flags);
5059 
5060 		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
5061 			set_bit(WriteMostly, &rdev->flags);
5062 
5063 		if (!mddev->persistent) {
5064 			printk(KERN_INFO "md: nonpersistent superblock ...\n");
5065 			rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
5066 		} else
5067 			rdev->sb_start = calc_dev_sboffset(rdev->bdev);
5068 		rdev->sectors = rdev->sb_start;
5069 
5070 		err = bind_rdev_to_array(rdev, mddev);
5071 		if (err) {
5072 			export_rdev(rdev);
5073 			return err;
5074 		}
5075 	}
5076 
5077 	return 0;
5078 }
5079 
5080 static int hot_remove_disk(mddev_t * mddev, dev_t dev)
5081 {
5082 	char b[BDEVNAME_SIZE];
5083 	mdk_rdev_t *rdev;
5084 
5085 	rdev = find_rdev(mddev, dev);
5086 	if (!rdev)
5087 		return -ENXIO;
5088 
5089 	if (rdev->raid_disk >= 0)
5090 		goto busy;
5091 
5092 	kick_rdev_from_array(rdev);
5093 	md_update_sb(mddev, 1);
5094 	md_new_event(mddev);
5095 
5096 	return 0;
5097 busy:
5098 	printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
5099 		bdevname(rdev->bdev,b), mdname(mddev));
5100 	return -EBUSY;
5101 }
5102 
5103 static int hot_add_disk(mddev_t * mddev, dev_t dev)
5104 {
5105 	char b[BDEVNAME_SIZE];
5106 	int err;
5107 	mdk_rdev_t *rdev;
5108 
5109 	if (!mddev->pers)
5110 		return -ENODEV;
5111 
5112 	if (mddev->major_version != 0) {
5113 		printk(KERN_WARNING "%s: HOT_ADD may only be used with"
5114 			" version-0 superblocks.\n",
5115 			mdname(mddev));
5116 		return -EINVAL;
5117 	}
5118 	if (!mddev->pers->hot_add_disk) {
5119 		printk(KERN_WARNING
5120 			"%s: personality does not support diskops!\n",
5121 			mdname(mddev));
5122 		return -EINVAL;
5123 	}
5124 
5125 	rdev = md_import_device(dev, -1, 0);
5126 	if (IS_ERR(rdev)) {
5127 		printk(KERN_WARNING
5128 			"md: error, md_import_device() returned %ld\n",
5129 			PTR_ERR(rdev));
5130 		return -EINVAL;
5131 	}
5132 
5133 	if (mddev->persistent)
5134 		rdev->sb_start = calc_dev_sboffset(rdev->bdev);
5135 	else
5136 		rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
5137 
5138 	rdev->sectors = rdev->sb_start;
5139 
5140 	if (test_bit(Faulty, &rdev->flags)) {
5141 		printk(KERN_WARNING
5142 			"md: can not hot-add faulty %s disk to %s!\n",
5143 			bdevname(rdev->bdev,b), mdname(mddev));
5144 		err = -EINVAL;
5145 		goto abort_export;
5146 	}
5147 	clear_bit(In_sync, &rdev->flags);
5148 	rdev->desc_nr = -1;
5149 	rdev->saved_raid_disk = -1;
5150 	err = bind_rdev_to_array(rdev, mddev);
5151 	if (err)
5152 		goto abort_export;
5153 
5154 	/*
5155 	 * The rest had better be atomic; we can have disk failures
5156 	 * noticed in interrupt contexts ...
5157 	 */
5158 
5159 	rdev->raid_disk = -1;
5160 
5161 	md_update_sb(mddev, 1);
5162 
5163 	/*
5164 	 * Kick recovery, maybe this spare has to be added to the
5165 	 * array immediately.
5166 	 */
5167 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5168 	md_wakeup_thread(mddev->thread);
5169 	md_new_event(mddev);
5170 	return 0;
5171 
5172 abort_export:
5173 	export_rdev(rdev);
5174 	return err;
5175 }
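
/*
 * Usage sketch (userspace, not part of this driver): HOT_ADD_DISK takes the
 * new member's device number as the ioctl argument (for small major/minor
 * numbers this is just major << 8 | minor) and, as checked above, is only
 * valid for v0.90 arrays:
 *
 *	ioctl(md_fd, HOT_ADD_DISK, (8 << 8) | 16);	(8,16 = /dev/sdb)
 */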
5176 
5177 static int set_bitmap_file(mddev_t *mddev, int fd)
5178 {
5179 	int err;
5180 
5181 	if (mddev->pers) {
5182 		if (!mddev->pers->quiesce)
5183 			return -EBUSY;
5184 		if (mddev->recovery || mddev->sync_thread)
5185 			return -EBUSY;
5186 		/* we should be able to change the bitmap.. */
5187 	}
5188 
5189 
5190 	if (fd >= 0) {
5191 		if (mddev->bitmap)
5192 			return -EEXIST; /* cannot add when bitmap is present */
5193 		mddev->bitmap_info.file = fget(fd);
5194 
5195 		if (mddev->bitmap_info.file == NULL) {
5196 			printk(KERN_ERR "%s: error: failed to get bitmap file\n",
5197 			       mdname(mddev));
5198 			return -EBADF;
5199 		}
5200 
5201 		err = deny_bitmap_write_access(mddev->bitmap_info.file);
5202 		if (err) {
5203 			printk(KERN_ERR "%s: error: bitmap file is already in use\n",
5204 			       mdname(mddev));
5205 			fput(mddev->bitmap_info.file);
5206 			mddev->bitmap_info.file = NULL;
5207 			return err;
5208 		}
5209 		mddev->bitmap_info.offset = 0; /* file overrides offset */
5210 	} else if (mddev->bitmap == NULL)
5211 		return -ENOENT; /* cannot remove what isn't there */
5212 	err = 0;
5213 	if (mddev->pers) {
5214 		mddev->pers->quiesce(mddev, 1);
5215 		if (fd >= 0)
5216 			err = bitmap_create(mddev);
5217 		if (fd < 0 || err) {
5218 			bitmap_destroy(mddev);
5219 			fd = -1; /* make sure to put the file */
5220 		}
5221 		mddev->pers->quiesce(mddev, 0);
5222 	}
5223 	if (fd < 0) {
5224 		if (mddev->bitmap_info.file) {
5225 			restore_bitmap_write_access(mddev->bitmap_info.file);
5226 			fput(mddev->bitmap_info.file);
5227 		}
5228 		mddev->bitmap_info.file = NULL;
5229 	}
5230 
5231 	return err;
5232 }
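
/*
 * Usage sketch (userspace): SET_BITMAP_FILE passes an open file descriptor
 * for the external bitmap file, or -1 to drop an existing file bitmap:
 *
 *	bmfd = open("/var/lib/md0.bitmap", O_RDWR);
 *	ioctl(md_fd, SET_BITMAP_FILE, bmfd);
 *	...
 *	ioctl(md_fd, SET_BITMAP_FILE, -1);		(remove it again)
 */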
5233 
5234 /*
5235  * set_array_info is used two different ways
5236  * The original usage is when creating a new array.
5237  * In this usage, raid_disks is > 0 and it together with
5238  *  level, size, not_persistent,layout,chunksize determine the
5239  *  shape of the array.
5240  *  This will always create an array with a type-0.90.0 superblock.
5241  * The newer usage is when assembling an array.
5242  *  In this case raid_disks will be 0, and the major_version field is
5243  *  used to determine which style super-blocks are to be found on the devices.
5244  *  The minor and patch _version numbers are also kept in case the
5245  *  super_block handler wishes to interpret them.
5246  */
5247 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
5248 {
5249 
5250 	if (info->raid_disks == 0) {
5251 		/* just setting version number for superblock loading */
5252 		if (info->major_version < 0 ||
5253 		    info->major_version >= ARRAY_SIZE(super_types) ||
5254 		    super_types[info->major_version].name == NULL) {
5255 			/* maybe try to auto-load a module? */
5256 			printk(KERN_INFO
5257 				"md: superblock version %d not known\n",
5258 				info->major_version);
5259 			return -EINVAL;
5260 		}
5261 		mddev->major_version = info->major_version;
5262 		mddev->minor_version = info->minor_version;
5263 		mddev->patch_version = info->patch_version;
5264 		mddev->persistent = !info->not_persistent;
5265 		return 0;
5266 	}
5267 	mddev->major_version = MD_MAJOR_VERSION;
5268 	mddev->minor_version = MD_MINOR_VERSION;
5269 	mddev->patch_version = MD_PATCHLEVEL_VERSION;
5270 	mddev->ctime         = get_seconds();
5271 
5272 	mddev->level         = info->level;
5273 	mddev->clevel[0]     = 0;
5274 	mddev->dev_sectors   = 2 * (sector_t)info->size;
5275 	mddev->raid_disks    = info->raid_disks;
5276 	/* don't set md_minor; it is determined by which /dev/md* was
5277 	 * opened
5278 	 */
5279 	if (info->state & (1<<MD_SB_CLEAN))
5280 		mddev->recovery_cp = MaxSector;
5281 	else
5282 		mddev->recovery_cp = 0;
5283 	mddev->persistent    = ! info->not_persistent;
5284 	mddev->external	     = 0;
5285 
5286 	mddev->layout        = info->layout;
5287 	mddev->chunk_sectors = info->chunk_size >> 9;
5288 
5289 	mddev->max_disks     = MD_SB_DISKS;
5290 
5291 	if (mddev->persistent)
5292 		mddev->flags         = 0;
5293 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
5294 
5295 	mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
5296 	mddev->bitmap_info.offset = 0;
5297 
5298 	mddev->reshape_position = MaxSector;
5299 
5300 	/*
5301 	 * Generate a 128 bit UUID
5302 	 */
5303 	get_random_bytes(mddev->uuid, 16);
5304 
5305 	mddev->new_level = mddev->level;
5306 	mddev->new_chunk_sectors = mddev->chunk_sectors;
5307 	mddev->new_layout = mddev->layout;
5308 	mddev->delta_disks = 0;
5309 
5310 	return 0;
5311 }
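
/*
 * Userspace sketch (not part of this driver): creating a fresh array goes
 * through SET_ARRAY_INFO with raid_disks > 0, roughly
 *
 *	mdu_array_info_t info = { 0 };
 *	info.level = 1;				(RAID1)
 *	info.raid_disks = 2;
 *	info.size = 1048576;			(per-device size in KiB)
 *	info.chunk_size = 65536;		(bytes)
 *	ioctl(md_fd, SET_ARRAY_INFO, &info);
 *	(then ADD_NEW_DISK for each member and RUN_ARRAY)
 *
 * whereas assembly passes raid_disks == 0 and only the superblock version
 * fields, as described in the comment above set_array_info().
 */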
5312 
5313 void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors)
5314 {
5315 	WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);
5316 
5317 	if (mddev->external_size)
5318 		return;
5319 
5320 	mddev->array_sectors = array_sectors;
5321 }
5322 EXPORT_SYMBOL(md_set_array_sectors);
5323 
5324 static int update_size(mddev_t *mddev, sector_t num_sectors)
5325 {
5326 	mdk_rdev_t *rdev;
5327 	int rv;
5328 	int fit = (num_sectors == 0);
5329 
5330 	if (mddev->pers->resize == NULL)
5331 		return -EINVAL;
5332 	/* The "num_sectors" is the number of sectors of each device that
5333 	 * is used.  This can only make sense for arrays with redundancy.
5334 	 * linear and raid0 always use whatever space is available. We can only
5335 	 * consider changing this number if no resync or reconstruction is
5336 	 * happening, and if the new size is acceptable. It must fit before the
5337 	 * sb_start or, if that is <data_offset, it must fit before the size
5338 	 * of each device.  If num_sectors is zero, we find the largest size
5339 	 * that fits.
5340 	 *
5341 	 */
5342 	if (mddev->sync_thread)
5343 		return -EBUSY;
5344 	if (mddev->bitmap)
5345 		/* Sorry, cannot grow a bitmap yet, just remove it,
5346 		 * grow, and re-add.
5347 		 */
5348 		return -EBUSY;
5349 	list_for_each_entry(rdev, &mddev->disks, same_set) {
5350 		sector_t avail = rdev->sectors;
5351 
5352 		if (fit && (num_sectors == 0 || num_sectors > avail))
5353 			num_sectors = avail;
5354 		if (avail < num_sectors)
5355 			return -ENOSPC;
5356 	}
5357 	rv = mddev->pers->resize(mddev, num_sectors);
5358 	if (!rv)
5359 		revalidate_disk(mddev->gendisk);
5360 	return rv;
5361 }
5362 
5363 static int update_raid_disks(mddev_t *mddev, int raid_disks)
5364 {
5365 	int rv;
5366 	/* change the number of raid disks */
5367 	if (mddev->pers->check_reshape == NULL)
5368 		return -EINVAL;
5369 	if (raid_disks <= 0 ||
5370 	    raid_disks >= mddev->max_disks)
5371 		return -EINVAL;
5372 	if (mddev->sync_thread || mddev->reshape_position != MaxSector)
5373 		return -EBUSY;
5374 	mddev->delta_disks = raid_disks - mddev->raid_disks;
5375 
5376 	rv = mddev->pers->check_reshape(mddev);
5377 	return rv;
5378 }
5379 
5380 
5381 /*
5382  * update_array_info is used to change the configuration of an
5383  * on-line array.
5384  * The version, ctime, level, size, raid_disks, not_persistent, layout, chunk_size
5385  * fields in the info are checked against the array.
5386  * Any differences that cannot be handled will cause an error.
5387  * Normally, only one change can be managed at a time.
5388  */
5389 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
5390 {
5391 	int rv = 0;
5392 	int cnt = 0;
5393 	int state = 0;
5394 
5395 	/* calculate expected state, ignoring low bits */
5396 	if (mddev->bitmap && mddev->bitmap_info.offset)
5397 		state |= (1 << MD_SB_BITMAP_PRESENT);
5398 
5399 	if (mddev->major_version != info->major_version ||
5400 	    mddev->minor_version != info->minor_version ||
5401 /*	    mddev->patch_version != info->patch_version || */
5402 	    mddev->ctime         != info->ctime         ||
5403 	    mddev->level         != info->level         ||
5404 /*	    mddev->layout        != info->layout        || */
5405 	    !mddev->persistent	 != info->not_persistent||
5406 	    mddev->chunk_sectors != info->chunk_size >> 9 ||
5407 	    /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
5408 	    ((state^info->state) & 0xfffffe00)
5409 		)
5410 		return -EINVAL;
5411 	/* Check there is only one change */
5412 	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
5413 		cnt++;
5414 	if (mddev->raid_disks != info->raid_disks)
5415 		cnt++;
5416 	if (mddev->layout != info->layout)
5417 		cnt++;
5418 	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
5419 		cnt++;
5420 	if (cnt == 0)
5421 		return 0;
5422 	if (cnt > 1)
5423 		return -EINVAL;
5424 
5425 	if (mddev->layout != info->layout) {
5426 		/* Change layout
5427 		 * we don't need to do anything at the md level, the
5428 		 * personality will take care of it all.
5429 		 */
5430 		if (mddev->pers->check_reshape == NULL)
5431 			return -EINVAL;
5432 		else {
5433 			mddev->new_layout = info->layout;
5434 			rv = mddev->pers->check_reshape(mddev);
5435 			if (rv)
5436 				mddev->new_layout = mddev->layout;
5437 			return rv;
5438 		}
5439 	}
5440 	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
5441 		rv = update_size(mddev, (sector_t)info->size * 2);
5442 
5443 	if (mddev->raid_disks    != info->raid_disks)
5444 		rv = update_raid_disks(mddev, info->raid_disks);
5445 
5446 	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
5447 		if (mddev->pers->quiesce == NULL)
5448 			return -EINVAL;
5449 		if (mddev->recovery || mddev->sync_thread)
5450 			return -EBUSY;
5451 		if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
5452 			/* add the bitmap */
5453 			if (mddev->bitmap)
5454 				return -EEXIST;
5455 			if (mddev->bitmap_info.default_offset == 0)
5456 				return -EINVAL;
5457 			mddev->bitmap_info.offset =
5458 				mddev->bitmap_info.default_offset;
5459 			mddev->pers->quiesce(mddev, 1);
5460 			rv = bitmap_create(mddev);
5461 			if (rv)
5462 				bitmap_destroy(mddev);
5463 			mddev->pers->quiesce(mddev, 0);
5464 		} else {
5465 			/* remove the bitmap */
5466 			if (!mddev->bitmap)
5467 				return -ENOENT;
5468 			if (mddev->bitmap->file)
5469 				return -EINVAL;
5470 			mddev->pers->quiesce(mddev, 1);
5471 			bitmap_destroy(mddev);
5472 			mddev->pers->quiesce(mddev, 0);
5473 			mddev->bitmap_info.offset = 0;
5474 		}
5475 	}
5476 	md_update_sb(mddev, 1);
5477 	return rv;
5478 }
5479 
5480 static int set_disk_faulty(mddev_t *mddev, dev_t dev)
5481 {
5482 	mdk_rdev_t *rdev;
5483 
5484 	if (mddev->pers == NULL)
5485 		return -ENODEV;
5486 
5487 	rdev = find_rdev(mddev, dev);
5488 	if (!rdev)
5489 		return -ENODEV;
5490 
5491 	md_error(mddev, rdev);
5492 	return 0;
5493 }
5494 
5495 /*
5496  * We have a problem here : there is no easy way to give a CHS
5497  * virtual geometry. We currently pretend that we have a 2 heads
5498  * 4 sectors (with a BIG number of cylinders...). This drives
5499  * dosfs just mad... ;-)
5500  */
5501 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
5502 {
5503 	mddev_t *mddev = bdev->bd_disk->private_data;
5504 
5505 	geo->heads = 2;
5506 	geo->sectors = 4;
5507 	geo->cylinders = get_capacity(mddev->gendisk) / 8;
5508 	return 0;
5509 }
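
/*
 * Editor's worked example (illustrative, not part of the original source):
 * with the fake 2-head, 4-sector geometry reported above, a cylinder is
 * 2 * 4 = 8 sectors, so md_getgeo() simply reports capacity / 8.  For an
 * assumed 1 TiB array (2147483648 512-byte sectors) that would give
 * geo->cylinders = 268435456 - far beyond anything a real CHS layout
 * could describe, which is exactly the point of the comment above.
 */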
5510 
5511 static int md_ioctl(struct block_device *bdev, fmode_t mode,
5512 			unsigned int cmd, unsigned long arg)
5513 {
5514 	int err = 0;
5515 	void __user *argp = (void __user *)arg;
5516 	mddev_t *mddev = NULL;
5517 
5518 	if (!capable(CAP_SYS_ADMIN))
5519 		return -EACCES;
5520 
5521 	/*
5522 	 * Commands dealing with the RAID driver but not any
5523 	 * particular array:
5524 	 */
5525 	switch (cmd)
5526 	{
5527 		case RAID_VERSION:
5528 			err = get_version(argp);
5529 			goto done;
5530 
5531 		case PRINT_RAID_DEBUG:
5532 			err = 0;
5533 			md_print_devices();
5534 			goto done;
5535 
5536 #ifndef MODULE
5537 		case RAID_AUTORUN:
5538 			err = 0;
5539 			autostart_arrays(arg);
5540 			goto done;
5541 #endif
5542 		default:;
5543 	}
5544 
5545 	/*
5546 	 * Commands creating/starting a new array:
5547 	 */
5548 
5549 	mddev = bdev->bd_disk->private_data;
5550 
5551 	if (!mddev) {
5552 		BUG();
5553 		goto abort;
5554 	}
5555 
5556 	err = mddev_lock(mddev);
5557 	if (err) {
5558 		printk(KERN_INFO
5559 			"md: ioctl lock interrupted, reason %d, cmd %d\n",
5560 			err, cmd);
5561 		goto abort;
5562 	}
5563 
5564 	switch (cmd)
5565 	{
5566 		case SET_ARRAY_INFO:
5567 			{
5568 				mdu_array_info_t info;
5569 				if (!arg)
5570 					memset(&info, 0, sizeof(info));
5571 				else if (copy_from_user(&info, argp, sizeof(info))) {
5572 					err = -EFAULT;
5573 					goto abort_unlock;
5574 				}
5575 				if (mddev->pers) {
5576 					err = update_array_info(mddev, &info);
5577 					if (err) {
5578 						printk(KERN_WARNING "md: couldn't update"
5579 						       " array info. %d\n", err);
5580 						goto abort_unlock;
5581 					}
5582 					goto done_unlock;
5583 				}
5584 				if (!list_empty(&mddev->disks)) {
5585 					printk(KERN_WARNING
5586 					       "md: array %s already has disks!\n",
5587 					       mdname(mddev));
5588 					err = -EBUSY;
5589 					goto abort_unlock;
5590 				}
5591 				if (mddev->raid_disks) {
5592 					printk(KERN_WARNING
5593 					       "md: array %s already initialised!\n",
5594 					       mdname(mddev));
5595 					err = -EBUSY;
5596 					goto abort_unlock;
5597 				}
5598 				err = set_array_info(mddev, &info);
5599 				if (err) {
5600 					printk(KERN_WARNING "md: couldn't set"
5601 					       " array info. %d\n", err);
5602 					goto abort_unlock;
5603 				}
5604 			}
5605 			goto done_unlock;
5606 
5607 		default:;
5608 	}
5609 
5610 	/*
5611 	 * Commands querying/configuring an existing array:
5612 	 */
5613 	/* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
5614 	 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
5615 	if ((!mddev->raid_disks && !mddev->external)
5616 	    && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
5617 	    && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
5618 	    && cmd != GET_BITMAP_FILE) {
5619 		err = -ENODEV;
5620 		goto abort_unlock;
5621 	}
5622 
5623 	/*
5624 	 * Commands even a read-only array can execute:
5625 	 */
5626 	switch (cmd)
5627 	{
5628 		case GET_ARRAY_INFO:
5629 			err = get_array_info(mddev, argp);
5630 			goto done_unlock;
5631 
5632 		case GET_BITMAP_FILE:
5633 			err = get_bitmap_file(mddev, argp);
5634 			goto done_unlock;
5635 
5636 		case GET_DISK_INFO:
5637 			err = get_disk_info(mddev, argp);
5638 			goto done_unlock;
5639 
5640 		case RESTART_ARRAY_RW:
5641 			err = restart_array(mddev);
5642 			goto done_unlock;
5643 
5644 		case STOP_ARRAY:
5645 			err = do_md_stop(mddev, 0, 1);
5646 			goto done_unlock;
5647 
5648 		case STOP_ARRAY_RO:
5649 			err = do_md_stop(mddev, 1, 1);
5650 			goto done_unlock;
5651 
5652 	}
5653 
5654 	/*
5655 	 * The remaining ioctls change the state of the
5656 	 * superblock, so we do not allow them on read-only arrays.
5657 	 * However non-MD ioctls (e.g. get-size) will still come through
5658 	 * here and hit the 'default' below, so only disallow
5659 	 * 'md' ioctls, and switch to rw mode if started auto-readonly.
5660 	 */
5661 	if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) {
5662 		if (mddev->ro == 2) {
5663 			mddev->ro = 0;
5664 			sysfs_notify_dirent(mddev->sysfs_state);
5665 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5666 			md_wakeup_thread(mddev->thread);
5667 		} else {
5668 			err = -EROFS;
5669 			goto abort_unlock;
5670 		}
5671 	}
5672 
5673 	switch (cmd)
5674 	{
5675 		case ADD_NEW_DISK:
5676 		{
5677 			mdu_disk_info_t info;
5678 			if (copy_from_user(&info, argp, sizeof(info)))
5679 				err = -EFAULT;
5680 			else
5681 				err = add_new_disk(mddev, &info);
5682 			goto done_unlock;
5683 		}
5684 
5685 		case HOT_REMOVE_DISK:
5686 			err = hot_remove_disk(mddev, new_decode_dev(arg));
5687 			goto done_unlock;
5688 
5689 		case HOT_ADD_DISK:
5690 			err = hot_add_disk(mddev, new_decode_dev(arg));
5691 			goto done_unlock;
5692 
5693 		case SET_DISK_FAULTY:
5694 			err = set_disk_faulty(mddev, new_decode_dev(arg));
5695 			goto done_unlock;
5696 
5697 		case RUN_ARRAY:
5698 			err = do_md_run(mddev);
5699 			goto done_unlock;
5700 
5701 		case SET_BITMAP_FILE:
5702 			err = set_bitmap_file(mddev, (int)arg);
5703 			goto done_unlock;
5704 
5705 		default:
5706 			err = -EINVAL;
5707 			goto abort_unlock;
5708 	}
5709 
5710 done_unlock:
5711 abort_unlock:
5712 	if (mddev->hold_active == UNTIL_IOCTL &&
5713 	    err != -EINVAL)
5714 		mddev->hold_active = 0;
5715 	mddev_unlock(mddev);
5716 
5717 	return err;
5718 done:
5719 	if (err)
5720 		MD_BUG();
5721 abort:
5722 	return err;
5723 }
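
/*
 * Editor's sketch (not part of the original driver): a minimal user-space
 * caller of the ioctls handled above, assuming the uapi definitions of
 * mdu_array_info_t and GET_ARRAY_INFO from <linux/raid/md_u.h>.  Note that
 * md_ioctl() requires CAP_SYS_ADMIN, so this only works as root.  Kept
 * inside #if 0 so it never affects the kernel build.
 */
#if 0
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/raid/md_u.h>

int print_array_info(const char *path)		/* e.g. "/dev/md0" */
{
	mdu_array_info_t info;
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		return -1;
	/* GET_ARRAY_INFO is one of the read-only commands accepted above */
	if (ioctl(fd, GET_ARRAY_INFO, &info) == 0)
		printf("%s: level %d, %d raid disks, %d active\n",
		       path, info.level, info.raid_disks, info.active_disks);
	close(fd);
	return 0;
}
#endif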
5724 #ifdef CONFIG_COMPAT
5725 static int md_compat_ioctl(struct block_device *bdev, fmode_t mode,
5726 		    unsigned int cmd, unsigned long arg)
5727 {
5728 	switch (cmd) {
5729 	case HOT_REMOVE_DISK:
5730 	case HOT_ADD_DISK:
5731 	case SET_DISK_FAULTY:
5732 	case SET_BITMAP_FILE:
5733 		/* These take an integer arg, so do not convert it */
5734 		break;
5735 	default:
5736 		arg = (unsigned long)compat_ptr(arg);
5737 		break;
5738 	}
5739 
5740 	return md_ioctl(bdev, mode, cmd, arg);
5741 }
5742 #endif /* CONFIG_COMPAT */
5743 
5744 static int md_open(struct block_device *bdev, fmode_t mode)
5745 {
5746 	/*
5747 	 * Succeed if we can lock the mddev, which confirms that
5748 	 * it isn't being stopped right now.
5749 	 */
5750 	mddev_t *mddev = mddev_find(bdev->bd_dev);
5751 	int err;
5752 
5753 	if (mddev->gendisk != bdev->bd_disk) {
5754 		/* we are racing with mddev_put which is discarding this
5755 		 * bd_disk.
5756 		 */
5757 		mddev_put(mddev);
5758 		/* Wait until bdev->bd_disk is definitely gone */
5759 		flush_scheduled_work();
5760 		/* Then retry the open from the top */
5761 		return -ERESTARTSYS;
5762 	}
5763 	BUG_ON(mddev != bdev->bd_disk->private_data);
5764 
5765 	if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
5766 		goto out;
5767 
5768 	err = 0;
5769 	atomic_inc(&mddev->openers);
5770 	mutex_unlock(&mddev->open_mutex);
5771 
5772 	check_disk_change(bdev);
5773  out:
5774 	return err;
5775 }
5776 
5777 static int md_release(struct gendisk *disk, fmode_t mode)
5778 {
5779  	mddev_t *mddev = disk->private_data;
5780 
5781 	BUG_ON(!mddev);
5782 	atomic_dec(&mddev->openers);
5783 	mddev_put(mddev);
5784 
5785 	return 0;
5786 }
5787 
5788 static int md_media_changed(struct gendisk *disk)
5789 {
5790 	mddev_t *mddev = disk->private_data;
5791 
5792 	return mddev->changed;
5793 }
5794 
5795 static int md_revalidate(struct gendisk *disk)
5796 {
5797 	mddev_t *mddev = disk->private_data;
5798 
5799 	mddev->changed = 0;
5800 	return 0;
5801 }
5802 static const struct block_device_operations md_fops =
5803 {
5804 	.owner		= THIS_MODULE,
5805 	.open		= md_open,
5806 	.release	= md_release,
5807 	.ioctl		= md_ioctl,
5808 #ifdef CONFIG_COMPAT
5809 	.compat_ioctl	= md_compat_ioctl,
5810 #endif
5811 	.getgeo		= md_getgeo,
5812 	.media_changed	= md_media_changed,
5813 	.revalidate_disk= md_revalidate,
5814 };
5815 
5816 static int md_thread(void * arg)
5817 {
5818 	mdk_thread_t *thread = arg;
5819 
5820 	/*
5821 	 * md_thread is a 'system thread'; its priority should be very
5822 	 * high. We avoid resource deadlocks individually in each
5823 	 * raid personality. (RAID5 does preallocation.) We also use RR and
5824 	 * the very same RT priority as kswapd, so we will never get
5825 	 * into a priority inversion deadlock.
5826 	 *
5827 	 * We definitely have to have equal or higher priority than
5828 	 * bdflush, otherwise bdflush will deadlock if there are too
5829 	 * many dirty RAID5 blocks.
5830 	 */
5831 
5832 	allow_signal(SIGKILL);
5833 	while (!kthread_should_stop()) {
5834 
5835 		/* We need to wait INTERRUPTIBLE so that
5836 		 * we don't add to the load-average.
5837 		 * That means we need to be sure no signals are
5838 		 * pending
5839 		 */
5840 		if (signal_pending(current))
5841 			flush_signals(current);
5842 
5843 		wait_event_interruptible_timeout
5844 			(thread->wqueue,
5845 			 test_bit(THREAD_WAKEUP, &thread->flags)
5846 			 || kthread_should_stop(),
5847 			 thread->timeout);
5848 
5849 		clear_bit(THREAD_WAKEUP, &thread->flags);
5850 
5851 		thread->run(thread->mddev);
5852 	}
5853 
5854 	return 0;
5855 }
5856 
5857 void md_wakeup_thread(mdk_thread_t *thread)
5858 {
5859 	if (thread) {
5860 		dprintk("md: waking up MD thread %s.\n", thread->tsk->comm);
5861 		set_bit(THREAD_WAKEUP, &thread->flags);
5862 		wake_up(&thread->wqueue);
5863 	}
5864 }
5865 
5866 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
5867 				 const char *name)
5868 {
5869 	mdk_thread_t *thread;
5870 
5871 	thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL);
5872 	if (!thread)
5873 		return NULL;
5874 
5875 	init_waitqueue_head(&thread->wqueue);
5876 
5877 	thread->run = run;
5878 	thread->mddev = mddev;
5879 	thread->timeout = MAX_SCHEDULE_TIMEOUT;
5880 	thread->tsk = kthread_run(md_thread, thread,
5881 				  "%s_%s",
5882 				  mdname(thread->mddev),
5883 				  name ?: mddev->pers->name);
5884 	if (IS_ERR(thread->tsk)) {
5885 		kfree(thread);
5886 		return NULL;
5887 	}
5888 	return thread;
5889 }
5890 
5891 void md_unregister_thread(mdk_thread_t *thread)
5892 {
5893 	if (!thread)
5894 		return;
5895 	dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
5896 
5897 	kthread_stop(thread->tsk);
5898 	kfree(thread);
5899 }
5900 
5901 void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
5902 {
5903 	if (!mddev) {
5904 		MD_BUG();
5905 		return;
5906 	}
5907 
5908 	if (!rdev || test_bit(Faulty, &rdev->flags))
5909 		return;
5910 
5911 	if (mddev->external)
5912 		set_bit(Blocked, &rdev->flags);
5913 /*
5914 	dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
5915 		mdname(mddev),
5916 		MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
5917 		__builtin_return_address(0),__builtin_return_address(1),
5918 		__builtin_return_address(2),__builtin_return_address(3));
5919 */
5920 	if (!mddev->pers)
5921 		return;
5922 	if (!mddev->pers->error_handler)
5923 		return;
5924 	mddev->pers->error_handler(mddev,rdev);
5925 	if (mddev->degraded)
5926 		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5927 	set_bit(StateChanged, &rdev->flags);
5928 	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5929 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5930 	md_wakeup_thread(mddev->thread);
5931 	md_new_event_inintr(mddev);
5932 }
5933 
5934 /* seq_file implementation /proc/mdstat */
5935 
5936 static void status_unused(struct seq_file *seq)
5937 {
5938 	int i = 0;
5939 	mdk_rdev_t *rdev;
5940 
5941 	seq_printf(seq, "unused devices: ");
5942 
5943 	list_for_each_entry(rdev, &pending_raid_disks, same_set) {
5944 		char b[BDEVNAME_SIZE];
5945 		i++;
5946 		seq_printf(seq, "%s ",
5947 			      bdevname(rdev->bdev,b));
5948 	}
5949 	if (!i)
5950 		seq_printf(seq, "<none>");
5951 
5952 	seq_printf(seq, "\n");
5953 }
5954 
5955 
5956 static void status_resync(struct seq_file *seq, mddev_t * mddev)
5957 {
5958 	sector_t max_sectors, resync, res;
5959 	unsigned long dt, db;
5960 	sector_t rt;
5961 	int scale;
5962 	unsigned int per_milli;
5963 
5964 	resync = mddev->curr_resync - atomic_read(&mddev->recovery_active);
5965 
5966 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
5967 		max_sectors = mddev->resync_max_sectors;
5968 	else
5969 		max_sectors = mddev->dev_sectors;
5970 
5971 	/*
5972 	 * Should not happen.
5973 	 */
5974 	if (!max_sectors) {
5975 		MD_BUG();
5976 		return;
5977 	}
5978 	/* Pick 'scale' such that (resync>>scale)*1000 will fit
5979 	 * in a sector_t, and (max_sectors>>scale) will fit in a
5980 	 * u32, as those are the requirements for sector_div.
5981 	 * Thus 'scale' must be at least 10
5982 	 */
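	/*
	 * Editor's worked example (assumed figures, 64-bit sector_t): for an
	 * 8 TiB device max_sectors is about 2^34, so max_sectors/2 = 2^33 is
	 * already below 1ULL<<(10+32) and the loop below leaves scale at 10.
	 * Then max_sectors>>10 (~2^24) fits a u32 and (resync>>10)*1000 stays
	 * well inside a 64-bit sector_t, which is all sector_div() needs.
	 */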
5983 	scale = 10;
5984 	if (sizeof(sector_t) > sizeof(unsigned long)) {
5985 		while ( max_sectors/2 > (1ULL<<(scale+32)))
5986 			scale++;
5987 	}
5988 	res = (resync>>scale)*1000;
5989 	sector_div(res, (u32)((max_sectors>>scale)+1));
5990 
5991 	per_milli = res;
5992 	{
5993 		int i, x = per_milli/50, y = 20-x;
5994 		seq_printf(seq, "[");
5995 		for (i = 0; i < x; i++)
5996 			seq_printf(seq, "=");
5997 		seq_printf(seq, ">");
5998 		for (i = 0; i < y; i++)
5999 			seq_printf(seq, ".");
6000 		seq_printf(seq, "] ");
6001 	}
6002 	seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
6003 		   (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
6004 		    "reshape" :
6005 		    (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
6006 		     "check" :
6007 		     (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
6008 		      "resync" : "recovery"))),
6009 		   per_milli/10, per_milli % 10,
6010 		   (unsigned long long) resync/2,
6011 		   (unsigned long long) max_sectors/2);
6012 
6013 	/*
6014 	 * dt: time from mark until now
6015 	 * db: blocks written from mark until now
6016 	 * rt: remaining time
6017 	 *
6018 	 * rt is a sector_t, so could be 32bit or 64bit.
6019 	 * So we divide before multiply in case it is 32bit and close
6020 	 * to the limit.
6021 	 * We scale the divisor (db) by 32 to avoid losing precision
6022 	 * near the end of resync when the number of remaining sectors
6023 	 * is close to 'db'.
6024 	 * We then divide rt by 32 after multiplying by db to compensate.
6025 	 * The '+1' avoids division by zero if db is very small.
6026 	 */
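	/*
	 * Editor's worked example (assumed numbers): with 1 GiB left
	 * (2097152 sectors), dt = 30s and db = 614400 sectors since the
	 * mark (roughly 10 MiB/s), sector_div(rt, db/32+1) yields ~109,
	 * *dt gives ~3270 and >>5 gives ~102 seconds - matching the
	 * naive remaining/rate estimate without overflowing a 32-bit rt.
	 */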
6027 	dt = ((jiffies - mddev->resync_mark) / HZ);
6028 	if (!dt) dt++;
6029 	db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
6030 		- mddev->resync_mark_cnt;
6031 
6032 	rt = max_sectors - resync;    /* number of remaining sectors */
6033 	sector_div(rt, db/32+1);
6034 	rt *= dt;
6035 	rt >>= 5;
6036 
6037 	seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
6038 		   ((unsigned long)rt % 60)/6);
6039 
6040 	seq_printf(seq, " speed=%ldK/sec", db/2/dt);
6041 }
6042 
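/*
 * /proc/mdstat iteration (editor's descriptive comment): besides real
 * mddevs, md_seq_start()/md_seq_next() hand out two sentinel cursors -
 * (void*)1 for the "Personalities" header and (void*)2 for the trailing
 * "unused devices" line - which md_seq_show() and md_seq_stop() test for
 * explicitly before treating the cursor as an mddev.
 */
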
6043 static void *md_seq_start(struct seq_file *seq, loff_t *pos)
6044 {
6045 	struct list_head *tmp;
6046 	loff_t l = *pos;
6047 	mddev_t *mddev;
6048 
6049 	if (l >= 0x10000)
6050 		return NULL;
6051 	if (!l--)
6052 		/* header */
6053 		return (void*)1;
6054 
6055 	spin_lock(&all_mddevs_lock);
6056 	list_for_each(tmp,&all_mddevs)
6057 		if (!l--) {
6058 			mddev = list_entry(tmp, mddev_t, all_mddevs);
6059 			mddev_get(mddev);
6060 			spin_unlock(&all_mddevs_lock);
6061 			return mddev;
6062 		}
6063 	spin_unlock(&all_mddevs_lock);
6064 	if (!l--)
6065 		return (void*)2;/* tail */
6066 	return NULL;
6067 }
6068 
6069 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
6070 {
6071 	struct list_head *tmp;
6072 	mddev_t *next_mddev, *mddev = v;
6073 
6074 	++*pos;
6075 	if (v == (void*)2)
6076 		return NULL;
6077 
6078 	spin_lock(&all_mddevs_lock);
6079 	if (v == (void*)1)
6080 		tmp = all_mddevs.next;
6081 	else
6082 		tmp = mddev->all_mddevs.next;
6083 	if (tmp != &all_mddevs)
6084 		next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
6085 	else {
6086 		next_mddev = (void*)2;
6087 		*pos = 0x10000;
6088 	}
6089 	spin_unlock(&all_mddevs_lock);
6090 
6091 	if (v != (void*)1)
6092 		mddev_put(mddev);
6093 	return next_mddev;
6094 
6095 }
6096 
6097 static void md_seq_stop(struct seq_file *seq, void *v)
6098 {
6099 	mddev_t *mddev = v;
6100 
6101 	if (mddev && v != (void*)1 && v != (void*)2)
6102 		mddev_put(mddev);
6103 }
6104 
6105 struct mdstat_info {
6106 	int event;
6107 };
6108 
6109 static int md_seq_show(struct seq_file *seq, void *v)
6110 {
6111 	mddev_t *mddev = v;
6112 	sector_t sectors;
6113 	mdk_rdev_t *rdev;
6114 	struct mdstat_info *mi = seq->private;
6115 	struct bitmap *bitmap;
6116 
6117 	if (v == (void*)1) {
6118 		struct mdk_personality *pers;
6119 		seq_printf(seq, "Personalities : ");
6120 		spin_lock(&pers_lock);
6121 		list_for_each_entry(pers, &pers_list, list)
6122 			seq_printf(seq, "[%s] ", pers->name);
6123 
6124 		spin_unlock(&pers_lock);
6125 		seq_printf(seq, "\n");
6126 		mi->event = atomic_read(&md_event_count);
6127 		return 0;
6128 	}
6129 	if (v == (void*)2) {
6130 		status_unused(seq);
6131 		return 0;
6132 	}
6133 
6134 	if (mddev_lock(mddev) < 0)
6135 		return -EINTR;
6136 
6137 	if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
6138 		seq_printf(seq, "%s : %sactive", mdname(mddev),
6139 						mddev->pers ? "" : "in");
6140 		if (mddev->pers) {
6141 			if (mddev->ro==1)
6142 				seq_printf(seq, " (read-only)");
6143 			if (mddev->ro==2)
6144 				seq_printf(seq, " (auto-read-only)");
6145 			seq_printf(seq, " %s", mddev->pers->name);
6146 		}
6147 
6148 		sectors = 0;
6149 		list_for_each_entry(rdev, &mddev->disks, same_set) {
6150 			char b[BDEVNAME_SIZE];
6151 			seq_printf(seq, " %s[%d]",
6152 				bdevname(rdev->bdev,b), rdev->desc_nr);
6153 			if (test_bit(WriteMostly, &rdev->flags))
6154 				seq_printf(seq, "(W)");
6155 			if (test_bit(Faulty, &rdev->flags)) {
6156 				seq_printf(seq, "(F)");
6157 				continue;
6158 			} else if (rdev->raid_disk < 0)
6159 				seq_printf(seq, "(S)"); /* spare */
6160 			sectors += rdev->sectors;
6161 		}
6162 
6163 		if (!list_empty(&mddev->disks)) {
6164 			if (mddev->pers)
6165 				seq_printf(seq, "\n      %llu blocks",
6166 					   (unsigned long long)
6167 					   mddev->array_sectors / 2);
6168 			else
6169 				seq_printf(seq, "\n      %llu blocks",
6170 					   (unsigned long long)sectors / 2);
6171 		}
6172 		if (mddev->persistent) {
6173 			if (mddev->major_version != 0 ||
6174 			    mddev->minor_version != 90) {
6175 				seq_printf(seq," super %d.%d",
6176 					   mddev->major_version,
6177 					   mddev->minor_version);
6178 			}
6179 		} else if (mddev->external)
6180 			seq_printf(seq, " super external:%s",
6181 				   mddev->metadata_type);
6182 		else
6183 			seq_printf(seq, " super non-persistent");
6184 
6185 		if (mddev->pers) {
6186 			mddev->pers->status(seq, mddev);
6187 	 		seq_printf(seq, "\n      ");
6188 			if (mddev->pers->sync_request) {
6189 				if (mddev->curr_resync > 2) {
6190 					status_resync(seq, mddev);
6191 					seq_printf(seq, "\n      ");
6192 				} else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
6193 					seq_printf(seq, "\tresync=DELAYED\n      ");
6194 				else if (mddev->recovery_cp < MaxSector)
6195 					seq_printf(seq, "\tresync=PENDING\n      ");
6196 			}
6197 		} else
6198 			seq_printf(seq, "\n       ");
6199 
6200 		if ((bitmap = mddev->bitmap)) {
6201 			unsigned long chunk_kb;
6202 			unsigned long flags;
6203 			spin_lock_irqsave(&bitmap->lock, flags);
6204 			chunk_kb = mddev->bitmap_info.chunksize >> 10;
6205 			seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
6206 				"%lu%s chunk",
6207 				bitmap->pages - bitmap->missing_pages,
6208 				bitmap->pages,
6209 				(bitmap->pages - bitmap->missing_pages)
6210 					<< (PAGE_SHIFT - 10),
6211 				chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize,
6212 				chunk_kb ? "KB" : "B");
6213 			if (bitmap->file) {
6214 				seq_printf(seq, ", file: ");
6215 				seq_path(seq, &bitmap->file->f_path, " \t\n");
6216 			}
6217 
6218 			seq_printf(seq, "\n");
6219 			spin_unlock_irqrestore(&bitmap->lock, flags);
6220 		}
6221 
6222 		seq_printf(seq, "\n");
6223 	}
6224 	mddev_unlock(mddev);
6225 
6226 	return 0;
6227 }
6228 
6229 static const struct seq_operations md_seq_ops = {
6230 	.start  = md_seq_start,
6231 	.next   = md_seq_next,
6232 	.stop   = md_seq_stop,
6233 	.show   = md_seq_show,
6234 };
6235 
6236 static int md_seq_open(struct inode *inode, struct file *file)
6237 {
6238 	int error;
6239 	struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL);
6240 	if (mi == NULL)
6241 		return -ENOMEM;
6242 
6243 	error = seq_open(file, &md_seq_ops);
6244 	if (error)
6245 		kfree(mi);
6246 	else {
6247 		struct seq_file *p = file->private_data;
6248 		p->private = mi;
6249 		mi->event = atomic_read(&md_event_count);
6250 	}
6251 	return error;
6252 }
6253 
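/*
 * Poll support for /proc/mdstat (editor's descriptive comment): reading is
 * always allowed, and POLLERR|POLLPRI is additionally reported once
 * md_event_count has advanced past the value recorded at open time (and
 * refreshed whenever the header is re-emitted), so monitoring tools blocked
 * in poll() wake up on array events.
 */
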
6254 static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
6255 {
6256 	struct seq_file *m = filp->private_data;
6257 	struct mdstat_info *mi = m->private;
6258 	int mask;
6259 
6260 	poll_wait(filp, &md_event_waiters, wait);
6261 
6262 	/* always allow read */
6263 	mask = POLLIN | POLLRDNORM;
6264 
6265 	if (mi->event != atomic_read(&md_event_count))
6266 		mask |= POLLERR | POLLPRI;
6267 	return mask;
6268 }
6269 
6270 static const struct file_operations md_seq_fops = {
6271 	.owner		= THIS_MODULE,
6272 	.open           = md_seq_open,
6273 	.read           = seq_read,
6274 	.llseek         = seq_lseek,
6275 	.release	= seq_release_private,
6276 	.poll		= mdstat_poll,
6277 };
6278 
6279 int register_md_personality(struct mdk_personality *p)
6280 {
6281 	spin_lock(&pers_lock);
6282 	list_add_tail(&p->list, &pers_list);
6283 	printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
6284 	spin_unlock(&pers_lock);
6285 	return 0;
6286 }
6287 
6288 int unregister_md_personality(struct mdk_personality *p)
6289 {
6290 	printk(KERN_INFO "md: %s personality unregistered\n", p->name);
6291 	spin_lock(&pers_lock);
6292 	list_del_init(&p->list);
6293 	spin_unlock(&pers_lock);
6294 	return 0;
6295 }
6296 
6297 static int is_mddev_idle(mddev_t *mddev, int init)
6298 {
6299 	mdk_rdev_t * rdev;
6300 	int idle;
6301 	int curr_events;
6302 
6303 	idle = 1;
6304 	rcu_read_lock();
6305 	rdev_for_each_rcu(rdev, mddev) {
6306 		struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
6307 		curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
6308 			      (int)part_stat_read(&disk->part0, sectors[1]) -
6309 			      atomic_read(&disk->sync_io);
6310 		/* sync IO will cause sync_io to increase before the disk_stats
6311 		 * as sync_io is counted when a request starts, and
6312 		 * disk_stats is counted when it completes.
6313 		 * So resync activity will cause curr_events to be smaller than
6314 		 * when there was no such activity.
6315 		 * non-sync IO will cause disk_stat to increase without
6316 		 * increasing sync_io so curr_events will (eventually)
6317 		 * be larger than it was before.  Once it becomes
6318 		 * substantially larger, the test below will cause
6319 		 * the array to appear non-idle, and resync will slow
6320 		 * down.
6321 		 * If there is a lot of outstanding resync activity when
6322 		 * we set last_event to curr_events, then all that activity
6323 		 * completing might cause the array to appear non-idle
6324 		 * and resync will be slowed down even though there might
6325 		 * not have been non-resync activity.  This will only
6326 		 * happen once though.  'last_events' will soon reflect
6327 		 * the state where there is little or no outstanding
6328 		 * resync requests, and further resync activity will
6329 		 * always make curr_events less than last_events.
6330 		 *
6331 		 */
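		/* Editor's note: the slack of 64 in the comparison below is
		 * the "substantially larger" margin described above, so a
		 * small transient difference does not by itself mark the
		 * array busy.
		 */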
6332 		if (init || curr_events - rdev->last_events > 64) {
6333 			rdev->last_events = curr_events;
6334 			idle = 0;
6335 		}
6336 	}
6337 	rcu_read_unlock();
6338 	return idle;
6339 }
6340 
6341 void md_done_sync(mddev_t *mddev, int blocks, int ok)
6342 {
6343 	/* another "blocks" (512-byte) blocks have been synced */
6344 	atomic_sub(blocks, &mddev->recovery_active);
6345 	wake_up(&mddev->recovery_wait);
6346 	if (!ok) {
6347 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6348 		md_wakeup_thread(mddev->thread);
6349 		/* stop recovery, signal do_sync ... */
6350 	}
6351 }
6352 
6353 
6354 /* md_write_start(mddev, bi)
6355  * If we need to update some array metadata (e.g. 'active' flag
6356  * in superblock) before writing, schedule a superblock update
6357  * and wait for it to complete.
6358  */
6359 void md_write_start(mddev_t *mddev, struct bio *bi)
6360 {
6361 	int did_change = 0;
6362 	if (bio_data_dir(bi) != WRITE)
6363 		return;
6364 
6365 	BUG_ON(mddev->ro == 1);
6366 	if (mddev->ro == 2) {
6367 		/* need to switch to read/write */
6368 		mddev->ro = 0;
6369 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6370 		md_wakeup_thread(mddev->thread);
6371 		md_wakeup_thread(mddev->sync_thread);
6372 		did_change = 1;
6373 	}
6374 	atomic_inc(&mddev->writes_pending);
6375 	if (mddev->safemode == 1)
6376 		mddev->safemode = 0;
6377 	if (mddev->in_sync) {
6378 		spin_lock_irq(&mddev->write_lock);
6379 		if (mddev->in_sync) {
6380 			mddev->in_sync = 0;
6381 			set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6382 			md_wakeup_thread(mddev->thread);
6383 			did_change = 1;
6384 		}
6385 		spin_unlock_irq(&mddev->write_lock);
6386 	}
6387 	if (did_change)
6388 		sysfs_notify_dirent(mddev->sysfs_state);
6389 	wait_event(mddev->sb_wait,
6390 		   !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
6391 		   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
6392 }
6393 
6394 void md_write_end(mddev_t *mddev)
6395 {
6396 	if (atomic_dec_and_test(&mddev->writes_pending)) {
6397 		if (mddev->safemode == 2)
6398 			md_wakeup_thread(mddev->thread);
6399 		else if (mddev->safemode_delay)
6400 			mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
6401 	}
6402 }
6403 
6404 /* md_allow_write(mddev)
6405  * Calling this ensures that the array is marked 'active' so that writes
6406  * may proceed without blocking.  It is important to call this before
6407  * attempting a GFP_KERNEL allocation while holding the mddev lock.
6408  * Must be called with mddev_lock held.
6409  *
6410  * In the ->external case MD_CHANGE_CLEAN cannot be cleared until mddev->lock
6411  * is dropped, so return -EAGAIN after notifying userspace.
6412  */
6413 int md_allow_write(mddev_t *mddev)
6414 {
6415 	if (!mddev->pers)
6416 		return 0;
6417 	if (mddev->ro)
6418 		return 0;
6419 	if (!mddev->pers->sync_request)
6420 		return 0;
6421 
6422 	spin_lock_irq(&mddev->write_lock);
6423 	if (mddev->in_sync) {
6424 		mddev->in_sync = 0;
6425 		set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6426 		if (mddev->safemode_delay &&
6427 		    mddev->safemode == 0)
6428 			mddev->safemode = 1;
6429 		spin_unlock_irq(&mddev->write_lock);
6430 		md_update_sb(mddev, 0);
6431 		sysfs_notify_dirent(mddev->sysfs_state);
6432 	} else
6433 		spin_unlock_irq(&mddev->write_lock);
6434 
6435 	if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
6436 		return -EAGAIN;
6437 	else
6438 		return 0;
6439 }
6440 EXPORT_SYMBOL_GPL(md_allow_write);
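
/*
 * Editor's sketch of the calling convention documented above (not part of
 * the original driver, kept in #if 0): a personality that must perform a
 * GFP_KERNEL allocation while holding the mddev lock calls md_allow_write()
 * first and propagates -EAGAIN to its caller for external-metadata arrays.
 * 'grow_private_state' is a made-up helper, for illustration only.
 */
#if 0
static int grow_private_state(mddev_t *mddev, size_t bytes)
{
	void *new;
	int err;

	err = md_allow_write(mddev);
	if (err)		/* typically -EAGAIN for ->external arrays */
		return err;

	new = kzalloc(bytes, GFP_KERNEL);	/* safe: array marked active */
	if (!new)
		return -ENOMEM;
	kfree(new);		/* placeholder - real code would install it */
	return 0;
}
#endif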
6441 
6442 #define SYNC_MARKS	10
6443 #define	SYNC_MARK_STEP	(3*HZ)
6444 void md_do_sync(mddev_t *mddev)
6445 {
6446 	mddev_t *mddev2;
6447 	unsigned int currspeed = 0,
6448 		 window;
6449 	sector_t max_sectors,j, io_sectors;
6450 	unsigned long mark[SYNC_MARKS];
6451 	sector_t mark_cnt[SYNC_MARKS];
6452 	int last_mark,m;
6453 	struct list_head *tmp;
6454 	sector_t last_check;
6455 	int skipped = 0;
6456 	mdk_rdev_t *rdev;
6457 	char *desc;
6458 
6459 	/* just in case the thread restarts... */
6460 	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
6461 		return;
6462 	if (mddev->ro) /* never try to sync a read-only array */
6463 		return;
6464 
6465 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6466 		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
6467 			desc = "data-check";
6468 		else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
6469 			desc = "requested-resync";
6470 		else
6471 			desc = "resync";
6472 	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6473 		desc = "reshape";
6474 	else
6475 		desc = "recovery";
6476 
6477 	/* we overload curr_resync somewhat here.
6478 	 * 0 == not engaged in resync at all
6479 	 * 2 == checking that there is no conflict with another sync
6480 	 * 1 == like 2, but have yielded to allow conflicting resync to
6481 	 *		commense
6482 	 *		commence
6483 	 *
6484 	 * Before starting a resync we must have set curr_resync to
6485 	 * 2, and then checked that every "conflicting" array has curr_resync
6486 	 * less than ours.  When we find one that is the same or higher
6487 	 * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
6488 	 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
6489 	 * This will mean we have to start checking from the beginning again.
6490 	 *
6491 	 */
6492 
6493 	do {
6494 		mddev->curr_resync = 2;
6495 
6496 	try_again:
6497 		if (kthread_should_stop()) {
6498 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6499 			goto skip;
6500 		}
6501 		for_each_mddev(mddev2, tmp) {
6502 			if (mddev2 == mddev)
6503 				continue;
6504 			if (!mddev->parallel_resync
6505 			&&  mddev2->curr_resync
6506 			&&  match_mddev_units(mddev, mddev2)) {
6507 				DEFINE_WAIT(wq);
6508 				if (mddev < mddev2 && mddev->curr_resync == 2) {
6509 					/* arbitrarily yield */
6510 					mddev->curr_resync = 1;
6511 					wake_up(&resync_wait);
6512 				}
6513 				if (mddev > mddev2 && mddev->curr_resync == 1)
6514 					/* no need to wait here, we can wait the next
6515 					 * time 'round when curr_resync == 2
6516 					 */
6517 					continue;
6518 				/* We need to wait 'interruptible' so as not to
6519 				 * contribute to the load average, and not to
6520 				 * be caught by 'softlockup'
6521 				 */
6522 				prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
6523 				if (!kthread_should_stop() &&
6524 				    mddev2->curr_resync >= mddev->curr_resync) {
6525 					printk(KERN_INFO "md: delaying %s of %s"
6526 					       " until %s has finished (they"
6527 					       " share one or more physical units)\n",
6528 					       desc, mdname(mddev), mdname(mddev2));
6529 					mddev_put(mddev2);
6530 					if (signal_pending(current))
6531 						flush_signals(current);
6532 					schedule();
6533 					finish_wait(&resync_wait, &wq);
6534 					goto try_again;
6535 				}
6536 				finish_wait(&resync_wait, &wq);
6537 			}
6538 		}
6539 	} while (mddev->curr_resync < 2);
6540 
6541 	j = 0;
6542 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6543 		/* resync follows the size requested by the personality,
6544 		 * which defaults to physical size, but can be virtual size
6545 		 */
6546 		max_sectors = mddev->resync_max_sectors;
6547 		mddev->resync_mismatches = 0;
6548 		/* we don't use the checkpoint if there's a bitmap */
6549 		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
6550 			j = mddev->resync_min;
6551 		else if (!mddev->bitmap)
6552 			j = mddev->recovery_cp;
6553 
6554 	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6555 		max_sectors = mddev->dev_sectors;
6556 	else {
6557 		/* recovery follows the physical size of devices */
6558 		max_sectors = mddev->dev_sectors;
6559 		j = MaxSector;
6560 		rcu_read_lock();
6561 		list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
6562 			if (rdev->raid_disk >= 0 &&
6563 			    !test_bit(Faulty, &rdev->flags) &&
6564 			    !test_bit(In_sync, &rdev->flags) &&
6565 			    rdev->recovery_offset < j)
6566 				j = rdev->recovery_offset;
6567 		rcu_read_unlock();
6568 	}
6569 
6570 	printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
6571 	printk(KERN_INFO "md: minimum _guaranteed_  speed:"
6572 		" %d KB/sec/disk.\n", speed_min(mddev));
6573 	printk(KERN_INFO "md: using maximum available idle IO bandwidth "
6574 	       "(but not more than %d KB/sec) for %s.\n",
6575 	       speed_max(mddev), desc);
6576 
6577 	is_mddev_idle(mddev, 1); /* this initializes IO event counters */
6578 
6579 	io_sectors = 0;
6580 	for (m = 0; m < SYNC_MARKS; m++) {
6581 		mark[m] = jiffies;
6582 		mark_cnt[m] = io_sectors;
6583 	}
6584 	last_mark = 0;
6585 	mddev->resync_mark = mark[last_mark];
6586 	mddev->resync_mark_cnt = mark_cnt[last_mark];
6587 
6588 	/*
6589 	 * Tune reconstruction:
6590 	 */
6591 	window = 32*(PAGE_SIZE/512);
6592 	printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n",
6593 		window/2,(unsigned long long) max_sectors/2);
6594 
6595 	atomic_set(&mddev->recovery_active, 0);
6596 	last_check = 0;
6597 
6598 	if (j>2) {
6599 		printk(KERN_INFO
6600 		       "md: resuming %s of %s from checkpoint.\n",
6601 		       desc, mdname(mddev));
6602 		mddev->curr_resync = j;
6603 	}
6604 	mddev->curr_resync_completed = mddev->curr_resync;
6605 
6606 	while (j < max_sectors) {
6607 		sector_t sectors;
6608 
6609 		skipped = 0;
6610 
6611 		if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
6612 		    ((mddev->curr_resync > mddev->curr_resync_completed &&
6613 		      (mddev->curr_resync - mddev->curr_resync_completed)
6614 		      > (max_sectors >> 4)) ||
6615 		     (j - mddev->curr_resync_completed)*2
6616 		     >= mddev->resync_max - mddev->curr_resync_completed
6617 			    )) {
6618 			/* time to update curr_resync_completed */
6619 			blk_unplug(mddev->queue);
6620 			wait_event(mddev->recovery_wait,
6621 				   atomic_read(&mddev->recovery_active) == 0);
6622 			mddev->curr_resync_completed =
6623 				mddev->curr_resync;
6624 			set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6625 			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6626 		}
6627 
6628 		while (j >= mddev->resync_max && !kthread_should_stop()) {
6629 			/* As this condition is controlled by user-space,
6630 			 * we can block indefinitely, so use '_interruptible'
6631 			 * to avoid triggering warnings.
6632 			 */
6633 			flush_signals(current); /* just in case */
6634 			wait_event_interruptible(mddev->recovery_wait,
6635 						 mddev->resync_max > j
6636 						 || kthread_should_stop());
6637 		}
6638 
6639 		if (kthread_should_stop())
6640 			goto interrupted;
6641 
6642 		sectors = mddev->pers->sync_request(mddev, j, &skipped,
6643 						  currspeed < speed_min(mddev));
6644 		if (sectors == 0) {
6645 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6646 			goto out;
6647 		}
6648 
6649 		if (!skipped) { /* actual IO requested */
6650 			io_sectors += sectors;
6651 			atomic_add(sectors, &mddev->recovery_active);
6652 		}
6653 
6654 		j += sectors;
6655 		if (j>1) mddev->curr_resync = j;
6656 		mddev->curr_mark_cnt = io_sectors;
6657 		if (last_check == 0)
6658 			/* this is the earliest that the rebuild will be
6659 			 * visible in /proc/mdstat
6660 			 */
6661 			md_new_event(mddev);
6662 
6663 		if (last_check + window > io_sectors || j == max_sectors)
6664 			continue;
6665 
6666 		last_check = io_sectors;
6667 
6668 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6669 			break;
6670 
6671 	repeat:
6672 		if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
6673 			/* step marks */
6674 			int next = (last_mark+1) % SYNC_MARKS;
6675 
6676 			mddev->resync_mark = mark[next];
6677 			mddev->resync_mark_cnt = mark_cnt[next];
6678 			mark[next] = jiffies;
6679 			mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
6680 			last_mark = next;
6681 		}
6682 
6683 
6684 		if (kthread_should_stop())
6685 			goto interrupted;
6686 
6687 
6688 		/*
6689 		 * This loop exits only if we are either slower than
6690 		 * the 'hard' speed limit, or the system was IO-idle for
6691 		 * a jiffy.
6692 		 * The system might be non-idle CPU-wise, but we only care
6693 		 * about not overloading the IO subsystem. (things like an
6694 		 * e2fsck being done on the RAID array should execute fast)
6695 		 */
6696 		blk_unplug(mddev->queue);
6697 		cond_resched();
6698 
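		/*
		 * Editor's descriptive comment: currspeed is KB/s since the
		 * current mark - sectors/2 (KB) over elapsed seconds, with
		 * the "+1" terms guarding against division by zero and a
		 * zero result right after a mark step.
		 */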
6699 		currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
6700 			/((jiffies-mddev->resync_mark)/HZ +1) +1;
6701 
6702 		if (currspeed > speed_min(mddev)) {
6703 			if ((currspeed > speed_max(mddev)) ||
6704 					!is_mddev_idle(mddev, 0)) {
6705 				msleep(500);
6706 				goto repeat;
6707 			}
6708 		}
6709 	}
6710 	printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc);
6711 	/*
6712 	 * this also signals 'finished resyncing' to md_stop
6713 	 */
6714  out:
6715 	blk_unplug(mddev->queue);
6716 
6717 	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
6718 
6719 	/* tell personality that we are finished */
6720 	mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
6721 
6722 	if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
6723 	    mddev->curr_resync > 2) {
6724 		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6725 			if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
6726 				if (mddev->curr_resync >= mddev->recovery_cp) {
6727 					printk(KERN_INFO
6728 					       "md: checkpointing %s of %s.\n",
6729 					       desc, mdname(mddev));
6730 					mddev->recovery_cp = mddev->curr_resync;
6731 				}
6732 			} else
6733 				mddev->recovery_cp = MaxSector;
6734 		} else {
6735 			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6736 				mddev->curr_resync = MaxSector;
6737 			rcu_read_lock();
6738 			list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
6739 				if (rdev->raid_disk >= 0 &&
6740 				    !test_bit(Faulty, &rdev->flags) &&
6741 				    !test_bit(In_sync, &rdev->flags) &&
6742 				    rdev->recovery_offset < mddev->curr_resync)
6743 					rdev->recovery_offset = mddev->curr_resync;
6744 			rcu_read_unlock();
6745 		}
6746 	}
6747 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
6748 
6749  skip:
6750 	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
6751 		/* We completed, so the min/max settings can be forgotten if used. */
6752 		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
6753 			mddev->resync_min = 0;
6754 		mddev->resync_max = MaxSector;
6755 	} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
6756 		mddev->resync_min = mddev->curr_resync_completed;
6757 	mddev->curr_resync = 0;
6758 	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6759 		mddev->curr_resync_completed = 0;
6760 	sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6761 	wake_up(&resync_wait);
6762 	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
6763 	md_wakeup_thread(mddev->thread);
6764 	return;
6765 
6766  interrupted:
6767 	/*
6768 	 * got a signal, exit.
6769 	 */
6770 	printk(KERN_INFO
6771 	       "md: md_do_sync() got signal ... exiting\n");
6772 	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6773 	goto out;
6774 
6775 }
6776 EXPORT_SYMBOL_GPL(md_do_sync);
6777 
6778 
6779 static int remove_and_add_spares(mddev_t *mddev)
6780 {
6781 	mdk_rdev_t *rdev;
6782 	int spares = 0;
6783 
6784 	mddev->curr_resync_completed = 0;
6785 
6786 	list_for_each_entry(rdev, &mddev->disks, same_set)
6787 		if (rdev->raid_disk >= 0 &&
6788 		    !test_bit(Blocked, &rdev->flags) &&
6789 		    (test_bit(Faulty, &rdev->flags) ||
6790 		     ! test_bit(In_sync, &rdev->flags)) &&
6791 		    atomic_read(&rdev->nr_pending)==0) {
6792 			if (mddev->pers->hot_remove_disk(
6793 				    mddev, rdev->raid_disk)==0) {
6794 				char nm[20];
6795 				sprintf(nm,"rd%d", rdev->raid_disk);
6796 				sysfs_remove_link(&mddev->kobj, nm);
6797 				rdev->raid_disk = -1;
6798 			}
6799 		}
6800 
6801 	if (mddev->degraded && ! mddev->ro && !mddev->recovery_disabled) {
6802 		list_for_each_entry(rdev, &mddev->disks, same_set) {
6803 			if (rdev->raid_disk >= 0 &&
6804 			    !test_bit(In_sync, &rdev->flags) &&
6805 			    !test_bit(Blocked, &rdev->flags))
6806 				spares++;
6807 			if (rdev->raid_disk < 0
6808 			    && !test_bit(Faulty, &rdev->flags)) {
6809 				rdev->recovery_offset = 0;
6810 				if (mddev->pers->
6811 				    hot_add_disk(mddev, rdev) == 0) {
6812 					char nm[20];
6813 					sprintf(nm, "rd%d", rdev->raid_disk);
6814 					if (sysfs_create_link(&mddev->kobj,
6815 							      &rdev->kobj, nm))
6816 						printk(KERN_WARNING
6817 						       "md: cannot register "
6818 						       "%s for %s\n",
6819 						       nm, mdname(mddev));
6820 					spares++;
6821 					md_new_event(mddev);
6822 					set_bit(MD_CHANGE_DEVS, &mddev->flags);
6823 				} else
6824 					break;
6825 			}
6826 		}
6827 	}
6828 	return spares;
6829 }
6830 /*
6831  * This routine is regularly called by all per-raid-array threads to
6832  * deal with generic issues like resync and super-block update.
6833  * Raid personalities that don't have a thread (linear/raid0) do not
6834  * need this as they never do any recovery or update the superblock.
6835  *
6836  * It does not do any resync itself, but rather "forks" off other threads
6837  * to do that as needed.
6838  * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
6839  * "->recovery" and create a thread at ->sync_thread.
6840  * When the thread finishes it sets MD_RECOVERY_DONE
6841  * and wakes up this thread, which will reap the thread and finish up.
6842  * This thread also removes any faulty devices (with nr_pending == 0).
6843  *
6844  * The overall approach is:
6845  *  1/ if the superblock needs updating, update it.
6846  *  2/ If a recovery thread is running, don't do anything else.
6847  *  3/ If recovery has finished, clean up, possibly marking spares active.
6848  *  4/ If there are any faulty devices, remove them.
6849  *  5/ If the array is degraded, try to add spare devices
6850  *  6/ If array has spares or is not in-sync, start a resync thread.
6851  */
6852 void md_check_recovery(mddev_t *mddev)
6853 {
6854 	mdk_rdev_t *rdev;
6855 
6856 
6857 	if (mddev->bitmap)
6858 		bitmap_daemon_work(mddev);
6859 
6860 	if (mddev->ro)
6861 		return;
6862 
6863 	if (signal_pending(current)) {
6864 		if (mddev->pers->sync_request && !mddev->external) {
6865 			printk(KERN_INFO "md: %s in immediate safe mode\n",
6866 			       mdname(mddev));
6867 			mddev->safemode = 2;
6868 		}
6869 		flush_signals(current);
6870 	}
6871 
6872 	if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
6873 		return;
6874 	if ( ! (
6875 		(mddev->flags && !mddev->external) ||
6876 		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
6877 		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
6878 		(mddev->external == 0 && mddev->safemode == 1) ||
6879 		(mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
6880 		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
6881 		))
6882 		return;
6883 
6884 	if (mddev_trylock(mddev)) {
6885 		int spares = 0;
6886 
6887 		if (mddev->ro) {
6888 			/* The only thing we do on a ro array is remove
6889 			 * failed devices.
6890 			 */
6891 			remove_and_add_spares(mddev);
6892 			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6893 			goto unlock;
6894 		}
6895 
6896 		if (!mddev->external) {
6897 			int did_change = 0;
6898 			spin_lock_irq(&mddev->write_lock);
6899 			if (mddev->safemode &&
6900 			    !atomic_read(&mddev->writes_pending) &&
6901 			    !mddev->in_sync &&
6902 			    mddev->recovery_cp == MaxSector) {
6903 				mddev->in_sync = 1;
6904 				did_change = 1;
6905 				if (mddev->persistent)
6906 					set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6907 			}
6908 			if (mddev->safemode == 1)
6909 				mddev->safemode = 0;
6910 			spin_unlock_irq(&mddev->write_lock);
6911 			if (did_change)
6912 				sysfs_notify_dirent(mddev->sysfs_state);
6913 		}
6914 
6915 		if (mddev->flags)
6916 			md_update_sb(mddev, 0);
6917 
6918 		list_for_each_entry(rdev, &mddev->disks, same_set)
6919 			if (test_and_clear_bit(StateChanged, &rdev->flags))
6920 				sysfs_notify_dirent(rdev->sysfs_state);
6921 
6922 
6923 		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
6924 		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
6925 			/* resync/recovery still happening */
6926 			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6927 			goto unlock;
6928 		}
6929 		if (mddev->sync_thread) {
6930 			/* resync has finished, collect result */
6931 			md_unregister_thread(mddev->sync_thread);
6932 			mddev->sync_thread = NULL;
6933 			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
6934 			    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
6935 				/* success...*/
6936 				/* activate any spares */
6937 				if (mddev->pers->spare_active(mddev))
6938 					sysfs_notify(&mddev->kobj, NULL,
6939 						     "degraded");
6940 			}
6941 			if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
6942 			    mddev->pers->finish_reshape)
6943 				mddev->pers->finish_reshape(mddev);
6944 			md_update_sb(mddev, 1);
6945 
6946 			/* if array is no-longer degraded, then any saved_raid_disk
6947 			 * information must be scrapped
6948 			 */
6949 			if (!mddev->degraded)
6950 				list_for_each_entry(rdev, &mddev->disks, same_set)
6951 					rdev->saved_raid_disk = -1;
6952 
6953 			mddev->recovery = 0;
6954 			/* flag recovery needed just to double check */
6955 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6956 			sysfs_notify_dirent(mddev->sysfs_action);
6957 			md_new_event(mddev);
6958 			goto unlock;
6959 		}
6960 		/* Set RUNNING before clearing NEEDED to avoid
6961 		 * any transients in the value of "sync_action".
6962 		 */
6963 		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
6964 		clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6965 		/* Clear some bits that don't mean anything, but
6966 		 * might be left set
6967 		 */
6968 		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
6969 		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
6970 
6971 		if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
6972 			goto unlock;
6973 		/* no recovery is running.
6974 		 * remove any failed drives, then
6975 		 * add spares if possible.
6976 		 * Spares are also removed and re-added, to allow
6977 		 * the personality to fail the re-add.
6978 		 */
6979 
6980 		if (mddev->reshape_position != MaxSector) {
6981 			if (mddev->pers->check_reshape == NULL ||
6982 			    mddev->pers->check_reshape(mddev) != 0)
6983 				/* Cannot proceed */
6984 				goto unlock;
6985 			set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
6986 			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6987 		} else if ((spares = remove_and_add_spares(mddev))) {
6988 			clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
6989 			clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
6990 			clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
6991 			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6992 		} else if (mddev->recovery_cp < MaxSector) {
6993 			set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
6994 			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6995 		} else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
6996 			/* nothing to be done ... */
6997 			goto unlock;
6998 
6999 		if (mddev->pers->sync_request) {
7000 			if (spares && mddev->bitmap && ! mddev->bitmap->file) {
7001 				/* We are adding a device or devices to an array
7002 				 * which has the bitmap stored on all devices.
7003 				 * So make sure all bitmap pages get written
7004 				 */
7005 				bitmap_write_all(mddev->bitmap);
7006 			}
7007 			mddev->sync_thread = md_register_thread(md_do_sync,
7008 								mddev,
7009 								"resync");
7010 			if (!mddev->sync_thread) {
7011 				printk(KERN_ERR "%s: could not start resync"
7012 					" thread...\n",
7013 					mdname(mddev));
7014 				/* leave the spares where they are, it shouldn't hurt */
7015 				mddev->recovery = 0;
7016 			} else
7017 				md_wakeup_thread(mddev->sync_thread);
7018 			sysfs_notify_dirent(mddev->sysfs_action);
7019 			md_new_event(mddev);
7020 		}
7021 	unlock:
7022 		if (!mddev->sync_thread) {
7023 			clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7024 			if (test_and_clear_bit(MD_RECOVERY_RECOVER,
7025 					       &mddev->recovery))
7026 				if (mddev->sysfs_action)
7027 					sysfs_notify_dirent(mddev->sysfs_action);
7028 		}
7029 		mddev_unlock(mddev);
7030 	}
7031 }
7032 
7033 void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
7034 {
7035 	sysfs_notify_dirent(rdev->sysfs_state);
7036 	wait_event_timeout(rdev->blocked_wait,
7037 			   !test_bit(Blocked, &rdev->flags),
7038 			   msecs_to_jiffies(5000));
7039 	rdev_dec_pending(rdev, mddev);
7040 }
7041 EXPORT_SYMBOL(md_wait_for_blocked_rdev);
7042 
7043 static int md_notify_reboot(struct notifier_block *this,
7044 			    unsigned long code, void *x)
7045 {
7046 	struct list_head *tmp;
7047 	mddev_t *mddev;
7048 
7049 	if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
7050 
7051 		printk(KERN_INFO "md: stopping all md devices.\n");
7052 
7053 		for_each_mddev(mddev, tmp)
7054 			if (mddev_trylock(mddev)) {
7055 				/* Force a switch to readonly even if the array
7056 				 * appears to still be in use.  Hence
7057 				 * the '100'.
7058 				 */
7059 				do_md_stop(mddev, 1, 100);
7060 				mddev_unlock(mddev);
7061 			}
7062 		/*
7063 		 * certain more exotic SCSI devices are known to be
7064 		 * volatile with respect to too-early system reboots. While the
7065 		 * right place to handle this issue is the given
7066 		 * driver, we do want to have a safe RAID driver ...
7067 		 */
7068 		mdelay(1000*1);
7069 	}
7070 	return NOTIFY_DONE;
7071 }
7072 
7073 static struct notifier_block md_notifier = {
7074 	.notifier_call	= md_notify_reboot,
7075 	.next		= NULL,
7076 	.priority	= INT_MAX, /* before any real devices */
7077 };
7078 
7079 static void md_geninit(void)
7080 {
7081 	dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
7082 
7083 	proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
7084 }
7085 
7086 static int __init md_init(void)
7087 {
7088 	if (register_blkdev(MD_MAJOR, "md"))
7089 		return -1;
7090 	if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
7091 		unregister_blkdev(MD_MAJOR, "md");
7092 		return -1;
7093 	}
7094 	blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE,
7095 			    md_probe, NULL, NULL);
7096 	blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
7097 			    md_probe, NULL, NULL);
7098 
7099 	register_reboot_notifier(&md_notifier);
7100 	raid_table_header = register_sysctl_table(raid_root_table);
7101 
7102 	md_geninit();
7103 	return 0;
7104 }
7105 
7106 
7107 #ifndef MODULE
7108 
7109 /*
7110  * Searches all registered partitions for autorun RAID arrays
7111  * at boot time.
7112  */
7113 
7114 static LIST_HEAD(all_detected_devices);
7115 struct detected_devices_node {
7116 	struct list_head list;
7117 	dev_t dev;
7118 };
7119 
7120 void md_autodetect_dev(dev_t dev)
7121 {
7122 	struct detected_devices_node *node_detected_dev;
7123 
7124 	node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
7125 	if (node_detected_dev) {
7126 		node_detected_dev->dev = dev;
7127 		list_add_tail(&node_detected_dev->list, &all_detected_devices);
7128 	} else {
7129 		printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed"
7130 			", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev));
7131 	}
7132 }
7133 
7134 
7135 static void autostart_arrays(int part)
7136 {
7137 	mdk_rdev_t *rdev;
7138 	struct detected_devices_node *node_detected_dev;
7139 	dev_t dev;
7140 	int i_scanned, i_passed;
7141 
7142 	i_scanned = 0;
7143 	i_passed = 0;
7144 
7145 	printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
7146 
7147 	while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
7148 		i_scanned++;
7149 		node_detected_dev = list_entry(all_detected_devices.next,
7150 					struct detected_devices_node, list);
7151 		list_del(&node_detected_dev->list);
7152 		dev = node_detected_dev->dev;
7153 		kfree(node_detected_dev);
7154 		rdev = md_import_device(dev,0, 90);
7155 		if (IS_ERR(rdev))
7156 			continue;
7157 
7158 		if (test_bit(Faulty, &rdev->flags)) {
7159 			MD_BUG();
7160 			continue;
7161 		}
7162 		set_bit(AutoDetected, &rdev->flags);
7163 		list_add(&rdev->same_set, &pending_raid_disks);
7164 		i_passed++;
7165 	}
7166 
7167 	printk(KERN_INFO "md: Scanned %d and added %d devices.\n",
7168 						i_scanned, i_passed);
7169 
7170 	autorun_devices(part);
7171 }
7172 
7173 #endif /* !MODULE */
7174 
7175 static __exit void md_exit(void)
7176 {
7177 	mddev_t *mddev;
7178 	struct list_head *tmp;
7179 
7180 	blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS);
7181 	blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
7182 
7183 	unregister_blkdev(MD_MAJOR,"md");
7184 	unregister_blkdev(mdp_major, "mdp");
7185 	unregister_reboot_notifier(&md_notifier);
7186 	unregister_sysctl_table(raid_table_header);
7187 	remove_proc_entry("mdstat", NULL);
7188 	for_each_mddev(mddev, tmp) {
7189 		export_array(mddev);
7190 		mddev->hold_active = 0;
7191 	}
7192 }
7193 
7194 subsys_initcall(md_init);
7195 module_exit(md_exit)
7196 
7197 static int get_ro(char *buffer, struct kernel_param *kp)
7198 {
7199 	return sprintf(buffer, "%d", start_readonly);
7200 }
7201 static int set_ro(const char *val, struct kernel_param *kp)
7202 {
7203 	char *e;
7204 	int num = simple_strtoul(val, &e, 10);
7205 	if (*val && (*e == '\0' || *e == '\n')) {
7206 		start_readonly = num;
7207 		return 0;
7208 	}
7209 	return -EINVAL;
7210 }
7211 
7212 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
7213 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
7214 
7215 module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
7216 
7217 EXPORT_SYMBOL(register_md_personality);
7218 EXPORT_SYMBOL(unregister_md_personality);
7219 EXPORT_SYMBOL(md_error);
7220 EXPORT_SYMBOL(md_done_sync);
7221 EXPORT_SYMBOL(md_write_start);
7222 EXPORT_SYMBOL(md_write_end);
7223 EXPORT_SYMBOL(md_register_thread);
7224 EXPORT_SYMBOL(md_unregister_thread);
7225 EXPORT_SYMBOL(md_wakeup_thread);
7226 EXPORT_SYMBOL(md_check_recovery);
7227 MODULE_LICENSE("GPL");
7228 MODULE_DESCRIPTION("MD RAID framework");
7229 MODULE_ALIAS("md");
7230 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
7231