xref: /linux/drivers/md/raid5.c (revision c145211d1f9e2ef19e7b4c2b943f68366daa97af)
1 /*
2  * raid5.c : Multiple Devices driver for Linux
3  *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4  *	   Copyright (C) 1999, 2000 Ingo Molnar
5  *	   Copyright (C) 2002, 2003 H. Peter Anvin
6  *
7  * RAID-4/5/6 management functions.
8  * Thanks to Penguin Computing for making the RAID-6 development possible
9  * by donating a test server!
10  *
11  * This program is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU General Public License as published by
13  * the Free Software Foundation; either version 2, or (at your option)
14  * any later version.
15  *
16  * You should have received a copy of the GNU General Public License
17  * (for example /usr/src/linux/COPYING); if not, write to the Free
18  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19  */
20 
21 /*
22  * BITMAP UNPLUGGING:
23  *
24  * The sequencing for updating the bitmap reliably is a little
25  * subtle (and I got it wrong the first time) so it deserves some
26  * explanation.
27  *
28  * We group bitmap updates into batches.  Each batch has a number.
29  * We may write out several batches at once, but that isn't very important.
30  * conf->bm_write is the number of the last batch successfully written.
31  * conf->bm_flush is the number of the last batch that was closed to
32  *    new additions.
33  * When we discover that we will need to write to any block in a stripe
34  * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
35  * the number of the batch it will be in. This is bm_flush+1.
36  * When we are ready to do a write, if that batch hasn't been written yet,
37  *   we plug the array and queue the stripe for later.
38  * When an unplug happens, we increment bm_flush, thus closing the current
39  *   batch.
40  * When we notice that bm_flush > bm_write, we write out all pending updates
41  * to the bitmap, and advance bm_write to where bm_flush was.
42  * This may occasionally write a bit out twice, but is sure never to
43  * miss any bits.
44  */
45 
46 #include <linux/blkdev.h>
47 #include <linux/kthread.h>
48 #include <linux/raid/pq.h>
49 #include <linux/async_tx.h>
50 #include <linux/async.h>
51 #include <linux/seq_file.h>
52 #include <linux/cpu.h>
53 #include <linux/slab.h>
54 #include "md.h"
55 #include "raid5.h"
56 #include "bitmap.h"
57 
/*
 * Stripe cache
 */

/* NOTE(review): presumably the default number of cached stripe_heads — confirm at the use site */
#define NR_STRIPES		256
/* size in bytes of one per-device stripe block: exactly one page */
#define STRIPE_SIZE		PAGE_SIZE
/* shift that turns a 512-byte sector number into a STRIPE_SIZE-unit number */
#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
/* STRIPE_SIZE expressed in 512-byte sectors */
#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
/* __release_stripe() wakes the md thread once preread_active_stripes drops below this */
#define	IO_THRESHOLD		1
/* NOTE(review): not used in this part of the file — meaning to be confirmed at the use site */
#define BYPASS_THRESHOLD	1
/* number of hash buckets: as many hlist_heads as fit in one page */
#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
#define HASH_MASK		(NR_HASH - 1)

/* hash bucket for the stripe that starts at sector 'sect' */
#define stripe_hash(conf, sect)	(&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))
72 
/* bio's attached to a stripe+device for I/O are linked together in bi_sector
 * order without overlap.  There may be several bio's per stripe+device, and
 * a bio could span several devices.
 * When walking this list for a particular stripe+device, we must never proceed
 * beyond a bio that extends past this device, as the next bio might no longer
 * be valid.
 * This macro is used to determine the 'next' bio in the list, given the sector
 * of the current stripe+device: it yields (bio)->bi_next only when 'bio' ends
 * before (sect) + STRIPE_SECTORS, and NULL otherwise.
 * Both arguments are fully parenthesized so that any expression may be passed;
 * note that 'bio' is evaluated more than once.
 */
#define r5_next_bio(bio, sect) \
	((((bio)->bi_sector + ((bio)->bi_size >> 9)) < ((sect) + STRIPE_SECTORS)) ? \
	 (bio)->bi_next : NULL)
/*
 * The following can be used to debug the driver
 */
#define RAID5_PARANOIA	1
#if RAID5_PARANOIA && defined(CONFIG_SMP)
/* asserts that device_lock is held; relies on a variable named 'conf'
 * being in scope at the point of use
 */
# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
#else
# define CHECK_DEVLOCK()
#endif

/* with DEBUG defined, 'inline' and '__inline__' are defined away to nothing */
#ifdef DEBUG
#define inline
#define __inline__
#endif

/* printk() that only fires when printk_ratelimit() returns non-zero */
#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args)))
99 
100 /*
101  * We maintain a biased count of active stripes in the bottom 16 bits of
102  * bi_phys_segments, and a count of processed stripes in the upper 16 bits
103  */
104 static inline int raid5_bi_phys_segments(struct bio *bio)
105 {
106 	return bio->bi_phys_segments & 0xffff;
107 }
108 
109 static inline int raid5_bi_hw_segments(struct bio *bio)
110 {
111 	return (bio->bi_phys_segments >> 16) & 0xffff;
112 }
113 
114 static inline int raid5_dec_bi_phys_segments(struct bio *bio)
115 {
116 	--bio->bi_phys_segments;
117 	return raid5_bi_phys_segments(bio);
118 }
119 
120 static inline int raid5_dec_bi_hw_segments(struct bio *bio)
121 {
122 	unsigned short val = raid5_bi_hw_segments(bio);
123 
124 	--val;
125 	bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio);
126 	return val;
127 }
128 
129 static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
130 {
131 	bio->bi_phys_segments = raid5_bi_phys_segments(bio) || (cnt << 16);
132 }
133 
134 /* Find first data disk in a raid6 stripe */
135 static inline int raid6_d0(struct stripe_head *sh)
136 {
137 	if (sh->ddf_layout)
138 		/* ddf always start from first device */
139 		return 0;
140 	/* md starts just after Q block */
141 	if (sh->qd_idx == sh->disks - 1)
142 		return 0;
143 	else
144 		return sh->qd_idx + 1;
145 }
/* Return the disk index following 'disk', wrapping to 0 once the index
 * reaches 'raid_disks'.
 */
static inline int raid6_next_disk(int disk, int raid_disks)
{
	int next = disk + 1;

	if (next < raid_disks)
		return next;
	return 0;
}
151 
152 /* When walking through the disks in a raid5, starting at raid6_d0,
153  * We need to map each disk to a 'slot', where the data disks are slot
154  * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
155  * is raid_disks-1.  This help does that mapping.
156  */
157 static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
158 			     int *count, int syndrome_disks)
159 {
160 	int slot = *count;
161 
162 	if (sh->ddf_layout)
163 		(*count)++;
164 	if (idx == sh->pd_idx)
165 		return syndrome_disks;
166 	if (idx == sh->qd_idx)
167 		return syndrome_disks + 1;
168 	if (!sh->ddf_layout)
169 		(*count)++;
170 	return slot;
171 }
172 
173 static void return_io(struct bio *return_bi)
174 {
175 	struct bio *bi = return_bi;
176 	while (bi) {
177 
178 		return_bi = bi->bi_next;
179 		bi->bi_next = NULL;
180 		bi->bi_size = 0;
181 		bio_endio(bi, 0);
182 		bi = return_bi;
183 	}
184 }
185 
186 static void print_raid5_conf (raid5_conf_t *conf);
187 
188 static int stripe_operations_active(struct stripe_head *sh)
189 {
190 	return sh->check_state || sh->reconstruct_state ||
191 	       test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
192 	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
193 }
194 
195 static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
196 {
197 	if (atomic_dec_and_test(&sh->count)) {
198 		BUG_ON(!list_empty(&sh->lru));
199 		BUG_ON(atomic_read(&conf->active_stripes)==0);
200 		if (test_bit(STRIPE_HANDLE, &sh->state)) {
201 			if (test_bit(STRIPE_DELAYED, &sh->state)) {
202 				list_add_tail(&sh->lru, &conf->delayed_list);
203 				blk_plug_device(conf->mddev->queue);
204 			} else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
205 				   sh->bm_seq - conf->seq_write > 0) {
206 				list_add_tail(&sh->lru, &conf->bitmap_list);
207 				blk_plug_device(conf->mddev->queue);
208 			} else {
209 				clear_bit(STRIPE_BIT_DELAY, &sh->state);
210 				list_add_tail(&sh->lru, &conf->handle_list);
211 			}
212 			md_wakeup_thread(conf->mddev->thread);
213 		} else {
214 			BUG_ON(stripe_operations_active(sh));
215 			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
216 				atomic_dec(&conf->preread_active_stripes);
217 				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
218 					md_wakeup_thread(conf->mddev->thread);
219 			}
220 			atomic_dec(&conf->active_stripes);
221 			if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
222 				list_add_tail(&sh->lru, &conf->inactive_list);
223 				wake_up(&conf->wait_for_stripe);
224 				if (conf->retry_read_aligned)
225 					md_wakeup_thread(conf->mddev->thread);
226 			}
227 		}
228 	}
229 }
230 
231 static void release_stripe(struct stripe_head *sh)
232 {
233 	raid5_conf_t *conf = sh->raid_conf;
234 	unsigned long flags;
235 
236 	spin_lock_irqsave(&conf->device_lock, flags);
237 	__release_stripe(conf, sh);
238 	spin_unlock_irqrestore(&conf->device_lock, flags);
239 }
240 
241 static inline void remove_hash(struct stripe_head *sh)
242 {
243 	pr_debug("remove_hash(), stripe %llu\n",
244 		(unsigned long long)sh->sector);
245 
246 	hlist_del_init(&sh->hash);
247 }
248 
249 static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
250 {
251 	struct hlist_head *hp = stripe_hash(conf, sh->sector);
252 
253 	pr_debug("insert_hash(), stripe %llu\n",
254 		(unsigned long long)sh->sector);
255 
256 	CHECK_DEVLOCK();
257 	hlist_add_head(&sh->hash, hp);
258 }
259 
260 
261 /* find an idle stripe, make sure it is unhashed, and return it. */
262 static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
263 {
264 	struct stripe_head *sh = NULL;
265 	struct list_head *first;
266 
267 	CHECK_DEVLOCK();
268 	if (list_empty(&conf->inactive_list))
269 		goto out;
270 	first = conf->inactive_list.next;
271 	sh = list_entry(first, struct stripe_head, lru);
272 	list_del_init(first);
273 	remove_hash(sh);
274 	atomic_inc(&conf->active_stripes);
275 out:
276 	return sh;
277 }
278 
279 static void shrink_buffers(struct stripe_head *sh, int num)
280 {
281 	struct page *p;
282 	int i;
283 
284 	for (i=0; i<num ; i++) {
285 		p = sh->dev[i].page;
286 		if (!p)
287 			continue;
288 		sh->dev[i].page = NULL;
289 		put_page(p);
290 	}
291 }
292 
293 static int grow_buffers(struct stripe_head *sh, int num)
294 {
295 	int i;
296 
297 	for (i=0; i<num; i++) {
298 		struct page *page;
299 
300 		if (!(page = alloc_page(GFP_KERNEL))) {
301 			return 1;
302 		}
303 		sh->dev[i].page = page;
304 	}
305 	return 0;
306 }
307 
308 static void raid5_build_block(struct stripe_head *sh, int i, int previous);
309 static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
310 			    struct stripe_head *sh);
311 
312 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
313 {
314 	raid5_conf_t *conf = sh->raid_conf;
315 	int i;
316 
317 	BUG_ON(atomic_read(&sh->count) != 0);
318 	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
319 	BUG_ON(stripe_operations_active(sh));
320 
321 	CHECK_DEVLOCK();
322 	pr_debug("init_stripe called, stripe %llu\n",
323 		(unsigned long long)sh->sector);
324 
325 	remove_hash(sh);
326 
327 	sh->generation = conf->generation - previous;
328 	sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
329 	sh->sector = sector;
330 	stripe_set_idx(sector, conf, previous, sh);
331 	sh->state = 0;
332 
333 
334 	for (i = sh->disks; i--; ) {
335 		struct r5dev *dev = &sh->dev[i];
336 
337 		if (dev->toread || dev->read || dev->towrite || dev->written ||
338 		    test_bit(R5_LOCKED, &dev->flags)) {
339 			printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
340 			       (unsigned long long)sh->sector, i, dev->toread,
341 			       dev->read, dev->towrite, dev->written,
342 			       test_bit(R5_LOCKED, &dev->flags));
343 			BUG();
344 		}
345 		dev->flags = 0;
346 		raid5_build_block(sh, i, previous);
347 	}
348 	insert_hash(conf, sh);
349 }
350 
351 static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector,
352 					 short generation)
353 {
354 	struct stripe_head *sh;
355 	struct hlist_node *hn;
356 
357 	CHECK_DEVLOCK();
358 	pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
359 	hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
360 		if (sh->sector == sector && sh->generation == generation)
361 			return sh;
362 	pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
363 	return NULL;
364 }
365 
static void unplug_slaves(mddev_t *mddev);
static void raid5_unplug_device(struct request_queue *q);

/* Find or set up the stripe_head for 'sector' and return it with its
 * reference count raised by one.
 *
 * @previous:  non-zero to look the stripe up in (and initialise it for)
 *             generation conf->generation - previous
 * @noblock:   non-zero to return NULL instead of waiting when the stripe is
 *             not cached and no free stripe could be taken
 * @noquiesce: non-zero to skip waiting for conf->quiesce to become 0
 *
 * Takes and releases conf->device_lock with spin_lock_irq()/spin_unlock_irq().
 */
static struct stripe_head *
get_active_stripe(raid5_conf_t *conf, sector_t sector,
		  int previous, int noblock, int noquiesce)
{
	struct stripe_head *sh;

	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);

	spin_lock_irq(&conf->device_lock);

	do {
		/* NOTE(review): wait_event_lock_irq presumably drops
		 * device_lock while it sleeps — confirm against its definition
		 */
		wait_event_lock_irq(conf->wait_for_stripe,
				    conf->quiesce == 0 || noquiesce,
				    conf->device_lock, /* nothing */);
		sh = __find_stripe(conf, sector, conf->generation - previous);
		if (!sh) {
			/* not cached: try to recycle an inactive stripe, unless
			 * another caller has marked the inactive list blocked
			 */
			if (!conf->inactive_blocked)
				sh = get_free_stripe(conf);
			if (noblock && sh == NULL)
				break;
			if (!sh) {
				/* wait until the inactive list is non-empty and
				 * either fewer than 3/4 of max_nr_stripes are
				 * active or inactive_blocked has been cleared;
				 * sh stays NULL, so the outer loop retries
				 */
				conf->inactive_blocked = 1;
				wait_event_lock_irq(conf->wait_for_stripe,
						    !list_empty(&conf->inactive_list) &&
						    (atomic_read(&conf->active_stripes)
						     < (conf->max_nr_stripes *3/4)
						     || !conf->inactive_blocked),
						    conf->device_lock,
						    raid5_unplug_device(conf->mddev->queue)
					);
				conf->inactive_blocked = 0;
			} else
				init_stripe(sh, sector, previous);
		} else {
			if (atomic_read(&sh->count)) {
				/* already referenced: it may only sit on a list
				 * while it is expanding
				 */
				BUG_ON(!list_empty(&sh->lru)
				    && !test_bit(STRIPE_EXPANDING, &sh->state));
			} else {
				/* cached but unreferenced: count it as active
				 * unless it is flagged STRIPE_HANDLE, then take
				 * it off whichever list it is on
				 */
				if (!test_bit(STRIPE_HANDLE, &sh->state))
					atomic_inc(&conf->active_stripes);
				if (list_empty(&sh->lru) &&
				    !test_bit(STRIPE_EXPANDING, &sh->state))
					BUG();
				list_del_init(&sh->lru);
			}
		}
	} while (sh == NULL);

	/* sh is NULL here only on the noblock early exit */
	if (sh)
		atomic_inc(&sh->count);

	spin_unlock_irq(&conf->device_lock);
	return sh;
}
423 
424 static void
425 raid5_end_read_request(struct bio *bi, int error);
426 static void
427 raid5_end_write_request(struct bio *bi, int error);
428 
429 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
430 {
431 	raid5_conf_t *conf = sh->raid_conf;
432 	int i, disks = sh->disks;
433 
434 	might_sleep();
435 
436 	for (i = disks; i--; ) {
437 		int rw;
438 		struct bio *bi;
439 		mdk_rdev_t *rdev;
440 		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
441 			rw = WRITE;
442 		else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
443 			rw = READ;
444 		else
445 			continue;
446 
447 		bi = &sh->dev[i].req;
448 
449 		bi->bi_rw = rw;
450 		if (rw == WRITE)
451 			bi->bi_end_io = raid5_end_write_request;
452 		else
453 			bi->bi_end_io = raid5_end_read_request;
454 
455 		rcu_read_lock();
456 		rdev = rcu_dereference(conf->disks[i].rdev);
457 		if (rdev && test_bit(Faulty, &rdev->flags))
458 			rdev = NULL;
459 		if (rdev)
460 			atomic_inc(&rdev->nr_pending);
461 		rcu_read_unlock();
462 
463 		if (rdev) {
464 			if (s->syncing || s->expanding || s->expanded)
465 				md_sync_acct(rdev->bdev, STRIPE_SECTORS);
466 
467 			set_bit(STRIPE_IO_STARTED, &sh->state);
468 
469 			bi->bi_bdev = rdev->bdev;
470 			pr_debug("%s: for %llu schedule op %ld on disc %d\n",
471 				__func__, (unsigned long long)sh->sector,
472 				bi->bi_rw, i);
473 			atomic_inc(&sh->count);
474 			bi->bi_sector = sh->sector + rdev->data_offset;
475 			bi->bi_flags = 1 << BIO_UPTODATE;
476 			bi->bi_vcnt = 1;
477 			bi->bi_max_vecs = 1;
478 			bi->bi_idx = 0;
479 			bi->bi_io_vec = &sh->dev[i].vec;
480 			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
481 			bi->bi_io_vec[0].bv_offset = 0;
482 			bi->bi_size = STRIPE_SIZE;
483 			bi->bi_next = NULL;
484 			if (rw == WRITE &&
485 			    test_bit(R5_ReWrite, &sh->dev[i].flags))
486 				atomic_add(STRIPE_SECTORS,
487 					&rdev->corrected_errors);
488 			generic_make_request(bi);
489 		} else {
490 			if (rw == WRITE)
491 				set_bit(STRIPE_DEGRADED, &sh->state);
492 			pr_debug("skip op %ld on disc %d for sector %llu\n",
493 				bi->bi_rw, i, (unsigned long long)sh->sector);
494 			clear_bit(R5_LOCKED, &sh->dev[i].flags);
495 			set_bit(STRIPE_HANDLE, &sh->state);
496 		}
497 	}
498 }
499 
/* Copy the part of 'bio' that overlaps the stripe block starting at
 * 'sector' between the bio's pages and 'page', using async_memcpy().
 *
 * @frombio: non-zero copies bio -> page, zero copies page -> bio
 * @tx:      descriptor the first copy depends on; each later copy is
 *           chained to the previous one
 *
 * Returns the descriptor of the last copy submitted ('tx' unchanged if
 * nothing was copied).
 */
static struct dma_async_tx_descriptor *
async_copy_data(int frombio, struct bio *bio, struct page *page,
	sector_t sector, struct dma_async_tx_descriptor *tx)
{
	struct bio_vec *bvl;
	struct page *bio_page;
	int i;
	int page_offset;
	struct async_submit_ctl submit;
	enum async_tx_flags flags = 0;

	/* byte offset of the bio's start relative to the start of the block;
	 * negative when the bio begins before 'sector'
	 */
	if (bio->bi_sector >= sector)
		page_offset = (signed)(bio->bi_sector - sector) * 512;
	else
		page_offset = (signed)(sector - bio->bi_sector) * -512;

	if (frombio)
		flags |= ASYNC_TX_FENCE;
	init_async_submit(&submit, flags, tx, NULL, NULL, NULL);

	bio_for_each_segment(bvl, bio, i) {
		int len = bio_iovec_idx(bio, i)->bv_len;
		int clen;
		int b_offset = 0;

		/* segment starts before the block: skip its leading bytes;
		 * page_offset becomes 0 and len may go negative if the whole
		 * segment lies before the block
		 */
		if (page_offset < 0) {
			b_offset = -page_offset;
			page_offset += b_offset;
			len -= b_offset;
		}

		/* clip the copy length at the end of the block */
		if (len > 0 && page_offset + len > STRIPE_SIZE)
			clen = STRIPE_SIZE - page_offset;
		else
			clen = len;

		if (clen > 0) {
			b_offset += bio_iovec_idx(bio, i)->bv_offset;
			bio_page = bio_iovec_idx(bio, i)->bv_page;
			if (frombio)
				tx = async_memcpy(page, bio_page, page_offset,
						  b_offset, clen, &submit);
			else
				tx = async_memcpy(bio_page, page, b_offset,
						  page_offset, clen, &submit);
		}
		/* chain the operations */
		submit.depend_tx = tx;

		if (clen < len) /* hit end of page */
			break;
		/* advances by len even when len is negative, which moves
		 * page_offset back below zero for the next segment
		 */
		page_offset +=  len;
	}

	return tx;
}
556 
557 static void ops_complete_biofill(void *stripe_head_ref)
558 {
559 	struct stripe_head *sh = stripe_head_ref;
560 	struct bio *return_bi = NULL;
561 	raid5_conf_t *conf = sh->raid_conf;
562 	int i;
563 
564 	pr_debug("%s: stripe %llu\n", __func__,
565 		(unsigned long long)sh->sector);
566 
567 	/* clear completed biofills */
568 	spin_lock_irq(&conf->device_lock);
569 	for (i = sh->disks; i--; ) {
570 		struct r5dev *dev = &sh->dev[i];
571 
572 		/* acknowledge completion of a biofill operation */
573 		/* and check if we need to reply to a read request,
574 		 * new R5_Wantfill requests are held off until
575 		 * !STRIPE_BIOFILL_RUN
576 		 */
577 		if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
578 			struct bio *rbi, *rbi2;
579 
580 			BUG_ON(!dev->read);
581 			rbi = dev->read;
582 			dev->read = NULL;
583 			while (rbi && rbi->bi_sector <
584 				dev->sector + STRIPE_SECTORS) {
585 				rbi2 = r5_next_bio(rbi, dev->sector);
586 				if (!raid5_dec_bi_phys_segments(rbi)) {
587 					rbi->bi_next = return_bi;
588 					return_bi = rbi;
589 				}
590 				rbi = rbi2;
591 			}
592 		}
593 	}
594 	spin_unlock_irq(&conf->device_lock);
595 	clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
596 
597 	return_io(return_bi);
598 
599 	set_bit(STRIPE_HANDLE, &sh->state);
600 	release_stripe(sh);
601 }
602 
603 static void ops_run_biofill(struct stripe_head *sh)
604 {
605 	struct dma_async_tx_descriptor *tx = NULL;
606 	raid5_conf_t *conf = sh->raid_conf;
607 	struct async_submit_ctl submit;
608 	int i;
609 
610 	pr_debug("%s: stripe %llu\n", __func__,
611 		(unsigned long long)sh->sector);
612 
613 	for (i = sh->disks; i--; ) {
614 		struct r5dev *dev = &sh->dev[i];
615 		if (test_bit(R5_Wantfill, &dev->flags)) {
616 			struct bio *rbi;
617 			spin_lock_irq(&conf->device_lock);
618 			dev->read = rbi = dev->toread;
619 			dev->toread = NULL;
620 			spin_unlock_irq(&conf->device_lock);
621 			while (rbi && rbi->bi_sector <
622 				dev->sector + STRIPE_SECTORS) {
623 				tx = async_copy_data(0, rbi, dev->page,
624 					dev->sector, tx);
625 				rbi = r5_next_bio(rbi, dev->sector);
626 			}
627 		}
628 	}
629 
630 	atomic_inc(&sh->count);
631 	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
632 	async_trigger_callback(&submit);
633 }
634 
635 static void mark_target_uptodate(struct stripe_head *sh, int target)
636 {
637 	struct r5dev *tgt;
638 
639 	if (target < 0)
640 		return;
641 
642 	tgt = &sh->dev[target];
643 	set_bit(R5_UPTODATE, &tgt->flags);
644 	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
645 	clear_bit(R5_Wantcompute, &tgt->flags);
646 }
647 
648 static void ops_complete_compute(void *stripe_head_ref)
649 {
650 	struct stripe_head *sh = stripe_head_ref;
651 
652 	pr_debug("%s: stripe %llu\n", __func__,
653 		(unsigned long long)sh->sector);
654 
655 	/* mark the computed target(s) as uptodate */
656 	mark_target_uptodate(sh, sh->ops.target);
657 	mark_target_uptodate(sh, sh->ops.target2);
658 
659 	clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
660 	if (sh->check_state == check_state_compute_run)
661 		sh->check_state = check_state_compute_result;
662 	set_bit(STRIPE_HANDLE, &sh->state);
663 	release_stripe(sh);
664 }
665 
666 /* return a pointer to the address conversion region of the scribble buffer */
667 static addr_conv_t *to_addr_conv(struct stripe_head *sh,
668 				 struct raid5_percpu *percpu)
669 {
670 	return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
671 }
672 
673 static struct dma_async_tx_descriptor *
674 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
675 {
676 	int disks = sh->disks;
677 	struct page **xor_srcs = percpu->scribble;
678 	int target = sh->ops.target;
679 	struct r5dev *tgt = &sh->dev[target];
680 	struct page *xor_dest = tgt->page;
681 	int count = 0;
682 	struct dma_async_tx_descriptor *tx;
683 	struct async_submit_ctl submit;
684 	int i;
685 
686 	pr_debug("%s: stripe %llu block: %d\n",
687 		__func__, (unsigned long long)sh->sector, target);
688 	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
689 
690 	for (i = disks; i--; )
691 		if (i != target)
692 			xor_srcs[count++] = sh->dev[i].page;
693 
694 	atomic_inc(&sh->count);
695 
696 	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
697 			  ops_complete_compute, sh, to_addr_conv(sh, percpu));
698 	if (unlikely(count == 1))
699 		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
700 	else
701 		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
702 
703 	return tx;
704 }
705 
706 /* set_syndrome_sources - populate source buffers for gen_syndrome
707  * @srcs - (struct page *) array of size sh->disks
708  * @sh - stripe_head to parse
709  *
710  * Populates srcs in proper layout order for the stripe and returns the
711  * 'count' of sources to be used in a call to async_gen_syndrome.  The P
712  * destination buffer is recorded in srcs[count] and the Q destination
713  * is recorded in srcs[count+1]].
714  */
715 static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
716 {
717 	int disks = sh->disks;
718 	int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
719 	int d0_idx = raid6_d0(sh);
720 	int count;
721 	int i;
722 
723 	for (i = 0; i < disks; i++)
724 		srcs[i] = NULL;
725 
726 	count = 0;
727 	i = d0_idx;
728 	do {
729 		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
730 
731 		srcs[slot] = sh->dev[i].page;
732 		i = raid6_next_disk(i, disks);
733 	} while (i != d0_idx);
734 
735 	return syndrome_disks;
736 }
737 
738 static struct dma_async_tx_descriptor *
739 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
740 {
741 	int disks = sh->disks;
742 	struct page **blocks = percpu->scribble;
743 	int target;
744 	int qd_idx = sh->qd_idx;
745 	struct dma_async_tx_descriptor *tx;
746 	struct async_submit_ctl submit;
747 	struct r5dev *tgt;
748 	struct page *dest;
749 	int i;
750 	int count;
751 
752 	if (sh->ops.target < 0)
753 		target = sh->ops.target2;
754 	else if (sh->ops.target2 < 0)
755 		target = sh->ops.target;
756 	else
757 		/* we should only have one valid target */
758 		BUG();
759 	BUG_ON(target < 0);
760 	pr_debug("%s: stripe %llu block: %d\n",
761 		__func__, (unsigned long long)sh->sector, target);
762 
763 	tgt = &sh->dev[target];
764 	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
765 	dest = tgt->page;
766 
767 	atomic_inc(&sh->count);
768 
769 	if (target == qd_idx) {
770 		count = set_syndrome_sources(blocks, sh);
771 		blocks[count] = NULL; /* regenerating p is not necessary */
772 		BUG_ON(blocks[count+1] != dest); /* q should already be set */
773 		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
774 				  ops_complete_compute, sh,
775 				  to_addr_conv(sh, percpu));
776 		tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
777 	} else {
778 		/* Compute any data- or p-drive using XOR */
779 		count = 0;
780 		for (i = disks; i-- ; ) {
781 			if (i == target || i == qd_idx)
782 				continue;
783 			blocks[count++] = sh->dev[i].page;
784 		}
785 
786 		init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
787 				  NULL, ops_complete_compute, sh,
788 				  to_addr_conv(sh, percpu));
789 		tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
790 	}
791 
792 	return tx;
793 }
794 
/* Compute the two missing blocks sh->ops.target and sh->ops.target2 of a
 * raid6 stripe.  Both targets must be valid and flagged R5_Wantcompute.
 * The recovery method depends on which slots are missing:
 *   P+Q: regenerate both with async_gen_syndrome()
 *   D+Q: rebuild D by XOR, then regenerate the syndrome
 *   D+P: async_raid6_datap_recov()
 *   D+D: async_raid6_2data_recov()
 * Takes a stripe reference; ops_complete_compute() is the completion
 * callback of the final operation.  Returns that operation's descriptor.
 */
static struct dma_async_tx_descriptor *
ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int i, count, disks = sh->disks;
	int syndrome_disks = sh->ddf_layout ? disks : disks-2;
	int d0_idx = raid6_d0(sh);
	int faila = -1, failb = -1;
	int target = sh->ops.target;
	int target2 = sh->ops.target2;
	struct r5dev *tgt = &sh->dev[target];
	struct r5dev *tgt2 = &sh->dev[target2];
	struct dma_async_tx_descriptor *tx;
	struct page **blocks = percpu->scribble;
	struct async_submit_ctl submit;

	pr_debug("%s: stripe %llu block1: %d block2: %d\n",
		 __func__, (unsigned long long)sh->sector, target, target2);
	BUG_ON(target < 0 || target2 < 0);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));

	/* we need to open-code set_syndrome_sources to handle the
	 * slot number conversion for 'faila' and 'failb'
	 */
	for (i = 0; i < disks ; i++)
		blocks[i] = NULL;
	count = 0;
	i = d0_idx;
	do {
		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);

		blocks[slot] = sh->dev[i].page;

		if (i == target)
			faila = slot;
		if (i == target2)
			failb = slot;
		i = raid6_next_disk(i, disks);
	} while (i != d0_idx);

	/* order the failed slots so that faila < failb */
	BUG_ON(faila == failb);
	if (failb < faila)
		swap(faila, failb);
	pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
		 __func__, (unsigned long long)sh->sector, faila, failb);

	atomic_inc(&sh->count);

	/* slot syndrome_disks is P and slot syndrome_disks+1 is Q */
	if (failb == syndrome_disks+1) {
		/* Q disk is one of the missing disks */
		if (faila == syndrome_disks) {
			/* Missing P+Q, just recompute */
			init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
					  ops_complete_compute, sh,
					  to_addr_conv(sh, percpu));
			return async_gen_syndrome(blocks, 0, syndrome_disks+2,
						  STRIPE_SIZE, &submit);
		} else {
			struct page *dest;
			int data_target;
			int qd_idx = sh->qd_idx;

			/* Missing D+Q: recompute D from P, then recompute Q */
			if (target == qd_idx)
				data_target = target2;
			else
				data_target = target;

			/* blocks[] is reused here as the XOR source list:
			 * every device except the data target and Q
			 */
			count = 0;
			for (i = disks; i-- ; ) {
				if (i == data_target || i == qd_idx)
					continue;
				blocks[count++] = sh->dev[i].page;
			}
			dest = sh->dev[data_target].page;
			/* no callback on the XOR step; completion is signalled
			 * by the syndrome step chained to it below
			 */
			init_async_submit(&submit,
					  ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
					  NULL, NULL, NULL,
					  to_addr_conv(sh, percpu));
			tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
				       &submit);

			/* rebuild blocks[] in syndrome layout and make the
			 * syndrome generation depend on the XOR (tx)
			 */
			count = set_syndrome_sources(blocks, sh);
			init_async_submit(&submit, ASYNC_TX_FENCE, tx,
					  ops_complete_compute, sh,
					  to_addr_conv(sh, percpu));
			return async_gen_syndrome(blocks, 0, count+2,
						  STRIPE_SIZE, &submit);
		}
	} else {
		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
				  ops_complete_compute, sh,
				  to_addr_conv(sh, percpu));
		if (failb == syndrome_disks) {
			/* We're missing D+P. */
			return async_raid6_datap_recov(syndrome_disks+2,
						       STRIPE_SIZE, faila,
						       blocks, &submit);
		} else {
			/* We're missing D+D. */
			return async_raid6_2data_recov(syndrome_disks+2,
						       STRIPE_SIZE, faila, failb,
						       blocks, &submit);
		}
	}
}
901 
902 
903 static void ops_complete_prexor(void *stripe_head_ref)
904 {
905 	struct stripe_head *sh = stripe_head_ref;
906 
907 	pr_debug("%s: stripe %llu\n", __func__,
908 		(unsigned long long)sh->sector);
909 }
910 
911 static struct dma_async_tx_descriptor *
912 ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
913 	       struct dma_async_tx_descriptor *tx)
914 {
915 	int disks = sh->disks;
916 	struct page **xor_srcs = percpu->scribble;
917 	int count = 0, pd_idx = sh->pd_idx, i;
918 	struct async_submit_ctl submit;
919 
920 	/* existing parity data subtracted */
921 	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
922 
923 	pr_debug("%s: stripe %llu\n", __func__,
924 		(unsigned long long)sh->sector);
925 
926 	for (i = disks; i--; ) {
927 		struct r5dev *dev = &sh->dev[i];
928 		/* Only process blocks that are known to be uptodate */
929 		if (test_bit(R5_Wantdrain, &dev->flags))
930 			xor_srcs[count++] = dev->page;
931 	}
932 
933 	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
934 			  ops_complete_prexor, sh, to_addr_conv(sh, percpu));
935 	tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
936 
937 	return tx;
938 }
939 
940 static struct dma_async_tx_descriptor *
941 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
942 {
943 	int disks = sh->disks;
944 	int i;
945 
946 	pr_debug("%s: stripe %llu\n", __func__,
947 		(unsigned long long)sh->sector);
948 
949 	for (i = disks; i--; ) {
950 		struct r5dev *dev = &sh->dev[i];
951 		struct bio *chosen;
952 
953 		if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
954 			struct bio *wbi;
955 
956 			spin_lock(&sh->lock);
957 			chosen = dev->towrite;
958 			dev->towrite = NULL;
959 			BUG_ON(dev->written);
960 			wbi = dev->written = chosen;
961 			spin_unlock(&sh->lock);
962 
963 			while (wbi && wbi->bi_sector <
964 				dev->sector + STRIPE_SECTORS) {
965 				tx = async_copy_data(1, wbi, dev->page,
966 					dev->sector, tx);
967 				wbi = r5_next_bio(wbi, dev->sector);
968 			}
969 		}
970 	}
971 
972 	return tx;
973 }
974 
975 static void ops_complete_reconstruct(void *stripe_head_ref)
976 {
977 	struct stripe_head *sh = stripe_head_ref;
978 	int disks = sh->disks;
979 	int pd_idx = sh->pd_idx;
980 	int qd_idx = sh->qd_idx;
981 	int i;
982 
983 	pr_debug("%s: stripe %llu\n", __func__,
984 		(unsigned long long)sh->sector);
985 
986 	for (i = disks; i--; ) {
987 		struct r5dev *dev = &sh->dev[i];
988 
989 		if (dev->written || i == pd_idx || i == qd_idx)
990 			set_bit(R5_UPTODATE, &dev->flags);
991 	}
992 
993 	if (sh->reconstruct_state == reconstruct_state_drain_run)
994 		sh->reconstruct_state = reconstruct_state_drain_result;
995 	else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
996 		sh->reconstruct_state = reconstruct_state_prexor_drain_result;
997 	else {
998 		BUG_ON(sh->reconstruct_state != reconstruct_state_run);
999 		sh->reconstruct_state = reconstruct_state_result;
1000 	}
1001 
1002 	set_bit(STRIPE_HANDLE, &sh->state);
1003 	release_stripe(sh);
1004 }
1005 
1006 static void
1007 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
1008 		     struct dma_async_tx_descriptor *tx)
1009 {
1010 	int disks = sh->disks;
1011 	struct page **xor_srcs = percpu->scribble;
1012 	struct async_submit_ctl submit;
1013 	int count = 0, pd_idx = sh->pd_idx, i;
1014 	struct page *xor_dest;
1015 	int prexor = 0;
1016 	unsigned long flags;
1017 
1018 	pr_debug("%s: stripe %llu\n", __func__,
1019 		(unsigned long long)sh->sector);
1020 
1021 	/* check if prexor is active which means only process blocks
1022 	 * that are part of a read-modify-write (written)
1023 	 */
1024 	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
1025 		prexor = 1;
1026 		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1027 		for (i = disks; i--; ) {
1028 			struct r5dev *dev = &sh->dev[i];
1029 			if (dev->written)
1030 				xor_srcs[count++] = dev->page;
1031 		}
1032 	} else {
1033 		xor_dest = sh->dev[pd_idx].page;
1034 		for (i = disks; i--; ) {
1035 			struct r5dev *dev = &sh->dev[i];
1036 			if (i != pd_idx)
1037 				xor_srcs[count++] = dev->page;
1038 		}
1039 	}
1040 
1041 	/* 1/ if we prexor'd then the dest is reused as a source
1042 	 * 2/ if we did not prexor then we are redoing the parity
1043 	 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
1044 	 * for the synchronous xor case
1045 	 */
1046 	flags = ASYNC_TX_ACK |
1047 		(prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
1048 
1049 	atomic_inc(&sh->count);
1050 
1051 	init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
1052 			  to_addr_conv(sh, percpu));
1053 	if (unlikely(count == 1))
1054 		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1055 	else
1056 		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1057 }
1058 
1059 static void
1060 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
1061 		     struct dma_async_tx_descriptor *tx)
1062 {
1063 	struct async_submit_ctl submit;
1064 	struct page **blocks = percpu->scribble;
1065 	int count;
1066 
1067 	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
1068 
1069 	count = set_syndrome_sources(blocks, sh);
1070 
1071 	atomic_inc(&sh->count);
1072 
1073 	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
1074 			  sh, to_addr_conv(sh, percpu));
1075 	async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);
1076 }
1077 
1078 static void ops_complete_check(void *stripe_head_ref)
1079 {
1080 	struct stripe_head *sh = stripe_head_ref;
1081 
1082 	pr_debug("%s: stripe %llu\n", __func__,
1083 		(unsigned long long)sh->sector);
1084 
1085 	sh->check_state = check_state_check_result;
1086 	set_bit(STRIPE_HANDLE, &sh->state);
1087 	release_stripe(sh);
1088 }
1089 
1090 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
1091 {
1092 	int disks = sh->disks;
1093 	int pd_idx = sh->pd_idx;
1094 	int qd_idx = sh->qd_idx;
1095 	struct page *xor_dest;
1096 	struct page **xor_srcs = percpu->scribble;
1097 	struct dma_async_tx_descriptor *tx;
1098 	struct async_submit_ctl submit;
1099 	int count;
1100 	int i;
1101 
1102 	pr_debug("%s: stripe %llu\n", __func__,
1103 		(unsigned long long)sh->sector);
1104 
1105 	count = 0;
1106 	xor_dest = sh->dev[pd_idx].page;
1107 	xor_srcs[count++] = xor_dest;
1108 	for (i = disks; i--; ) {
1109 		if (i == pd_idx || i == qd_idx)
1110 			continue;
1111 		xor_srcs[count++] = sh->dev[i].page;
1112 	}
1113 
1114 	init_async_submit(&submit, 0, NULL, NULL, NULL,
1115 			  to_addr_conv(sh, percpu));
1116 	tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
1117 			   &sh->ops.zero_sum_result, &submit);
1118 
1119 	atomic_inc(&sh->count);
1120 	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
1121 	tx = async_trigger_callback(&submit);
1122 }
1123 
1124 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
1125 {
1126 	struct page **srcs = percpu->scribble;
1127 	struct async_submit_ctl submit;
1128 	int count;
1129 
1130 	pr_debug("%s: stripe %llu checkp: %d\n", __func__,
1131 		(unsigned long long)sh->sector, checkp);
1132 
1133 	count = set_syndrome_sources(srcs, sh);
1134 	if (!checkp)
1135 		srcs[count] = NULL;
1136 
1137 	atomic_inc(&sh->count);
1138 	init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
1139 			  sh, to_addr_conv(sh, percpu));
1140 	async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
1141 			   &sh->ops.zero_sum_result, percpu->spare_page, &submit);
1142 }
1143 
1144 static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1145 {
1146 	int overlap_clear = 0, i, disks = sh->disks;
1147 	struct dma_async_tx_descriptor *tx = NULL;
1148 	raid5_conf_t *conf = sh->raid_conf;
1149 	int level = conf->level;
1150 	struct raid5_percpu *percpu;
1151 	unsigned long cpu;
1152 
1153 	cpu = get_cpu();
1154 	percpu = per_cpu_ptr(conf->percpu, cpu);
1155 	if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
1156 		ops_run_biofill(sh);
1157 		overlap_clear++;
1158 	}
1159 
1160 	if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
1161 		if (level < 6)
1162 			tx = ops_run_compute5(sh, percpu);
1163 		else {
1164 			if (sh->ops.target2 < 0 || sh->ops.target < 0)
1165 				tx = ops_run_compute6_1(sh, percpu);
1166 			else
1167 				tx = ops_run_compute6_2(sh, percpu);
1168 		}
1169 		/* terminate the chain if reconstruct is not set to be run */
1170 		if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
1171 			async_tx_ack(tx);
1172 	}
1173 
1174 	if (test_bit(STRIPE_OP_PREXOR, &ops_request))
1175 		tx = ops_run_prexor(sh, percpu, tx);
1176 
1177 	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
1178 		tx = ops_run_biodrain(sh, tx);
1179 		overlap_clear++;
1180 	}
1181 
1182 	if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
1183 		if (level < 6)
1184 			ops_run_reconstruct5(sh, percpu, tx);
1185 		else
1186 			ops_run_reconstruct6(sh, percpu, tx);
1187 	}
1188 
1189 	if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
1190 		if (sh->check_state == check_state_run)
1191 			ops_run_check_p(sh, percpu);
1192 		else if (sh->check_state == check_state_run_q)
1193 			ops_run_check_pq(sh, percpu, 0);
1194 		else if (sh->check_state == check_state_run_pq)
1195 			ops_run_check_pq(sh, percpu, 1);
1196 		else
1197 			BUG();
1198 	}
1199 
1200 	if (overlap_clear)
1201 		for (i = disks; i--; ) {
1202 			struct r5dev *dev = &sh->dev[i];
1203 			if (test_and_clear_bit(R5_Overlap, &dev->flags))
1204 				wake_up(&sh->raid_conf->wait_for_overlap);
1205 		}
1206 	put_cpu();
1207 }
1208 
1209 #ifdef CONFIG_MULTICORE_RAID456
1210 static void async_run_ops(void *param, async_cookie_t cookie)
1211 {
1212 	struct stripe_head *sh = param;
1213 	unsigned long ops_request = sh->ops.request;
1214 
1215 	clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state);
1216 	wake_up(&sh->ops.wait_for_ops);
1217 
1218 	__raid_run_ops(sh, ops_request);
1219 	release_stripe(sh);
1220 }
1221 
1222 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1223 {
1224 	/* since handle_stripe can be called outside of raid5d context
1225 	 * we need to ensure sh->ops.request is de-staged before another
1226 	 * request arrives
1227 	 */
1228 	wait_event(sh->ops.wait_for_ops,
1229 		   !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state));
1230 	sh->ops.request = ops_request;
1231 
1232 	atomic_inc(&sh->count);
1233 	async_schedule(async_run_ops, sh);
1234 }
1235 #else
1236 #define raid_run_ops __raid_run_ops
1237 #endif
1238 
1239 static int grow_one_stripe(raid5_conf_t *conf)
1240 {
1241 	struct stripe_head *sh;
1242 	int disks = max(conf->raid_disks, conf->previous_raid_disks);
1243 	sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
1244 	if (!sh)
1245 		return 0;
1246 	memset(sh, 0, sizeof(*sh) + (disks-1)*sizeof(struct r5dev));
1247 	sh->raid_conf = conf;
1248 	spin_lock_init(&sh->lock);
1249 	#ifdef CONFIG_MULTICORE_RAID456
1250 	init_waitqueue_head(&sh->ops.wait_for_ops);
1251 	#endif
1252 
1253 	if (grow_buffers(sh, disks)) {
1254 		shrink_buffers(sh, disks);
1255 		kmem_cache_free(conf->slab_cache, sh);
1256 		return 0;
1257 	}
1258 	/* we just created an active stripe so... */
1259 	atomic_set(&sh->count, 1);
1260 	atomic_inc(&conf->active_stripes);
1261 	INIT_LIST_HEAD(&sh->lru);
1262 	release_stripe(sh);
1263 	return 1;
1264 }
1265 
1266 static int grow_stripes(raid5_conf_t *conf, int num)
1267 {
1268 	struct kmem_cache *sc;
1269 	int devs = max(conf->raid_disks, conf->previous_raid_disks);
1270 
1271 	sprintf(conf->cache_name[0],
1272 		"raid%d-%s", conf->level, mdname(conf->mddev));
1273 	sprintf(conf->cache_name[1],
1274 		"raid%d-%s-alt", conf->level, mdname(conf->mddev));
1275 	conf->active_name = 0;
1276 	sc = kmem_cache_create(conf->cache_name[conf->active_name],
1277 			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
1278 			       0, 0, NULL);
1279 	if (!sc)
1280 		return 1;
1281 	conf->slab_cache = sc;
1282 	conf->pool_size = devs;
1283 	while (num--)
1284 		if (!grow_one_stripe(conf))
1285 			return 1;
1286 	return 0;
1287 }
1288 
1289 /**
1290  * scribble_len - return the required size of the scribble region
1291  * @num - total number of disks in the array
1292  *
1293  * The size must be enough to contain:
1294  * 1/ a struct page pointer for each device in the array +2
1295  * 2/ room to convert each entry in (1) to its corresponding dma
1296  *    (dma_map_page()) or page (page_address()) address.
1297  *
1298  * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
1299  * calculate over all devices (not just the data blocks), using zeros in place
1300  * of the P and Q blocks.
1301  */
1302 static size_t scribble_len(int num)
1303 {
1304 	size_t len;
1305 
1306 	len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
1307 
1308 	return len;
1309 }
1310 
1311 static int resize_stripes(raid5_conf_t *conf, int newsize)
1312 {
1313 	/* Make all the stripes able to hold 'newsize' devices.
1314 	 * New slots in each stripe get 'page' set to a new page.
1315 	 *
1316 	 * This happens in stages:
1317 	 * 1/ create a new kmem_cache and allocate the required number of
1318 	 *    stripe_heads.
1319 	 * 2/ gather all the old stripe_heads and tranfer the pages across
1320 	 *    to the new stripe_heads.  This will have the side effect of
1321 	 *    freezing the array as once all stripe_heads have been collected,
1322 	 *    no IO will be possible.  Old stripe heads are freed once their
1323 	 *    pages have been transferred over, and the old kmem_cache is
1324 	 *    freed when all stripes are done.
1325 	 * 3/ reallocate conf->disks to be suitable bigger.  If this fails,
1326 	 *    we simple return a failre status - no need to clean anything up.
1327 	 * 4/ allocate new pages for the new slots in the new stripe_heads.
1328 	 *    If this fails, we don't bother trying the shrink the
1329 	 *    stripe_heads down again, we just leave them as they are.
1330 	 *    As each stripe_head is processed the new one is released into
1331 	 *    active service.
1332 	 *
1333 	 * Once step2 is started, we cannot afford to wait for a write,
1334 	 * so we use GFP_NOIO allocations.
1335 	 */
1336 	struct stripe_head *osh, *nsh;
1337 	LIST_HEAD(newstripes);
1338 	struct disk_info *ndisks;
1339 	unsigned long cpu;
1340 	int err;
1341 	struct kmem_cache *sc;
1342 	int i;
1343 
1344 	if (newsize <= conf->pool_size)
1345 		return 0; /* never bother to shrink */
1346 
1347 	err = md_allow_write(conf->mddev);
1348 	if (err)
1349 		return err;
1350 
1351 	/* Step 1 */
1352 	sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
1353 			       sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
1354 			       0, 0, NULL);
1355 	if (!sc)
1356 		return -ENOMEM;
1357 
1358 	for (i = conf->max_nr_stripes; i; i--) {
1359 		nsh = kmem_cache_alloc(sc, GFP_KERNEL);
1360 		if (!nsh)
1361 			break;
1362 
1363 		memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev));
1364 
1365 		nsh->raid_conf = conf;
1366 		spin_lock_init(&nsh->lock);
1367 		#ifdef CONFIG_MULTICORE_RAID456
1368 		init_waitqueue_head(&nsh->ops.wait_for_ops);
1369 		#endif
1370 
1371 		list_add(&nsh->lru, &newstripes);
1372 	}
1373 	if (i) {
1374 		/* didn't get enough, give up */
1375 		while (!list_empty(&newstripes)) {
1376 			nsh = list_entry(newstripes.next, struct stripe_head, lru);
1377 			list_del(&nsh->lru);
1378 			kmem_cache_free(sc, nsh);
1379 		}
1380 		kmem_cache_destroy(sc);
1381 		return -ENOMEM;
1382 	}
1383 	/* Step 2 - Must use GFP_NOIO now.
1384 	 * OK, we have enough stripes, start collecting inactive
1385 	 * stripes and copying them over
1386 	 */
1387 	list_for_each_entry(nsh, &newstripes, lru) {
1388 		spin_lock_irq(&conf->device_lock);
1389 		wait_event_lock_irq(conf->wait_for_stripe,
1390 				    !list_empty(&conf->inactive_list),
1391 				    conf->device_lock,
1392 				    unplug_slaves(conf->mddev)
1393 			);
1394 		osh = get_free_stripe(conf);
1395 		spin_unlock_irq(&conf->device_lock);
1396 		atomic_set(&nsh->count, 1);
1397 		for(i=0; i<conf->pool_size; i++)
1398 			nsh->dev[i].page = osh->dev[i].page;
1399 		for( ; i<newsize; i++)
1400 			nsh->dev[i].page = NULL;
1401 		kmem_cache_free(conf->slab_cache, osh);
1402 	}
1403 	kmem_cache_destroy(conf->slab_cache);
1404 
1405 	/* Step 3.
1406 	 * At this point, we are holding all the stripes so the array
1407 	 * is completely stalled, so now is a good time to resize
1408 	 * conf->disks and the scribble region
1409 	 */
1410 	ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
1411 	if (ndisks) {
1412 		for (i=0; i<conf->raid_disks; i++)
1413 			ndisks[i] = conf->disks[i];
1414 		kfree(conf->disks);
1415 		conf->disks = ndisks;
1416 	} else
1417 		err = -ENOMEM;
1418 
1419 	get_online_cpus();
1420 	conf->scribble_len = scribble_len(newsize);
1421 	for_each_present_cpu(cpu) {
1422 		struct raid5_percpu *percpu;
1423 		void *scribble;
1424 
1425 		percpu = per_cpu_ptr(conf->percpu, cpu);
1426 		scribble = kmalloc(conf->scribble_len, GFP_NOIO);
1427 
1428 		if (scribble) {
1429 			kfree(percpu->scribble);
1430 			percpu->scribble = scribble;
1431 		} else {
1432 			err = -ENOMEM;
1433 			break;
1434 		}
1435 	}
1436 	put_online_cpus();
1437 
1438 	/* Step 4, return new stripes to service */
1439 	while(!list_empty(&newstripes)) {
1440 		nsh = list_entry(newstripes.next, struct stripe_head, lru);
1441 		list_del_init(&nsh->lru);
1442 
1443 		for (i=conf->raid_disks; i < newsize; i++)
1444 			if (nsh->dev[i].page == NULL) {
1445 				struct page *p = alloc_page(GFP_NOIO);
1446 				nsh->dev[i].page = p;
1447 				if (!p)
1448 					err = -ENOMEM;
1449 			}
1450 		release_stripe(nsh);
1451 	}
1452 	/* critical section pass, GFP_NOIO no longer needed */
1453 
1454 	conf->slab_cache = sc;
1455 	conf->active_name = 1-conf->active_name;
1456 	conf->pool_size = newsize;
1457 	return err;
1458 }
1459 
1460 static int drop_one_stripe(raid5_conf_t *conf)
1461 {
1462 	struct stripe_head *sh;
1463 
1464 	spin_lock_irq(&conf->device_lock);
1465 	sh = get_free_stripe(conf);
1466 	spin_unlock_irq(&conf->device_lock);
1467 	if (!sh)
1468 		return 0;
1469 	BUG_ON(atomic_read(&sh->count));
1470 	shrink_buffers(sh, conf->pool_size);
1471 	kmem_cache_free(conf->slab_cache, sh);
1472 	atomic_dec(&conf->active_stripes);
1473 	return 1;
1474 }
1475 
1476 static void shrink_stripes(raid5_conf_t *conf)
1477 {
1478 	while (drop_one_stripe(conf))
1479 		;
1480 
1481 	if (conf->slab_cache)
1482 		kmem_cache_destroy(conf->slab_cache);
1483 	conf->slab_cache = NULL;
1484 }
1485 
1486 static void raid5_end_read_request(struct bio * bi, int error)
1487 {
1488 	struct stripe_head *sh = bi->bi_private;
1489 	raid5_conf_t *conf = sh->raid_conf;
1490 	int disks = sh->disks, i;
1491 	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1492 	char b[BDEVNAME_SIZE];
1493 	mdk_rdev_t *rdev;
1494 
1495 
1496 	for (i=0 ; i<disks; i++)
1497 		if (bi == &sh->dev[i].req)
1498 			break;
1499 
1500 	pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n",
1501 		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
1502 		uptodate);
1503 	if (i == disks) {
1504 		BUG();
1505 		return;
1506 	}
1507 
1508 	if (uptodate) {
1509 		set_bit(R5_UPTODATE, &sh->dev[i].flags);
1510 		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1511 			rdev = conf->disks[i].rdev;
1512 			printk_rl(KERN_INFO "raid5:%s: read error corrected"
1513 				  " (%lu sectors at %llu on %s)\n",
1514 				  mdname(conf->mddev), STRIPE_SECTORS,
1515 				  (unsigned long long)(sh->sector
1516 						       + rdev->data_offset),
1517 				  bdevname(rdev->bdev, b));
1518 			clear_bit(R5_ReadError, &sh->dev[i].flags);
1519 			clear_bit(R5_ReWrite, &sh->dev[i].flags);
1520 		}
1521 		if (atomic_read(&conf->disks[i].rdev->read_errors))
1522 			atomic_set(&conf->disks[i].rdev->read_errors, 0);
1523 	} else {
1524 		const char *bdn = bdevname(conf->disks[i].rdev->bdev, b);
1525 		int retry = 0;
1526 		rdev = conf->disks[i].rdev;
1527 
1528 		clear_bit(R5_UPTODATE, &sh->dev[i].flags);
1529 		atomic_inc(&rdev->read_errors);
1530 		if (conf->mddev->degraded >= conf->max_degraded)
1531 			printk_rl(KERN_WARNING
1532 				  "raid5:%s: read error not correctable "
1533 				  "(sector %llu on %s).\n",
1534 				  mdname(conf->mddev),
1535 				  (unsigned long long)(sh->sector
1536 						       + rdev->data_offset),
1537 				  bdn);
1538 		else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
1539 			/* Oh, no!!! */
1540 			printk_rl(KERN_WARNING
1541 				  "raid5:%s: read error NOT corrected!! "
1542 				  "(sector %llu on %s).\n",
1543 				  mdname(conf->mddev),
1544 				  (unsigned long long)(sh->sector
1545 						       + rdev->data_offset),
1546 				  bdn);
1547 		else if (atomic_read(&rdev->read_errors)
1548 			 > conf->max_nr_stripes)
1549 			printk(KERN_WARNING
1550 			       "raid5:%s: Too many read errors, failing device %s.\n",
1551 			       mdname(conf->mddev), bdn);
1552 		else
1553 			retry = 1;
1554 		if (retry)
1555 			set_bit(R5_ReadError, &sh->dev[i].flags);
1556 		else {
1557 			clear_bit(R5_ReadError, &sh->dev[i].flags);
1558 			clear_bit(R5_ReWrite, &sh->dev[i].flags);
1559 			md_error(conf->mddev, rdev);
1560 		}
1561 	}
1562 	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
1563 	clear_bit(R5_LOCKED, &sh->dev[i].flags);
1564 	set_bit(STRIPE_HANDLE, &sh->state);
1565 	release_stripe(sh);
1566 }
1567 
1568 static void raid5_end_write_request(struct bio *bi, int error)
1569 {
1570 	struct stripe_head *sh = bi->bi_private;
1571 	raid5_conf_t *conf = sh->raid_conf;
1572 	int disks = sh->disks, i;
1573 	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1574 
1575 	for (i=0 ; i<disks; i++)
1576 		if (bi == &sh->dev[i].req)
1577 			break;
1578 
1579 	pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
1580 		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
1581 		uptodate);
1582 	if (i == disks) {
1583 		BUG();
1584 		return;
1585 	}
1586 
1587 	if (!uptodate)
1588 		md_error(conf->mddev, conf->disks[i].rdev);
1589 
1590 	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
1591 
1592 	clear_bit(R5_LOCKED, &sh->dev[i].flags);
1593 	set_bit(STRIPE_HANDLE, &sh->state);
1594 	release_stripe(sh);
1595 }
1596 
1597 
1598 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
1599 
1600 static void raid5_build_block(struct stripe_head *sh, int i, int previous)
1601 {
1602 	struct r5dev *dev = &sh->dev[i];
1603 
1604 	bio_init(&dev->req);
1605 	dev->req.bi_io_vec = &dev->vec;
1606 	dev->req.bi_vcnt++;
1607 	dev->req.bi_max_vecs++;
1608 	dev->vec.bv_page = dev->page;
1609 	dev->vec.bv_len = STRIPE_SIZE;
1610 	dev->vec.bv_offset = 0;
1611 
1612 	dev->req.bi_sector = sh->sector;
1613 	dev->req.bi_private = sh;
1614 
1615 	dev->flags = 0;
1616 	dev->sector = compute_blocknr(sh, i, previous);
1617 }
1618 
1619 static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1620 {
1621 	char b[BDEVNAME_SIZE];
1622 	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1623 	pr_debug("raid5: error called\n");
1624 
1625 	if (!test_bit(Faulty, &rdev->flags)) {
1626 		set_bit(MD_CHANGE_DEVS, &mddev->flags);
1627 		if (test_and_clear_bit(In_sync, &rdev->flags)) {
1628 			unsigned long flags;
1629 			spin_lock_irqsave(&conf->device_lock, flags);
1630 			mddev->degraded++;
1631 			spin_unlock_irqrestore(&conf->device_lock, flags);
1632 			/*
1633 			 * if recovery was running, make sure it aborts.
1634 			 */
1635 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1636 		}
1637 		set_bit(Faulty, &rdev->flags);
1638 		printk(KERN_ALERT
1639 		       "raid5: Disk failure on %s, disabling device.\n"
1640 		       "raid5: Operation continuing on %d devices.\n",
1641 		       bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
1642 	}
1643 }
1644 
1645 /*
1646  * Input: a 'big' sector number,
1647  * Output: index of the data and parity disk, and the sector # in them.
1648  */
1649 static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1650 				     int previous, int *dd_idx,
1651 				     struct stripe_head *sh)
1652 {
1653 	sector_t stripe, stripe2;
1654 	sector_t chunk_number;
1655 	unsigned int chunk_offset;
1656 	int pd_idx, qd_idx;
1657 	int ddf_layout = 0;
1658 	sector_t new_sector;
1659 	int algorithm = previous ? conf->prev_algo
1660 				 : conf->algorithm;
1661 	int sectors_per_chunk = previous ? conf->prev_chunk_sectors
1662 					 : conf->chunk_sectors;
1663 	int raid_disks = previous ? conf->previous_raid_disks
1664 				  : conf->raid_disks;
1665 	int data_disks = raid_disks - conf->max_degraded;
1666 
1667 	/* First compute the information on this sector */
1668 
1669 	/*
1670 	 * Compute the chunk number and the sector offset inside the chunk
1671 	 */
1672 	chunk_offset = sector_div(r_sector, sectors_per_chunk);
1673 	chunk_number = r_sector;
1674 
1675 	/*
1676 	 * Compute the stripe number
1677 	 */
1678 	stripe = chunk_number;
1679 	*dd_idx = sector_div(stripe, data_disks);
1680 	stripe2 = stripe;
1681 	/*
1682 	 * Select the parity disk based on the user selected algorithm.
1683 	 */
1684 	pd_idx = qd_idx = ~0;
1685 	switch(conf->level) {
1686 	case 4:
1687 		pd_idx = data_disks;
1688 		break;
1689 	case 5:
1690 		switch (algorithm) {
1691 		case ALGORITHM_LEFT_ASYMMETRIC:
1692 			pd_idx = data_disks - sector_div(stripe2, raid_disks);
1693 			if (*dd_idx >= pd_idx)
1694 				(*dd_idx)++;
1695 			break;
1696 		case ALGORITHM_RIGHT_ASYMMETRIC:
1697 			pd_idx = sector_div(stripe2, raid_disks);
1698 			if (*dd_idx >= pd_idx)
1699 				(*dd_idx)++;
1700 			break;
1701 		case ALGORITHM_LEFT_SYMMETRIC:
1702 			pd_idx = data_disks - sector_div(stripe2, raid_disks);
1703 			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
1704 			break;
1705 		case ALGORITHM_RIGHT_SYMMETRIC:
1706 			pd_idx = sector_div(stripe2, raid_disks);
1707 			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
1708 			break;
1709 		case ALGORITHM_PARITY_0:
1710 			pd_idx = 0;
1711 			(*dd_idx)++;
1712 			break;
1713 		case ALGORITHM_PARITY_N:
1714 			pd_idx = data_disks;
1715 			break;
1716 		default:
1717 			printk(KERN_ERR "raid5: unsupported algorithm %d\n",
1718 				algorithm);
1719 			BUG();
1720 		}
1721 		break;
1722 	case 6:
1723 
1724 		switch (algorithm) {
1725 		case ALGORITHM_LEFT_ASYMMETRIC:
1726 			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
1727 			qd_idx = pd_idx + 1;
1728 			if (pd_idx == raid_disks-1) {
1729 				(*dd_idx)++;	/* Q D D D P */
1730 				qd_idx = 0;
1731 			} else if (*dd_idx >= pd_idx)
1732 				(*dd_idx) += 2; /* D D P Q D */
1733 			break;
1734 		case ALGORITHM_RIGHT_ASYMMETRIC:
1735 			pd_idx = sector_div(stripe2, raid_disks);
1736 			qd_idx = pd_idx + 1;
1737 			if (pd_idx == raid_disks-1) {
1738 				(*dd_idx)++;	/* Q D D D P */
1739 				qd_idx = 0;
1740 			} else if (*dd_idx >= pd_idx)
1741 				(*dd_idx) += 2; /* D D P Q D */
1742 			break;
1743 		case ALGORITHM_LEFT_SYMMETRIC:
1744 			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
1745 			qd_idx = (pd_idx + 1) % raid_disks;
1746 			*dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
1747 			break;
1748 		case ALGORITHM_RIGHT_SYMMETRIC:
1749 			pd_idx = sector_div(stripe2, raid_disks);
1750 			qd_idx = (pd_idx + 1) % raid_disks;
1751 			*dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
1752 			break;
1753 
1754 		case ALGORITHM_PARITY_0:
1755 			pd_idx = 0;
1756 			qd_idx = 1;
1757 			(*dd_idx) += 2;
1758 			break;
1759 		case ALGORITHM_PARITY_N:
1760 			pd_idx = data_disks;
1761 			qd_idx = data_disks + 1;
1762 			break;
1763 
1764 		case ALGORITHM_ROTATING_ZERO_RESTART:
1765 			/* Exactly the same as RIGHT_ASYMMETRIC, but or
1766 			 * of blocks for computing Q is different.
1767 			 */
1768 			pd_idx = sector_div(stripe2, raid_disks);
1769 			qd_idx = pd_idx + 1;
1770 			if (pd_idx == raid_disks-1) {
1771 				(*dd_idx)++;	/* Q D D D P */
1772 				qd_idx = 0;
1773 			} else if (*dd_idx >= pd_idx)
1774 				(*dd_idx) += 2; /* D D P Q D */
1775 			ddf_layout = 1;
1776 			break;
1777 
1778 		case ALGORITHM_ROTATING_N_RESTART:
1779 			/* Same a left_asymmetric, by first stripe is
1780 			 * D D D P Q  rather than
1781 			 * Q D D D P
1782 			 */
1783 			stripe2 += 1;
1784 			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
1785 			qd_idx = pd_idx + 1;
1786 			if (pd_idx == raid_disks-1) {
1787 				(*dd_idx)++;	/* Q D D D P */
1788 				qd_idx = 0;
1789 			} else if (*dd_idx >= pd_idx)
1790 				(*dd_idx) += 2; /* D D P Q D */
1791 			ddf_layout = 1;
1792 			break;
1793 
1794 		case ALGORITHM_ROTATING_N_CONTINUE:
1795 			/* Same as left_symmetric but Q is before P */
1796 			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
1797 			qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
1798 			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
1799 			ddf_layout = 1;
1800 			break;
1801 
1802 		case ALGORITHM_LEFT_ASYMMETRIC_6:
1803 			/* RAID5 left_asymmetric, with Q on last device */
1804 			pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
1805 			if (*dd_idx >= pd_idx)
1806 				(*dd_idx)++;
1807 			qd_idx = raid_disks - 1;
1808 			break;
1809 
1810 		case ALGORITHM_RIGHT_ASYMMETRIC_6:
1811 			pd_idx = sector_div(stripe2, raid_disks-1);
1812 			if (*dd_idx >= pd_idx)
1813 				(*dd_idx)++;
1814 			qd_idx = raid_disks - 1;
1815 			break;
1816 
1817 		case ALGORITHM_LEFT_SYMMETRIC_6:
1818 			pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
1819 			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
1820 			qd_idx = raid_disks - 1;
1821 			break;
1822 
1823 		case ALGORITHM_RIGHT_SYMMETRIC_6:
1824 			pd_idx = sector_div(stripe2, raid_disks-1);
1825 			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
1826 			qd_idx = raid_disks - 1;
1827 			break;
1828 
1829 		case ALGORITHM_PARITY_0_6:
1830 			pd_idx = 0;
1831 			(*dd_idx)++;
1832 			qd_idx = raid_disks - 1;
1833 			break;
1834 
1835 
1836 		default:
1837 			printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
1838 			       algorithm);
1839 			BUG();
1840 		}
1841 		break;
1842 	}
1843 
1844 	if (sh) {
1845 		sh->pd_idx = pd_idx;
1846 		sh->qd_idx = qd_idx;
1847 		sh->ddf_layout = ddf_layout;
1848 	}
1849 	/*
1850 	 * Finally, compute the new sector number
1851 	 */
1852 	new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
1853 	return new_sector;
1854 }
1855 
1856 
1857 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
1858 {
1859 	raid5_conf_t *conf = sh->raid_conf;
1860 	int raid_disks = sh->disks;
1861 	int data_disks = raid_disks - conf->max_degraded;
1862 	sector_t new_sector = sh->sector, check;
1863 	int sectors_per_chunk = previous ? conf->prev_chunk_sectors
1864 					 : conf->chunk_sectors;
1865 	int algorithm = previous ? conf->prev_algo
1866 				 : conf->algorithm;
1867 	sector_t stripe;
1868 	int chunk_offset;
1869 	sector_t chunk_number;
1870 	int dummy1, dd_idx = i;
1871 	sector_t r_sector;
1872 	struct stripe_head sh2;
1873 
1874 
1875 	chunk_offset = sector_div(new_sector, sectors_per_chunk);
1876 	stripe = new_sector;
1877 
1878 	if (i == sh->pd_idx)
1879 		return 0;
1880 	switch(conf->level) {
1881 	case 4: break;
1882 	case 5:
1883 		switch (algorithm) {
1884 		case ALGORITHM_LEFT_ASYMMETRIC:
1885 		case ALGORITHM_RIGHT_ASYMMETRIC:
1886 			if (i > sh->pd_idx)
1887 				i--;
1888 			break;
1889 		case ALGORITHM_LEFT_SYMMETRIC:
1890 		case ALGORITHM_RIGHT_SYMMETRIC:
1891 			if (i < sh->pd_idx)
1892 				i += raid_disks;
1893 			i -= (sh->pd_idx + 1);
1894 			break;
1895 		case ALGORITHM_PARITY_0:
1896 			i -= 1;
1897 			break;
1898 		case ALGORITHM_PARITY_N:
1899 			break;
1900 		default:
1901 			printk(KERN_ERR "raid5: unsupported algorithm %d\n",
1902 			       algorithm);
1903 			BUG();
1904 		}
1905 		break;
1906 	case 6:
1907 		if (i == sh->qd_idx)
1908 			return 0; /* It is the Q disk */
1909 		switch (algorithm) {
1910 		case ALGORITHM_LEFT_ASYMMETRIC:
1911 		case ALGORITHM_RIGHT_ASYMMETRIC:
1912 		case ALGORITHM_ROTATING_ZERO_RESTART:
1913 		case ALGORITHM_ROTATING_N_RESTART:
1914 			if (sh->pd_idx == raid_disks-1)
1915 				i--;	/* Q D D D P */
1916 			else if (i > sh->pd_idx)
1917 				i -= 2; /* D D P Q D */
1918 			break;
1919 		case ALGORITHM_LEFT_SYMMETRIC:
1920 		case ALGORITHM_RIGHT_SYMMETRIC:
1921 			if (sh->pd_idx == raid_disks-1)
1922 				i--; /* Q D D D P */
1923 			else {
1924 				/* D D P Q D */
1925 				if (i < sh->pd_idx)
1926 					i += raid_disks;
1927 				i -= (sh->pd_idx + 2);
1928 			}
1929 			break;
1930 		case ALGORITHM_PARITY_0:
1931 			i -= 2;
1932 			break;
1933 		case ALGORITHM_PARITY_N:
1934 			break;
1935 		case ALGORITHM_ROTATING_N_CONTINUE:
1936 			/* Like left_symmetric, but P is before Q */
1937 			if (sh->pd_idx == 0)
1938 				i--;	/* P D D D Q */
1939 			else {
1940 				/* D D Q P D */
1941 				if (i < sh->pd_idx)
1942 					i += raid_disks;
1943 				i -= (sh->pd_idx + 1);
1944 			}
1945 			break;
1946 		case ALGORITHM_LEFT_ASYMMETRIC_6:
1947 		case ALGORITHM_RIGHT_ASYMMETRIC_6:
1948 			if (i > sh->pd_idx)
1949 				i--;
1950 			break;
1951 		case ALGORITHM_LEFT_SYMMETRIC_6:
1952 		case ALGORITHM_RIGHT_SYMMETRIC_6:
1953 			if (i < sh->pd_idx)
1954 				i += data_disks + 1;
1955 			i -= (sh->pd_idx + 1);
1956 			break;
1957 		case ALGORITHM_PARITY_0_6:
1958 			i -= 1;
1959 			break;
1960 		default:
1961 			printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
1962 			       algorithm);
1963 			BUG();
1964 		}
1965 		break;
1966 	}
1967 
1968 	chunk_number = stripe * data_disks + i;
1969 	r_sector = chunk_number * sectors_per_chunk + chunk_offset;
1970 
1971 	check = raid5_compute_sector(conf, r_sector,
1972 				     previous, &dummy1, &sh2);
1973 	if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
1974 		|| sh2.qd_idx != sh->qd_idx) {
1975 		printk(KERN_ERR "compute_blocknr: map not correct\n");
1976 		return 0;
1977 	}
1978 	return r_sector;
1979 }
1980 
1981 
1982 static void
1983 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
1984 			 int rcw, int expand)
1985 {
1986 	int i, pd_idx = sh->pd_idx, disks = sh->disks;
1987 	raid5_conf_t *conf = sh->raid_conf;
1988 	int level = conf->level;
1989 
1990 	if (rcw) {
1991 		/* if we are not expanding this is a proper write request, and
1992 		 * there will be bios with new data to be drained into the
1993 		 * stripe cache
1994 		 */
1995 		if (!expand) {
1996 			sh->reconstruct_state = reconstruct_state_drain_run;
1997 			set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
1998 		} else
1999 			sh->reconstruct_state = reconstruct_state_run;
2000 
2001 		set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
2002 
2003 		for (i = disks; i--; ) {
2004 			struct r5dev *dev = &sh->dev[i];
2005 
2006 			if (dev->towrite) {
2007 				set_bit(R5_LOCKED, &dev->flags);
2008 				set_bit(R5_Wantdrain, &dev->flags);
2009 				if (!expand)
2010 					clear_bit(R5_UPTODATE, &dev->flags);
2011 				s->locked++;
2012 			}
2013 		}
2014 		if (s->locked + conf->max_degraded == disks)
2015 			if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
2016 				atomic_inc(&conf->pending_full_writes);
2017 	} else {
2018 		BUG_ON(level == 6);
2019 		BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
2020 			test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
2021 
2022 		sh->reconstruct_state = reconstruct_state_prexor_drain_run;
2023 		set_bit(STRIPE_OP_PREXOR, &s->ops_request);
2024 		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
2025 		set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
2026 
2027 		for (i = disks; i--; ) {
2028 			struct r5dev *dev = &sh->dev[i];
2029 			if (i == pd_idx)
2030 				continue;
2031 
2032 			if (dev->towrite &&
2033 			    (test_bit(R5_UPTODATE, &dev->flags) ||
2034 			     test_bit(R5_Wantcompute, &dev->flags))) {
2035 				set_bit(R5_Wantdrain, &dev->flags);
2036 				set_bit(R5_LOCKED, &dev->flags);
2037 				clear_bit(R5_UPTODATE, &dev->flags);
2038 				s->locked++;
2039 			}
2040 		}
2041 	}
2042 
2043 	/* keep the parity disk(s) locked while asynchronous operations
2044 	 * are in flight
2045 	 */
2046 	set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
2047 	clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
2048 	s->locked++;
2049 
2050 	if (level == 6) {
2051 		int qd_idx = sh->qd_idx;
2052 		struct r5dev *dev = &sh->dev[qd_idx];
2053 
2054 		set_bit(R5_LOCKED, &dev->flags);
2055 		clear_bit(R5_UPTODATE, &dev->flags);
2056 		s->locked++;
2057 	}
2058 
2059 	pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
2060 		__func__, (unsigned long long)sh->sector,
2061 		s->locked, s->ops_request);
2062 }
2063 
2064 /*
2065  * Each stripe/dev can have one or more bion attached.
2066  * toread/towrite point to the first in a chain.
2067  * The bi_next chain must be in order.
2068  */
2069 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
2070 {
2071 	struct bio **bip;
2072 	raid5_conf_t *conf = sh->raid_conf;
2073 	int firstwrite=0;
2074 
2075 	pr_debug("adding bh b#%llu to stripe s#%llu\n",
2076 		(unsigned long long)bi->bi_sector,
2077 		(unsigned long long)sh->sector);
2078 
2079 
2080 	spin_lock(&sh->lock);
2081 	spin_lock_irq(&conf->device_lock);
2082 	if (forwrite) {
2083 		bip = &sh->dev[dd_idx].towrite;
2084 		if (*bip == NULL && sh->dev[dd_idx].written == NULL)
2085 			firstwrite = 1;
2086 	} else
2087 		bip = &sh->dev[dd_idx].toread;
2088 	while (*bip && (*bip)->bi_sector < bi->bi_sector) {
2089 		if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
2090 			goto overlap;
2091 		bip = & (*bip)->bi_next;
2092 	}
2093 	if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
2094 		goto overlap;
2095 
2096 	BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
2097 	if (*bip)
2098 		bi->bi_next = *bip;
2099 	*bip = bi;
2100 	bi->bi_phys_segments++;
2101 	spin_unlock_irq(&conf->device_lock);
2102 	spin_unlock(&sh->lock);
2103 
2104 	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
2105 		(unsigned long long)bi->bi_sector,
2106 		(unsigned long long)sh->sector, dd_idx);
2107 
2108 	if (conf->mddev->bitmap && firstwrite) {
2109 		bitmap_startwrite(conf->mddev->bitmap, sh->sector,
2110 				  STRIPE_SECTORS, 0);
2111 		sh->bm_seq = conf->seq_flush+1;
2112 		set_bit(STRIPE_BIT_DELAY, &sh->state);
2113 	}
2114 
2115 	if (forwrite) {
2116 		/* check if page is covered */
2117 		sector_t sector = sh->dev[dd_idx].sector;
2118 		for (bi=sh->dev[dd_idx].towrite;
2119 		     sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
2120 			     bi && bi->bi_sector <= sector;
2121 		     bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
2122 			if (bi->bi_sector + (bi->bi_size>>9) >= sector)
2123 				sector = bi->bi_sector + (bi->bi_size>>9);
2124 		}
2125 		if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
2126 			set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
2127 	}
2128 	return 1;
2129 
2130  overlap:
2131 	set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
2132 	spin_unlock_irq(&conf->device_lock);
2133 	spin_unlock(&sh->lock);
2134 	return 0;
2135 }
2136 
2137 static void end_reshape(raid5_conf_t *conf);
2138 
2139 static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
2140 			    struct stripe_head *sh)
2141 {
2142 	int sectors_per_chunk =
2143 		previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
2144 	int dd_idx;
2145 	int chunk_offset = sector_div(stripe, sectors_per_chunk);
2146 	int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
2147 
2148 	raid5_compute_sector(conf,
2149 			     stripe * (disks - conf->max_degraded)
2150 			     *sectors_per_chunk + chunk_offset,
2151 			     previous,
2152 			     &dd_idx, sh);
2153 }
2154 
2155 static void
2156 handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
2157 				struct stripe_head_state *s, int disks,
2158 				struct bio **return_bi)
2159 {
2160 	int i;
2161 	for (i = disks; i--; ) {
2162 		struct bio *bi;
2163 		int bitmap_end = 0;
2164 
2165 		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
2166 			mdk_rdev_t *rdev;
2167 			rcu_read_lock();
2168 			rdev = rcu_dereference(conf->disks[i].rdev);
2169 			if (rdev && test_bit(In_sync, &rdev->flags))
2170 				/* multiple read failures in one stripe */
2171 				md_error(conf->mddev, rdev);
2172 			rcu_read_unlock();
2173 		}
2174 		spin_lock_irq(&conf->device_lock);
2175 		/* fail all writes first */
2176 		bi = sh->dev[i].towrite;
2177 		sh->dev[i].towrite = NULL;
2178 		if (bi) {
2179 			s->to_write--;
2180 			bitmap_end = 1;
2181 		}
2182 
2183 		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2184 			wake_up(&conf->wait_for_overlap);
2185 
2186 		while (bi && bi->bi_sector <
2187 			sh->dev[i].sector + STRIPE_SECTORS) {
2188 			struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
2189 			clear_bit(BIO_UPTODATE, &bi->bi_flags);
2190 			if (!raid5_dec_bi_phys_segments(bi)) {
2191 				md_write_end(conf->mddev);
2192 				bi->bi_next = *return_bi;
2193 				*return_bi = bi;
2194 			}
2195 			bi = nextbi;
2196 		}
2197 		/* and fail all 'written' */
2198 		bi = sh->dev[i].written;
2199 		sh->dev[i].written = NULL;
2200 		if (bi) bitmap_end = 1;
2201 		while (bi && bi->bi_sector <
2202 		       sh->dev[i].sector + STRIPE_SECTORS) {
2203 			struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
2204 			clear_bit(BIO_UPTODATE, &bi->bi_flags);
2205 			if (!raid5_dec_bi_phys_segments(bi)) {
2206 				md_write_end(conf->mddev);
2207 				bi->bi_next = *return_bi;
2208 				*return_bi = bi;
2209 			}
2210 			bi = bi2;
2211 		}
2212 
2213 		/* fail any reads if this device is non-operational and
2214 		 * the data has not reached the cache yet.
2215 		 */
2216 		if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
2217 		    (!test_bit(R5_Insync, &sh->dev[i].flags) ||
2218 		      test_bit(R5_ReadError, &sh->dev[i].flags))) {
2219 			bi = sh->dev[i].toread;
2220 			sh->dev[i].toread = NULL;
2221 			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2222 				wake_up(&conf->wait_for_overlap);
2223 			if (bi) s->to_read--;
2224 			while (bi && bi->bi_sector <
2225 			       sh->dev[i].sector + STRIPE_SECTORS) {
2226 				struct bio *nextbi =
2227 					r5_next_bio(bi, sh->dev[i].sector);
2228 				clear_bit(BIO_UPTODATE, &bi->bi_flags);
2229 				if (!raid5_dec_bi_phys_segments(bi)) {
2230 					bi->bi_next = *return_bi;
2231 					*return_bi = bi;
2232 				}
2233 				bi = nextbi;
2234 			}
2235 		}
2236 		spin_unlock_irq(&conf->device_lock);
2237 		if (bitmap_end)
2238 			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2239 					STRIPE_SECTORS, 0, 0);
2240 	}
2241 
2242 	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
2243 		if (atomic_dec_and_test(&conf->pending_full_writes))
2244 			md_wakeup_thread(conf->mddev->thread);
2245 }
2246 
2247 /* fetch_block5 - checks the given member device to see if its data needs
2248  * to be read or computed to satisfy a request.
2249  *
2250  * Returns 1 when no more member devices need to be checked, otherwise returns
2251  * 0 to tell the loop in handle_stripe_fill5 to continue
2252  */
2253 static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s,
2254 			int disk_idx, int disks)
2255 {
2256 	struct r5dev *dev = &sh->dev[disk_idx];
2257 	struct r5dev *failed_dev = &sh->dev[s->failed_num];
2258 
2259 	/* is the data in this block needed, and can we get it? */
2260 	if (!test_bit(R5_LOCKED, &dev->flags) &&
2261 	    !test_bit(R5_UPTODATE, &dev->flags) &&
2262 	    (dev->toread ||
2263 	     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2264 	     s->syncing || s->expanding ||
2265 	     (s->failed &&
2266 	      (failed_dev->toread ||
2267 	       (failed_dev->towrite &&
2268 		!test_bit(R5_OVERWRITE, &failed_dev->flags)))))) {
2269 		/* We would like to get this block, possibly by computing it,
2270 		 * otherwise read it if the backing disk is insync
2271 		 */
2272 		if ((s->uptodate == disks - 1) &&
2273 		    (s->failed && disk_idx == s->failed_num)) {
2274 			set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2275 			set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2276 			set_bit(R5_Wantcompute, &dev->flags);
2277 			sh->ops.target = disk_idx;
2278 			sh->ops.target2 = -1;
2279 			s->req_compute = 1;
2280 			/* Careful: from this point on 'uptodate' is in the eye
2281 			 * of raid_run_ops which services 'compute' operations
2282 			 * before writes. R5_Wantcompute flags a block that will
2283 			 * be R5_UPTODATE by the time it is needed for a
2284 			 * subsequent operation.
2285 			 */
2286 			s->uptodate++;
2287 			return 1; /* uptodate + compute == disks */
2288 		} else if (test_bit(R5_Insync, &dev->flags)) {
2289 			set_bit(R5_LOCKED, &dev->flags);
2290 			set_bit(R5_Wantread, &dev->flags);
2291 			s->locked++;
2292 			pr_debug("Reading block %d (sync=%d)\n", disk_idx,
2293 				s->syncing);
2294 		}
2295 	}
2296 
2297 	return 0;
2298 }
2299 
2300 /**
2301  * handle_stripe_fill5 - read or compute data to satisfy pending requests.
2302  */
2303 static void handle_stripe_fill5(struct stripe_head *sh,
2304 			struct stripe_head_state *s, int disks)
2305 {
2306 	int i;
2307 
2308 	/* look for blocks to read/compute, skip this if a compute
2309 	 * is already in flight, or if the stripe contents are in the
2310 	 * midst of changing due to a write
2311 	 */
2312 	if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
2313 	    !sh->reconstruct_state)
2314 		for (i = disks; i--; )
2315 			if (fetch_block5(sh, s, i, disks))
2316 				break;
2317 	set_bit(STRIPE_HANDLE, &sh->state);
2318 }
2319 
2320 /* fetch_block6 - checks the given member device to see if its data needs
2321  * to be read or computed to satisfy a request.
2322  *
2323  * Returns 1 when no more member devices need to be checked, otherwise returns
2324  * 0 to tell the loop in handle_stripe_fill6 to continue
2325  */
2326 static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
2327 			 struct r6_state *r6s, int disk_idx, int disks)
2328 {
2329 	struct r5dev *dev = &sh->dev[disk_idx];
2330 	struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]],
2331 				  &sh->dev[r6s->failed_num[1]] };
2332 
2333 	if (!test_bit(R5_LOCKED, &dev->flags) &&
2334 	    !test_bit(R5_UPTODATE, &dev->flags) &&
2335 	    (dev->toread ||
2336 	     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2337 	     s->syncing || s->expanding ||
2338 	     (s->failed >= 1 &&
2339 	      (fdev[0]->toread || s->to_write)) ||
2340 	     (s->failed >= 2 &&
2341 	      (fdev[1]->toread || s->to_write)))) {
2342 		/* we would like to get this block, possibly by computing it,
2343 		 * otherwise read it if the backing disk is insync
2344 		 */
2345 		BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
2346 		BUG_ON(test_bit(R5_Wantread, &dev->flags));
2347 		if ((s->uptodate == disks - 1) &&
2348 		    (s->failed && (disk_idx == r6s->failed_num[0] ||
2349 				   disk_idx == r6s->failed_num[1]))) {
2350 			/* have disk failed, and we're requested to fetch it;
2351 			 * do compute it
2352 			 */
2353 			pr_debug("Computing stripe %llu block %d\n",
2354 			       (unsigned long long)sh->sector, disk_idx);
2355 			set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2356 			set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2357 			set_bit(R5_Wantcompute, &dev->flags);
2358 			sh->ops.target = disk_idx;
2359 			sh->ops.target2 = -1; /* no 2nd target */
2360 			s->req_compute = 1;
2361 			s->uptodate++;
2362 			return 1;
2363 		} else if (s->uptodate == disks-2 && s->failed >= 2) {
2364 			/* Computing 2-failure is *very* expensive; only
2365 			 * do it if failed >= 2
2366 			 */
2367 			int other;
2368 			for (other = disks; other--; ) {
2369 				if (other == disk_idx)
2370 					continue;
2371 				if (!test_bit(R5_UPTODATE,
2372 				      &sh->dev[other].flags))
2373 					break;
2374 			}
2375 			BUG_ON(other < 0);
2376 			pr_debug("Computing stripe %llu blocks %d,%d\n",
2377 			       (unsigned long long)sh->sector,
2378 			       disk_idx, other);
2379 			set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2380 			set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2381 			set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
2382 			set_bit(R5_Wantcompute, &sh->dev[other].flags);
2383 			sh->ops.target = disk_idx;
2384 			sh->ops.target2 = other;
2385 			s->uptodate += 2;
2386 			s->req_compute = 1;
2387 			return 1;
2388 		} else if (test_bit(R5_Insync, &dev->flags)) {
2389 			set_bit(R5_LOCKED, &dev->flags);
2390 			set_bit(R5_Wantread, &dev->flags);
2391 			s->locked++;
2392 			pr_debug("Reading block %d (sync=%d)\n",
2393 				disk_idx, s->syncing);
2394 		}
2395 	}
2396 
2397 	return 0;
2398 }
2399 
2400 /**
2401  * handle_stripe_fill6 - read or compute data to satisfy pending requests.
2402  */
2403 static void handle_stripe_fill6(struct stripe_head *sh,
2404 			struct stripe_head_state *s, struct r6_state *r6s,
2405 			int disks)
2406 {
2407 	int i;
2408 
2409 	/* look for blocks to read/compute, skip this if a compute
2410 	 * is already in flight, or if the stripe contents are in the
2411 	 * midst of changing due to a write
2412 	 */
2413 	if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
2414 	    !sh->reconstruct_state)
2415 		for (i = disks; i--; )
2416 			if (fetch_block6(sh, s, r6s, i, disks))
2417 				break;
2418 	set_bit(STRIPE_HANDLE, &sh->state);
2419 }
2420 
2421 
2422 /* handle_stripe_clean_event
2423  * any written block on an uptodate or failed drive can be returned.
2424  * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
2425  * never LOCKED, so we don't need to test 'failed' directly.
2426  */
2427 static void handle_stripe_clean_event(raid5_conf_t *conf,
2428 	struct stripe_head *sh, int disks, struct bio **return_bi)
2429 {
2430 	int i;
2431 	struct r5dev *dev;
2432 
2433 	for (i = disks; i--; )
2434 		if (sh->dev[i].written) {
2435 			dev = &sh->dev[i];
2436 			if (!test_bit(R5_LOCKED, &dev->flags) &&
2437 				test_bit(R5_UPTODATE, &dev->flags)) {
2438 				/* We can return any write requests */
2439 				struct bio *wbi, *wbi2;
2440 				int bitmap_end = 0;
2441 				pr_debug("Return write for disc %d\n", i);
2442 				spin_lock_irq(&conf->device_lock);
2443 				wbi = dev->written;
2444 				dev->written = NULL;
2445 				while (wbi && wbi->bi_sector <
2446 					dev->sector + STRIPE_SECTORS) {
2447 					wbi2 = r5_next_bio(wbi, dev->sector);
2448 					if (!raid5_dec_bi_phys_segments(wbi)) {
2449 						md_write_end(conf->mddev);
2450 						wbi->bi_next = *return_bi;
2451 						*return_bi = wbi;
2452 					}
2453 					wbi = wbi2;
2454 				}
2455 				if (dev->towrite == NULL)
2456 					bitmap_end = 1;
2457 				spin_unlock_irq(&conf->device_lock);
2458 				if (bitmap_end)
2459 					bitmap_endwrite(conf->mddev->bitmap,
2460 							sh->sector,
2461 							STRIPE_SECTORS,
2462 					 !test_bit(STRIPE_DEGRADED, &sh->state),
2463 							0);
2464 			}
2465 		}
2466 
2467 	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
2468 		if (atomic_dec_and_test(&conf->pending_full_writes))
2469 			md_wakeup_thread(conf->mddev->thread);
2470 }
2471 
2472 static void handle_stripe_dirtying5(raid5_conf_t *conf,
2473 		struct stripe_head *sh,	struct stripe_head_state *s, int disks)
2474 {
2475 	int rmw = 0, rcw = 0, i;
2476 	for (i = disks; i--; ) {
2477 		/* would I have to read this buffer for read_modify_write */
2478 		struct r5dev *dev = &sh->dev[i];
2479 		if ((dev->towrite || i == sh->pd_idx) &&
2480 		    !test_bit(R5_LOCKED, &dev->flags) &&
2481 		    !(test_bit(R5_UPTODATE, &dev->flags) ||
2482 		      test_bit(R5_Wantcompute, &dev->flags))) {
2483 			if (test_bit(R5_Insync, &dev->flags))
2484 				rmw++;
2485 			else
2486 				rmw += 2*disks;  /* cannot read it */
2487 		}
2488 		/* Would I have to read this buffer for reconstruct_write */
2489 		if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
2490 		    !test_bit(R5_LOCKED, &dev->flags) &&
2491 		    !(test_bit(R5_UPTODATE, &dev->flags) ||
2492 		    test_bit(R5_Wantcompute, &dev->flags))) {
2493 			if (test_bit(R5_Insync, &dev->flags)) rcw++;
2494 			else
2495 				rcw += 2*disks;
2496 		}
2497 	}
2498 	pr_debug("for sector %llu, rmw=%d rcw=%d\n",
2499 		(unsigned long long)sh->sector, rmw, rcw);
2500 	set_bit(STRIPE_HANDLE, &sh->state);
2501 	if (rmw < rcw && rmw > 0)
2502 		/* prefer read-modify-write, but need to get some data */
2503 		for (i = disks; i--; ) {
2504 			struct r5dev *dev = &sh->dev[i];
2505 			if ((dev->towrite || i == sh->pd_idx) &&
2506 			    !test_bit(R5_LOCKED, &dev->flags) &&
2507 			    !(test_bit(R5_UPTODATE, &dev->flags) ||
2508 			    test_bit(R5_Wantcompute, &dev->flags)) &&
2509 			    test_bit(R5_Insync, &dev->flags)) {
2510 				if (
2511 				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2512 					pr_debug("Read_old block "
2513 						"%d for r-m-w\n", i);
2514 					set_bit(R5_LOCKED, &dev->flags);
2515 					set_bit(R5_Wantread, &dev->flags);
2516 					s->locked++;
2517 				} else {
2518 					set_bit(STRIPE_DELAYED, &sh->state);
2519 					set_bit(STRIPE_HANDLE, &sh->state);
2520 				}
2521 			}
2522 		}
2523 	if (rcw <= rmw && rcw > 0)
2524 		/* want reconstruct write, but need to get some data */
2525 		for (i = disks; i--; ) {
2526 			struct r5dev *dev = &sh->dev[i];
2527 			if (!test_bit(R5_OVERWRITE, &dev->flags) &&
2528 			    i != sh->pd_idx &&
2529 			    !test_bit(R5_LOCKED, &dev->flags) &&
2530 			    !(test_bit(R5_UPTODATE, &dev->flags) ||
2531 			    test_bit(R5_Wantcompute, &dev->flags)) &&
2532 			    test_bit(R5_Insync, &dev->flags)) {
2533 				if (
2534 				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2535 					pr_debug("Read_old block "
2536 						"%d for Reconstruct\n", i);
2537 					set_bit(R5_LOCKED, &dev->flags);
2538 					set_bit(R5_Wantread, &dev->flags);
2539 					s->locked++;
2540 				} else {
2541 					set_bit(STRIPE_DELAYED, &sh->state);
2542 					set_bit(STRIPE_HANDLE, &sh->state);
2543 				}
2544 			}
2545 		}
2546 	/* now if nothing is locked, and if we have enough data,
2547 	 * we can start a write request
2548 	 */
2549 	/* since handle_stripe can be called at any time we need to handle the
2550 	 * case where a compute block operation has been submitted and then a
2551 	 * subsequent call wants to start a write request.  raid_run_ops only
2552 	 * handles the case where compute block and reconstruct are requested
2553 	 * simultaneously.  If this is not the case then new writes need to be
2554 	 * held off until the compute completes.
2555 	 */
2556 	if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2557 	    (s->locked == 0 && (rcw == 0 || rmw == 0) &&
2558 	    !test_bit(STRIPE_BIT_DELAY, &sh->state)))
2559 		schedule_reconstruction(sh, s, rcw == 0, 0);
2560 }
2561 
2562 static void handle_stripe_dirtying6(raid5_conf_t *conf,
2563 		struct stripe_head *sh,	struct stripe_head_state *s,
2564 		struct r6_state *r6s, int disks)
2565 {
2566 	int rcw = 0, pd_idx = sh->pd_idx, i;
2567 	int qd_idx = sh->qd_idx;
2568 
2569 	set_bit(STRIPE_HANDLE, &sh->state);
2570 	for (i = disks; i--; ) {
2571 		struct r5dev *dev = &sh->dev[i];
2572 		/* check if we haven't enough data */
2573 		if (!test_bit(R5_OVERWRITE, &dev->flags) &&
2574 		    i != pd_idx && i != qd_idx &&
2575 		    !test_bit(R5_LOCKED, &dev->flags) &&
2576 		    !(test_bit(R5_UPTODATE, &dev->flags) ||
2577 		      test_bit(R5_Wantcompute, &dev->flags))) {
2578 			rcw++;
2579 			if (!test_bit(R5_Insync, &dev->flags))
2580 				continue; /* it's a failed drive */
2581 
2582 			if (
2583 			  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2584 				pr_debug("Read_old stripe %llu "
2585 					"block %d for Reconstruct\n",
2586 				     (unsigned long long)sh->sector, i);
2587 				set_bit(R5_LOCKED, &dev->flags);
2588 				set_bit(R5_Wantread, &dev->flags);
2589 				s->locked++;
2590 			} else {
2591 				pr_debug("Request delayed stripe %llu "
2592 					"block %d for Reconstruct\n",
2593 				     (unsigned long long)sh->sector, i);
2594 				set_bit(STRIPE_DELAYED, &sh->state);
2595 				set_bit(STRIPE_HANDLE, &sh->state);
2596 			}
2597 		}
2598 	}
2599 	/* now if nothing is locked, and if we have enough data, we can start a
2600 	 * write request
2601 	 */
2602 	if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2603 	    s->locked == 0 && rcw == 0 &&
2604 	    !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
2605 		schedule_reconstruction(sh, s, 1, 0);
2606 	}
2607 }
2608 
2609 static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2610 				struct stripe_head_state *s, int disks)
2611 {
2612 	struct r5dev *dev = NULL;
2613 
2614 	set_bit(STRIPE_HANDLE, &sh->state);
2615 
2616 	switch (sh->check_state) {
2617 	case check_state_idle:
2618 		/* start a new check operation if there are no failures */
2619 		if (s->failed == 0) {
2620 			BUG_ON(s->uptodate != disks);
2621 			sh->check_state = check_state_run;
2622 			set_bit(STRIPE_OP_CHECK, &s->ops_request);
2623 			clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
2624 			s->uptodate--;
2625 			break;
2626 		}
2627 		dev = &sh->dev[s->failed_num];
2628 		/* fall through */
2629 	case check_state_compute_result:
2630 		sh->check_state = check_state_idle;
2631 		if (!dev)
2632 			dev = &sh->dev[sh->pd_idx];
2633 
2634 		/* check that a write has not made the stripe insync */
2635 		if (test_bit(STRIPE_INSYNC, &sh->state))
2636 			break;
2637 
2638 		/* either failed parity check, or recovery is happening */
2639 		BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
2640 		BUG_ON(s->uptodate != disks);
2641 
2642 		set_bit(R5_LOCKED, &dev->flags);
2643 		s->locked++;
2644 		set_bit(R5_Wantwrite, &dev->flags);
2645 
2646 		clear_bit(STRIPE_DEGRADED, &sh->state);
2647 		set_bit(STRIPE_INSYNC, &sh->state);
2648 		break;
2649 	case check_state_run:
2650 		break; /* we will be called again upon completion */
2651 	case check_state_check_result:
2652 		sh->check_state = check_state_idle;
2653 
2654 		/* if a failure occurred during the check operation, leave
2655 		 * STRIPE_INSYNC not set and let the stripe be handled again
2656 		 */
2657 		if (s->failed)
2658 			break;
2659 
2660 		/* handle a successful check operation, if parity is correct
2661 		 * we are done.  Otherwise update the mismatch count and repair
2662 		 * parity if !MD_RECOVERY_CHECK
2663 		 */
2664 		if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
2665 			/* parity is correct (on disc,
2666 			 * not in buffer any more)
2667 			 */
2668 			set_bit(STRIPE_INSYNC, &sh->state);
2669 		else {
2670 			conf->mddev->resync_mismatches += STRIPE_SECTORS;
2671 			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2672 				/* don't try to repair!! */
2673 				set_bit(STRIPE_INSYNC, &sh->state);
2674 			else {
2675 				sh->check_state = check_state_compute_run;
2676 				set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2677 				set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2678 				set_bit(R5_Wantcompute,
2679 					&sh->dev[sh->pd_idx].flags);
2680 				sh->ops.target = sh->pd_idx;
2681 				sh->ops.target2 = -1;
2682 				s->uptodate++;
2683 			}
2684 		}
2685 		break;
2686 	case check_state_compute_run:
2687 		break;
2688 	default:
2689 		printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
2690 		       __func__, sh->check_state,
2691 		       (unsigned long long) sh->sector);
2692 		BUG();
2693 	}
2694 }
2695 
2696 
2697 static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2698 				  struct stripe_head_state *s,
2699 				  struct r6_state *r6s, int disks)
2700 {
2701 	int pd_idx = sh->pd_idx;
2702 	int qd_idx = sh->qd_idx;
2703 	struct r5dev *dev;
2704 
2705 	set_bit(STRIPE_HANDLE, &sh->state);
2706 
2707 	BUG_ON(s->failed > 2);
2708 
2709 	/* Want to check and possibly repair P and Q.
2710 	 * However there could be one 'failed' device, in which
2711 	 * case we can only check one of them, possibly using the
2712 	 * other to generate missing data
2713 	 */
2714 
2715 	switch (sh->check_state) {
2716 	case check_state_idle:
2717 		/* start a new check operation if there are < 2 failures */
2718 		if (s->failed == r6s->q_failed) {
2719 			/* The only possible failed device holds Q, so it
2720 			 * makes sense to check P (If anything else were failed,
2721 			 * we would have used P to recreate it).
2722 			 */
2723 			sh->check_state = check_state_run;
2724 		}
2725 		if (!r6s->q_failed && s->failed < 2) {
2726 			/* Q is not failed, and we didn't use it to generate
2727 			 * anything, so it makes sense to check it
2728 			 */
2729 			if (sh->check_state == check_state_run)
2730 				sh->check_state = check_state_run_pq;
2731 			else
2732 				sh->check_state = check_state_run_q;
2733 		}
2734 
2735 		/* discard potentially stale zero_sum_result */
2736 		sh->ops.zero_sum_result = 0;
2737 
2738 		if (sh->check_state == check_state_run) {
2739 			/* async_xor_zero_sum destroys the contents of P */
2740 			clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
2741 			s->uptodate--;
2742 		}
2743 		if (sh->check_state >= check_state_run &&
2744 		    sh->check_state <= check_state_run_pq) {
2745 			/* async_syndrome_zero_sum preserves P and Q, so
2746 			 * no need to mark them !uptodate here
2747 			 */
2748 			set_bit(STRIPE_OP_CHECK, &s->ops_request);
2749 			break;
2750 		}
2751 
2752 		/* we have 2-disk failure */
2753 		BUG_ON(s->failed != 2);
2754 		/* fall through */
2755 	case check_state_compute_result:
2756 		sh->check_state = check_state_idle;
2757 
2758 		/* check that a write has not made the stripe insync */
2759 		if (test_bit(STRIPE_INSYNC, &sh->state))
2760 			break;
2761 
2762 		/* now write out any block on a failed drive,
2763 		 * or P or Q if they were recomputed
2764 		 */
2765 		BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
2766 		if (s->failed == 2) {
2767 			dev = &sh->dev[r6s->failed_num[1]];
2768 			s->locked++;
2769 			set_bit(R5_LOCKED, &dev->flags);
2770 			set_bit(R5_Wantwrite, &dev->flags);
2771 		}
2772 		if (s->failed >= 1) {
2773 			dev = &sh->dev[r6s->failed_num[0]];
2774 			s->locked++;
2775 			set_bit(R5_LOCKED, &dev->flags);
2776 			set_bit(R5_Wantwrite, &dev->flags);
2777 		}
2778 		if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
2779 			dev = &sh->dev[pd_idx];
2780 			s->locked++;
2781 			set_bit(R5_LOCKED, &dev->flags);
2782 			set_bit(R5_Wantwrite, &dev->flags);
2783 		}
2784 		if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
2785 			dev = &sh->dev[qd_idx];
2786 			s->locked++;
2787 			set_bit(R5_LOCKED, &dev->flags);
2788 			set_bit(R5_Wantwrite, &dev->flags);
2789 		}
2790 		clear_bit(STRIPE_DEGRADED, &sh->state);
2791 
2792 		set_bit(STRIPE_INSYNC, &sh->state);
2793 		break;
2794 	case check_state_run:
2795 	case check_state_run_q:
2796 	case check_state_run_pq:
2797 		break; /* we will be called again upon completion */
2798 	case check_state_check_result:
2799 		sh->check_state = check_state_idle;
2800 
2801 		/* handle a successful check operation, if parity is correct
2802 		 * we are done.  Otherwise update the mismatch count and repair
2803 		 * parity if !MD_RECOVERY_CHECK
2804 		 */
2805 		if (sh->ops.zero_sum_result == 0) {
2806 			/* both parities are correct */
2807 			if (!s->failed)
2808 				set_bit(STRIPE_INSYNC, &sh->state);
2809 			else {
2810 				/* in contrast to the raid5 case we can validate
2811 				 * parity, but still have a failure to write
2812 				 * back
2813 				 */
2814 				sh->check_state = check_state_compute_result;
2815 				/* Returning at this point means that we may go
2816 				 * off and bring p and/or q uptodate again so
2817 				 * we make sure to check zero_sum_result again
2818 				 * to verify if p or q need writeback
2819 				 */
2820 			}
2821 		} else {
2822 			conf->mddev->resync_mismatches += STRIPE_SECTORS;
2823 			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2824 				/* don't try to repair!! */
2825 				set_bit(STRIPE_INSYNC, &sh->state);
2826 			else {
2827 				int *target = &sh->ops.target;
2828 
2829 				sh->ops.target = -1;
2830 				sh->ops.target2 = -1;
2831 				sh->check_state = check_state_compute_run;
2832 				set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2833 				set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2834 				if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
2835 					set_bit(R5_Wantcompute,
2836 						&sh->dev[pd_idx].flags);
2837 					*target = pd_idx;
2838 					target = &sh->ops.target2;
2839 					s->uptodate++;
2840 				}
2841 				if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
2842 					set_bit(R5_Wantcompute,
2843 						&sh->dev[qd_idx].flags);
2844 					*target = qd_idx;
2845 					s->uptodate++;
2846 				}
2847 			}
2848 		}
2849 		break;
2850 	case check_state_compute_run:
2851 		break;
2852 	default:
2853 		printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
2854 		       __func__, sh->check_state,
2855 		       (unsigned long long) sh->sector);
2856 		BUG();
2857 	}
2858 }
2859 
2860 static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
2861 				struct r6_state *r6s)
2862 {
2863 	int i;
2864 
2865 	/* We have read all the blocks in this stripe and now we need to
2866 	 * copy some of them into a target stripe for expand.
2867 	 */
2868 	struct dma_async_tx_descriptor *tx = NULL;
2869 	clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2870 	for (i = 0; i < sh->disks; i++)
2871 		if (i != sh->pd_idx && i != sh->qd_idx) {
2872 			int dd_idx, j;
2873 			struct stripe_head *sh2;
2874 			struct async_submit_ctl submit;
2875 
2876 			sector_t bn = compute_blocknr(sh, i, 1);
2877 			sector_t s = raid5_compute_sector(conf, bn, 0,
2878 							  &dd_idx, NULL);
2879 			sh2 = get_active_stripe(conf, s, 0, 1, 1);
2880 			if (sh2 == NULL)
2881 				/* so far only the early blocks of this stripe
2882 				 * have been requested.  When later blocks
2883 				 * get requested, we will try again
2884 				 */
2885 				continue;
2886 			if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
2887 			   test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
2888 				/* must have already done this block */
2889 				release_stripe(sh2);
2890 				continue;
2891 			}
2892 
2893 			/* place all the copies on one channel */
2894 			init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
2895 			tx = async_memcpy(sh2->dev[dd_idx].page,
2896 					  sh->dev[i].page, 0, 0, STRIPE_SIZE,
2897 					  &submit);
2898 
2899 			set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
2900 			set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
2901 			for (j = 0; j < conf->raid_disks; j++)
2902 				if (j != sh2->pd_idx &&
2903 				    (!r6s || j != sh2->qd_idx) &&
2904 				    !test_bit(R5_Expanded, &sh2->dev[j].flags))
2905 					break;
2906 			if (j == conf->raid_disks) {
2907 				set_bit(STRIPE_EXPAND_READY, &sh2->state);
2908 				set_bit(STRIPE_HANDLE, &sh2->state);
2909 			}
2910 			release_stripe(sh2);
2911 
2912 		}
2913 	/* done submitting copies, wait for them to complete */
2914 	if (tx) {
2915 		async_tx_ack(tx);
2916 		dma_wait_for_async_tx(tx);
2917 	}
2918 }
2919 
2920 
2921 /*
2922  * handle_stripe - do things to a stripe.
2923  *
2924  * We lock the stripe and then examine the state of various bits
2925  * to see what needs to be done.
2926  * Possible results:
2927  *    return some read requests which now have data
2928  *    return some write requests which are safely on disc
2929  *    schedule a read on some buffers
2930  *    schedule a write of some buffers
2931  *    return confirmation of parity correctness
2932  *
2933  * buffers are taken off read_list or write_list, and bh_cache buffers
2934  * get BH_Lock set before the stripe lock is released.
2935  *
2936  */
2937 
/*
 * handle_stripe5 - one pass of the stripe state machine; handle_stripe()
 * calls this for every array whose level is not 6.
 *
 * Under sh->lock it clears STRIPE_HANDLE/STRIPE_DELAYED, tallies the state
 * of every device in the stripe into the local 'struct stripe_head_state s',
 * and then decides which reads, writes, parity operations and expansion
 * steps to schedule.  After the lock is dropped it waits for a blocked
 * device (if one was found), runs any requested async operations and I/O,
 * and finally hands 'return_bi' to return_io().
 */
2938 static void handle_stripe5(struct stripe_head *sh)
2939 {
2940 	raid5_conf_t *conf = sh->raid_conf;
2941 	int disks = sh->disks, i;
2942 	struct bio *return_bi = NULL;
2943 	struct stripe_head_state s;
2944 	struct r5dev *dev;
2945 	mdk_rdev_t *blocked_rdev = NULL;
2946 	int prexor;
2947 	int dec_preread_active = 0;
2948 
2949 	memset(&s, 0, sizeof(s));
2950 	pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d "
2951 		 "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state,
2952 		 atomic_read(&sh->count), sh->pd_idx, sh->check_state,
2953 		 sh->reconstruct_state);
2954 
2955 	spin_lock(&sh->lock);
2956 	clear_bit(STRIPE_HANDLE, &sh->state);
2957 	clear_bit(STRIPE_DELAYED, &sh->state);
2958 
	/* snapshot the stripe-wide state bits once; the decisions below use
	 * these copies (s.syncing is additionally cleared below when the
	 * sync is abandoned)
	 */
2959 	s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
2960 	s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2961 	s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
2962 
2963 	/* Now to look around and see what can be done */
2964 	rcu_read_lock();
	/* devices are visited from index disks-1 down to 0, so when several
	 * devices count as failed, s.failed_num ends up holding the lowest
	 * failed index
	 */
2965 	for (i=disks; i--; ) {
2966 		mdk_rdev_t *rdev;
2967 
2968 		dev = &sh->dev[i];
2969 		clear_bit(R5_Insync, &dev->flags);
2970 
2971 		pr_debug("check %d: state 0x%lx toread %p read %p write %p "
2972 			"written %p\n",	i, dev->flags, dev->toread, dev->read,
2973 			dev->towrite, dev->written);
2974 
2975 		/* maybe we can request a biofill operation
2976 		 *
2977 		 * new wantfill requests are only permitted while
2978 		 * ops_complete_biofill is guaranteed to be inactive
2979 		 */
2980 		if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
2981 		    !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
2982 			set_bit(R5_Wantfill, &dev->flags);
2983 
2984 		/* now count some things */
2985 		if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
2986 		if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
2987 		if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++;
2988 
		/* a device with R5_Wantfill set is counted in to_fill only,
		 * never in to_read
		 */
2989 		if (test_bit(R5_Wantfill, &dev->flags))
2990 			s.to_fill++;
2991 		else if (dev->toread)
2992 			s.to_read++;
2993 		if (dev->towrite) {
2994 			s.to_write++;
2995 			if (!test_bit(R5_OVERWRITE, &dev->flags))
2996 				s.non_overwrite++;
2997 		}
2998 		if (dev->written)
2999 			s.written++;
3000 		rdev = rcu_dereference(conf->disks[i].rdev);
		/* remember only the first Blocked rdev seen and bump its
		 * nr_pending; the count is dropped again below if nothing
		 * needs to wait for it
		 */
3001 		if (blocked_rdev == NULL &&
3002 		    rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
3003 			blocked_rdev = rdev;
3004 			atomic_inc(&rdev->nr_pending);
3005 		}
3006 		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
3007 			/* The ReadError flag will just be confusing now */
3008 			clear_bit(R5_ReadError, &dev->flags);
3009 			clear_bit(R5_ReWrite, &dev->flags);
3010 		}
		/* a device counts as failed when it is missing, not In_sync,
		 * or still carries R5_ReadError; otherwise it is marked
		 * R5_Insync for the rest of this pass
		 */
3011 		if (!rdev || !test_bit(In_sync, &rdev->flags)
3012 		    || test_bit(R5_ReadError, &dev->flags)) {
3013 			s.failed++;
3014 			s.failed_num = i;
3015 		} else
3016 			set_bit(R5_Insync, &dev->flags);
3017 	}
3018 	rcu_read_unlock();
3019 
3020 	if (unlikely(blocked_rdev)) {
		/* sync, reshape and write activity must wait for the blocked
		 * device: re-flag the stripe and skip straight to unlock,
		 * where md_wait_for_blocked_rdev() is called
		 */
3021 		if (s.syncing || s.expanding || s.expanded ||
3022 		    s.to_write || s.written) {
3023 			set_bit(STRIPE_HANDLE, &sh->state);
3024 			goto unlock;
3025 		}
3026 		/* There is nothing for the blocked_rdev to block */
3027 		rdev_dec_pending(blocked_rdev, conf->mddev);
3028 		blocked_rdev = NULL;
3029 	}
3030 
3031 	if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
3032 		set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
3033 		set_bit(STRIPE_BIOFILL_RUN, &sh->state);
3034 	}
3035 
3036 	pr_debug("locked=%d uptodate=%d to_read=%d"
3037 		" to_write=%d failed=%d failed_num=%d\n",
3038 		s.locked, s.uptodate, s.to_read, s.to_write,
3039 		s.failed, s.failed_num);
3040 	/* check if the array has lost two devices and, if so, some requests might
3041 	 * need to be failed
3042 	 */
3043 	if (s.failed > 1 && s.to_read+s.to_write+s.written)
3044 		handle_failed_stripe(conf, sh, &s, disks, &return_bi);
3045 	if (s.failed > 1 && s.syncing) {
3046 		md_done_sync(conf->mddev, STRIPE_SECTORS,0);
3047 		clear_bit(STRIPE_SYNCING, &sh->state);
3048 		s.syncing = 0;
3049 	}
3050 
3051 	/* might be able to return some write requests if the parity block
3052 	 * is safe, or on a failed drive
3053 	 */
3054 	dev = &sh->dev[sh->pd_idx];
3055 	if ( s.written &&
3056 	     ((test_bit(R5_Insync, &dev->flags) &&
3057 	       !test_bit(R5_LOCKED, &dev->flags) &&
3058 	       test_bit(R5_UPTODATE, &dev->flags)) ||
3059 	       (s.failed == 1 && s.failed_num == sh->pd_idx)))
3060 		handle_stripe_clean_event(conf, sh, disks, &return_bi);
3061 
3062 	/* Now we might consider reading some blocks, either to check/generate
3063 	 * parity, or to satisfy requests
3064 	 * or to load a block that is being partially written.
3065 	 */
3066 	if (s.to_read || s.non_overwrite ||
3067 	    (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
3068 		handle_stripe_fill5(sh, &s, disks);
3069 
3070 	/* Now we check to see if any write operations have recently
3071 	 * completed
3072 	 */
	/* prexor remembers that the finished operation was the
	 * prexor_drain variant; in that case the STRIPE_INSYNC update in
	 * the loop below is skipped
	 */
3073 	prexor = 0;
3074 	if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
3075 		prexor = 1;
3076 	if (sh->reconstruct_state == reconstruct_state_drain_result ||
3077 	    sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
3078 		sh->reconstruct_state = reconstruct_state_idle;
3079 
3080 		/* All the 'written' buffers and the parity block are ready to
3081 		 * be written back to disk
3082 		 */
3083 		BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
3084 		for (i = disks; i--; ) {
3085 			dev = &sh->dev[i];
3086 			if (test_bit(R5_LOCKED, &dev->flags) &&
3087 				(i == sh->pd_idx || dev->written)) {
3088 				pr_debug("Writing block %d\n", i);
3089 				set_bit(R5_Wantwrite, &dev->flags);
3090 				if (prexor)
3091 					continue;
3092 				if (!test_bit(R5_Insync, &dev->flags) ||
3093 				    (i == sh->pd_idx && s.failed == 0))
3094 					set_bit(STRIPE_INSYNC, &sh->state);
3095 			}
3096 		}
		/* the matching atomic_dec of preread_active_stripes is
		 * deferred until after ops_run_io() at the end of the function
		 */
3097 		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3098 			dec_preread_active = 1;
3099 	}
3100 
3101 	/* Now to consider new write requests and what else, if anything
3102 	 * should be read.  We do not handle new writes when:
3103 	 * 1/ A 'write' operation (copy+xor) is already in flight.
3104 	 * 2/ A 'check' operation is in flight, as it may clobber the parity
3105 	 *    block.
3106 	 */
3107 	if (s.to_write && !sh->reconstruct_state && !sh->check_state)
3108 		handle_stripe_dirtying5(conf, sh, &s, disks);
3109 
3110 	/* maybe we need to check and possibly fix the parity for this stripe
3111 	 * Any reads will already have been scheduled, so we just see if enough
3112 	 * data is available.  The parity check is held off while parity
3113 	 * dependent operations are in flight.
3114 	 */
3115 	if (sh->check_state ||
3116 	    (s.syncing && s.locked == 0 &&
3117 	     !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
3118 	     !test_bit(STRIPE_INSYNC, &sh->state)))
3119 		handle_parity_checks5(conf, sh, &s, disks);
3120 
	/* sync of this stripe is finished once nothing is locked and the
	 * stripe has been marked STRIPE_INSYNC
	 */
3121 	if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
3122 		md_done_sync(conf->mddev, STRIPE_SECTORS,1);
3123 		clear_bit(STRIPE_SYNCING, &sh->state);
3124 	}
3125 
3126 	/* If the failed drive is just a ReadError, then we might need to progress
3127 	 * the repair/check process
3128 	 */
3129 	if (s.failed == 1 && !conf->mddev->ro &&
3130 	    test_bit(R5_ReadError, &sh->dev[s.failed_num].flags)
3131 	    && !test_bit(R5_LOCKED, &sh->dev[s.failed_num].flags)
3132 	    && test_bit(R5_UPTODATE, &sh->dev[s.failed_num].flags)
3133 		) {
3134 		dev = &sh->dev[s.failed_num];
		/* two-step repair: the first pass schedules a write of the
		 * block and sets R5_ReWrite; a later pass that finds
		 * R5_ReWrite already set schedules a read-back instead
		 */
3135 		if (!test_bit(R5_ReWrite, &dev->flags)) {
3136 			set_bit(R5_Wantwrite, &dev->flags);
3137 			set_bit(R5_ReWrite, &dev->flags);
3138 			set_bit(R5_LOCKED, &dev->flags);
3139 			s.locked++;
3140 		} else {
3141 			/* let's read it back */
3142 			set_bit(R5_Wantread, &dev->flags);
3143 			set_bit(R5_LOCKED, &dev->flags);
3144 			s.locked++;
3145 		}
3146 	}
3147 
3148 	/* Finish reconstruct operations initiated by the expansion process */
3149 	if (sh->reconstruct_state == reconstruct_state_result) {
		/* look up the stripe at the same sector that may still be
		 * acting as an expansion source (STRIPE_EXPAND_SOURCE)
		 */
3150 		struct stripe_head *sh2
3151 			= get_active_stripe(conf, sh->sector, 1, 1, 1);
3152 		if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
3153 			/* sh cannot be written until sh2 has been read.
3154 			 * so arrange for sh to be delayed a little
3155 			 */
3156 			set_bit(STRIPE_DELAYED, &sh->state);
3157 			set_bit(STRIPE_HANDLE, &sh->state);
3158 			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
3159 					      &sh2->state))
3160 				atomic_inc(&conf->preread_active_stripes);
3161 			release_stripe(sh2);
3162 			goto unlock;
3163 		}
3164 		if (sh2)
3165 			release_stripe(sh2);
3166 
		/* reconstruction is done: schedule a write of every device
		 * of the new geometry (conf->raid_disks)
		 */
3167 		sh->reconstruct_state = reconstruct_state_idle;
3168 		clear_bit(STRIPE_EXPANDING, &sh->state);
3169 		for (i = conf->raid_disks; i--; ) {
3170 			set_bit(R5_Wantwrite, &sh->dev[i].flags);
3171 			set_bit(R5_LOCKED, &sh->dev[i].flags);
3172 			s.locked++;
3173 		}
3174 	}
3175 
3176 	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
3177 	    !sh->reconstruct_state) {
3178 		/* Need to write out all blocks after computing parity */
3179 		sh->disks = conf->raid_disks;
3180 		stripe_set_idx(sh->sector, conf, 0, sh);
3181 		schedule_reconstruction(sh, &s, 1, 1);
3182 	} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
		/* expansion of this stripe is complete: drop it from the
		 * reshape accounting and wake anyone waiting for overlap
		 */
3183 		clear_bit(STRIPE_EXPAND_READY, &sh->state);
3184 		atomic_dec(&conf->reshape_stripes);
3185 		wake_up(&conf->wait_for_overlap);
3186 		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3187 	}
3188 
	/* third argument is the r6_state pointer; this non-RAID6 path
	 * passes NULL
	 */
3189 	if (s.expanding && s.locked == 0 &&
3190 	    !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
3191 		handle_stripe_expansion(conf, sh, NULL);
3192 
3193  unlock:
3194 	spin_unlock(&sh->lock);
3195 
	/* everything from here on runs without sh->lock held */
3196 	/* wait for this device to become unblocked */
3197 	if (unlikely(blocked_rdev))
3198 		md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
3199 
3200 	if (s.ops_request)
3201 		raid_run_ops(sh, s.ops_request);
3202 
3203 	ops_run_io(sh, &s);
3204 
3205 	if (dec_preread_active) {
3206 		/* We delay this until after ops_run_io so that if make_request
3207 		 * is waiting on a barrier, it won't continue until the writes
3208 		 * have actually been submitted.
3209 		 */
3210 		atomic_dec(&conf->preread_active_stripes);
3211 		if (atomic_read(&conf->preread_active_stripes) <
3212 		    IO_THRESHOLD)
3213 			md_wakeup_thread(conf->mddev->thread);
3214 	}
3215 	return_io(return_bi);
3216 }
3217 
3218 static void handle_stripe6(struct stripe_head *sh)
3219 {
3220 	raid5_conf_t *conf = sh->raid_conf;
3221 	int disks = sh->disks;
3222 	struct bio *return_bi = NULL;
3223 	int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx;
3224 	struct stripe_head_state s;
3225 	struct r6_state r6s;
3226 	struct r5dev *dev, *pdev, *qdev;
3227 	mdk_rdev_t *blocked_rdev = NULL;
3228 	int dec_preread_active = 0;
3229 
3230 	pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
3231 		"pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
3232 	       (unsigned long long)sh->sector, sh->state,
3233 	       atomic_read(&sh->count), pd_idx, qd_idx,
3234 	       sh->check_state, sh->reconstruct_state);
3235 	memset(&s, 0, sizeof(s));
3236 
3237 	spin_lock(&sh->lock);
3238 	clear_bit(STRIPE_HANDLE, &sh->state);
3239 	clear_bit(STRIPE_DELAYED, &sh->state);
3240 
3241 	s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
3242 	s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
3243 	s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
3244 	/* Now to look around and see what can be done */
3245 
3246 	rcu_read_lock();
3247 	for (i=disks; i--; ) {
3248 		mdk_rdev_t *rdev;
3249 		dev = &sh->dev[i];
3250 		clear_bit(R5_Insync, &dev->flags);
3251 
3252 		pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
3253 			i, dev->flags, dev->toread, dev->towrite, dev->written);
3254 		/* maybe we can reply to a read
3255 		 *
3256 		 * new wantfill requests are only permitted while
3257 		 * ops_complete_biofill is guaranteed to be inactive
3258 		 */
3259 		if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
3260 		    !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
3261 			set_bit(R5_Wantfill, &dev->flags);
3262 
3263 		/* now count some things */
3264 		if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
3265 		if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
3266 		if (test_bit(R5_Wantcompute, &dev->flags)) {
3267 			s.compute++;
3268 			BUG_ON(s.compute > 2);
3269 		}
3270 
3271 		if (test_bit(R5_Wantfill, &dev->flags)) {
3272 			s.to_fill++;
3273 		} else if (dev->toread)
3274 			s.to_read++;
3275 		if (dev->towrite) {
3276 			s.to_write++;
3277 			if (!test_bit(R5_OVERWRITE, &dev->flags))
3278 				s.non_overwrite++;
3279 		}
3280 		if (dev->written)
3281 			s.written++;
3282 		rdev = rcu_dereference(conf->disks[i].rdev);
3283 		if (blocked_rdev == NULL &&
3284 		    rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
3285 			blocked_rdev = rdev;
3286 			atomic_inc(&rdev->nr_pending);
3287 		}
3288 		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
3289 			/* The ReadError flag will just be confusing now */
3290 			clear_bit(R5_ReadError, &dev->flags);
3291 			clear_bit(R5_ReWrite, &dev->flags);
3292 		}
3293 		if (!rdev || !test_bit(In_sync, &rdev->flags)
3294 		    || test_bit(R5_ReadError, &dev->flags)) {
3295 			if (s.failed < 2)
3296 				r6s.failed_num[s.failed] = i;
3297 			s.failed++;
3298 		} else
3299 			set_bit(R5_Insync, &dev->flags);
3300 	}
3301 	rcu_read_unlock();
3302 
3303 	if (unlikely(blocked_rdev)) {
3304 		if (s.syncing || s.expanding || s.expanded ||
3305 		    s.to_write || s.written) {
3306 			set_bit(STRIPE_HANDLE, &sh->state);
3307 			goto unlock;
3308 		}
3309 		/* There is nothing for the blocked_rdev to block */
3310 		rdev_dec_pending(blocked_rdev, conf->mddev);
3311 		blocked_rdev = NULL;
3312 	}
3313 
3314 	if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
3315 		set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
3316 		set_bit(STRIPE_BIOFILL_RUN, &sh->state);
3317 	}
3318 
3319 	pr_debug("locked=%d uptodate=%d to_read=%d"
3320 	       " to_write=%d failed=%d failed_num=%d,%d\n",
3321 	       s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
3322 	       r6s.failed_num[0], r6s.failed_num[1]);
3323 	/* check if the array has lost >2 devices and, if so, some requests
3324 	 * might need to be failed
3325 	 */
3326 	if (s.failed > 2 && s.to_read+s.to_write+s.written)
3327 		handle_failed_stripe(conf, sh, &s, disks, &return_bi);
3328 	if (s.failed > 2 && s.syncing) {
3329 		md_done_sync(conf->mddev, STRIPE_SECTORS,0);
3330 		clear_bit(STRIPE_SYNCING, &sh->state);
3331 		s.syncing = 0;
3332 	}
3333 
3334 	/*
3335 	 * might be able to return some write requests if the parity blocks
3336 	 * are safe, or on a failed drive
3337 	 */
3338 	pdev = &sh->dev[pd_idx];
3339 	r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx)
3340 		|| (s.failed >= 2 && r6s.failed_num[1] == pd_idx);
3341 	qdev = &sh->dev[qd_idx];
3342 	r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == qd_idx)
3343 		|| (s.failed >= 2 && r6s.failed_num[1] == qd_idx);
3344 
3345 	if ( s.written &&
3346 	     ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
3347 			     && !test_bit(R5_LOCKED, &pdev->flags)
3348 			     && test_bit(R5_UPTODATE, &pdev->flags)))) &&
3349 	     ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
3350 			     && !test_bit(R5_LOCKED, &qdev->flags)
3351 			     && test_bit(R5_UPTODATE, &qdev->flags)))))
3352 		handle_stripe_clean_event(conf, sh, disks, &return_bi);
3353 
3354 	/* Now we might consider reading some blocks, either to check/generate
3355 	 * parity, or to satisfy requests
3356 	 * or to load a block that is being partially written.
3357 	 */
3358 	if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
3359 	    (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
3360 		handle_stripe_fill6(sh, &s, &r6s, disks);
3361 
3362 	/* Now we check to see if any write operations have recently
3363 	 * completed
3364 	 */
3365 	if (sh->reconstruct_state == reconstruct_state_drain_result) {
3366 
3367 		sh->reconstruct_state = reconstruct_state_idle;
3368 		/* All the 'written' buffers and the parity blocks are ready to
3369 		 * be written back to disk
3370 		 */
3371 		BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
3372 		BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags));
3373 		for (i = disks; i--; ) {
3374 			dev = &sh->dev[i];
3375 			if (test_bit(R5_LOCKED, &dev->flags) &&
3376 			    (i == sh->pd_idx || i == qd_idx ||
3377 			     dev->written)) {
3378 				pr_debug("Writing block %d\n", i);
3379 				BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
3380 				set_bit(R5_Wantwrite, &dev->flags);
3381 				if (!test_bit(R5_Insync, &dev->flags) ||
3382 				    ((i == sh->pd_idx || i == qd_idx) &&
3383 				      s.failed == 0))
3384 					set_bit(STRIPE_INSYNC, &sh->state);
3385 			}
3386 		}
3387 		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3388 			dec_preread_active = 1;
3389 	}
3390 
3391 	/* Now to consider new write requests and what else, if anything
3392 	 * should be read.  We do not handle new writes when:
3393 	 * 1/ A 'write' operation (copy+gen_syndrome) is already in flight.
3394 	 * 2/ A 'check' operation is in flight, as it may clobber the parity
3395 	 *    block.
3396 	 */
3397 	if (s.to_write && !sh->reconstruct_state && !sh->check_state)
3398 		handle_stripe_dirtying6(conf, sh, &s, &r6s, disks);
3399 
3400 	/* maybe we need to check and possibly fix the parity for this stripe
3401 	 * Any reads will already have been scheduled, so we just see if enough
3402 	 * data is available.  The parity check is held off while parity
3403 	 * dependent operations are in flight.
3404 	 */
3405 	if (sh->check_state ||
3406 	    (s.syncing && s.locked == 0 &&
3407 	     !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
3408 	     !test_bit(STRIPE_INSYNC, &sh->state)))
3409 		handle_parity_checks6(conf, sh, &s, &r6s, disks);
3410 
3411 	if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
3412 		md_done_sync(conf->mddev, STRIPE_SECTORS,1);
3413 		clear_bit(STRIPE_SYNCING, &sh->state);
3414 	}
3415 
3416 	/* If the failed drives are just a ReadError, then we might need
3417 	 * to progress the repair/check process
3418 	 */
3419 	if (s.failed <= 2 && !conf->mddev->ro)
3420 		for (i = 0; i < s.failed; i++) {
3421 			dev = &sh->dev[r6s.failed_num[i]];
3422 			if (test_bit(R5_ReadError, &dev->flags)
3423 			    && !test_bit(R5_LOCKED, &dev->flags)
3424 			    && test_bit(R5_UPTODATE, &dev->flags)
3425 				) {
3426 				if (!test_bit(R5_ReWrite, &dev->flags)) {
3427 					set_bit(R5_Wantwrite, &dev->flags);
3428 					set_bit(R5_ReWrite, &dev->flags);
3429 					set_bit(R5_LOCKED, &dev->flags);
3430 					s.locked++;
3431 				} else {
3432 					/* let's read it back */
3433 					set_bit(R5_Wantread, &dev->flags);
3434 					set_bit(R5_LOCKED, &dev->flags);
3435 					s.locked++;
3436 				}
3437 			}
3438 		}
3439 
3440 	/* Finish reconstruct operations initiated by the expansion process */
3441 	if (sh->reconstruct_state == reconstruct_state_result) {
3442 		sh->reconstruct_state = reconstruct_state_idle;
3443 		clear_bit(STRIPE_EXPANDING, &sh->state);
3444 		for (i = conf->raid_disks; i--; ) {
3445 			set_bit(R5_Wantwrite, &sh->dev[i].flags);
3446 			set_bit(R5_LOCKED, &sh->dev[i].flags);
3447 			s.locked++;
3448 		}
3449 	}
3450 
3451 	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
3452 	    !sh->reconstruct_state) {
3453 		struct stripe_head *sh2
3454 			= get_active_stripe(conf, sh->sector, 1, 1, 1);
3455 		if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
3456 			/* sh cannot be written until sh2 has been read.
3457 			 * so arrange for sh to be delayed a little
3458 			 */
3459 			set_bit(STRIPE_DELAYED, &sh->state);
3460 			set_bit(STRIPE_HANDLE, &sh->state);
3461 			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
3462 					      &sh2->state))
3463 				atomic_inc(&conf->preread_active_stripes);
3464 			release_stripe(sh2);
3465 			goto unlock;
3466 		}
3467 		if (sh2)
3468 			release_stripe(sh2);
3469 
3470 		/* Need to write out all blocks after computing P&Q */
3471 		sh->disks = conf->raid_disks;
3472 		stripe_set_idx(sh->sector, conf, 0, sh);
3473 		schedule_reconstruction(sh, &s, 1, 1);
3474 	} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
3475 		clear_bit(STRIPE_EXPAND_READY, &sh->state);
3476 		atomic_dec(&conf->reshape_stripes);
3477 		wake_up(&conf->wait_for_overlap);
3478 		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3479 	}
3480 
3481 	if (s.expanding && s.locked == 0 &&
3482 	    !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
3483 		handle_stripe_expansion(conf, sh, &r6s);
3484 
3485  unlock:
3486 	spin_unlock(&sh->lock);
3487 
3488 	/* wait for this device to become unblocked */
3489 	if (unlikely(blocked_rdev))
3490 		md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
3491 
3492 	if (s.ops_request)
3493 		raid_run_ops(sh, s.ops_request);
3494 
3495 	ops_run_io(sh, &s);
3496 
3497 
3498 	if (dec_preread_active) {
3499 		/* We delay this until after ops_run_io so that if make_request
3500 		 * is waiting on a barrier, it won't continue until the writes
3501 		 * have actually been submitted.
3502 		 */
3503 		atomic_dec(&conf->preread_active_stripes);
3504 		if (atomic_read(&conf->preread_active_stripes) <
3505 		    IO_THRESHOLD)
3506 			md_wakeup_thread(conf->mddev->thread);
3507 	}
3508 
3509 	return_io(return_bi);
3510 }
3511 
3512 static void handle_stripe(struct stripe_head *sh)
3513 {
3514 	if (sh->raid_conf->level == 6)
3515 		handle_stripe6(sh);
3516 	else
3517 		handle_stripe5(sh);
3518 }
3519 
3520 static void raid5_activate_delayed(raid5_conf_t *conf)
3521 {
3522 	if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
3523 		while (!list_empty(&conf->delayed_list)) {
3524 			struct list_head *l = conf->delayed_list.next;
3525 			struct stripe_head *sh;
3526 			sh = list_entry(l, struct stripe_head, lru);
3527 			list_del_init(l);
3528 			clear_bit(STRIPE_DELAYED, &sh->state);
3529 			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3530 				atomic_inc(&conf->preread_active_stripes);
3531 			list_add_tail(&sh->lru, &conf->hold_list);
3532 		}
3533 	} else
3534 		blk_plug_device(conf->mddev->queue);
3535 }
3536 
3537 static void activate_bit_delay(raid5_conf_t *conf)
3538 {
3539 	/* device_lock is held */
3540 	struct list_head head;
3541 	list_add(&head, &conf->bitmap_list);
3542 	list_del_init(&conf->bitmap_list);
3543 	while (!list_empty(&head)) {
3544 		struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
3545 		list_del_init(&sh->lru);
3546 		atomic_inc(&sh->count);
3547 		__release_stripe(conf, sh);
3548 	}
3549 }
3550 
3551 static void unplug_slaves(mddev_t *mddev)
3552 {
3553 	raid5_conf_t *conf = mddev->private;
3554 	int i;
3555 	int devs = max(conf->raid_disks, conf->previous_raid_disks);
3556 
3557 	rcu_read_lock();
3558 	for (i = 0; i < devs; i++) {
3559 		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
3560 		if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
3561 			struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
3562 
3563 			atomic_inc(&rdev->nr_pending);
3564 			rcu_read_unlock();
3565 
3566 			blk_unplug(r_queue);
3567 
3568 			rdev_dec_pending(rdev, mddev);
3569 			rcu_read_lock();
3570 		}
3571 	}
3572 	rcu_read_unlock();
3573 }
3574 
3575 static void raid5_unplug_device(struct request_queue *q)
3576 {
3577 	mddev_t *mddev = q->queuedata;
3578 	raid5_conf_t *conf = mddev->private;
3579 	unsigned long flags;
3580 
3581 	spin_lock_irqsave(&conf->device_lock, flags);
3582 
3583 	if (blk_remove_plug(q)) {
3584 		conf->seq_flush++;
3585 		raid5_activate_delayed(conf);
3586 	}
3587 	md_wakeup_thread(mddev->thread);
3588 
3589 	spin_unlock_irqrestore(&conf->device_lock, flags);
3590 
3591 	unplug_slaves(mddev);
3592 }
3593 
3594 static int raid5_congested(void *data, int bits)
3595 {
3596 	mddev_t *mddev = data;
3597 	raid5_conf_t *conf = mddev->private;
3598 
3599 	/* No difference between reads and writes.  Just check
3600 	 * how busy the stripe_cache is
3601 	 */
3602 
3603 	if (mddev_congested(mddev, bits))
3604 		return 1;
3605 	if (conf->inactive_blocked)
3606 		return 1;
3607 	if (conf->quiesce)
3608 		return 1;
3609 	if (list_empty_careful(&conf->inactive_list))
3610 		return 1;
3611 
3612 	return 0;
3613 }
3614 
3615 /* We want read requests to align with chunks where possible,
3616  * but write requests don't need to.
3617  */
3618 static int raid5_mergeable_bvec(struct request_queue *q,
3619 				struct bvec_merge_data *bvm,
3620 				struct bio_vec *biovec)
3621 {
3622 	mddev_t *mddev = q->queuedata;
3623 	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
3624 	int max;
3625 	unsigned int chunk_sectors = mddev->chunk_sectors;
3626 	unsigned int bio_sectors = bvm->bi_size >> 9;
3627 
3628 	if ((bvm->bi_rw & 1) == WRITE)
3629 		return biovec->bv_len; /* always allow writes to be mergeable */
3630 
3631 	if (mddev->new_chunk_sectors < mddev->chunk_sectors)
3632 		chunk_sectors = mddev->new_chunk_sectors;
3633 	max =  (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
3634 	if (max < 0) max = 0;
3635 	if (max <= biovec->bv_len && bio_sectors == 0)
3636 		return biovec->bv_len;
3637 	else
3638 		return max;
3639 }
3640 
3641 
3642 static int in_chunk_boundary(mddev_t *mddev, struct bio *bio)
3643 {
3644 	sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
3645 	unsigned int chunk_sectors = mddev->chunk_sectors;
3646 	unsigned int bio_sectors = bio->bi_size >> 9;
3647 
3648 	if (mddev->new_chunk_sectors < mddev->chunk_sectors)
3649 		chunk_sectors = mddev->new_chunk_sectors;
3650 	return  chunk_sectors >=
3651 		((sector & (chunk_sectors - 1)) + bio_sectors);
3652 }
3653 
3654 /*
3655  *  add bio to the retry LIFO  ( in O(1) ... we are in interrupt )
3656  *  later sampled by raid5d.
3657  */
3658 static void add_bio_to_retry(struct bio *bi,raid5_conf_t *conf)
3659 {
3660 	unsigned long flags;
3661 
3662 	spin_lock_irqsave(&conf->device_lock, flags);
3663 
3664 	bi->bi_next = conf->retry_read_aligned_list;
3665 	conf->retry_read_aligned_list = bi;
3666 
3667 	spin_unlock_irqrestore(&conf->device_lock, flags);
3668 	md_wakeup_thread(conf->mddev->thread);
3669 }
3670 
3671 
3672 static struct bio *remove_bio_from_retry(raid5_conf_t *conf)
3673 {
3674 	struct bio *bi;
3675 
3676 	bi = conf->retry_read_aligned;
3677 	if (bi) {
3678 		conf->retry_read_aligned = NULL;
3679 		return bi;
3680 	}
3681 	bi = conf->retry_read_aligned_list;
3682 	if(bi) {
3683 		conf->retry_read_aligned_list = bi->bi_next;
3684 		bi->bi_next = NULL;
3685 		/*
3686 		 * this sets the active strip count to 1 and the processed
3687 		 * strip count to zero (upper 8 bits)
3688 		 */
3689 		bi->bi_phys_segments = 1; /* biased count of active stripes */
3690 	}
3691 
3692 	return bi;
3693 }
3694 
3695 
3696 /*
3697  *  The "raid5_align_endio" should check if the read succeeded and if it
3698  *  did, call bio_endio on the original bio (having bio_put the new bio
3699  *  first).
3700  *  If the read failed..
3701  */
3702 static void raid5_align_endio(struct bio *bi, int error)
3703 {
3704 	struct bio* raid_bi  = bi->bi_private;
3705 	mddev_t *mddev;
3706 	raid5_conf_t *conf;
3707 	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
3708 	mdk_rdev_t *rdev;
3709 
3710 	bio_put(bi);
3711 
3712 	mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata;
3713 	conf = mddev->private;
3714 	rdev = (void*)raid_bi->bi_next;
3715 	raid_bi->bi_next = NULL;
3716 
3717 	rdev_dec_pending(rdev, conf->mddev);
3718 
3719 	if (!error && uptodate) {
3720 		bio_endio(raid_bi, 0);
3721 		if (atomic_dec_and_test(&conf->active_aligned_reads))
3722 			wake_up(&conf->wait_for_stripe);
3723 		return;
3724 	}
3725 
3726 
3727 	pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
3728 
3729 	add_bio_to_retry(raid_bi, conf);
3730 }
3731 
3732 static int bio_fits_rdev(struct bio *bi)
3733 {
3734 	struct request_queue *q = bdev_get_queue(bi->bi_bdev);
3735 
3736 	if ((bi->bi_size>>9) > queue_max_sectors(q))
3737 		return 0;
3738 	blk_recount_segments(q, bi);
3739 	if (bi->bi_phys_segments > queue_max_segments(q))
3740 		return 0;
3741 
3742 	if (q->merge_bvec_fn)
3743 		/* it's too hard to apply the merge_bvec_fn at this stage,
3744 		 * just just give up
3745 		 */
3746 		return 0;
3747 
3748 	return 1;
3749 }
3750 
3751 
3752 static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
3753 {
3754 	mddev_t *mddev = q->queuedata;
3755 	raid5_conf_t *conf = mddev->private;
3756 	int dd_idx;
3757 	struct bio* align_bi;
3758 	mdk_rdev_t *rdev;
3759 
3760 	if (!in_chunk_boundary(mddev, raid_bio)) {
3761 		pr_debug("chunk_aligned_read : non aligned\n");
3762 		return 0;
3763 	}
3764 	/*
3765 	 * use bio_clone to make a copy of the bio
3766 	 */
3767 	align_bi = bio_clone(raid_bio, GFP_NOIO);
3768 	if (!align_bi)
3769 		return 0;
3770 	/*
3771 	 *   set bi_end_io to a new function, and set bi_private to the
3772 	 *     original bio.
3773 	 */
3774 	align_bi->bi_end_io  = raid5_align_endio;
3775 	align_bi->bi_private = raid_bio;
3776 	/*
3777 	 *	compute position
3778 	 */
3779 	align_bi->bi_sector =  raid5_compute_sector(conf, raid_bio->bi_sector,
3780 						    0,
3781 						    &dd_idx, NULL);
3782 
3783 	rcu_read_lock();
3784 	rdev = rcu_dereference(conf->disks[dd_idx].rdev);
3785 	if (rdev && test_bit(In_sync, &rdev->flags)) {
3786 		atomic_inc(&rdev->nr_pending);
3787 		rcu_read_unlock();
3788 		raid_bio->bi_next = (void*)rdev;
3789 		align_bi->bi_bdev =  rdev->bdev;
3790 		align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
3791 		align_bi->bi_sector += rdev->data_offset;
3792 
3793 		if (!bio_fits_rdev(align_bi)) {
3794 			/* too big in some way */
3795 			bio_put(align_bi);
3796 			rdev_dec_pending(rdev, mddev);
3797 			return 0;
3798 		}
3799 
3800 		spin_lock_irq(&conf->device_lock);
3801 		wait_event_lock_irq(conf->wait_for_stripe,
3802 				    conf->quiesce == 0,
3803 				    conf->device_lock, /* nothing */);
3804 		atomic_inc(&conf->active_aligned_reads);
3805 		spin_unlock_irq(&conf->device_lock);
3806 
3807 		generic_make_request(align_bi);
3808 		return 1;
3809 	} else {
3810 		rcu_read_unlock();
3811 		bio_put(align_bi);
3812 		return 0;
3813 	}
3814 }
3815 
3816 /* __get_priority_stripe - get the next stripe to process
3817  *
3818  * Full stripe writes are allowed to pass preread active stripes up until
3819  * the bypass_threshold is exceeded.  In general the bypass_count
3820  * increments when the handle_list is handled before the hold_list; however, it
3821  * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a
3822  * stripe with in flight i/o.  The bypass_count will be reset when the
3823  * head of the hold_list has changed, i.e. the head was promoted to the
3824  * handle_list.
3825  */
3826 static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf)
3827 {
3828 	struct stripe_head *sh;
3829 
3830 	pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
3831 		  __func__,
3832 		  list_empty(&conf->handle_list) ? "empty" : "busy",
3833 		  list_empty(&conf->hold_list) ? "empty" : "busy",
3834 		  atomic_read(&conf->pending_full_writes), conf->bypass_count);
3835 
3836 	if (!list_empty(&conf->handle_list)) {
3837 		sh = list_entry(conf->handle_list.next, typeof(*sh), lru);
3838 
3839 		if (list_empty(&conf->hold_list))
3840 			conf->bypass_count = 0;
3841 		else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) {
3842 			if (conf->hold_list.next == conf->last_hold)
3843 				conf->bypass_count++;
3844 			else {
3845 				conf->last_hold = conf->hold_list.next;
3846 				conf->bypass_count -= conf->bypass_threshold;
3847 				if (conf->bypass_count < 0)
3848 					conf->bypass_count = 0;
3849 			}
3850 		}
3851 	} else if (!list_empty(&conf->hold_list) &&
3852 		   ((conf->bypass_threshold &&
3853 		     conf->bypass_count > conf->bypass_threshold) ||
3854 		    atomic_read(&conf->pending_full_writes) == 0)) {
3855 		sh = list_entry(conf->hold_list.next,
3856 				typeof(*sh), lru);
3857 		conf->bypass_count -= conf->bypass_threshold;
3858 		if (conf->bypass_count < 0)
3859 			conf->bypass_count = 0;
3860 	} else
3861 		return NULL;
3862 
3863 	list_del_init(&sh->lru);
3864 	atomic_inc(&sh->count);
3865 	BUG_ON(atomic_read(&sh->count) != 1);
3866 	return sh;
3867 }
3868 
/*
 * make_request - accept a bio for the array and attach it to stripes.
 *
 * A barrier bio is handled by quiescing and un-quiescing the array and
 * then passing the bio to md_barrier_request().
 *
 * A read, while no reshape position is recorded
 * (mddev->reshape_position == MaxSector), is first offered to
 * chunk_aligned_read(); a non-zero result means it was taken over there.
 *
 * Everything else is walked in STRIPE_SECTORS steps.  For each step the
 * matching stripe_head is looked up and the bio is attached with
 * add_stripe_bio().  The step is retried whenever the reshape position
 * moved, the sector lies in the suspended range, the stripe is expanding
 * or the add failed.
 *
 * bi->bi_phys_segments is preset to 1; that initial count is dropped at
 * the end with raid5_dec_bi_phys_segments(), and if the result is zero
 * the bio is completed here.
 *
 * Returns 0 on every path.
 */
3869 static int make_request(struct request_queue *q, struct bio * bi)
3870 {
3871 	mddev_t *mddev = q->queuedata;
3872 	raid5_conf_t *conf = mddev->private;
3873 	int dd_idx;
3874 	sector_t new_sector;
3875 	sector_t logical_sector, last_sector;
3876 	struct stripe_head *sh;
3877 	const int rw = bio_data_dir(bi);
3878 	int cpu, remaining;
3879 
3880 	if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) {
3881 		/* Drain all pending writes.  We only really need
3882 		 * to ensure they have been submitted, but this is
3883 		 * easier.
3884 		 */
3885 		mddev->pers->quiesce(mddev, 1);
3886 		mddev->pers->quiesce(mddev, 0);
3887 		md_barrier_request(mddev, bi);
3888 		return 0;
3889 	}
3890 
3891 	md_write_start(mddev, bi);
3892 
3893 	cpu = part_stat_lock();
3894 	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
3895 	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
3896 		      bio_sectors(bi));
3897 	part_stat_unlock();
3898 
3899 	if (rw == READ &&
3900 	     mddev->reshape_position == MaxSector &&
3901 	     chunk_aligned_read(q,bi))
3902 		return 0;
3903 
3904 	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
3905 	last_sector = bi->bi_sector + (bi->bi_size>>9);
3906 	bi->bi_next = NULL;
3907 	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */
3908 
3909 	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
3910 		DEFINE_WAIT(w);
3911 		int disks, data_disks;
3912 		int previous;
3913 
3914 	retry:
3915 		previous = 0;
3916 		disks = conf->raid_disks;
		/* Get onto wait_for_overlap before testing anything: every
		 * schedule() below relies on already being queued there.
		 */
3917 		prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
3918 		if (unlikely(conf->reshape_progress != MaxSector)) {
3919 			/* spinlock is needed as reshape_progress may be
3920 			 * 64bit on a 32bit platform, and so it might be
3921 			 * possible to see a half-updated value
3922 			 * Of course reshape_progress could change after
3923 			 * the lock is dropped, so once we get a reference
3924 			 * to the stripe that we think it is, we will have
3925 			 * to check again.
3926 			 */
3927 			spin_lock_irq(&conf->device_lock);
3928 			if (mddev->delta_disks < 0
3929 			    ? logical_sector < conf->reshape_progress
3930 			    : logical_sector >= conf->reshape_progress) {
3931 				disks = conf->previous_raid_disks;
3932 				previous = 1;
3933 			} else {
3934 				if (mddev->delta_disks < 0
3935 				    ? logical_sector < conf->reshape_safe
3936 				    : logical_sector >= conf->reshape_safe) {
3937 					spin_unlock_irq(&conf->device_lock);
					/* Past reshape_progress but not yet past
					 * reshape_safe: sleep, then re-evaluate.
					 */
3938 					schedule();
3939 					goto retry;
3940 				}
3941 			}
3942 			spin_unlock_irq(&conf->device_lock);
3943 		}
3944 		data_disks = disks - conf->max_degraded;
3945 
3946 		new_sector = raid5_compute_sector(conf, logical_sector,
3947 						  previous,
3948 						  &dd_idx, NULL);
3949 		pr_debug("raid5: make_request, sector %llu logical %llu\n",
3950 			(unsigned long long)new_sector,
3951 			(unsigned long long)logical_sector);
3952 
		/* The read-ahead bit of the bio is passed along here; the
		 * else-branch further down treats a NULL result as a
		 * read-ahead request that is simply given up.
		 */
3953 		sh = get_active_stripe(conf, new_sector, previous,
3954 				       (bi->bi_rw&RWA_MASK), 0);
3955 		if (sh) {
3956 			if (unlikely(previous)) {
3957 				/* expansion might have moved on while waiting for a
3958 				 * stripe, so we must do the range check again.
3959 				 * Expansion could still move past after this
3960 				 * test, but as we are holding a reference to
3961 				 * 'sh', we know that if that happens,
3962 				 *  STRIPE_EXPANDING will get set and the expansion
3963 				 * won't proceed until we finish with the stripe.
3964 				 */
3965 				int must_retry = 0;
3966 				spin_lock_irq(&conf->device_lock);
3967 				if (mddev->delta_disks < 0
3968 				    ? logical_sector >= conf->reshape_progress
3969 				    : logical_sector < conf->reshape_progress)
3970 					/* mismatch, need to try again */
3971 					must_retry = 1;
3972 				spin_unlock_irq(&conf->device_lock);
3973 				if (must_retry) {
3974 					release_stripe(sh);
3975 					schedule();
3976 					goto retry;
3977 				}
3978 			}
3979 
3980 			if (bio_data_dir(bi) == WRITE &&
3981 			    logical_sector >= mddev->suspend_lo &&
3982 			    logical_sector < mddev->suspend_hi) {
3983 				release_stripe(sh);
3984 				/* As the suspend_* range is controlled by
3985 				 * userspace, we want an interruptible
3986 				 * wait.
3987 				 */
3988 				flush_signals(current);
3989 				prepare_to_wait(&conf->wait_for_overlap,
3990 						&w, TASK_INTERRUPTIBLE);
3991 				if (logical_sector >= mddev->suspend_lo &&
3992 				    logical_sector < mddev->suspend_hi)
3993 					schedule();
3994 				goto retry;
3995 			}
3996 
3997 			if (test_bit(STRIPE_EXPANDING, &sh->state) ||
3998 			    !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
3999 				/* Stripe is busy expanding or
4000 				 * add failed due to overlap.  Flush everything
4001 				 * and wait a while
4002 				 */
4003 				raid5_unplug_device(mddev->queue);
4004 				release_stripe(sh);
4005 				schedule();
4006 				goto retry;
4007 			}
4008 			finish_wait(&conf->wait_for_overlap, &w);
4009 			set_bit(STRIPE_HANDLE, &sh->state);
4010 			clear_bit(STRIPE_DELAYED, &sh->state);
			/* While mddev->barrier is set, count this stripe in
			 * preread_active_stripes; the wait at the end of the
			 * function blocks until that counter is back to zero.
			 */
4011 			if (mddev->barrier &&
4012 			    !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4013 				atomic_inc(&conf->preread_active_stripes);
4014 			release_stripe(sh);
4015 		} else {
4016 			/* cannot get stripe for read-ahead, just give-up */
4017 			clear_bit(BIO_UPTODATE, &bi->bi_flags);
4018 			finish_wait(&conf->wait_for_overlap, &w);
4019 			break;
4020 		}
4021 
4022 	}
	/* Drop the initial count of 1 placed in bi_phys_segments above. */
4023 	spin_lock_irq(&conf->device_lock);
4024 	remaining = raid5_dec_bi_phys_segments(bi);
4025 	spin_unlock_irq(&conf->device_lock);
4026 	if (remaining == 0) {
4027 
4028 		if ( rw == WRITE )
4029 			md_write_end(mddev);
4030 
4031 		bio_endio(bi, 0);
4032 	}
4033 
4034 	if (mddev->barrier) {
4035 		/* We need to wait for the stripes to all be handled.
4036 		 * So: wait for preread_active_stripes to drop to 0.
4037 		 */
4038 		wait_event(mddev->thread->wqueue,
4039 			   atomic_read(&conf->preread_active_stripes) == 0);
4040 	}
4041 	return 0;
4042 }
4043 
4044 static sector_t raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks);
4045 
/*
 * reshape_request - carry out one step of an array reshape.
 *
 * @mddev:     array being reshaped
 * @sector_nr: position handed down by sync_request()
 * @skipped:   set to 1 when this call only skips ahead on a restart
 *
 * A normal step covers reshape_sectors, the larger of the old and the
 * new chunk size.  The destination stripes for that range are flagged
 * STRIPE_EXPANDING, conf->reshape_progress is moved by
 * reshape_sectors * new_data_disks, the source stripes are flagged
 * STRIPE_EXPAND_SOURCE, and only then are the destination stripes
 * released.  Before and after the step the function may set
 * MD_CHANGE_DEVS and block on mddev->sb_wait.
 *
 * Returns reshape_sectors after a normal step.  When called with
 * sector_nr == 0 on a reshape that already made progress, it returns the
 * per-device offset to skip to instead and does no other work.
 */
4046 static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped)
4047 {
4048 	/* reshaping is quite different to recovery/resync so it is
4049 	 * handled quite separately ... here.
4050 	 *
4051 	 * On each call to sync_request, we gather one chunk worth of
4052 	 * destination stripes and flag them as expanding.
4053 	 * Then we find all the source stripes and request reads.
4054 	 * As the reads complete, handle_stripe will copy the data
4055 	 * into the destination stripe and release that stripe.
4056 	 */
4057 	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
4058 	struct stripe_head *sh;
4059 	sector_t first_sector, last_sector;
4060 	int raid_disks = conf->previous_raid_disks;
4061 	int data_disks = raid_disks - conf->max_degraded;
4062 	int new_data_disks = conf->raid_disks - conf->max_degraded;
4063 	int i;
4064 	int dd_idx;
4065 	sector_t writepos, readpos, safepos;
4066 	sector_t stripe_addr;
4067 	int reshape_sectors;
4068 	struct list_head stripes;
4069 
4070 	if (sector_nr == 0) {
4071 		/* If restarting in the middle, skip the initial sectors */
4072 		if (mddev->delta_disks < 0 &&
4073 		    conf->reshape_progress < raid5_size(mddev, 0, 0)) {
4074 			sector_nr = raid5_size(mddev, 0, 0)
4075 				- conf->reshape_progress;
4076 		} else if (mddev->delta_disks >= 0 &&
4077 			   conf->reshape_progress > 0)
4078 			sector_nr = conf->reshape_progress;
		/* sector_div() divides sector_nr in place: this turns the
		 * array-wide position into a per-device offset.
		 */
4079 		sector_div(sector_nr, new_data_disks);
4080 		if (sector_nr) {
4081 			mddev->curr_resync_completed = sector_nr;
4082 			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4083 			*skipped = 1;
4084 			return sector_nr;
4085 		}
4086 	}
4087 
4088 	/* We need to process a full chunk at a time.
4089 	 * If old and new chunk sizes differ, we need to process the
4090 	 * largest of these
4091 	 */
4092 	if (mddev->new_chunk_sectors > mddev->chunk_sectors)
4093 		reshape_sectors = mddev->new_chunk_sectors;
4094 	else
4095 		reshape_sectors = mddev->chunk_sectors;
4096 
4097 	/* we update the metadata when there is more than 3Meg
4098 	 * in the block range (that is rather arbitrary, should
4099 	 * probably be time based) or when the data about to be
4100 	 * copied would over-write the source of the data at
4101 	 * the front of the range.
4102 	 * i.e. one new_stripe along from reshape_progress new_maps
4103 	 * to after where reshape_safe old_maps to
4104 	 */
4105 	writepos = conf->reshape_progress;
4106 	sector_div(writepos, new_data_disks);
4107 	readpos = conf->reshape_progress;
4108 	sector_div(readpos, data_disks);
4109 	safepos = conf->reshape_safe;
4110 	sector_div(safepos, data_disks);
4111 	if (mddev->delta_disks < 0) {
4112 		writepos -= min_t(sector_t, reshape_sectors, writepos);
4113 		readpos += reshape_sectors;
4114 		safepos += reshape_sectors;
4115 	} else {
4116 		writepos += reshape_sectors;
4117 		readpos -= min_t(sector_t, reshape_sectors, readpos);
4118 		safepos -= min_t(sector_t, reshape_sectors, safepos);
4119 	}
4120 
4121 	/* 'writepos' is the most advanced device address we might write.
4122 	 * 'readpos' is the least advanced device address we might read.
4123 	 * 'safepos' is the least address recorded in the metadata as having
4124 	 *     been reshaped.
4125 	 * If 'readpos' is behind 'writepos', then there is no way that we can
4126 	 * ensure safety in the face of a crash - that must be done by userspace
4127 	 * making a backup of the data.  So in that case there is no particular
4128 	 * rush to update metadata.
4129 	 * Otherwise if 'safepos' is behind 'writepos', then we really need to
4130 	 * update the metadata to advance 'safepos' to match 'readpos' so that
4131 	 * we can be safe in the event of a crash.
4132 	 * So we insist on updating metadata if safepos is behind writepos and
4133 	 * readpos is beyond writepos.
4134 	 * In any case, update the metadata every 10 seconds.
4135 	 * Maybe that number should be configurable, but I'm not sure it is
4136 	 * worth it.... maybe it could be a multiple of safemode_delay???
4137 	 */
4138 	if ((mddev->delta_disks < 0
4139 	     ? (safepos > writepos && readpos < writepos)
4140 	     : (safepos < writepos && readpos > writepos)) ||
4141 	    time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4142 		/* Cannot proceed until we've updated the superblock... */
4143 		wait_event(conf->wait_for_overlap,
4144 			   atomic_read(&conf->reshape_stripes)==0);
4145 		mddev->reshape_position = conf->reshape_progress;
4146 		mddev->curr_resync_completed = mddev->curr_resync;
4147 		conf->reshape_checkpoint = jiffies;
4148 		set_bit(MD_CHANGE_DEVS, &mddev->flags);
4149 		md_wakeup_thread(mddev->thread);
		/* NOTE(review): this waits for *all* bits in mddev->flags to
		 * clear, while the matching wait near the end of this
		 * function only waits for MD_CHANGE_DEVS - confirm which of
		 * the two conditions is intended.
		 */
4150 		wait_event(mddev->sb_wait, mddev->flags == 0 ||
4151 			   kthread_should_stop());
4152 		spin_lock_irq(&conf->device_lock);
4153 		conf->reshape_safe = mddev->reshape_position;
4154 		spin_unlock_irq(&conf->device_lock);
4155 		wake_up(&conf->wait_for_overlap);
4156 		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4157 	}
4158 
4159 	if (mddev->delta_disks < 0) {
4160 		BUG_ON(conf->reshape_progress == 0);
4161 		stripe_addr = writepos;
4162 		BUG_ON((mddev->dev_sectors &
4163 			~((sector_t)reshape_sectors - 1))
4164 		       - reshape_sectors - stripe_addr
4165 		       != sector_nr);
4166 	} else {
4167 		BUG_ON(writepos != sector_nr + reshape_sectors);
4168 		stripe_addr = sector_nr;
4169 	}
	/* Destination stripes are collected on a private list through
	 * sh->lru and stay referenced until the source stripes below
	 * have been flagged.
	 */
4170 	INIT_LIST_HEAD(&stripes);
4171 	for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
4172 		int j;
4173 		int skipped_disk = 0;
		/* NOTE(review): the result is dereferenced without a NULL
		 * check; presumably this call form cannot return NULL -
		 * confirm against get_active_stripe().
		 */
4174 		sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
4175 		set_bit(STRIPE_EXPANDING, &sh->state);
4176 		atomic_inc(&conf->reshape_stripes);
4177 		/* If any of this stripe is beyond the end of the old
4178 		 * array, then we need to zero those blocks
4179 		 */
4180 		for (j=sh->disks; j--;) {
4181 			sector_t s;
4182 			if (j == sh->pd_idx)
4183 				continue;
4184 			if (conf->level == 6 &&
4185 			    j == sh->qd_idx)
4186 				continue;
4187 			s = compute_blocknr(sh, j, 0);
4188 			if (s < raid5_size(mddev, 0, 0)) {
4189 				skipped_disk = 1;
4190 				continue;
4191 			}
4192 			memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
4193 			set_bit(R5_Expanded, &sh->dev[j].flags);
4194 			set_bit(R5_UPTODATE, &sh->dev[j].flags);
4195 		}
		/* No data block was left to be copied in: the stripe is
		 * already complete and can be handled straight away.
		 */
4196 		if (!skipped_disk) {
4197 			set_bit(STRIPE_EXPAND_READY, &sh->state);
4198 			set_bit(STRIPE_HANDLE, &sh->state);
4199 		}
4200 		list_add(&sh->lru, &stripes);
4201 	}
4202 	spin_lock_irq(&conf->device_lock);
4203 	if (mddev->delta_disks < 0)
4204 		conf->reshape_progress -= reshape_sectors * new_data_disks;
4205 	else
4206 		conf->reshape_progress += reshape_sectors * new_data_disks;
4207 	spin_unlock_irq(&conf->device_lock);
4208 	/* Ok, those stripe are ready. We can start scheduling
4209 	 * reads on the source stripes.
4210 	 * The source stripes are determined by mapping the first and last
4211 	 * block on the destination stripes.
4212 	 */
4213 	first_sector =
4214 		raid5_compute_sector(conf, stripe_addr*(new_data_disks),
4215 				     1, &dd_idx, NULL);
4216 	last_sector =
4217 		raid5_compute_sector(conf, ((stripe_addr+reshape_sectors)
4218 					    * new_data_disks - 1),
4219 				     1, &dd_idx, NULL);
4220 	if (last_sector >= mddev->dev_sectors)
4221 		last_sector = mddev->dev_sectors - 1;
4222 	while (first_sector <= last_sector) {
4223 		sh = get_active_stripe(conf, first_sector, 1, 0, 1);
4224 		set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
4225 		set_bit(STRIPE_HANDLE, &sh->state);
4226 		release_stripe(sh);
4227 		first_sector += STRIPE_SECTORS;
4228 	}
4229 	/* Now that the sources are clearly marked, we can release
4230 	 * the destination stripes
4231 	 */
4232 	while (!list_empty(&stripes)) {
4233 		sh = list_entry(stripes.next, struct stripe_head, lru);
4234 		list_del_init(&sh->lru);
4235 		release_stripe(sh);
4236 	}
4237 	/* If this takes us to the resync_max point where we have to pause,
4238 	 * then we need to write out the superblock.
4239 	 */
4240 	sector_nr += reshape_sectors;
4241 	if ((sector_nr - mddev->curr_resync_completed) * 2
4242 	    >= mddev->resync_max - mddev->curr_resync_completed) {
4243 		/* Cannot proceed until we've updated the superblock... */
4244 		wait_event(conf->wait_for_overlap,
4245 			   atomic_read(&conf->reshape_stripes) == 0);
4246 		mddev->reshape_position = conf->reshape_progress;
4247 		mddev->curr_resync_completed = mddev->curr_resync + reshape_sectors;
4248 		conf->reshape_checkpoint = jiffies;
4249 		set_bit(MD_CHANGE_DEVS, &mddev->flags);
4250 		md_wakeup_thread(mddev->thread);
4251 		wait_event(mddev->sb_wait,
4252 			   !test_bit(MD_CHANGE_DEVS, &mddev->flags)
4253 			   || kthread_should_stop());
4254 		spin_lock_irq(&conf->device_lock);
4255 		conf->reshape_safe = mddev->reshape_position;
4256 		spin_unlock_irq(&conf->device_lock);
4257 		wake_up(&conf->wait_for_overlap);
4258 		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4259 	}
4260 	return reshape_sectors;
4261 }
4262 
4263 /* FIXME go_faster isn't used */
4264 static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
4265 {
4266 	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
4267 	struct stripe_head *sh;
4268 	sector_t max_sector = mddev->dev_sectors;
4269 	int sync_blocks;
4270 	int still_degraded = 0;
4271 	int i;
4272 
4273 	if (sector_nr >= max_sector) {
4274 		/* just being told to finish up .. nothing much to do */
4275 		unplug_slaves(mddev);
4276 
4277 		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
4278 			end_reshape(conf);
4279 			return 0;
4280 		}
4281 
4282 		if (mddev->curr_resync < max_sector) /* aborted */
4283 			bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
4284 					&sync_blocks, 1);
4285 		else /* completed sync */
4286 			conf->fullsync = 0;
4287 		bitmap_close_sync(mddev->bitmap);
4288 
4289 		return 0;
4290 	}
4291 
4292 	/* Allow raid5_quiesce to complete */
4293 	wait_event(conf->wait_for_overlap, conf->quiesce != 2);
4294 
4295 	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4296 		return reshape_request(mddev, sector_nr, skipped);
4297 
4298 	/* No need to check resync_max as we never do more than one
4299 	 * stripe, and as resync_max will always be on a chunk boundary,
4300 	 * if the check in md_do_sync didn't fire, there is no chance
4301 	 * of overstepping resync_max here
4302 	 */
4303 
4304 	/* if there is too many failed drives and we are trying
4305 	 * to resync, then assert that we are finished, because there is
4306 	 * nothing we can do.
4307 	 */
4308 	if (mddev->degraded >= conf->max_degraded &&
4309 	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
4310 		sector_t rv = mddev->dev_sectors - sector_nr;
4311 		*skipped = 1;
4312 		return rv;
4313 	}
4314 	if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
4315 	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
4316 	    !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
4317 		/* we can skip this block, and probably more */
4318 		sync_blocks /= STRIPE_SECTORS;
4319 		*skipped = 1;
4320 		return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
4321 	}
4322 
4323 
4324 	bitmap_cond_end_sync(mddev->bitmap, sector_nr);
4325 
4326 	sh = get_active_stripe(conf, sector_nr, 0, 1, 0);
4327 	if (sh == NULL) {
4328 		sh = get_active_stripe(conf, sector_nr, 0, 0, 0);
4329 		/* make sure we don't swamp the stripe cache if someone else
4330 		 * is trying to get access
4331 		 */
4332 		schedule_timeout_uninterruptible(1);
4333 	}
4334 	/* Need to check if array will still be degraded after recovery/resync
4335 	 * We don't need to check the 'failed' flag as when that gets set,
4336 	 * recovery aborts.
4337 	 */
4338 	for (i = 0; i < conf->raid_disks; i++)
4339 		if (conf->disks[i].rdev == NULL)
4340 			still_degraded = 1;
4341 
4342 	bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
4343 
4344 	spin_lock(&sh->lock);
4345 	set_bit(STRIPE_SYNCING, &sh->state);
4346 	clear_bit(STRIPE_INSYNC, &sh->state);
4347 	spin_unlock(&sh->lock);
4348 
4349 	handle_stripe(sh);
4350 	release_stripe(sh);
4351 
4352 	return STRIPE_SECTORS;
4353 }
4354 
4355 static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
4356 {
4357 	/* We may not be able to submit a whole bio at once as there
4358 	 * may not be enough stripe_heads available.
4359 	 * We cannot pre-allocate enough stripe_heads as we may need
4360 	 * more than exist in the cache (if we allow ever large chunks).
4361 	 * So we do one stripe head at a time and record in
4362 	 * ->bi_hw_segments how many have been done.
4363 	 *
4364 	 * We *know* that this entire raid_bio is in one chunk, so
4365 	 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector.
4366 	 */
4367 	struct stripe_head *sh;
4368 	int dd_idx;
4369 	sector_t sector, logical_sector, last_sector;
4370 	int scnt = 0;
4371 	int remaining;
4372 	int handled = 0;
4373 
4374 	logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
4375 	sector = raid5_compute_sector(conf, logical_sector,
4376 				      0, &dd_idx, NULL);
4377 	last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
4378 
4379 	for (; logical_sector < last_sector;
4380 	     logical_sector += STRIPE_SECTORS,
4381 		     sector += STRIPE_SECTORS,
4382 		     scnt++) {
4383 
4384 		if (scnt < raid5_bi_hw_segments(raid_bio))
4385 			/* already done this stripe */
4386 			continue;
4387 
4388 		sh = get_active_stripe(conf, sector, 0, 1, 0);
4389 
4390 		if (!sh) {
4391 			/* failed to get a stripe - must wait */
4392 			raid5_set_bi_hw_segments(raid_bio, scnt);
4393 			conf->retry_read_aligned = raid_bio;
4394 			return handled;
4395 		}
4396 
4397 		set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
4398 		if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
4399 			release_stripe(sh);
4400 			raid5_set_bi_hw_segments(raid_bio, scnt);
4401 			conf->retry_read_aligned = raid_bio;
4402 			return handled;
4403 		}
4404 
4405 		handle_stripe(sh);
4406 		release_stripe(sh);
4407 		handled++;
4408 	}
4409 	spin_lock_irq(&conf->device_lock);
4410 	remaining = raid5_dec_bi_phys_segments(raid_bio);
4411 	spin_unlock_irq(&conf->device_lock);
4412 	if (remaining == 0)
4413 		bio_endio(raid_bio, 0);
4414 	if (atomic_dec_and_test(&conf->active_aligned_reads))
4415 		wake_up(&conf->wait_for_stripe);
4416 	return handled;
4417 }
4418 
4419 
4420 /*
4421  * This is our raid5 kernel thread.
4422  *
4423  * We scan the hash table for stripes which can be handled now.
4424  * During the scan, completed stripes are saved for us by the interrupt
4425  * handler, so that they will not have to wait for our next wakeup.
4426  */
4427 static void raid5d(mddev_t *mddev)
4428 {
4429 	struct stripe_head *sh;
4430 	raid5_conf_t *conf = mddev->private;
4431 	int handled;
4432 
4433 	pr_debug("+++ raid5d active\n");
4434 
4435 	md_check_recovery(mddev);
4436 
4437 	handled = 0;
4438 	spin_lock_irq(&conf->device_lock);
4439 	while (1) {
4440 		struct bio *bio;
4441 
4442 		if (conf->seq_flush != conf->seq_write) {
4443 			int seq = conf->seq_flush;
4444 			spin_unlock_irq(&conf->device_lock);
4445 			bitmap_unplug(mddev->bitmap);
4446 			spin_lock_irq(&conf->device_lock);
4447 			conf->seq_write = seq;
4448 			activate_bit_delay(conf);
4449 		}
4450 
4451 		while ((bio = remove_bio_from_retry(conf))) {
4452 			int ok;
4453 			spin_unlock_irq(&conf->device_lock);
4454 			ok = retry_aligned_read(conf, bio);
4455 			spin_lock_irq(&conf->device_lock);
4456 			if (!ok)
4457 				break;
4458 			handled++;
4459 		}
4460 
4461 		sh = __get_priority_stripe(conf);
4462 
4463 		if (!sh)
4464 			break;
4465 		spin_unlock_irq(&conf->device_lock);
4466 
4467 		handled++;
4468 		handle_stripe(sh);
4469 		release_stripe(sh);
4470 		cond_resched();
4471 
4472 		spin_lock_irq(&conf->device_lock);
4473 	}
4474 	pr_debug("%d stripes handled\n", handled);
4475 
4476 	spin_unlock_irq(&conf->device_lock);
4477 
4478 	async_tx_issue_pending_all();
4479 	unplug_slaves(mddev);
4480 
4481 	pr_debug("--- raid5d inactive\n");
4482 }
4483 
4484 static ssize_t
4485 raid5_show_stripe_cache_size(mddev_t *mddev, char *page)
4486 {
4487 	raid5_conf_t *conf = mddev->private;
4488 	if (conf)
4489 		return sprintf(page, "%d\n", conf->max_nr_stripes);
4490 	else
4491 		return 0;
4492 }
4493 
4494 static ssize_t
4495 raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
4496 {
4497 	raid5_conf_t *conf = mddev->private;
4498 	unsigned long new;
4499 	int err;
4500 
4501 	if (len >= PAGE_SIZE)
4502 		return -EINVAL;
4503 	if (!conf)
4504 		return -ENODEV;
4505 
4506 	if (strict_strtoul(page, 10, &new))
4507 		return -EINVAL;
4508 	if (new <= 16 || new > 32768)
4509 		return -EINVAL;
4510 	while (new < conf->max_nr_stripes) {
4511 		if (drop_one_stripe(conf))
4512 			conf->max_nr_stripes--;
4513 		else
4514 			break;
4515 	}
4516 	err = md_allow_write(mddev);
4517 	if (err)
4518 		return err;
4519 	while (new > conf->max_nr_stripes) {
4520 		if (grow_one_stripe(conf))
4521 			conf->max_nr_stripes++;
4522 		else break;
4523 	}
4524 	return len;
4525 }
4526 
4527 static struct md_sysfs_entry
4528 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
4529 				raid5_show_stripe_cache_size,
4530 				raid5_store_stripe_cache_size);
4531 
4532 static ssize_t
4533 raid5_show_preread_threshold(mddev_t *mddev, char *page)
4534 {
4535 	raid5_conf_t *conf = mddev->private;
4536 	if (conf)
4537 		return sprintf(page, "%d\n", conf->bypass_threshold);
4538 	else
4539 		return 0;
4540 }
4541 
4542 static ssize_t
4543 raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len)
4544 {
4545 	raid5_conf_t *conf = mddev->private;
4546 	unsigned long new;
4547 	if (len >= PAGE_SIZE)
4548 		return -EINVAL;
4549 	if (!conf)
4550 		return -ENODEV;
4551 
4552 	if (strict_strtoul(page, 10, &new))
4553 		return -EINVAL;
4554 	if (new > conf->max_nr_stripes)
4555 		return -EINVAL;
4556 	conf->bypass_threshold = new;
4557 	return len;
4558 }
4559 
4560 static struct md_sysfs_entry
4561 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
4562 					S_IRUGO | S_IWUSR,
4563 					raid5_show_preread_threshold,
4564 					raid5_store_preread_threshold);
4565 
4566 static ssize_t
4567 stripe_cache_active_show(mddev_t *mddev, char *page)
4568 {
4569 	raid5_conf_t *conf = mddev->private;
4570 	if (conf)
4571 		return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
4572 	else
4573 		return 0;
4574 }
4575 
4576 static struct md_sysfs_entry
4577 raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
4578 
4579 static struct attribute *raid5_attrs[] =  {
4580 	&raid5_stripecache_size.attr,
4581 	&raid5_stripecache_active.attr,
4582 	&raid5_preread_bypass_threshold.attr,
4583 	NULL,
4584 };
4585 static struct attribute_group raid5_attrs_group = {
4586 	.name = NULL,
4587 	.attrs = raid5_attrs,
4588 };
4589 
4590 static sector_t
4591 raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
4592 {
4593 	raid5_conf_t *conf = mddev->private;
4594 
4595 	if (!sectors)
4596 		sectors = mddev->dev_sectors;
4597 	if (!raid_disks)
4598 		/* size is defined by the smallest of previous and new size */
4599 		raid_disks = min(conf->raid_disks, conf->previous_raid_disks);
4600 
4601 	sectors &= ~((sector_t)mddev->chunk_sectors - 1);
4602 	sectors &= ~((sector_t)mddev->new_chunk_sectors - 1);
4603 	return sectors * (raid_disks - conf->max_degraded);
4604 }
4605 
4606 static void raid5_free_percpu(raid5_conf_t *conf)
4607 {
4608 	struct raid5_percpu *percpu;
4609 	unsigned long cpu;
4610 
4611 	if (!conf->percpu)
4612 		return;
4613 
4614 	get_online_cpus();
4615 	for_each_possible_cpu(cpu) {
4616 		percpu = per_cpu_ptr(conf->percpu, cpu);
4617 		safe_put_page(percpu->spare_page);
4618 		kfree(percpu->scribble);
4619 	}
4620 #ifdef CONFIG_HOTPLUG_CPU
4621 	unregister_cpu_notifier(&conf->cpu_notify);
4622 #endif
4623 	put_online_cpus();
4624 
4625 	free_percpu(conf->percpu);
4626 }
4627 
4628 static void free_conf(raid5_conf_t *conf)
4629 {
4630 	shrink_stripes(conf);
4631 	raid5_free_percpu(conf);
4632 	kfree(conf->disks);
4633 	kfree(conf->stripe_hashtbl);
4634 	kfree(conf);
4635 }
4636 
#ifdef CONFIG_HOTPLUG_CPU
/*
 * raid456_cpu_notify - cpu hotplug callback for the per-cpu buffers.
 *
 * CPU_UP_PREPARE(_FROZEN): allocate the scribble buffer, and for level 6
 *	also the spare page, for the incoming cpu unless already present.
 *	On allocation failure whatever was obtained is released and
 *	NOTIFY_BAD is returned.
 * CPU_DEAD(_FROZEN): release the buffers of the departed cpu.
 *
 * Whenever a buffer is released here its per-cpu pointer is reset to
 * NULL.  raid5_free_percpu() frees the buffers of every possible cpu and
 * the CPU_DEAD case frees them too, so a stale pointer left behind by a
 * failed CPU_UP_PREPARE would be freed a second time, and a later
 * CPU_UP_PREPARE would mistake it for a valid buffer.
 *
 * Returns NOTIFY_OK, or NOTIFY_BAD when allocation failed.
 */
static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
			      void *hcpu)
{
	raid5_conf_t *conf = container_of(nfb, raid5_conf_t, cpu_notify);
	long cpu = (long)hcpu;
	struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);

	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		if (conf->level == 6 && !percpu->spare_page)
			percpu->spare_page = alloc_page(GFP_KERNEL);
		if (!percpu->scribble)
			percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);

		if (!percpu->scribble ||
		    (conf->level == 6 && !percpu->spare_page)) {
			safe_put_page(percpu->spare_page);
			kfree(percpu->scribble);
			/* do not leave dangling pointers behind */
			percpu->spare_page = NULL;
			percpu->scribble = NULL;
			pr_err("%s: failed memory allocation for cpu%ld\n",
			       __func__, cpu);
			return NOTIFY_BAD;
		}
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		safe_put_page(percpu->spare_page);
		kfree(percpu->scribble);
		percpu->spare_page = NULL;
		percpu->scribble = NULL;
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}
#endif
4675 
4676 static int raid5_alloc_percpu(raid5_conf_t *conf)
4677 {
4678 	unsigned long cpu;
4679 	struct page *spare_page;
4680 	struct raid5_percpu __percpu *allcpus;
4681 	void *scribble;
4682 	int err;
4683 
4684 	allcpus = alloc_percpu(struct raid5_percpu);
4685 	if (!allcpus)
4686 		return -ENOMEM;
4687 	conf->percpu = allcpus;
4688 
4689 	get_online_cpus();
4690 	err = 0;
4691 	for_each_present_cpu(cpu) {
4692 		if (conf->level == 6) {
4693 			spare_page = alloc_page(GFP_KERNEL);
4694 			if (!spare_page) {
4695 				err = -ENOMEM;
4696 				break;
4697 			}
4698 			per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
4699 		}
4700 		scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
4701 		if (!scribble) {
4702 			err = -ENOMEM;
4703 			break;
4704 		}
4705 		per_cpu_ptr(conf->percpu, cpu)->scribble = scribble;
4706 	}
4707 #ifdef CONFIG_HOTPLUG_CPU
4708 	conf->cpu_notify.notifier_call = raid456_cpu_notify;
4709 	conf->cpu_notify.priority = 0;
4710 	if (err == 0)
4711 		err = register_cpu_notifier(&conf->cpu_notify);
4712 #endif
4713 	put_online_cpus();
4714 
4715 	return err;
4716 }
4717 
4718 static raid5_conf_t *setup_conf(mddev_t *mddev)
4719 {
4720 	raid5_conf_t *conf;
4721 	int raid_disk, memory, max_disks;
4722 	mdk_rdev_t *rdev;
4723 	struct disk_info *disk;
4724 
4725 	if (mddev->new_level != 5
4726 	    && mddev->new_level != 4
4727 	    && mddev->new_level != 6) {
4728 		printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n",
4729 		       mdname(mddev), mddev->new_level);
4730 		return ERR_PTR(-EIO);
4731 	}
4732 	if ((mddev->new_level == 5
4733 	     && !algorithm_valid_raid5(mddev->new_layout)) ||
4734 	    (mddev->new_level == 6
4735 	     && !algorithm_valid_raid6(mddev->new_layout))) {
4736 		printk(KERN_ERR "raid5: %s: layout %d not supported\n",
4737 		       mdname(mddev), mddev->new_layout);
4738 		return ERR_PTR(-EIO);
4739 	}
4740 	if (mddev->new_level == 6 && mddev->raid_disks < 4) {
4741 		printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n",
4742 		       mdname(mddev), mddev->raid_disks);
4743 		return ERR_PTR(-EINVAL);
4744 	}
4745 
4746 	if (!mddev->new_chunk_sectors ||
4747 	    (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
4748 	    !is_power_of_2(mddev->new_chunk_sectors)) {
4749 		printk(KERN_ERR "raid5: invalid chunk size %d for %s\n",
4750 		       mddev->new_chunk_sectors << 9, mdname(mddev));
4751 		return ERR_PTR(-EINVAL);
4752 	}
4753 
4754 	conf = kzalloc(sizeof(raid5_conf_t), GFP_KERNEL);
4755 	if (conf == NULL)
4756 		goto abort;
4757 	spin_lock_init(&conf->device_lock);
4758 	init_waitqueue_head(&conf->wait_for_stripe);
4759 	init_waitqueue_head(&conf->wait_for_overlap);
4760 	INIT_LIST_HEAD(&conf->handle_list);
4761 	INIT_LIST_HEAD(&conf->hold_list);
4762 	INIT_LIST_HEAD(&conf->delayed_list);
4763 	INIT_LIST_HEAD(&conf->bitmap_list);
4764 	INIT_LIST_HEAD(&conf->inactive_list);
4765 	atomic_set(&conf->active_stripes, 0);
4766 	atomic_set(&conf->preread_active_stripes, 0);
4767 	atomic_set(&conf->active_aligned_reads, 0);
4768 	conf->bypass_threshold = BYPASS_THRESHOLD;
4769 
4770 	conf->raid_disks = mddev->raid_disks;
4771 	if (mddev->reshape_position == MaxSector)
4772 		conf->previous_raid_disks = mddev->raid_disks;
4773 	else
4774 		conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
4775 	max_disks = max(conf->raid_disks, conf->previous_raid_disks);
4776 	conf->scribble_len = scribble_len(max_disks);
4777 
4778 	conf->disks = kzalloc(max_disks * sizeof(struct disk_info),
4779 			      GFP_KERNEL);
4780 	if (!conf->disks)
4781 		goto abort;
4782 
4783 	conf->mddev = mddev;
4784 
4785 	if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
4786 		goto abort;
4787 
4788 	conf->level = mddev->new_level;
4789 	if (raid5_alloc_percpu(conf) != 0)
4790 		goto abort;
4791 
4792 	pr_debug("raid5: run(%s) called.\n", mdname(mddev));
4793 
4794 	list_for_each_entry(rdev, &mddev->disks, same_set) {
4795 		raid_disk = rdev->raid_disk;
4796 		if (raid_disk >= max_disks
4797 		    || raid_disk < 0)
4798 			continue;
4799 		disk = conf->disks + raid_disk;
4800 
4801 		disk->rdev = rdev;
4802 
4803 		if (test_bit(In_sync, &rdev->flags)) {
4804 			char b[BDEVNAME_SIZE];
4805 			printk(KERN_INFO "raid5: device %s operational as raid"
4806 				" disk %d\n", bdevname(rdev->bdev,b),
4807 				raid_disk);
4808 		} else
4809 			/* Cannot rely on bitmap to complete recovery */
4810 			conf->fullsync = 1;
4811 	}
4812 
4813 	conf->chunk_sectors = mddev->new_chunk_sectors;
4814 	conf->level = mddev->new_level;
4815 	if (conf->level == 6)
4816 		conf->max_degraded = 2;
4817 	else
4818 		conf->max_degraded = 1;
4819 	conf->algorithm = mddev->new_layout;
4820 	conf->max_nr_stripes = NR_STRIPES;
4821 	conf->reshape_progress = mddev->reshape_position;
4822 	if (conf->reshape_progress != MaxSector) {
4823 		conf->prev_chunk_sectors = mddev->chunk_sectors;
4824 		conf->prev_algo = mddev->layout;
4825 	}
4826 
4827 	memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
4828 		 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
4829 	if (grow_stripes(conf, conf->max_nr_stripes)) {
4830 		printk(KERN_ERR
4831 			"raid5: couldn't allocate %dkB for buffers\n", memory);
4832 		goto abort;
4833 	} else
4834 		printk(KERN_INFO "raid5: allocated %dkB for %s\n",
4835 			memory, mdname(mddev));
4836 
4837 	conf->thread = md_register_thread(raid5d, mddev, NULL);
4838 	if (!conf->thread) {
4839 		printk(KERN_ERR
4840 		       "raid5: couldn't allocate thread for %s\n",
4841 		       mdname(mddev));
4842 		goto abort;
4843 	}
4844 
4845 	return conf;
4846 
4847  abort:
4848 	if (conf) {
4849 		free_conf(conf);
4850 		return ERR_PTR(-EIO);
4851 	} else
4852 		return ERR_PTR(-ENOMEM);
4853 }
4854 
4855 
4856 static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
4857 {
4858 	switch (algo) {
4859 	case ALGORITHM_PARITY_0:
4860 		if (raid_disk < max_degraded)
4861 			return 1;
4862 		break;
4863 	case ALGORITHM_PARITY_N:
4864 		if (raid_disk >= raid_disks - max_degraded)
4865 			return 1;
4866 		break;
4867 	case ALGORITHM_PARITY_0_6:
4868 		if (raid_disk == 0 ||
4869 		    raid_disk == raid_disks - 1)
4870 			return 1;
4871 		break;
4872 	case ALGORITHM_LEFT_ASYMMETRIC_6:
4873 	case ALGORITHM_RIGHT_ASYMMETRIC_6:
4874 	case ALGORITHM_LEFT_SYMMETRIC_6:
4875 	case ALGORITHM_RIGHT_SYMMETRIC_6:
4876 		if (raid_disk == raid_disks - 1)
4877 			return 1;
4878 	}
4879 	return 0;
4880 }
4881 
4882 static int run(mddev_t *mddev)
4883 {
4884 	raid5_conf_t *conf;
4885 	int working_disks = 0, chunk_size;
4886 	int dirty_parity_disks = 0;
4887 	mdk_rdev_t *rdev;
4888 	sector_t reshape_offset = 0;
4889 
4890 	if (mddev->recovery_cp != MaxSector)
4891 		printk(KERN_NOTICE "raid5: %s is not clean"
4892 		       " -- starting background reconstruction\n",
4893 		       mdname(mddev));
4894 	if (mddev->reshape_position != MaxSector) {
4895 		/* Check that we can continue the reshape.
4896 		 * Currently only disks can change, it must
4897 		 * increase, and we must be past the point where
4898 		 * a stripe over-writes itself
4899 		 */
4900 		sector_t here_new, here_old;
4901 		int old_disks;
4902 		int max_degraded = (mddev->level == 6 ? 2 : 1);
4903 
4904 		if (mddev->new_level != mddev->level) {
4905 			printk(KERN_ERR "raid5: %s: unsupported reshape "
4906 			       "required - aborting.\n",
4907 			       mdname(mddev));
4908 			return -EINVAL;
4909 		}
4910 		old_disks = mddev->raid_disks - mddev->delta_disks;
4911 		/* reshape_position must be on a new-stripe boundary, and one
4912 		 * further up in new geometry must map after here in old
4913 		 * geometry.
4914 		 */
4915 		here_new = mddev->reshape_position;
4916 		if (sector_div(here_new, mddev->new_chunk_sectors *
4917 			       (mddev->raid_disks - max_degraded))) {
4918 			printk(KERN_ERR "raid5: reshape_position not "
4919 			       "on a stripe boundary\n");
4920 			return -EINVAL;
4921 		}
4922 		reshape_offset = here_new * mddev->new_chunk_sectors;
4923 		/* here_new is the stripe we will write to */
4924 		here_old = mddev->reshape_position;
4925 		sector_div(here_old, mddev->chunk_sectors *
4926 			   (old_disks-max_degraded));
4927 		/* here_old is the first stripe that we might need to read
4928 		 * from */
4929 		if (mddev->delta_disks == 0) {
4930 			/* We cannot be sure it is safe to start an in-place
4931 			 * reshape.  It is only safe if user-space if monitoring
4932 			 * and taking constant backups.
4933 			 * mdadm always starts a situation like this in
4934 			 * readonly mode so it can take control before
4935 			 * allowing any writes.  So just check for that.
4936 			 */
4937 			if ((here_new * mddev->new_chunk_sectors !=
4938 			     here_old * mddev->chunk_sectors) ||
4939 			    mddev->ro == 0) {
4940 				printk(KERN_ERR "raid5: in-place reshape must be started"
4941 				       " in read-only mode - aborting\n");
4942 				return -EINVAL;
4943 			}
4944 		} else if (mddev->delta_disks < 0
4945 		    ? (here_new * mddev->new_chunk_sectors <=
4946 		       here_old * mddev->chunk_sectors)
4947 		    : (here_new * mddev->new_chunk_sectors >=
4948 		       here_old * mddev->chunk_sectors)) {
4949 			/* Reading from the same stripe as writing to - bad */
4950 			printk(KERN_ERR "raid5: reshape_position too early for "
4951 			       "auto-recovery - aborting.\n");
4952 			return -EINVAL;
4953 		}
4954 		printk(KERN_INFO "raid5: reshape will continue\n");
4955 		/* OK, we should be able to continue; */
4956 	} else {
4957 		BUG_ON(mddev->level != mddev->new_level);
4958 		BUG_ON(mddev->layout != mddev->new_layout);
4959 		BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors);
4960 		BUG_ON(mddev->delta_disks != 0);
4961 	}
4962 
4963 	if (mddev->private == NULL)
4964 		conf = setup_conf(mddev);
4965 	else
4966 		conf = mddev->private;
4967 
4968 	if (IS_ERR(conf))
4969 		return PTR_ERR(conf);
4970 
4971 	mddev->thread = conf->thread;
4972 	conf->thread = NULL;
4973 	mddev->private = conf;
4974 
4975 	/*
4976 	 * 0 for a fully functional array, 1 or 2 for a degraded array.
4977 	 */
4978 	list_for_each_entry(rdev, &mddev->disks, same_set) {
4979 		if (rdev->raid_disk < 0)
4980 			continue;
4981 		if (test_bit(In_sync, &rdev->flags))
4982 			working_disks++;
4983 		/* This disc is not fully in-sync.  However if it
4984 		 * just stored parity (beyond the recovery_offset),
4985 		 * when we don't need to be concerned about the
4986 		 * array being dirty.
4987 		 * When reshape goes 'backwards', we never have
4988 		 * partially completed devices, so we only need
4989 		 * to worry about reshape going forwards.
4990 		 */
4991 		/* Hack because v0.91 doesn't store recovery_offset properly. */
4992 		if (mddev->major_version == 0 &&
4993 		    mddev->minor_version > 90)
4994 			rdev->recovery_offset = reshape_offset;
4995 
4996 		printk("%d: w=%d pa=%d pr=%d m=%d a=%d r=%d op1=%d op2=%d\n",
4997 		       rdev->raid_disk, working_disks, conf->prev_algo,
4998 		       conf->previous_raid_disks, conf->max_degraded,
4999 		       conf->algorithm, conf->raid_disks,
5000 		       only_parity(rdev->raid_disk,
5001 				   conf->prev_algo,
5002 				   conf->previous_raid_disks,
5003 				   conf->max_degraded),
5004 		       only_parity(rdev->raid_disk,
5005 				   conf->algorithm,
5006 				   conf->raid_disks,
5007 				   conf->max_degraded));
5008 		if (rdev->recovery_offset < reshape_offset) {
5009 			/* We need to check old and new layout */
5010 			if (!only_parity(rdev->raid_disk,
5011 					 conf->algorithm,
5012 					 conf->raid_disks,
5013 					 conf->max_degraded))
5014 				continue;
5015 		}
5016 		if (!only_parity(rdev->raid_disk,
5017 				 conf->prev_algo,
5018 				 conf->previous_raid_disks,
5019 				 conf->max_degraded))
5020 			continue;
5021 		dirty_parity_disks++;
5022 	}
5023 
5024 	mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks)
5025 			   - working_disks);
5026 
5027 	if (mddev->degraded > conf->max_degraded) {
5028 		printk(KERN_ERR "raid5: not enough operational devices for %s"
5029 			" (%d/%d failed)\n",
5030 			mdname(mddev), mddev->degraded, conf->raid_disks);
5031 		goto abort;
5032 	}
5033 
5034 	/* device size must be a multiple of chunk size */
5035 	mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);
5036 	mddev->resync_max_sectors = mddev->dev_sectors;
5037 
5038 	if (mddev->degraded > dirty_parity_disks &&
5039 	    mddev->recovery_cp != MaxSector) {
5040 		if (mddev->ok_start_degraded)
5041 			printk(KERN_WARNING
5042 			       "raid5: starting dirty degraded array: %s"
5043 			       "- data corruption possible.\n",
5044 			       mdname(mddev));
5045 		else {
5046 			printk(KERN_ERR
5047 			       "raid5: cannot start dirty degraded array for %s\n",
5048 			       mdname(mddev));
5049 			goto abort;
5050 		}
5051 	}
5052 
5053 	if (mddev->degraded == 0)
5054 		printk("raid5: raid level %d set %s active with %d out of %d"
5055 		       " devices, algorithm %d\n", conf->level, mdname(mddev),
5056 		       mddev->raid_disks-mddev->degraded, mddev->raid_disks,
5057 		       mddev->new_layout);
5058 	else
5059 		printk(KERN_ALERT "raid5: raid level %d set %s active with %d"
5060 			" out of %d devices, algorithm %d\n", conf->level,
5061 			mdname(mddev), mddev->raid_disks - mddev->degraded,
5062 			mddev->raid_disks, mddev->new_layout);
5063 
5064 	print_raid5_conf(conf);
5065 
5066 	if (conf->reshape_progress != MaxSector) {
5067 		printk("...ok start reshape thread\n");
5068 		conf->reshape_safe = conf->reshape_progress;
5069 		atomic_set(&conf->reshape_stripes, 0);
5070 		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5071 		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
5072 		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
5073 		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
5074 		mddev->sync_thread = md_register_thread(md_do_sync, mddev,
5075 							"reshape");
5076 	}
5077 
5078 	/* read-ahead size must cover two whole stripes, which is
5079 	 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
5080 	 */
5081 	{
5082 		int data_disks = conf->previous_raid_disks - conf->max_degraded;
5083 		int stripe = data_disks *
5084 			((mddev->chunk_sectors << 9) / PAGE_SIZE);
5085 		if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
5086 			mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
5087 	}
5088 
5089 	/* Ok, everything is just fine now */
5090 	if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
5091 		printk(KERN_WARNING
5092 		       "raid5: failed to create sysfs attributes for %s\n",
5093 		       mdname(mddev));
5094 
5095 	mddev->queue->queue_lock = &conf->device_lock;
5096 
5097 	mddev->queue->unplug_fn = raid5_unplug_device;
5098 	mddev->queue->backing_dev_info.congested_data = mddev;
5099 	mddev->queue->backing_dev_info.congested_fn = raid5_congested;
5100 
5101 	md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
5102 
5103 	blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
5104 	chunk_size = mddev->chunk_sectors << 9;
5105 	blk_queue_io_min(mddev->queue, chunk_size);
5106 	blk_queue_io_opt(mddev->queue, chunk_size *
5107 			 (conf->raid_disks - conf->max_degraded));
5108 
5109 	list_for_each_entry(rdev, &mddev->disks, same_set)
5110 		disk_stack_limits(mddev->gendisk, rdev->bdev,
5111 				  rdev->data_offset << 9);
5112 
5113 	return 0;
5114 abort:
5115 	md_unregister_thread(mddev->thread);
5116 	mddev->thread = NULL;
5117 	if (conf) {
5118 		print_raid5_conf(conf);
5119 		free_conf(conf);
5120 	}
5121 	mddev->private = NULL;
5122 	printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev));
5123 	return -EIO;
5124 }
5125 
5126 
5127 
5128 static int stop(mddev_t *mddev)
5129 {
5130 	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
5131 
5132 	md_unregister_thread(mddev->thread);
5133 	mddev->thread = NULL;
5134 	mddev->queue->backing_dev_info.congested_fn = NULL;
5135 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
5136 	free_conf(conf);
5137 	mddev->private = &raid5_attrs_group;
5138 	return 0;
5139 }
5140 
5141 #ifdef DEBUG
5142 static void print_sh(struct seq_file *seq, struct stripe_head *sh)
5143 {
5144 	int i;
5145 
5146 	seq_printf(seq, "sh %llu, pd_idx %d, state %ld.\n",
5147 		   (unsigned long long)sh->sector, sh->pd_idx, sh->state);
5148 	seq_printf(seq, "sh %llu,  count %d.\n",
5149 		   (unsigned long long)sh->sector, atomic_read(&sh->count));
5150 	seq_printf(seq, "sh %llu, ", (unsigned long long)sh->sector);
5151 	for (i = 0; i < sh->disks; i++) {
5152 		seq_printf(seq, "(cache%d: %p %ld) ",
5153 			   i, sh->dev[i].page, sh->dev[i].flags);
5154 	}
5155 	seq_printf(seq, "\n");
5156 }
5157 
5158 static void printall(struct seq_file *seq, raid5_conf_t *conf)
5159 {
5160 	struct stripe_head *sh;
5161 	struct hlist_node *hn;
5162 	int i;
5163 
5164 	spin_lock_irq(&conf->device_lock);
5165 	for (i = 0; i < NR_HASH; i++) {
5166 		hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) {
5167 			if (sh->raid_conf != conf)
5168 				continue;
5169 			print_sh(seq, sh);
5170 		}
5171 	}
5172 	spin_unlock_irq(&conf->device_lock);
5173 }
5174 #endif
5175 
5176 static void status(struct seq_file *seq, mddev_t *mddev)
5177 {
5178 	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
5179 	int i;
5180 
5181 	seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
5182 		mddev->chunk_sectors / 2, mddev->layout);
5183 	seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
5184 	for (i = 0; i < conf->raid_disks; i++)
5185 		seq_printf (seq, "%s",
5186 			       conf->disks[i].rdev &&
5187 			       test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
5188 	seq_printf (seq, "]");
5189 #ifdef DEBUG
5190 	seq_printf (seq, "\n");
5191 	printall(seq, conf);
5192 #endif
5193 }
5194 
5195 static void print_raid5_conf (raid5_conf_t *conf)
5196 {
5197 	int i;
5198 	struct disk_info *tmp;
5199 
5200 	printk("RAID5 conf printout:\n");
5201 	if (!conf) {
5202 		printk("(conf==NULL)\n");
5203 		return;
5204 	}
5205 	printk(" --- rd:%d wd:%d\n", conf->raid_disks,
5206 		 conf->raid_disks - conf->mddev->degraded);
5207 
5208 	for (i = 0; i < conf->raid_disks; i++) {
5209 		char b[BDEVNAME_SIZE];
5210 		tmp = conf->disks + i;
5211 		if (tmp->rdev)
5212 		printk(" disk %d, o:%d, dev:%s\n",
5213 			i, !test_bit(Faulty, &tmp->rdev->flags),
5214 			bdevname(tmp->rdev->bdev,b));
5215 	}
5216 }
5217 
5218 static int raid5_spare_active(mddev_t *mddev)
5219 {
5220 	int i;
5221 	raid5_conf_t *conf = mddev->private;
5222 	struct disk_info *tmp;
5223 
5224 	for (i = 0; i < conf->raid_disks; i++) {
5225 		tmp = conf->disks + i;
5226 		if (tmp->rdev
5227 		    && !test_bit(Faulty, &tmp->rdev->flags)
5228 		    && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
5229 			unsigned long flags;
5230 			spin_lock_irqsave(&conf->device_lock, flags);
5231 			mddev->degraded--;
5232 			spin_unlock_irqrestore(&conf->device_lock, flags);
5233 		}
5234 	}
5235 	print_raid5_conf(conf);
5236 	return 0;
5237 }
5238 
5239 static int raid5_remove_disk(mddev_t *mddev, int number)
5240 {
5241 	raid5_conf_t *conf = mddev->private;
5242 	int err = 0;
5243 	mdk_rdev_t *rdev;
5244 	struct disk_info *p = conf->disks + number;
5245 
5246 	print_raid5_conf(conf);
5247 	rdev = p->rdev;
5248 	if (rdev) {
5249 		if (number >= conf->raid_disks &&
5250 		    conf->reshape_progress == MaxSector)
5251 			clear_bit(In_sync, &rdev->flags);
5252 
5253 		if (test_bit(In_sync, &rdev->flags) ||
5254 		    atomic_read(&rdev->nr_pending)) {
5255 			err = -EBUSY;
5256 			goto abort;
5257 		}
5258 		/* Only remove non-faulty devices if recovery
5259 		 * isn't possible.
5260 		 */
5261 		if (!test_bit(Faulty, &rdev->flags) &&
5262 		    mddev->degraded <= conf->max_degraded &&
5263 		    number < conf->raid_disks) {
5264 			err = -EBUSY;
5265 			goto abort;
5266 		}
5267 		p->rdev = NULL;
5268 		synchronize_rcu();
5269 		if (atomic_read(&rdev->nr_pending)) {
5270 			/* lost the race, try later */
5271 			err = -EBUSY;
5272 			p->rdev = rdev;
5273 		}
5274 	}
5275 abort:
5276 
5277 	print_raid5_conf(conf);
5278 	return err;
5279 }
5280 
5281 static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
5282 {
5283 	raid5_conf_t *conf = mddev->private;
5284 	int err = -EEXIST;
5285 	int disk;
5286 	struct disk_info *p;
5287 	int first = 0;
5288 	int last = conf->raid_disks - 1;
5289 
5290 	if (mddev->degraded > conf->max_degraded)
5291 		/* no point adding a device */
5292 		return -EINVAL;
5293 
5294 	if (rdev->raid_disk >= 0)
5295 		first = last = rdev->raid_disk;
5296 
5297 	/*
5298 	 * find the disk ... but prefer rdev->saved_raid_disk
5299 	 * if possible.
5300 	 */
5301 	if (rdev->saved_raid_disk >= 0 &&
5302 	    rdev->saved_raid_disk >= first &&
5303 	    conf->disks[rdev->saved_raid_disk].rdev == NULL)
5304 		disk = rdev->saved_raid_disk;
5305 	else
5306 		disk = first;
5307 	for ( ; disk <= last ; disk++)
5308 		if ((p=conf->disks + disk)->rdev == NULL) {
5309 			clear_bit(In_sync, &rdev->flags);
5310 			rdev->raid_disk = disk;
5311 			err = 0;
5312 			if (rdev->saved_raid_disk != disk)
5313 				conf->fullsync = 1;
5314 			rcu_assign_pointer(p->rdev, rdev);
5315 			break;
5316 		}
5317 	print_raid5_conf(conf);
5318 	return err;
5319 }
5320 
5321 static int raid5_resize(mddev_t *mddev, sector_t sectors)
5322 {
5323 	/* no resync is happening, and there is enough space
5324 	 * on all devices, so we can resize.
5325 	 * We need to make sure resync covers any new space.
5326 	 * If the array is shrinking we should possibly wait until
5327 	 * any io in the removed space completes, but it hardly seems
5328 	 * worth it.
5329 	 */
5330 	sectors &= ~((sector_t)mddev->chunk_sectors - 1);
5331 	md_set_array_sectors(mddev, raid5_size(mddev, sectors,
5332 					       mddev->raid_disks));
5333 	if (mddev->array_sectors >
5334 	    raid5_size(mddev, sectors, mddev->raid_disks))
5335 		return -EINVAL;
5336 	set_capacity(mddev->gendisk, mddev->array_sectors);
5337 	mddev->changed = 1;
5338 	revalidate_disk(mddev->gendisk);
5339 	if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) {
5340 		mddev->recovery_cp = mddev->dev_sectors;
5341 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5342 	}
5343 	mddev->dev_sectors = sectors;
5344 	mddev->resync_max_sectors = sectors;
5345 	return 0;
5346 }
5347 
5348 static int check_stripe_cache(mddev_t *mddev)
5349 {
5350 	/* Can only proceed if there are plenty of stripe_heads.
5351 	 * We need a minimum of one full stripe,, and for sensible progress
5352 	 * it is best to have about 4 times that.
5353 	 * If we require 4 times, then the default 256 4K stripe_heads will
5354 	 * allow for chunk sizes up to 256K, which is probably OK.
5355 	 * If the chunk size is greater, user-space should request more
5356 	 * stripe_heads first.
5357 	 */
5358 	raid5_conf_t *conf = mddev->private;
5359 	if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
5360 	    > conf->max_nr_stripes ||
5361 	    ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
5362 	    > conf->max_nr_stripes) {
5363 		printk(KERN_WARNING "raid5: reshape: not enough stripes.  Needed %lu\n",
5364 		       ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
5365 			/ STRIPE_SIZE)*4);
5366 		return 0;
5367 	}
5368 	return 1;
5369 }
5370 
5371 static int check_reshape(mddev_t *mddev)
5372 {
5373 	raid5_conf_t *conf = mddev->private;
5374 
5375 	if (mddev->delta_disks == 0 &&
5376 	    mddev->new_layout == mddev->layout &&
5377 	    mddev->new_chunk_sectors == mddev->chunk_sectors)
5378 		return 0; /* nothing to do */
5379 	if (mddev->bitmap)
5380 		/* Cannot grow a bitmap yet */
5381 		return -EBUSY;
5382 	if (mddev->degraded > conf->max_degraded)
5383 		return -EINVAL;
5384 	if (mddev->delta_disks < 0) {
5385 		/* We might be able to shrink, but the devices must
5386 		 * be made bigger first.
5387 		 * For raid6, 4 is the minimum size.
5388 		 * Otherwise 2 is the minimum
5389 		 */
5390 		int min = 2;
5391 		if (mddev->level == 6)
5392 			min = 4;
5393 		if (mddev->raid_disks + mddev->delta_disks < min)
5394 			return -EINVAL;
5395 	}
5396 
5397 	if (!check_stripe_cache(mddev))
5398 		return -ENOSPC;
5399 
5400 	return resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
5401 }
5402 
5403 static int raid5_start_reshape(mddev_t *mddev)
5404 {
5405 	raid5_conf_t *conf = mddev->private;
5406 	mdk_rdev_t *rdev;
5407 	int spares = 0;
5408 	int added_devices = 0;
5409 	unsigned long flags;
5410 
5411 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5412 		return -EBUSY;
5413 
5414 	if (!check_stripe_cache(mddev))
5415 		return -ENOSPC;
5416 
5417 	list_for_each_entry(rdev, &mddev->disks, same_set)
5418 		if (rdev->raid_disk < 0 &&
5419 		    !test_bit(Faulty, &rdev->flags))
5420 			spares++;
5421 
5422 	if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
5423 		/* Not enough devices even to make a degraded array
5424 		 * of that size
5425 		 */
5426 		return -EINVAL;
5427 
5428 	/* Refuse to reduce size of the array.  Any reductions in
5429 	 * array size must be through explicit setting of array_size
5430 	 * attribute.
5431 	 */
5432 	if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
5433 	    < mddev->array_sectors) {
5434 		printk(KERN_ERR "md: %s: array size must be reduced "
5435 		       "before number of disks\n", mdname(mddev));
5436 		return -EINVAL;
5437 	}
5438 
5439 	atomic_set(&conf->reshape_stripes, 0);
5440 	spin_lock_irq(&conf->device_lock);
5441 	conf->previous_raid_disks = conf->raid_disks;
5442 	conf->raid_disks += mddev->delta_disks;
5443 	conf->prev_chunk_sectors = conf->chunk_sectors;
5444 	conf->chunk_sectors = mddev->new_chunk_sectors;
5445 	conf->prev_algo = conf->algorithm;
5446 	conf->algorithm = mddev->new_layout;
5447 	if (mddev->delta_disks < 0)
5448 		conf->reshape_progress = raid5_size(mddev, 0, 0);
5449 	else
5450 		conf->reshape_progress = 0;
5451 	conf->reshape_safe = conf->reshape_progress;
5452 	conf->generation++;
5453 	spin_unlock_irq(&conf->device_lock);
5454 
5455 	/* Add some new drives, as many as will fit.
5456 	 * We know there are enough to make the newly sized array work.
5457 	 */
5458 	list_for_each_entry(rdev, &mddev->disks, same_set)
5459 		if (rdev->raid_disk < 0 &&
5460 		    !test_bit(Faulty, &rdev->flags)) {
5461 			if (raid5_add_disk(mddev, rdev) == 0) {
5462 				char nm[20];
5463 				if (rdev->raid_disk >= conf->previous_raid_disks) {
5464 					set_bit(In_sync, &rdev->flags);
5465 					added_devices++;
5466 				} else
5467 					rdev->recovery_offset = 0;
5468 				sprintf(nm, "rd%d", rdev->raid_disk);
5469 				if (sysfs_create_link(&mddev->kobj,
5470 						      &rdev->kobj, nm))
5471 					printk(KERN_WARNING
5472 					       "raid5: failed to create "
5473 					       " link %s for %s\n",
5474 					       nm, mdname(mddev));
5475 			} else
5476 				break;
5477 		}
5478 
5479 	/* When a reshape changes the number of devices, ->degraded
5480 	 * is measured against the large of the pre and post number of
5481 	 * devices.*/
5482 	if (mddev->delta_disks > 0) {
5483 		spin_lock_irqsave(&conf->device_lock, flags);
5484 		mddev->degraded += (conf->raid_disks - conf->previous_raid_disks)
5485 			- added_devices;
5486 		spin_unlock_irqrestore(&conf->device_lock, flags);
5487 	}
5488 	mddev->raid_disks = conf->raid_disks;
5489 	mddev->reshape_position = conf->reshape_progress;
5490 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
5491 
5492 	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5493 	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
5494 	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
5495 	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
5496 	mddev->sync_thread = md_register_thread(md_do_sync, mddev,
5497 						"reshape");
5498 	if (!mddev->sync_thread) {
5499 		mddev->recovery = 0;
5500 		spin_lock_irq(&conf->device_lock);
5501 		mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
5502 		conf->reshape_progress = MaxSector;
5503 		spin_unlock_irq(&conf->device_lock);
5504 		return -EAGAIN;
5505 	}
5506 	conf->reshape_checkpoint = jiffies;
5507 	md_wakeup_thread(mddev->sync_thread);
5508 	md_new_event(mddev);
5509 	return 0;
5510 }
5511 
5512 /* This is called from the reshape thread and should make any
5513  * changes needed in 'conf'
5514  */
5515 static void end_reshape(raid5_conf_t *conf)
5516 {
5517 
5518 	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
5519 
5520 		spin_lock_irq(&conf->device_lock);
5521 		conf->previous_raid_disks = conf->raid_disks;
5522 		conf->reshape_progress = MaxSector;
5523 		spin_unlock_irq(&conf->device_lock);
5524 		wake_up(&conf->wait_for_overlap);
5525 
5526 		/* read-ahead size must cover two whole stripes, which is
5527 		 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
5528 		 */
5529 		{
5530 			int data_disks = conf->raid_disks - conf->max_degraded;
5531 			int stripe = data_disks * ((conf->chunk_sectors << 9)
5532 						   / PAGE_SIZE);
5533 			if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
5534 				conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
5535 		}
5536 	}
5537 }
5538 
5539 /* This is called from the raid5d thread with mddev_lock held.
5540  * It makes config changes to the device.
5541  */
5542 static void raid5_finish_reshape(mddev_t *mddev)
5543 {
5544 	raid5_conf_t *conf = mddev->private;
5545 
5546 	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
5547 
5548 		if (mddev->delta_disks > 0) {
5549 			md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
5550 			set_capacity(mddev->gendisk, mddev->array_sectors);
5551 			mddev->changed = 1;
5552 			revalidate_disk(mddev->gendisk);
5553 		} else {
5554 			int d;
5555 			mddev->degraded = conf->raid_disks;
5556 			for (d = 0; d < conf->raid_disks ; d++)
5557 				if (conf->disks[d].rdev &&
5558 				    test_bit(In_sync,
5559 					     &conf->disks[d].rdev->flags))
5560 					mddev->degraded--;
5561 			for (d = conf->raid_disks ;
5562 			     d < conf->raid_disks - mddev->delta_disks;
5563 			     d++) {
5564 				mdk_rdev_t *rdev = conf->disks[d].rdev;
5565 				if (rdev && raid5_remove_disk(mddev, d) == 0) {
5566 					char nm[20];
5567 					sprintf(nm, "rd%d", rdev->raid_disk);
5568 					sysfs_remove_link(&mddev->kobj, nm);
5569 					rdev->raid_disk = -1;
5570 				}
5571 			}
5572 		}
5573 		mddev->layout = conf->algorithm;
5574 		mddev->chunk_sectors = conf->chunk_sectors;
5575 		mddev->reshape_position = MaxSector;
5576 		mddev->delta_disks = 0;
5577 	}
5578 }
5579 
5580 static void raid5_quiesce(mddev_t *mddev, int state)
5581 {
5582 	raid5_conf_t *conf = mddev->private;
5583 
5584 	switch(state) {
5585 	case 2: /* resume for a suspend */
5586 		wake_up(&conf->wait_for_overlap);
5587 		break;
5588 
5589 	case 1: /* stop all writes */
5590 		spin_lock_irq(&conf->device_lock);
5591 		/* '2' tells resync/reshape to pause so that all
5592 		 * active stripes can drain
5593 		 */
5594 		conf->quiesce = 2;
5595 		wait_event_lock_irq(conf->wait_for_stripe,
5596 				    atomic_read(&conf->active_stripes) == 0 &&
5597 				    atomic_read(&conf->active_aligned_reads) == 0,
5598 				    conf->device_lock, /* nothing */);
5599 		conf->quiesce = 1;
5600 		spin_unlock_irq(&conf->device_lock);
5601 		/* allow reshape to continue */
5602 		wake_up(&conf->wait_for_overlap);
5603 		break;
5604 
5605 	case 0: /* re-enable writes */
5606 		spin_lock_irq(&conf->device_lock);
5607 		conf->quiesce = 0;
5608 		wake_up(&conf->wait_for_stripe);
5609 		wake_up(&conf->wait_for_overlap);
5610 		spin_unlock_irq(&conf->device_lock);
5611 		break;
5612 	}
5613 }
5614 
5615 
5616 static void *raid5_takeover_raid1(mddev_t *mddev)
5617 {
5618 	int chunksect;
5619 
5620 	if (mddev->raid_disks != 2 ||
5621 	    mddev->degraded > 1)
5622 		return ERR_PTR(-EINVAL);
5623 
5624 	/* Should check if there are write-behind devices? */
5625 
5626 	chunksect = 64*2; /* 64K by default */
5627 
5628 	/* The array must be an exact multiple of chunksize */
5629 	while (chunksect && (mddev->array_sectors & (chunksect-1)))
5630 		chunksect >>= 1;
5631 
5632 	if ((chunksect<<9) < STRIPE_SIZE)
5633 		/* array size does not allow a suitable chunk size */
5634 		return ERR_PTR(-EINVAL);
5635 
5636 	mddev->new_level = 5;
5637 	mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
5638 	mddev->new_chunk_sectors = chunksect;
5639 
5640 	return setup_conf(mddev);
5641 }
5642 
5643 static void *raid5_takeover_raid6(mddev_t *mddev)
5644 {
5645 	int new_layout;
5646 
5647 	switch (mddev->layout) {
5648 	case ALGORITHM_LEFT_ASYMMETRIC_6:
5649 		new_layout = ALGORITHM_LEFT_ASYMMETRIC;
5650 		break;
5651 	case ALGORITHM_RIGHT_ASYMMETRIC_6:
5652 		new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
5653 		break;
5654 	case ALGORITHM_LEFT_SYMMETRIC_6:
5655 		new_layout = ALGORITHM_LEFT_SYMMETRIC;
5656 		break;
5657 	case ALGORITHM_RIGHT_SYMMETRIC_6:
5658 		new_layout = ALGORITHM_RIGHT_SYMMETRIC;
5659 		break;
5660 	case ALGORITHM_PARITY_0_6:
5661 		new_layout = ALGORITHM_PARITY_0;
5662 		break;
5663 	case ALGORITHM_PARITY_N:
5664 		new_layout = ALGORITHM_PARITY_N;
5665 		break;
5666 	default:
5667 		return ERR_PTR(-EINVAL);
5668 	}
5669 	mddev->new_level = 5;
5670 	mddev->new_layout = new_layout;
5671 	mddev->delta_disks = -1;
5672 	mddev->raid_disks -= 1;
5673 	return setup_conf(mddev);
5674 }
5675 
5676 
5677 static int raid5_check_reshape(mddev_t *mddev)
5678 {
5679 	/* For a 2-drive array, the layout and chunk size can be changed
5680 	 * immediately as not restriping is needed.
5681 	 * For larger arrays we record the new value - after validation
5682 	 * to be used by a reshape pass.
5683 	 */
5684 	raid5_conf_t *conf = mddev->private;
5685 	int new_chunk = mddev->new_chunk_sectors;
5686 
5687 	if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
5688 		return -EINVAL;
5689 	if (new_chunk > 0) {
5690 		if (!is_power_of_2(new_chunk))
5691 			return -EINVAL;
5692 		if (new_chunk < (PAGE_SIZE>>9))
5693 			return -EINVAL;
5694 		if (mddev->array_sectors & (new_chunk-1))
5695 			/* not factor of array size */
5696 			return -EINVAL;
5697 	}
5698 
5699 	/* They look valid */
5700 
5701 	if (mddev->raid_disks == 2) {
5702 		/* can make the change immediately */
5703 		if (mddev->new_layout >= 0) {
5704 			conf->algorithm = mddev->new_layout;
5705 			mddev->layout = mddev->new_layout;
5706 		}
5707 		if (new_chunk > 0) {
5708 			conf->chunk_sectors = new_chunk ;
5709 			mddev->chunk_sectors = new_chunk;
5710 		}
5711 		set_bit(MD_CHANGE_DEVS, &mddev->flags);
5712 		md_wakeup_thread(mddev->thread);
5713 	}
5714 	return check_reshape(mddev);
5715 }
5716 
5717 static int raid6_check_reshape(mddev_t *mddev)
5718 {
5719 	int new_chunk = mddev->new_chunk_sectors;
5720 
5721 	if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
5722 		return -EINVAL;
5723 	if (new_chunk > 0) {
5724 		if (!is_power_of_2(new_chunk))
5725 			return -EINVAL;
5726 		if (new_chunk < (PAGE_SIZE >> 9))
5727 			return -EINVAL;
5728 		if (mddev->array_sectors & (new_chunk-1))
5729 			/* not factor of array size */
5730 			return -EINVAL;
5731 	}
5732 
5733 	/* They look valid */
5734 	return check_reshape(mddev);
5735 }
5736 
5737 static void *raid5_takeover(mddev_t *mddev)
5738 {
5739 	/* raid5 can take over:
5740 	 *  raid0 - if all devices are the same - make it a raid4 layout
5741 	 *  raid1 - if there are two drives.  We need to know the chunk size
5742 	 *  raid4 - trivial - just use a raid4 layout.
5743 	 *  raid6 - Providing it is a *_6 layout
5744 	 */
5745 
5746 	if (mddev->level == 1)
5747 		return raid5_takeover_raid1(mddev);
5748 	if (mddev->level == 4) {
5749 		mddev->new_layout = ALGORITHM_PARITY_N;
5750 		mddev->new_level = 5;
5751 		return setup_conf(mddev);
5752 	}
5753 	if (mddev->level == 6)
5754 		return raid5_takeover_raid6(mddev);
5755 
5756 	return ERR_PTR(-EINVAL);
5757 }
5758 
5759 
5760 static struct mdk_personality raid5_personality;
5761 
5762 static void *raid6_takeover(mddev_t *mddev)
5763 {
5764 	/* Currently can only take over a raid5.  We map the
5765 	 * personality to an equivalent raid6 personality
5766 	 * with the Q block at the end.
5767 	 */
5768 	int new_layout;
5769 
5770 	if (mddev->pers != &raid5_personality)
5771 		return ERR_PTR(-EINVAL);
5772 	if (mddev->degraded > 1)
5773 		return ERR_PTR(-EINVAL);
5774 	if (mddev->raid_disks > 253)
5775 		return ERR_PTR(-EINVAL);
5776 	if (mddev->raid_disks < 3)
5777 		return ERR_PTR(-EINVAL);
5778 
5779 	switch (mddev->layout) {
5780 	case ALGORITHM_LEFT_ASYMMETRIC:
5781 		new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
5782 		break;
5783 	case ALGORITHM_RIGHT_ASYMMETRIC:
5784 		new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
5785 		break;
5786 	case ALGORITHM_LEFT_SYMMETRIC:
5787 		new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
5788 		break;
5789 	case ALGORITHM_RIGHT_SYMMETRIC:
5790 		new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
5791 		break;
5792 	case ALGORITHM_PARITY_0:
5793 		new_layout = ALGORITHM_PARITY_0_6;
5794 		break;
5795 	case ALGORITHM_PARITY_N:
5796 		new_layout = ALGORITHM_PARITY_N;
5797 		break;
5798 	default:
5799 		return ERR_PTR(-EINVAL);
5800 	}
5801 	mddev->new_level = 6;
5802 	mddev->new_layout = new_layout;
5803 	mddev->delta_disks = 1;
5804 	mddev->raid_disks += 1;
5805 	return setup_conf(mddev);
5806 }
5807 
5808 
5809 static struct mdk_personality raid6_personality =
5810 {
5811 	.name		= "raid6",
5812 	.level		= 6,
5813 	.owner		= THIS_MODULE,
5814 	.make_request	= make_request,
5815 	.run		= run,
5816 	.stop		= stop,
5817 	.status		= status,
5818 	.error_handler	= error,
5819 	.hot_add_disk	= raid5_add_disk,
5820 	.hot_remove_disk= raid5_remove_disk,
5821 	.spare_active	= raid5_spare_active,
5822 	.sync_request	= sync_request,
5823 	.resize		= raid5_resize,
5824 	.size		= raid5_size,
5825 	.check_reshape	= raid6_check_reshape,
5826 	.start_reshape  = raid5_start_reshape,
5827 	.finish_reshape = raid5_finish_reshape,
5828 	.quiesce	= raid5_quiesce,
5829 	.takeover	= raid6_takeover,
5830 };
5831 static struct mdk_personality raid5_personality =
5832 {
5833 	.name		= "raid5",
5834 	.level		= 5,
5835 	.owner		= THIS_MODULE,
5836 	.make_request	= make_request,
5837 	.run		= run,
5838 	.stop		= stop,
5839 	.status		= status,
5840 	.error_handler	= error,
5841 	.hot_add_disk	= raid5_add_disk,
5842 	.hot_remove_disk= raid5_remove_disk,
5843 	.spare_active	= raid5_spare_active,
5844 	.sync_request	= sync_request,
5845 	.resize		= raid5_resize,
5846 	.size		= raid5_size,
5847 	.check_reshape	= raid5_check_reshape,
5848 	.start_reshape  = raid5_start_reshape,
5849 	.finish_reshape = raid5_finish_reshape,
5850 	.quiesce	= raid5_quiesce,
5851 	.takeover	= raid5_takeover,
5852 };
5853 
5854 static struct mdk_personality raid4_personality =
5855 {
5856 	.name		= "raid4",
5857 	.level		= 4,
5858 	.owner		= THIS_MODULE,
5859 	.make_request	= make_request,
5860 	.run		= run,
5861 	.stop		= stop,
5862 	.status		= status,
5863 	.error_handler	= error,
5864 	.hot_add_disk	= raid5_add_disk,
5865 	.hot_remove_disk= raid5_remove_disk,
5866 	.spare_active	= raid5_spare_active,
5867 	.sync_request	= sync_request,
5868 	.resize		= raid5_resize,
5869 	.size		= raid5_size,
5870 	.check_reshape	= raid5_check_reshape,
5871 	.start_reshape  = raid5_start_reshape,
5872 	.finish_reshape = raid5_finish_reshape,
5873 	.quiesce	= raid5_quiesce,
5874 };
5875 
5876 static int __init raid5_init(void)
5877 {
5878 	register_md_personality(&raid6_personality);
5879 	register_md_personality(&raid5_personality);
5880 	register_md_personality(&raid4_personality);
5881 	return 0;
5882 }
5883 
5884 static void raid5_exit(void)
5885 {
5886 	unregister_md_personality(&raid6_personality);
5887 	unregister_md_personality(&raid5_personality);
5888 	unregister_md_personality(&raid4_personality);
5889 }
5890 
/* Module entry and exit points. */
module_init(raid5_init);
module_exit(raid5_exit);

/* Module metadata. */
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");

/*
 * Aliases for the raid4/raid5 personalities.
 * NOTE(review): the numeric "md-personality-N" names are presumably legacy
 * personality numbers - confirm against the md core.
 */
MODULE_ALIAS("md-personality-4"); /* RAID5 */
MODULE_ALIAS("md-raid5");
MODULE_ALIAS("md-raid4");
MODULE_ALIAS("md-level-5");
MODULE_ALIAS("md-level-4");
/* Aliases for the raid6 personality. */
MODULE_ALIAS("md-personality-8"); /* RAID6 */
MODULE_ALIAS("md-raid6");
MODULE_ALIAS("md-level-6");

/* This used to be two separate modules, they were: */
MODULE_ALIAS("raid5");
MODULE_ALIAS("raid6");
5907