xref: /linux/fs/btrfs/raid56.c (revision 4d5e3b06e1fc1428be14cd4ebe3b37c1bb34f95d)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2012 Fusion-io  All rights reserved.
4  * Copyright (C) 2012 Intel Corp. All rights reserved.
5  */
6 
7 #include <linux/sched.h>
8 #include <linux/bio.h>
9 #include <linux/slab.h>
10 #include <linux/blkdev.h>
11 #include <linux/raid/pq.h>
12 #include <linux/hash.h>
13 #include <linux/list_sort.h>
14 #include <linux/raid/xor.h>
15 #include <linux/mm.h>
16 #include "misc.h"
17 #include "ctree.h"
18 #include "disk-io.h"
19 #include "volumes.h"
20 #include "raid56.h"
21 #include "async-thread.h"
22 
/*
 * Bit numbers used in btrfs_raid_bio::flags.
 */

/* No further rbios may be merged into this one. */
#define RBIO_RMW_LOCKED_BIT	1

/*
 * The rbio is parked in the hash purely as a cache of a past RMW
 * (set by cache_rbio(), cleared by __remove_rbio_from_cache()).
 */
#define RBIO_CACHE_BIT		2

/* The contents of stripe_pages can be trusted for caching. */
#define RBIO_CACHE_READY_BIT	3

/* cache_rbio() prunes one entry once the stripe cache grows past this. */
#define RBIO_CACHE_SIZE 1024

/* The stripe hash table has (1 << BTRFS_STRIPE_HASH_TABLE_BITS) buckets. */
#define BTRFS_STRIPE_HASH_TABLE_BITS				11
40 
41 /* Used by the raid56 code to lock stripes for read/modify/write */
42 struct btrfs_stripe_hash {
43 	struct list_head hash_list;
44 	spinlock_t lock;
45 };
46 
47 /* Used by the raid56 code to lock stripes for read/modify/write */
48 struct btrfs_stripe_hash_table {
49 	struct list_head stripe_cache;
50 	spinlock_t cache_lock;
51 	int cache_size;
52 	struct btrfs_stripe_hash table[];
53 };
54 
/*
 * Describes one sector living inside a page, in the spirit of a bvec.
 *
 * No length field is needed (unlike a bvec) because every sector is exactly
 * sectorsize bytes long.
 *
 * @page:     page holding the sector, NULL when nothing is mapped
 * @pgoff:    byte offset of the sector inside @page
 * @uptodate: non-zero once the sector content is valid
 */
struct sector_ptr {
	struct page *page;
	unsigned int pgoff:24;
	unsigned int uptodate:8;
};
65 
/*
 * The kind of work an rbio carries out.  unlock_stripe() uses it to select
 * the work function for the next rbio waiting on a stripe.
 */
enum btrfs_rbio_ops {
	/* Started through rmw_work() */
	BTRFS_RBIO_WRITE,
	/* Started through read_rebuild_work() */
	BTRFS_RBIO_READ_REBUILD,
	/* Started through scrub_parity_work() */
	BTRFS_RBIO_PARITY_SCRUB,
	/* Started through read_rebuild_work() */
	BTRFS_RBIO_REBUILD_MISSING,
};
72 
73 struct btrfs_raid_bio {
74 	struct btrfs_io_context *bioc;
75 
76 	/* while we're doing rmw on a stripe
77 	 * we put it into a hash table so we can
78 	 * lock the stripe and merge more rbios
79 	 * into it.
80 	 */
81 	struct list_head hash_list;
82 
83 	/*
84 	 * LRU list for the stripe cache
85 	 */
86 	struct list_head stripe_cache;
87 
88 	/*
89 	 * for scheduling work in the helper threads
90 	 */
91 	struct work_struct work;
92 
93 	/*
94 	 * bio list and bio_list_lock are used
95 	 * to add more bios into the stripe
96 	 * in hopes of avoiding the full rmw
97 	 */
98 	struct bio_list bio_list;
99 	spinlock_t bio_list_lock;
100 
101 	/* also protected by the bio_list_lock, the
102 	 * plug list is used by the plugging code
103 	 * to collect partial bios while plugged.  The
104 	 * stripe locking code also uses it to hand off
105 	 * the stripe lock to the next pending IO
106 	 */
107 	struct list_head plug_list;
108 
109 	/*
110 	 * flags that tell us if it is safe to
111 	 * merge with this bio
112 	 */
113 	unsigned long flags;
114 
115 	/*
116 	 * set if we're doing a parity rebuild
117 	 * for a read from higher up, which is handled
118 	 * differently from a parity rebuild as part of
119 	 * rmw
120 	 */
121 	enum btrfs_rbio_ops operation;
122 
123 	/* Size of each individual stripe on disk */
124 	u32 stripe_len;
125 
126 	/* How many pages there are for the full stripe including P/Q */
127 	u16 nr_pages;
128 
129 	/* How many sectors there are for the full stripe including P/Q */
130 	u16 nr_sectors;
131 
132 	/* Number of data stripes (no p/q) */
133 	u8 nr_data;
134 
135 	/* Numer of all stripes (including P/Q) */
136 	u8 real_stripes;
137 
138 	/* How many pages there are for each stripe */
139 	u8 stripe_npages;
140 
141 	/* How many sectors there are for each stripe */
142 	u8 stripe_nsectors;
143 
144 	/* First bad stripe, -1 means no corruption */
145 	s8 faila;
146 
147 	/* Second bad stripe (for RAID6 use) */
148 	s8 failb;
149 
150 	/* Stripe number that we're scrubbing  */
151 	u8 scrubp;
152 
153 	/*
154 	 * size of all the bios in the bio_list.  This
155 	 * helps us decide if the rbio maps to a full
156 	 * stripe or not
157 	 */
158 	int bio_list_bytes;
159 
160 	int generic_bio_cnt;
161 
162 	refcount_t refs;
163 
164 	atomic_t stripes_pending;
165 
166 	atomic_t error;
167 	/*
168 	 * these are two arrays of pointers.  We allocate the
169 	 * rbio big enough to hold them both and setup their
170 	 * locations when the rbio is allocated
171 	 */
172 
173 	/* pointers to pages that we allocated for
174 	 * reading/writing stripes directly from the disk (including P/Q)
175 	 */
176 	struct page **stripe_pages;
177 
178 	/* Pointers to the sectors in the bio_list, for faster lookup */
179 	struct sector_ptr *bio_sectors;
180 
181 	/*
182 	 * For subpage support, we need to map each sector to above
183 	 * stripe_pages.
184 	 */
185 	struct sector_ptr *stripe_sectors;
186 
187 	/* Bitmap to record which horizontal stripe has data */
188 	unsigned long *dbitmap;
189 
190 	/* allocated with real_stripes-many pointers for finish_*() calls */
191 	void **finish_pointers;
192 
193 	/* Allocated with stripe_nsectors-many bits for finish_*() calls */
194 	unsigned long *finish_pbitmap;
195 };
196 
197 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
198 static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
199 static void rmw_work(struct work_struct *work);
200 static void read_rebuild_work(struct work_struct *work);
201 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
202 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
203 static void __free_raid_bio(struct btrfs_raid_bio *rbio);
204 static void index_rbio_pages(struct btrfs_raid_bio *rbio);
205 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
206 
207 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
208 					 int need_check);
209 static void scrub_parity_work(struct work_struct *work);
210 
211 static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
212 {
213 	INIT_WORK(&rbio->work, work_func);
214 	queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
215 }
216 
217 /*
218  * the stripe hash table is used for locking, and to collect
219  * bios in hopes of making a full stripe
220  */
221 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
222 {
223 	struct btrfs_stripe_hash_table *table;
224 	struct btrfs_stripe_hash_table *x;
225 	struct btrfs_stripe_hash *cur;
226 	struct btrfs_stripe_hash *h;
227 	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
228 	int i;
229 
230 	if (info->stripe_hash_table)
231 		return 0;
232 
233 	/*
234 	 * The table is large, starting with order 4 and can go as high as
235 	 * order 7 in case lock debugging is turned on.
236 	 *
237 	 * Try harder to allocate and fallback to vmalloc to lower the chance
238 	 * of a failing mount.
239 	 */
240 	table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
241 	if (!table)
242 		return -ENOMEM;
243 
244 	spin_lock_init(&table->cache_lock);
245 	INIT_LIST_HEAD(&table->stripe_cache);
246 
247 	h = table->table;
248 
249 	for (i = 0; i < num_entries; i++) {
250 		cur = h + i;
251 		INIT_LIST_HEAD(&cur->hash_list);
252 		spin_lock_init(&cur->lock);
253 	}
254 
255 	x = cmpxchg(&info->stripe_hash_table, NULL, table);
256 	kvfree(x);
257 	return 0;
258 }
259 
260 /*
261  * caching an rbio means to copy anything from the
262  * bio_sectors array into the stripe_pages array.  We
263  * use the page uptodate bit in the stripe cache array
264  * to indicate if it has valid data
265  *
266  * once the caching is done, we set the cache ready
267  * bit.
268  */
269 static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
270 {
271 	int i;
272 	int ret;
273 
274 	ret = alloc_rbio_pages(rbio);
275 	if (ret)
276 		return;
277 
278 	for (i = 0; i < rbio->nr_sectors; i++) {
279 		/* Some range not covered by bio (partial write), skip it */
280 		if (!rbio->bio_sectors[i].page)
281 			continue;
282 
283 		ASSERT(rbio->stripe_sectors[i].page);
284 		memcpy_page(rbio->stripe_sectors[i].page,
285 			    rbio->stripe_sectors[i].pgoff,
286 			    rbio->bio_sectors[i].page,
287 			    rbio->bio_sectors[i].pgoff,
288 			    rbio->bioc->fs_info->sectorsize);
289 		rbio->stripe_sectors[i].uptodate = 1;
290 	}
291 	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
292 }
293 
294 /*
295  * we hash on the first logical address of the stripe
296  */
297 static int rbio_bucket(struct btrfs_raid_bio *rbio)
298 {
299 	u64 num = rbio->bioc->raid_map[0];
300 
301 	/*
302 	 * we shift down quite a bit.  We're using byte
303 	 * addressing, and most of the lower bits are zeros.
304 	 * This tends to upset hash_64, and it consistently
305 	 * returns just one or two different values.
306 	 *
307 	 * shifting off the lower bits fixes things.
308 	 */
309 	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
310 }
311 
312 static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
313 				       unsigned int page_nr)
314 {
315 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
316 	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
317 	int i;
318 
319 	ASSERT(page_nr < rbio->nr_pages);
320 
321 	for (i = sectors_per_page * page_nr;
322 	     i < sectors_per_page * page_nr + sectors_per_page;
323 	     i++) {
324 		if (!rbio->stripe_sectors[i].uptodate)
325 			return false;
326 	}
327 	return true;
328 }
329 
330 /*
331  * Update the stripe_sectors[] array to use correct page and pgoff
332  *
333  * Should be called every time any page pointer in stripes_pages[] got modified.
334  */
335 static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
336 {
337 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
338 	u32 offset;
339 	int i;
340 
341 	for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
342 		int page_index = offset >> PAGE_SHIFT;
343 
344 		ASSERT(page_index < rbio->nr_pages);
345 		rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index];
346 		rbio->stripe_sectors[i].pgoff = offset_in_page(offset);
347 	}
348 }
349 
350 /*
351  * Stealing an rbio means taking all the uptodate pages from the stripe array
352  * in the source rbio and putting them into the destination rbio.
353  *
354  * This will also update the involved stripe_sectors[] which are referring to
355  * the old pages.
356  */
357 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
358 {
359 	int i;
360 	struct page *s;
361 	struct page *d;
362 
363 	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
364 		return;
365 
366 	for (i = 0; i < dest->nr_pages; i++) {
367 		s = src->stripe_pages[i];
368 		if (!s || !full_page_sectors_uptodate(src, i))
369 			continue;
370 
371 		d = dest->stripe_pages[i];
372 		if (d)
373 			__free_page(d);
374 
375 		dest->stripe_pages[i] = s;
376 		src->stripe_pages[i] = NULL;
377 	}
378 	index_stripe_sectors(dest);
379 	index_stripe_sectors(src);
380 }
381 
382 /*
383  * merging means we take the bio_list from the victim and
384  * splice it into the destination.  The victim should
385  * be discarded afterwards.
386  *
387  * must be called with dest->rbio_list_lock held
388  */
389 static void merge_rbio(struct btrfs_raid_bio *dest,
390 		       struct btrfs_raid_bio *victim)
391 {
392 	bio_list_merge(&dest->bio_list, &victim->bio_list);
393 	dest->bio_list_bytes += victim->bio_list_bytes;
394 	dest->generic_bio_cnt += victim->generic_bio_cnt;
395 	bio_list_init(&victim->bio_list);
396 }
397 
398 /*
399  * used to prune items that are in the cache.  The caller
400  * must hold the hash table lock.
401  */
402 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
403 {
404 	int bucket = rbio_bucket(rbio);
405 	struct btrfs_stripe_hash_table *table;
406 	struct btrfs_stripe_hash *h;
407 	int freeit = 0;
408 
409 	/*
410 	 * check the bit again under the hash table lock.
411 	 */
412 	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
413 		return;
414 
415 	table = rbio->bioc->fs_info->stripe_hash_table;
416 	h = table->table + bucket;
417 
418 	/* hold the lock for the bucket because we may be
419 	 * removing it from the hash table
420 	 */
421 	spin_lock(&h->lock);
422 
423 	/*
424 	 * hold the lock for the bio list because we need
425 	 * to make sure the bio list is empty
426 	 */
427 	spin_lock(&rbio->bio_list_lock);
428 
429 	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
430 		list_del_init(&rbio->stripe_cache);
431 		table->cache_size -= 1;
432 		freeit = 1;
433 
434 		/* if the bio list isn't empty, this rbio is
435 		 * still involved in an IO.  We take it out
436 		 * of the cache list, and drop the ref that
437 		 * was held for the list.
438 		 *
439 		 * If the bio_list was empty, we also remove
440 		 * the rbio from the hash_table, and drop
441 		 * the corresponding ref
442 		 */
443 		if (bio_list_empty(&rbio->bio_list)) {
444 			if (!list_empty(&rbio->hash_list)) {
445 				list_del_init(&rbio->hash_list);
446 				refcount_dec(&rbio->refs);
447 				BUG_ON(!list_empty(&rbio->plug_list));
448 			}
449 		}
450 	}
451 
452 	spin_unlock(&rbio->bio_list_lock);
453 	spin_unlock(&h->lock);
454 
455 	if (freeit)
456 		__free_raid_bio(rbio);
457 }
458 
459 /*
460  * prune a given rbio from the cache
461  */
462 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
463 {
464 	struct btrfs_stripe_hash_table *table;
465 	unsigned long flags;
466 
467 	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
468 		return;
469 
470 	table = rbio->bioc->fs_info->stripe_hash_table;
471 
472 	spin_lock_irqsave(&table->cache_lock, flags);
473 	__remove_rbio_from_cache(rbio);
474 	spin_unlock_irqrestore(&table->cache_lock, flags);
475 }
476 
477 /*
478  * remove everything in the cache
479  */
480 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
481 {
482 	struct btrfs_stripe_hash_table *table;
483 	unsigned long flags;
484 	struct btrfs_raid_bio *rbio;
485 
486 	table = info->stripe_hash_table;
487 
488 	spin_lock_irqsave(&table->cache_lock, flags);
489 	while (!list_empty(&table->stripe_cache)) {
490 		rbio = list_entry(table->stripe_cache.next,
491 				  struct btrfs_raid_bio,
492 				  stripe_cache);
493 		__remove_rbio_from_cache(rbio);
494 	}
495 	spin_unlock_irqrestore(&table->cache_lock, flags);
496 }
497 
498 /*
499  * remove all cached entries and free the hash table
500  * used by unmount
501  */
502 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
503 {
504 	if (!info->stripe_hash_table)
505 		return;
506 	btrfs_clear_rbio_cache(info);
507 	kvfree(info->stripe_hash_table);
508 	info->stripe_hash_table = NULL;
509 }
510 
511 /*
512  * insert an rbio into the stripe cache.  It
513  * must have already been prepared by calling
514  * cache_rbio_pages
515  *
516  * If this rbio was already cached, it gets
517  * moved to the front of the lru.
518  *
519  * If the size of the rbio cache is too big, we
520  * prune an item.
521  */
522 static void cache_rbio(struct btrfs_raid_bio *rbio)
523 {
524 	struct btrfs_stripe_hash_table *table;
525 	unsigned long flags;
526 
527 	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
528 		return;
529 
530 	table = rbio->bioc->fs_info->stripe_hash_table;
531 
532 	spin_lock_irqsave(&table->cache_lock, flags);
533 	spin_lock(&rbio->bio_list_lock);
534 
535 	/* bump our ref if we were not in the list before */
536 	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
537 		refcount_inc(&rbio->refs);
538 
539 	if (!list_empty(&rbio->stripe_cache)){
540 		list_move(&rbio->stripe_cache, &table->stripe_cache);
541 	} else {
542 		list_add(&rbio->stripe_cache, &table->stripe_cache);
543 		table->cache_size += 1;
544 	}
545 
546 	spin_unlock(&rbio->bio_list_lock);
547 
548 	if (table->cache_size > RBIO_CACHE_SIZE) {
549 		struct btrfs_raid_bio *found;
550 
551 		found = list_entry(table->stripe_cache.prev,
552 				  struct btrfs_raid_bio,
553 				  stripe_cache);
554 
555 		if (found != rbio)
556 			__remove_rbio_from_cache(found);
557 	}
558 
559 	spin_unlock_irqrestore(&table->cache_lock, flags);
560 }
561 
562 /*
563  * helper function to run the xor_blocks api.  It is only
564  * able to do MAX_XOR_BLOCKS at a time, so we need to
565  * loop through.
566  */
567 static void run_xor(void **pages, int src_cnt, ssize_t len)
568 {
569 	int src_off = 0;
570 	int xor_src_cnt = 0;
571 	void *dest = pages[src_cnt];
572 
573 	while(src_cnt > 0) {
574 		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
575 		xor_blocks(xor_src_cnt, len, dest, pages + src_off);
576 
577 		src_cnt -= xor_src_cnt;
578 		src_off += xor_src_cnt;
579 	}
580 }
581 
582 /*
583  * Returns true if the bio list inside this rbio covers an entire stripe (no
584  * rmw required).
585  */
586 static int rbio_is_full(struct btrfs_raid_bio *rbio)
587 {
588 	unsigned long flags;
589 	unsigned long size = rbio->bio_list_bytes;
590 	int ret = 1;
591 
592 	spin_lock_irqsave(&rbio->bio_list_lock, flags);
593 	if (size != rbio->nr_data * rbio->stripe_len)
594 		ret = 0;
595 	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
596 	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
597 
598 	return ret;
599 }
600 
601 /*
602  * returns 1 if it is safe to merge two rbios together.
603  * The merging is safe if the two rbios correspond to
604  * the same stripe and if they are both going in the same
605  * direction (read vs write), and if neither one is
606  * locked for final IO
607  *
608  * The caller is responsible for locking such that
609  * rmw_locked is safe to test
610  */
611 static int rbio_can_merge(struct btrfs_raid_bio *last,
612 			  struct btrfs_raid_bio *cur)
613 {
614 	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
615 	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
616 		return 0;
617 
618 	/*
619 	 * we can't merge with cached rbios, since the
620 	 * idea is that when we merge the destination
621 	 * rbio is going to run our IO for us.  We can
622 	 * steal from cached rbios though, other functions
623 	 * handle that.
624 	 */
625 	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
626 	    test_bit(RBIO_CACHE_BIT, &cur->flags))
627 		return 0;
628 
629 	if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
630 		return 0;
631 
632 	/* we can't merge with different operations */
633 	if (last->operation != cur->operation)
634 		return 0;
635 	/*
636 	 * We've need read the full stripe from the drive.
637 	 * check and repair the parity and write the new results.
638 	 *
639 	 * We're not allowed to add any new bios to the
640 	 * bio list here, anyone else that wants to
641 	 * change this stripe needs to do their own rmw.
642 	 */
643 	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
644 		return 0;
645 
646 	if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
647 		return 0;
648 
649 	if (last->operation == BTRFS_RBIO_READ_REBUILD) {
650 		int fa = last->faila;
651 		int fb = last->failb;
652 		int cur_fa = cur->faila;
653 		int cur_fb = cur->failb;
654 
655 		if (last->faila >= last->failb) {
656 			fa = last->failb;
657 			fb = last->faila;
658 		}
659 
660 		if (cur->faila >= cur->failb) {
661 			cur_fa = cur->failb;
662 			cur_fb = cur->faila;
663 		}
664 
665 		if (fa != cur_fa || fb != cur_fb)
666 			return 0;
667 	}
668 	return 1;
669 }
670 
671 static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
672 					     unsigned int stripe_nr,
673 					     unsigned int sector_nr)
674 {
675 	ASSERT(stripe_nr < rbio->real_stripes);
676 	ASSERT(sector_nr < rbio->stripe_nsectors);
677 
678 	return stripe_nr * rbio->stripe_nsectors + sector_nr;
679 }
680 
681 /* Return a sector from rbio->stripe_sectors, not from the bio list */
682 static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
683 					     unsigned int stripe_nr,
684 					     unsigned int sector_nr)
685 {
686 	return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
687 							      sector_nr)];
688 }
689 
690 /* Grab a sector inside P stripe */
691 static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
692 					      unsigned int sector_nr)
693 {
694 	return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
695 }
696 
697 /* Grab a sector inside Q stripe, return NULL if not RAID6 */
698 static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
699 					      unsigned int sector_nr)
700 {
701 	if (rbio->nr_data + 1 == rbio->real_stripes)
702 		return NULL;
703 	return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
704 }
705 
706 /*
707  * The first stripe in the table for a logical address
708  * has the lock.  rbios are added in one of three ways:
709  *
710  * 1) Nobody has the stripe locked yet.  The rbio is given
711  * the lock and 0 is returned.  The caller must start the IO
712  * themselves.
713  *
714  * 2) Someone has the stripe locked, but we're able to merge
715  * with the lock owner.  The rbio is freed and the IO will
716  * start automatically along with the existing rbio.  1 is returned.
717  *
718  * 3) Someone has the stripe locked, but we're not able to merge.
719  * The rbio is added to the lock owner's plug list, or merged into
720  * an rbio already on the plug list.  When the lock owner unlocks,
721  * the next rbio on the list is run and the IO is started automatically.
722  * 1 is returned
723  *
724  * If we return 0, the caller still owns the rbio and must continue with
725  * IO submission.  If we return 1, the caller must assume the rbio has
726  * already been freed.
727  */
728 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
729 {
730 	struct btrfs_stripe_hash *h;
731 	struct btrfs_raid_bio *cur;
732 	struct btrfs_raid_bio *pending;
733 	unsigned long flags;
734 	struct btrfs_raid_bio *freeit = NULL;
735 	struct btrfs_raid_bio *cache_drop = NULL;
736 	int ret = 0;
737 
738 	h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
739 
740 	spin_lock_irqsave(&h->lock, flags);
741 	list_for_each_entry(cur, &h->hash_list, hash_list) {
742 		if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0])
743 			continue;
744 
745 		spin_lock(&cur->bio_list_lock);
746 
747 		/* Can we steal this cached rbio's pages? */
748 		if (bio_list_empty(&cur->bio_list) &&
749 		    list_empty(&cur->plug_list) &&
750 		    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
751 		    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
752 			list_del_init(&cur->hash_list);
753 			refcount_dec(&cur->refs);
754 
755 			steal_rbio(cur, rbio);
756 			cache_drop = cur;
757 			spin_unlock(&cur->bio_list_lock);
758 
759 			goto lockit;
760 		}
761 
762 		/* Can we merge into the lock owner? */
763 		if (rbio_can_merge(cur, rbio)) {
764 			merge_rbio(cur, rbio);
765 			spin_unlock(&cur->bio_list_lock);
766 			freeit = rbio;
767 			ret = 1;
768 			goto out;
769 		}
770 
771 
772 		/*
773 		 * We couldn't merge with the running rbio, see if we can merge
774 		 * with the pending ones.  We don't have to check for rmw_locked
775 		 * because there is no way they are inside finish_rmw right now
776 		 */
777 		list_for_each_entry(pending, &cur->plug_list, plug_list) {
778 			if (rbio_can_merge(pending, rbio)) {
779 				merge_rbio(pending, rbio);
780 				spin_unlock(&cur->bio_list_lock);
781 				freeit = rbio;
782 				ret = 1;
783 				goto out;
784 			}
785 		}
786 
787 		/*
788 		 * No merging, put us on the tail of the plug list, our rbio
789 		 * will be started with the currently running rbio unlocks
790 		 */
791 		list_add_tail(&rbio->plug_list, &cur->plug_list);
792 		spin_unlock(&cur->bio_list_lock);
793 		ret = 1;
794 		goto out;
795 	}
796 lockit:
797 	refcount_inc(&rbio->refs);
798 	list_add(&rbio->hash_list, &h->hash_list);
799 out:
800 	spin_unlock_irqrestore(&h->lock, flags);
801 	if (cache_drop)
802 		remove_rbio_from_cache(cache_drop);
803 	if (freeit)
804 		__free_raid_bio(freeit);
805 	return ret;
806 }
807 
808 /*
809  * called as rmw or parity rebuild is completed.  If the plug list has more
810  * rbios waiting for this stripe, the next one on the list will be started
811  */
812 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
813 {
814 	int bucket;
815 	struct btrfs_stripe_hash *h;
816 	unsigned long flags;
817 	int keep_cache = 0;
818 
819 	bucket = rbio_bucket(rbio);
820 	h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;
821 
822 	if (list_empty(&rbio->plug_list))
823 		cache_rbio(rbio);
824 
825 	spin_lock_irqsave(&h->lock, flags);
826 	spin_lock(&rbio->bio_list_lock);
827 
828 	if (!list_empty(&rbio->hash_list)) {
829 		/*
830 		 * if we're still cached and there is no other IO
831 		 * to perform, just leave this rbio here for others
832 		 * to steal from later
833 		 */
834 		if (list_empty(&rbio->plug_list) &&
835 		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
836 			keep_cache = 1;
837 			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
838 			BUG_ON(!bio_list_empty(&rbio->bio_list));
839 			goto done;
840 		}
841 
842 		list_del_init(&rbio->hash_list);
843 		refcount_dec(&rbio->refs);
844 
845 		/*
846 		 * we use the plug list to hold all the rbios
847 		 * waiting for the chance to lock this stripe.
848 		 * hand the lock over to one of them.
849 		 */
850 		if (!list_empty(&rbio->plug_list)) {
851 			struct btrfs_raid_bio *next;
852 			struct list_head *head = rbio->plug_list.next;
853 
854 			next = list_entry(head, struct btrfs_raid_bio,
855 					  plug_list);
856 
857 			list_del_init(&rbio->plug_list);
858 
859 			list_add(&next->hash_list, &h->hash_list);
860 			refcount_inc(&next->refs);
861 			spin_unlock(&rbio->bio_list_lock);
862 			spin_unlock_irqrestore(&h->lock, flags);
863 
864 			if (next->operation == BTRFS_RBIO_READ_REBUILD)
865 				start_async_work(next, read_rebuild_work);
866 			else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
867 				steal_rbio(rbio, next);
868 				start_async_work(next, read_rebuild_work);
869 			} else if (next->operation == BTRFS_RBIO_WRITE) {
870 				steal_rbio(rbio, next);
871 				start_async_work(next, rmw_work);
872 			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
873 				steal_rbio(rbio, next);
874 				start_async_work(next, scrub_parity_work);
875 			}
876 
877 			goto done_nolock;
878 		}
879 	}
880 done:
881 	spin_unlock(&rbio->bio_list_lock);
882 	spin_unlock_irqrestore(&h->lock, flags);
883 
884 done_nolock:
885 	if (!keep_cache)
886 		remove_rbio_from_cache(rbio);
887 }
888 
889 static void __free_raid_bio(struct btrfs_raid_bio *rbio)
890 {
891 	int i;
892 
893 	if (!refcount_dec_and_test(&rbio->refs))
894 		return;
895 
896 	WARN_ON(!list_empty(&rbio->stripe_cache));
897 	WARN_ON(!list_empty(&rbio->hash_list));
898 	WARN_ON(!bio_list_empty(&rbio->bio_list));
899 
900 	for (i = 0; i < rbio->nr_pages; i++) {
901 		if (rbio->stripe_pages[i]) {
902 			__free_page(rbio->stripe_pages[i]);
903 			rbio->stripe_pages[i] = NULL;
904 		}
905 	}
906 
907 	btrfs_put_bioc(rbio->bioc);
908 	kfree(rbio);
909 }
910 
911 static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
912 {
913 	struct bio *next;
914 
915 	while (cur) {
916 		next = cur->bi_next;
917 		cur->bi_next = NULL;
918 		cur->bi_status = err;
919 		bio_endio(cur);
920 		cur = next;
921 	}
922 }
923 
924 /*
925  * this frees the rbio and runs through all the bios in the
926  * bio_list and calls end_io on them
927  */
928 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
929 {
930 	struct bio *cur = bio_list_get(&rbio->bio_list);
931 	struct bio *extra;
932 
933 	if (rbio->generic_bio_cnt)
934 		btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt);
935 
936 	/*
937 	 * At this moment, rbio->bio_list is empty, however since rbio does not
938 	 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
939 	 * hash list, rbio may be merged with others so that rbio->bio_list
940 	 * becomes non-empty.
941 	 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
942 	 * more and we can call bio_endio() on all queued bios.
943 	 */
944 	unlock_stripe(rbio);
945 	extra = bio_list_get(&rbio->bio_list);
946 	__free_raid_bio(rbio);
947 
948 	rbio_endio_bio_list(cur, err);
949 	if (extra)
950 		rbio_endio_bio_list(extra, err);
951 }
952 
953 /*
954  * end io function used by finish_rmw.  When we finally
955  * get here, we've written a full stripe
956  */
957 static void raid_write_end_io(struct bio *bio)
958 {
959 	struct btrfs_raid_bio *rbio = bio->bi_private;
960 	blk_status_t err = bio->bi_status;
961 	int max_errors;
962 
963 	if (err)
964 		fail_bio_stripe(rbio, bio);
965 
966 	bio_put(bio);
967 
968 	if (!atomic_dec_and_test(&rbio->stripes_pending))
969 		return;
970 
971 	err = BLK_STS_OK;
972 
973 	/* OK, we have read all the stripes we need to. */
974 	max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
975 		     0 : rbio->bioc->max_errors;
976 	if (atomic_read(&rbio->error) > max_errors)
977 		err = BLK_STS_IOERR;
978 
979 	rbio_orig_end_io(rbio, err);
980 }
981 
982 /**
983  * Get a sector pointer specified by its @stripe_nr and @sector_nr
984  *
985  * @rbio:               The raid bio
986  * @stripe_nr:          Stripe number, valid range [0, real_stripe)
987  * @sector_nr:		Sector number inside the stripe,
988  *			valid range [0, stripe_nsectors)
989  * @bio_list_only:      Whether to use sectors inside the bio list only.
990  *
991  * The read/modify/write code wants to reuse the original bio page as much
992  * as possible, and only use stripe_sectors as fallback.
993  */
994 static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
995 					 int stripe_nr, int sector_nr,
996 					 bool bio_list_only)
997 {
998 	struct sector_ptr *sector;
999 	int index;
1000 
1001 	ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes);
1002 	ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
1003 
1004 	index = stripe_nr * rbio->stripe_nsectors + sector_nr;
1005 	ASSERT(index >= 0 && index < rbio->nr_sectors);
1006 
1007 	spin_lock_irq(&rbio->bio_list_lock);
1008 	sector = &rbio->bio_sectors[index];
1009 	if (sector->page || bio_list_only) {
1010 		/* Don't return sector without a valid page pointer */
1011 		if (!sector->page)
1012 			sector = NULL;
1013 		spin_unlock_irq(&rbio->bio_list_lock);
1014 		return sector;
1015 	}
1016 	spin_unlock_irq(&rbio->bio_list_lock);
1017 
1018 	return &rbio->stripe_sectors[index];
1019 }
1020 
1021 /*
1022  * allocation and initial setup for the btrfs_raid_bio.  Not
1023  * this does not allocate any pages for rbio->pages.
1024  */
1025 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
1026 					 struct btrfs_io_context *bioc,
1027 					 u32 stripe_len)
1028 {
1029 	const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
1030 	const unsigned int stripe_npages = stripe_len >> PAGE_SHIFT;
1031 	const unsigned int num_pages = stripe_npages * real_stripes;
1032 	const unsigned int stripe_nsectors = stripe_len >> fs_info->sectorsize_bits;
1033 	const unsigned int num_sectors = stripe_nsectors * real_stripes;
1034 	struct btrfs_raid_bio *rbio;
1035 	int nr_data = 0;
1036 	void *p;
1037 
1038 	ASSERT(IS_ALIGNED(stripe_len, PAGE_SIZE));
1039 	/* PAGE_SIZE must also be aligned to sectorsize for subpage support */
1040 	ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
1041 
1042 	rbio = kzalloc(sizeof(*rbio) +
1043 		       sizeof(*rbio->stripe_pages) * num_pages +
1044 		       sizeof(*rbio->bio_sectors) * num_sectors +
1045 		       sizeof(*rbio->stripe_sectors) * num_sectors +
1046 		       sizeof(*rbio->finish_pointers) * real_stripes +
1047 		       sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_nsectors) +
1048 		       sizeof(*rbio->finish_pbitmap) * BITS_TO_LONGS(stripe_nsectors),
1049 		       GFP_NOFS);
1050 	if (!rbio)
1051 		return ERR_PTR(-ENOMEM);
1052 
1053 	bio_list_init(&rbio->bio_list);
1054 	INIT_LIST_HEAD(&rbio->plug_list);
1055 	spin_lock_init(&rbio->bio_list_lock);
1056 	INIT_LIST_HEAD(&rbio->stripe_cache);
1057 	INIT_LIST_HEAD(&rbio->hash_list);
1058 	rbio->bioc = bioc;
1059 	rbio->stripe_len = stripe_len;
1060 	rbio->nr_pages = num_pages;
1061 	rbio->nr_sectors = num_sectors;
1062 	rbio->real_stripes = real_stripes;
1063 	rbio->stripe_npages = stripe_npages;
1064 	rbio->stripe_nsectors = stripe_nsectors;
1065 	rbio->faila = -1;
1066 	rbio->failb = -1;
1067 	refcount_set(&rbio->refs, 1);
1068 	atomic_set(&rbio->error, 0);
1069 	atomic_set(&rbio->stripes_pending, 0);
1070 
1071 	/*
1072 	 * The stripe_pages, bio_sectors, etc arrays point to the extra memory
1073 	 * we allocated past the end of the rbio.
1074 	 */
1075 	p = rbio + 1;
1076 #define CONSUME_ALLOC(ptr, count)	do {				\
1077 		ptr = p;						\
1078 		p = (unsigned char *)p + sizeof(*(ptr)) * (count);	\
1079 	} while (0)
1080 	CONSUME_ALLOC(rbio->stripe_pages, num_pages);
1081 	CONSUME_ALLOC(rbio->bio_sectors, num_sectors);
1082 	CONSUME_ALLOC(rbio->stripe_sectors, num_sectors);
1083 	CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
1084 	CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_nsectors));
1085 	CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_nsectors));
1086 #undef  CONSUME_ALLOC
1087 
1088 	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
1089 		nr_data = real_stripes - 1;
1090 	else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
1091 		nr_data = real_stripes - 2;
1092 	else
1093 		BUG();
1094 
1095 	rbio->nr_data = nr_data;
1096 	return rbio;
1097 }
1098 
1099 /* allocate pages for all the stripes in the bio, including parity */
1100 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
1101 {
1102 	int ret;
1103 
1104 	ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages);
1105 	if (ret < 0)
1106 		return ret;
1107 	/* Mapping all sectors */
1108 	index_stripe_sectors(rbio);
1109 	return 0;
1110 }
1111 
1112 /* only allocate pages for p/q stripes */
1113 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
1114 {
1115 	const int data_pages = rbio->nr_data * rbio->stripe_npages;
1116 	int ret;
1117 
1118 	ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
1119 				     rbio->stripe_pages + data_pages);
1120 	if (ret < 0)
1121 		return ret;
1122 
1123 	index_stripe_sectors(rbio);
1124 	return 0;
1125 }
1126 
1127 /*
1128  * Add a single sector @sector into our list of bios for IO.
1129  *
1130  * Return 0 if everything went well.
1131  * Return <0 for error.
1132  */
1133 static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
1134 			      struct bio_list *bio_list,
1135 			      struct sector_ptr *sector,
1136 			      unsigned int stripe_nr,
1137 			      unsigned int sector_nr,
1138 			      unsigned long bio_max_len,
1139 			      unsigned int opf)
1140 {
1141 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1142 	struct bio *last = bio_list->tail;
1143 	int ret;
1144 	struct bio *bio;
1145 	struct btrfs_io_stripe *stripe;
1146 	u64 disk_start;
1147 
1148 	/*
1149 	 * Note: here stripe_nr has taken device replace into consideration,
1150 	 * thus it can be larger than rbio->real_stripe.
1151 	 * So here we check against bioc->num_stripes, not rbio->real_stripes.
1152 	 */
1153 	ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes);
1154 	ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
1155 	ASSERT(sector->page);
1156 
1157 	stripe = &rbio->bioc->stripes[stripe_nr];
1158 	disk_start = stripe->physical + sector_nr * sectorsize;
1159 
1160 	/* if the device is missing, just fail this stripe */
1161 	if (!stripe->dev->bdev)
1162 		return fail_rbio_index(rbio, stripe_nr);
1163 
1164 	/* see if we can add this page onto our existing bio */
1165 	if (last) {
1166 		u64 last_end = last->bi_iter.bi_sector << 9;
1167 		last_end += last->bi_iter.bi_size;
1168 
1169 		/*
1170 		 * we can't merge these if they are from different
1171 		 * devices or if they are not contiguous
1172 		 */
1173 		if (last_end == disk_start && !last->bi_status &&
1174 		    last->bi_bdev == stripe->dev->bdev) {
1175 			ret = bio_add_page(last, sector->page, sectorsize,
1176 					   sector->pgoff);
1177 			if (ret == sectorsize)
1178 				return 0;
1179 		}
1180 	}
1181 
1182 	/* put a new bio on the list */
1183 	bio = bio_alloc(stripe->dev->bdev, max(bio_max_len >> PAGE_SHIFT, 1UL),
1184 			opf, GFP_NOFS);
1185 	bio->bi_iter.bi_sector = disk_start >> 9;
1186 	bio->bi_private = rbio;
1187 
1188 	bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
1189 	bio_list_add(bio_list, bio);
1190 	return 0;
1191 }
1192 
1193 /*
1194  * while we're doing the read/modify/write cycle, we could
1195  * have errors in reading pages off the disk.  This checks
1196  * for errors and if we're not able to read the page it'll
1197  * trigger parity reconstruction.  The rmw will be finished
1198  * after we've reconstructed the failed stripes
1199  */
1200 static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1201 {
1202 	if (rbio->faila >= 0 || rbio->failb >= 0) {
1203 		BUG_ON(rbio->faila == rbio->real_stripes - 1);
1204 		__raid56_parity_recover(rbio);
1205 	} else {
1206 		finish_rmw(rbio);
1207 	}
1208 }
1209 
1210 static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
1211 {
1212 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1213 	struct bio_vec bvec;
1214 	struct bvec_iter iter;
1215 	u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1216 		     rbio->bioc->raid_map[0];
1217 
1218 	if (bio_flagged(bio, BIO_CLONED))
1219 		bio->bi_iter = btrfs_bio(bio)->iter;
1220 
1221 	bio_for_each_segment(bvec, bio, iter) {
1222 		u32 bvec_offset;
1223 
1224 		for (bvec_offset = 0; bvec_offset < bvec.bv_len;
1225 		     bvec_offset += sectorsize, offset += sectorsize) {
1226 			int index = offset / sectorsize;
1227 			struct sector_ptr *sector = &rbio->bio_sectors[index];
1228 
1229 			sector->page = bvec.bv_page;
1230 			sector->pgoff = bvec.bv_offset + bvec_offset;
1231 			ASSERT(sector->pgoff < PAGE_SIZE);
1232 		}
1233 	}
1234 }
1235 
1236 /*
1237  * helper function to walk our bio list and populate the bio_pages array with
1238  * the result.  This seems expensive, but it is faster than constantly
1239  * searching through the bio list as we setup the IO in finish_rmw or stripe
1240  * reconstruction.
1241  *
1242  * This must be called before you trust the answers from page_in_rbio
1243  */
1244 static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1245 {
1246 	struct bio *bio;
1247 
1248 	spin_lock_irq(&rbio->bio_list_lock);
1249 	bio_list_for_each(bio, &rbio->bio_list)
1250 		index_one_bio(rbio, bio);
1251 
1252 	spin_unlock_irq(&rbio->bio_list_lock);
1253 }
1254 
1255 /*
1256  * this is called from one of two situations.  We either
1257  * have a full stripe from the higher layers, or we've read all
1258  * the missing bits off disk.
1259  *
1260  * This will calculate the parity and then send down any
1261  * changed blocks.
1262  */
1263 static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1264 {
1265 	struct btrfs_io_context *bioc = rbio->bioc;
1266 	const u32 sectorsize = bioc->fs_info->sectorsize;
1267 	void **pointers = rbio->finish_pointers;
1268 	int nr_data = rbio->nr_data;
1269 	int stripe;
1270 	int sectornr;
1271 	bool has_qstripe;
1272 	struct bio_list bio_list;
1273 	struct bio *bio;
1274 	int ret;
1275 
1276 	bio_list_init(&bio_list);
1277 
1278 	if (rbio->real_stripes - rbio->nr_data == 1)
1279 		has_qstripe = false;
1280 	else if (rbio->real_stripes - rbio->nr_data == 2)
1281 		has_qstripe = true;
1282 	else
1283 		BUG();
1284 
1285 	/* at this point we either have a full stripe,
1286 	 * or we've read the full stripe from the drive.
1287 	 * recalculate the parity and write the new results.
1288 	 *
1289 	 * We're not allowed to add any new bios to the
1290 	 * bio list here, anyone else that wants to
1291 	 * change this stripe needs to do their own rmw.
1292 	 */
1293 	spin_lock_irq(&rbio->bio_list_lock);
1294 	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1295 	spin_unlock_irq(&rbio->bio_list_lock);
1296 
1297 	atomic_set(&rbio->error, 0);
1298 
1299 	/*
1300 	 * now that we've set rmw_locked, run through the
1301 	 * bio list one last time and map the page pointers
1302 	 *
1303 	 * We don't cache full rbios because we're assuming
1304 	 * the higher layers are unlikely to use this area of
1305 	 * the disk again soon.  If they do use it again,
1306 	 * hopefully they will send another full bio.
1307 	 */
1308 	index_rbio_pages(rbio);
1309 	if (!rbio_is_full(rbio))
1310 		cache_rbio_pages(rbio);
1311 	else
1312 		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1313 
1314 	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
1315 		struct sector_ptr *sector;
1316 
1317 		/* First collect one sector from each data stripe */
1318 		for (stripe = 0; stripe < nr_data; stripe++) {
1319 			sector = sector_in_rbio(rbio, stripe, sectornr, 0);
1320 			pointers[stripe] = kmap_local_page(sector->page) +
1321 					   sector->pgoff;
1322 		}
1323 
1324 		/* Then add the parity stripe */
1325 		sector = rbio_pstripe_sector(rbio, sectornr);
1326 		sector->uptodate = 1;
1327 		pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;
1328 
1329 		if (has_qstripe) {
1330 			/*
1331 			 * RAID6, add the qstripe and call the library function
1332 			 * to fill in our p/q
1333 			 */
1334 			sector = rbio_qstripe_sector(rbio, sectornr);
1335 			sector->uptodate = 1;
1336 			pointers[stripe++] = kmap_local_page(sector->page) +
1337 					     sector->pgoff;
1338 
1339 			raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
1340 						pointers);
1341 		} else {
1342 			/* raid5 */
1343 			memcpy(pointers[nr_data], pointers[0], sectorsize);
1344 			run_xor(pointers + 1, nr_data - 1, sectorsize);
1345 		}
1346 		for (stripe = stripe - 1; stripe >= 0; stripe--)
1347 			kunmap_local(pointers[stripe]);
1348 	}
1349 
1350 	/*
1351 	 * time to start writing.  Make bios for everything from the
1352 	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
1353 	 * everything else.
1354 	 */
1355 	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1356 		for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
1357 			struct sector_ptr *sector;
1358 
1359 			if (stripe < rbio->nr_data) {
1360 				sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1361 				if (!sector)
1362 					continue;
1363 			} else {
1364 				sector = rbio_stripe_sector(rbio, stripe, sectornr);
1365 			}
1366 
1367 			ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
1368 						 sectornr, rbio->stripe_len,
1369 						 REQ_OP_WRITE);
1370 			if (ret)
1371 				goto cleanup;
1372 		}
1373 	}
1374 
1375 	if (likely(!bioc->num_tgtdevs))
1376 		goto write_data;
1377 
1378 	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1379 		if (!bioc->tgtdev_map[stripe])
1380 			continue;
1381 
1382 		for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
1383 			struct sector_ptr *sector;
1384 
1385 			if (stripe < rbio->nr_data) {
1386 				sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1387 				if (!sector)
1388 					continue;
1389 			} else {
1390 				sector = rbio_stripe_sector(rbio, stripe, sectornr);
1391 			}
1392 
1393 			ret = rbio_add_io_sector(rbio, &bio_list, sector,
1394 					       rbio->bioc->tgtdev_map[stripe],
1395 					       sectornr, rbio->stripe_len,
1396 					       REQ_OP_WRITE);
1397 			if (ret)
1398 				goto cleanup;
1399 		}
1400 	}
1401 
1402 write_data:
1403 	atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
1404 	BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
1405 
1406 	while ((bio = bio_list_pop(&bio_list))) {
1407 		bio->bi_end_io = raid_write_end_io;
1408 
1409 		submit_bio(bio);
1410 	}
1411 	return;
1412 
1413 cleanup:
1414 	rbio_orig_end_io(rbio, BLK_STS_IOERR);
1415 
1416 	while ((bio = bio_list_pop(&bio_list)))
1417 		bio_put(bio);
1418 }
1419 
1420 /*
1421  * helper to find the stripe number for a given bio.  Used to figure out which
1422  * stripe has failed.  This expects the bio to correspond to a physical disk,
1423  * so it looks up based on physical sector numbers.
1424  */
1425 static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1426 			   struct bio *bio)
1427 {
1428 	u64 physical = bio->bi_iter.bi_sector;
1429 	int i;
1430 	struct btrfs_io_stripe *stripe;
1431 
1432 	physical <<= 9;
1433 
1434 	for (i = 0; i < rbio->bioc->num_stripes; i++) {
1435 		stripe = &rbio->bioc->stripes[i];
1436 		if (in_range(physical, stripe->physical, rbio->stripe_len) &&
1437 		    stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
1438 			return i;
1439 		}
1440 	}
1441 	return -1;
1442 }
1443 
1444 /*
1445  * helper to find the stripe number for a given
1446  * bio (before mapping).  Used to figure out which stripe has
1447  * failed.  This looks up based on logical block numbers.
1448  */
1449 static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1450 				   struct bio *bio)
1451 {
1452 	u64 logical = bio->bi_iter.bi_sector << 9;
1453 	int i;
1454 
1455 	for (i = 0; i < rbio->nr_data; i++) {
1456 		u64 stripe_start = rbio->bioc->raid_map[i];
1457 
1458 		if (in_range(logical, stripe_start, rbio->stripe_len))
1459 			return i;
1460 	}
1461 	return -1;
1462 }
1463 
1464 /*
1465  * returns -EIO if we had too many failures
1466  */
1467 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1468 {
1469 	unsigned long flags;
1470 	int ret = 0;
1471 
1472 	spin_lock_irqsave(&rbio->bio_list_lock, flags);
1473 
1474 	/* we already know this stripe is bad, move on */
1475 	if (rbio->faila == failed || rbio->failb == failed)
1476 		goto out;
1477 
1478 	if (rbio->faila == -1) {
1479 		/* first failure on this rbio */
1480 		rbio->faila = failed;
1481 		atomic_inc(&rbio->error);
1482 	} else if (rbio->failb == -1) {
1483 		/* second failure on this rbio */
1484 		rbio->failb = failed;
1485 		atomic_inc(&rbio->error);
1486 	} else {
1487 		ret = -EIO;
1488 	}
1489 out:
1490 	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1491 
1492 	return ret;
1493 }
1494 
1495 /*
1496  * helper to fail a stripe based on a physical disk
1497  * bio.
1498  */
1499 static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1500 			   struct bio *bio)
1501 {
1502 	int failed = find_bio_stripe(rbio, bio);
1503 
1504 	if (failed < 0)
1505 		return -EIO;
1506 
1507 	return fail_rbio_index(rbio, failed);
1508 }
1509 
1510 /*
1511  * For subpage case, we can no longer set page Uptodate directly for
1512  * stripe_pages[], thus we need to locate the sector.
1513  */
1514 static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
1515 					     struct page *page,
1516 					     unsigned int pgoff)
1517 {
1518 	int i;
1519 
1520 	for (i = 0; i < rbio->nr_sectors; i++) {
1521 		struct sector_ptr *sector = &rbio->stripe_sectors[i];
1522 
1523 		if (sector->page == page && sector->pgoff == pgoff)
1524 			return sector;
1525 	}
1526 	return NULL;
1527 }
1528 
1529 /*
1530  * this sets each page in the bio uptodate.  It should only be used on private
1531  * rbio pages, nothing that comes in from the higher layers
1532  */
1533 static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
1534 {
1535 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1536 	struct bio_vec *bvec;
1537 	struct bvec_iter_all iter_all;
1538 
1539 	ASSERT(!bio_flagged(bio, BIO_CLONED));
1540 
1541 	bio_for_each_segment_all(bvec, bio, iter_all) {
1542 		struct sector_ptr *sector;
1543 		int pgoff;
1544 
1545 		for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len;
1546 		     pgoff += sectorsize) {
1547 			sector = find_stripe_sector(rbio, bvec->bv_page, pgoff);
1548 			ASSERT(sector);
1549 			if (sector)
1550 				sector->uptodate = 1;
1551 		}
1552 	}
1553 }
1554 
1555 /*
1556  * end io for the read phase of the rmw cycle.  All the bios here are physical
1557  * stripe bios we've read from the disk so we can recalculate the parity of the
1558  * stripe.
1559  *
1560  * This will usually kick off finish_rmw once all the bios are read in, but it
1561  * may trigger parity reconstruction if we had any errors along the way
1562  */
1563 static void raid_rmw_end_io(struct bio *bio)
1564 {
1565 	struct btrfs_raid_bio *rbio = bio->bi_private;
1566 
1567 	if (bio->bi_status)
1568 		fail_bio_stripe(rbio, bio);
1569 	else
1570 		set_bio_pages_uptodate(rbio, bio);
1571 
1572 	bio_put(bio);
1573 
1574 	if (!atomic_dec_and_test(&rbio->stripes_pending))
1575 		return;
1576 
1577 	if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
1578 		goto cleanup;
1579 
1580 	/*
1581 	 * this will normally call finish_rmw to start our write
1582 	 * but if there are any failed stripes we'll reconstruct
1583 	 * from parity first
1584 	 */
1585 	validate_rbio_for_rmw(rbio);
1586 	return;
1587 
1588 cleanup:
1589 
1590 	rbio_orig_end_io(rbio, BLK_STS_IOERR);
1591 }
1592 
1593 /*
1594  * the stripe must be locked by the caller.  It will
1595  * unlock after all the writes are done
1596  */
1597 static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1598 {
1599 	int bios_to_read = 0;
1600 	struct bio_list bio_list;
1601 	int ret;
1602 	int sectornr;
1603 	int stripe;
1604 	struct bio *bio;
1605 
1606 	bio_list_init(&bio_list);
1607 
1608 	ret = alloc_rbio_pages(rbio);
1609 	if (ret)
1610 		goto cleanup;
1611 
1612 	index_rbio_pages(rbio);
1613 
1614 	atomic_set(&rbio->error, 0);
1615 	/*
1616 	 * build a list of bios to read all the missing parts of this
1617 	 * stripe
1618 	 */
1619 	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1620 		for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
1621 			struct sector_ptr *sector;
1622 
1623 			/*
1624 			 * We want to find all the sectors missing from the
1625 			 * rbio and read them from the disk.  If * sector_in_rbio()
1626 			 * finds a page in the bio list we don't need to read
1627 			 * it off the stripe.
1628 			 */
1629 			sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1630 			if (sector)
1631 				continue;
1632 
1633 			sector = rbio_stripe_sector(rbio, stripe, sectornr);
1634 			/*
1635 			 * The bio cache may have handed us an uptodate page.
1636 			 * If so, be happy and use it.
1637 			 */
1638 			if (sector->uptodate)
1639 				continue;
1640 
1641 			ret = rbio_add_io_sector(rbio, &bio_list, sector,
1642 				       stripe, sectornr, rbio->stripe_len,
1643 				       REQ_OP_READ);
1644 			if (ret)
1645 				goto cleanup;
1646 		}
1647 	}
1648 
1649 	bios_to_read = bio_list_size(&bio_list);
1650 	if (!bios_to_read) {
1651 		/*
1652 		 * this can happen if others have merged with
1653 		 * us, it means there is nothing left to read.
1654 		 * But if there are missing devices it may not be
1655 		 * safe to do the full stripe write yet.
1656 		 */
1657 		goto finish;
1658 	}
1659 
1660 	/*
1661 	 * The bioc may be freed once we submit the last bio. Make sure not to
1662 	 * touch it after that.
1663 	 */
1664 	atomic_set(&rbio->stripes_pending, bios_to_read);
1665 	while ((bio = bio_list_pop(&bio_list))) {
1666 		bio->bi_end_io = raid_rmw_end_io;
1667 
1668 		btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
1669 
1670 		submit_bio(bio);
1671 	}
1672 	/* the actual write will happen once the reads are done */
1673 	return 0;
1674 
1675 cleanup:
1676 	rbio_orig_end_io(rbio, BLK_STS_IOERR);
1677 
1678 	while ((bio = bio_list_pop(&bio_list)))
1679 		bio_put(bio);
1680 
1681 	return -EIO;
1682 
1683 finish:
1684 	validate_rbio_for_rmw(rbio);
1685 	return 0;
1686 }
1687 
/*
 * If the upper layers pass in a full stripe, we thank them by only allocating
 * enough pages to hold the parity, and sending it all down quickly.
 *
 * Return 0 once the rbio went through lock_stripe_add(); finish_rmw() is
 * only called here when lock_stripe_add() returned 0.  When the parity pages
 * cannot be allocated the rbio is freed and the allocation error returned.
 */
static int full_stripe_write(struct btrfs_raid_bio *rbio)
{
	const int err = alloc_rbio_parity_pages(rbio);

	if (err) {
		__free_raid_bio(rbio);
		return err;
	}

	if (lock_stripe_add(rbio) == 0)
		finish_rmw(rbio);

	return 0;
}
1707 
1708 /*
1709  * partial stripe writes get handed over to async helpers.
1710  * We're really hoping to merge a few more writes into this
1711  * rbio before calculating new parity
1712  */
1713 static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1714 {
1715 	int ret;
1716 
1717 	ret = lock_stripe_add(rbio);
1718 	if (ret == 0)
1719 		start_async_work(rbio, rmw_work);
1720 	return 0;
1721 }
1722 
/*
 * Dispatch a write rbio depending on whether it covers the full stripe.
 *
 * Sometimes, while an rbio was waiting, enough new bios come in to create a
 * full stripe.  So we check here: a full rbio takes the full stripe write
 * path, anything else heads off into rmw land.
 */
static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
{
	if (rbio_is_full(rbio))
		return full_stripe_write(rbio);

	/* Not a full stripe, the rmw cycle has to do the work */
	return partial_stripe_write(rbio);
}
1736 
/*
 * We use plugging call backs to collect full stripes.
 * Any time we get a partial stripe write while plugged
 * we collect it into a list.  When the unplug comes down,
 * we sort the list by logical block number and merge
 * everything we can into the same rbios
 */
struct btrfs_plug_cb {
	/*
	 * Block layer plug callback; container_of() on it gives back this
	 * structure.
	 * NOTE(review): presumably has to stay the first member, since
	 * blk_check_plugged() is asked for sizeof(struct btrfs_plug_cb) and
	 * hands back a struct blk_plug_cb pointer - confirm.
	 */
	struct blk_plug_cb cb;
	/* Set on first use in raid56_parity_write(); provides rmw_workers */
	struct btrfs_fs_info *info;
	/* rbios collected while plugged, linked through rbio->plug_list */
	struct list_head rbio_list;
	/* Runs run_plug() from a worker when the unplug comes from schedule */
	struct work_struct work;
};
1750 
1751 /*
1752  * rbios on the plug list are sorted for easier merging.
1753  */
1754 static int plug_cmp(void *priv, const struct list_head *a,
1755 		    const struct list_head *b)
1756 {
1757 	const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1758 						       plug_list);
1759 	const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1760 						       plug_list);
1761 	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1762 	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
1763 
1764 	if (a_sector < b_sector)
1765 		return -1;
1766 	if (a_sector > b_sector)
1767 		return 1;
1768 	return 0;
1769 }
1770 
1771 static void run_plug(struct btrfs_plug_cb *plug)
1772 {
1773 	struct btrfs_raid_bio *cur;
1774 	struct btrfs_raid_bio *last = NULL;
1775 
1776 	/*
1777 	 * sort our plug list then try to merge
1778 	 * everything we can in hopes of creating full
1779 	 * stripes.
1780 	 */
1781 	list_sort(NULL, &plug->rbio_list, plug_cmp);
1782 	while (!list_empty(&plug->rbio_list)) {
1783 		cur = list_entry(plug->rbio_list.next,
1784 				 struct btrfs_raid_bio, plug_list);
1785 		list_del_init(&cur->plug_list);
1786 
1787 		if (rbio_is_full(cur)) {
1788 			int ret;
1789 
1790 			/* we have a full stripe, send it down */
1791 			ret = full_stripe_write(cur);
1792 			BUG_ON(ret);
1793 			continue;
1794 		}
1795 		if (last) {
1796 			if (rbio_can_merge(last, cur)) {
1797 				merge_rbio(last, cur);
1798 				__free_raid_bio(cur);
1799 				continue;
1800 
1801 			}
1802 			__raid56_parity_write(last);
1803 		}
1804 		last = cur;
1805 	}
1806 	if (last) {
1807 		__raid56_parity_write(last);
1808 	}
1809 	kfree(plug);
1810 }
1811 
1812 /*
1813  * if the unplug comes from schedule, we have to push the
1814  * work off to a helper thread
1815  */
1816 static void unplug_work(struct work_struct *work)
1817 {
1818 	struct btrfs_plug_cb *plug;
1819 	plug = container_of(work, struct btrfs_plug_cb, work);
1820 	run_plug(plug);
1821 }
1822 
1823 static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1824 {
1825 	struct btrfs_plug_cb *plug;
1826 	plug = container_of(cb, struct btrfs_plug_cb, cb);
1827 
1828 	if (from_schedule) {
1829 		INIT_WORK(&plug->work, unplug_work);
1830 		queue_work(plug->info->rmw_workers, &plug->work);
1831 		return;
1832 	}
1833 	run_plug(plug);
1834 }
1835 
1836 /*
1837  * our main entry point for writes from the rest of the FS.
1838  */
1839 int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stripe_len)
1840 {
1841 	struct btrfs_fs_info *fs_info = bioc->fs_info;
1842 	struct btrfs_raid_bio *rbio;
1843 	struct btrfs_plug_cb *plug = NULL;
1844 	struct blk_plug_cb *cb;
1845 	int ret;
1846 
1847 	rbio = alloc_rbio(fs_info, bioc, stripe_len);
1848 	if (IS_ERR(rbio)) {
1849 		btrfs_put_bioc(bioc);
1850 		return PTR_ERR(rbio);
1851 	}
1852 	bio_list_add(&rbio->bio_list, bio);
1853 	rbio->bio_list_bytes = bio->bi_iter.bi_size;
1854 	rbio->operation = BTRFS_RBIO_WRITE;
1855 
1856 	btrfs_bio_counter_inc_noblocked(fs_info);
1857 	rbio->generic_bio_cnt = 1;
1858 
1859 	/*
1860 	 * don't plug on full rbios, just get them out the door
1861 	 * as quickly as we can
1862 	 */
1863 	if (rbio_is_full(rbio)) {
1864 		ret = full_stripe_write(rbio);
1865 		if (ret)
1866 			btrfs_bio_counter_dec(fs_info);
1867 		return ret;
1868 	}
1869 
1870 	cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
1871 	if (cb) {
1872 		plug = container_of(cb, struct btrfs_plug_cb, cb);
1873 		if (!plug->info) {
1874 			plug->info = fs_info;
1875 			INIT_LIST_HEAD(&plug->rbio_list);
1876 		}
1877 		list_add_tail(&rbio->plug_list, &plug->rbio_list);
1878 		ret = 0;
1879 	} else {
1880 		ret = __raid56_parity_write(rbio);
1881 		if (ret)
1882 			btrfs_bio_counter_dec(fs_info);
1883 	}
1884 	return ret;
1885 }
1886 
1887 /*
1888  * all parity reconstruction happens here.  We've read in everything
1889  * we can find from the drives and this does the heavy lifting of
1890  * sorting the good from the bad.
1891  */
1892 static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1893 {
1894 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1895 	int sectornr, stripe;
1896 	void **pointers;
1897 	void **unmap_array;
1898 	int faila = -1, failb = -1;
1899 	blk_status_t err;
1900 	int i;
1901 
1902 	/*
1903 	 * This array stores the pointer for each sector, thus it has the extra
1904 	 * pgoff value added from each sector
1905 	 */
1906 	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1907 	if (!pointers) {
1908 		err = BLK_STS_RESOURCE;
1909 		goto cleanup_io;
1910 	}
1911 
1912 	/*
1913 	 * Store copy of pointers that does not get reordered during
1914 	 * reconstruction so that kunmap_local works.
1915 	 */
1916 	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1917 	if (!unmap_array) {
1918 		err = BLK_STS_RESOURCE;
1919 		goto cleanup_pointers;
1920 	}
1921 
1922 	faila = rbio->faila;
1923 	failb = rbio->failb;
1924 
1925 	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1926 	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
1927 		spin_lock_irq(&rbio->bio_list_lock);
1928 		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1929 		spin_unlock_irq(&rbio->bio_list_lock);
1930 	}
1931 
1932 	index_rbio_pages(rbio);
1933 
1934 	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
1935 		struct sector_ptr *sector;
1936 
1937 		/*
1938 		 * Now we just use bitmap to mark the horizontal stripes in
1939 		 * which we have data when doing parity scrub.
1940 		 */
1941 		if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
1942 		    !test_bit(sectornr, rbio->dbitmap))
1943 			continue;
1944 
1945 		/*
1946 		 * Setup our array of pointers with sectors from each stripe
1947 		 *
1948 		 * NOTE: store a duplicate array of pointers to preserve the
1949 		 * pointer order
1950 		 */
1951 		for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1952 			/*
1953 			 * If we're rebuilding a read, we have to use
1954 			 * pages from the bio list
1955 			 */
1956 			if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1957 			     rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
1958 			    (stripe == faila || stripe == failb)) {
1959 				sector = sector_in_rbio(rbio, stripe, sectornr, 0);
1960 			} else {
1961 				sector = rbio_stripe_sector(rbio, stripe, sectornr);
1962 			}
1963 			ASSERT(sector->page);
1964 			pointers[stripe] = kmap_local_page(sector->page) +
1965 					   sector->pgoff;
1966 			unmap_array[stripe] = pointers[stripe];
1967 		}
1968 
1969 		/* All raid6 handling here */
1970 		if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
1971 			/* Single failure, rebuild from parity raid5 style */
1972 			if (failb < 0) {
1973 				if (faila == rbio->nr_data) {
1974 					/*
1975 					 * Just the P stripe has failed, without
1976 					 * a bad data or Q stripe.
1977 					 * TODO, we should redo the xor here.
1978 					 */
1979 					err = BLK_STS_IOERR;
1980 					goto cleanup;
1981 				}
1982 				/*
1983 				 * a single failure in raid6 is rebuilt
1984 				 * in the pstripe code below
1985 				 */
1986 				goto pstripe;
1987 			}
1988 
1989 			/* make sure our ps and qs are in order */
1990 			if (faila > failb)
1991 				swap(faila, failb);
1992 
1993 			/* if the q stripe is failed, do a pstripe reconstruction
1994 			 * from the xors.
1995 			 * If both the q stripe and the P stripe are failed, we're
1996 			 * here due to a crc mismatch and we can't give them the
1997 			 * data they want
1998 			 */
1999 			if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
2000 				if (rbio->bioc->raid_map[faila] ==
2001 				    RAID5_P_STRIPE) {
2002 					err = BLK_STS_IOERR;
2003 					goto cleanup;
2004 				}
2005 				/*
2006 				 * otherwise we have one bad data stripe and
2007 				 * a good P stripe.  raid5!
2008 				 */
2009 				goto pstripe;
2010 			}
2011 
2012 			if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
2013 				raid6_datap_recov(rbio->real_stripes,
2014 						  sectorsize, faila, pointers);
2015 			} else {
2016 				raid6_2data_recov(rbio->real_stripes,
2017 						  sectorsize, faila, failb,
2018 						  pointers);
2019 			}
2020 		} else {
2021 			void *p;
2022 
2023 			/* rebuild from P stripe here (raid5 or raid6) */
2024 			BUG_ON(failb != -1);
2025 pstripe:
2026 			/* Copy parity block into failed block to start with */
2027 			memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);
2028 
2029 			/* rearrange the pointer array */
2030 			p = pointers[faila];
2031 			for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
2032 				pointers[stripe] = pointers[stripe + 1];
2033 			pointers[rbio->nr_data - 1] = p;
2034 
2035 			/* xor in the rest */
2036 			run_xor(pointers, rbio->nr_data - 1, sectorsize);
2037 		}
2038 		/* if we're doing this rebuild as part of an rmw, go through
2039 		 * and set all of our private rbio pages in the
2040 		 * failed stripes as uptodate.  This way finish_rmw will
2041 		 * know they can be trusted.  If this was a read reconstruction,
2042 		 * other endio functions will fiddle the uptodate bits
2043 		 */
2044 		if (rbio->operation == BTRFS_RBIO_WRITE) {
2045 			for (i = 0;  i < rbio->stripe_nsectors; i++) {
2046 				if (faila != -1) {
2047 					sector = rbio_stripe_sector(rbio, faila, i);
2048 					sector->uptodate = 1;
2049 				}
2050 				if (failb != -1) {
2051 					sector = rbio_stripe_sector(rbio, failb, i);
2052 					sector->uptodate = 1;
2053 				}
2054 			}
2055 		}
2056 		for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--)
2057 			kunmap_local(unmap_array[stripe]);
2058 	}
2059 
2060 	err = BLK_STS_OK;
2061 cleanup:
2062 	kfree(unmap_array);
2063 cleanup_pointers:
2064 	kfree(pointers);
2065 
2066 cleanup_io:
2067 	/*
2068 	 * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
2069 	 * valid rbio which is consistent with ondisk content, thus such a
2070 	 * valid rbio can be cached to avoid further disk reads.
2071 	 */
2072 	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
2073 	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
2074 		/*
2075 		 * - In case of two failures, where rbio->failb != -1:
2076 		 *
2077 		 *   Do not cache this rbio since the above read reconstruction
2078 		 *   (raid6_datap_recov() or raid6_2data_recov()) may have
2079 		 *   changed some content of stripes which are not identical to
2080 		 *   on-disk content any more, otherwise, a later write/recover
2081 		 *   may steal stripe_pages from this rbio and end up with
2082 		 *   corruptions or rebuild failures.
2083 		 *
2084 		 * - In case of single failure, where rbio->failb == -1:
2085 		 *
2086 		 *   Cache this rbio iff the above read reconstruction is
2087 		 *   executed without problems.
2088 		 */
2089 		if (err == BLK_STS_OK && rbio->failb < 0)
2090 			cache_rbio_pages(rbio);
2091 		else
2092 			clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2093 
2094 		rbio_orig_end_io(rbio, err);
2095 	} else if (err == BLK_STS_OK) {
2096 		rbio->faila = -1;
2097 		rbio->failb = -1;
2098 
2099 		if (rbio->operation == BTRFS_RBIO_WRITE)
2100 			finish_rmw(rbio);
2101 		else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
2102 			finish_parity_scrub(rbio, 0);
2103 		else
2104 			BUG();
2105 	} else {
2106 		rbio_orig_end_io(rbio, err);
2107 	}
2108 }
2109 
2110 /*
2111  * This is called only for stripes we've read from disk to
2112  * reconstruct the parity.
2113  */
2114 static void raid_recover_end_io(struct bio *bio)
2115 {
2116 	struct btrfs_raid_bio *rbio = bio->bi_private;
2117 
2118 	/*
2119 	 * we only read stripe pages off the disk, set them
2120 	 * up to date if there were no errors
2121 	 */
2122 	if (bio->bi_status)
2123 		fail_bio_stripe(rbio, bio);
2124 	else
2125 		set_bio_pages_uptodate(rbio, bio);
2126 	bio_put(bio);
2127 
2128 	if (!atomic_dec_and_test(&rbio->stripes_pending))
2129 		return;
2130 
2131 	if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
2132 		rbio_orig_end_io(rbio, BLK_STS_IOERR);
2133 	else
2134 		__raid_recover_end_io(rbio);
2135 }
2136 
2137 /*
2138  * reads everything we need off the disk to reconstruct
2139  * the parity. endio handlers trigger final reconstruction
2140  * when the IO is done.
2141  *
2142  * This is used both for reads from the higher layers and for
2143  * parity construction required to finish a rmw cycle.
2144  */
2145 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
2146 {
2147 	int bios_to_read = 0;
2148 	struct bio_list bio_list;
2149 	int ret;
2150 	int sectornr;
2151 	int stripe;
2152 	struct bio *bio;
2153 
2154 	bio_list_init(&bio_list);
2155 
2156 	ret = alloc_rbio_pages(rbio);
2157 	if (ret)
2158 		goto cleanup;
2159 
2160 	atomic_set(&rbio->error, 0);
2161 
2162 	/*
2163 	 * read everything that hasn't failed.  Thanks to the
2164 	 * stripe cache, it is possible that some or all of these
2165 	 * pages are going to be uptodate.
2166 	 */
2167 	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2168 		if (rbio->faila == stripe || rbio->failb == stripe) {
2169 			atomic_inc(&rbio->error);
2170 			continue;
2171 		}
2172 
2173 		for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
2174 			struct sector_ptr *sector;
2175 
2176 			/*
2177 			 * the rmw code may have already read this
2178 			 * page in
2179 			 */
2180 			sector = rbio_stripe_sector(rbio, stripe, sectornr);
2181 			if (sector->uptodate)
2182 				continue;
2183 
2184 			ret = rbio_add_io_sector(rbio, &bio_list, sector,
2185 						 stripe, sectornr, rbio->stripe_len,
2186 						 REQ_OP_READ);
2187 			if (ret < 0)
2188 				goto cleanup;
2189 		}
2190 	}
2191 
2192 	bios_to_read = bio_list_size(&bio_list);
2193 	if (!bios_to_read) {
2194 		/*
2195 		 * we might have no bios to read just because the pages
2196 		 * were up to date, or we might have no bios to read because
2197 		 * the devices were gone.
2198 		 */
2199 		if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) {
2200 			__raid_recover_end_io(rbio);
2201 			return 0;
2202 		} else {
2203 			goto cleanup;
2204 		}
2205 	}
2206 
2207 	/*
2208 	 * The bioc may be freed once we submit the last bio. Make sure not to
2209 	 * touch it after that.
2210 	 */
2211 	atomic_set(&rbio->stripes_pending, bios_to_read);
2212 	while ((bio = bio_list_pop(&bio_list))) {
2213 		bio->bi_end_io = raid_recover_end_io;
2214 
2215 		btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
2216 
2217 		submit_bio(bio);
2218 	}
2219 
2220 	return 0;
2221 
2222 cleanup:
2223 	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
2224 	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
2225 		rbio_orig_end_io(rbio, BLK_STS_IOERR);
2226 
2227 	while ((bio = bio_list_pop(&bio_list)))
2228 		bio_put(bio);
2229 
2230 	return -EIO;
2231 }
2232 
2233 /*
2234  * the main entry point for reads from the higher layers.  This
2235  * is really only called when the normal read path had a failure,
2236  * so we assume the bio they send down corresponds to a failed part
2237  * of the drive.
2238  */
2239 int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
2240 			  u32 stripe_len, int mirror_num, int generic_io)
2241 {
2242 	struct btrfs_fs_info *fs_info = bioc->fs_info;
2243 	struct btrfs_raid_bio *rbio;
2244 	int ret;
2245 
2246 	if (generic_io) {
2247 		ASSERT(bioc->mirror_num == mirror_num);
2248 		btrfs_bio(bio)->mirror_num = mirror_num;
2249 	}
2250 
2251 	rbio = alloc_rbio(fs_info, bioc, stripe_len);
2252 	if (IS_ERR(rbio)) {
2253 		if (generic_io)
2254 			btrfs_put_bioc(bioc);
2255 		return PTR_ERR(rbio);
2256 	}
2257 
2258 	rbio->operation = BTRFS_RBIO_READ_REBUILD;
2259 	bio_list_add(&rbio->bio_list, bio);
2260 	rbio->bio_list_bytes = bio->bi_iter.bi_size;
2261 
2262 	rbio->faila = find_logical_bio_stripe(rbio, bio);
2263 	if (rbio->faila == -1) {
2264 		btrfs_warn(fs_info,
2265 "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
2266 			   __func__, bio->bi_iter.bi_sector << 9,
2267 			   (u64)bio->bi_iter.bi_size, bioc->map_type);
2268 		if (generic_io)
2269 			btrfs_put_bioc(bioc);
2270 		kfree(rbio);
2271 		return -EIO;
2272 	}
2273 
2274 	if (generic_io) {
2275 		btrfs_bio_counter_inc_noblocked(fs_info);
2276 		rbio->generic_bio_cnt = 1;
2277 	} else {
2278 		btrfs_get_bioc(bioc);
2279 	}
2280 
2281 	/*
2282 	 * Loop retry:
2283 	 * for 'mirror == 2', reconstruct from all other stripes.
2284 	 * for 'mirror_num > 2', select a stripe to fail on every retry.
2285 	 */
2286 	if (mirror_num > 2) {
2287 		/*
2288 		 * 'mirror == 3' is to fail the p stripe and
2289 		 * reconstruct from the q stripe.  'mirror > 3' is to
2290 		 * fail a data stripe and reconstruct from p+q stripe.
2291 		 */
2292 		rbio->failb = rbio->real_stripes - (mirror_num - 1);
2293 		ASSERT(rbio->failb > 0);
2294 		if (rbio->failb <= rbio->faila)
2295 			rbio->failb--;
2296 	}
2297 
2298 	ret = lock_stripe_add(rbio);
2299 
2300 	/*
2301 	 * __raid56_parity_recover will end the bio with
2302 	 * any errors it hits.  We don't want to return
2303 	 * its error value up the stack because our caller
2304 	 * will end up calling bio_endio with any nonzero
2305 	 * return
2306 	 */
2307 	if (ret == 0)
2308 		__raid56_parity_recover(rbio);
2309 	/*
2310 	 * our rbio has been added to the list of
2311 	 * rbios that will be handled after the
2312 	 * currently lock owner is done
2313 	 */
2314 	return 0;
2315 
2316 }
2317 
2318 static void rmw_work(struct work_struct *work)
2319 {
2320 	struct btrfs_raid_bio *rbio;
2321 
2322 	rbio = container_of(work, struct btrfs_raid_bio, work);
2323 	raid56_rmw_stripe(rbio);
2324 }
2325 
2326 static void read_rebuild_work(struct work_struct *work)
2327 {
2328 	struct btrfs_raid_bio *rbio;
2329 
2330 	rbio = container_of(work, struct btrfs_raid_bio, work);
2331 	__raid56_parity_recover(rbio);
2332 }
2333 
2334 /*
2335  * The following code is used to scrub/replace the parity stripe
2336  *
2337  * Caller must have already increased bio_counter for getting @bioc.
2338  *
2339  * Note: We need make sure all the pages that add into the scrub/replace
2340  * raid bio are correct and not be changed during the scrub/replace. That
2341  * is those pages just hold metadata or file data with checksum.
2342  */
2343 
2344 struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
2345 				struct btrfs_io_context *bioc,
2346 				u32 stripe_len, struct btrfs_device *scrub_dev,
2347 				unsigned long *dbitmap, int stripe_nsectors)
2348 {
2349 	struct btrfs_fs_info *fs_info = bioc->fs_info;
2350 	struct btrfs_raid_bio *rbio;
2351 	int i;
2352 
2353 	rbio = alloc_rbio(fs_info, bioc, stripe_len);
2354 	if (IS_ERR(rbio))
2355 		return NULL;
2356 	bio_list_add(&rbio->bio_list, bio);
2357 	/*
2358 	 * This is a special bio which is used to hold the completion handler
2359 	 * and make the scrub rbio is similar to the other types
2360 	 */
2361 	ASSERT(!bio->bi_iter.bi_size);
2362 	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2363 
2364 	/*
2365 	 * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
2366 	 * to the end position, so this search can start from the first parity
2367 	 * stripe.
2368 	 */
2369 	for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
2370 		if (bioc->stripes[i].dev == scrub_dev) {
2371 			rbio->scrubp = i;
2372 			break;
2373 		}
2374 	}
2375 	ASSERT(i < rbio->real_stripes);
2376 
2377 	bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
2378 
2379 	/*
2380 	 * We have already increased bio_counter when getting bioc, record it
2381 	 * so we can free it at rbio_orig_end_io().
2382 	 */
2383 	rbio->generic_bio_cnt = 1;
2384 
2385 	return rbio;
2386 }
2387 
2388 /* Used for both parity scrub and missing. */
2389 void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
2390 			    unsigned int pgoff, u64 logical)
2391 {
2392 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
2393 	int stripe_offset;
2394 	int index;
2395 
2396 	ASSERT(logical >= rbio->bioc->raid_map[0]);
2397 	ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] +
2398 				rbio->stripe_len * rbio->nr_data);
2399 	stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
2400 	index = stripe_offset / sectorsize;
2401 	rbio->bio_sectors[index].page = page;
2402 	rbio->bio_sectors[index].pgoff = pgoff;
2403 }
2404 
2405 /*
2406  * We just scrub the parity that we have correct data on the same horizontal,
2407  * so we needn't allocate all pages for all the stripes.
2408  */
2409 static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2410 {
2411 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
2412 	int stripe;
2413 	int sectornr;
2414 
2415 	for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) {
2416 		for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2417 			struct page *page;
2418 			int index = (stripe * rbio->stripe_nsectors + sectornr) *
2419 				    sectorsize >> PAGE_SHIFT;
2420 
2421 			if (rbio->stripe_pages[index])
2422 				continue;
2423 
2424 			page = alloc_page(GFP_NOFS);
2425 			if (!page)
2426 				return -ENOMEM;
2427 			rbio->stripe_pages[index] = page;
2428 		}
2429 	}
2430 	index_stripe_sectors(rbio);
2431 	return 0;
2432 }
2433 
2434 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
2435 					 int need_check)
2436 {
2437 	struct btrfs_io_context *bioc = rbio->bioc;
2438 	const u32 sectorsize = bioc->fs_info->sectorsize;
2439 	void **pointers = rbio->finish_pointers;
2440 	unsigned long *pbitmap = rbio->finish_pbitmap;
2441 	int nr_data = rbio->nr_data;
2442 	int stripe;
2443 	int sectornr;
2444 	bool has_qstripe;
2445 	struct sector_ptr p_sector = { 0 };
2446 	struct sector_ptr q_sector = { 0 };
2447 	struct bio_list bio_list;
2448 	struct bio *bio;
2449 	int is_replace = 0;
2450 	int ret;
2451 
2452 	bio_list_init(&bio_list);
2453 
2454 	if (rbio->real_stripes - rbio->nr_data == 1)
2455 		has_qstripe = false;
2456 	else if (rbio->real_stripes - rbio->nr_data == 2)
2457 		has_qstripe = true;
2458 	else
2459 		BUG();
2460 
2461 	if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
2462 		is_replace = 1;
2463 		bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_nsectors);
2464 	}
2465 
2466 	/*
2467 	 * Because the higher layers(scrubber) are unlikely to
2468 	 * use this area of the disk again soon, so don't cache
2469 	 * it.
2470 	 */
2471 	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2472 
2473 	if (!need_check)
2474 		goto writeback;
2475 
2476 	p_sector.page = alloc_page(GFP_NOFS);
2477 	if (!p_sector.page)
2478 		goto cleanup;
2479 	p_sector.pgoff = 0;
2480 	p_sector.uptodate = 1;
2481 
2482 	if (has_qstripe) {
2483 		/* RAID6, allocate and map temp space for the Q stripe */
2484 		q_sector.page = alloc_page(GFP_NOFS);
2485 		if (!q_sector.page) {
2486 			__free_page(p_sector.page);
2487 			p_sector.page = NULL;
2488 			goto cleanup;
2489 		}
2490 		q_sector.pgoff = 0;
2491 		q_sector.uptodate = 1;
2492 		pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
2493 	}
2494 
2495 	atomic_set(&rbio->error, 0);
2496 
2497 	/* Map the parity stripe just once */
2498 	pointers[nr_data] = kmap_local_page(p_sector.page);
2499 
2500 	for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) {
2501 		struct sector_ptr *sector;
2502 		void *parity;
2503 
2504 		/* first collect one page from each data stripe */
2505 		for (stripe = 0; stripe < nr_data; stripe++) {
2506 			sector = sector_in_rbio(rbio, stripe, sectornr, 0);
2507 			pointers[stripe] = kmap_local_page(sector->page) +
2508 					   sector->pgoff;
2509 		}
2510 
2511 		if (has_qstripe) {
2512 			/* RAID6, call the library function to fill in our P/Q */
2513 			raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
2514 						pointers);
2515 		} else {
2516 			/* raid5 */
2517 			memcpy(pointers[nr_data], pointers[0], sectorsize);
2518 			run_xor(pointers + 1, nr_data - 1, sectorsize);
2519 		}
2520 
2521 		/* Check scrubbing parity and repair it */
2522 		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2523 		parity = kmap_local_page(sector->page) + sector->pgoff;
2524 		if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
2525 			memcpy(parity, pointers[rbio->scrubp], sectorsize);
2526 		else
2527 			/* Parity is right, needn't writeback */
2528 			bitmap_clear(rbio->dbitmap, sectornr, 1);
2529 		kunmap_local(parity);
2530 
2531 		for (stripe = nr_data - 1; stripe >= 0; stripe--)
2532 			kunmap_local(pointers[stripe]);
2533 	}
2534 
2535 	kunmap_local(pointers[nr_data]);
2536 	__free_page(p_sector.page);
2537 	p_sector.page = NULL;
2538 	if (q_sector.page) {
2539 		kunmap_local(pointers[rbio->real_stripes - 1]);
2540 		__free_page(q_sector.page);
2541 		q_sector.page = NULL;
2542 	}
2543 
2544 writeback:
2545 	/*
2546 	 * time to start writing.  Make bios for everything from the
2547 	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
2548 	 * everything else.
2549 	 */
2550 	for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) {
2551 		struct sector_ptr *sector;
2552 
2553 		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2554 		ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
2555 					 sectornr, rbio->stripe_len, REQ_OP_WRITE);
2556 		if (ret)
2557 			goto cleanup;
2558 	}
2559 
2560 	if (!is_replace)
2561 		goto submit_write;
2562 
2563 	for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
2564 		struct sector_ptr *sector;
2565 
2566 		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2567 		ret = rbio_add_io_sector(rbio, &bio_list, sector,
2568 				       bioc->tgtdev_map[rbio->scrubp],
2569 				       sectornr, rbio->stripe_len, REQ_OP_WRITE);
2570 		if (ret)
2571 			goto cleanup;
2572 	}
2573 
2574 submit_write:
2575 	nr_data = bio_list_size(&bio_list);
2576 	if (!nr_data) {
2577 		/* Every parity is right */
2578 		rbio_orig_end_io(rbio, BLK_STS_OK);
2579 		return;
2580 	}
2581 
2582 	atomic_set(&rbio->stripes_pending, nr_data);
2583 
2584 	while ((bio = bio_list_pop(&bio_list))) {
2585 		bio->bi_end_io = raid_write_end_io;
2586 
2587 		submit_bio(bio);
2588 	}
2589 	return;
2590 
2591 cleanup:
2592 	rbio_orig_end_io(rbio, BLK_STS_IOERR);
2593 
2594 	while ((bio = bio_list_pop(&bio_list)))
2595 		bio_put(bio);
2596 }
2597 
2598 static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2599 {
2600 	if (stripe >= 0 && stripe < rbio->nr_data)
2601 		return 1;
2602 	return 0;
2603 }
2604 
2605 /*
2606  * While we're doing the parity check and repair, we could have errors
2607  * in reading pages off the disk.  This checks for errors and if we're
2608  * not able to read the page it'll trigger parity reconstruction.  The
2609  * parity scrub will be finished after we've reconstructed the failed
2610  * stripes
2611  */
2612 static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
2613 {
2614 	if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
2615 		goto cleanup;
2616 
2617 	if (rbio->faila >= 0 || rbio->failb >= 0) {
2618 		int dfail = 0, failp = -1;
2619 
2620 		if (is_data_stripe(rbio, rbio->faila))
2621 			dfail++;
2622 		else if (is_parity_stripe(rbio->faila))
2623 			failp = rbio->faila;
2624 
2625 		if (is_data_stripe(rbio, rbio->failb))
2626 			dfail++;
2627 		else if (is_parity_stripe(rbio->failb))
2628 			failp = rbio->failb;
2629 
2630 		/*
2631 		 * Because we can not use a scrubbing parity to repair
2632 		 * the data, so the capability of the repair is declined.
2633 		 * (In the case of RAID5, we can not repair anything)
2634 		 */
2635 		if (dfail > rbio->bioc->max_errors - 1)
2636 			goto cleanup;
2637 
2638 		/*
2639 		 * If all data is good, only parity is correctly, just
2640 		 * repair the parity.
2641 		 */
2642 		if (dfail == 0) {
2643 			finish_parity_scrub(rbio, 0);
2644 			return;
2645 		}
2646 
2647 		/*
2648 		 * Here means we got one corrupted data stripe and one
2649 		 * corrupted parity on RAID6, if the corrupted parity
2650 		 * is scrubbing parity, luckily, use the other one to repair
2651 		 * the data, or we can not repair the data stripe.
2652 		 */
2653 		if (failp != rbio->scrubp)
2654 			goto cleanup;
2655 
2656 		__raid_recover_end_io(rbio);
2657 	} else {
2658 		finish_parity_scrub(rbio, 1);
2659 	}
2660 	return;
2661 
2662 cleanup:
2663 	rbio_orig_end_io(rbio, BLK_STS_IOERR);
2664 }
2665 
2666 /*
2667  * end io for the read phase of the rmw cycle.  All the bios here are physical
2668  * stripe bios we've read from the disk so we can recalculate the parity of the
2669  * stripe.
2670  *
2671  * This will usually kick off finish_rmw once all the bios are read in, but it
2672  * may trigger parity reconstruction if we had any errors along the way
2673  */
2674 static void raid56_parity_scrub_end_io(struct bio *bio)
2675 {
2676 	struct btrfs_raid_bio *rbio = bio->bi_private;
2677 
2678 	if (bio->bi_status)
2679 		fail_bio_stripe(rbio, bio);
2680 	else
2681 		set_bio_pages_uptodate(rbio, bio);
2682 
2683 	bio_put(bio);
2684 
2685 	if (!atomic_dec_and_test(&rbio->stripes_pending))
2686 		return;
2687 
2688 	/*
2689 	 * this will normally call finish_rmw to start our write
2690 	 * but if there are any failed stripes we'll reconstruct
2691 	 * from parity first
2692 	 */
2693 	validate_rbio_for_parity_scrub(rbio);
2694 }
2695 
2696 static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
2697 {
2698 	int bios_to_read = 0;
2699 	struct bio_list bio_list;
2700 	int ret;
2701 	int sectornr;
2702 	int stripe;
2703 	struct bio *bio;
2704 
2705 	bio_list_init(&bio_list);
2706 
2707 	ret = alloc_rbio_essential_pages(rbio);
2708 	if (ret)
2709 		goto cleanup;
2710 
2711 	atomic_set(&rbio->error, 0);
2712 	/*
2713 	 * build a list of bios to read all the missing parts of this
2714 	 * stripe
2715 	 */
2716 	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2717 		for_each_set_bit(sectornr , rbio->dbitmap, rbio->stripe_nsectors) {
2718 			struct sector_ptr *sector;
2719 			/*
2720 			 * We want to find all the sectors missing from the
2721 			 * rbio and read them from the disk.  If * sector_in_rbio()
2722 			 * finds a sector in the bio list we don't need to read
2723 			 * it off the stripe.
2724 			 */
2725 			sector = sector_in_rbio(rbio, stripe, sectornr, 1);
2726 			if (sector)
2727 				continue;
2728 
2729 			sector = rbio_stripe_sector(rbio, stripe, sectornr);
2730 			/*
2731 			 * The bio cache may have handed us an uptodate sector.
2732 			 * If so, be happy and use it.
2733 			 */
2734 			if (sector->uptodate)
2735 				continue;
2736 
2737 			ret = rbio_add_io_sector(rbio, &bio_list, sector,
2738 						 stripe, sectornr, rbio->stripe_len,
2739 						 REQ_OP_READ);
2740 			if (ret)
2741 				goto cleanup;
2742 		}
2743 	}
2744 
2745 	bios_to_read = bio_list_size(&bio_list);
2746 	if (!bios_to_read) {
2747 		/*
2748 		 * this can happen if others have merged with
2749 		 * us, it means there is nothing left to read.
2750 		 * But if there are missing devices it may not be
2751 		 * safe to do the full stripe write yet.
2752 		 */
2753 		goto finish;
2754 	}
2755 
2756 	/*
2757 	 * The bioc may be freed once we submit the last bio. Make sure not to
2758 	 * touch it after that.
2759 	 */
2760 	atomic_set(&rbio->stripes_pending, bios_to_read);
2761 	while ((bio = bio_list_pop(&bio_list))) {
2762 		bio->bi_end_io = raid56_parity_scrub_end_io;
2763 
2764 		btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
2765 
2766 		submit_bio(bio);
2767 	}
2768 	/* the actual write will happen once the reads are done */
2769 	return;
2770 
2771 cleanup:
2772 	rbio_orig_end_io(rbio, BLK_STS_IOERR);
2773 
2774 	while ((bio = bio_list_pop(&bio_list)))
2775 		bio_put(bio);
2776 
2777 	return;
2778 
2779 finish:
2780 	validate_rbio_for_parity_scrub(rbio);
2781 }
2782 
2783 static void scrub_parity_work(struct work_struct *work)
2784 {
2785 	struct btrfs_raid_bio *rbio;
2786 
2787 	rbio = container_of(work, struct btrfs_raid_bio, work);
2788 	raid56_parity_scrub_stripe(rbio);
2789 }
2790 
2791 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
2792 {
2793 	if (!lock_stripe_add(rbio))
2794 		start_async_work(rbio, scrub_parity_work);
2795 }
2796 
2797 /* The following code is used for dev replace of a missing RAID 5/6 device. */
2798 
2799 struct btrfs_raid_bio *
2800 raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
2801 			  u64 length)
2802 {
2803 	struct btrfs_fs_info *fs_info = bioc->fs_info;
2804 	struct btrfs_raid_bio *rbio;
2805 
2806 	rbio = alloc_rbio(fs_info, bioc, length);
2807 	if (IS_ERR(rbio))
2808 		return NULL;
2809 
2810 	rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
2811 	bio_list_add(&rbio->bio_list, bio);
2812 	/*
2813 	 * This is a special bio which is used to hold the completion handler
2814 	 * and make the scrub rbio is similar to the other types
2815 	 */
2816 	ASSERT(!bio->bi_iter.bi_size);
2817 
2818 	rbio->faila = find_logical_bio_stripe(rbio, bio);
2819 	if (rbio->faila == -1) {
2820 		BUG();
2821 		kfree(rbio);
2822 		return NULL;
2823 	}
2824 
2825 	/*
2826 	 * When we get bioc, we have already increased bio_counter, record it
2827 	 * so we can free it at rbio_orig_end_io()
2828 	 */
2829 	rbio->generic_bio_cnt = 1;
2830 
2831 	return rbio;
2832 }
2833 
2834 void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
2835 {
2836 	if (!lock_stripe_add(rbio))
2837 		start_async_work(rbio, read_rebuild_work);
2838 }
2839