xref: /linux/fs/btrfs/raid56.c (revision 3f0a50f345f78183f6e9b39c2f45ca5dcaa511ca)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2012 Fusion-io  All rights reserved.
4  * Copyright (C) 2012 Intel Corp. All rights reserved.
5  */
6 
7 #include <linux/sched.h>
8 #include <linux/bio.h>
9 #include <linux/slab.h>
10 #include <linux/blkdev.h>
11 #include <linux/raid/pq.h>
12 #include <linux/hash.h>
13 #include <linux/list_sort.h>
14 #include <linux/raid/xor.h>
15 #include <linux/mm.h>
16 #include "misc.h"
17 #include "ctree.h"
18 #include "disk-io.h"
19 #include "volumes.h"
20 #include "raid56.h"
21 #include "async-thread.h"
22 
23 /* set when additional merges to this rbio are not allowed */
24 #define RBIO_RMW_LOCKED_BIT	1
25 
26 /*
27  * set when this rbio is sitting in the hash, but it is just a cache
28  * of past RMW
29  */
30 #define RBIO_CACHE_BIT		2
31 
32 /*
33  * set when it is safe to trust the stripe_pages for caching
34  */
35 #define RBIO_CACHE_READY_BIT	3
36 
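/* maximum number of cached rbios kept on the stripe cache LRU list before we prune */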
37 #define RBIO_CACHE_SIZE 1024
38 
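/* the stripe hash table has 1 << BTRFS_STRIPE_HASH_TABLE_BITS (2048) buckets */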
39 #define BTRFS_STRIPE_HASH_TABLE_BITS				11
40 
41 /* Used by the raid56 code to lock stripes for read/modify/write */
42 struct btrfs_stripe_hash {
43 	struct list_head hash_list;
44 	spinlock_t lock;
45 };
46 
47 /* Used by the raid56 code to lock stripes for read/modify/write */
48 struct btrfs_stripe_hash_table {
49 	struct list_head stripe_cache;
50 	spinlock_t cache_lock;
51 	int cache_size;
52 	struct btrfs_stripe_hash table[];
53 };
54 
55 enum btrfs_rbio_ops {
56 	BTRFS_RBIO_WRITE,
57 	BTRFS_RBIO_READ_REBUILD,
58 	BTRFS_RBIO_PARITY_SCRUB,
59 	BTRFS_RBIO_REBUILD_MISSING,
60 };
61 
62 struct btrfs_raid_bio {
63 	struct btrfs_io_context *bioc;
64 
65 	/* while we're doing rmw on a stripe
66 	 * we put it into a hash table so we can
67 	 * lock the stripe and merge more rbios
68 	 * into it.
69 	 */
70 	struct list_head hash_list;
71 
72 	/*
73 	 * LRU list for the stripe cache
74 	 */
75 	struct list_head stripe_cache;
76 
77 	/*
78 	 * for scheduling work in the helper threads
79 	 */
80 	struct btrfs_work work;
81 
82 	/*
83 	 * bio list and bio_list_lock are used
84 	 * to add more bios into the stripe
85 	 * in hopes of avoiding the full rmw
86 	 */
87 	struct bio_list bio_list;
88 	spinlock_t bio_list_lock;
89 
90 	/* also protected by the bio_list_lock, the
91 	 * plug list is used by the plugging code
92 	 * to collect partial bios while plugged.  The
93 	 * stripe locking code also uses it to hand off
94 	 * the stripe lock to the next pending IO
95 	 */
96 	struct list_head plug_list;
97 
98 	/*
99 	 * flags that tell us if it is safe to
100 	 * merge with this rbio
101 	 */
102 	unsigned long flags;
103 
104 	/* size of each individual stripe on disk */
105 	int stripe_len;
106 
107 	/* number of data stripes (no p/q) */
108 	int nr_data;
109 
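	/*
	 * number of stripes in the full stripe, including P/Q
	 * (bioc->num_stripes minus any replace target devices)
	 */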
110 	int real_stripes;
111 
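	/* number of pages in each stripe (stripe_len / PAGE_SIZE) */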
112 	int stripe_npages;
113 	/*
114 	 * the operation this rbio is carrying out.  A parity
115 	 * rebuild for a read from higher up is handled
116 	 * differently from a parity rebuild as part of
117 	 * rmw
118 	 */
119 	enum btrfs_rbio_ops operation;
120 
121 	/* first bad stripe */
122 	int faila;
123 
124 	/* second bad stripe (for raid6 use) */
125 	int failb;
126 
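	/* stripe number being checked when this rbio is part of a parity scrub */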
127 	int scrubp;
128 	/*
129 	 * number of pages needed to represent the full
130 	 * stripe
131 	 */
132 	int nr_pages;
133 
134 	/*
135 	 * size of all the bios in the bio_list.  This
136 	 * helps us decide if the rbio maps to a full
137 	 * stripe or not
138 	 */
139 	int bio_list_bytes;
140 
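	/*
	 * number of bios accounted in the fs_info bio counter,
	 * dropped again in rbio_orig_end_io()
	 */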
141 	int generic_bio_cnt;
142 
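	/* reference count, the last put frees the rbio */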
143 	refcount_t refs;
144 
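	/* number of stripe bios still in flight for this rbio */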
145 	atomic_t stripes_pending;
146 
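	/* number of failed stripes, compared against bioc->max_errors */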
147 	atomic_t error;
148 	/*
149 	 * these are two arrays of pointers.  We allocate the
150 	 * rbio big enough to hold them both and setup their
151 	 * locations when the rbio is allocated
152 	 */
153 
154 	/* pointers to pages that we allocated for
155 	 * reading/writing stripes directly from the disk (including P/Q)
156 	 */
157 	struct page **stripe_pages;
158 
159 	/*
160 	 * pointers to the pages in the bio_list.  Stored
161 	 * here for faster lookup
162 	 */
163 	struct page **bio_pages;
164 
165 	/*
166 	 * bitmap to record which horizontal stripe has data
167 	 */
168 	unsigned long *dbitmap;
169 
170 	/* allocated with real_stripes-many pointers for finish_*() calls */
171 	void **finish_pointers;
172 
173 	/* allocated with stripe_npages-many bits for finish_*() calls */
174 	unsigned long *finish_pbitmap;
175 };
176 
177 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
178 static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
179 static void rmw_work(struct btrfs_work *work);
180 static void read_rebuild_work(struct btrfs_work *work);
181 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
182 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
183 static void __free_raid_bio(struct btrfs_raid_bio *rbio);
184 static void index_rbio_pages(struct btrfs_raid_bio *rbio);
185 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
186 
187 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
188 					 int need_check);
189 static void scrub_parity_work(struct btrfs_work *work);
190 
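/*
 * hand an rbio off to the rmw workqueue, where work_func will be run on it
 * by a helper thread
 */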
191 static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
192 {
193 	btrfs_init_work(&rbio->work, work_func, NULL, NULL);
194 	btrfs_queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
195 }
196 
197 /*
198  * the stripe hash table is used for locking, and to collect
199  * bios in hopes of making a full stripe
200  */
201 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
202 {
203 	struct btrfs_stripe_hash_table *table;
204 	struct btrfs_stripe_hash_table *x;
205 	struct btrfs_stripe_hash *cur;
206 	struct btrfs_stripe_hash *h;
207 	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
208 	int i;
209 
210 	if (info->stripe_hash_table)
211 		return 0;
212 
213 	/*
214 	 * The table is large, starting with order 4 and can go as high as
215 	 * order 7 in case lock debugging is turned on.
216 	 *
217 	 * Try harder to allocate and fallback to vmalloc to lower the chance
218 	 * of a failing mount.
219 	 */
220 	table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
221 	if (!table)
222 		return -ENOMEM;
223 
224 	spin_lock_init(&table->cache_lock);
225 	INIT_LIST_HEAD(&table->stripe_cache);
226 
227 	h = table->table;
228 
229 	for (i = 0; i < num_entries; i++) {
230 		cur = h + i;
231 		INIT_LIST_HEAD(&cur->hash_list);
232 		spin_lock_init(&cur->lock);
233 	}
234 
235 	x = cmpxchg(&info->stripe_hash_table, NULL, table);
236 	kvfree(x);
237 	return 0;
238 }
239 
240 /*
241  * caching an rbio means to copy anything from the
242  * bio_pages array into the stripe_pages array.  We
243  * use the page uptodate bit in the stripe cache array
244  * to indicate if it has valid data
245  *
246  * once the caching is done, we set the cache ready
247  * bit.
248  */
249 static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
250 {
251 	int i;
252 	int ret;
253 
254 	ret = alloc_rbio_pages(rbio);
255 	if (ret)
256 		return;
257 
258 	for (i = 0; i < rbio->nr_pages; i++) {
259 		if (!rbio->bio_pages[i])
260 			continue;
261 
262 		copy_highpage(rbio->stripe_pages[i], rbio->bio_pages[i]);
263 		SetPageUptodate(rbio->stripe_pages[i]);
264 	}
265 	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
266 }
267 
268 /*
269  * we hash on the first logical address of the stripe
270  */
271 static int rbio_bucket(struct btrfs_raid_bio *rbio)
272 {
273 	u64 num = rbio->bioc->raid_map[0];
274 
275 	/*
276 	 * we shift down quite a bit.  We're using byte
277 	 * addressing, and most of the lower bits are zeros.
278 	 * This tends to upset hash_64, and it consistently
279 	 * returns just one or two different values.
280 	 *
281 	 * shifting off the lower bits fixes things.
282 	 */
283 	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
284 }
285 
286 /*
287  * stealing an rbio means taking all the uptodate pages from the stripe
288  * array in the source rbio and putting them into the destination rbio
289  */
290 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
291 {
292 	int i;
293 	struct page *s;
294 	struct page *d;
295 
296 	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
297 		return;
298 
299 	for (i = 0; i < dest->nr_pages; i++) {
300 		s = src->stripe_pages[i];
301 		if (!s || !PageUptodate(s)) {
302 			continue;
303 		}
304 
305 		d = dest->stripe_pages[i];
306 		if (d)
307 			__free_page(d);
308 
309 		dest->stripe_pages[i] = s;
310 		src->stripe_pages[i] = NULL;
311 	}
312 }
313 
314 /*
315  * merging means we take the bio_list from the victim and
316  * splice it into the destination.  The victim should
317  * be discarded afterwards.
318  *
319  * must be called with dest->bio_list_lock held
320  */
321 static void merge_rbio(struct btrfs_raid_bio *dest,
322 		       struct btrfs_raid_bio *victim)
323 {
324 	bio_list_merge(&dest->bio_list, &victim->bio_list);
325 	dest->bio_list_bytes += victim->bio_list_bytes;
326 	dest->generic_bio_cnt += victim->generic_bio_cnt;
327 	bio_list_init(&victim->bio_list);
328 }
329 
330 /*
331  * used to prune items that are in the cache.  The caller
332  * must hold the hash table lock.
333  */
334 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
335 {
336 	int bucket = rbio_bucket(rbio);
337 	struct btrfs_stripe_hash_table *table;
338 	struct btrfs_stripe_hash *h;
339 	int freeit = 0;
340 
341 	/*
342 	 * check the bit again under the hash table lock.
343 	 */
344 	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
345 		return;
346 
347 	table = rbio->bioc->fs_info->stripe_hash_table;
348 	h = table->table + bucket;
349 
350 	/* hold the lock for the bucket because we may be
351 	 * removing it from the hash table
352 	 */
353 	spin_lock(&h->lock);
354 
355 	/*
356 	 * hold the lock for the bio list because we need
357 	 * to make sure the bio list is empty
358 	 */
359 	spin_lock(&rbio->bio_list_lock);
360 
361 	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
362 		list_del_init(&rbio->stripe_cache);
363 		table->cache_size -= 1;
364 		freeit = 1;
365 
366 		/* if the bio list isn't empty, this rbio is
367 		 * still involved in an IO.  We take it out
368 		 * of the cache list, and drop the ref that
369 		 * was held for the list.
370 		 *
371 		 * If the bio_list was empty, we also remove
372 		 * the rbio from the hash_table, and drop
373 		 * the corresponding ref
374 		 */
375 		if (bio_list_empty(&rbio->bio_list)) {
376 			if (!list_empty(&rbio->hash_list)) {
377 				list_del_init(&rbio->hash_list);
378 				refcount_dec(&rbio->refs);
379 				BUG_ON(!list_empty(&rbio->plug_list));
380 			}
381 		}
382 	}
383 
384 	spin_unlock(&rbio->bio_list_lock);
385 	spin_unlock(&h->lock);
386 
387 	if (freeit)
388 		__free_raid_bio(rbio);
389 }
390 
391 /*
392  * prune a given rbio from the cache
393  */
394 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
395 {
396 	struct btrfs_stripe_hash_table *table;
397 	unsigned long flags;
398 
399 	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
400 		return;
401 
402 	table = rbio->bioc->fs_info->stripe_hash_table;
403 
404 	spin_lock_irqsave(&table->cache_lock, flags);
405 	__remove_rbio_from_cache(rbio);
406 	spin_unlock_irqrestore(&table->cache_lock, flags);
407 }
408 
409 /*
410  * remove everything in the cache
411  */
412 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
413 {
414 	struct btrfs_stripe_hash_table *table;
415 	unsigned long flags;
416 	struct btrfs_raid_bio *rbio;
417 
418 	table = info->stripe_hash_table;
419 
420 	spin_lock_irqsave(&table->cache_lock, flags);
421 	while (!list_empty(&table->stripe_cache)) {
422 		rbio = list_entry(table->stripe_cache.next,
423 				  struct btrfs_raid_bio,
424 				  stripe_cache);
425 		__remove_rbio_from_cache(rbio);
426 	}
427 	spin_unlock_irqrestore(&table->cache_lock, flags);
428 }
429 
430 /*
431  * remove all cached entries and free the hash table.
432  * Used by unmount
433  */
434 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
435 {
436 	if (!info->stripe_hash_table)
437 		return;
438 	btrfs_clear_rbio_cache(info);
439 	kvfree(info->stripe_hash_table);
440 	info->stripe_hash_table = NULL;
441 }
442 
443 /*
444  * insert an rbio into the stripe cache.  It
445  * must have already been prepared by calling
446  * cache_rbio_pages
447  *
448  * If this rbio was already cached, it gets
449  * moved to the front of the lru.
450  *
451  * If the size of the rbio cache is too big, we
452  * prune an item.
453  */
454 static void cache_rbio(struct btrfs_raid_bio *rbio)
455 {
456 	struct btrfs_stripe_hash_table *table;
457 	unsigned long flags;
458 
459 	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
460 		return;
461 
462 	table = rbio->bioc->fs_info->stripe_hash_table;
463 
464 	spin_lock_irqsave(&table->cache_lock, flags);
465 	spin_lock(&rbio->bio_list_lock);
466 
467 	/* bump our ref if we were not in the list before */
468 	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
469 		refcount_inc(&rbio->refs);
470 
471 	if (!list_empty(&rbio->stripe_cache)) {
472 		list_move(&rbio->stripe_cache, &table->stripe_cache);
473 	} else {
474 		list_add(&rbio->stripe_cache, &table->stripe_cache);
475 		table->cache_size += 1;
476 	}
477 
478 	spin_unlock(&rbio->bio_list_lock);
479 
480 	if (table->cache_size > RBIO_CACHE_SIZE) {
481 		struct btrfs_raid_bio *found;
482 
483 		found = list_entry(table->stripe_cache.prev,
484 				  struct btrfs_raid_bio,
485 				  stripe_cache);
486 
487 		if (found != rbio)
488 			__remove_rbio_from_cache(found);
489 	}
490 
491 	spin_unlock_irqrestore(&table->cache_lock, flags);
492 }
493 
494 /*
495  * helper function to run the xor_blocks api.  It is only
496  * able to do MAX_XOR_BLOCKS at a time, so we need to
497  * loop through.
498  */
499 static void run_xor(void **pages, int src_cnt, ssize_t len)
500 {
501 	int src_off = 0;
502 	int xor_src_cnt = 0;
503 	void *dest = pages[src_cnt];
504 
505 	while (src_cnt > 0) {
506 		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
507 		xor_blocks(xor_src_cnt, len, dest, pages + src_off);
508 
509 		src_cnt -= xor_src_cnt;
510 		src_off += xor_src_cnt;
511 	}
512 }
513 
514 /*
515  * Returns true if the bio list inside this rbio covers an entire stripe (no
516  * rmw required).
517  */
518 static int rbio_is_full(struct btrfs_raid_bio *rbio)
519 {
520 	unsigned long flags;
521 	unsigned long size = rbio->bio_list_bytes;
522 	int ret = 1;
523 
524 	spin_lock_irqsave(&rbio->bio_list_lock, flags);
525 	if (size != rbio->nr_data * rbio->stripe_len)
526 		ret = 0;
527 	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
528 	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
529 
530 	return ret;
531 }
532 
533 /*
534  * returns 1 if it is safe to merge two rbios together.
535  * The merging is safe if the two rbios correspond to
536  * the same stripe and if they are both going in the same
537  * direction (read vs write), and if neither one is
538  * locked for final IO
539  *
540  * The caller is responsible for locking such that
541  * rmw_locked is safe to test
542  */
543 static int rbio_can_merge(struct btrfs_raid_bio *last,
544 			  struct btrfs_raid_bio *cur)
545 {
546 	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
547 	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
548 		return 0;
549 
550 	/*
551 	 * we can't merge with cached rbios, since the
552 	 * idea is that when we merge the destination
553 	 * rbio is going to run our IO for us.  We can
554 	 * steal from cached rbios though, other functions
555 	 * handle that.
556 	 */
557 	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
558 	    test_bit(RBIO_CACHE_BIT, &cur->flags))
559 		return 0;
560 
561 	if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
562 		return 0;
563 
564 	/* we can't merge with different operations */
565 	if (last->operation != cur->operation)
566 		return 0;
567 	/*
568 	 * A parity scrub has to read the full stripe from the drive,
569 	 * check and repair the parity and write the new results.
570 	 *
571 	 * We're not allowed to add any new bios to the
572 	 * bio list here, anyone else that wants to
573 	 * change this stripe needs to do their own rmw.
574 	 */
575 	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
576 		return 0;
577 
578 	if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
579 		return 0;
580 
581 	if (last->operation == BTRFS_RBIO_READ_REBUILD) {
582 		int fa = last->faila;
583 		int fb = last->failb;
584 		int cur_fa = cur->faila;
585 		int cur_fb = cur->failb;
586 
587 		if (last->faila >= last->failb) {
588 			fa = last->failb;
589 			fb = last->faila;
590 		}
591 
592 		if (cur->faila >= cur->failb) {
593 			cur_fa = cur->failb;
594 			cur_fb = cur->faila;
595 		}
596 
597 		if (fa != cur_fa || fb != cur_fb)
598 			return 0;
599 	}
600 	return 1;
601 }
602 
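/*
 * index into the flat stripe_pages array for a given stripe number and page
 * number inside that stripe
 */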
603 static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe,
604 				  int index)
605 {
606 	return stripe * rbio->stripe_npages + index;
607 }
608 
609 /*
610  * these are just the pages from the rbio array, not from anything
611  * the FS sent down to us
612  */
613 static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe,
614 				     int index)
615 {
616 	return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)];
617 }
618 
619 /*
620  * helper to index into the pstripe
621  */
622 static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
623 {
624 	return rbio_stripe_page(rbio, rbio->nr_data, index);
625 }
626 
627 /*
628  * helper to index into the qstripe, returns null
629  * if there is no qstripe
630  */
631 static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
632 {
633 	if (rbio->nr_data + 1 == rbio->real_stripes)
634 		return NULL;
635 	return rbio_stripe_page(rbio, rbio->nr_data + 1, index);
636 }
637 
638 /*
639  * The first stripe in the table for a logical address
640  * has the lock.  rbios are added in one of three ways:
641  *
642  * 1) Nobody has the stripe locked yet.  The rbio is given
643  * the lock and 0 is returned.  The caller must start the IO
644  * themselves.
645  *
646  * 2) Someone has the stripe locked, but we're able to merge
647  * with the lock owner.  The rbio is freed and the IO will
648  * start automatically along with the existing rbio.  1 is returned.
649  *
650  * 3) Someone has the stripe locked, but we're not able to merge.
651  * The rbio is added to the lock owner's plug list, or merged into
652  * an rbio already on the plug list.  When the lock owner unlocks,
653  * the next rbio on the list is run and the IO is started automatically.
654  * 1 is returned
655  *
656  * If we return 0, the caller still owns the rbio and must continue with
657  * IO submission.  If we return 1, the caller must assume the rbio has
658  * already been freed.
659  */
660 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
661 {
662 	struct btrfs_stripe_hash *h;
663 	struct btrfs_raid_bio *cur;
664 	struct btrfs_raid_bio *pending;
665 	unsigned long flags;
666 	struct btrfs_raid_bio *freeit = NULL;
667 	struct btrfs_raid_bio *cache_drop = NULL;
668 	int ret = 0;
669 
670 	h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
671 
672 	spin_lock_irqsave(&h->lock, flags);
673 	list_for_each_entry(cur, &h->hash_list, hash_list) {
674 		if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0])
675 			continue;
676 
677 		spin_lock(&cur->bio_list_lock);
678 
679 		/* Can we steal this cached rbio's pages? */
680 		if (bio_list_empty(&cur->bio_list) &&
681 		    list_empty(&cur->plug_list) &&
682 		    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
683 		    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
684 			list_del_init(&cur->hash_list);
685 			refcount_dec(&cur->refs);
686 
687 			steal_rbio(cur, rbio);
688 			cache_drop = cur;
689 			spin_unlock(&cur->bio_list_lock);
690 
691 			goto lockit;
692 		}
693 
694 		/* Can we merge into the lock owner? */
695 		if (rbio_can_merge(cur, rbio)) {
696 			merge_rbio(cur, rbio);
697 			spin_unlock(&cur->bio_list_lock);
698 			freeit = rbio;
699 			ret = 1;
700 			goto out;
701 		}
702 
703 
704 		/*
705 		 * We couldn't merge with the running rbio, see if we can merge
706 		 * with the pending ones.  We don't have to check for rmw_locked
707 		 * because there is no way they are inside finish_rmw right now
708 		 */
709 		list_for_each_entry(pending, &cur->plug_list, plug_list) {
710 			if (rbio_can_merge(pending, rbio)) {
711 				merge_rbio(pending, rbio);
712 				spin_unlock(&cur->bio_list_lock);
713 				freeit = rbio;
714 				ret = 1;
715 				goto out;
716 			}
717 		}
718 
719 		/*
720 		 * No merging, put us on the tail of the plug list, our rbio
721 		 * will be started when the currently running rbio unlocks
722 		 */
723 		list_add_tail(&rbio->plug_list, &cur->plug_list);
724 		spin_unlock(&cur->bio_list_lock);
725 		ret = 1;
726 		goto out;
727 	}
728 lockit:
729 	refcount_inc(&rbio->refs);
730 	list_add(&rbio->hash_list, &h->hash_list);
731 out:
732 	spin_unlock_irqrestore(&h->lock, flags);
733 	if (cache_drop)
734 		remove_rbio_from_cache(cache_drop);
735 	if (freeit)
736 		__free_raid_bio(freeit);
737 	return ret;
738 }
739 
740 /*
741  * called as rmw or parity rebuild is completed.  If the plug list has more
742  * rbios waiting for this stripe, the next one on the list will be started
743  */
744 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
745 {
746 	int bucket;
747 	struct btrfs_stripe_hash *h;
748 	unsigned long flags;
749 	int keep_cache = 0;
750 
751 	bucket = rbio_bucket(rbio);
752 	h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;
753 
754 	if (list_empty(&rbio->plug_list))
755 		cache_rbio(rbio);
756 
757 	spin_lock_irqsave(&h->lock, flags);
758 	spin_lock(&rbio->bio_list_lock);
759 
760 	if (!list_empty(&rbio->hash_list)) {
761 		/*
762 		 * if we're still cached and there is no other IO
763 		 * to perform, just leave this rbio here for others
764 		 * to steal from later
765 		 */
766 		if (list_empty(&rbio->plug_list) &&
767 		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
768 			keep_cache = 1;
769 			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
770 			BUG_ON(!bio_list_empty(&rbio->bio_list));
771 			goto done;
772 		}
773 
774 		list_del_init(&rbio->hash_list);
775 		refcount_dec(&rbio->refs);
776 
777 		/*
778 		 * we use the plug list to hold all the rbios
779 		 * waiting for the chance to lock this stripe.
780 		 * hand the lock over to one of them.
781 		 */
782 		if (!list_empty(&rbio->plug_list)) {
783 			struct btrfs_raid_bio *next;
784 			struct list_head *head = rbio->plug_list.next;
785 
786 			next = list_entry(head, struct btrfs_raid_bio,
787 					  plug_list);
788 
789 			list_del_init(&rbio->plug_list);
790 
791 			list_add(&next->hash_list, &h->hash_list);
792 			refcount_inc(&next->refs);
793 			spin_unlock(&rbio->bio_list_lock);
794 			spin_unlock_irqrestore(&h->lock, flags);
795 
796 			if (next->operation == BTRFS_RBIO_READ_REBUILD)
797 				start_async_work(next, read_rebuild_work);
798 			else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
799 				steal_rbio(rbio, next);
800 				start_async_work(next, read_rebuild_work);
801 			} else if (next->operation == BTRFS_RBIO_WRITE) {
802 				steal_rbio(rbio, next);
803 				start_async_work(next, rmw_work);
804 			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
805 				steal_rbio(rbio, next);
806 				start_async_work(next, scrub_parity_work);
807 			}
808 
809 			goto done_nolock;
810 		}
811 	}
812 done:
813 	spin_unlock(&rbio->bio_list_lock);
814 	spin_unlock_irqrestore(&h->lock, flags);
815 
816 done_nolock:
817 	if (!keep_cache)
818 		remove_rbio_from_cache(rbio);
819 }
820 
821 static void __free_raid_bio(struct btrfs_raid_bio *rbio)
822 {
823 	int i;
824 
825 	if (!refcount_dec_and_test(&rbio->refs))
826 		return;
827 
828 	WARN_ON(!list_empty(&rbio->stripe_cache));
829 	WARN_ON(!list_empty(&rbio->hash_list));
830 	WARN_ON(!bio_list_empty(&rbio->bio_list));
831 
832 	for (i = 0; i < rbio->nr_pages; i++) {
833 		if (rbio->stripe_pages[i]) {
834 			__free_page(rbio->stripe_pages[i]);
835 			rbio->stripe_pages[i] = NULL;
836 		}
837 	}
838 
839 	btrfs_put_bioc(rbio->bioc);
840 	kfree(rbio);
841 }
842 
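/*
 * walk a bi_next chained list of bios and call bio_endio() on each one with
 * the given status
 */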
843 static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
844 {
845 	struct bio *next;
846 
847 	while (cur) {
848 		next = cur->bi_next;
849 		cur->bi_next = NULL;
850 		cur->bi_status = err;
851 		bio_endio(cur);
852 		cur = next;
853 	}
854 }
855 
856 /*
857  * this frees the rbio and runs through all the bios in the
858  * bio_list and calls end_io on them
859  */
860 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
861 {
862 	struct bio *cur = bio_list_get(&rbio->bio_list);
863 	struct bio *extra;
864 
865 	if (rbio->generic_bio_cnt)
866 		btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt);
867 
868 	/*
869 	 * At this moment, rbio->bio_list is empty.  However, since rbio does
870 	 * not always have RBIO_RMW_LOCKED_BIT set and may still be linked on
871 	 * the hash list, it can be merged with others so that rbio->bio_list
872 	 * becomes non-empty.
873 	 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
874 	 * more and we can call bio_endio() on all queued bios.
875 	 */
876 	unlock_stripe(rbio);
877 	extra = bio_list_get(&rbio->bio_list);
878 	__free_raid_bio(rbio);
879 
880 	rbio_endio_bio_list(cur, err);
881 	if (extra)
882 		rbio_endio_bio_list(extra, err);
883 }
884 
885 /*
886  * end io function used by finish_rmw.  When we finally
887  * get here, we've written a full stripe
888  */
889 static void raid_write_end_io(struct bio *bio)
890 {
891 	struct btrfs_raid_bio *rbio = bio->bi_private;
892 	blk_status_t err = bio->bi_status;
893 	int max_errors;
894 
895 	if (err)
896 		fail_bio_stripe(rbio, bio);
897 
898 	bio_put(bio);
899 
900 	if (!atomic_dec_and_test(&rbio->stripes_pending))
901 		return;
902 
903 	err = BLK_STS_OK;
904 
905 	/* OK, we have written all the stripes we need to. */
906 	max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
907 		     0 : rbio->bioc->max_errors;
908 	if (atomic_read(&rbio->error) > max_errors)
909 		err = BLK_STS_IOERR;
910 
911 	rbio_orig_end_io(rbio, err);
912 }
913 
914 /*
915  * the read/modify/write code wants to use the original bio for
916  * any pages it included, and then use the rbio for everything
917  * else.  This function decides if a given index (stripe number)
918  * and page number in that stripe fall inside the original bio
919  * or the rbio.
920  *
921  * if you set bio_list_only, you'll get a NULL back for any ranges
922  * that are outside the bio_list
923  *
924  * This doesn't take any refs on anything, you get a bare page pointer
925  * and the caller must bump refs as required.
926  *
927  * You must call index_rbio_pages once before you can trust
928  * the answers from this function.
929  */
930 static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
931 				 int index, int pagenr, int bio_list_only)
932 {
933 	int chunk_page;
934 	struct page *p = NULL;
935 
936 	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
937 
938 	spin_lock_irq(&rbio->bio_list_lock);
939 	p = rbio->bio_pages[chunk_page];
940 	spin_unlock_irq(&rbio->bio_list_lock);
941 
942 	if (p || bio_list_only)
943 		return p;
944 
945 	return rbio->stripe_pages[chunk_page];
946 }
947 
948 /*
949  * number of pages we need for the entire stripe across all the
950  * drives
951  */
952 static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
953 {
954 	return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes;
955 }
956 
957 /*
958  * allocation and initial setup for the btrfs_raid_bio.  Note that
959  * this does not allocate any pages for rbio->stripe_pages.
960  */
961 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
962 					 struct btrfs_io_context *bioc,
963 					 u64 stripe_len)
964 {
965 	struct btrfs_raid_bio *rbio;
966 	int nr_data = 0;
967 	int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
968 	int num_pages = rbio_nr_pages(stripe_len, real_stripes);
969 	int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
970 	void *p;
971 
972 	rbio = kzalloc(sizeof(*rbio) +
973 		       sizeof(*rbio->stripe_pages) * num_pages +
974 		       sizeof(*rbio->bio_pages) * num_pages +
975 		       sizeof(*rbio->finish_pointers) * real_stripes +
976 		       sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) +
977 		       sizeof(*rbio->finish_pbitmap) *
978 				BITS_TO_LONGS(stripe_npages),
979 		       GFP_NOFS);
980 	if (!rbio)
981 		return ERR_PTR(-ENOMEM);
982 
983 	bio_list_init(&rbio->bio_list);
984 	INIT_LIST_HEAD(&rbio->plug_list);
985 	spin_lock_init(&rbio->bio_list_lock);
986 	INIT_LIST_HEAD(&rbio->stripe_cache);
987 	INIT_LIST_HEAD(&rbio->hash_list);
988 	rbio->bioc = bioc;
989 	rbio->stripe_len = stripe_len;
990 	rbio->nr_pages = num_pages;
991 	rbio->real_stripes = real_stripes;
992 	rbio->stripe_npages = stripe_npages;
993 	rbio->faila = -1;
994 	rbio->failb = -1;
995 	refcount_set(&rbio->refs, 1);
996 	atomic_set(&rbio->error, 0);
997 	atomic_set(&rbio->stripes_pending, 0);
998 
999 	/*
1000 	 * the stripe_pages, bio_pages, etc arrays point to the extra
1001 	 * memory we allocated past the end of the rbio
1002 	 */
1003 	p = rbio + 1;
1004 #define CONSUME_ALLOC(ptr, count)	do {				\
1005 		ptr = p;						\
1006 		p = (unsigned char *)p + sizeof(*(ptr)) * (count);	\
1007 	} while (0)
1008 	CONSUME_ALLOC(rbio->stripe_pages, num_pages);
1009 	CONSUME_ALLOC(rbio->bio_pages, num_pages);
1010 	CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
1011 	CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages));
1012 	CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
1013 #undef  CONSUME_ALLOC
1014 
1015 	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
1016 		nr_data = real_stripes - 1;
1017 	else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
1018 		nr_data = real_stripes - 2;
1019 	else
1020 		BUG();
1021 
1022 	rbio->nr_data = nr_data;
1023 	return rbio;
1024 }
1025 
1026 /* allocate pages for all the stripes in the bio, including parity */
1027 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
1028 {
1029 	int i;
1030 	struct page *page;
1031 
1032 	for (i = 0; i < rbio->nr_pages; i++) {
1033 		if (rbio->stripe_pages[i])
1034 			continue;
1035 		page = alloc_page(GFP_NOFS);
1036 		if (!page)
1037 			return -ENOMEM;
1038 		rbio->stripe_pages[i] = page;
1039 	}
1040 	return 0;
1041 }
1042 
1043 /* only allocate pages for p/q stripes */
1044 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
1045 {
1046 	int i;
1047 	struct page *page;
1048 
1049 	i = rbio_stripe_page_index(rbio, rbio->nr_data, 0);
1050 
1051 	for (; i < rbio->nr_pages; i++) {
1052 		if (rbio->stripe_pages[i])
1053 			continue;
1054 		page = alloc_page(GFP_NOFS);
1055 		if (!page)
1056 			return -ENOMEM;
1057 		rbio->stripe_pages[i] = page;
1058 	}
1059 	return 0;
1060 }
1061 
1062 /*
1063  * add a single page from a specific stripe into our list of bios for IO.
1064  * This will try to merge into existing bios if possible, and returns
1065  * zero if all went well.
1066  */
1067 static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1068 			    struct bio_list *bio_list,
1069 			    struct page *page,
1070 			    int stripe_nr,
1071 			    unsigned long page_index,
1072 			    unsigned long bio_max_len)
1073 {
1074 	struct bio *last = bio_list->tail;
1075 	int ret;
1076 	struct bio *bio;
1077 	struct btrfs_io_stripe *stripe;
1078 	u64 disk_start;
1079 
1080 	stripe = &rbio->bioc->stripes[stripe_nr];
1081 	disk_start = stripe->physical + (page_index << PAGE_SHIFT);
1082 
1083 	/* if the device is missing, just fail this stripe */
1084 	if (!stripe->dev->bdev)
1085 		return fail_rbio_index(rbio, stripe_nr);
1086 
1087 	/* see if we can add this page onto our existing bio */
1088 	if (last) {
1089 		u64 last_end = last->bi_iter.bi_sector << 9;
1090 		last_end += last->bi_iter.bi_size;
1091 
1092 		/*
1093 		 * we can't merge these if they are from different
1094 		 * devices or if they are not contiguous
1095 		 */
1096 		if (last_end == disk_start && !last->bi_status &&
1097 		    last->bi_bdev == stripe->dev->bdev) {
1098 			ret = bio_add_page(last, page, PAGE_SIZE, 0);
1099 			if (ret == PAGE_SIZE)
1100 				return 0;
1101 		}
1102 	}
1103 
1104 	/* put a new bio on the list */
1105 	bio = btrfs_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
1106 	btrfs_bio(bio)->device = stripe->dev;
1107 	bio->bi_iter.bi_size = 0;
1108 	bio_set_dev(bio, stripe->dev->bdev);
1109 	bio->bi_iter.bi_sector = disk_start >> 9;
1110 
1111 	bio_add_page(bio, page, PAGE_SIZE, 0);
1112 	bio_list_add(bio_list, bio);
1113 	return 0;
1114 }
1115 
1116 /*
1117  * while we're doing the read/modify/write cycle, we could
1118  * have errors in reading pages off the disk.  This checks
1119  * for errors and if we're not able to read the page it'll
1120  * trigger parity reconstruction.  The rmw will be finished
1121  * after we've reconstructed the failed stripes
1122  */
1123 static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1124 {
1125 	if (rbio->faila >= 0 || rbio->failb >= 0) {
1126 		BUG_ON(rbio->faila == rbio->real_stripes - 1);
1127 		__raid56_parity_recover(rbio);
1128 	} else {
1129 		finish_rmw(rbio);
1130 	}
1131 }
1132 
1133 /*
1134  * helper function to walk our bio list and populate the bio_pages array with
1135  * the result.  This seems expensive, but it is faster than constantly
1136  * searching through the bio list as we setup the IO in finish_rmw or stripe
1137  * reconstruction.
1138  *
1139  * This must be called before you trust the answers from page_in_rbio
1140  */
1141 static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1142 {
1143 	struct bio *bio;
1144 	u64 start;
1145 	unsigned long stripe_offset;
1146 	unsigned long page_index;
1147 
1148 	spin_lock_irq(&rbio->bio_list_lock);
1149 	bio_list_for_each(bio, &rbio->bio_list) {
1150 		struct bio_vec bvec;
1151 		struct bvec_iter iter;
1152 		int i = 0;
1153 
1154 		start = bio->bi_iter.bi_sector << 9;
1155 		stripe_offset = start - rbio->bioc->raid_map[0];
1156 		page_index = stripe_offset >> PAGE_SHIFT;
1157 
1158 		if (bio_flagged(bio, BIO_CLONED))
1159 			bio->bi_iter = btrfs_bio(bio)->iter;
1160 
1161 		bio_for_each_segment(bvec, bio, iter) {
1162 			rbio->bio_pages[page_index + i] = bvec.bv_page;
1163 			i++;
1164 		}
1165 	}
1166 	spin_unlock_irq(&rbio->bio_list_lock);
1167 }
1168 
1169 /*
1170  * this is called from one of two situations.  We either
1171  * have a full stripe from the higher layers, or we've read all
1172  * the missing bits off disk.
1173  *
1174  * This will calculate the parity and then send down any
1175  * changed blocks.
1176  */
1177 static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1178 {
1179 	struct btrfs_io_context *bioc = rbio->bioc;
1180 	void **pointers = rbio->finish_pointers;
1181 	int nr_data = rbio->nr_data;
1182 	int stripe;
1183 	int pagenr;
1184 	bool has_qstripe;
1185 	struct bio_list bio_list;
1186 	struct bio *bio;
1187 	int ret;
1188 
1189 	bio_list_init(&bio_list);
1190 
1191 	if (rbio->real_stripes - rbio->nr_data == 1)
1192 		has_qstripe = false;
1193 	else if (rbio->real_stripes - rbio->nr_data == 2)
1194 		has_qstripe = true;
1195 	else
1196 		BUG();
1197 
1198 	/* at this point we either have a full stripe,
1199 	 * or we've read the full stripe from the drive.
1200 	 * recalculate the parity and write the new results.
1201 	 *
1202 	 * We're not allowed to add any new bios to the
1203 	 * bio list here, anyone else that wants to
1204 	 * change this stripe needs to do their own rmw.
1205 	 */
1206 	spin_lock_irq(&rbio->bio_list_lock);
1207 	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1208 	spin_unlock_irq(&rbio->bio_list_lock);
1209 
1210 	atomic_set(&rbio->error, 0);
1211 
1212 	/*
1213 	 * now that we've set rmw_locked, run through the
1214 	 * bio list one last time and map the page pointers
1215 	 *
1216 	 * We don't cache full rbios because we're assuming
1217 	 * the higher layers are unlikely to use this area of
1218 	 * the disk again soon.  If they do use it again,
1219 	 * hopefully they will send another full bio.
1220 	 */
1221 	index_rbio_pages(rbio);
1222 	if (!rbio_is_full(rbio))
1223 		cache_rbio_pages(rbio);
1224 	else
1225 		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1226 
1227 	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1228 		struct page *p;
1229 		/* first collect one page from each data stripe */
1230 		for (stripe = 0; stripe < nr_data; stripe++) {
1231 			p = page_in_rbio(rbio, stripe, pagenr, 0);
1232 			pointers[stripe] = kmap_local_page(p);
1233 		}
1234 
1235 		/* then add the parity stripe */
1236 		p = rbio_pstripe_page(rbio, pagenr);
1237 		SetPageUptodate(p);
1238 		pointers[stripe++] = kmap_local_page(p);
1239 
1240 		if (has_qstripe) {
1241 
1242 			/*
1243 			 * raid6, add the qstripe and call the
1244 			 * library function to fill in our p/q
1245 			 */
1246 			p = rbio_qstripe_page(rbio, pagenr);
1247 			SetPageUptodate(p);
1248 			pointers[stripe++] = kmap_local_page(p);
1249 
1250 			raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
1251 						pointers);
1252 		} else {
1253 			/* raid5 */
1254 			copy_page(pointers[nr_data], pointers[0]);
1255 			run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
1256 		}
1257 		for (stripe = stripe - 1; stripe >= 0; stripe--)
1258 			kunmap_local(pointers[stripe]);
1259 	}
1260 
1261 	/*
1262 	 * time to start writing.  Make bios for everything from the
1263 	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
1264 	 * everything else.
1265 	 */
1266 	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1267 		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1268 			struct page *page;
1269 			if (stripe < rbio->nr_data) {
1270 				page = page_in_rbio(rbio, stripe, pagenr, 1);
1271 				if (!page)
1272 					continue;
1273 			} else {
1274 				page = rbio_stripe_page(rbio, stripe, pagenr);
1275 			}
1276 
1277 			ret = rbio_add_io_page(rbio, &bio_list,
1278 				       page, stripe, pagenr, rbio->stripe_len);
1279 			if (ret)
1280 				goto cleanup;
1281 		}
1282 	}
1283 
1284 	if (likely(!bioc->num_tgtdevs))
1285 		goto write_data;
1286 
1287 	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1288 		if (!bioc->tgtdev_map[stripe])
1289 			continue;
1290 
1291 		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1292 			struct page *page;
1293 			if (stripe < rbio->nr_data) {
1294 				page = page_in_rbio(rbio, stripe, pagenr, 1);
1295 				if (!page)
1296 					continue;
1297 			} else {
1298 				page = rbio_stripe_page(rbio, stripe, pagenr);
1299 			}
1300 
1301 			ret = rbio_add_io_page(rbio, &bio_list, page,
1302 					       rbio->bioc->tgtdev_map[stripe],
1303 					       pagenr, rbio->stripe_len);
1304 			if (ret)
1305 				goto cleanup;
1306 		}
1307 	}
1308 
1309 write_data:
1310 	atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
1311 	BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
1312 
1313 	while ((bio = bio_list_pop(&bio_list))) {
1314 		bio->bi_private = rbio;
1315 		bio->bi_end_io = raid_write_end_io;
1316 		bio->bi_opf = REQ_OP_WRITE;
1317 
1318 		submit_bio(bio);
1319 	}
1320 	return;
1321 
1322 cleanup:
1323 	rbio_orig_end_io(rbio, BLK_STS_IOERR);
1324 
1325 	while ((bio = bio_list_pop(&bio_list)))
1326 		bio_put(bio);
1327 }
1328 
1329 /*
1330  * helper to find the stripe number for a given bio.  Used to figure out which
1331  * stripe has failed.  This expects the bio to correspond to a physical disk,
1332  * so it looks up based on physical sector numbers.
1333  */
1334 static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1335 			   struct bio *bio)
1336 {
1337 	u64 physical = bio->bi_iter.bi_sector;
1338 	int i;
1339 	struct btrfs_io_stripe *stripe;
1340 
1341 	physical <<= 9;
1342 
1343 	for (i = 0; i < rbio->bioc->num_stripes; i++) {
1344 		stripe = &rbio->bioc->stripes[i];
1345 		if (in_range(physical, stripe->physical, rbio->stripe_len) &&
1346 		    stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
1347 			return i;
1348 		}
1349 	}
1350 	return -1;
1351 }
1352 
1353 /*
1354  * helper to find the stripe number for a given
1355  * bio (before mapping).  Used to figure out which stripe has
1356  * failed.  This looks up based on logical block numbers.
1357  */
1358 static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1359 				   struct bio *bio)
1360 {
1361 	u64 logical = bio->bi_iter.bi_sector << 9;
1362 	int i;
1363 
1364 	for (i = 0; i < rbio->nr_data; i++) {
1365 		u64 stripe_start = rbio->bioc->raid_map[i];
1366 
1367 		if (in_range(logical, stripe_start, rbio->stripe_len))
1368 			return i;
1369 	}
1370 	return -1;
1371 }
1372 
1373 /*
1374  * returns -EIO if we had too many failures
1375  */
1376 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1377 {
1378 	unsigned long flags;
1379 	int ret = 0;
1380 
1381 	spin_lock_irqsave(&rbio->bio_list_lock, flags);
1382 
1383 	/* we already know this stripe is bad, move on */
1384 	if (rbio->faila == failed || rbio->failb == failed)
1385 		goto out;
1386 
1387 	if (rbio->faila == -1) {
1388 		/* first failure on this rbio */
1389 		rbio->faila = failed;
1390 		atomic_inc(&rbio->error);
1391 	} else if (rbio->failb == -1) {
1392 		/* second failure on this rbio */
1393 		rbio->failb = failed;
1394 		atomic_inc(&rbio->error);
1395 	} else {
1396 		ret = -EIO;
1397 	}
1398 out:
1399 	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1400 
1401 	return ret;
1402 }
1403 
1404 /*
1405  * helper to fail a stripe based on a physical disk
1406  * bio.
1407  */
1408 static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1409 			   struct bio *bio)
1410 {
1411 	int failed = find_bio_stripe(rbio, bio);
1412 
1413 	if (failed < 0)
1414 		return -EIO;
1415 
1416 	return fail_rbio_index(rbio, failed);
1417 }
1418 
1419 /*
1420  * this sets each page in the bio uptodate.  It should only be used on private
1421  * rbio pages, nothing that comes in from the higher layers
1422  */
1423 static void set_bio_pages_uptodate(struct bio *bio)
1424 {
1425 	struct bio_vec *bvec;
1426 	struct bvec_iter_all iter_all;
1427 
1428 	ASSERT(!bio_flagged(bio, BIO_CLONED));
1429 
1430 	bio_for_each_segment_all(bvec, bio, iter_all)
1431 		SetPageUptodate(bvec->bv_page);
1432 }
1433 
1434 /*
1435  * end io for the read phase of the rmw cycle.  All the bios here are physical
1436  * stripe bios we've read from the disk so we can recalculate the parity of the
1437  * stripe.
1438  *
1439  * This will usually kick off finish_rmw once all the bios are read in, but it
1440  * may trigger parity reconstruction if we had any errors along the way
1441  */
1442 static void raid_rmw_end_io(struct bio *bio)
1443 {
1444 	struct btrfs_raid_bio *rbio = bio->bi_private;
1445 
1446 	if (bio->bi_status)
1447 		fail_bio_stripe(rbio, bio);
1448 	else
1449 		set_bio_pages_uptodate(bio);
1450 
1451 	bio_put(bio);
1452 
1453 	if (!atomic_dec_and_test(&rbio->stripes_pending))
1454 		return;
1455 
1456 	if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
1457 		goto cleanup;
1458 
1459 	/*
1460 	 * this will normally call finish_rmw to start our write
1461 	 * but if there are any failed stripes we'll reconstruct
1462 	 * from parity first
1463 	 */
1464 	validate_rbio_for_rmw(rbio);
1465 	return;
1466 
1467 cleanup:
1468 
1469 	rbio_orig_end_io(rbio, BLK_STS_IOERR);
1470 }
1471 
1472 /*
1473  * the stripe must be locked by the caller.  It will
1474  * unlock after all the writes are done
1475  */
1476 static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1477 {
1478 	int bios_to_read = 0;
1479 	struct bio_list bio_list;
1480 	int ret;
1481 	int pagenr;
1482 	int stripe;
1483 	struct bio *bio;
1484 
1485 	bio_list_init(&bio_list);
1486 
1487 	ret = alloc_rbio_pages(rbio);
1488 	if (ret)
1489 		goto cleanup;
1490 
1491 	index_rbio_pages(rbio);
1492 
1493 	atomic_set(&rbio->error, 0);
1494 	/*
1495 	 * build a list of bios to read all the missing parts of this
1496 	 * stripe
1497 	 */
1498 	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1499 		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1500 			struct page *page;
1501 			/*
1502 			 * we want to find all the pages missing from
1503 			 * the rbio and read them from the disk.  If
1504 			 * page_in_rbio finds a page in the bio list
1505 			 * we don't need to read it off the stripe.
1506 			 */
1507 			page = page_in_rbio(rbio, stripe, pagenr, 1);
1508 			if (page)
1509 				continue;
1510 
1511 			page = rbio_stripe_page(rbio, stripe, pagenr);
1512 			/*
1513 			 * the bio cache may have handed us an uptodate
1514 			 * page.  If so, be happy and use it
1515 			 */
1516 			if (PageUptodate(page))
1517 				continue;
1518 
1519 			ret = rbio_add_io_page(rbio, &bio_list, page,
1520 				       stripe, pagenr, rbio->stripe_len);
1521 			if (ret)
1522 				goto cleanup;
1523 		}
1524 	}
1525 
1526 	bios_to_read = bio_list_size(&bio_list);
1527 	if (!bios_to_read) {
1528 		/*
1529 		 * this can happen if others have merged with
1530 		 * us, it means there is nothing left to read.
1531 		 * But if there are missing devices it may not be
1532 		 * safe to do the full stripe write yet.
1533 		 */
1534 		goto finish;
1535 	}
1536 
1537 	/*
1538 	 * The bioc may be freed once we submit the last bio. Make sure not to
1539 	 * touch it after that.
1540 	 */
1541 	atomic_set(&rbio->stripes_pending, bios_to_read);
1542 	while ((bio = bio_list_pop(&bio_list))) {
1543 		bio->bi_private = rbio;
1544 		bio->bi_end_io = raid_rmw_end_io;
1545 		bio->bi_opf = REQ_OP_READ;
1546 
1547 		btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
1548 
1549 		submit_bio(bio);
1550 	}
1551 	/* the actual write will happen once the reads are done */
1552 	return 0;
1553 
1554 cleanup:
1555 	rbio_orig_end_io(rbio, BLK_STS_IOERR);
1556 
1557 	while ((bio = bio_list_pop(&bio_list)))
1558 		bio_put(bio);
1559 
1560 	return -EIO;
1561 
1562 finish:
1563 	validate_rbio_for_rmw(rbio);
1564 	return 0;
1565 }
1566 
1567 /*
1568  * if the upper layers pass in a full stripe, we thank them by only allocating
1569  * enough pages to hold the parity, and sending it all down quickly.
1570  */
1571 static int full_stripe_write(struct btrfs_raid_bio *rbio)
1572 {
1573 	int ret;
1574 
1575 	ret = alloc_rbio_parity_pages(rbio);
1576 	if (ret) {
1577 		__free_raid_bio(rbio);
1578 		return ret;
1579 	}
1580 
1581 	ret = lock_stripe_add(rbio);
1582 	if (ret == 0)
1583 		finish_rmw(rbio);
1584 	return 0;
1585 }
1586 
1587 /*
1588  * partial stripe writes get handed over to async helpers.
1589  * We're really hoping to merge a few more writes into this
1590  * rbio before calculating new parity
1591  */
1592 static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1593 {
1594 	int ret;
1595 
1596 	ret = lock_stripe_add(rbio);
1597 	if (ret == 0)
1598 		start_async_work(rbio, rmw_work);
1599 	return 0;
1600 }
1601 
1602 /*
1603  * sometimes, while we're reading from the drive to
1604  * recalculate parity, enough new bios come in to create
1605  * a full stripe.  So we do a check here to see if we can
1606  * go directly to finish_rmw
1607  */
1608 static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1609 {
1610 	/* head off into rmw land if we don't have a full stripe */
1611 	if (!rbio_is_full(rbio))
1612 		return partial_stripe_write(rbio);
1613 	return full_stripe_write(rbio);
1614 }
1615 
1616 /*
1617  * We use plugging callbacks to collect full stripes.
1618  * Any time we get a partial stripe write while plugged
1619  * we collect it into a list.  When the unplug comes down,
1620  * we sort the list by logical block number and merge
1621  * everything we can into the same rbios
1622  */
1623 struct btrfs_plug_cb {
1624 	struct blk_plug_cb cb;
1625 	struct btrfs_fs_info *info;
1626 	struct list_head rbio_list;
1627 	struct btrfs_work work;
1628 };
1629 
1630 /*
1631  * rbios on the plug list are sorted for easier merging.
1632  */
1633 static int plug_cmp(void *priv, const struct list_head *a,
1634 		    const struct list_head *b)
1635 {
1636 	const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1637 						       plug_list);
1638 	const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1639 						       plug_list);
1640 	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1641 	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
1642 
1643 	if (a_sector < b_sector)
1644 		return -1;
1645 	if (a_sector > b_sector)
1646 		return 1;
1647 	return 0;
1648 }
1649 
1650 static void run_plug(struct btrfs_plug_cb *plug)
1651 {
1652 	struct btrfs_raid_bio *cur;
1653 	struct btrfs_raid_bio *last = NULL;
1654 
1655 	/*
1656 	 * sort our plug list then try to merge
1657 	 * everything we can in hopes of creating full
1658 	 * stripes.
1659 	 */
1660 	list_sort(NULL, &plug->rbio_list, plug_cmp);
1661 	while (!list_empty(&plug->rbio_list)) {
1662 		cur = list_entry(plug->rbio_list.next,
1663 				 struct btrfs_raid_bio, plug_list);
1664 		list_del_init(&cur->plug_list);
1665 
1666 		if (rbio_is_full(cur)) {
1667 			int ret;
1668 
1669 			/* we have a full stripe, send it down */
1670 			ret = full_stripe_write(cur);
1671 			BUG_ON(ret);
1672 			continue;
1673 		}
1674 		if (last) {
1675 			if (rbio_can_merge(last, cur)) {
1676 				merge_rbio(last, cur);
1677 				__free_raid_bio(cur);
1678 				continue;
1679 
1680 			}
1681 			__raid56_parity_write(last);
1682 		}
1683 		last = cur;
1684 	}
1685 	if (last) {
1686 		__raid56_parity_write(last);
1687 	}
1688 	kfree(plug);
1689 }
1690 
1691 /*
1692  * if the unplug comes from schedule, we have to push the
1693  * work off to a helper thread
1694  */
1695 static void unplug_work(struct btrfs_work *work)
1696 {
1697 	struct btrfs_plug_cb *plug;
1698 	plug = container_of(work, struct btrfs_plug_cb, work);
1699 	run_plug(plug);
1700 }
1701 
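/*
 * unplug callback, either runs the plug list directly or, when called from
 * schedule, punts it off to a helper thread
 */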
1702 static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1703 {
1704 	struct btrfs_plug_cb *plug;
1705 	plug = container_of(cb, struct btrfs_plug_cb, cb);
1706 
1707 	if (from_schedule) {
1708 		btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
1709 		btrfs_queue_work(plug->info->rmw_workers,
1710 				 &plug->work);
1711 		return;
1712 	}
1713 	run_plug(plug);
1714 }
1715 
1716 /*
1717  * our main entry point for writes from the rest of the FS.
1718  */
1719 int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc,
1720 			u64 stripe_len)
1721 {
1722 	struct btrfs_fs_info *fs_info = bioc->fs_info;
1723 	struct btrfs_raid_bio *rbio;
1724 	struct btrfs_plug_cb *plug = NULL;
1725 	struct blk_plug_cb *cb;
1726 	int ret;
1727 
1728 	rbio = alloc_rbio(fs_info, bioc, stripe_len);
1729 	if (IS_ERR(rbio)) {
1730 		btrfs_put_bioc(bioc);
1731 		return PTR_ERR(rbio);
1732 	}
1733 	bio_list_add(&rbio->bio_list, bio);
1734 	rbio->bio_list_bytes = bio->bi_iter.bi_size;
1735 	rbio->operation = BTRFS_RBIO_WRITE;
1736 
1737 	btrfs_bio_counter_inc_noblocked(fs_info);
1738 	rbio->generic_bio_cnt = 1;
1739 
1740 	/*
1741 	 * don't plug on full rbios, just get them out the door
1742 	 * as quickly as we can
1743 	 */
1744 	if (rbio_is_full(rbio)) {
1745 		ret = full_stripe_write(rbio);
1746 		if (ret)
1747 			btrfs_bio_counter_dec(fs_info);
1748 		return ret;
1749 	}
1750 
1751 	cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
1752 	if (cb) {
1753 		plug = container_of(cb, struct btrfs_plug_cb, cb);
1754 		if (!plug->info) {
1755 			plug->info = fs_info;
1756 			INIT_LIST_HEAD(&plug->rbio_list);
1757 		}
1758 		list_add_tail(&rbio->plug_list, &plug->rbio_list);
1759 		ret = 0;
1760 	} else {
1761 		ret = __raid56_parity_write(rbio);
1762 		if (ret)
1763 			btrfs_bio_counter_dec(fs_info);
1764 	}
1765 	return ret;
1766 }
1767 
1768 /*
1769  * all parity reconstruction happens here.  We've read in everything
1770  * we can find from the drives and this does the heavy lifting of
1771  * sorting the good from the bad.
1772  */
1773 static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1774 {
1775 	int pagenr, stripe;
1776 	void **pointers;
1777 	void **unmap_array;
1778 	int faila = -1, failb = -1;
1779 	struct page *page;
1780 	blk_status_t err;
1781 	int i;
1782 
1783 	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1784 	if (!pointers) {
1785 		err = BLK_STS_RESOURCE;
1786 		goto cleanup_io;
1787 	}
1788 
1789 	/*
1790 	 * Store copy of pointers that does not get reordered during
1791 	 * reconstruction so that kunmap_local works.
1792 	 */
1793 	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1794 	if (!unmap_array) {
1795 		err = BLK_STS_RESOURCE;
1796 		goto cleanup_pointers;
1797 	}
1798 
1799 	faila = rbio->faila;
1800 	failb = rbio->failb;
1801 
1802 	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1803 	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
1804 		spin_lock_irq(&rbio->bio_list_lock);
1805 		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1806 		spin_unlock_irq(&rbio->bio_list_lock);
1807 	}
1808 
1809 	index_rbio_pages(rbio);
1810 
1811 	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1812 		/*
1813 		 * Now we just use bitmap to mark the horizontal stripes in
1814 		 * which we have data when doing parity scrub.
1815 		 */
1816 		if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
1817 		    !test_bit(pagenr, rbio->dbitmap))
1818 			continue;
1819 
1820 		/*
1821 		 * Setup our array of pointers with pages from each stripe
1822 		 *
1823 		 * NOTE: store a duplicate array of pointers to preserve the
1824 		 * pointer order
1825 		 */
1826 		for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1827 			/*
1828 			 * if we're rebuilding a read, we have to use
1829 			 * pages from the bio list
1830 			 */
1831 			if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1832 			     rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
1833 			    (stripe == faila || stripe == failb)) {
1834 				page = page_in_rbio(rbio, stripe, pagenr, 0);
1835 			} else {
1836 				page = rbio_stripe_page(rbio, stripe, pagenr);
1837 			}
1838 			pointers[stripe] = kmap_local_page(page);
1839 			unmap_array[stripe] = pointers[stripe];
1840 		}
1841 
1842 		/* all raid6 handling here */
1843 		if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
1844 			/*
1845 			 * single failure, rebuild from parity raid5
1846 			 * style
1847 			 */
1848 			if (failb < 0) {
1849 				if (faila == rbio->nr_data) {
1850 					/*
1851 					 * Just the P stripe has failed, without
1852 					 * a bad data or Q stripe.
1853 					 * TODO, we should redo the xor here.
1854 					 */
1855 					err = BLK_STS_IOERR;
1856 					goto cleanup;
1857 				}
1858 				/*
1859 				 * a single failure in raid6 is rebuilt
1860 				 * in the pstripe code below
1861 				 */
1862 				goto pstripe;
1863 			}
1864 
1865 			/* make sure our ps and qs are in order */
1866 			if (faila > failb)
1867 				swap(faila, failb);
1868 
1869 			/* if the q stripe is failed, do a pstripe reconstruction
1870 			 * from the xors.
1871 			 * If both the q stripe and the P stripe are failed, we're
1872 			 * here due to a crc mismatch and we can't give them the
1873 			 * data they want
1874 			 */
1875 			if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
1876 				if (rbio->bioc->raid_map[faila] ==
1877 				    RAID5_P_STRIPE) {
1878 					err = BLK_STS_IOERR;
1879 					goto cleanup;
1880 				}
1881 				/*
1882 				 * otherwise we have one bad data stripe and
1883 				 * a good P stripe.  raid5!
1884 				 */
1885 				goto pstripe;
1886 			}
1887 
1888 			if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
1889 				raid6_datap_recov(rbio->real_stripes,
1890 						  PAGE_SIZE, faila, pointers);
1891 			} else {
1892 				raid6_2data_recov(rbio->real_stripes,
1893 						  PAGE_SIZE, faila, failb,
1894 						  pointers);
1895 			}
1896 		} else {
1897 			void *p;
1898 
1899 			/* rebuild from P stripe here (raid5 or raid6) */
1900 			BUG_ON(failb != -1);
1901 pstripe:
1902 			/* Copy parity block into failed block to start with */
1903 			copy_page(pointers[faila], pointers[rbio->nr_data]);
1904 
1905 			/* rearrange the pointer array */
1906 			p = pointers[faila];
1907 			for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1908 				pointers[stripe] = pointers[stripe + 1];
1909 			pointers[rbio->nr_data - 1] = p;
1910 
1911 			/* xor in the rest */
1912 			run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE);
1913 		}
1914 		/*
1915 		 * If we're doing this rebuild as part of an rmw, mark all of
1916 		 * our private rbio pages in the failed stripes as uptodate so
1917 		 * that finish_rmw knows they can be trusted.  If this was a read
1918 		 * reconstruction, other endio functions will fiddle the uptodate bits.
1919 		 */
1920 		if (rbio->operation == BTRFS_RBIO_WRITE) {
1921 			for (i = 0;  i < rbio->stripe_npages; i++) {
1922 				if (faila != -1) {
1923 					page = rbio_stripe_page(rbio, faila, i);
1924 					SetPageUptodate(page);
1925 				}
1926 				if (failb != -1) {
1927 					page = rbio_stripe_page(rbio, failb, i);
1928 					SetPageUptodate(page);
1929 				}
1930 			}
1931 		}
1932 		for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--)
1933 			kunmap_local(unmap_array[stripe]);
1934 	}
1935 
1936 	err = BLK_STS_OK;
1937 cleanup:
1938 	kfree(unmap_array);
1939 cleanup_pointers:
1940 	kfree(pointers);
1941 
1942 cleanup_io:
1943 	/*
1944 	 * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
1945 	 * valid rbio which is consistent with the on-disk content, so such a
1946 	 * valid rbio can be cached to avoid further disk reads.
1947 	 */
1948 	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1949 	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
1950 		/*
1951 		 * - In case of two failures, where rbio->failb != -1:
1952 		 *
1953 		 *   Do not cache this rbio since the above read reconstruction
1954 		 *   (raid6_datap_recov() or raid6_2data_recov()) may have
1955 		 *   changed some content of stripes which are not identical to
1956 		 *   on-disk content any more, otherwise, a later write/recover
1957 		 *   may steal stripe_pages from this rbio and end up with
1958 		 *   corruptions or rebuild failures.
1959 		 *
1960 		 * - In case of single failure, where rbio->failb == -1:
1961 		 *
1962 		 *   Cache this rbio iff the above read reconstruction is
1963 		 *   executed without problems.
1964 		 */
1965 		if (err == BLK_STS_OK && rbio->failb < 0)
1966 			cache_rbio_pages(rbio);
1967 		else
1968 			clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1969 
1970 		rbio_orig_end_io(rbio, err);
1971 	} else if (err == BLK_STS_OK) {
1972 		rbio->faila = -1;
1973 		rbio->failb = -1;
1974 
1975 		if (rbio->operation == BTRFS_RBIO_WRITE)
1976 			finish_rmw(rbio);
1977 		else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
1978 			finish_parity_scrub(rbio, 0);
1979 		else
1980 			BUG();
1981 	} else {
1982 		rbio_orig_end_io(rbio, err);
1983 	}
1984 }
1985 
1986 /*
1987  * This is called only for stripes we've read from disk to
1988  * reconstruct the parity.
1989  */
1990 static void raid_recover_end_io(struct bio *bio)
1991 {
1992 	struct btrfs_raid_bio *rbio = bio->bi_private;
1993 
1994 	/*
1995 	 * We only read stripe pages off the disk; mark them
1996 	 * uptodate if there were no errors.
1997 	 */
1998 	if (bio->bi_status)
1999 		fail_bio_stripe(rbio, bio);
2000 	else
2001 		set_bio_pages_uptodate(bio);
2002 	bio_put(bio);
2003 
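	/* only the last bio to complete goes on to check errors and rebuild */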
2004 	if (!atomic_dec_and_test(&rbio->stripes_pending))
2005 		return;
2006 
2007 	if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
2008 		rbio_orig_end_io(rbio, BLK_STS_IOERR);
2009 	else
2010 		__raid_recover_end_io(rbio);
2011 }
2012 
2013 /*
2014  * reads everything we need off the disk to reconstruct
2015  * the parity. endio handlers trigger final reconstruction
2016  * when the IO is done.
2017  *
2018  * This is used both for reads from the higher layers and for
2019  * parity construction required to finish an rmw cycle.
2020  */
2021 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
2022 {
2023 	int bios_to_read = 0;
2024 	struct bio_list bio_list;
2025 	int ret;
2026 	int pagenr;
2027 	int stripe;
2028 	struct bio *bio;
2029 
2030 	bio_list_init(&bio_list);
2031 
2032 	ret = alloc_rbio_pages(rbio);
2033 	if (ret)
2034 		goto cleanup;
2035 
2036 	atomic_set(&rbio->error, 0);
2037 
2038 	/*
2039 	 * Read everything that hasn't failed; the stripes we already know
2040 	 * are bad just bump the error count.  Thanks to the stripe cache,
2041 	 * some or all of these pages may already be uptodate.
2042 	 */
2043 	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2044 		if (rbio->faila == stripe || rbio->failb == stripe) {
2045 			atomic_inc(&rbio->error);
2046 			continue;
2047 		}
2048 
2049 		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
2050 			struct page *p;
2051 
2052 			/*
2053 			 * the rmw code may have already read this
2054 			 * page in
2055 			 */
2056 			p = rbio_stripe_page(rbio, stripe, pagenr);
2057 			if (PageUptodate(p))
2058 				continue;
2059 
2060 			ret = rbio_add_io_page(rbio, &bio_list,
2061 				       rbio_stripe_page(rbio, stripe, pagenr),
2062 				       stripe, pagenr, rbio->stripe_len);
2063 			if (ret < 0)
2064 				goto cleanup;
2065 		}
2066 	}
2067 
2068 	bios_to_read = bio_list_size(&bio_list);
2069 	if (!bios_to_read) {
2070 		/*
2071 		 * We might have no bios to read either because the pages
2072 		 * were already uptodate, or because the devices
2073 		 * were gone.
2074 		 */
2075 		if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) {
2076 			__raid_recover_end_io(rbio);
2077 			return 0;
2078 		} else {
2079 			goto cleanup;
2080 		}
2081 	}
2082 
2083 	/*
2084 	 * The bioc may be freed once we submit the last bio. Make sure not to
2085 	 * touch it after that.
2086 	 */
2087 	atomic_set(&rbio->stripes_pending, bios_to_read);
2088 	while ((bio = bio_list_pop(&bio_list))) {
2089 		bio->bi_private = rbio;
2090 		bio->bi_end_io = raid_recover_end_io;
2091 		bio->bi_opf = REQ_OP_READ;
2092 
2093 		btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
2094 
2095 		submit_bio(bio);
2096 	}
2097 
2098 	return 0;
2099 
2100 cleanup:
2101 	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
2102 	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
2103 		rbio_orig_end_io(rbio, BLK_STS_IOERR);
2104 
2105 	while ((bio = bio_list_pop(&bio_list)))
2106 		bio_put(bio);
2107 
2108 	return -EIO;
2109 }
2110 
2111 /*
2112  * the main entry point for reads from the higher layers.  This
2113  * is really only called when the normal read path had a failure,
2114  * so we assume the bio they send down corresponds to a failed part
2115  * of the drive.
2116  */
2117 int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
2118 			  u64 stripe_len, int mirror_num, int generic_io)
2119 {
2120 	struct btrfs_fs_info *fs_info = bioc->fs_info;
2121 	struct btrfs_raid_bio *rbio;
2122 	int ret;
2123 
2124 	if (generic_io) {
2125 		ASSERT(bioc->mirror_num == mirror_num);
2126 		btrfs_bio(bio)->mirror_num = mirror_num;
2127 	}
2128 
2129 	rbio = alloc_rbio(fs_info, bioc, stripe_len);
2130 	if (IS_ERR(rbio)) {
2131 		if (generic_io)
2132 			btrfs_put_bioc(bioc);
2133 		return PTR_ERR(rbio);
2134 	}
2135 
2136 	rbio->operation = BTRFS_RBIO_READ_REBUILD;
2137 	bio_list_add(&rbio->bio_list, bio);
2138 	rbio->bio_list_bytes = bio->bi_iter.bi_size;
2139 
2140 	rbio->faila = find_logical_bio_stripe(rbio, bio);
2141 	if (rbio->faila == -1) {
2142 		btrfs_warn(fs_info,
2143 "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
2144 			   __func__, bio->bi_iter.bi_sector << 9,
2145 			   (u64)bio->bi_iter.bi_size, bioc->map_type);
2146 		if (generic_io)
2147 			btrfs_put_bioc(bioc);
2148 		kfree(rbio);
2149 		return -EIO;
2150 	}
2151 
2152 	if (generic_io) {
2153 		btrfs_bio_counter_inc_noblocked(fs_info);
2154 		rbio->generic_bio_cnt = 1;
2155 	} else {
2156 		btrfs_get_bioc(bioc);
2157 	}
2158 
2159 	/*
2160 	 * Loop retry:
2161 	 * for 'mirror_num == 2', reconstruct from all other stripes.
2162 	 * for 'mirror_num > 2', select a stripe to fail on every retry.
2163 	 */
2164 	if (mirror_num > 2) {
2165 		/*
2166 		 * 'mirror_num == 3' is to fail the P stripe and
2167 		 * reconstruct from the Q stripe.  'mirror_num > 3' is to
2168 		 * fail a data stripe and reconstruct from P and Q.
2169 		 */
2170 		rbio->failb = rbio->real_stripes - (mirror_num - 1);
2171 		ASSERT(rbio->failb > 0);
2172 		if (rbio->failb <= rbio->faila)
2173 			rbio->failb--;
2174 	}
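	/*
	 * E.g. with two data stripes plus P and Q (real_stripes == 4),
	 * mirror_num == 3 picks failb == 2 (the P stripe) and mirror_num == 4
	 * picks failb == 1 (the last data stripe); if failb lands at or below
	 * faila it is shifted down by one so each retry fails a different
	 * stripe.
	 */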
2175 
2176 	ret = lock_stripe_add(rbio);
2177 
2178 	/*
2179 	 * __raid56_parity_recover will end the bio with
2180 	 * any errors it hits.  We don't want to return
2181 	 * its error value up the stack because our caller
2182 	 * will end up calling bio_endio with any nonzero
2183 	 * return
2184 	 */
2185 	if (ret == 0)
2186 		__raid56_parity_recover(rbio);
2187 	/*
2188 	 * Otherwise our rbio has been added to the list of
2189 	 * rbios that will be handled after the
2190 	 * current lock owner is done.
2191 	 */
2192 	return 0;
2194 }
2195 
2196 static void rmw_work(struct btrfs_work *work)
2197 {
2198 	struct btrfs_raid_bio *rbio;
2199 
2200 	rbio = container_of(work, struct btrfs_raid_bio, work);
2201 	raid56_rmw_stripe(rbio);
2202 }
2203 
2204 static void read_rebuild_work(struct btrfs_work *work)
2205 {
2206 	struct btrfs_raid_bio *rbio;
2207 
2208 	rbio = container_of(work, struct btrfs_raid_bio, work);
2209 	__raid56_parity_recover(rbio);
2210 }
2211 
2212 /*
2213  * The following code is used to scrub/replace the parity stripe
2214  *
2215  * Caller must have already increased bio_counter for getting @bioc.
2216  *
2217  * Note: We need to make sure all the pages added into the scrub/replace
2218  * raid bio are correct and are not changed during the scrub/replace, i.e.
2219  * those pages only hold metadata or file data covered by a checksum.
2220  */
2221 
2222 struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
2223 				struct btrfs_io_context *bioc,
2224 				u64 stripe_len, struct btrfs_device *scrub_dev,
2225 				unsigned long *dbitmap, int stripe_nsectors)
2226 {
2227 	struct btrfs_fs_info *fs_info = bioc->fs_info;
2228 	struct btrfs_raid_bio *rbio;
2229 	int i;
2230 
2231 	rbio = alloc_rbio(fs_info, bioc, stripe_len);
2232 	if (IS_ERR(rbio))
2233 		return NULL;
2234 	bio_list_add(&rbio->bio_list, bio);
2235 	/*
2236 	 * This is a special bio which is used to hold the completion handler
2237 	 * and make the scrub rbio similar to the other types.
2238 	 */
2239 	ASSERT(!bio->bi_iter.bi_size);
2240 	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2241 
2242 	/*
2243 	 * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
2244 	 * to the end position, so this search can start from the first parity
2245 	 * stripe.
2246 	 */
2247 	for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
2248 		if (bioc->stripes[i].dev == scrub_dev) {
2249 			rbio->scrubp = i;
2250 			break;
2251 		}
2252 	}
2253 	ASSERT(i < rbio->real_stripes);
2254 
2255 	/* For now we only support sectorsize equal to PAGE_SIZE */
2256 	ASSERT(fs_info->sectorsize == PAGE_SIZE);
2257 	ASSERT(rbio->stripe_npages == stripe_nsectors);
2258 	bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
2259 
2260 	/*
2261 	 * We have already increased bio_counter when getting bioc, record it
2262 	 * so we can free it at rbio_orig_end_io().
2263 	 */
2264 	rbio->generic_bio_cnt = 1;
2265 
2266 	return rbio;
2267 }
2268 
2269 /* Used for both parity scrub and missing. */
2270 void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
2271 			    u64 logical)
2272 {
2273 	int stripe_offset;
2274 	int index;
2275 
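	/*
	 * bio_pages[] is indexed by page offset from the start of the full
	 * stripe (bioc->raid_map[0]); the asserts below make sure @logical
	 * falls inside the data portion of this full stripe.
	 */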
2276 	ASSERT(logical >= rbio->bioc->raid_map[0]);
2277 	ASSERT(logical + PAGE_SIZE <= rbio->bioc->raid_map[0] +
2278 				rbio->stripe_len * rbio->nr_data);
2279 	stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
2280 	index = stripe_offset >> PAGE_SHIFT;
2281 	rbio->bio_pages[index] = page;
2282 }
2283 
2284 /*
2285  * We only scrub the parity for which we have correct data on the same
2286  * horizontal stripe, so we don't need to allocate pages for all the stripes.
2287  */
2288 static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2289 {
2290 	int i;
2291 	int bit;
2292 	int index;
2293 	struct page *page;
2294 
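	/*
	 * stripe_pages[] is laid out stripe by stripe: page 'bit' of stripe
	 * 'i' lives at index i * stripe_npages + bit.  Only the horizontal
	 * stripes marked in dbitmap need pages.
	 */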
2295 	for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
2296 		for (i = 0; i < rbio->real_stripes; i++) {
2297 			index = i * rbio->stripe_npages + bit;
2298 			if (rbio->stripe_pages[index])
2299 				continue;
2300 
2301 			page = alloc_page(GFP_NOFS);
2302 			if (!page)
2303 				return -ENOMEM;
2304 			rbio->stripe_pages[index] = page;
2305 		}
2306 	}
2307 	return 0;
2308 }
2309 
2310 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
2311 					 int need_check)
2312 {
2313 	struct btrfs_io_context *bioc = rbio->bioc;
2314 	void **pointers = rbio->finish_pointers;
2315 	unsigned long *pbitmap = rbio->finish_pbitmap;
2316 	int nr_data = rbio->nr_data;
2317 	int stripe;
2318 	int pagenr;
2319 	bool has_qstripe;
2320 	struct page *p_page = NULL;
2321 	struct page *q_page = NULL;
2322 	struct bio_list bio_list;
2323 	struct bio *bio;
2324 	int is_replace = 0;
2325 	int ret;
2326 
2327 	bio_list_init(&bio_list);
2328 
2329 	if (rbio->real_stripes - rbio->nr_data == 1)
2330 		has_qstripe = false;
2331 	else if (rbio->real_stripes - rbio->nr_data == 2)
2332 		has_qstripe = true;
2333 	else
2334 		BUG();
2335 
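	/*
	 * If a dev-replace is running on the device holding the parity we
	 * scrub, remember which pages must also be written to the replace
	 * target; those writes are issued in the writeback phase below.
	 */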
2336 	if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
2337 		is_replace = 1;
2338 		bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
2339 	}
2340 
2341 	/*
2342 	 * The higher layers (the scrubber) are unlikely to use
2343 	 * this area of the disk again soon, so don't cache
2344 	 * it.
2345 	 */
2346 	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2347 
2348 	if (!need_check)
2349 		goto writeback;
2350 
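	/*
	 * Compute the parity into scratch pages so we can compare the result
	 * against the parity that is currently on disk without overwriting it.
	 */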
2351 	p_page = alloc_page(GFP_NOFS);
2352 	if (!p_page)
2353 		goto cleanup;
2354 	SetPageUptodate(p_page);
2355 
2356 	if (has_qstripe) {
2357 		/* RAID6, allocate and map temp space for the Q stripe */
2358 		q_page = alloc_page(GFP_NOFS);
2359 		if (!q_page) {
2360 			__free_page(p_page);
2361 			goto cleanup;
2362 		}
2363 		SetPageUptodate(q_page);
2364 		pointers[rbio->real_stripes - 1] = kmap_local_page(q_page);
2365 	}
2366 
2367 	atomic_set(&rbio->error, 0);
2368 
2369 	/* Map the parity stripe just once */
2370 	pointers[nr_data] = kmap_local_page(p_page);
2371 
2372 	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2373 		struct page *p;
2374 		void *parity;
2375 		/* first collect one page from each data stripe */
2376 		for (stripe = 0; stripe < nr_data; stripe++) {
2377 			p = page_in_rbio(rbio, stripe, pagenr, 0);
2378 			pointers[stripe] = kmap_local_page(p);
2379 		}
2380 
2381 		if (has_qstripe) {
2382 			/* RAID6, call the library function to fill in our P/Q */
2383 			raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
2384 						pointers);
2385 		} else {
2386 			/* raid5 */
2387 			copy_page(pointers[nr_data], pointers[0]);
2388 			run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
2389 		}
2390 
2391 		/* Check scrubbing parity and repair it */
2392 		p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2393 		parity = kmap_local_page(p);
2394 		if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
2395 			copy_page(parity, pointers[rbio->scrubp]);
2396 		else
2397 			/* Parity is right, no need to write it back */
2398 			bitmap_clear(rbio->dbitmap, pagenr, 1);
2399 		kunmap_local(parity);
2400 
2401 		for (stripe = nr_data - 1; stripe >= 0; stripe--)
2402 			kunmap_local(pointers[stripe]);
2403 	}
2404 
2405 	kunmap_local(pointers[nr_data]);
2406 	__free_page(p_page);
2407 	if (q_page) {
2408 		kunmap_local(pointers[rbio->real_stripes - 1]);
2409 		__free_page(q_page);
2410 	}
2411 
2412 writeback:
2413 	/*
2414 	 * time to start writing.  Make bios for everything from the
2415 	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
2416 	 * everything else.
2417 	 */
2418 	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2419 		struct page *page;
2420 
2421 		page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2422 		ret = rbio_add_io_page(rbio, &bio_list,
2423 			       page, rbio->scrubp, pagenr, rbio->stripe_len);
2424 		if (ret)
2425 			goto cleanup;
2426 	}
2427 
2428 	if (!is_replace)
2429 		goto submit_write;
2430 
2431 	for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
2432 		struct page *page;
2433 
2434 		page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2435 		ret = rbio_add_io_page(rbio, &bio_list, page,
2436 				       bioc->tgtdev_map[rbio->scrubp],
2437 				       pagenr, rbio->stripe_len);
2438 		if (ret)
2439 			goto cleanup;
2440 	}
2441 
2442 submit_write:
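	/* nr_data is reused here as the number of write bios to submit */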
2443 	nr_data = bio_list_size(&bio_list);
2444 	if (!nr_data) {
2445 		/* Every parity is right */
2446 		rbio_orig_end_io(rbio, BLK_STS_OK);
2447 		return;
2448 	}
2449 
2450 	atomic_set(&rbio->stripes_pending, nr_data);
2451 
2452 	while ((bio = bio_list_pop(&bio_list))) {
2453 		bio->bi_private = rbio;
2454 		bio->bi_end_io = raid_write_end_io;
2455 		bio->bi_opf = REQ_OP_WRITE;
2456 
2457 		submit_bio(bio);
2458 	}
2459 	return;
2460 
2461 cleanup:
2462 	rbio_orig_end_io(rbio, BLK_STS_IOERR);
2463 
2464 	while ((bio = bio_list_pop(&bio_list)))
2465 		bio_put(bio);
2466 }
2467 
2468 static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2469 {
2470 	if (stripe >= 0 && stripe < rbio->nr_data)
2471 		return 1;
2472 	return 0;
2473 }
2474 
2475 /*
2476  * While we're doing the parity check and repair, we could have errors
2477  * in reading pages off the disk.  This checks for errors and if we're
2478  * not able to read the page it'll trigger parity reconstruction.  The
2479  * parity scrub will be finished after we've reconstructed the failed
2480  * stripes
2481  */
2482 static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
2483 {
2484 	if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
2485 		goto cleanup;
2486 
2487 	if (rbio->faila >= 0 || rbio->failb >= 0) {
2488 		int dfail = 0, failp = -1;
2489 
2490 		if (is_data_stripe(rbio, rbio->faila))
2491 			dfail++;
2492 		else if (is_parity_stripe(rbio->faila))
2493 			failp = rbio->faila;
2494 
2495 		if (is_data_stripe(rbio, rbio->failb))
2496 			dfail++;
2497 		else if (is_parity_stripe(rbio->failb))
2498 			failp = rbio->failb;
2499 
2500 		/*
2501 		 * We cannot use the parity that is being scrubbed to repair
2502 		 * the data, so our repair capability is reduced by one.
2503 		 * (In the case of RAID5, we cannot repair anything.)
2504 		 */
2505 		if (dfail > rbio->bioc->max_errors - 1)
2506 			goto cleanup;
2507 
2508 		/*
2509 		 * If all the data is good and only the parity is bad,
2510 		 * just repair the parity.
2511 		 */
2512 		if (dfail == 0) {
2513 			finish_parity_scrub(rbio, 0);
2514 			return;
2515 		}
2516 
2517 		/*
2518 		 * Getting here means we have one corrupted data stripe and
2519 		 * one corrupted parity on RAID6.  If the corrupted parity is
2520 		 * the one being scrubbed, we can luckily use the other parity
2521 		 * to repair the data; otherwise we cannot repair the data stripe.
2522 		 */
2523 		if (failp != rbio->scrubp)
2524 			goto cleanup;
2525 
2526 		__raid_recover_end_io(rbio);
2527 	} else {
2528 		finish_parity_scrub(rbio, 1);
2529 	}
2530 	return;
2531 
2532 cleanup:
2533 	rbio_orig_end_io(rbio, BLK_STS_IOERR);
2534 }
2535 
2536 /*
2537  * end io for the read phase of the parity scrub.  All the bios here are
2538  * physical stripe bios we've read from the disk so we can recalculate the
2539  * parity of the stripe.
2540  *
2541  * This will usually kick off finish_parity_scrub once all the bios are read
2542  * in, but it may trigger parity reconstruction if we had any errors along the way.
2543  */
2544 static void raid56_parity_scrub_end_io(struct bio *bio)
2545 {
2546 	struct btrfs_raid_bio *rbio = bio->bi_private;
2547 
2548 	if (bio->bi_status)
2549 		fail_bio_stripe(rbio, bio);
2550 	else
2551 		set_bio_pages_uptodate(bio);
2552 
2553 	bio_put(bio);
2554 
2555 	if (!atomic_dec_and_test(&rbio->stripes_pending))
2556 		return;
2557 
2558 	/*
2559 	 * this will normally call finish_parity_scrub to start our write
2560 	 * but if there are any failed stripes we'll reconstruct
2561 	 * from parity first
2562 	 */
2563 	validate_rbio_for_parity_scrub(rbio);
2564 }
2565 
2566 static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
2567 {
2568 	int bios_to_read = 0;
2569 	struct bio_list bio_list;
2570 	int ret;
2571 	int pagenr;
2572 	int stripe;
2573 	struct bio *bio;
2574 
2575 	bio_list_init(&bio_list);
2576 
2577 	ret = alloc_rbio_essential_pages(rbio);
2578 	if (ret)
2579 		goto cleanup;
2580 
2581 	atomic_set(&rbio->error, 0);
2582 	/*
2583 	 * build a list of bios to read all the missing parts of this
2584 	 * stripe
2585 	 */
2586 	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2587 		for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2588 			struct page *page;
2589 			/*
2590 			 * we want to find all the pages missing from
2591 			 * the rbio and read them from the disk.  If
2592 			 * page_in_rbio finds a page in the bio list
2593 			 * we don't need to read it off the stripe.
2594 			 */
2595 			page = page_in_rbio(rbio, stripe, pagenr, 1);
2596 			if (page)
2597 				continue;
2598 
2599 			page = rbio_stripe_page(rbio, stripe, pagenr);
2600 			/*
2601 			 * the bio cache may have handed us an uptodate
2602 			 * page.  If so, be happy and use it
2603 			 */
2604 			if (PageUptodate(page))
2605 				continue;
2606 
2607 			ret = rbio_add_io_page(rbio, &bio_list, page,
2608 				       stripe, pagenr, rbio->stripe_len);
2609 			if (ret)
2610 				goto cleanup;
2611 		}
2612 	}
2613 
2614 	bios_to_read = bio_list_size(&bio_list);
2615 	if (!bios_to_read) {
2616 		/*
2617 		 * This can happen if others have merged with
2618 		 * us; it means there is nothing left to read.
2619 		 * But if there are missing devices it may not be
2620 		 * safe to do the full stripe write yet.
2621 		 */
2622 		goto finish;
2623 	}
2624 
2625 	/*
2626 	 * The bioc may be freed once we submit the last bio. Make sure not to
2627 	 * touch it after that.
2628 	 */
2629 	atomic_set(&rbio->stripes_pending, bios_to_read);
2630 	while ((bio = bio_list_pop(&bio_list))) {
2631 		bio->bi_private = rbio;
2632 		bio->bi_end_io = raid56_parity_scrub_end_io;
2633 		bio->bi_opf = REQ_OP_READ;
2634 
2635 		btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
2636 
2637 		submit_bio(bio);
2638 	}
2639 	/* the actual write will happen once the reads are done */
2640 	return;
2641 
2642 cleanup:
2643 	rbio_orig_end_io(rbio, BLK_STS_IOERR);
2644 
2645 	while ((bio = bio_list_pop(&bio_list)))
2646 		bio_put(bio);
2647 
2648 	return;
2649 
2650 finish:
2651 	validate_rbio_for_parity_scrub(rbio);
2652 }
2653 
2654 static void scrub_parity_work(struct btrfs_work *work)
2655 {
2656 	struct btrfs_raid_bio *rbio;
2657 
2658 	rbio = container_of(work, struct btrfs_raid_bio, work);
2659 	raid56_parity_scrub_stripe(rbio);
2660 }
2661 
2662 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
2663 {
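	/*
	 * lock_stripe_add() returns 0 when we own the stripe lock and can
	 * start the scrub now; otherwise the rbio has been queued behind the
	 * current lock owner and will be started when that owner unlocks.
	 */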
2664 	if (!lock_stripe_add(rbio))
2665 		start_async_work(rbio, scrub_parity_work);
2666 }
2667 
2668 /* The following code is used for dev replace of a missing RAID 5/6 device. */
2669 
2670 struct btrfs_raid_bio *
2671 raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
2672 			  u64 length)
2673 {
2674 	struct btrfs_fs_info *fs_info = bioc->fs_info;
2675 	struct btrfs_raid_bio *rbio;
2676 
2677 	rbio = alloc_rbio(fs_info, bioc, length);
2678 	if (IS_ERR(rbio))
2679 		return NULL;
2680 
2681 	rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
2682 	bio_list_add(&rbio->bio_list, bio);
2683 	/*
2684 	 * This is a special bio which is used to hold the completion handler
2685 	 * and make the rbio similar to the other types.
2686 	 */
2687 	ASSERT(!bio->bi_iter.bi_size);
2688 
2689 	rbio->faila = find_logical_bio_stripe(rbio, bio);
2690 	if (rbio->faila == -1) {
2691 		BUG();
2692 		kfree(rbio);
2693 		return NULL;
2694 	}
2695 
2696 	/*
2697 	 * When we get bioc, we have already increased bio_counter, record it
2698 	 * so we can free it at rbio_orig_end_io()
2699 	 */
2700 	rbio->generic_bio_cnt = 1;
2701 
2702 	return rbio;
2703 }
2704 
2705 void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
2706 {
2707 	if (!lock_stripe_add(rbio))
2708 		start_async_work(rbio, read_rebuild_work);
2709 }
2710