xref: /linux/fs/btrfs/raid56.c (revision 7696286034ac72cf9b46499be1715ac62fd302c3)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2012 Fusion-io  All rights reserved.
4  * Copyright (C) 2012 Intel Corp. All rights reserved.
5  */
6 
7 #include <linux/sched.h>
8 #include <linux/bio.h>
9 #include <linux/slab.h>
10 #include <linux/blkdev.h>
11 #include <linux/raid/pq.h>
12 #include <linux/hash.h>
13 #include <linux/list_sort.h>
14 #include <linux/raid/xor.h>
15 #include <linux/mm.h>
16 #include "messages.h"
17 #include "ctree.h"
18 #include "disk-io.h"
19 #include "volumes.h"
20 #include "raid56.h"
21 #include "async-thread.h"
22 #include "file-item.h"
23 #include "btrfs_inode.h"
24 
25 /* set when additional merges to this rbio are not allowed */
26 #define RBIO_RMW_LOCKED_BIT	1
27 
28 /*
29  * set when this rbio is sitting in the hash, but it is just a cache
30  * of past RMW
31  */
32 #define RBIO_CACHE_BIT		2
33 
34 /*
35  * set when it is safe to trust the stripe_pages for caching
36  */
37 #define RBIO_CACHE_READY_BIT	3
38 
39 #define RBIO_CACHE_SIZE 1024
40 
41 #define BTRFS_STRIPE_HASH_TABLE_BITS				11
42 
43 static void dump_bioc(const struct btrfs_fs_info *fs_info, const struct btrfs_io_context *bioc)
44 {
45 	if (unlikely(!bioc)) {
46 		btrfs_crit(fs_info, "bioc=NULL");
47 		return;
48 	}
49 	btrfs_crit(fs_info,
50 "bioc logical=%llu full_stripe=%llu size=%llu map_type=0x%llx mirror=%u replace_nr_stripes=%u replace_stripe_src=%d num_stripes=%u",
51 		bioc->logical, bioc->full_stripe_logical, bioc->size,
52 		bioc->map_type, bioc->mirror_num, bioc->replace_nr_stripes,
53 		bioc->replace_stripe_src, bioc->num_stripes);
54 	for (int i = 0; i < bioc->num_stripes; i++) {
55 		btrfs_crit(fs_info, "    nr=%d devid=%llu physical=%llu",
56 			   i, bioc->stripes[i].dev->devid,
57 			   bioc->stripes[i].physical);
58 	}
59 }
60 
61 static void btrfs_dump_rbio(const struct btrfs_fs_info *fs_info,
62 			    const struct btrfs_raid_bio *rbio)
63 {
64 	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
65 		return;
66 
67 	dump_bioc(fs_info, rbio->bioc);
68 	btrfs_crit(fs_info,
69 "rbio flags=0x%lx nr_sectors=%u nr_data=%u real_stripes=%u stripe_nsectors=%u sector_nsteps=%u scrubp=%u dbitmap=0x%lx",
70 		rbio->flags, rbio->nr_sectors, rbio->nr_data,
71 		rbio->real_stripes, rbio->stripe_nsectors,
72 		rbio->sector_nsteps, rbio->scrubp, rbio->dbitmap);
73 }
74 
75 #define ASSERT_RBIO(expr, rbio)						\
76 ({									\
77 	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
78 		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
79 					(rbio)->bioc->fs_info : NULL;	\
80 									\
81 		btrfs_dump_rbio(__fs_info, (rbio));			\
82 	}								\
83 	ASSERT((expr));							\
84 })
85 
86 #define ASSERT_RBIO_STRIPE(expr, rbio, stripe_nr)			\
87 ({									\
88 	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
89 		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
90 					(rbio)->bioc->fs_info : NULL;	\
91 									\
92 		btrfs_dump_rbio(__fs_info, (rbio));			\
93 		btrfs_crit(__fs_info, "stripe_nr=%d", (stripe_nr));	\
94 	}								\
95 	ASSERT((expr));							\
96 })
97 
98 #define ASSERT_RBIO_SECTOR(expr, rbio, sector_nr)			\
99 ({									\
100 	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
101 		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
102 					(rbio)->bioc->fs_info : NULL;	\
103 									\
104 		btrfs_dump_rbio(__fs_info, (rbio));			\
105 		btrfs_crit(__fs_info, "sector_nr=%d", (sector_nr));	\
106 	}								\
107 	ASSERT((expr));							\
108 })
109 
110 #define ASSERT_RBIO_LOGICAL(expr, rbio, logical)			\
111 ({									\
112 	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
113 		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
114 					(rbio)->bioc->fs_info : NULL;	\
115 									\
116 		btrfs_dump_rbio(__fs_info, (rbio));			\
117 		btrfs_crit(__fs_info, "logical=%llu", (logical));		\
118 	}								\
119 	ASSERT((expr));							\
120 })
121 
122 /* Used by the raid56 code to lock stripes for read/modify/write */
123 struct btrfs_stripe_hash {
124 	struct list_head hash_list;
125 	spinlock_t lock;
126 };
127 
128 /* Used by the raid56 code to lock stripes for read/modify/write */
129 struct btrfs_stripe_hash_table {
130 	struct list_head stripe_cache;
131 	spinlock_t cache_lock;
132 	int cache_size;
133 	struct btrfs_stripe_hash table[];
134 };
135 
136 /*
137  * The PFN may still be valid, but our paddrs should always be block size
138  * aligned, thus such -1 paddr is definitely not a valid one.
139  */
140 #define INVALID_PADDR	(~(phys_addr_t)0)
141 
142 static void rmw_rbio_work(struct work_struct *work);
143 static void rmw_rbio_work_locked(struct work_struct *work);
144 static void index_rbio_pages(struct btrfs_raid_bio *rbio);
145 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
146 
147 static int finish_parity_scrub(struct btrfs_raid_bio *rbio);
148 static void scrub_rbio_work_locked(struct work_struct *work);
149 
150 static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio)
151 {
152 	bitmap_free(rbio->error_bitmap);
153 	kfree(rbio->stripe_pages);
154 	kfree(rbio->bio_paddrs);
155 	kfree(rbio->stripe_paddrs);
156 	kfree(rbio->finish_pointers);
157 }
158 
159 static void free_raid_bio(struct btrfs_raid_bio *rbio)
160 {
161 	int i;
162 
163 	if (!refcount_dec_and_test(&rbio->refs))
164 		return;
165 
166 	WARN_ON(!list_empty(&rbio->stripe_cache));
167 	WARN_ON(!list_empty(&rbio->hash_list));
168 	WARN_ON(!bio_list_empty(&rbio->bio_list));
169 
170 	for (i = 0; i < rbio->nr_pages; i++) {
171 		if (rbio->stripe_pages[i]) {
172 			__free_page(rbio->stripe_pages[i]);
173 			rbio->stripe_pages[i] = NULL;
174 		}
175 	}
176 
177 	btrfs_put_bioc(rbio->bioc);
178 	free_raid_bio_pointers(rbio);
179 	kfree(rbio);
180 }
181 
182 static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
183 {
184 	INIT_WORK(&rbio->work, work_func);
185 	queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
186 }
187 
188 /*
189  * the stripe hash table is used for locking, and to collect
190  * bios in hopes of making a full stripe
191  */
192 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
193 {
194 	struct btrfs_stripe_hash_table *table;
195 	struct btrfs_stripe_hash_table *x;
196 	struct btrfs_stripe_hash *cur;
197 	struct btrfs_stripe_hash *h;
198 	unsigned int num_entries = 1U << BTRFS_STRIPE_HASH_TABLE_BITS;
199 
200 	if (info->stripe_hash_table)
201 		return 0;
202 
203 	/*
204 	 * The table is large, starting with order 4 and can go as high as
205 	 * order 7 in case lock debugging is turned on.
206 	 *
207 	 * Try harder to allocate and fall back to vmalloc to lower the chance
208 	 * of a failing mount.
209 	 */
210 	table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
211 	if (!table)
212 		return -ENOMEM;
213 
214 	spin_lock_init(&table->cache_lock);
215 	INIT_LIST_HEAD(&table->stripe_cache);
216 
217 	h = table->table;
218 
219 	for (unsigned int i = 0; i < num_entries; i++) {
220 		cur = h + i;
221 		INIT_LIST_HEAD(&cur->hash_list);
222 		spin_lock_init(&cur->lock);
223 	}
224 
225 	x = cmpxchg(&info->stripe_hash_table, NULL, table);
226 	kvfree(x);
227 	return 0;
228 }
229 
230 static void memcpy_from_bio_to_stripe(struct btrfs_raid_bio *rbio, unsigned int sector_nr)
231 {
232 	const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);
233 
234 	ASSERT(sector_nr < rbio->nr_sectors);
235 	for (int i = 0; i < rbio->sector_nsteps; i++) {
236 		unsigned int index = sector_nr * rbio->sector_nsteps + i;
237 		phys_addr_t dst = rbio->stripe_paddrs[index];
238 		phys_addr_t src = rbio->bio_paddrs[index];
239 
240 		ASSERT(dst != INVALID_PADDR);
241 		ASSERT(src != INVALID_PADDR);
242 
243 		memcpy_page(phys_to_page(dst), offset_in_page(dst),
244 			    phys_to_page(src), offset_in_page(src), step);
245 	}
246 }
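
/*
 * A rough example of how the step based copy above plays out: with a
 * (hypothetical) 16K block size on 4K pages, step is 4K and
 * sector_nsteps is 4, so one fs block is copied in four page sized
 * chunks.  With 4K blocks on 4K pages, step is 4K and sector_nsteps is
 * 1, and the loop degenerates into a single memcpy_page() per block.
 */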
247 
248 /*
249  * Caching an rbio means to copy everything from the bio_paddrs
250  * array into the stripe_pages array.  We use the
251  * stripe_uptodate_bitmap to indicate which sectors have
252  * valid data.
253  *
254  * Once the caching is done, we set the cache ready
255  * bit.
256  */
257 static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
258 {
259 	int i;
260 	int ret;
261 
262 	ret = alloc_rbio_pages(rbio);
263 	if (ret)
264 		return;
265 
266 	for (i = 0; i < rbio->nr_sectors; i++) {
267 		/* Some range not covered by bio (partial write), skip it */
268 		if (rbio->bio_paddrs[i * rbio->sector_nsteps] == INVALID_PADDR) {
269 			/*
270 			 * Even if the sector is not covered by bio, if it is
271 			 * a data sector it should still be uptodate as it is
272 			 * read from disk.
273 			 */
274 			if (i < rbio->nr_data * rbio->stripe_nsectors)
275 				ASSERT(test_bit(i, rbio->stripe_uptodate_bitmap));
276 			continue;
277 		}
278 
279 		memcpy_from_bio_to_stripe(rbio, i);
280 		set_bit(i, rbio->stripe_uptodate_bitmap);
281 	}
282 	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
283 }
284 
285 /*
286  * we hash on the first logical address of the stripe
287  */
288 static int rbio_bucket(struct btrfs_raid_bio *rbio)
289 {
290 	u64 num = rbio->bioc->full_stripe_logical;
291 
292 	/*
293 	 * we shift down quite a bit.  We're using byte
294 	 * addressing, and most of the lower bits are zeros.
295 	 * This tends to upset hash_64, and it consistently
296 	 * returns just one or two different values.
297 	 *
298 	 * shifting off the lower bits fixes things.
299 	 */
300 	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
301 }
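
/*
 * For illustration: full stripe start addresses are effectively
 * BTRFS_STRIPE_LEN (64K) granular, so the low 16 bits of @num carry
 * essentially no information.  Hashing (num >> 16) feeds hash_64()
 * values that really differ between neighbouring full stripes, instead
 * of a pile of trailing zeros that all land in the same few buckets.
 */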
302 
303 /* Get the sector number of the first sector covered by @page_nr. */
304 static u32 page_nr_to_sector_nr(struct btrfs_raid_bio *rbio, unsigned int page_nr)
305 {
306 	u32 sector_nr;
307 
308 	ASSERT(page_nr < rbio->nr_pages);
309 
310 	sector_nr = (page_nr << PAGE_SHIFT) >> rbio->bioc->fs_info->sectorsize_bits;
311 	ASSERT(sector_nr < rbio->nr_sectors);
312 	return sector_nr;
313 }
314 
315 /*
316  * Get the number of sectors covered by @page_nr.
317  *
318  * For bs > ps cases, the result will always be 1.
319  * For bs <= ps cases, the result will be ps / bs.
320  */
321 static u32 page_nr_to_num_sectors(struct btrfs_raid_bio *rbio, unsigned int page_nr)
322 {
323 	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
324 	u32 nr_sectors;
325 
326 	ASSERT(page_nr < rbio->nr_pages);
327 
328 	nr_sectors = round_up(PAGE_SIZE, fs_info->sectorsize) >> fs_info->sectorsize_bits;
329 	ASSERT(nr_sectors > 0);
330 	return nr_sectors;
331 }
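
/*
 * For example: with 4K blocks on 4K pages the helpers above give one
 * sector per page; with 4K blocks on 64K pages (e.g. some arm64/ppc64
 * configs) each page covers 16 sectors; with a block size larger than
 * the page size the round_up() yields 1, since a page then covers only
 * part of a single fs block.
 */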
332 
333 static __maybe_unused bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
334 						      unsigned int page_nr)
335 {
336 	const u32 sector_nr = page_nr_to_sector_nr(rbio, page_nr);
337 	const u32 nr_bits = page_nr_to_num_sectors(rbio, page_nr);
338 	int i;
339 
340 	ASSERT(page_nr < rbio->nr_pages);
341 	ASSERT(sector_nr + nr_bits <= rbio->nr_sectors);
342 
343 	for (i = sector_nr; i < sector_nr + nr_bits; i++) {
344 		if (!test_bit(i, rbio->stripe_uptodate_bitmap))
345 			return false;
346 	}
347 	return true;
348 }
349 
350 /*
351  * Update the stripe_sectors[] array to use correct page and pgoff
352  *
353  * Should be called every time any page pointer in stripes_pages[] got modified.
354  */
355 static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
356 {
357 	const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);
358 	u32 offset;
359 	int i;
360 
361 	for (i = 0, offset = 0; i < rbio->nr_sectors * rbio->sector_nsteps;
362 	     i++, offset += step) {
363 		int page_index = offset >> PAGE_SHIFT;
364 
365 		ASSERT(page_index < rbio->nr_pages);
366 		if (!rbio->stripe_pages[page_index])
367 			continue;
368 
369 		rbio->stripe_paddrs[i] = page_to_phys(rbio->stripe_pages[page_index]) +
370 					 offset_in_page(offset);
371 	}
372 }
373 
374 static void steal_rbio_page(struct btrfs_raid_bio *src,
375 			    struct btrfs_raid_bio *dest, int page_nr)
376 {
377 	const u32 sector_nr = page_nr_to_sector_nr(src, page_nr);
378 	const u32 nr_bits = page_nr_to_num_sectors(src, page_nr);
379 
380 	ASSERT(page_nr < src->nr_pages);
381 	ASSERT(sector_nr + nr_bits <= src->nr_sectors);
382 
383 	if (dest->stripe_pages[page_nr])
384 		__free_page(dest->stripe_pages[page_nr]);
385 	dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
386 	src->stripe_pages[page_nr] = NULL;
387 
388 	/* Also update the stripe_uptodate_bitmap bits. */
389 	bitmap_set(dest->stripe_uptodate_bitmap, sector_nr, nr_bits);
390 }
391 
392 static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr)
393 {
394 	const int sector_nr = page_nr_to_sector_nr(rbio, page_nr);
395 
396 	/*
397 	 * PAGE_SIZE and sectorsize are always aligned to each other, thus we
398 	 * won't have a page which is half data half parity.
399 	 *
400 	 * Thus if the first sector of the page belongs to data stripes, then
401 	 * the full page belongs to data stripes.
402 	 */
403 	return (sector_nr < rbio->nr_data * rbio->stripe_nsectors);
404 }
405 
406 /*
407  * Stealing an rbio means taking all the uptodate pages from the stripe array
408  * in the source rbio and putting them into the destination rbio.
409  *
410  * This will also update the involved stripe_paddrs[] entries which are
411  * referring to the old pages.
412  */
413 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
414 {
415 	int i;
416 
417 	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
418 		return;
419 
420 	for (i = 0; i < dest->nr_pages; i++) {
421 		struct page *p = src->stripe_pages[i];
422 
423 		/*
424 		 * We don't need to steal P/Q pages as they will always be
425 		 * regenerated for RMW or full write anyway.
426 		 */
427 		if (!is_data_stripe_page(src, i))
428 			continue;
429 
430 		/*
431 		 * If @src already has RBIO_CACHE_READY_BIT, it should have
432 		 * all data stripe pages present and uptodate.
433 		 */
434 		ASSERT(p);
435 		ASSERT(full_page_sectors_uptodate(src, i));
436 		steal_rbio_page(src, dest, i);
437 	}
438 	index_stripe_sectors(dest);
439 	index_stripe_sectors(src);
440 }
441 
442 /*
443  * merging means we take the bio_list from the victim and
444  * splice it into the destination.  The victim should
445  * be discarded afterwards.
446  *
447  * must be called with dest->rbio_list_lock held
448  */
449 static void merge_rbio(struct btrfs_raid_bio *dest,
450 		       struct btrfs_raid_bio *victim)
451 {
452 	bio_list_merge_init(&dest->bio_list, &victim->bio_list);
453 	dest->bio_list_bytes += victim->bio_list_bytes;
454 	/* Also inherit the bitmaps from @victim. */
455 	bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
456 		  dest->stripe_nsectors);
457 }
458 
459 /*
460  * used to prune items that are in the cache.  The caller
461  * must hold the hash table lock.
462  */
463 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
464 {
465 	int bucket = rbio_bucket(rbio);
466 	struct btrfs_stripe_hash_table *table;
467 	struct btrfs_stripe_hash *h;
468 	int freeit = 0;
469 
470 	/*
471 	 * check the bit again under the hash table lock.
472 	 */
473 	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
474 		return;
475 
476 	table = rbio->bioc->fs_info->stripe_hash_table;
477 	h = table->table + bucket;
478 
479 	/* hold the lock for the bucket because we may be
480 	 * removing it from the hash table
481 	 */
482 	spin_lock(&h->lock);
483 
484 	/*
485 	 * hold the lock for the bio list because we need
486 	 * to make sure the bio list is empty
487 	 */
488 	spin_lock(&rbio->bio_list_lock);
489 
490 	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
491 		list_del_init(&rbio->stripe_cache);
492 		table->cache_size -= 1;
493 		freeit = 1;
494 
495 		/* if the bio list isn't empty, this rbio is
496 		 * still involved in an IO.  We take it out
497 		 * of the cache list, and drop the ref that
498 		 * was held for the list.
499 		 *
500 		 * If the bio_list was empty, we also remove
501 		 * the rbio from the hash_table, and drop
502 		 * the corresponding ref
503 		 */
504 		if (bio_list_empty(&rbio->bio_list)) {
505 			if (!list_empty(&rbio->hash_list)) {
506 				list_del_init(&rbio->hash_list);
507 				refcount_dec(&rbio->refs);
508 				BUG_ON(!list_empty(&rbio->plug_list));
509 			}
510 		}
511 	}
512 
513 	spin_unlock(&rbio->bio_list_lock);
514 	spin_unlock(&h->lock);
515 
516 	if (freeit)
517 		free_raid_bio(rbio);
518 }
519 
520 /*
521  * prune a given rbio from the cache
522  */
523 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
524 {
525 	struct btrfs_stripe_hash_table *table;
526 
527 	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
528 		return;
529 
530 	table = rbio->bioc->fs_info->stripe_hash_table;
531 
532 	spin_lock(&table->cache_lock);
533 	__remove_rbio_from_cache(rbio);
534 	spin_unlock(&table->cache_lock);
535 }
536 
537 /*
538  * remove everything in the cache
539  */
540 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
541 {
542 	struct btrfs_stripe_hash_table *table;
543 	struct btrfs_raid_bio *rbio;
544 
545 	table = info->stripe_hash_table;
546 
547 	spin_lock(&table->cache_lock);
548 	while (!list_empty(&table->stripe_cache)) {
549 		rbio = list_first_entry(&table->stripe_cache,
550 					struct btrfs_raid_bio, stripe_cache);
551 		__remove_rbio_from_cache(rbio);
552 	}
553 	spin_unlock(&table->cache_lock);
554 }
555 
556 /*
557  * remove all cached entries and free the hash table
558  * used by unmount
559  */
560 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
561 {
562 	if (!info->stripe_hash_table)
563 		return;
564 	btrfs_clear_rbio_cache(info);
565 	kvfree(info->stripe_hash_table);
566 	info->stripe_hash_table = NULL;
567 }
568 
569 /*
570  * insert an rbio into the stripe cache.  It
571  * must have already been prepared by calling
572  * cache_rbio_pages
573  *
574  * If this rbio was already cached, it gets
575  * moved to the front of the lru.
576  *
577  * If the size of the rbio cache is too big, we
578  * prune an item.
579  */
580 static void cache_rbio(struct btrfs_raid_bio *rbio)
581 {
582 	struct btrfs_stripe_hash_table *table;
583 
584 	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
585 		return;
586 
587 	table = rbio->bioc->fs_info->stripe_hash_table;
588 
589 	spin_lock(&table->cache_lock);
590 	spin_lock(&rbio->bio_list_lock);
591 
592 	/* bump our ref if we were not in the list before */
593 	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
594 		refcount_inc(&rbio->refs);
595 
596 	if (!list_empty(&rbio->stripe_cache)){
597 		list_move(&rbio->stripe_cache, &table->stripe_cache);
598 	} else {
599 		list_add(&rbio->stripe_cache, &table->stripe_cache);
600 		table->cache_size += 1;
601 	}
602 
603 	spin_unlock(&rbio->bio_list_lock);
604 
605 	if (table->cache_size > RBIO_CACHE_SIZE) {
606 		struct btrfs_raid_bio *found;
607 
608 		found = list_last_entry(&table->stripe_cache,
609 					struct btrfs_raid_bio,
610 					stripe_cache);
611 
612 		if (found != rbio)
613 			__remove_rbio_from_cache(found);
614 	}
615 
616 	spin_unlock(&table->cache_lock);
617 }
618 
619 /*
620  * helper function to run the xor_blocks api.  It is only
621  * able to do MAX_XOR_BLOCKS at a time, so we need to
622  * loop through.
623  */
624 static void run_xor(void **pages, int src_cnt, ssize_t len)
625 {
626 	int src_off = 0;
627 	int xor_src_cnt = 0;
628 	void *dest = pages[src_cnt];
629 
630 	while(src_cnt > 0) {
631 		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
632 		xor_blocks(xor_src_cnt, len, dest, pages + src_off);
633 
634 		src_cnt -= xor_src_cnt;
635 		src_off += xor_src_cnt;
636 	}
637 }
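
/*
 * For a RAID5 full stripe this boils down to the usual parity formula,
 * e.g. with three data stripes:
 *
 *	P = D0 ^ D1 ^ D2
 *
 * The destination buffer (pages[src_cnt]) is pre-seeded by the caller
 * (see generate_pq_vertical_step()), and xor_blocks() folds the sources
 * into it in chunks of at most MAX_XOR_BLOCKS at a time.
 */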
638 
639 /*
640  * Returns true if the bio list inside this rbio covers an entire stripe (no
641  * rmw required).
642  */
643 static int rbio_is_full(struct btrfs_raid_bio *rbio)
644 {
645 	unsigned long size = rbio->bio_list_bytes;
646 	int ret = 1;
647 
648 	spin_lock(&rbio->bio_list_lock);
649 	if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
650 		ret = 0;
651 	BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
652 	spin_unlock(&rbio->bio_list_lock);
653 
654 	return ret;
655 }
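
/*
 * E.g. for a 3 disk RAID5 (nr_data == 2) the rbio only counts as full
 * once the queued bios cover the whole 2 * 64K == 128K of data in the
 * stripe; anything less has to go through the read/modify/write path.
 */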
656 
657 /*
658  * returns 1 if it is safe to merge two rbios together.
659  * The merging is safe if the two rbios correspond to
660  * the same stripe and if they are both going in the same
661  * direction (read vs write), and if neither one is
662  * locked for final IO
663  *
664  * The caller is responsible for locking such that
665  * rmw_locked is safe to test
666  */
667 static int rbio_can_merge(struct btrfs_raid_bio *last,
668 			  struct btrfs_raid_bio *cur)
669 {
670 	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
671 	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
672 		return 0;
673 
674 	/*
675 	 * we can't merge with cached rbios, since the
676 	 * idea is that when we merge the destination
677 	 * rbio is going to run our IO for us.  We can
678 	 * steal from cached rbios though, other functions
679 	 * handle that.
680 	 */
681 	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
682 	    test_bit(RBIO_CACHE_BIT, &cur->flags))
683 		return 0;
684 
685 	if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical)
686 		return 0;
687 
688 	/* we can't merge with different operations */
689 	if (last->operation != cur->operation)
690 		return 0;
691 	/*
692 	 * We need to read the full stripe from the drive, then
693 	 * check and repair the parity and write the new results.
694 	 *
695 	 * We're not allowed to add any new bios to the
696 	 * bio list here, anyone else that wants to
697 	 * change this stripe needs to do their own rmw.
698 	 */
699 	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
700 		return 0;
701 
702 	if (last->operation == BTRFS_RBIO_READ_REBUILD)
703 		return 0;
704 
705 	return 1;
706 }
707 
708 /* Return the sector index for @stripe_nr and @sector_nr. */
709 static unsigned int rbio_sector_index(const struct btrfs_raid_bio *rbio,
710 				      unsigned int stripe_nr,
711 				      unsigned int sector_nr)
712 {
713 	unsigned int ret;
714 
715 	ASSERT_RBIO_STRIPE(stripe_nr < rbio->real_stripes, rbio, stripe_nr);
716 	ASSERT_RBIO_SECTOR(sector_nr < rbio->stripe_nsectors, rbio, sector_nr);
717 
718 	ret = stripe_nr * rbio->stripe_nsectors + sector_nr;
719 	ASSERT(ret < rbio->nr_sectors);
720 	return ret;
721 }
722 
723 /* Return the paddr array index for @stripe_nr, @sector_nr and @step_nr. */
724 static unsigned int rbio_paddr_index(const struct btrfs_raid_bio *rbio,
725 				     unsigned int stripe_nr,
726 				     unsigned int sector_nr,
727 				     unsigned int step_nr)
728 {
729 	unsigned int ret;
730 
731 	ASSERT_RBIO_SECTOR(step_nr < rbio->sector_nsteps, rbio, step_nr);
732 
733 	ret = rbio_sector_index(rbio, stripe_nr, sector_nr) * rbio->sector_nsteps + step_nr;
734 	ASSERT(ret < rbio->nr_sectors * rbio->sector_nsteps);
735 	return ret;
736 }
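
/*
 * A quick worked example of the indexing above, assuming 4K blocks
 * (sector_nsteps == 1, stripe_nsectors == 16): stripe_nr == 2,
 * sector_nr == 3 gives rbio_sector_index() == 35 and
 * rbio_paddr_index(..., step_nr == 0) == 35 as well.  With a 16K block
 * size on 4K pages, stripe_nsectors drops to 4 and sector_nsteps
 * becomes 4, so the same (2, 3) sector is sector index 11 and occupies
 * paddr slots 44..47.
 */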
737 
738 static phys_addr_t rbio_stripe_paddr(const struct btrfs_raid_bio *rbio,
739 					  unsigned int stripe_nr, unsigned int sector_nr,
740 					  unsigned int step_nr)
741 {
742 	return rbio->stripe_paddrs[rbio_paddr_index(rbio, stripe_nr, sector_nr, step_nr)];
743 }
744 
745 static phys_addr_t rbio_pstripe_paddr(const struct btrfs_raid_bio *rbio,
746 					   unsigned int sector_nr, unsigned int step_nr)
747 {
748 	return rbio_stripe_paddr(rbio, rbio->nr_data, sector_nr, step_nr);
749 }
750 
751 static phys_addr_t rbio_qstripe_paddr(const struct btrfs_raid_bio *rbio,
752 					   unsigned int sector_nr, unsigned int step_nr)
753 {
754 	if (rbio->nr_data + 1 == rbio->real_stripes)
755 		return INVALID_PADDR;
756 	return rbio_stripe_paddr(rbio, rbio->nr_data + 1, sector_nr, step_nr);
757 }
758 
759 /* Return a paddr pointer into the rbio::stripe_paddrs[] for the specified sector. */
760 static phys_addr_t *rbio_stripe_paddrs(const struct btrfs_raid_bio *rbio,
761 				       unsigned int stripe_nr, unsigned int sector_nr)
762 {
763 	return &rbio->stripe_paddrs[rbio_paddr_index(rbio, stripe_nr, sector_nr, 0)];
764 }
765 
766 /*
767  * The first stripe in the table for a logical address
768  * has the lock.  rbios are added in one of three ways:
769  *
770  * 1) Nobody has the stripe locked yet.  The rbio is given
771  * the lock and 0 is returned.  The caller must start the IO
772  * themselves.
773  *
774  * 2) Someone has the stripe locked, but we're able to merge
775  * with the lock owner.  The rbio is freed and the IO will
776  * start automatically along with the existing rbio.  1 is returned.
777  *
778  * 3) Someone has the stripe locked, but we're not able to merge.
779  * The rbio is added to the lock owner's plug list, or merged into
780  * an rbio already on the plug list.  When the lock owner unlocks,
781  * the next rbio on the list is run and the IO is started automatically.
782  * 1 is returned
783  *
784  * If we return 0, the caller still owns the rbio and must continue with
785  * IO submission.  If we return 1, the caller must assume the rbio has
786  * already been freed.
787  */
788 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
789 {
790 	struct btrfs_stripe_hash *h;
791 	struct btrfs_raid_bio *cur;
792 	struct btrfs_raid_bio *pending;
793 	struct btrfs_raid_bio *freeit = NULL;
794 	struct btrfs_raid_bio *cache_drop = NULL;
795 	int ret = 0;
796 
797 	h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
798 
799 	spin_lock(&h->lock);
800 	list_for_each_entry(cur, &h->hash_list, hash_list) {
801 		if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical)
802 			continue;
803 
804 		spin_lock(&cur->bio_list_lock);
805 
806 		/* Can we steal this cached rbio's pages? */
807 		if (bio_list_empty(&cur->bio_list) &&
808 		    list_empty(&cur->plug_list) &&
809 		    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
810 		    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
811 			list_del_init(&cur->hash_list);
812 			refcount_dec(&cur->refs);
813 
814 			steal_rbio(cur, rbio);
815 			cache_drop = cur;
816 			spin_unlock(&cur->bio_list_lock);
817 
818 			goto lockit;
819 		}
820 
821 		/* Can we merge into the lock owner? */
822 		if (rbio_can_merge(cur, rbio)) {
823 			merge_rbio(cur, rbio);
824 			spin_unlock(&cur->bio_list_lock);
825 			freeit = rbio;
826 			ret = 1;
827 			goto out;
828 		}
829 
830 
831 		/*
832 		 * We couldn't merge with the running rbio, see if we can merge
833 		 * with the pending ones.  We don't have to check for rmw_locked
834 		 * because there is no way they are inside finish_rmw right now
835 		 */
836 		list_for_each_entry(pending, &cur->plug_list, plug_list) {
837 			if (rbio_can_merge(pending, rbio)) {
838 				merge_rbio(pending, rbio);
839 				spin_unlock(&cur->bio_list_lock);
840 				freeit = rbio;
841 				ret = 1;
842 				goto out;
843 			}
844 		}
845 
846 		/*
847 		 * No merging, put us on the tail of the plug list, our rbio
848 		 * will be started when the currently running rbio unlocks
849 		 */
850 		list_add_tail(&rbio->plug_list, &cur->plug_list);
851 		spin_unlock(&cur->bio_list_lock);
852 		ret = 1;
853 		goto out;
854 	}
855 lockit:
856 	refcount_inc(&rbio->refs);
857 	list_add(&rbio->hash_list, &h->hash_list);
858 out:
859 	spin_unlock(&h->lock);
860 	if (cache_drop)
861 		remove_rbio_from_cache(cache_drop);
862 	if (freeit)
863 		free_raid_bio(freeit);
864 	return ret;
865 }
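
/*
 * A minimal sketch (not taken verbatim from the callers) of how the
 * return value is meant to be used:
 *
 *	if (lock_stripe_add(rbio) == 0)
 *		start_async_work(rbio, rmw_rbio_work_locked);
 *
 * A return of 1 means the rbio was merged into, or queued behind, the
 * current lock owner and must not be touched again by the caller.
 */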
866 
867 static void recover_rbio_work_locked(struct work_struct *work);
868 
869 /*
870  * called as rmw or parity rebuild is completed.  If the plug list has more
871  * rbios waiting for this stripe, the next one on the list will be started
872  */
873 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
874 {
875 	int bucket;
876 	struct btrfs_stripe_hash *h;
877 	int keep_cache = 0;
878 
879 	bucket = rbio_bucket(rbio);
880 	h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;
881 
882 	if (list_empty(&rbio->plug_list))
883 		cache_rbio(rbio);
884 
885 	spin_lock(&h->lock);
886 	spin_lock(&rbio->bio_list_lock);
887 
888 	if (!list_empty(&rbio->hash_list)) {
889 		/*
890 		 * if we're still cached and there is no other IO
891 		 * to perform, just leave this rbio here for others
892 		 * to steal from later
893 		 */
894 		if (list_empty(&rbio->plug_list) &&
895 		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
896 			keep_cache = 1;
897 			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
898 			BUG_ON(!bio_list_empty(&rbio->bio_list));
899 			goto done;
900 		}
901 
902 		list_del_init(&rbio->hash_list);
903 		refcount_dec(&rbio->refs);
904 
905 		/*
906 		 * we use the plug list to hold all the rbios
907 		 * waiting for the chance to lock this stripe.
908 		 * hand the lock over to one of them.
909 		 */
910 		if (!list_empty(&rbio->plug_list)) {
911 			struct btrfs_raid_bio *next;
912 			struct list_head *head = rbio->plug_list.next;
913 
914 			next = list_entry(head, struct btrfs_raid_bio,
915 					  plug_list);
916 
917 			list_del_init(&rbio->plug_list);
918 
919 			list_add(&next->hash_list, &h->hash_list);
920 			refcount_inc(&next->refs);
921 			spin_unlock(&rbio->bio_list_lock);
922 			spin_unlock(&h->lock);
923 
924 			if (next->operation == BTRFS_RBIO_READ_REBUILD) {
925 				start_async_work(next, recover_rbio_work_locked);
926 			} else if (next->operation == BTRFS_RBIO_WRITE) {
927 				steal_rbio(rbio, next);
928 				start_async_work(next, rmw_rbio_work_locked);
929 			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
930 				steal_rbio(rbio, next);
931 				start_async_work(next, scrub_rbio_work_locked);
932 			}
933 
934 			goto done_nolock;
935 		}
936 	}
937 done:
938 	spin_unlock(&rbio->bio_list_lock);
939 	spin_unlock(&h->lock);
940 
941 done_nolock:
942 	if (!keep_cache)
943 		remove_rbio_from_cache(rbio);
944 }
945 
946 static void rbio_endio_bio_list(struct bio *cur, blk_status_t status)
947 {
948 	struct bio *next;
949 
950 	while (cur) {
951 		next = cur->bi_next;
952 		cur->bi_next = NULL;
953 		cur->bi_status = status;
954 		bio_endio(cur);
955 		cur = next;
956 	}
957 }
958 
959 /*
960  * this frees the rbio and runs through all the bios in the
961  * bio_list and calls end_io on them
962  */
963 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t status)
964 {
965 	struct bio *cur = bio_list_get(&rbio->bio_list);
966 	struct bio *extra;
967 
968 	kfree(rbio->csum_buf);
969 	bitmap_free(rbio->csum_bitmap);
970 	rbio->csum_buf = NULL;
971 	rbio->csum_bitmap = NULL;
972 
973 	/*
974 	 * Clear the data bitmap, as the rbio may be cached for later usage.
975 	 * Do this before unlock_stripe() so there will be no new bio
976 	 * for this rbio.
977 	 */
978 	bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);
979 
980 	/*
981 	 * At this moment, rbio->bio_list is empty, however since rbio does not
982 	 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
983 	 * hash list, rbio may be merged with others so that rbio->bio_list
984 	 * becomes non-empty.
985 	 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
986 	 * more and we can call bio_endio() on all queued bios.
987 	 */
988 	unlock_stripe(rbio);
989 	extra = bio_list_get(&rbio->bio_list);
990 	free_raid_bio(rbio);
991 
992 	rbio_endio_bio_list(cur, status);
993 	if (extra)
994 		rbio_endio_bio_list(extra, status);
995 }
996 
997 /*
998  * Get paddr pointer for the sector specified by its @stripe_nr and @sector_nr.
999  *
1000  * @rbio:               The raid bio
1001  * @stripe_nr:          Stripe number, valid range [0, real_stripes)
1002  * @sector_nr:		Sector number inside the stripe,
1003  *			valid range [0, stripe_nsectors)
1004  * @bio_list_only:      Whether to use sectors inside the bio list only.
1005  *
1006  * The read/modify/write code wants to reuse the original bio pages as much
1007  * as possible, and only use stripe_paddrs as a fallback.
1008  *
1009  * Return NULL if bio_list_only is set but the specified sector has no
1010  * corresponding bio.
1011  */
1012 static phys_addr_t *sector_paddrs_in_rbio(struct btrfs_raid_bio *rbio,
1013 					  int stripe_nr, int sector_nr,
1014 					  bool bio_list_only)
1015 {
1016 	phys_addr_t *ret = NULL;
1017 	const int index = rbio_paddr_index(rbio, stripe_nr, sector_nr, 0);
1018 
1019 	ASSERT(index >= 0 && index < rbio->nr_sectors * rbio->sector_nsteps);
1020 
1021 	scoped_guard(spinlock, &rbio->bio_list_lock) {
1022 		if (rbio->bio_paddrs[index] != INVALID_PADDR || bio_list_only) {
1023 			/* Don't return sector without a valid page pointer */
1024 			if (rbio->bio_paddrs[index] != INVALID_PADDR)
1025 				ret = &rbio->bio_paddrs[index];
1026 			return ret;
1027 		}
1028 	}
1029 	return &rbio->stripe_paddrs[index];
1030 }
1031 
1032 /*
1033  * Similar to sector_paddrs_in_rbio(), but with extra consideration for
1034  * bs > ps cases, where we can have multiple steps for a fs block.
1035  */
1036 static phys_addr_t sector_paddr_in_rbio(struct btrfs_raid_bio *rbio,
1037 					int stripe_nr, int sector_nr, int step_nr,
1038 					bool bio_list_only)
1039 {
1040 	phys_addr_t ret = INVALID_PADDR;
1041 	const int index = rbio_paddr_index(rbio, stripe_nr, sector_nr, step_nr);
1042 
1043 	ASSERT(index >= 0 && index < rbio->nr_sectors * rbio->sector_nsteps);
1044 
1045 	scoped_guard(spinlock, &rbio->bio_list_lock) {
1046 		if (rbio->bio_paddrs[index] != INVALID_PADDR || bio_list_only) {
1047 			/* Don't return sector without a valid page pointer */
1048 			if (rbio->bio_paddrs[index] != INVALID_PADDR)
1049 				ret = rbio->bio_paddrs[index];
1050 			return ret;
1051 		}
1052 	}
1053 	return rbio->stripe_paddrs[index];
1054 }
1055 
1056 /*
1057  * Allocation and initial setup for the btrfs_raid_bio.  Note that
1058  * this does not allocate any pages for rbio->stripe_pages.
1059  */
1060 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
1061 					 struct btrfs_io_context *bioc)
1062 {
1063 	const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes;
1064 	const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
1065 	const unsigned int num_pages = stripe_npages * real_stripes;
1066 	const unsigned int stripe_nsectors =
1067 		BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
1068 	const unsigned int num_sectors = stripe_nsectors * real_stripes;
1069 	const unsigned int step = min(fs_info->sectorsize, PAGE_SIZE);
1070 	const unsigned int sector_nsteps = fs_info->sectorsize / step;
1071 	struct btrfs_raid_bio *rbio;
1072 
1073 	/*
1074 	 * For bs <= ps cases, ps must be aligned to bs.
1075 	 * For bs > ps cases, bs must be aligned to ps.
1076 	 */
1077 	ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize) ||
1078 	       IS_ALIGNED(fs_info->sectorsize, PAGE_SIZE));
1079 	/*
1080 	 * Our current stripe len should be fixed to 64k thus stripe_nsectors
1081 	 * (at most 16) should be no larger than BITS_PER_LONG.
1082 	 */
1083 	ASSERT(stripe_nsectors <= BITS_PER_LONG);
1084 
1085 	/*
1086 	 * Real stripes must be between 2 (2 disks RAID5, aka RAID1) and 256
1087 	 * (limited by u8).
1088 	 */
1089 	ASSERT(real_stripes >= 2);
1090 	ASSERT(real_stripes <= U8_MAX);
1091 
1092 	rbio = kzalloc(sizeof(*rbio), GFP_NOFS);
1093 	if (!rbio)
1094 		return ERR_PTR(-ENOMEM);
1095 	rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *),
1096 				     GFP_NOFS);
1097 	rbio->bio_paddrs = kcalloc(num_sectors * sector_nsteps, sizeof(phys_addr_t), GFP_NOFS);
1098 	rbio->stripe_paddrs = kcalloc(num_sectors * sector_nsteps, sizeof(phys_addr_t), GFP_NOFS);
1099 	rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS);
1100 	rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);
1101 	rbio->stripe_uptodate_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);
1102 
1103 	if (!rbio->stripe_pages || !rbio->bio_paddrs || !rbio->stripe_paddrs ||
1104 	    !rbio->finish_pointers || !rbio->error_bitmap || !rbio->stripe_uptodate_bitmap) {
1105 		free_raid_bio_pointers(rbio);
1106 		kfree(rbio);
1107 		return ERR_PTR(-ENOMEM);
1108 	}
1109 	for (int i = 0; i < num_sectors * sector_nsteps; i++) {
1110 		rbio->stripe_paddrs[i] = INVALID_PADDR;
1111 		rbio->bio_paddrs[i] = INVALID_PADDR;
1112 	}
1113 
1114 	bio_list_init(&rbio->bio_list);
1115 	init_waitqueue_head(&rbio->io_wait);
1116 	INIT_LIST_HEAD(&rbio->plug_list);
1117 	spin_lock_init(&rbio->bio_list_lock);
1118 	INIT_LIST_HEAD(&rbio->stripe_cache);
1119 	INIT_LIST_HEAD(&rbio->hash_list);
1120 	btrfs_get_bioc(bioc);
1121 	rbio->bioc = bioc;
1122 	rbio->nr_pages = num_pages;
1123 	rbio->nr_sectors = num_sectors;
1124 	rbio->real_stripes = real_stripes;
1125 	rbio->stripe_npages = stripe_npages;
1126 	rbio->stripe_nsectors = stripe_nsectors;
1127 	rbio->sector_nsteps = sector_nsteps;
1128 	refcount_set(&rbio->refs, 1);
1129 	atomic_set(&rbio->stripes_pending, 0);
1130 
1131 	ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
1132 	rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);
1133 	ASSERT(rbio->nr_data > 0);
1134 
1135 	return rbio;
1136 }
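
/*
 * Example of the sizing math above for a common setup, 4K blocks on 4K
 * pages and a 4 disk RAID6 profile: real_stripes = 4, stripe_npages =
 * stripe_nsectors = 16, num_pages = num_sectors = 64, sector_nsteps = 1
 * and nr_data = 2 (two data stripes plus P and Q).
 */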
1137 
1138 /* allocate pages for all the stripes in the bio, including parity */
1139 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
1140 {
1141 	int ret;
1142 
1143 	ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, false);
1144 	if (ret < 0)
1145 		return ret;
1146 	/* Mapping all sectors */
1147 	index_stripe_sectors(rbio);
1148 	return 0;
1149 }
1150 
1151 /* only allocate pages for p/q stripes */
1152 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
1153 {
1154 	const int data_pages = rbio->nr_data * rbio->stripe_npages;
1155 	int ret;
1156 
1157 	ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
1158 				     rbio->stripe_pages + data_pages, false);
1159 	if (ret < 0)
1160 		return ret;
1161 
1162 	index_stripe_sectors(rbio);
1163 	return 0;
1164 }
1165 
1166 /*
1167  * Return the total number of errors found in the vertical stripe of @sector_nr.
1168  *
1169  * @faila and @failb will also be updated to the first and second stripe
1170  * number of the errors.
1171  */
1172 static int get_rbio_vertical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
1173 				    int *faila, int *failb)
1174 {
1175 	int stripe_nr;
1176 	int found_errors = 0;
1177 
1178 	if (faila || failb) {
1179 		/*
1180 		 * Both @faila and @failb should be valid pointers if any of
1181 		 * them is specified.
1182 		 */
1183 		ASSERT(faila && failb);
1184 		*faila = -1;
1185 		*failb = -1;
1186 	}
1187 
1188 	for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
1189 		int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr;
1190 
1191 		if (test_bit(total_sector_nr, rbio->error_bitmap)) {
1192 			found_errors++;
1193 			if (faila) {
1194 				/* Update faila and failb. */
1195 				if (*faila < 0)
1196 					*faila = stripe_nr;
1197 				else if (*failb < 0)
1198 					*failb = stripe_nr;
1199 			}
1200 		}
1201 	}
1202 	return found_errors;
1203 }
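
/*
 * The per-vertical-stripe error count above is what gets compared
 * against bioc->max_errors: e.g. RAID5 tolerates a single bad sector
 * per vertical stripe (rebuilt from the remaining data plus P), while
 * RAID6 tolerates two (faila and failb both set); anything beyond that
 * is unrecoverable.
 */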
1204 
1205 static int bio_add_paddrs(struct bio *bio, phys_addr_t *paddrs, unsigned int nr_steps,
1206 			  unsigned int step)
1207 {
1208 	int added = 0;
1209 	int ret;
1210 
1211 	for (int i = 0; i < nr_steps; i++) {
1212 		ret = bio_add_page(bio, phys_to_page(paddrs[i]), step,
1213 				   offset_in_page(paddrs[i]));
1214 		if (ret != step)
1215 			goto revert;
1216 		added += ret;
1217 	}
1218 	return added;
1219 revert:
1220 	/*
1221 	 * We don't need to revert the bvecs, as the bio will be submitted immediately;
1222 	 * as long as the size is reduced, the extra bvecs will not be accessed.
1223 	 */
1224 	bio->bi_iter.bi_size -= added;
1225 	return 0;
1226 }
1227 
1228 /*
1229  * Add a single sector (described by @paddrs) into our list of bios for IO.
1230  *
1231  * Return 0 if everything went well.
1232  * Return <0 for error, and no bytes will be added to @rbio.
1233  */
1234 static int rbio_add_io_paddrs(struct btrfs_raid_bio *rbio, struct bio_list *bio_list,
1235 			      phys_addr_t *paddrs, unsigned int stripe_nr,
1236 			      unsigned int sector_nr, enum req_op op)
1237 {
1238 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1239 	const u32 step = min(sectorsize, PAGE_SIZE);
1240 	struct bio *last = bio_list->tail;
1241 	int ret;
1242 	struct bio *bio;
1243 	struct btrfs_io_stripe *stripe;
1244 	u64 disk_start;
1245 
1246 	/*
1247 	 * Note: here stripe_nr has taken device replace into consideration,
1248 	 * thus it can be as large as rbio->real_stripes.
1249 	 * So here we check against bioc->num_stripes, not rbio->real_stripes.
1250 	 */
1251 	ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes,
1252 			   rbio, stripe_nr);
1253 	ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors,
1254 			   rbio, sector_nr);
1255 	ASSERT(paddrs != NULL);
1256 
1257 	stripe = &rbio->bioc->stripes[stripe_nr];
1258 	disk_start = stripe->physical + sector_nr * sectorsize;
1259 
1260 	/* if the device is missing, just fail this stripe */
1261 	if (!stripe->dev->bdev) {
1262 		int found_errors;
1263 
1264 		set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr,
1265 			rbio->error_bitmap);
1266 
1267 		/* Check if we have reached tolerance early. */
1268 		found_errors = get_rbio_vertical_errors(rbio, sector_nr,
1269 							NULL, NULL);
1270 		if (unlikely(found_errors > rbio->bioc->max_errors))
1271 			return -EIO;
1272 		return 0;
1273 	}
1274 
1275 	/* see if we can add this page onto our existing bio */
1276 	if (last) {
1277 		u64 last_end = last->bi_iter.bi_sector << SECTOR_SHIFT;
1278 		last_end += last->bi_iter.bi_size;
1279 
1280 		/*
1281 		 * we can't merge these if they are from different
1282 		 * devices or if they are not contiguous
1283 		 */
1284 		if (last_end == disk_start && !last->bi_status &&
1285 		    last->bi_bdev == stripe->dev->bdev) {
1286 			ret = bio_add_paddrs(last, paddrs, rbio->sector_nsteps, step);
1287 			if (ret == sectorsize)
1288 				return 0;
1289 		}
1290 	}
1291 
1292 	/* put a new bio on the list */
1293 	bio = bio_alloc(stripe->dev->bdev,
1294 			max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1),
1295 			op, GFP_NOFS);
1296 	bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT;
1297 	bio->bi_private = rbio;
1298 
1299 	ret = bio_add_paddrs(bio, paddrs, rbio->sector_nsteps, step);
1300 	ASSERT(ret == sectorsize);
1301 	bio_list_add(bio_list, bio);
1302 	return 0;
1303 }
1304 
1305 static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
1306 {
1307 	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1308 	const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
1309 	const u32 step_bits = min(fs_info->sectorsize_bits, PAGE_SHIFT);
1310 	struct bvec_iter iter = bio->bi_iter;
1311 	phys_addr_t paddr;
1312 	u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1313 		     rbio->bioc->full_stripe_logical;
1314 
1315 	btrfs_bio_for_each_block(paddr, bio, &iter, step) {
1316 		unsigned int index = (offset >> step_bits);
1317 
1318 		rbio->bio_paddrs[index] = paddr;
1319 		offset += step;
1320 	}
1321 }
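
/*
 * E.g. a bio whose logical start is 8K past full_stripe_logical, with
 * 4K blocks on 4K pages (step == 4K), begins filling bio_paddrs[] at
 * index 2 and advances one slot per block walked above.
 */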
1322 
1323 /*
1324  * Helper function to walk our bio list and populate the bio_paddrs array with
1325  * the result.  This seems expensive, but it is faster than constantly
1326  * searching through the bio list as we set up the IO in the RMW or stripe
1327  * reconstruction paths.
1328  *
1329  * This must be called before you trust the answers from sector_paddr_in_rbio().
1330  */
1331 static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1332 {
1333 	struct bio *bio;
1334 
1335 	spin_lock(&rbio->bio_list_lock);
1336 	bio_list_for_each(bio, &rbio->bio_list)
1337 		index_one_bio(rbio, bio);
1338 
1339 	spin_unlock(&rbio->bio_list_lock);
1340 }
1341 
1342 static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
1343 			       struct raid56_bio_trace_info *trace_info)
1344 {
1345 	const struct btrfs_io_context *bioc = rbio->bioc;
1346 	int i;
1347 
1348 	ASSERT(bioc);
1349 
1350 	/* We rely on bio->bi_bdev to find the stripe number. */
1351 	if (!bio->bi_bdev)
1352 		goto not_found;
1353 
1354 	for (i = 0; i < bioc->num_stripes; i++) {
1355 		if (bio->bi_bdev != bioc->stripes[i].dev->bdev)
1356 			continue;
1357 		trace_info->stripe_nr = i;
1358 		trace_info->devid = bioc->stripes[i].dev->devid;
1359 		trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1360 				     bioc->stripes[i].physical;
1361 		return;
1362 	}
1363 
1364 not_found:
1365 	trace_info->devid = -1;
1366 	trace_info->offset = -1;
1367 	trace_info->stripe_nr = -1;
1368 }
1369 
1370 static inline void bio_list_put(struct bio_list *bio_list)
1371 {
1372 	struct bio *bio;
1373 
1374 	while ((bio = bio_list_pop(bio_list)))
1375 		bio_put(bio);
1376 }
1377 
1378 static void assert_rbio(struct btrfs_raid_bio *rbio)
1379 {
1380 	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
1381 		return;
1382 
1383 	/*
1384 	 * At least two stripes (2 disks RAID5), and since real_stripes is U8,
1385 	 * we won't go beyond 256 disks anyway.
1386 	 */
1387 	ASSERT_RBIO(rbio->real_stripes >= 2, rbio);
1388 	ASSERT_RBIO(rbio->nr_data > 0, rbio);
1389 
1390 	/*
1391 	 * This is another check to make sure nr data stripes is smaller
1392 	 * than total stripes.
1393 	 */
1394 	ASSERT_RBIO(rbio->nr_data < rbio->real_stripes, rbio);
1395 }
1396 
1397 static inline void *kmap_local_paddr(phys_addr_t paddr)
1398 {
1399 	/* The sector pointer must have a page mapped to it. */
1400 	ASSERT(paddr != INVALID_PADDR);
1401 
1402 	return kmap_local_page(phys_to_page(paddr)) + offset_in_page(paddr);
1403 }
1404 
1405 static void generate_pq_vertical_step(struct btrfs_raid_bio *rbio, unsigned int sector_nr,
1406 				      unsigned int step_nr)
1407 {
1408 	void **pointers = rbio->finish_pointers;
1409 	const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);
1410 	int stripe;
1411 	const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6;
1412 
1413 	/* First collect one sector from each data stripe */
1414 	for (stripe = 0; stripe < rbio->nr_data; stripe++)
1415 		pointers[stripe] = kmap_local_paddr(
1416 				sector_paddr_in_rbio(rbio, stripe, sector_nr, step_nr, 0));
1417 
1418 	/* Then add the parity stripe */
1419 	pointers[stripe++] = kmap_local_paddr(rbio_pstripe_paddr(rbio, sector_nr, step_nr));
1420 
1421 	if (has_qstripe) {
1422 		/*
1423 		 * RAID6, add the qstripe and call the library function
1424 		 * to fill in our p/q
1425 		 */
1426 		pointers[stripe++] = kmap_local_paddr(
1427 				rbio_qstripe_paddr(rbio, sector_nr, step_nr));
1428 
1429 		assert_rbio(rbio);
1430 		raid6_call.gen_syndrome(rbio->real_stripes, step, pointers);
1431 	} else {
1432 		/* raid5 */
1433 		memcpy(pointers[rbio->nr_data], pointers[0], step);
1434 		run_xor(pointers + 1, rbio->nr_data - 1, step);
1435 	}
1436 	for (stripe = stripe - 1; stripe >= 0; stripe--)
1437 		kunmap_local(pointers[stripe]);
1438 }
1439 
1440 /* Generate PQ for one vertical stripe. */
1441 static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
1442 {
1443 	const bool has_qstripe = (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6);
1444 
1445 	for (int i = 0; i < rbio->sector_nsteps; i++)
1446 		generate_pq_vertical_step(rbio, sectornr, i);
1447 
1448 	set_bit(rbio_sector_index(rbio, rbio->nr_data, sectornr),
1449 		rbio->stripe_uptodate_bitmap);
1450 	if (has_qstripe)
1451 		set_bit(rbio_sector_index(rbio, rbio->nr_data + 1, sectornr),
1452 			rbio->stripe_uptodate_bitmap);
1453 }
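
/*
 * The math behind the two branches in generate_pq_vertical_step(), for
 * data blocks D0..Dn-1 of one vertical stripe:
 *
 *	RAID5:  P = D0 ^ D1 ^ ... ^ Dn-1
 *	RAID6:  P as above, plus Q = g^0*D0 ^ g^1*D1 ^ ... ^ g^(n-1)*Dn-1
 *
 * with the Q multiplications done over GF(2^8) by raid6_call.gen_syndrome().
 */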
1454 
1455 static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
1456 				   struct bio_list *bio_list)
1457 {
1458 	/* The total sector number inside the full stripe. */
1459 	int total_sector_nr;
1460 	int sectornr;
1461 	int stripe;
1462 	int ret;
1463 
1464 	ASSERT(bio_list_size(bio_list) == 0);
1465 
1466 	/* We should have at least one data sector. */
1467 	ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));
1468 
1469 	/*
1470 	 * Reset errors, as we may have errors inherited from a degraded
1471 	 * write.
1472 	 */
1473 	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
1474 
1475 	/*
1476 	 * Start assembly.  Make bios for everything from the higher layers (the
1477 	 * bio_list in our rbio) and our P/Q.  Ignore everything else.
1478 	 */
1479 	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
1480 	     total_sector_nr++) {
1481 		phys_addr_t *paddrs;
1482 
1483 		stripe = total_sector_nr / rbio->stripe_nsectors;
1484 		sectornr = total_sector_nr % rbio->stripe_nsectors;
1485 
1486 		/* This vertical stripe has no data, skip it. */
1487 		if (!test_bit(sectornr, &rbio->dbitmap))
1488 			continue;
1489 
1490 		if (stripe < rbio->nr_data) {
1491 			paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1);
1492 			if (paddrs == NULL)
1493 				continue;
1494 		} else {
1495 			paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr);
1496 		}
1497 
1498 		ret = rbio_add_io_paddrs(rbio, bio_list, paddrs, stripe,
1499 					 sectornr, REQ_OP_WRITE);
1500 		if (ret)
1501 			goto error;
1502 	}
1503 
1504 	if (likely(!rbio->bioc->replace_nr_stripes))
1505 		return 0;
1506 
1507 	/*
1508 	 * Make a copy for the replace target device.
1509 	 *
1510 	 * Thus the source stripe number (in replace_stripe_src) should be valid.
1511 	 */
1512 	ASSERT(rbio->bioc->replace_stripe_src >= 0);
1513 
1514 	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
1515 	     total_sector_nr++) {
1516 		phys_addr_t *paddrs;
1517 
1518 		stripe = total_sector_nr / rbio->stripe_nsectors;
1519 		sectornr = total_sector_nr % rbio->stripe_nsectors;
1520 
1521 		/*
1522 		 * For RAID56, there is only one device that can be replaced,
1523 		 * and replace_stripe_src indicates the stripe number we
1524 		 * need to copy from.
1525 		 */
1526 		if (stripe != rbio->bioc->replace_stripe_src) {
1527 			/*
1528 			 * We can skip the whole stripe completely, note
1529 			 * total_sector_nr will be increased by one anyway.
1530 			 */
1531 			ASSERT(sectornr == 0);
1532 			total_sector_nr += rbio->stripe_nsectors - 1;
1533 			continue;
1534 		}
1535 
1536 		/* This vertical stripe has no data, skip it. */
1537 		if (!test_bit(sectornr, &rbio->dbitmap))
1538 			continue;
1539 
1540 		if (stripe < rbio->nr_data) {
1541 			paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1);
1542 			if (paddrs == NULL)
1543 				continue;
1544 		} else {
1545 			paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr);
1546 		}
1547 
1548 		ret = rbio_add_io_paddrs(rbio, bio_list, paddrs,
1549 					 rbio->real_stripes,
1550 					 sectornr, REQ_OP_WRITE);
1551 		if (ret)
1552 			goto error;
1553 	}
1554 
1555 	return 0;
1556 error:
1557 	bio_list_put(bio_list);
1558 	return -EIO;
1559 }
1560 
1561 static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio)
1562 {
1563 	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1564 	u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1565 		     rbio->bioc->full_stripe_logical;
1566 	int total_nr_sector = offset >> fs_info->sectorsize_bits;
1567 
1568 	ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors);
1569 
1570 	bitmap_set(rbio->error_bitmap, total_nr_sector,
1571 		   bio->bi_iter.bi_size >> fs_info->sectorsize_bits);
1572 
1573 	/*
1574 	 * Special handling for raid56_alloc_missing_rbio() used by
1575  * scrub/replace.  Unlike the call path in raid56_parity_recover(), they
1576 	 * pass an empty bio here.  Thus we have to find out the missing device
1577 	 * and mark the stripe error instead.
1578 	 */
1579 	if (bio->bi_iter.bi_size == 0) {
1580 		bool found_missing = false;
1581 		int stripe_nr;
1582 
1583 		for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
1584 			if (!rbio->bioc->stripes[stripe_nr].dev->bdev) {
1585 				found_missing = true;
1586 				bitmap_set(rbio->error_bitmap,
1587 					   stripe_nr * rbio->stripe_nsectors,
1588 					   rbio->stripe_nsectors);
1589 			}
1590 		}
1591 		ASSERT(found_missing);
1592 	}
1593 }
1594 
1595 /*
1596  * Return the sector number whose first step in rbio->stripe_paddrs[] matches @paddr.
1597  *
1598  * Return -1 if not found.
1599  */
1600 static int find_stripe_sector_nr(struct btrfs_raid_bio *rbio, phys_addr_t paddr)
1601 {
1602 	for (int i = 0; i < rbio->nr_sectors; i++) {
1603 		if (rbio->stripe_paddrs[i * rbio->sector_nsteps] == paddr)
1604 			return i;
1605 	}
1606 	return -1;
1607 }
1608 
1609 /*
1610  * This marks each sector covered by the bio uptodate.  It should only be used
1611  * on private rbio pages, nothing that comes in from the higher layers.
1612  */
1613 static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
1614 {
1615 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1616 	const u32 step = min(sectorsize, PAGE_SIZE);
1617 	u32 offset = 0;
1618 	phys_addr_t paddr;
1619 
1620 	ASSERT(!bio_flagged(bio, BIO_CLONED));
1621 
1622 	btrfs_bio_for_each_block_all(paddr, bio, step) {
1623 		/* Hitting the first step of a sector. */
1624 		if (IS_ALIGNED(offset, sectorsize)) {
1625 			int sector_nr = find_stripe_sector_nr(rbio, paddr);
1626 
1627 			ASSERT(sector_nr >= 0);
1628 			if (sector_nr >= 0)
1629 				set_bit(sector_nr, rbio->stripe_uptodate_bitmap);
1630 		}
1631 		offset += step;
1632 	}
1633 }
1634 
1635 static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio)
1636 {
1637 	phys_addr_t bvec_paddr = bvec_phys(bio_first_bvec_all(bio));
1638 	int i;
1639 
1640 	for (i = 0; i < rbio->nr_sectors; i++) {
1641 		if (rbio->stripe_paddrs[i * rbio->sector_nsteps] == bvec_paddr)
1642 			break;
1643 		if (rbio->bio_paddrs[i * rbio->sector_nsteps] == bvec_paddr)
1644 			break;
1645 	}
1646 	ASSERT(i < rbio->nr_sectors);
1647 	return i;
1648 }
1649 
1650 static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio)
1651 {
1652 	int total_sector_nr = get_bio_sector_nr(rbio, bio);
1653 	u32 bio_size = 0;
1654 	struct bio_vec *bvec;
1655 	int i;
1656 
1657 	bio_for_each_bvec_all(bvec, bio, i)
1658 		bio_size += bvec->bv_len;
1659 
1660 	/*
1661 	 * Since we can have multiple bios touching the error_bitmap, we cannot
1662 	 * call bitmap_set() without protection.
1663 	 *
1664 	 * Instead use set_bit() for each bit, as set_bit() itself is atomic.
1665 	 */
1666 	for (i = total_sector_nr; i < total_sector_nr +
1667 	     (bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++)
1668 		set_bit(i, rbio->error_bitmap);
1669 }
1670 
1671 /* Verify the data sectors at read time. */
1672 static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio,
1673 				    struct bio *bio)
1674 {
1675 	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1676 	const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
1677 	const u32 nr_steps = rbio->sector_nsteps;
1678 	int total_sector_nr = get_bio_sector_nr(rbio, bio);
1679 	u32 offset = 0;
1680 	phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
1681 	phys_addr_t paddr;
1682 
1683 	/* No data csum for the whole stripe, no need to verify. */
1684 	if (!rbio->csum_bitmap || !rbio->csum_buf)
1685 		return;
1686 
1687 	/* P/Q stripes, they have no data csum to verify against. */
1688 	if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors)
1689 		return;
1690 
1691 	btrfs_bio_for_each_block_all(paddr, bio, step) {
1692 		u8 csum_buf[BTRFS_CSUM_SIZE];
1693 		u8 *expected_csum;
1694 
1695 		paddrs[(offset / step) % nr_steps] = paddr;
1696 		offset += step;
1697 
1698 		/* Not yet covering the full fs block, continue to the next step. */
1699 		if (!IS_ALIGNED(offset, fs_info->sectorsize))
1700 			continue;
1701 
1702 		/* No csum for this sector, skip to the next sector. */
1703 		if (!test_bit(total_sector_nr, rbio->csum_bitmap))
1704 			continue;
1705 
1706 		expected_csum = rbio->csum_buf + total_sector_nr * fs_info->csum_size;
1707 		btrfs_calculate_block_csum_pages(fs_info, paddrs, csum_buf);
1708 		if (unlikely(memcmp(csum_buf, expected_csum, fs_info->csum_size) != 0))
1709 			set_bit(total_sector_nr, rbio->error_bitmap);
1710 		total_sector_nr++;
1711 	}
1712 }
1713 
1714 static void raid_wait_read_end_io(struct bio *bio)
1715 {
1716 	struct btrfs_raid_bio *rbio = bio->bi_private;
1717 
1718 	if (bio->bi_status) {
1719 		rbio_update_error_bitmap(rbio, bio);
1720 	} else {
1721 		set_bio_pages_uptodate(rbio, bio);
1722 		verify_bio_data_sectors(rbio, bio);
1723 	}
1724 
1725 	bio_put(bio);
1726 	if (atomic_dec_and_test(&rbio->stripes_pending))
1727 		wake_up(&rbio->io_wait);
1728 }
1729 
1730 static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio,
1731 			     struct bio_list *bio_list)
1732 {
1733 	struct bio *bio;
1734 
1735 	atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
1736 	while ((bio = bio_list_pop(bio_list))) {
1737 		bio->bi_end_io = raid_wait_read_end_io;
1738 
1739 		if (trace_raid56_read_enabled()) {
1740 			struct raid56_bio_trace_info trace_info = { 0 };
1741 
1742 			bio_get_trace_info(rbio, bio, &trace_info);
1743 			trace_raid56_read(rbio, bio, &trace_info);
1744 		}
1745 		submit_bio(bio);
1746 	}
1747 
1748 	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
1749 }
1750 
1751 static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
1752 {
1753 	const int data_pages = rbio->nr_data * rbio->stripe_npages;
1754 	int ret;
1755 
1756 	ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, false);
1757 	if (ret < 0)
1758 		return ret;
1759 
1760 	index_stripe_sectors(rbio);
1761 	return 0;
1762 }
1763 
1764 /*
1765  * We use plugging callbacks to collect full stripes.
1766  * Any time we get a partial stripe write while plugged,
1767  * we collect it into a list.  When the unplug comes down,
1768  * we sort the list by logical block number and merge
1769  * everything we can into the same rbios.
1770  */
1771 struct btrfs_plug_cb {
1772 	struct blk_plug_cb cb;
1773 	struct btrfs_fs_info *info;
1774 	struct list_head rbio_list;
1775 };
1776 
1777 /*
1778  * rbios on the plug list are sorted for easier merging.
1779  */
1780 static int plug_cmp(void *priv, const struct list_head *a,
1781 		    const struct list_head *b)
1782 {
1783 	const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1784 						       plug_list);
1785 	const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1786 						       plug_list);
1787 	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1788 	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
1789 
1790 	if (a_sector < b_sector)
1791 		return -1;
1792 	if (a_sector > b_sector)
1793 		return 1;
1794 	return 0;
1795 }
1796 
1797 static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1798 {
1799 	struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb);
1800 	struct btrfs_raid_bio *cur;
1801 	struct btrfs_raid_bio *last = NULL;
1802 
1803 	list_sort(NULL, &plug->rbio_list, plug_cmp);
1804 
1805 	while (!list_empty(&plug->rbio_list)) {
1806 		cur = list_first_entry(&plug->rbio_list,
1807 				       struct btrfs_raid_bio, plug_list);
1808 		list_del_init(&cur->plug_list);
1809 
1810 		if (rbio_is_full(cur)) {
1811 			/* We have a full stripe, queue it down. */
1812 			start_async_work(cur, rmw_rbio_work);
1813 			continue;
1814 		}
1815 		if (last) {
1816 			if (rbio_can_merge(last, cur)) {
1817 				merge_rbio(last, cur);
1818 				free_raid_bio(cur);
1819 				continue;
1820 			}
1821 			start_async_work(last, rmw_rbio_work);
1822 		}
1823 		last = cur;
1824 	}
1825 	if (last)
1826 		start_async_work(last, rmw_rbio_work);
1827 	kfree(plug);
1828 }
1829 
1830 /* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
1831 static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
1832 {
1833 	const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1834 	const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
1835 	const u64 full_stripe_start = rbio->bioc->full_stripe_logical;
1836 	const u32 orig_len = orig_bio->bi_iter.bi_size;
1837 	const u32 sectorsize = fs_info->sectorsize;
1838 	u64 cur_logical;
1839 
1840 	ASSERT_RBIO_LOGICAL(orig_logical >= full_stripe_start &&
1841 			    orig_logical + orig_len <= full_stripe_start +
1842 			    rbio->nr_data * BTRFS_STRIPE_LEN,
1843 			    rbio, orig_logical);
1844 
1845 	bio_list_add(&rbio->bio_list, orig_bio);
1846 	rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
1847 
1848 	/* Update the dbitmap. */
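	/*
	 * Example (assuming a 4K sectorsize and 64K BTRFS_STRIPE_LEN, i.e.
	 * stripe_nsectors == 16): a sector at full_stripe_start + 68K maps to
	 * bit (68K >> 12) % 16 == 17 % 16 == 1, the second vertical stripe.
	 */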
1849 	for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
1850 	     cur_logical += sectorsize) {
1851 		int bit = ((u32)(cur_logical - full_stripe_start) >>
1852 			   fs_info->sectorsize_bits) % rbio->stripe_nsectors;
1853 
1854 		set_bit(bit, &rbio->dbitmap);
1855 	}
1856 }
1857 
1858 /*
1859  * our main entry point for writes from the rest of the FS.
1860  */
1861 void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
1862 {
1863 	struct btrfs_fs_info *fs_info = bioc->fs_info;
1864 	struct btrfs_raid_bio *rbio;
1865 	struct btrfs_plug_cb *plug = NULL;
1866 	struct blk_plug_cb *cb;
1867 
1868 	rbio = alloc_rbio(fs_info, bioc);
1869 	if (IS_ERR(rbio)) {
1870 		bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
1871 		bio_endio(bio);
1872 		return;
1873 	}
1874 	rbio->operation = BTRFS_RBIO_WRITE;
1875 	rbio_add_bio(rbio, bio);
1876 
1877 	/*
1878 	 * Don't plug on full rbios, just get them out the door
1879 	 * as quickly as we can
1880 	 */
1881 	if (!rbio_is_full(rbio)) {
1882 		cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug));
1883 		if (cb) {
1884 			plug = container_of(cb, struct btrfs_plug_cb, cb);
1885 			if (!plug->info) {
1886 				plug->info = fs_info;
1887 				INIT_LIST_HEAD(&plug->rbio_list);
1888 			}
1889 			list_add_tail(&rbio->plug_list, &plug->rbio_list);
1890 			return;
1891 		}
1892 	}
1893 
1894 	/*
1895 	 * Either we don't have any existing plug, or we're doing a full stripe,
1896 	 * queue the rmw work now.
1897 	 */
1898 	start_async_work(rbio, rmw_rbio_work);
1899 }
1900 
1901 static int verify_one_sector(struct btrfs_raid_bio *rbio,
1902 			     int stripe_nr, int sector_nr)
1903 {
1904 	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1905 	phys_addr_t *paddrs;
1906 	u8 csum_buf[BTRFS_CSUM_SIZE];
1907 	u8 *csum_expected;
1908 
1909 	if (!rbio->csum_bitmap || !rbio->csum_buf)
1910 		return 0;
1911 
1912 	/* No way to verify P/Q as they are not covered by data csum. */
1913 	if (stripe_nr >= rbio->nr_data)
1914 		return 0;
1915 	/*
1916 	 * If we're rebuilding a read, we have to use pages from the
1917 	 * bio list if possible.
1918 	 */
1919 	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
1920 		paddrs = sector_paddrs_in_rbio(rbio, stripe_nr, sector_nr, 0);
1921 	} else {
1922 		paddrs = rbio_stripe_paddrs(rbio, stripe_nr, sector_nr);
1923 	}
1924 
1925 	csum_expected = rbio->csum_buf +
1926 			(stripe_nr * rbio->stripe_nsectors + sector_nr) *
1927 			fs_info->csum_size;
1928 	btrfs_calculate_block_csum_pages(fs_info, paddrs, csum_buf);
1929 	if (unlikely(memcmp(csum_buf, csum_expected, fs_info->csum_size) != 0))
1930 		return -EIO;
1931 	return 0;
1932 }
1933 
1934 static void recover_vertical_step(struct btrfs_raid_bio *rbio,
1935 				  unsigned int sector_nr,
1936 				  unsigned int step_nr,
1937 				  int faila, int failb,
1938 				  void **pointers, void **unmap_array)
1939 {
1940 	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1941 	const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
1942 	int stripe_nr;
1943 
1944 	ASSERT(step_nr < rbio->sector_nsteps);
1945 	ASSERT(sector_nr < rbio->stripe_nsectors);
1946 
1947 	/*
1948 	 * Setup our array of pointers with sectors from each stripe
1949 	 *
1950 	 * NOTE: store a duplicate array of pointers to preserve the
1951 	 * pointer order.
1952 	 */
1953 	for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
1954 		phys_addr_t paddr;
1955 
1956 		/*
1957 		 * If we're rebuilding a read, we have to use pages from the
1958 		 * bio list if possible.
1959 		 */
1960 		if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
1961 			paddr = sector_paddr_in_rbio(rbio, stripe_nr, sector_nr, step_nr, 0);
1962 		} else {
1963 			paddr = rbio_stripe_paddr(rbio, stripe_nr, sector_nr, step_nr);
1964 		}
1965 		pointers[stripe_nr] = kmap_local_paddr(paddr);
1966 		unmap_array[stripe_nr] = pointers[stripe_nr];
1967 	}
1968 
1969 	/* All raid6 handling here */
1970 	if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
1971 		/* Single failure, rebuild from parity raid5 style */
1972 		if (failb < 0) {
1973 			if (faila == rbio->nr_data)
1974 				/*
1975 				 * Just the P stripe has failed, without
1976 				 * a bad data or Q stripe.
1977 				 * We have nothing to do, just skip the
1978 				 * recovery for this stripe.
1979 				 */
1980 				goto cleanup;
1981 			/*
1982 			 * a single failure in raid6 is rebuilt
1983 			 * in the pstripe code below
1984 			 */
1985 			goto pstripe;
1986 		}
1987 
1988 		/*
1989 		 * If the Q stripe failed, do a P stripe reconstruction from
1990 		 * the xors.
1991 		 * If both the Q stripe and the P stripe failed, we're
1992 		 * here due to a crc mismatch and we can't give them the
1993 		 * data they want.
1994 		 */
1995 		if (failb == rbio->real_stripes - 1) {
1996 			if (faila == rbio->real_stripes - 2)
1997 				/*
1998 				 * Only P and Q are corrupted.
1999 				 * We only care about data stripes recovery,
2000 				 * can skip this vertical stripe.
2001 				 */
2002 				goto cleanup;
2003 			/*
2004 			 * Otherwise we have one bad data stripe and
2005 			 * a good P stripe.  raid5!
2006 			 */
2007 			goto pstripe;
2008 		}
2009 
2010 		if (failb == rbio->real_stripes - 2) {
2011 			raid6_datap_recov(rbio->real_stripes, step,
2012 					  faila, pointers);
2013 		} else {
2014 			raid6_2data_recov(rbio->real_stripes, step,
2015 					  faila, failb, pointers);
2016 		}
2017 	} else {
2018 		void *p;
2019 
2020 		/* Rebuild from P stripe here (raid5 or raid6). */
2021 		ASSERT(failb == -1);
2022 pstripe:
2023 		/* Copy parity block into failed block to start with */
2024 		memcpy(pointers[faila], pointers[rbio->nr_data], step);
2025 
2026 		/* Rearrange the pointer array */
2027 		p = pointers[faila];
2028 		for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1;
2029 		     stripe_nr++)
2030 			pointers[stripe_nr] = pointers[stripe_nr + 1];
2031 		pointers[rbio->nr_data - 1] = p;
2032 
2033 		/* Xor in the rest */
2034 		run_xor(pointers, rbio->nr_data - 1, step);
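		/*
		 * Worked example (assuming nr_data == 4 and faila == 1):
		 * after the memcpy and the rotation above the array is
		 * [D0, D2, D3, P-copy], so run_xor() folds D0 ^ D2 ^ D3 into
		 * the P copy (which lives in the failed block's buffer),
		 * regenerating D1 = P ^ D0 ^ D2 ^ D3 in place.
		 */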
2035 	}
2036 
2037 cleanup:
2038 	for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--)
2039 		kunmap_local(unmap_array[stripe_nr]);
2040 }
2041 
2042 /*
2043  * Recover a vertical stripe specified by @sector_nr.
2044  * @pointers and @unmap_array are pre-allocated by the caller, so we don't
2045  * need to allocate/free the pointers again and again.
2046  */
2047 static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
2048 			    void **pointers, void **unmap_array)
2049 {
2050 	int found_errors;
2051 	int faila;
2052 	int failb;
2053 	int ret = 0;
2054 
2055 	/*
2056 	 * Now we just use the bitmap to mark the horizontal stripes in
2057 	 * which we have data when doing a parity scrub.
2058 	 */
2059 	if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
2060 	    !test_bit(sector_nr, &rbio->dbitmap))
2061 		return 0;
2062 
2063 	found_errors = get_rbio_vertical_errors(rbio, sector_nr, &faila,
2064 						&failb);
2065 	/*
2066 	 * No errors in the vertical stripe, skip it.  Can happen for recovery
2067 	 * where only part of a stripe failed the csum check.
2068 	 */
2069 	if (!found_errors)
2070 		return 0;
2071 
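	/*
	 * Note: max_errors reflects the parity redundancy, e.g. 1 for RAID5
	 * and 2 for RAID6; three bad sectors in one vertical stripe of a
	 * RAID6 array therefore cannot be recovered.
	 */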
2072 	if (unlikely(found_errors > rbio->bioc->max_errors))
2073 		return -EIO;
2074 
2075 	for (int i = 0; i < rbio->sector_nsteps; i++)
2076 		recover_vertical_step(rbio, sector_nr, i, faila, failb,
2077 					    pointers, unmap_array);
2078 	if (faila >= 0) {
2079 		ret = verify_one_sector(rbio, faila, sector_nr);
2080 		if (ret < 0)
2081 			return ret;
2082 
2083 		set_bit(rbio_sector_index(rbio, faila, sector_nr),
2084 			rbio->stripe_uptodate_bitmap);
2085 	}
2086 	if (failb >= 0) {
2087 		ret = verify_one_sector(rbio, failb, sector_nr);
2088 		if (ret < 0)
2089 			return ret;
2090 
2091 		set_bit(rbio_sector_index(rbio, failb, sector_nr),
2092 			rbio->stripe_uptodate_bitmap);
2093 	}
2094 	return ret;
2095 }
2096 
2097 static int recover_sectors(struct btrfs_raid_bio *rbio)
2098 {
2099 	void **pointers = NULL;
2100 	void **unmap_array = NULL;
2101 	int sectornr;
2102 	int ret = 0;
2103 
2104 	/*
2105 	 * The @pointers array stores the pointer for each sector.
2106 	 *
2107 	 * @unmap_array stores a copy of the pointers that does not get
2108 	 * reordered during reconstruction so that kunmap_local() works.
2109 	 */
2110 	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
2111 	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
2112 	if (!pointers || !unmap_array) {
2113 		ret = -ENOMEM;
2114 		goto out;
2115 	}
2116 
2117 	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
2118 		spin_lock(&rbio->bio_list_lock);
2119 		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
2120 		spin_unlock(&rbio->bio_list_lock);
2121 	}
2122 
2123 	index_rbio_pages(rbio);
2124 
2125 	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
2126 		ret = recover_vertical(rbio, sectornr, pointers, unmap_array);
2127 		if (ret < 0)
2128 			break;
2129 	}
2130 
2131 out:
2132 	kfree(pointers);
2133 	kfree(unmap_array);
2134 	return ret;
2135 }
2136 
2137 static void recover_rbio(struct btrfs_raid_bio *rbio)
2138 {
2139 	struct bio_list bio_list = BIO_EMPTY_LIST;
2140 	int total_sector_nr;
2141 	int ret = 0;
2142 
2143 	/*
2144 	 * Whether we're doing recovery for a read failure or a degraded write,
2145 	 * the caller should have set the error bitmap correctly.
2146 	 */
2147 	ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors));
2148 
2149 	/* For recovery, we need to read all sectors including P/Q. */
2150 	ret = alloc_rbio_pages(rbio);
2151 	if (ret < 0)
2152 		goto out;
2153 
2154 	index_rbio_pages(rbio);
2155 
2156 	/*
2157 	 * Read everything that hasn't failed.  However this time we will
2158 	 * not trust any cached sector, as we may read out some stale data
2159 	 * but the higher layer is not going to read that stale part
2160 	 * anyway.
2161 	 *
2162 	 * So here we always re-read everything in the recovery path.
2163 	 */
2164 	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2165 	     total_sector_nr++) {
2166 		int stripe = total_sector_nr / rbio->stripe_nsectors;
2167 		int sectornr = total_sector_nr % rbio->stripe_nsectors;
2168 		phys_addr_t *paddrs;
2169 
2170 		/*
2171 		 * Skip the range which has an error.  It can be a range which
2172 		 * is marked as an error (for a csum mismatch), or it can be
2173 		 * on a missing device.
2174 		 */
2175 		if (!rbio->bioc->stripes[stripe].dev->bdev ||
2176 		    test_bit(total_sector_nr, rbio->error_bitmap)) {
2177 			/*
2178 			 * Also set the error bit for a missing device, which
2179 			 * may not yet have its error bit set.
2180 			 */
2181 			set_bit(total_sector_nr, rbio->error_bitmap);
2182 			continue;
2183 		}
2184 
2185 		paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr);
2186 		ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe,
2187 					 sectornr, REQ_OP_READ);
2188 		if (ret < 0) {
2189 			bio_list_put(&bio_list);
2190 			goto out;
2191 		}
2192 	}
2193 
2194 	submit_read_wait_bio_list(rbio, &bio_list);
2195 	ret = recover_sectors(rbio);
2196 out:
2197 	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2198 }
2199 
2200 static void recover_rbio_work(struct work_struct *work)
2201 {
2202 	struct btrfs_raid_bio *rbio;
2203 
2204 	rbio = container_of(work, struct btrfs_raid_bio, work);
2205 	if (!lock_stripe_add(rbio))
2206 		recover_rbio(rbio);
2207 }
2208 
2209 static void recover_rbio_work_locked(struct work_struct *work)
2210 {
2211 	recover_rbio(container_of(work, struct btrfs_raid_bio, work));
2212 }
2213 
2214 static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num)
2215 {
2216 	bool found = false;
2217 	int sector_nr;
2218 
2219 	/*
2220 	 * This is for RAID6 extra recovery tries, thus the mirror number
2221 	 * should be larger than 2.
2222 	 * Mirror 1 means read from data stripes. Mirror 2 means rebuild using
2223 	 * RAID5 methods.
2224 	 */
2225 	ASSERT(mirror_num > 2);
2226 	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2227 		int found_errors;
2228 		int faila;
2229 		int failb;
2230 
2231 		found_errors = get_rbio_vertical_errors(rbio, sector_nr,
2232 							 &faila, &failb);
2233 		/* This vertical stripe doesn't have errors. */
2234 		if (!found_errors)
2235 			continue;
2236 
2237 		/*
2238 		 * If we found errors, there should be only one error marked
2239 		 * by previous set_rbio_range_error().
2240 		 */
2241 		ASSERT(found_errors == 1);
2242 		found = true;
2243 
2244 		/* Now select another stripe to mark as error. */
2245 		failb = rbio->real_stripes - (mirror_num - 1);
2246 		if (failb <= faila)
2247 			failb--;
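		/*
		 * E.g. with real_stripes == 6 (4 data + P + Q), faila being a
		 * data stripe and mirror_num == 3, failb becomes 6 - 2 == 4,
		 * i.e. the P stripe, forcing this retry to rebuild from the
		 * Q stripe instead.
		 */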
2248 
2249 		/* Set the extra bit in error bitmap. */
2250 		if (failb >= 0)
2251 			set_bit(failb * rbio->stripe_nsectors + sector_nr,
2252 				rbio->error_bitmap);
2253 	}
2254 
2255 	/* We should have found at least one vertical stripe with an error. */
2256 	ASSERT(found);
2257 }
2258 
2259 /*
2260  * the main entry point for reads from the higher layers.  This
2261  * is really only called when the normal read path had a failure,
2262  * so we assume the bio they send down corresponds to a failed part
2263  * of the drive.
2264  */
2265 void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
2266 			   int mirror_num)
2267 {
2268 	struct btrfs_fs_info *fs_info = bioc->fs_info;
2269 	struct btrfs_raid_bio *rbio;
2270 
2271 	rbio = alloc_rbio(fs_info, bioc);
2272 	if (IS_ERR(rbio)) {
2273 		bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
2274 		bio_endio(bio);
2275 		return;
2276 	}
2277 
2278 	rbio->operation = BTRFS_RBIO_READ_REBUILD;
2279 	rbio_add_bio(rbio, bio);
2280 
2281 	set_rbio_range_error(rbio, bio);
2282 
2283 	/*
2284 	 * Loop retry:
2285 	 * for 'mirror_num == 2', reconstruct from all other stripes.
2286 	 * for 'mirror_num > 2', select a stripe to fail on every retry.
2287 	 */
2288 	if (mirror_num > 2)
2289 		set_rbio_raid6_extra_error(rbio, mirror_num);
2290 
2291 	start_async_work(rbio, recover_rbio_work);
2292 }
2293 
2294 static void fill_data_csums(struct btrfs_raid_bio *rbio)
2295 {
2296 	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
2297 	struct btrfs_root *csum_root = btrfs_csum_root(fs_info,
2298 						       rbio->bioc->full_stripe_logical);
2299 	const u64 start = rbio->bioc->full_stripe_logical;
2300 	const u32 len = (rbio->nr_data * rbio->stripe_nsectors) <<
2301 			fs_info->sectorsize_bits;
2302 	int ret;
2303 
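	/*
	 * @start/@len cover only the data portion of the full stripe.  For
	 * example, assuming a 4K sectorsize and 64K BTRFS_STRIPE_LEN (16
	 * sectors per stripe) with 2 data stripes, @len is 2 * 16 * 4K == 128K.
	 */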
2304 	/* The rbio should not have its csum buffer initialized. */
2305 	ASSERT(!rbio->csum_buf && !rbio->csum_bitmap);
2306 
2307 	/*
2308 	 * Skip the csum search if:
2309 	 *
2310 	 * - The rbio doesn't belong to data block groups
2311 	 *   Then we are doing IO for tree blocks, no need to search csums.
2312 	 *
2313 	 * - The rbio belongs to mixed block groups
2314 	 *   This is to avoid deadlock: we're already holding the full
2315 	 *   stripe lock, so if we trigger a metadata read which needs to
2316 	 *   do raid56 recovery, we will deadlock.
2317 	 */
2318 	if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) ||
2319 	    rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA)
2320 		return;
2321 
2322 	rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors *
2323 				 fs_info->csum_size, GFP_NOFS);
2324 	rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors,
2325 					  GFP_NOFS);
2326 	if (!rbio->csum_buf || !rbio->csum_bitmap) {
2327 		ret = -ENOMEM;
2328 		goto error;
2329 	}
2330 
2331 	ret = btrfs_lookup_csums_bitmap(csum_root, NULL, start, start + len - 1,
2332 					rbio->csum_buf, rbio->csum_bitmap);
2333 	if (ret < 0)
2334 		goto error;
2335 	if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits))
2336 		goto no_csum;
2337 	return;
2338 
2339 error:
2340 	/*
2341 	 * We failed to allocate memory or to grab the csums, but it's not
2342 	 * fatal and we can still continue.  However it is better to warn users
2343 	 * that RMW is no longer safe for this particular sub-stripe write.
2344 	 */
2345 	btrfs_warn_rl(fs_info,
2346 "sub-stripe write for full stripe %llu is not safe, failed to get csum: %d",
2347 			rbio->bioc->full_stripe_logical, ret);
2348 no_csum:
2349 	kfree(rbio->csum_buf);
2350 	bitmap_free(rbio->csum_bitmap);
2351 	rbio->csum_buf = NULL;
2352 	rbio->csum_bitmap = NULL;
2353 }
2354 
2355 static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio)
2356 {
2357 	struct bio_list bio_list = BIO_EMPTY_LIST;
2358 	int total_sector_nr;
2359 	int ret = 0;
2360 
2361 	/*
2362 	 * Fill the data csums we need for data verification.  We need to fill
2363 	 * the csum_bitmap/csum_buf first, as our endio function will try to
2364 	 * verify the data sectors.
2365 	 */
2366 	fill_data_csums(rbio);
2367 
2368 	/*
2369 	 * Build a list of bios to read all sectors (including data and P/Q).
2370 	 *
2371 	 * This is needed for the later csum verification and recovery.
2372 	 */
2373 	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2374 	     total_sector_nr++) {
2375 		int stripe = total_sector_nr / rbio->stripe_nsectors;
2376 		int sectornr = total_sector_nr % rbio->stripe_nsectors;
2377 		phys_addr_t *paddrs;
2378 
2379 		paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr);
2380 		ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe,
2381 					 sectornr, REQ_OP_READ);
2382 		if (ret) {
2383 			bio_list_put(&bio_list);
2384 			return ret;
2385 		}
2386 	}
2387 
2388 	/*
2389 	 * We may or may not have any corrupted sectors (including missing devices
2390 	 * and csum mismatches), just let recover_sectors() handle them all.
2391 	 */
2392 	submit_read_wait_bio_list(rbio, &bio_list);
2393 	return recover_sectors(rbio);
2394 }
2395 
2396 static void raid_wait_write_end_io(struct bio *bio)
2397 {
2398 	struct btrfs_raid_bio *rbio = bio->bi_private;
2399 
2400 	if (bio->bi_status)
2401 		rbio_update_error_bitmap(rbio, bio);
2402 	bio_put(bio);
2403 	if (atomic_dec_and_test(&rbio->stripes_pending))
2404 		wake_up(&rbio->io_wait);
2405 }
2406 
2407 static void submit_write_bios(struct btrfs_raid_bio *rbio,
2408 			      struct bio_list *bio_list)
2409 {
2410 	struct bio *bio;
2411 
2412 	atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
2413 	while ((bio = bio_list_pop(bio_list))) {
2414 		bio->bi_end_io = raid_wait_write_end_io;
2415 
2416 		if (trace_raid56_write_enabled()) {
2417 			struct raid56_bio_trace_info trace_info = { 0 };
2418 
2419 			bio_get_trace_info(rbio, bio, &trace_info);
2420 			trace_raid56_write(rbio, bio, &trace_info);
2421 		}
2422 		submit_bio(bio);
2423 	}
2424 }
2425 
2426 /*
2427  * Determine if we need to read any sectors from the disk.
2428  * Should only be used in the RMW path, to skip a cached rbio.
2429  */
2430 static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio)
2431 {
2432 	int i;
2433 
2434 	for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) {
2435 		phys_addr_t paddr = rbio->stripe_paddrs[i * rbio->sector_nsteps];
2436 
2437 		/*
2438 		 * We have a sector which doesn't have a page and is not
2439 		 * uptodate, thus this rbio cannot be a cached one, as a cached
2440 		 * one must have all its data sectors present and uptodate.
2441 		 */
2442 		if (paddr == INVALID_PADDR ||
2443 		    !test_bit(i, rbio->stripe_uptodate_bitmap))
2444 			return true;
2445 	}
2446 	return false;
2447 }
2448 
2449 static void rmw_rbio(struct btrfs_raid_bio *rbio)
2450 {
2451 	struct bio_list bio_list;
2452 	int sectornr;
2453 	int ret = 0;
2454 
2455 	/*
2456 	 * Allocate the pages for parity first, as P/Q pages will always be
2457 	 * needed for both full-stripe and sub-stripe writes.
2458 	 */
2459 	ret = alloc_rbio_parity_pages(rbio);
2460 	if (ret < 0)
2461 		goto out;
2462 
2463 	/*
2464 	 * For a full stripe write, or when every data sector is already
2465 	 * cached, we can go to the write path immediately.
2466 	 */
2467 	if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) {
2468 		/*
2469 		 * Now we're doing a sub-stripe write, we also need all the
2470 		 * data stripes to do the full RMW.
2471 		 */
2472 		ret = alloc_rbio_data_pages(rbio);
2473 		if (ret < 0)
2474 			goto out;
2475 
2476 		index_rbio_pages(rbio);
2477 
2478 		ret = rmw_read_wait_recover(rbio);
2479 		if (ret < 0)
2480 			goto out;
2481 	}
2482 
2483 	/*
2484 	 * At this stage we're not allowed to add any new bios to the
2485 	 * bio list any more, anyone else that wants to change this stripe
2486 	 * needs to do their own rmw.
2487 	 */
2488 	spin_lock(&rbio->bio_list_lock);
2489 	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
2490 	spin_unlock(&rbio->bio_list_lock);
2491 
2492 	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
2493 
2494 	index_rbio_pages(rbio);
2495 
2496 	/*
2497 	 * We don't cache full rbios because we're assuming
2498 	 * the higher layers are unlikely to use this area of
2499 	 * the disk again soon.  If they do use it again,
2500 	 * hopefully they will send another full bio.
2501 	 */
2502 	if (!rbio_is_full(rbio))
2503 		cache_rbio_pages(rbio);
2504 	else
2505 		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2506 
2507 	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++)
2508 		generate_pq_vertical(rbio, sectornr);
2509 
2510 	bio_list_init(&bio_list);
2511 	ret = rmw_assemble_write_bios(rbio, &bio_list);
2512 	if (ret < 0)
2513 		goto out;
2514 
2515 	/* We should have at least one bio assembled. */
2516 	ASSERT(bio_list_size(&bio_list));
2517 	submit_write_bios(rbio, &bio_list);
2518 	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
2519 
2520 	/* We may have more errors than our tolerance during the read. */
2521 	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
2522 		int found_errors;
2523 
2524 		found_errors = get_rbio_vertical_errors(rbio, sectornr, NULL, NULL);
2525 		if (unlikely(found_errors > rbio->bioc->max_errors)) {
2526 			ret = -EIO;
2527 			break;
2528 		}
2529 	}
2530 out:
2531 	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2532 }
2533 
2534 static void rmw_rbio_work(struct work_struct *work)
2535 {
2536 	struct btrfs_raid_bio *rbio;
2537 
2538 	rbio = container_of(work, struct btrfs_raid_bio, work);
2539 	if (lock_stripe_add(rbio) == 0)
2540 		rmw_rbio(rbio);
2541 }
2542 
2543 static void rmw_rbio_work_locked(struct work_struct *work)
2544 {
2545 	rmw_rbio(container_of(work, struct btrfs_raid_bio, work));
2546 }
2547 
2548 /*
2549  * The following code is used to scrub/replace the parity stripe.
2550  *
2551  * Caller must have already increased bio_counter for getting @bioc.
2552  *
2553  * Note: We need to make sure all the pages added into the scrub/replace
2554  * raid bio are correct and are not changed during the scrub/replace.  That
2555  * is, those pages only hold metadata or file data covered by a checksum.
2556  */
2557 
2558 struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
2559 				struct btrfs_io_context *bioc,
2560 				struct btrfs_device *scrub_dev,
2561 				unsigned long *dbitmap, int stripe_nsectors)
2562 {
2563 	struct btrfs_fs_info *fs_info = bioc->fs_info;
2564 	struct btrfs_raid_bio *rbio;
2565 	int i;
2566 
2567 	rbio = alloc_rbio(fs_info, bioc);
2568 	if (IS_ERR(rbio))
2569 		return NULL;
2570 	bio_list_add(&rbio->bio_list, bio);
2571 	/*
2572 	 * This is a special bio which is used to hold the completion handler
2573 	 * and make the scrub rbio similar to the other types.
2574 	 */
2575 	ASSERT(!bio->bi_iter.bi_size);
2576 	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2577 
2578 	/*
2579 	 * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
2580 	 * to the end position, so this search can start from the first parity
2581 	 * stripe.
2582 	 */
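	/*
	 * For example, with RAID6 over 4 data stripes, bioc->stripes[4] and
	 * bioc->stripes[5] are the P and Q stripes, so the search below
	 * starts at rbio->nr_data == 4.
	 */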
2583 	for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
2584 		if (bioc->stripes[i].dev == scrub_dev) {
2585 			rbio->scrubp = i;
2586 			break;
2587 		}
2588 	}
2589 	ASSERT_RBIO_STRIPE(i < rbio->real_stripes, rbio, i);
2590 
2591 	bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
2592 	return rbio;
2593 }
2594 
2595 static int alloc_rbio_sector_pages(struct btrfs_raid_bio *rbio,
2596 				  int sector_nr)
2597 {
2598 	const u32 step = min(PAGE_SIZE, rbio->bioc->fs_info->sectorsize);
2599 	const u32 base = sector_nr * rbio->sector_nsteps;
2600 
2601 	for (int i = base; i < base + rbio->sector_nsteps; i++) {
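	/*
	 * Make sure every step of this sector has its backing page allocated.
	 * E.g. assuming a 4K sectorsize on a 64K page system, sixteen
	 * consecutive sectors share one stripe page, which is only allocated
	 * once; with a 16K sectorsize on a 4K page system each sector needs
	 * four pages of its own.
	 */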
2602 		const unsigned int page_index = (i * step) >> PAGE_SHIFT;
2603 		struct page *page;
2604 
2605 		if (rbio->stripe_pages[page_index])
2606 			continue;
2607 		page = alloc_page(GFP_NOFS);
2608 		if (!page)
2609 			return -ENOMEM;
2610 		rbio->stripe_pages[page_index] = page;
2611 	}
2612 	return 0;
2613 }
2614 
2615 /*
2616  * We only scrub the parity for which we have correct data on the same
2617  * horizontal stripe, so we don't need to allocate pages for all the stripes.
2618  */
2619 static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2620 {
2621 	int total_sector_nr;
2622 
2623 	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2624 	     total_sector_nr++) {
2625 		int sectornr = total_sector_nr % rbio->stripe_nsectors;
2626 		int ret;
2627 
2628 		if (!test_bit(sectornr, &rbio->dbitmap))
2629 			continue;
2630 		ret = alloc_rbio_sector_pages(rbio, total_sector_nr);
2631 		if (ret < 0)
2632 			return ret;
2633 	}
2634 	index_stripe_sectors(rbio);
2635 	return 0;
2636 }
2637 
2638 /* Return true if the content of the step matches the calculated one. */
2639 static bool verify_one_parity_step(struct btrfs_raid_bio *rbio,
2640 				   void *pointers[], unsigned int sector_nr,
2641 				   unsigned int step_nr)
2642 {
2643 	const unsigned int nr_data = rbio->nr_data;
2644 	const bool has_qstripe = (rbio->real_stripes - rbio->nr_data == 2);
2645 	const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);
2646 	void *parity;
2647 	bool ret = false;
2648 
2649 	ASSERT(step_nr < rbio->sector_nsteps);
2650 
2651 	/* First collect one page from each data stripe. */
2652 	for (int stripe = 0; stripe < nr_data; stripe++)
2653 		pointers[stripe] = kmap_local_paddr(
2654 				sector_paddr_in_rbio(rbio, stripe, sector_nr,
2655 						     step_nr, 0));
2656 
2657 	if (has_qstripe) {
2658 		assert_rbio(rbio);
2659 		/* RAID6, call the library function to fill in our P/Q. */
2660 		raid6_call.gen_syndrome(rbio->real_stripes, step, pointers);
2661 	} else {
2662 		/* RAID5. */
2663 		memcpy(pointers[nr_data], pointers[0], step);
2664 		run_xor(pointers + 1, nr_data - 1, step);
2665 	}
2666 
2667 	/* Check scrubbing parity and repair it. */
2668 	parity = kmap_local_paddr(rbio_stripe_paddr(rbio, rbio->scrubp, sector_nr, step_nr));
2669 	if (memcmp(parity, pointers[rbio->scrubp], step) != 0)
2670 		memcpy(parity, pointers[rbio->scrubp], step);
2671 	else
2672 		ret = true;
2673 	kunmap_local(parity);
2674 
2675 	for (int stripe = nr_data - 1; stripe >= 0; stripe--)
2676 		kunmap_local(pointers[stripe]);
2677 	return ret;
2678 }
2679 
2680 /*
2681  * The @pointers array should have the P/Q parity already mapped.
2682  */
2683 static void verify_one_parity_sector(struct btrfs_raid_bio *rbio,
2684 				     void *pointers[], unsigned int sector_nr)
2685 {
2686 	bool found_error = false;
2687 
2688 	for (int step_nr = 0; step_nr < rbio->sector_nsteps; step_nr++) {
2689 		bool match;
2690 
2691 		match = verify_one_parity_step(rbio, pointers, sector_nr, step_nr);
2692 		if (!match)
2693 			found_error = true;
2694 	}
2695 	if (!found_error)
2696 		bitmap_clear(&rbio->dbitmap, sector_nr, 1);
2697 }
2698 
2699 static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
2700 {
2701 	struct btrfs_io_context *bioc = rbio->bioc;
2702 	void **pointers = rbio->finish_pointers;
2703 	unsigned long *pbitmap = &rbio->finish_pbitmap;
2704 	int nr_data = rbio->nr_data;
2705 	int sectornr;
2706 	bool has_qstripe;
2707 	struct page *page;
2708 	phys_addr_t p_paddr = INVALID_PADDR;
2709 	phys_addr_t q_paddr = INVALID_PADDR;
2710 	struct bio_list bio_list;
2711 	int is_replace = 0;
2712 	int ret;
2713 
2714 	bio_list_init(&bio_list);
2715 
2716 	if (rbio->real_stripes - rbio->nr_data == 1)
2717 		has_qstripe = false;
2718 	else if (rbio->real_stripes - rbio->nr_data == 2)
2719 		has_qstripe = true;
2720 	else
2721 		BUG();
2722 
2723 	/*
2724 	 * If replace is running and our P/Q stripe is being replaced, then we
2725 	 * need to duplicate the final write to the replace target.
2726 	 */
2727 	if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) {
2728 		is_replace = 1;
2729 		bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
2730 	}
2731 
2732 	/*
2733 	 * The higher layers (scrubber) are unlikely to use this area of the
2734 	 * disk again soon, so don't cache it.
2735 	 */
2737 	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2738 
2739 	page = alloc_page(GFP_NOFS);
2740 	if (!page)
2741 		return -ENOMEM;
2742 	p_paddr = page_to_phys(page);
2743 	page = NULL;
2744 	pointers[nr_data] = kmap_local_paddr(p_paddr);
2745 
2746 	if (has_qstripe) {
2747 		/* RAID6, allocate and map temp space for the Q stripe */
2748 		page = alloc_page(GFP_NOFS);
2749 		if (!page) {
2750 			__free_page(phys_to_page(p_paddr));
2751 			p_paddr = INVALID_PADDR;
2752 			return -ENOMEM;
2753 		}
2754 		q_paddr = page_to_phys(page);
2755 		page = NULL;
2756 		pointers[rbio->real_stripes - 1] = kmap_local_paddr(q_paddr);
2757 	}
2758 
2759 	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
2760 
2761 	/* Map the parity stripe just once */
2762 
2763 	for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors)
2764 		verify_one_parity_sector(rbio, pointers, sectornr);
2765 
2766 	kunmap_local(pointers[nr_data]);
2767 	__free_page(phys_to_page(p_paddr));
2768 	p_paddr = INVALID_PADDR;
2769 	if (q_paddr != INVALID_PADDR) {
2770 		__free_page(phys_to_page(q_paddr));
2771 		q_paddr = INVALID_PADDR;
2772 	}
2773 
2774 	/*
2775 	 * Time to start writing.  Make bios for everything from the
2776 	 * higher layers (the bio_list in our rbio) and our P/Q.  Ignore
2777 	 * everything else.
2778 	 */
2779 	for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
2780 		phys_addr_t *paddrs;
2781 
2782 		paddrs = rbio_stripe_paddrs(rbio, rbio->scrubp, sectornr);
2783 		ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, rbio->scrubp,
2784 					 sectornr, REQ_OP_WRITE);
2785 		if (ret)
2786 			goto cleanup;
2787 	}
2788 
2789 	if (!is_replace)
2790 		goto submit_write;
2791 
2792 	/*
2793 	 * Replace is running and our parity stripe needs to be duplicated to
2794 	 * the target device.  Check we have a valid source stripe number.
2795 	 */
2796 	ASSERT_RBIO(rbio->bioc->replace_stripe_src >= 0, rbio);
2797 	for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
2798 		phys_addr_t *paddrs;
2799 
2800 		paddrs = rbio_stripe_paddrs(rbio, rbio->scrubp, sectornr);
2801 		ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, rbio->real_stripes,
2802 					 sectornr, REQ_OP_WRITE);
2803 		if (ret)
2804 			goto cleanup;
2805 	}
2806 
2807 submit_write:
2808 	submit_write_bios(rbio, &bio_list);
2809 	return 0;
2810 
2811 cleanup:
2812 	bio_list_put(&bio_list);
2813 	return ret;
2814 }
2815 
2816 static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2817 {
2818 	if (stripe >= 0 && stripe < rbio->nr_data)
2819 		return 1;
2820 	return 0;
2821 }
2822 
2823 static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
2824 {
2825 	void **pointers = NULL;
2826 	void **unmap_array = NULL;
2827 	int sector_nr;
2828 	int ret = 0;
2829 
2830 	/*
2831 	 * The @pointers array stores the pointer for each sector.
2832 	 *
2833 	 * @unmap_array stores a copy of the pointers that does not get
2834 	 * reordered during reconstruction so that kunmap_local() works.
2835 	 */
2836 	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
2837 	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
2838 	if (!pointers || !unmap_array) {
2839 		ret = -ENOMEM;
2840 		goto out;
2841 	}
2842 
2843 	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2844 		int dfail = 0, failp = -1;
2845 		int faila;
2846 		int failb;
2847 		int found_errors;
2848 
2849 		found_errors = get_rbio_vertical_errors(rbio, sector_nr,
2850 							 &faila, &failb);
2851 		if (unlikely(found_errors > rbio->bioc->max_errors)) {
2852 			ret = -EIO;
2853 			goto out;
2854 		}
2855 		if (found_errors == 0)
2856 			continue;
2857 
2858 		/* We should have at least one error here. */
2859 		ASSERT(faila >= 0 || failb >= 0);
2860 
2861 		if (is_data_stripe(rbio, faila))
2862 			dfail++;
2863 		else if (is_parity_stripe(faila))
2864 			failp = faila;
2865 
2866 		if (is_data_stripe(rbio, failb))
2867 			dfail++;
2868 		else if (is_parity_stripe(failb))
2869 			failp = failb;
2870 		/*
2871 		 * Because we cannot use the parity being scrubbed to repair
2872 		 * the data, our repair capability is reduced.  (In the case
2873 		 * of RAID5, we cannot repair anything.)
2874 		 */
2875 		if (unlikely(dfail > rbio->bioc->max_errors - 1)) {
2876 			ret = -EIO;
2877 			goto out;
2878 		}
2879 		/*
2880 		 * If all the data is good and only the parity is corrupted,
2881 		 * just repair the parity, no need to recover data stripes.
2882 		 */
2883 		if (dfail == 0)
2884 			continue;
2885 
2886 		/*
2887 		 * Here we have one corrupted data stripe and one corrupted
2888 		 * parity on RAID6.  If the corrupted parity is the one being
2889 		 * scrubbed, we can luckily use the other parity to repair the
2890 		 * data; otherwise we cannot repair the data stripe.
2891 		 */
2892 		if (unlikely(failp != rbio->scrubp)) {
2893 			ret = -EIO;
2894 			goto out;
2895 		}
2896 
2897 		ret = recover_vertical(rbio, sector_nr, pointers, unmap_array);
2898 		if (ret < 0)
2899 			goto out;
2900 	}
2901 out:
2902 	kfree(pointers);
2903 	kfree(unmap_array);
2904 	return ret;
2905 }
2906 
2907 static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio)
2908 {
2909 	struct bio_list bio_list = BIO_EMPTY_LIST;
2910 	int total_sector_nr;
2911 	int ret = 0;
2912 
2913 	/* Build a list of bios to read all the missing parts. */
2914 	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2915 	     total_sector_nr++) {
2916 		int sectornr = total_sector_nr % rbio->stripe_nsectors;
2917 		int stripe = total_sector_nr / rbio->stripe_nsectors;
2918 		phys_addr_t *paddrs;
2919 
2920 		/* No data in the vertical stripe, no need to read. */
2921 		if (!test_bit(sectornr, &rbio->dbitmap))
2922 			continue;
2923 
2924 		/*
2925 		 * We want to find all the sectors missing from the rbio and read
2926 		 * them from the disk.  If sector_paddrs_in_rbio() finds the sector
2927 		 * in the bio list we don't need to read it off the stripe.
2928 		 */
2929 		paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1);
2930 		if (paddrs == NULL)
2931 			continue;
2932 
2933 		paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr);
2934 		/*
2935 		 * The bio cache may have handed us an uptodate sector.  If so,
2936 		 * use it.
2937 		 */
2938 		if (test_bit(rbio_sector_index(rbio, stripe, sectornr),
2939 			     rbio->stripe_uptodate_bitmap))
2940 			continue;
2941 
2942 		ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe,
2943 					 sectornr, REQ_OP_READ);
2944 		if (ret) {
2945 			bio_list_put(&bio_list);
2946 			return ret;
2947 		}
2948 	}
2949 
2950 	submit_read_wait_bio_list(rbio, &bio_list);
2951 	return 0;
2952 }
2953 
2954 static void scrub_rbio(struct btrfs_raid_bio *rbio)
2955 {
2956 	int sector_nr;
2957 	int ret;
2958 
2959 	ret = alloc_rbio_essential_pages(rbio);
2960 	if (ret)
2961 		goto out;
2962 
2963 	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
2964 
2965 	ret = scrub_assemble_read_bios(rbio);
2966 	if (ret < 0)
2967 		goto out;
2968 
2969 	/* We may have some failures, recover the failed sectors first. */
2970 	ret = recover_scrub_rbio(rbio);
2971 	if (ret < 0)
2972 		goto out;
2973 
2974 	/*
2975 	 * We have every sector properly prepared.  Now we can finish the
2976 	 * scrub and write back the good content.
2977 	 */
2978 	ret = finish_parity_scrub(rbio);
2979 	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
2980 	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2981 		int found_errors;
2982 
2983 		found_errors = get_rbio_vertical_errors(rbio, sector_nr, NULL, NULL);
2984 		if (unlikely(found_errors > rbio->bioc->max_errors)) {
2985 			ret = -EIO;
2986 			break;
2987 		}
2988 	}
2989 out:
2990 	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2991 }
2992 
2993 static void scrub_rbio_work_locked(struct work_struct *work)
2994 {
2995 	scrub_rbio(container_of(work, struct btrfs_raid_bio, work));
2996 }
2997 
2998 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
2999 {
3000 	if (!lock_stripe_add(rbio))
3001 		start_async_work(rbio, scrub_rbio_work_locked);
3002 }
3003 
3004 /*
3005  * This is for scrub call sites where we already have correct data contents.
3006  * This allows us to avoid reading the data stripes again.
3007  *
3008  * Unfortunately here we have to copy the folios, instead of reusing the pages.
3009  * This is due to the fact that the rbio has its own page management for its cache.
3010  */
3011 void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio,
3012 				     struct folio **data_folios, u64 data_logical)
3013 {
3014 	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
3015 	const u64 offset_in_full_stripe = data_logical -
3016 					  rbio->bioc->full_stripe_logical;
3017 	unsigned int findex = 0;
3018 	unsigned int foffset = 0;
3019 	int ret;
3020 
3021 	/*
3022 	 * If we hit ENOMEM here temporarily, but it later succeeds at
3023 	 * raid56_parity_submit_scrub_rbio() time, we just do the extra
3024 	 * read, not a big deal.
3025 	 *
3026 	 * If we hit ENOMEM later at raid56_parity_submit_scrub_rbio() time,
3027 	 * the bio will get a proper error set.
3028 	 */
3029 	ret = alloc_rbio_data_pages(rbio);
3030 	if (ret < 0)
3031 		return;
3032 
3033 	/* data_logical must be at stripe boundary and inside the full stripe. */
3034 	ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN));
3035 	ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT));
3036 
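	/*
	 * The loop below copies exactly one BTRFS_STRIPE_LEN worth of data,
	 * one PAGE_SIZE chunk at a time, advancing @findex/@foffset through
	 * the source folios as each one is fully consumed.
	 */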
3037 	for (unsigned int cur_off = offset_in_full_stripe;
3038 	     cur_off < offset_in_full_stripe + BTRFS_STRIPE_LEN;
3039 	     cur_off += PAGE_SIZE) {
3040 		const unsigned int pindex = cur_off >> PAGE_SHIFT;
3041 		void *kaddr;
3042 
3043 		kaddr = kmap_local_page(rbio->stripe_pages[pindex]);
3044 		memcpy_from_folio(kaddr, data_folios[findex], foffset, PAGE_SIZE);
3045 		kunmap_local(kaddr);
3046 
3047 		foffset += PAGE_SIZE;
3048 		ASSERT(foffset <= folio_size(data_folios[findex]));
3049 		if (foffset == folio_size(data_folios[findex])) {
3050 			findex++;
3051 			foffset = 0;
3052 		}
3053 	}
3054 	bitmap_set(rbio->stripe_uptodate_bitmap,
3055 		   offset_in_full_stripe >> fs_info->sectorsize_bits,
3056 		   BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);
3057 }
3058