1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (C) 2012 Fusion-io All rights reserved.
4 * Copyright (C) 2012 Intel Corp. All rights reserved.
5 */
6
7 #include <linux/sched.h>
8 #include <linux/bio.h>
9 #include <linux/slab.h>
10 #include <linux/blkdev.h>
11 #include <linux/raid/pq.h>
12 #include <linux/hash.h>
13 #include <linux/list_sort.h>
14 #include <linux/raid/xor.h>
15 #include <linux/mm.h>
16 #include "messages.h"
17 #include "ctree.h"
18 #include "disk-io.h"
19 #include "volumes.h"
20 #include "raid56.h"
21 #include "async-thread.h"
22 #include "file-item.h"
23 #include "btrfs_inode.h"
24
25 /* set when additional merges to this rbio are not allowed */
26 #define RBIO_RMW_LOCKED_BIT 1
27
28 /*
29 * set when this rbio is sitting in the hash, but it is just a cache
30 * of past RMW
31 */
32 #define RBIO_CACHE_BIT 2
33
34 /*
35 * set when it is safe to trust the stripe_pages for caching
36 */
37 #define RBIO_CACHE_READY_BIT 3
38
39 #define RBIO_CACHE_SIZE 1024
40
41 #define BTRFS_STRIPE_HASH_TABLE_BITS 11
42
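/*
 * Dump the state of a btrfs_io_context and all of its stripes for debugging,
 * printed with btrfs_crit() so it shows up next to the assertion that failed.
 */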
43 static void dump_bioc(const struct btrfs_fs_info *fs_info, const struct btrfs_io_context *bioc)
44 {
45 if (unlikely(!bioc)) {
46 btrfs_crit(fs_info, "bioc=NULL");
47 return;
48 }
49 btrfs_crit(fs_info,
50 "bioc logical=%llu full_stripe=%llu size=%llu map_type=0x%llx mirror=%u replace_nr_stripes=%u replace_stripe_src=%d num_stripes=%u",
51 bioc->logical, bioc->full_stripe_logical, bioc->size,
52 bioc->map_type, bioc->mirror_num, bioc->replace_nr_stripes,
53 bioc->replace_stripe_src, bioc->num_stripes);
54 for (int i = 0; i < bioc->num_stripes; i++) {
55 btrfs_crit(fs_info, " nr=%d devid=%llu physical=%llu",
56 i, bioc->stripes[i].dev->devid,
57 bioc->stripes[i].physical);
58 }
59 }
60
61 static void btrfs_dump_rbio(const struct btrfs_fs_info *fs_info,
62 const struct btrfs_raid_bio *rbio)
63 {
64 if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
65 return;
66
67 dump_bioc(fs_info, rbio->bioc);
68 btrfs_crit(fs_info,
69 "rbio flags=0x%lx nr_sectors=%u nr_data=%u real_stripes=%u stripe_nsectors=%u sector_nsteps=%u scrubp=%u dbitmap=0x%lx",
70 rbio->flags, rbio->nr_sectors, rbio->nr_data,
71 rbio->real_stripes, rbio->stripe_nsectors,
72 rbio->sector_nsteps, rbio->scrubp, rbio->dbitmap);
73 }
74
75 #define ASSERT_RBIO(expr, rbio) \
76 ({ \
77 if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \
78 const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \
79 (rbio)->bioc->fs_info : NULL; \
80 \
81 btrfs_dump_rbio(__fs_info, (rbio)); \
82 } \
83 ASSERT((expr)); \
84 })
85
86 #define ASSERT_RBIO_STRIPE(expr, rbio, stripe_nr) \
87 ({ \
88 if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \
89 const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \
90 (rbio)->bioc->fs_info : NULL; \
91 \
92 btrfs_dump_rbio(__fs_info, (rbio)); \
93 btrfs_crit(__fs_info, "stripe_nr=%d", (stripe_nr)); \
94 } \
95 ASSERT((expr)); \
96 })
97
98 #define ASSERT_RBIO_SECTOR(expr, rbio, sector_nr) \
99 ({ \
100 if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \
101 const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \
102 (rbio)->bioc->fs_info : NULL; \
103 \
104 btrfs_dump_rbio(__fs_info, (rbio)); \
105 btrfs_crit(__fs_info, "sector_nr=%d", (sector_nr)); \
106 } \
107 ASSERT((expr)); \
108 })
109
110 #define ASSERT_RBIO_LOGICAL(expr, rbio, logical) \
111 ({ \
112 if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \
113 const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \
114 (rbio)->bioc->fs_info : NULL; \
115 \
116 btrfs_dump_rbio(__fs_info, (rbio)); \
117 btrfs_crit(__fs_info, "logical=%llu", (logical)); \
118 } \
119 ASSERT((expr)); \
120 })
121
122 /* Used by the raid56 code to lock stripes for read/modify/write */
123 struct btrfs_stripe_hash {
124 struct list_head hash_list;
125 spinlock_t lock;
126 };
127
128 /* Used by the raid56 code to lock stripes for read/modify/write */
129 struct btrfs_stripe_hash_table {
130 struct list_head stripe_cache;
131 spinlock_t cache_lock;
132 int cache_size;
133 struct btrfs_stripe_hash table[];
134 };
135
136 /*
137 * The PFN may still be valid, but our paddrs should always be block size
138 * aligned, thus such -1 paddr is definitely not a valid one.
139 */
140 #define INVALID_PADDR (~(phys_addr_t)0)
141
142 static void rmw_rbio_work(struct work_struct *work);
143 static void rmw_rbio_work_locked(struct work_struct *work);
144 static void index_rbio_pages(struct btrfs_raid_bio *rbio);
145 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
146
147 static int finish_parity_scrub(struct btrfs_raid_bio *rbio);
148 static void scrub_rbio_work_locked(struct work_struct *work);
149
150 static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio)
151 {
152 bitmap_free(rbio->error_bitmap);
153 bitmap_free(rbio->stripe_uptodate_bitmap);
154 kfree(rbio->stripe_pages);
155 kfree(rbio->bio_paddrs);
156 kfree(rbio->stripe_paddrs);
157 kfree(rbio->finish_pointers);
158 }
159
160 static void free_raid_bio(struct btrfs_raid_bio *rbio)
161 {
162 int i;
163
164 if (!refcount_dec_and_test(&rbio->refs))
165 return;
166
167 WARN_ON(!list_empty(&rbio->stripe_cache));
168 WARN_ON(!list_empty(&rbio->hash_list));
169 WARN_ON(!bio_list_empty(&rbio->bio_list));
170
171 for (i = 0; i < rbio->nr_pages; i++) {
172 if (rbio->stripe_pages[i]) {
173 __free_page(rbio->stripe_pages[i]);
174 rbio->stripe_pages[i] = NULL;
175 }
176 }
177
178 btrfs_put_bioc(rbio->bioc);
179 free_raid_bio_pointers(rbio);
180 kfree(rbio);
181 }
182
183 static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
184 {
185 INIT_WORK(&rbio->work, work_func);
186 queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
187 }
188
189 /*
190 * the stripe hash table is used for locking, and to collect
191 * bios in hopes of making a full stripe
192 */
193 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
194 {
195 struct btrfs_stripe_hash_table *table;
196 struct btrfs_stripe_hash_table *x;
197 struct btrfs_stripe_hash *cur;
198 struct btrfs_stripe_hash *h;
199 unsigned int num_entries = 1U << BTRFS_STRIPE_HASH_TABLE_BITS;
200
201 if (info->stripe_hash_table)
202 return 0;
203
204 /*
205 * The table is large, starting with order 4 and can go as high as
206 * order 7 in case lock debugging is turned on.
207 *
208 * Try harder to allocate and fall back to vmalloc to lower the chance
209 * of a failing mount.
210 */
211 table = kvzalloc_flex(*table, table, num_entries);
212 if (!table)
213 return -ENOMEM;
214
215 spin_lock_init(&table->cache_lock);
216 INIT_LIST_HEAD(&table->stripe_cache);
217
218 h = table->table;
219
220 for (unsigned int i = 0; i < num_entries; i++) {
221 cur = h + i;
222 INIT_LIST_HEAD(&cur->hash_list);
223 spin_lock_init(&cur->lock);
224 }
225
226 x = cmpxchg(&info->stripe_hash_table, NULL, table);
227 kvfree(x);
228 return 0;
229 }
230
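/*
 * Copy one full sector (all of its steps) from the bio pages into the rbio's
 * own stripe pages, using the already indexed physical addresses.
 */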
231 static void memcpy_from_bio_to_stripe(struct btrfs_raid_bio *rbio, unsigned int sector_nr)
232 {
233 const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);
234
235 ASSERT(sector_nr < rbio->nr_sectors);
236 for (int i = 0; i < rbio->sector_nsteps; i++) {
237 unsigned int index = sector_nr * rbio->sector_nsteps + i;
238 phys_addr_t dst = rbio->stripe_paddrs[index];
239 phys_addr_t src = rbio->bio_paddrs[index];
240
241 ASSERT(dst != INVALID_PADDR);
242 ASSERT(src != INVALID_PADDR);
243
244 memcpy_page(phys_to_page(dst), offset_in_page(dst),
245 phys_to_page(src), offset_in_page(src), step);
246 }
247 }
248
249 /*
250 * Caching an rbio means to copy everything covered by the
251 * bio list into the stripe_pages array. We use the
252 * stripe_uptodate_bitmap to indicate which sectors
253 * hold valid data.
254 *
255 * once the caching is done, we set the cache ready
256 * bit.
257 */
258 static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
259 {
260 int i;
261 int ret;
262
263 ret = alloc_rbio_pages(rbio);
264 if (ret)
265 return;
266
267 for (i = 0; i < rbio->nr_sectors; i++) {
268 /* Some range not covered by bio (partial write), skip it */
269 if (rbio->bio_paddrs[i * rbio->sector_nsteps] == INVALID_PADDR) {
270 /*
271 * Even if the sector is not covered by bio, if it is
272 * a data sector it should still be uptodate as it is
273 * read from disk.
274 */
275 if (i < rbio->nr_data * rbio->stripe_nsectors)
276 ASSERT(test_bit(i, rbio->stripe_uptodate_bitmap));
277 continue;
278 }
279
280 memcpy_from_bio_to_stripe(rbio, i);
281 set_bit(i, rbio->stripe_uptodate_bitmap);
282 }
283 set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
284 }
285
286 /*
287 * we hash on the first logical address of the stripe
288 */
289 static int rbio_bucket(struct btrfs_raid_bio *rbio)
290 {
291 u64 num = rbio->bioc->full_stripe_logical;
292
293 /*
294 * we shift down quite a bit. We're using byte
295 * addressing, and most of the lower bits are zeros.
296 * This tends to upset hash_64, and it consistently
297 * returns just one or two different values.
298 *
299 * shifting off the lower bits fixes things.
300 */
301 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
302 }
303
304 /* Get the sector number of the first sector covered by @page_nr. */
305 static u32 page_nr_to_sector_nr(struct btrfs_raid_bio *rbio, unsigned int page_nr)
306 {
307 u32 sector_nr;
308
309 ASSERT(page_nr < rbio->nr_pages);
310
311 sector_nr = (page_nr << PAGE_SHIFT) >> rbio->bioc->fs_info->sectorsize_bits;
312 ASSERT(sector_nr < rbio->nr_sectors);
313 return sector_nr;
314 }
315
316 /*
317 * Get the number of sectors covered by @page_nr.
318 *
319 * For bs > ps cases, the result will always be 1.
320 * For bs <= ps cases, the result will be ps / bs.
321 */
322 static u32 page_nr_to_num_sectors(struct btrfs_raid_bio *rbio, unsigned int page_nr)
323 {
324 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
325 u32 nr_sectors;
326
327 ASSERT(page_nr < rbio->nr_pages);
328
329 nr_sectors = round_up(PAGE_SIZE, fs_info->sectorsize) >> fs_info->sectorsize_bits;
330 ASSERT(nr_sectors > 0);
331 return nr_sectors;
332 }
333
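/* Return true if every sector backed by @page_nr is marked uptodate. */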
334 static __maybe_unused bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
335 unsigned int page_nr)
336 {
337 const u32 sector_nr = page_nr_to_sector_nr(rbio, page_nr);
338 const u32 nr_bits = page_nr_to_num_sectors(rbio, page_nr);
339 int i;
340
341 ASSERT(page_nr < rbio->nr_pages);
342 ASSERT(sector_nr + nr_bits < rbio->nr_sectors);
343
344 for (i = sector_nr; i < sector_nr + nr_bits; i++) {
345 if (!test_bit(i, rbio->stripe_uptodate_bitmap))
346 return false;
347 }
348 return true;
349 }
350
351 /*
352 * Update the stripe_paddrs[] array to the correct physical addresses.
353 *
354 * Should be called every time any page pointer in stripe_pages[] is modified.
355 */
356 static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
357 {
358 const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);
359 u32 offset;
360 int i;
361
362 for (i = 0, offset = 0; i < rbio->nr_sectors * rbio->sector_nsteps;
363 i++, offset += step) {
364 int page_index = offset >> PAGE_SHIFT;
365
366 ASSERT(page_index < rbio->nr_pages);
367 if (!rbio->stripe_pages[page_index])
368 continue;
369
370 rbio->stripe_paddrs[i] = page_to_phys(rbio->stripe_pages[page_index]) +
371 offset_in_page(offset);
372 }
373 }
374
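/*
 * Move one stripe page from @src to @dest, freeing any page @dest already had
 * there, and mark the covered sectors uptodate in @dest.
 */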
375 static void steal_rbio_page(struct btrfs_raid_bio *src,
376 struct btrfs_raid_bio *dest, int page_nr)
377 {
378 const u32 sector_nr = page_nr_to_sector_nr(src, page_nr);
379 const u32 nr_bits = page_nr_to_num_sectors(src, page_nr);
380
381 ASSERT(page_nr < src->nr_pages);
382 ASSERT(sector_nr + nr_bits < src->nr_sectors);
383
384 if (dest->stripe_pages[page_nr])
385 __free_page(dest->stripe_pages[page_nr]);
386 dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
387 src->stripe_pages[page_nr] = NULL;
388
389 /* Also update the stripe_uptodate_bitmap bits. */
390 bitmap_set(dest->stripe_uptodate_bitmap, sector_nr, nr_bits);
391 }
392
393 static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr)
394 {
395 const int sector_nr = page_nr_to_sector_nr(rbio, page_nr);
396
397 /*
398 * We have ensured PAGE_SIZE and sectorsize are aligned to each other, thus
399 * we won't have a page which is half data half parity.
400 *
401 * Thus if the first sector of the page belongs to data stripes, then
402 * the full page belongs to data stripes.
403 */
404 return (sector_nr < rbio->nr_data * rbio->stripe_nsectors);
405 }
406
407 /*
408 * Stealing an rbio means taking all the uptodate pages from the stripe array
409 * in the source rbio and putting them into the destination rbio.
410 *
411 * This will also update the involved stripe_paddrs[] entries that were
412 * referring to the old pages.
413 */
414 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
415 {
416 int i;
417
418 if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
419 return;
420
421 for (i = 0; i < dest->nr_pages; i++) {
422 struct page *p = src->stripe_pages[i];
423
424 /*
425 * We don't need to steal P/Q pages as they will always be
426 * regenerated for RMW or full write anyway.
427 */
428 if (!is_data_stripe_page(src, i))
429 continue;
430
431 /*
432 * If @src already has RBIO_CACHE_READY_BIT, it should have
433 * all data stripe pages present and uptodate.
434 */
435 ASSERT(p);
436 ASSERT(full_page_sectors_uptodate(src, i));
437 steal_rbio_page(src, dest, i);
438 }
439 index_stripe_sectors(dest);
440 index_stripe_sectors(src);
441 }
442
443 /*
444 * merging means we take the bio_list from the victim and
445 * splice it into the destination. The victim should
446 * be discarded afterwards.
447 *
448 * Must be called with dest->bio_list_lock held.
449 */
450 static void merge_rbio(struct btrfs_raid_bio *dest,
451 struct btrfs_raid_bio *victim)
452 {
453 bio_list_merge_init(&dest->bio_list, &victim->bio_list);
454 dest->bio_list_bytes += victim->bio_list_bytes;
455 /* Also inherit the bitmaps from @victim. */
456 bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
457 dest->stripe_nsectors);
458 }
459
460 /*
461 * used to prune items that are in the cache. The caller
462 * must hold the hash table lock.
463 */
464 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
465 {
466 int bucket = rbio_bucket(rbio);
467 struct btrfs_stripe_hash_table *table;
468 struct btrfs_stripe_hash *h;
469 int freeit = 0;
470
471 /*
472 * check the bit again under the hash table lock.
473 */
474 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
475 return;
476
477 table = rbio->bioc->fs_info->stripe_hash_table;
478 h = table->table + bucket;
479
480 /* hold the lock for the bucket because we may be
481 * removing it from the hash table
482 */
483 spin_lock(&h->lock);
484
485 /*
486 * hold the lock for the bio list because we need
487 * to make sure the bio list is empty
488 */
489 spin_lock(&rbio->bio_list_lock);
490
491 if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
492 list_del_init(&rbio->stripe_cache);
493 table->cache_size -= 1;
494 freeit = 1;
495
496 /* if the bio list isn't empty, this rbio is
497 * still involved in an IO. We take it out
498 * of the cache list, and drop the ref that
499 * was held for the list.
500 *
501 * If the bio_list was empty, we also remove
502 * the rbio from the hash_table, and drop
503 * the corresponding ref
504 */
505 if (bio_list_empty(&rbio->bio_list)) {
506 if (!list_empty(&rbio->hash_list)) {
507 list_del_init(&rbio->hash_list);
508 refcount_dec(&rbio->refs);
509 BUG_ON(!list_empty(&rbio->plug_list));
510 }
511 }
512 }
513
514 spin_unlock(&rbio->bio_list_lock);
515 spin_unlock(&h->lock);
516
517 if (freeit)
518 free_raid_bio(rbio);
519 }
520
521 /*
522 * prune a given rbio from the cache
523 */
524 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
525 {
526 struct btrfs_stripe_hash_table *table;
527
528 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
529 return;
530
531 table = rbio->bioc->fs_info->stripe_hash_table;
532
533 spin_lock(&table->cache_lock);
534 __remove_rbio_from_cache(rbio);
535 spin_unlock(&table->cache_lock);
536 }
537
538 /*
539 * remove everything in the cache
540 */
541 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
542 {
543 struct btrfs_stripe_hash_table *table;
544 struct btrfs_raid_bio *rbio;
545
546 table = info->stripe_hash_table;
547
548 spin_lock(&table->cache_lock);
549 while (!list_empty(&table->stripe_cache)) {
550 rbio = list_first_entry(&table->stripe_cache,
551 struct btrfs_raid_bio, stripe_cache);
552 __remove_rbio_from_cache(rbio);
553 }
554 spin_unlock(&table->cache_lock);
555 }
556
557 /*
558 * remove all cached entries and free the hash table
559 * used by unmount
560 */
561 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
562 {
563 if (!info->stripe_hash_table)
564 return;
565 btrfs_clear_rbio_cache(info);
566 kvfree(info->stripe_hash_table);
567 info->stripe_hash_table = NULL;
568 }
569
570 /*
571 * insert an rbio into the stripe cache. It
572 * must have already been prepared by calling
573 * cache_rbio_pages
574 *
575 * If this rbio was already cached, it gets
576 * moved to the front of the lru.
577 *
578 * If the size of the rbio cache is too big, we
579 * prune an item.
580 */
581 static void cache_rbio(struct btrfs_raid_bio *rbio)
582 {
583 struct btrfs_stripe_hash_table *table;
584
585 if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
586 return;
587
588 table = rbio->bioc->fs_info->stripe_hash_table;
589
590 spin_lock(&table->cache_lock);
591 spin_lock(&rbio->bio_list_lock);
592
593 /* bump our ref if we were not in the list before */
594 if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
595 refcount_inc(&rbio->refs);
596
597 if (!list_empty(&rbio->stripe_cache)) {
598 list_move(&rbio->stripe_cache, &table->stripe_cache);
599 } else {
600 list_add(&rbio->stripe_cache, &table->stripe_cache);
601 table->cache_size += 1;
602 }
603
604 spin_unlock(&rbio->bio_list_lock);
605
606 if (table->cache_size > RBIO_CACHE_SIZE) {
607 struct btrfs_raid_bio *found;
608
609 found = list_last_entry(&table->stripe_cache,
610 struct btrfs_raid_bio,
611 stripe_cache);
612
613 if (found != rbio)
614 __remove_rbio_from_cache(found);
615 }
616
617 spin_unlock(&table->cache_lock);
618 }
619
620 /*
621 * Returns true if the bio list inside this rbio covers an entire stripe (no
622 * rmw required).
623 */
624 static int rbio_is_full(struct btrfs_raid_bio *rbio)
625 {
626 unsigned long size = rbio->bio_list_bytes;
627 int ret = 1;
628
629 spin_lock(&rbio->bio_list_lock);
630 if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
631 ret = 0;
632 BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
633 spin_unlock(&rbio->bio_list_lock);
634
635 return ret;
636 }
637
638 /*
639 * returns 1 if it is safe to merge two rbios together.
640 * The merging is safe if the two rbios correspond to
641 * the same stripe and if they are both going in the same
642 * direction (read vs write), and if neither one is
643 * locked for final IO
644 *
645 * The caller is responsible for locking such that
646 * rmw_locked is safe to test
647 */
648 static int rbio_can_merge(struct btrfs_raid_bio *last,
649 struct btrfs_raid_bio *cur)
650 {
651 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
652 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
653 return 0;
654
655 /*
656 * we can't merge with cached rbios, since the
657 * idea is that when we merge the destination
658 * rbio is going to run our IO for us. We can
659 * steal from cached rbios though, other functions
660 * handle that.
661 */
662 if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
663 test_bit(RBIO_CACHE_BIT, &cur->flags))
664 return 0;
665
666 if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical)
667 return 0;
668
669 /* we can't merge with different operations */
670 if (last->operation != cur->operation)
671 return 0;
672 /*
673 * We need to read the full stripe from the drive,
674 * check and repair the parity and write the new results.
675 *
676 * We're not allowed to add any new bios to the
677 * bio list here, anyone else that wants to
678 * change this stripe needs to do their own rmw.
679 */
680 if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
681 return 0;
682
683 if (last->operation == BTRFS_RBIO_READ_REBUILD)
684 return 0;
685
686 return 1;
687 }
688
689 /* Return the sector index for @stripe_nr and @sector_nr. */
690 static unsigned int rbio_sector_index(const struct btrfs_raid_bio *rbio,
691 unsigned int stripe_nr,
692 unsigned int sector_nr)
693 {
694 unsigned int ret;
695
696 ASSERT_RBIO_STRIPE(stripe_nr < rbio->real_stripes, rbio, stripe_nr);
697 ASSERT_RBIO_SECTOR(sector_nr < rbio->stripe_nsectors, rbio, sector_nr);
698
699 ret = stripe_nr * rbio->stripe_nsectors + sector_nr;
700 ASSERT(ret < rbio->nr_sectors);
701 return ret;
702 }
703
704 /* Return the paddr array index for @stripe_nr, @sector_nr and @step_nr. */
705 static unsigned int rbio_paddr_index(const struct btrfs_raid_bio *rbio,
706 unsigned int stripe_nr,
707 unsigned int sector_nr,
708 unsigned int step_nr)
709 {
710 unsigned int ret;
711
712 ASSERT_RBIO_SECTOR(step_nr < rbio->sector_nsteps, rbio, step_nr);
713
714 ret = rbio_sector_index(rbio, stripe_nr, sector_nr) * rbio->sector_nsteps + step_nr;
715 ASSERT(ret < rbio->nr_sectors * rbio->sector_nsteps);
716 return ret;
717 }
718
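/* Return the physical address of one step of the sector at (@stripe_nr, @sector_nr). */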
719 static phys_addr_t rbio_stripe_paddr(const struct btrfs_raid_bio *rbio,
720 unsigned int stripe_nr, unsigned int sector_nr,
721 unsigned int step_nr)
722 {
723 return rbio->stripe_paddrs[rbio_paddr_index(rbio, stripe_nr, sector_nr, step_nr)];
724 }
725
726 static phys_addr_t rbio_pstripe_paddr(const struct btrfs_raid_bio *rbio,
727 unsigned int sector_nr, unsigned int step_nr)
728 {
729 return rbio_stripe_paddr(rbio, rbio->nr_data, sector_nr, step_nr);
730 }
731
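/* Return INVALID_PADDR if the rbio has no Q stripe (RAID5). */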
732 static phys_addr_t rbio_qstripe_paddr(const struct btrfs_raid_bio *rbio,
733 unsigned int sector_nr, unsigned int step_nr)
734 {
735 if (rbio->nr_data + 1 == rbio->real_stripes)
736 return INVALID_PADDR;
737 return rbio_stripe_paddr(rbio, rbio->nr_data + 1, sector_nr, step_nr);
738 }
739
740 /* Return a paddr pointer into the rbio::stripe_paddrs[] for the specified sector. */
741 static phys_addr_t *rbio_stripe_paddrs(const struct btrfs_raid_bio *rbio,
742 unsigned int stripe_nr, unsigned int sector_nr)
743 {
744 return &rbio->stripe_paddrs[rbio_paddr_index(rbio, stripe_nr, sector_nr, 0)];
745 }
746
747 /*
748 * The first stripe in the table for a logical address
749 * has the lock. rbios are added in one of three ways:
750 *
751 * 1) Nobody has the stripe locked yet. The rbio is given
752 * the lock and 0 is returned. The caller must start the IO
753 * themselves.
754 *
755 * 2) Someone has the stripe locked, but we're able to merge
756 * with the lock owner. The rbio is freed and the IO will
757 * start automatically along with the existing rbio. 1 is returned.
758 *
759 * 3) Someone has the stripe locked, but we're not able to merge.
760 * The rbio is added to the lock owner's plug list, or merged into
761 * an rbio already on the plug list. When the lock owner unlocks,
762 * the next rbio on the list is run and the IO is started automatically.
763 * 1 is returned
764 *
765 * If we return 0, the caller still owns the rbio and must continue with
766 * IO submission. If we return 1, the caller must assume the rbio has
767 * already been freed.
768 */
769 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
770 {
771 struct btrfs_stripe_hash *h;
772 struct btrfs_raid_bio *cur;
773 struct btrfs_raid_bio *pending;
774 struct btrfs_raid_bio *freeit = NULL;
775 struct btrfs_raid_bio *cache_drop = NULL;
776 int ret = 0;
777
778 h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
779
780 spin_lock(&h->lock);
781 list_for_each_entry(cur, &h->hash_list, hash_list) {
782 if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical)
783 continue;
784
785 spin_lock(&cur->bio_list_lock);
786
787 /* Can we steal this cached rbio's pages? */
788 if (bio_list_empty(&cur->bio_list) &&
789 list_empty(&cur->plug_list) &&
790 test_bit(RBIO_CACHE_BIT, &cur->flags) &&
791 !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
792 list_del_init(&cur->hash_list);
793 refcount_dec(&cur->refs);
794
795 steal_rbio(cur, rbio);
796 cache_drop = cur;
797 spin_unlock(&cur->bio_list_lock);
798
799 goto lockit;
800 }
801
802 /* Can we merge into the lock owner? */
803 if (rbio_can_merge(cur, rbio)) {
804 merge_rbio(cur, rbio);
805 spin_unlock(&cur->bio_list_lock);
806 freeit = rbio;
807 ret = 1;
808 goto out;
809 }
810
811
812 /*
813 * We couldn't merge with the running rbio, see if we can merge
814 * with the pending ones. We don't have to check for rmw_locked
815 * because there is no way they are inside finish_rmw right now
816 */
817 list_for_each_entry(pending, &cur->plug_list, plug_list) {
818 if (rbio_can_merge(pending, rbio)) {
819 merge_rbio(pending, rbio);
820 spin_unlock(&cur->bio_list_lock);
821 freeit = rbio;
822 ret = 1;
823 goto out;
824 }
825 }
826
827 /*
828 * No merging, put us on the tail of the plug list, our rbio
829 * will be started when the currently running rbio unlocks
830 */
831 list_add_tail(&rbio->plug_list, &cur->plug_list);
832 spin_unlock(&cur->bio_list_lock);
833 ret = 1;
834 goto out;
835 }
836 lockit:
837 refcount_inc(&rbio->refs);
838 list_add(&rbio->hash_list, &h->hash_list);
839 out:
840 spin_unlock(&h->lock);
841 if (cache_drop)
842 remove_rbio_from_cache(cache_drop);
843 if (freeit)
844 free_raid_bio(freeit);
845 return ret;
846 }
847
848 static void recover_rbio_work_locked(struct work_struct *work);
849
850 /*
851 * called as rmw or parity rebuild is completed. If the plug list has more
852 * rbios waiting for this stripe, the next one on the list will be started
853 */
854 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
855 {
856 int bucket;
857 struct btrfs_stripe_hash *h;
858 int keep_cache = 0;
859
860 bucket = rbio_bucket(rbio);
861 h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;
862
863 if (list_empty(&rbio->plug_list))
864 cache_rbio(rbio);
865
866 spin_lock(&h->lock);
867 spin_lock(&rbio->bio_list_lock);
868
869 if (!list_empty(&rbio->hash_list)) {
870 /*
871 * if we're still cached and there is no other IO
872 * to perform, just leave this rbio here for others
873 * to steal from later
874 */
875 if (list_empty(&rbio->plug_list) &&
876 test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
877 keep_cache = 1;
878 clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
879 BUG_ON(!bio_list_empty(&rbio->bio_list));
880 goto done;
881 }
882
883 list_del_init(&rbio->hash_list);
884 refcount_dec(&rbio->refs);
885
886 /*
887 * we use the plug list to hold all the rbios
888 * waiting for the chance to lock this stripe.
889 * hand the lock over to one of them.
890 */
891 if (!list_empty(&rbio->plug_list)) {
892 struct btrfs_raid_bio *next;
893 struct list_head *head = rbio->plug_list.next;
894
895 next = list_entry(head, struct btrfs_raid_bio,
896 plug_list);
897
898 list_del_init(&rbio->plug_list);
899
900 list_add(&next->hash_list, &h->hash_list);
901 refcount_inc(&next->refs);
902 spin_unlock(&rbio->bio_list_lock);
903 spin_unlock(&h->lock);
904
905 if (next->operation == BTRFS_RBIO_READ_REBUILD) {
906 start_async_work(next, recover_rbio_work_locked);
907 } else if (next->operation == BTRFS_RBIO_WRITE) {
908 steal_rbio(rbio, next);
909 start_async_work(next, rmw_rbio_work_locked);
910 } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
911 steal_rbio(rbio, next);
912 start_async_work(next, scrub_rbio_work_locked);
913 }
914
915 goto done_nolock;
916 }
917 }
918 done:
919 spin_unlock(&rbio->bio_list_lock);
920 spin_unlock(&h->lock);
921
922 done_nolock:
923 if (!keep_cache)
924 remove_rbio_from_cache(rbio);
925 }
926
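/* Walk the bi_next chain starting at @cur, set @status on each bio and end it. */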
927 static void rbio_endio_bio_list(struct bio *cur, blk_status_t status)
928 {
929 struct bio *next;
930
931 while (cur) {
932 next = cur->bi_next;
933 cur->bi_next = NULL;
934 cur->bi_status = status;
935 bio_endio(cur);
936 cur = next;
937 }
938 }
939
940 /*
941 * this frees the rbio and runs through all the bios in the
942 * bio_list and calls end_io on them
943 */
944 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t status)
945 {
946 struct bio *cur = bio_list_get(&rbio->bio_list);
947 struct bio *extra;
948
949 kfree(rbio->csum_buf);
950 bitmap_free(rbio->csum_bitmap);
951 rbio->csum_buf = NULL;
952 rbio->csum_bitmap = NULL;
953
954 /*
955 * Clear the data bitmap, as the rbio may be cached for later usage.
956 * Do this before unlock_stripe() so there will be no new bio
957 * for this rbio.
958 */
959 bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);
960
961 /*
962 * At this moment, rbio->bio_list is empty, however since rbio does not
963 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
964 * hash list, rbio may be merged with others so that rbio->bio_list
965 * becomes non-empty.
966 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
967 * more and we can call bio_endio() on all queued bios.
968 */
969 unlock_stripe(rbio);
970 extra = bio_list_get(&rbio->bio_list);
971 free_raid_bio(rbio);
972
973 rbio_endio_bio_list(cur, status);
974 if (extra)
975 rbio_endio_bio_list(extra, status);
976 }
977
978 /*
979 * Get paddr pointer for the sector specified by its @stripe_nr and @sector_nr.
980 *
981 * @rbio: The raid bio
982 * @stripe_nr: Stripe number, valid range [0, real_stripes)
983 * @sector_nr: Sector number inside the stripe,
984 * valid range [0, stripe_nsectors)
985 * @bio_list_only: Whether to use sectors inside the bio list only.
986 *
987 * The read/modify/write code wants to reuse the original bio page as much
988 * as possible, and only use stripe_sectors as fallback.
989 *
990 * Return NULL if bio_list_only is set but the specified sector has no
991 * corresponding bio.
992 */
993 static phys_addr_t *sector_paddrs_in_rbio(struct btrfs_raid_bio *rbio,
994 int stripe_nr, int sector_nr,
995 bool bio_list_only)
996 {
997 phys_addr_t *ret = NULL;
998 const int index = rbio_paddr_index(rbio, stripe_nr, sector_nr, 0);
999
1000 ASSERT(index >= 0 && index < rbio->nr_sectors * rbio->sector_nsteps);
1001
1002 scoped_guard(spinlock, &rbio->bio_list_lock) {
1003 if (rbio->bio_paddrs[index] != INVALID_PADDR || bio_list_only) {
1004 /* Don't return sector without a valid page pointer */
1005 if (rbio->bio_paddrs[index] != INVALID_PADDR)
1006 ret = &rbio->bio_paddrs[index];
1007 return ret;
1008 }
1009 }
1010 return &rbio->stripe_paddrs[index];
1011 }
1012
1013 /*
1014 * Similar to sector_paddrs_in_rbio(), but with extra consideration for
1015 * bs > ps cases, where we can have multiple steps for a fs block.
1016 */
1017 static phys_addr_t sector_paddr_in_rbio(struct btrfs_raid_bio *rbio,
1018 int stripe_nr, int sector_nr, int step_nr,
1019 bool bio_list_only)
1020 {
1021 phys_addr_t ret = INVALID_PADDR;
1022 const int index = rbio_paddr_index(rbio, stripe_nr, sector_nr, step_nr);
1023
1024 ASSERT(index >= 0 && index < rbio->nr_sectors * rbio->sector_nsteps);
1025
1026 scoped_guard(spinlock, &rbio->bio_list_lock) {
1027 if (rbio->bio_paddrs[index] != INVALID_PADDR || bio_list_only) {
1028 /* Don't return sector without a valid page pointer */
1029 if (rbio->bio_paddrs[index] != INVALID_PADDR)
1030 ret = rbio->bio_paddrs[index];
1031 return ret;
1032 }
1033 }
1034 return rbio->stripe_paddrs[index];
1035 }
1036
1037 /*
1038 * Allocation and initial setup for the btrfs_raid_bio. Note that
1039 * this does not allocate any pages for rbio->stripe_pages.
1040 */
1041 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
1042 struct btrfs_io_context *bioc)
1043 {
1044 const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes;
1045 const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
1046 const unsigned int num_pages = stripe_npages * real_stripes;
1047 const unsigned int stripe_nsectors =
1048 BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
1049 const unsigned int num_sectors = stripe_nsectors * real_stripes;
1050 const unsigned int step = min(fs_info->sectorsize, PAGE_SIZE);
1051 const unsigned int sector_nsteps = fs_info->sectorsize / step;
1052 struct btrfs_raid_bio *rbio;
1053
1054 /*
1055 * For bs <= ps cases, ps must be aligned to bs.
1056 * For bs > ps cases, bs must be aligned to ps.
1057 */
1058 ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize) ||
1059 IS_ALIGNED(fs_info->sectorsize, PAGE_SIZE));
1060 /*
1061 * Our current stripe len should be fixed to 64k thus stripe_nsectors
1062 * (at most 16) should be no larger than BITS_PER_LONG.
1063 */
1064 ASSERT(stripe_nsectors <= BITS_PER_LONG);
1065
1066 /*
1067 * Real stripes must be between 2 (2 disks RAID5, aka RAID1) and 256
1068 * (limited by u8).
1069 */
1070 ASSERT(real_stripes >= 2);
1071 ASSERT(real_stripes <= U8_MAX);
1072
1073 rbio = kzalloc_obj(*rbio, GFP_NOFS);
1074 if (!rbio)
1075 return ERR_PTR(-ENOMEM);
1076 rbio->stripe_pages = kzalloc_objs(struct page *, num_pages, GFP_NOFS);
1077 rbio->bio_paddrs = kzalloc_objs(phys_addr_t,
1078 num_sectors * sector_nsteps, GFP_NOFS);
1079 rbio->stripe_paddrs = kzalloc_objs(phys_addr_t,
1080 num_sectors * sector_nsteps,
1081 GFP_NOFS);
1082 rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS);
1083 rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);
1084 rbio->stripe_uptodate_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);
1085
1086 if (!rbio->stripe_pages || !rbio->bio_paddrs || !rbio->stripe_paddrs ||
1087 !rbio->finish_pointers || !rbio->error_bitmap || !rbio->stripe_uptodate_bitmap) {
1088 free_raid_bio_pointers(rbio);
1089 kfree(rbio);
1090 return ERR_PTR(-ENOMEM);
1091 }
1092 for (int i = 0; i < num_sectors * sector_nsteps; i++) {
1093 rbio->stripe_paddrs[i] = INVALID_PADDR;
1094 rbio->bio_paddrs[i] = INVALID_PADDR;
1095 }
1096
1097 bio_list_init(&rbio->bio_list);
1098 init_waitqueue_head(&rbio->io_wait);
1099 INIT_LIST_HEAD(&rbio->plug_list);
1100 spin_lock_init(&rbio->bio_list_lock);
1101 INIT_LIST_HEAD(&rbio->stripe_cache);
1102 INIT_LIST_HEAD(&rbio->hash_list);
1103 btrfs_get_bioc(bioc);
1104 rbio->bioc = bioc;
1105 rbio->nr_pages = num_pages;
1106 rbio->nr_sectors = num_sectors;
1107 rbio->real_stripes = real_stripes;
1108 rbio->stripe_npages = stripe_npages;
1109 rbio->stripe_nsectors = stripe_nsectors;
1110 rbio->sector_nsteps = sector_nsteps;
1111 refcount_set(&rbio->refs, 1);
1112 atomic_set(&rbio->stripes_pending, 0);
1113
1114 ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
1115 rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);
1116 ASSERT(rbio->nr_data > 0);
1117
1118 return rbio;
1119 }
1120
1121 /* allocate pages for all the stripes in the bio, including parity */
1122 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
1123 {
1124 int ret;
1125
1126 ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, false);
1127 if (ret < 0)
1128 return ret;
1129 /* Mapping all sectors */
1130 index_stripe_sectors(rbio);
1131 return 0;
1132 }
1133
1134 /* only allocate pages for p/q stripes */
1135 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
1136 {
1137 const int data_pages = rbio->nr_data * rbio->stripe_npages;
1138 int ret;
1139
1140 ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
1141 rbio->stripe_pages + data_pages, false);
1142 if (ret < 0)
1143 return ret;
1144
1145 index_stripe_sectors(rbio);
1146 return 0;
1147 }
1148
1149 /*
1150 * Return the total number of errors found in the vertical stripe of @sector_nr.
1151 *
1152 * @faila and @failb will also be updated to the first and second stripe
1153 * number of the errors.
1154 */
1155 static int get_rbio_vertical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
1156 int *faila, int *failb)
1157 {
1158 int stripe_nr;
1159 int found_errors = 0;
1160
1161 if (faila || failb) {
1162 /*
1163 * Both @faila and @failb should be valid pointers if any of
1164 * them is specified.
1165 */
1166 ASSERT(faila && failb);
1167 *faila = -1;
1168 *failb = -1;
1169 }
1170
1171 for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
1172 int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr;
1173
1174 if (test_bit(total_sector_nr, rbio->error_bitmap)) {
1175 found_errors++;
1176 if (faila) {
1177 /* Update faila and failb. */
1178 if (*faila < 0)
1179 *faila = stripe_nr;
1180 else if (*failb < 0)
1181 *failb = stripe_nr;
1182 }
1183 }
1184 }
1185 return found_errors;
1186 }
1187
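/*
 * Add every step of one sector (@nr_steps chunks of @step bytes) to @bio.
 * On partial failure the already added bytes are subtracted again and 0 is
 * returned, so the caller sees an all-or-nothing result.
 */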
1188 static int bio_add_paddrs(struct bio *bio, phys_addr_t *paddrs, unsigned int nr_steps,
1189 unsigned int step)
1190 {
1191 int added = 0;
1192 int ret;
1193
1194 for (int i = 0; i < nr_steps; i++) {
1195 ret = bio_add_page(bio, phys_to_page(paddrs[i]), step,
1196 offset_in_page(paddrs[i]));
1197 if (ret != step)
1198 goto revert;
1199 added += ret;
1200 }
1201 return added;
1202 revert:
1203 /*
1204 * We don't need to revert the bvecs, as the bio will be submitted immediately;
1205 * as long as the size is reduced the extra bvecs will not be accessed.
1206 */
1207 bio->bi_iter.bi_size -= added;
1208 return 0;
1209 }
1210
1211 /*
1212 * Add the single sector specified by @stripe_nr and @sector_nr into our list of bios for IO.
1213 *
1214 * Return 0 if everything went well.
1215 * Return <0 for error, and no byte will be added to @rbio.
1216 */
1217 static int rbio_add_io_paddrs(struct btrfs_raid_bio *rbio, struct bio_list *bio_list,
1218 phys_addr_t *paddrs, unsigned int stripe_nr,
1219 unsigned int sector_nr, enum req_op op)
1220 {
1221 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1222 const u32 step = min(sectorsize, PAGE_SIZE);
1223 struct bio *last = bio_list->tail;
1224 int ret;
1225 struct bio *bio;
1226 struct btrfs_io_stripe *stripe;
1227 u64 disk_start;
1228
1229 /*
1230 * Note: here stripe_nr has taken device replace into consideration,
1231 * thus it can be larger than rbio->real_stripes.
1232 * So here we check against bioc->num_stripes, not rbio->real_stripes.
1233 */
1234 ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes,
1235 rbio, stripe_nr);
1236 ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors,
1237 rbio, sector_nr);
1238 ASSERT(paddrs != NULL);
1239
1240 stripe = &rbio->bioc->stripes[stripe_nr];
1241 disk_start = stripe->physical + sector_nr * sectorsize;
1242
1243 /* if the device is missing, just fail this stripe */
1244 if (!stripe->dev->bdev) {
1245 int found_errors;
1246
1247 set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr,
1248 rbio->error_bitmap);
1249
1250 /* Check if we have reached tolerance early. */
1251 found_errors = get_rbio_vertical_errors(rbio, sector_nr,
1252 NULL, NULL);
1253 if (unlikely(found_errors > rbio->bioc->max_errors))
1254 return -EIO;
1255 return 0;
1256 }
1257
1258 /* see if we can add this page onto our existing bio */
1259 if (last) {
1260 u64 last_end = last->bi_iter.bi_sector << SECTOR_SHIFT;
1261 last_end += last->bi_iter.bi_size;
1262
1263 /*
1264 * we can't merge these if they are from different
1265 * devices or if they are not contiguous
1266 */
1267 if (last_end == disk_start && !last->bi_status &&
1268 last->bi_bdev == stripe->dev->bdev) {
1269 ret = bio_add_paddrs(last, paddrs, rbio->sector_nsteps, step);
1270 if (ret == sectorsize)
1271 return 0;
1272 }
1273 }
1274
1275 /* put a new bio on the list */
1276 bio = bio_alloc(stripe->dev->bdev,
1277 max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1),
1278 op, GFP_NOFS);
1279 bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT;
1280 bio->bi_private = rbio;
1281
1282 ret = bio_add_paddrs(bio, paddrs, rbio->sector_nsteps, step);
1283 ASSERT(ret == sectorsize);
1284 bio_list_add(bio_list, bio);
1285 return 0;
1286 }
1287
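/*
 * Record the physical address of every block of @bio into rbio->bio_paddrs,
 * indexed by the block's offset inside the full stripe.
 */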
1288 static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
1289 {
1290 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1291 const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
1292 const u32 step_bits = min(fs_info->sectorsize_bits, PAGE_SHIFT);
1293 struct bvec_iter iter = bio->bi_iter;
1294 phys_addr_t paddr;
1295 u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1296 rbio->bioc->full_stripe_logical;
1297
1298 btrfs_bio_for_each_block(paddr, bio, &iter, step) {
1299 unsigned int index = (offset >> step_bits);
1300
1301 rbio->bio_paddrs[index] = paddr;
1302 offset += step;
1303 }
1304 }
1305
1306 /*
1307 * helper function to walk our bio list and populate the bio_paddrs array with
1308 * the result. This seems expensive, but it is faster than constantly
1309 * searching through the bio list as we setup the IO in finish_rmw or stripe
1310 * reconstruction.
1311 *
1312 * This must be called before you trust the answers from sector_paddr_in_rbio()
1313 */
1314 static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1315 {
1316 struct bio *bio;
1317
1318 spin_lock(&rbio->bio_list_lock);
1319 bio_list_for_each(bio, &rbio->bio_list)
1320 index_one_bio(rbio, bio);
1321
1322 spin_unlock(&rbio->bio_list_lock);
1323 }
1324
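/*
 * Fill @trace_info (stripe_nr, devid, offset) for @bio by matching its bdev
 * against the stripes of the bioc; -1 values mean no matching stripe was found.
 */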
1325 static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
1326 struct raid56_bio_trace_info *trace_info)
1327 {
1328 const struct btrfs_io_context *bioc = rbio->bioc;
1329 int i;
1330
1331 ASSERT(bioc);
1332
1333 /* We rely on bio->bi_bdev to find the stripe number. */
1334 if (!bio->bi_bdev)
1335 goto not_found;
1336
1337 for (i = 0; i < bioc->num_stripes; i++) {
1338 if (bio->bi_bdev != bioc->stripes[i].dev->bdev)
1339 continue;
1340 trace_info->stripe_nr = i;
1341 trace_info->devid = bioc->stripes[i].dev->devid;
1342 trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1343 bioc->stripes[i].physical;
1344 return;
1345 }
1346
1347 not_found:
1348 trace_info->devid = -1;
1349 trace_info->offset = -1;
1350 trace_info->stripe_nr = -1;
1351 }
1352
1353 static inline void bio_list_put(struct bio_list *bio_list)
1354 {
1355 struct bio *bio;
1356
1357 while ((bio = bio_list_pop(bio_list)))
1358 bio_put(bio);
1359 }
1360
1361 static void assert_rbio(struct btrfs_raid_bio *rbio)
1362 {
1363 if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
1364 return;
1365
1366 /*
1367 * At least two stripes (2 disks RAID5), and since real_stripes is U8,
1368 * we won't go beyond 256 disks anyway.
1369 */
1370 ASSERT_RBIO(rbio->real_stripes >= 2, rbio);
1371 ASSERT_RBIO(rbio->nr_data > 0, rbio);
1372
1373 /*
1374 * This is another check to make sure nr data stripes is smaller
1375 * than total stripes.
1376 */
1377 ASSERT_RBIO(rbio->nr_data < rbio->real_stripes, rbio);
1378 }
1379
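/* Map the page behind @paddr and return a pointer at its in-page offset. */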
1380 static inline void *kmap_local_paddr(phys_addr_t paddr)
1381 {
1382 /* The sector pointer must have a page mapped to it. */
1383 ASSERT(paddr != INVALID_PADDR);
1384
1385 return kmap_local_page(phys_to_page(paddr)) + offset_in_page(paddr);
1386 }
1387
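/*
 * Generate P (and Q for RAID6) for one step of the vertical stripe at
 * @sector_nr, by xoring (or running gen_syndrome over) all the data stripes.
 */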
1388 static void generate_pq_vertical_step(struct btrfs_raid_bio *rbio, unsigned int sector_nr,
1389 unsigned int step_nr)
1390 {
1391 void **pointers = rbio->finish_pointers;
1392 const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);
1393 int stripe;
1394 const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6;
1395
1396 /* First collect one sector from each data stripe */
1397 for (stripe = 0; stripe < rbio->nr_data; stripe++)
1398 pointers[stripe] = kmap_local_paddr(
1399 sector_paddr_in_rbio(rbio, stripe, sector_nr, step_nr, 0));
1400
1401 /* Then add the parity stripe */
1402 pointers[stripe++] = kmap_local_paddr(rbio_pstripe_paddr(rbio, sector_nr, step_nr));
1403
1404 if (has_qstripe) {
1405 /*
1406 * RAID6, add the qstripe and call the library function
1407 * to fill in our p/q
1408 */
1409 pointers[stripe++] = kmap_local_paddr(
1410 rbio_qstripe_paddr(rbio, sector_nr, step_nr));
1411
1412 assert_rbio(rbio);
1413 raid6_call.gen_syndrome(rbio->real_stripes, step, pointers);
1414 } else {
1415 /* raid5 */
1416 memcpy(pointers[rbio->nr_data], pointers[0], step);
1417 xor_gen(pointers[rbio->nr_data], pointers + 1, rbio->nr_data - 1,
1418 step);
1419 }
1420 for (stripe = stripe - 1; stripe >= 0; stripe--)
1421 kunmap_local(pointers[stripe]);
1422 }
1423
1424 /* Generate PQ for one vertical stripe. */
1425 static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
1426 {
1427 const bool has_qstripe = (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6);
1428
1429 for (int i = 0; i < rbio->sector_nsteps; i++)
1430 generate_pq_vertical_step(rbio, sectornr, i);
1431
1432 set_bit(rbio_sector_index(rbio, rbio->nr_data, sectornr),
1433 rbio->stripe_uptodate_bitmap);
1434 if (has_qstripe)
1435 set_bit(rbio_sector_index(rbio, rbio->nr_data + 1, sectornr),
1436 rbio->stripe_uptodate_bitmap);
1437 }
1438
1439 static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
1440 struct bio_list *bio_list)
1441 {
1442 /* The total sector number inside the full stripe. */
1443 int total_sector_nr;
1444 int sectornr;
1445 int stripe;
1446 int ret;
1447
1448 ASSERT(bio_list_size(bio_list) == 0);
1449
1450 /* We should have at least one data sector. */
1451 ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));
1452
1453 /*
1454 * Reset errors, as we may have errors inherited from degraded
1455 * write.
1456 */
1457 bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
1458
1459 /*
1460 * Start assembly. Make bios for everything from the higher layers (the
1461 * bio_list in our rbio) and our P/Q. Ignore everything else.
1462 */
1463 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
1464 total_sector_nr++) {
1465 phys_addr_t *paddrs;
1466
1467 stripe = total_sector_nr / rbio->stripe_nsectors;
1468 sectornr = total_sector_nr % rbio->stripe_nsectors;
1469
1470 /* This vertical stripe has no data, skip it. */
1471 if (!test_bit(sectornr, &rbio->dbitmap))
1472 continue;
1473
1474 if (stripe < rbio->nr_data) {
1475 paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1);
1476 if (paddrs == NULL)
1477 continue;
1478 } else {
1479 paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr);
1480 }
1481
1482 ret = rbio_add_io_paddrs(rbio, bio_list, paddrs, stripe,
1483 sectornr, REQ_OP_WRITE);
1484 if (ret)
1485 goto error;
1486 }
1487
1488 if (likely(!rbio->bioc->replace_nr_stripes))
1489 return 0;
1490
1491 /*
1492 * Make a copy for the replace target device.
1493 *
1494 * Thus the source stripe number (in replace_stripe_src) should be valid.
1495 */
1496 ASSERT(rbio->bioc->replace_stripe_src >= 0);
1497
1498 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
1499 total_sector_nr++) {
1500 phys_addr_t *paddrs;
1501
1502 stripe = total_sector_nr / rbio->stripe_nsectors;
1503 sectornr = total_sector_nr % rbio->stripe_nsectors;
1504
1505 /*
1506 * For RAID56, there is only one device that can be replaced,
1507 * and replace_stripe_src indicates the stripe number we
1508 * need to copy from.
1509 */
1510 if (stripe != rbio->bioc->replace_stripe_src) {
1511 /*
1512 * We can skip the whole stripe completely, note
1513 * total_sector_nr will be increased by one anyway.
1514 */
1515 ASSERT(sectornr == 0);
1516 total_sector_nr += rbio->stripe_nsectors - 1;
1517 continue;
1518 }
1519
1520 /* This vertical stripe has no data, skip it. */
1521 if (!test_bit(sectornr, &rbio->dbitmap))
1522 continue;
1523
1524 if (stripe < rbio->nr_data) {
1525 paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1);
1526 if (paddrs == NULL)
1527 continue;
1528 } else {
1529 paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr);
1530 }
1531
1532 ret = rbio_add_io_paddrs(rbio, bio_list, paddrs,
1533 rbio->real_stripes,
1534 sectornr, REQ_OP_WRITE);
1535 if (ret)
1536 goto error;
1537 }
1538
1539 return 0;
1540 error:
1541 bio_list_put(bio_list);
1542 return -EIO;
1543 }
1544
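/*
 * Mark every sector covered by @bio as errored in rbio->error_bitmap.
 * An empty bio instead marks all sectors of the missing device(s).
 */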
1545 static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio)
1546 {
1547 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1548 u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1549 rbio->bioc->full_stripe_logical;
1550 int total_nr_sector = offset >> fs_info->sectorsize_bits;
1551
1552 ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors);
1553
1554 bitmap_set(rbio->error_bitmap, total_nr_sector,
1555 bio->bi_iter.bi_size >> fs_info->sectorsize_bits);
1556
1557 /*
1558 * Special handling for raid56_alloc_missing_rbio() used by
1559 * scrub/replace. Unlike the call path in raid56_parity_recover(), they
1560 * pass an empty bio here. Thus we have to find out the missing device
1561 * and mark the stripe error instead.
1562 */
1563 if (bio->bi_iter.bi_size == 0) {
1564 bool found_missing = false;
1565 int stripe_nr;
1566
1567 for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
1568 if (!rbio->bioc->stripes[stripe_nr].dev->bdev) {
1569 found_missing = true;
1570 bitmap_set(rbio->error_bitmap,
1571 stripe_nr * rbio->stripe_nsectors,
1572 rbio->stripe_nsectors);
1573 }
1574 }
1575 ASSERT(found_missing);
1576 }
1577 }
1578
1579 /*
1580 * Return the sector number whose rbio->stripe_paddrs[] entry matches @paddr.
1581 *
1582 * Return -1 if not found.
1583 */
1584 static int find_stripe_sector_nr(struct btrfs_raid_bio *rbio, phys_addr_t paddr)
1585 {
1586 for (int i = 0; i < rbio->nr_sectors; i++) {
1587 if (rbio->stripe_paddrs[i * rbio->sector_nsteps] == paddr)
1588 return i;
1589 }
1590 return -1;
1591 }
1592
1593 /*
1594 * this sets each sector covered by the bio uptodate. It should only be used on private
1595 * rbio pages, nothing that comes in from the higher layers
1596 */
1597 static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
1598 {
1599 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1600 const u32 step = min(sectorsize, PAGE_SIZE);
1601 u32 offset = 0;
1602 phys_addr_t paddr;
1603
1604 ASSERT(!bio_flagged(bio, BIO_CLONED));
1605
1606 btrfs_bio_for_each_block_all(paddr, bio, step) {
1607 /* Hitting the first step of a sector. */
1608 if (IS_ALIGNED(offset, sectorsize)) {
1609 int sector_nr = find_stripe_sector_nr(rbio, paddr);
1610
1611 ASSERT(sector_nr >= 0);
1612 if (sector_nr >= 0)
1613 set_bit(sector_nr, rbio->stripe_uptodate_bitmap);
1614 }
1615 offset += step;
1616 }
1617 }
1618
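/*
 * Return the rbio sector number that the first bvec of @bio maps to, by
 * matching its physical address against stripe_paddrs[] and bio_paddrs[].
 */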
1619 static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio)
1620 {
1621 phys_addr_t bvec_paddr = bvec_phys(bio_first_bvec_all(bio));
1622 int i;
1623
1624 for (i = 0; i < rbio->nr_sectors; i++) {
1625 if (rbio->stripe_paddrs[i * rbio->sector_nsteps] == bvec_paddr)
1626 break;
1627 if (rbio->bio_paddrs[i * rbio->sector_nsteps] == bvec_paddr)
1628 break;
1629 }
1630 ASSERT(i < rbio->nr_sectors);
1631 return i;
1632 }
1633
1634 static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio)
1635 {
1636 int total_sector_nr = get_bio_sector_nr(rbio, bio);
1637 const u32 bio_size = bio_get_size(bio);
1638
1639 /*
1640 * Since we can have multiple bios touching the error_bitmap, we cannot
1641 * call bitmap_set() without protection.
1642 *
1643 * Instead use set_bit() for each bit, as set_bit() itself is atomic.
1644 */
1645 for (int i = total_sector_nr; i < total_sector_nr +
1646 (bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++)
1647 set_bit(i, rbio->error_bitmap);
1648 }
1649
1650 /* Verify the data sectors at read time. */
1651 static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio,
1652 struct bio *bio)
1653 {
1654 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1655 const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
1656 const u32 nr_steps = rbio->sector_nsteps;
1657 int total_sector_nr = get_bio_sector_nr(rbio, bio);
1658 u32 offset = 0;
1659 phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
1660 phys_addr_t paddr;
1661
1662 /* No data csum for the whole stripe, no need to verify. */
1663 if (!rbio->csum_bitmap || !rbio->csum_buf)
1664 return;
1665
1666 /* P/Q stripes, they have no data csum to verify against. */
1667 if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors)
1668 return;
1669
1670 btrfs_bio_for_each_block_all(paddr, bio, step) {
1671 u8 csum_buf[BTRFS_CSUM_SIZE];
1672 u8 *expected_csum;
1673
1674 paddrs[(offset / step) % nr_steps] = paddr;
1675 offset += step;
1676
1677 /* Not yet covering the full fs block, continue to the next step. */
1678 if (!IS_ALIGNED(offset, fs_info->sectorsize))
1679 continue;
1680
1681 /* No csum for this sector, skip to the next sector. */
1682 if (!test_bit(total_sector_nr, rbio->csum_bitmap))
1683 continue;
1684
1685 expected_csum = rbio->csum_buf + total_sector_nr * fs_info->csum_size;
1686 btrfs_calculate_block_csum_pages(fs_info, paddrs, csum_buf);
1687 if (unlikely(memcmp(csum_buf, expected_csum, fs_info->csum_size) != 0))
1688 set_bit(total_sector_nr, rbio->error_bitmap);
1689 total_sector_nr++;
1690 }
1691 }
1692
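/*
 * Endio for the synchronous read path: record errors or mark the sectors
 * uptodate and verify their csums, then wake the waiter on the last bio.
 */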
1693 static void raid_wait_read_end_io(struct bio *bio)
1694 {
1695 struct btrfs_raid_bio *rbio = bio->bi_private;
1696
1697 if (bio->bi_status) {
1698 rbio_update_error_bitmap(rbio, bio);
1699 } else {
1700 set_bio_pages_uptodate(rbio, bio);
1701 verify_bio_data_sectors(rbio, bio);
1702 }
1703
1704 bio_put(bio);
1705 if (atomic_dec_and_test(&rbio->stripes_pending))
1706 wake_up(&rbio->io_wait);
1707 }
1708
1709 static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio,
1710 struct bio_list *bio_list)
1711 {
1712 struct bio *bio;
1713
1714 atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
1715 while ((bio = bio_list_pop(bio_list))) {
1716 bio->bi_end_io = raid_wait_read_end_io;
1717
1718 if (trace_raid56_read_enabled()) {
1719 struct raid56_bio_trace_info trace_info = { 0 };
1720
1721 bio_get_trace_info(rbio, bio, &trace_info);
1722 trace_call__raid56_read(rbio, bio, &trace_info);
1723 }
1724 submit_bio(bio);
1725 }
1726
1727 wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
1728 }
1729
1730 static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
1731 {
1732 const int data_pages = rbio->nr_data * rbio->stripe_npages;
1733 int ret;
1734
1735 ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, false);
1736 if (ret < 0)
1737 return ret;
1738
1739 index_stripe_sectors(rbio);
1740 return 0;
1741 }
1742
1743 /*
1744 * We use plugging callbacks to collect full stripes.
1745 * Any time we get a partial stripe write while plugged,
1746 * we collect it into a list. When the unplug comes down,
1747 * we sort the list by logical block number and merge
1748 * everything we can into the same rbios.
1749 */
1750 struct btrfs_plug_cb {
1751 struct blk_plug_cb cb;
1752 struct btrfs_fs_info *info;
1753 struct list_head rbio_list;
1754 };
1755
1756 /*
1757 * rbios on the plug list are sorted for easier merging.
1758 */
1759 static int plug_cmp(void *priv, const struct list_head *a,
1760 const struct list_head *b)
1761 {
1762 const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1763 plug_list);
1764 const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1765 plug_list);
1766 u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1767 u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
1768
1769 if (a_sector < b_sector)
1770 return -1;
1771 if (a_sector > b_sector)
1772 return 1;
1773 return 0;
1774 }
1775
1776 static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1777 {
1778 struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb);
1779 struct btrfs_raid_bio *cur;
1780 struct btrfs_raid_bio *last = NULL;
1781
1782 list_sort(NULL, &plug->rbio_list, plug_cmp);
1783
1784 while (!list_empty(&plug->rbio_list)) {
1785 cur = list_first_entry(&plug->rbio_list,
1786 struct btrfs_raid_bio, plug_list);
1787 list_del_init(&cur->plug_list);
1788
1789 if (rbio_is_full(cur)) {
1790 /* We have a full stripe, queue it down. */
1791 start_async_work(cur, rmw_rbio_work);
1792 continue;
1793 }
1794 if (last) {
1795 if (rbio_can_merge(last, cur)) {
1796 merge_rbio(last, cur);
1797 free_raid_bio(cur);
1798 continue;
1799 }
1800 start_async_work(last, rmw_rbio_work);
1801 }
1802 last = cur;
1803 }
1804 if (last)
1805 start_async_work(last, rmw_rbio_work);
1806 kfree(plug);
1807 }
1808
1809 /* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
1810 static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
1811 {
1812 const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1813 const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
1814 const u64 full_stripe_start = rbio->bioc->full_stripe_logical;
1815 const u32 orig_len = orig_bio->bi_iter.bi_size;
1816 const u32 sectorsize = fs_info->sectorsize;
1817 u64 cur_logical;
1818
1819 ASSERT_RBIO_LOGICAL(orig_logical >= full_stripe_start &&
1820 orig_logical + orig_len <= full_stripe_start +
1821 rbio->nr_data * BTRFS_STRIPE_LEN,
1822 rbio, orig_logical);
1823
1824 bio_list_add(&rbio->bio_list, orig_bio);
1825 rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
1826
1827 /* Update the dbitmap. */
1828 for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
1829 cur_logical += sectorsize) {
1830 int bit = ((u32)(cur_logical - full_stripe_start) >>
1831 fs_info->sectorsize_bits) % rbio->stripe_nsectors;
1832
1833 set_bit(bit, &rbio->dbitmap);
1834 }
1835 }
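/*
 * A worked example of the bit calculation above, with made-up but typical
 * values: a 4K sectorsize (sectorsize_bits = 12) and a 64K BTRFS_STRIPE_LEN
 * give stripe_nsectors = 16. A bio starting 68K past full_stripe_start
 * yields (68K >> 12) = 17, and 17 % 16 = 1, so bit 1 of dbitmap is set:
 * the range covers the second sector position of the vertical stripe, no
 * matter which data stripe it physically lands on.
 */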
1836
1837 /*
1838 * our main entry point for writes from the rest of the FS.
1839 */
1840 void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
1841 {
1842 struct btrfs_fs_info *fs_info = bioc->fs_info;
1843 struct btrfs_raid_bio *rbio;
1844 struct btrfs_plug_cb *plug = NULL;
1845 struct blk_plug_cb *cb;
1846
1847 rbio = alloc_rbio(fs_info, bioc);
1848 if (IS_ERR(rbio)) {
1849 bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
1850 bio_endio(bio);
1851 return;
1852 }
1853 rbio->operation = BTRFS_RBIO_WRITE;
1854 rbio_add_bio(rbio, bio);
1855
1856 /*
1857 * Don't plug on full rbios, just get them out the door
1858 * as quickly as we can
1859 */
1860 if (!rbio_is_full(rbio)) {
1861 cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug));
1862 if (cb) {
1863 plug = container_of(cb, struct btrfs_plug_cb, cb);
1864 if (!plug->info) {
1865 plug->info = fs_info;
1866 INIT_LIST_HEAD(&plug->rbio_list);
1867 }
1868 list_add_tail(&rbio->plug_list, &plug->rbio_list);
1869 return;
1870 }
1871 }
1872
1873 /*
1874 * Either we don't have any existing plug, or we're doing a full stripe,
1875 * queue the rmw work now.
1876 */
1877 start_async_work(rbio, rmw_rbio_work);
1878 }
1879
1880 static int verify_one_sector(struct btrfs_raid_bio *rbio,
1881 int stripe_nr, int sector_nr)
1882 {
1883 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1884 phys_addr_t *paddrs;
1885 u8 csum_buf[BTRFS_CSUM_SIZE];
1886 u8 *csum_expected;
1887
1888 if (!rbio->csum_bitmap || !rbio->csum_buf)
1889 return 0;
1890
1891 /* No way to verify P/Q as they are not covered by data csum. */
1892 if (stripe_nr >= rbio->nr_data)
1893 return 0;
1894 /*
1895 * If we're rebuilding a read, we have to use pages from the
1896 * bio list if possible.
1897 */
1898 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
1899 paddrs = sector_paddrs_in_rbio(rbio, stripe_nr, sector_nr, 0);
1900 } else {
1901 paddrs = rbio_stripe_paddrs(rbio, stripe_nr, sector_nr);
1902 }
1903
1904 csum_expected = rbio->csum_buf +
1905 (stripe_nr * rbio->stripe_nsectors + sector_nr) *
1906 fs_info->csum_size;
1907 btrfs_calculate_block_csum_pages(fs_info, paddrs, csum_buf);
1908 if (unlikely(memcmp(csum_buf, csum_expected, fs_info->csum_size) != 0))
1909 return -EIO;
1910 return 0;
1911 }
1912
1913 static void recover_vertical_step(struct btrfs_raid_bio *rbio,
1914 unsigned int sector_nr,
1915 unsigned int step_nr,
1916 int faila, int failb,
1917 void **pointers, void **unmap_array)
1918 {
1919 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1920 const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
1921 int stripe_nr;
1922
1923 ASSERT(step_nr < rbio->sector_nsteps);
1924 ASSERT(sector_nr < rbio->stripe_nsectors);
1925
1926 /*
1927 * Setup our array of pointers with sectors from each stripe
1928 *
1929 * NOTE: store a duplicate array of pointers to preserve the
1930 * pointer order.
1931 */
1932 for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
1933 phys_addr_t paddr;
1934
1935 /*
1936 * If we're rebuilding a read, we have to use pages from the
1937 * bio list if possible.
1938 */
1939 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
1940 paddr = sector_paddr_in_rbio(rbio, stripe_nr, sector_nr, step_nr, 0);
1941 } else {
1942 paddr = rbio_stripe_paddr(rbio, stripe_nr, sector_nr, step_nr);
1943 }
1944 pointers[stripe_nr] = kmap_local_paddr(paddr);
1945 unmap_array[stripe_nr] = pointers[stripe_nr];
1946 }
1947
1948 /* All raid6 handling here */
1949 if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
1950 /* Single failure, rebuild from parity raid5 style */
1951 if (failb < 0) {
1952 if (faila == rbio->nr_data)
1953 /*
1954 * Just the P stripe has failed, without
1955 * a bad data or Q stripe.
1956 * We have nothing to do, just skip the
1957 * recovery for this stripe.
1958 */
1959 goto cleanup;
1960 /*
1961 * a single failure in raid6 is rebuilt
1962 * in the pstripe code below
1963 */
1964 goto pstripe;
1965 }
1966
1967 /*
1968 * If the Q stripe has failed, do a pstripe reconstruction from
1969 * the xors.
1970 * If both the Q stripe and the P stripe have failed, we're
1971 * here due to a crc mismatch and we can't give them the
1972 * data they want.
1973 */
1974 if (failb == rbio->real_stripes - 1) {
1975 if (faila == rbio->real_stripes - 2)
1976 /*
1977 * Only P and Q are corrupted.
1978 * We only care about data stripes recovery,
1979 * can skip this vertical stripe.
1980 */
1981 goto cleanup;
1982 /*
1983 * Otherwise we have one bad data stripe and
1984 * a good P stripe. raid5!
1985 */
1986 goto pstripe;
1987 }
1988
1989 if (failb == rbio->real_stripes - 2) {
1990 raid6_datap_recov(rbio->real_stripes, step,
1991 faila, pointers);
1992 } else {
1993 raid6_2data_recov(rbio->real_stripes, step,
1994 faila, failb, pointers);
1995 }
1996 } else {
1997 void *p;
1998
1999 /* Rebuild from P stripe here (raid5 or raid6). */
2000 ASSERT(failb == -1);
2001 pstripe:
2002 /* Copy parity block into failed block to start with */
2003 memcpy(pointers[faila], pointers[rbio->nr_data], step);
2004
2005 /* Rearrange the pointer array */
2006 p = pointers[faila];
2007 for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1;
2008 stripe_nr++)
2009 pointers[stripe_nr] = pointers[stripe_nr + 1];
2010 pointers[rbio->nr_data - 1] = p;
2011
2012 /* Xor in the rest */
2013 xor_gen(p, pointers, rbio->nr_data - 1, step);
2014 }
2015
2016 cleanup:
2017 for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--)
2018 kunmap_local(unmap_array[stripe_nr]);
2019 }
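/*
 * Illustrative sketch only (not part of this driver, never compiled):
 * the raid5/pstripe path above boils down to XOR-ing every surviving
 * block (remaining data plus P) into a copy of one of them, which
 * reproduces the missing block. A minimal standalone version of that
 * idea, with made-up names and assuming <string.h>/<stddef.h>, could
 * look like this:
 */
#if 0	/* example only, never built */
static void raid5_rebuild_block(unsigned char *missing,
				unsigned char * const *survivors,
				int nr_survivors, size_t blocksize)
{
	/* Start from the first surviving block ... */
	memcpy(missing, survivors[0], blocksize);
	/* ... and XOR every other surviving block (including P) into it. */
	for (int i = 1; i < nr_survivors; i++)
		for (size_t off = 0; off < blocksize; off++)
			missing[off] ^= survivors[i][off];
}
#endif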
2020
2021 /*
2022 * Recover a vertical stripe specified by @sector_nr.
2023 * @*pointers are the pre-allocated pointers by the caller, so we don't
2024 * need to allocate/free the pointers again and again.
2025 */
2026 static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
2027 void **pointers, void **unmap_array)
2028 {
2029 int found_errors;
2030 int faila;
2031 int failb;
2032 int ret = 0;
2033
2034 /*
2035 * Now we just use the bitmap to mark the horizontal stripes in
2036 * which we have data when doing parity scrub.
2037 */
2038 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
2039 !test_bit(sector_nr, &rbio->dbitmap))
2040 return 0;
2041
2042 found_errors = get_rbio_vertical_errors(rbio, sector_nr, &faila,
2043 &failb);
2044 /*
2045 * No errors in the vertical stripe, skip it. Can happen for recovery
2046 * where only part of a stripe failed the csum check.
2047 */
2048 if (!found_errors)
2049 return 0;
2050
2051 if (unlikely(found_errors > rbio->bioc->max_errors))
2052 return -EIO;
2053
2054 for (int i = 0; i < rbio->sector_nsteps; i++)
2055 recover_vertical_step(rbio, sector_nr, i, faila, failb,
2056 pointers, unmap_array);
2057 if (faila >= 0) {
2058 ret = verify_one_sector(rbio, faila, sector_nr);
2059 if (ret < 0)
2060 return ret;
2061
2062 set_bit(rbio_sector_index(rbio, faila, sector_nr),
2063 rbio->stripe_uptodate_bitmap);
2064 }
2065 if (failb >= 0) {
2066 ret = verify_one_sector(rbio, failb, sector_nr);
2067 if (ret < 0)
2068 return ret;
2069
2070 set_bit(rbio_sector_index(rbio, failb, sector_nr),
2071 rbio->stripe_uptodate_bitmap);
2072 }
2073 return ret;
2074 }
2075
2076 static int recover_sectors(struct btrfs_raid_bio *rbio)
2077 {
2078 void **pointers = NULL;
2079 void **unmap_array = NULL;
2080 int sectornr;
2081 int ret = 0;
2082
2083 /*
2084 * @pointers array stores the pointer for each sector.
2085 *
2086 * @unmap_array stores copy of pointers that does not get reordered
2087 * during reconstruction so that kunmap_local works.
2088 */
2089 pointers = kzalloc_objs(void *, rbio->real_stripes, GFP_NOFS);
2090 unmap_array = kzalloc_objs(void *, rbio->real_stripes, GFP_NOFS);
2091 if (!pointers || !unmap_array) {
2092 ret = -ENOMEM;
2093 goto out;
2094 }
2095
2096 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
2097 spin_lock(&rbio->bio_list_lock);
2098 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
2099 spin_unlock(&rbio->bio_list_lock);
2100 }
2101
2102 index_rbio_pages(rbio);
2103
2104 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
2105 ret = recover_vertical(rbio, sectornr, pointers, unmap_array);
2106 if (ret < 0)
2107 break;
2108 }
2109
2110 out:
2111 kfree(pointers);
2112 kfree(unmap_array);
2113 return ret;
2114 }
2115
2116 static void recover_rbio(struct btrfs_raid_bio *rbio)
2117 {
2118 struct bio_list bio_list = BIO_EMPTY_LIST;
2119 int total_sector_nr;
2120 int ret = 0;
2121
2122 /*
2123 * Whether we're doing recovery for a read failure or a degraded write,
2124 * the caller should have set the error bitmap correctly.
2125 */
2126 ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors));
2127
2128 /* For recovery, we need to read all sectors including P/Q. */
2129 ret = alloc_rbio_pages(rbio);
2130 if (ret < 0)
2131 goto out;
2132
2133 index_rbio_pages(rbio);
2134
2135 /*
2136 * Read everything that hasn't failed. However this time we will
2137 * not trust any cached sector.
2138 * The cache may contain stale data in ranges that the higher layer
2139 * is not reading, so it can not be trusted here.
2140 *
2141 * So here we always re-read everything in the recovery path.
2142 */
2143 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2144 total_sector_nr++) {
2145 int stripe = total_sector_nr / rbio->stripe_nsectors;
2146 int sectornr = total_sector_nr % rbio->stripe_nsectors;
2147 phys_addr_t *paddrs;
2148
2149 /*
2150 * Skip the range which has an error. It can be a range which is
2151 * marked error (for csum mismatch), or it can be a missing
2152 * device.
2153 */
2154 if (!rbio->bioc->stripes[stripe].dev->bdev ||
2155 test_bit(total_sector_nr, rbio->error_bitmap)) {
2156 /*
2157 * Also set the error bit for missing device, which
2158 * may not yet have its error bit set.
2159 */
2160 set_bit(total_sector_nr, rbio->error_bitmap);
2161 continue;
2162 }
2163
2164 paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr);
2165 ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe,
2166 sectornr, REQ_OP_READ);
2167 if (ret < 0) {
2168 bio_list_put(&bio_list);
2169 goto out;
2170 }
2171 }
2172
2173 submit_read_wait_bio_list(rbio, &bio_list);
2174 ret = recover_sectors(rbio);
2175 out:
2176 rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2177 }
2178
2179 static void recover_rbio_work(struct work_struct *work)
2180 {
2181 struct btrfs_raid_bio *rbio;
2182
2183 rbio = container_of(work, struct btrfs_raid_bio, work);
2184 if (!lock_stripe_add(rbio))
2185 recover_rbio(rbio);
2186 }
2187
2188 static void recover_rbio_work_locked(struct work_struct *work)
2189 {
2190 recover_rbio(container_of(work, struct btrfs_raid_bio, work));
2191 }
2192
2193 static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num)
2194 {
2195 bool found = false;
2196 int sector_nr;
2197
2198 /*
2199 * This is for RAID6 extra recovery tries, thus mirror number should
2200 * be larger than 2.
2201 * Mirror 1 means read from data stripes. Mirror 2 means rebuild using
2202 * RAID5 methods.
2203 */
2204 ASSERT(mirror_num > 2);
2205 for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2206 int found_errors;
2207 int faila;
2208 int failb;
2209
2210 found_errors = get_rbio_vertical_errors(rbio, sector_nr,
2211 &faila, &failb);
2212 /* This vertical stripe doesn't have errors. */
2213 if (!found_errors)
2214 continue;
2215
2216 /*
2217 * If we found errors, there should be only one error marked
2218 * by previous set_rbio_range_error().
2219 */
2220 ASSERT(found_errors == 1);
2221 found = true;
2222
2223 /* Now select another stripe to mark as error. */
2224 failb = rbio->real_stripes - (mirror_num - 1);
2225 if (failb <= faila)
2226 failb--;
2227
2228 /* Set the extra bit in error bitmap. */
2229 if (failb >= 0)
2230 set_bit(failb * rbio->stripe_nsectors + sector_nr,
2231 rbio->error_bitmap);
2232 }
2233
2234 /* We should have found at least one vertical stripe with an error. */
2235 ASSERT(found);
2236 }
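/*
 * A worked example of the selection above, assuming (for illustration) a
 * RAID6 layout with 4 data stripes, so real_stripes = 6 (stripes 0-3 are
 * data, 4 is P, 5 is Q), and faila = 1 from set_rbio_range_error():
 *
 *   mirror_num = 3: failb = 6 - 2 = 4, P is also marked bad, forcing
 *                   recovery of stripe 1 from the remaining data + Q.
 *   mirror_num = 4: failb = 6 - 3 = 3, data stripe 3 is also marked bad,
 *                   forcing a two-data recovery from P + Q.
 *   mirror_num = 6: failb = 6 - 5 = 1 == faila, so failb is decremented
 *                   to 0 and data stripe 0 is marked bad instead.
 *
 * Each retry therefore pairs the original failure with a different second
 * "failure", trying a different reconstruction every time.
 */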
2237
2238 /*
2239 * the main entry point for reads from the higher layers. This
2240 * is really only called when the normal read path had a failure,
2241 * so we assume the bio they send down corresponds to a failed part
2242 * of the drive.
2243 */
2244 void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
2245 int mirror_num)
2246 {
2247 struct btrfs_fs_info *fs_info = bioc->fs_info;
2248 struct btrfs_raid_bio *rbio;
2249
2250 rbio = alloc_rbio(fs_info, bioc);
2251 if (IS_ERR(rbio)) {
2252 bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
2253 bio_endio(bio);
2254 return;
2255 }
2256
2257 rbio->operation = BTRFS_RBIO_READ_REBUILD;
2258 rbio_add_bio(rbio, bio);
2259
2260 set_rbio_range_error(rbio, bio);
2261
2262 /*
2263 * Loop retry:
2264 * for 'mirror_num == 2', reconstruct from all other stripes.
2265 * for 'mirror_num > 2', select a stripe to fail on every retry.
2266 */
2267 if (mirror_num > 2)
2268 set_rbio_raid6_extra_error(rbio, mirror_num);
2269
2270 start_async_work(rbio, recover_rbio_work);
2271 }
2272
2273 static void fill_data_csums(struct btrfs_raid_bio *rbio)
2274 {
2275 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
2276 struct btrfs_root *csum_root;
2277 const u64 start = rbio->bioc->full_stripe_logical;
2278 const u32 len = (rbio->nr_data * rbio->stripe_nsectors) <<
2279 fs_info->sectorsize_bits;
2280 int ret;
2281
2282 /* The rbio should not have its csum buffer initialized. */
2283 ASSERT(!rbio->csum_buf && !rbio->csum_bitmap);
2284
2285 /*
2286 * Skip the csum search if:
2287 *
2288 * - The rbio doesn't belong to data block groups
2289 * Then we are doing IO for tree blocks, no need to search csums.
2290 *
2291 * - The rbio belongs to mixed block groups
2292 * This is to avoid deadlock, as we're already holding the full
2293 * stripe lock, if we trigger a metadata read, and it needs to do
2294 * raid56 recovery, we will deadlock.
2295 */
2296 if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) ||
2297 rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA)
2298 return;
2299
2300 rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors *
2301 fs_info->csum_size, GFP_NOFS);
2302 rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors,
2303 GFP_NOFS);
2304 if (!rbio->csum_buf || !rbio->csum_bitmap) {
2305 ret = -ENOMEM;
2306 goto error;
2307 }
2308
2309 csum_root = btrfs_csum_root(fs_info, rbio->bioc->full_stripe_logical);
2310 if (unlikely(!csum_root)) {
2311 btrfs_err(fs_info,
2312 "missing csum root for extent at bytenr %llu",
2313 rbio->bioc->full_stripe_logical);
2314 ret = -EUCLEAN;
2315 goto error;
2316 }
2317
2318 ret = btrfs_lookup_csums_bitmap(csum_root, NULL, start, start + len - 1,
2319 rbio->csum_buf, rbio->csum_bitmap);
2320 if (ret < 0)
2321 goto error;
2322 if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits))
2323 goto no_csum;
2324 return;
2325
2326 error:
2327 /*
2328 * We failed to allocate memory or grab the csum, but it's not fatal,
2329 * we can still continue. But better to warn users that RMW is no
2330 * longer safe for this particular sub-stripe write.
2331 */
2332 btrfs_warn_rl(fs_info,
2333 "sub-stripe write for full stripe %llu is not safe, failed to get csum: %d",
2334 rbio->bioc->full_stripe_logical, ret);
2335 no_csum:
2336 kfree(rbio->csum_buf);
2337 bitmap_free(rbio->csum_bitmap);
2338 rbio->csum_buf = NULL;
2339 rbio->csum_bitmap = NULL;
2340 }
2341
2342 static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio)
2343 {
2344 struct bio_list bio_list = BIO_EMPTY_LIST;
2345 int total_sector_nr;
2346 int ret = 0;
2347
2348 /*
2349 * Fill the data csums we need for data verification. We need to fill
2350 * the csum_bitmap/csum_buf first, as our endio function will try to
2351 * verify the data sectors.
2352 */
2353 fill_data_csums(rbio);
2354
2355 /*
2356 * Build a list of bios to read all sectors (including data and P/Q).
2357 *
2358 * This behavior is to compensate for the later csum verification and recovery.
2359 */
2360 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2361 total_sector_nr++) {
2362 int stripe = total_sector_nr / rbio->stripe_nsectors;
2363 int sectornr = total_sector_nr % rbio->stripe_nsectors;
2364 phys_addr_t *paddrs;
2365
2366 paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr);
2367 ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe,
2368 sectornr, REQ_OP_READ);
2369 if (ret) {
2370 bio_list_put(&bio_list);
2371 return ret;
2372 }
2373 }
2374
2375 /*
2376 * We may or may not have any corrupted sectors (including missing dev
2377 * and csum mismatch), just let recover_sectors() handle them all.
2378 */
2379 submit_read_wait_bio_list(rbio, &bio_list);
2380 return recover_sectors(rbio);
2381 }
2382
2383 static void raid_wait_write_end_io(struct bio *bio)
2384 {
2385 struct btrfs_raid_bio *rbio = bio->bi_private;
2386
2387 if (bio->bi_status)
2388 rbio_update_error_bitmap(rbio, bio);
2389 bio_put(bio);
2390 if (atomic_dec_and_test(&rbio->stripes_pending))
2391 wake_up(&rbio->io_wait);
2392 }
2393
2394 static void submit_write_bios(struct btrfs_raid_bio *rbio,
2395 struct bio_list *bio_list)
2396 {
2397 struct bio *bio;
2398
2399 atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
2400 while ((bio = bio_list_pop(bio_list))) {
2401 bio->bi_end_io = raid_wait_write_end_io;
2402
2403 if (trace_raid56_write_enabled()) {
2404 struct raid56_bio_trace_info trace_info = { 0 };
2405
2406 bio_get_trace_info(rbio, bio, &trace_info);
2407 trace_call__raid56_write(rbio, bio, &trace_info);
2408 }
2409 submit_bio(bio);
2410 }
2411 }
2412
2413 /*
2414 * Determine whether we need to read any sector from the disk.
2415 * Should only be used in the RMW path, to skip a cached rbio.
2416 */
2417 static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio)
2418 {
2419 int i;
2420
2421 for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) {
2422 phys_addr_t paddr = rbio->stripe_paddrs[i * rbio->sector_nsteps];
2423
2424 /*
2425 * We have a sector which doesn't have a page and is not uptodate,
2426 * thus this rbio can not be a cached one, as a cached one must
2427 * have all its data sectors present and uptodate.
2428 */
2429 if (paddr == INVALID_PADDR ||
2430 !test_bit(i, rbio->stripe_uptodate_bitmap))
2431 return true;
2432 }
2433 return false;
2434 }
2435
2436 static void rmw_rbio(struct btrfs_raid_bio *rbio)
2437 {
2438 struct bio_list bio_list;
2439 int sectornr;
2440 int ret = 0;
2441
2442 /*
2443 * Allocate the pages for parity first, as P/Q pages will always be
2444 * needed for both full-stripe and sub-stripe writes.
2445 */
2446 ret = alloc_rbio_parity_pages(rbio);
2447 if (ret < 0)
2448 goto out;
2449
2450 /*
2451 * For a full stripe write, or when we have every data sector already
2452 * cached, we can go to the write path immediately.
2453 */
2454 if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) {
2455 /*
2456 * Now we're doing a sub-stripe write, so we also need all data
2457 * stripes to do the full RMW.
2458 */
2459 ret = alloc_rbio_data_pages(rbio);
2460 if (ret < 0)
2461 goto out;
2462
2463 index_rbio_pages(rbio);
2464
2465 ret = rmw_read_wait_recover(rbio);
2466 if (ret < 0)
2467 goto out;
2468 }
2469
2470 /*
2471 * At this stage we're not allowed to add any new bios to the
2472 * bio list any more; anyone else that wants to change this stripe
2473 * needs to do their own rmw.
2474 */
2475 spin_lock(&rbio->bio_list_lock);
2476 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
2477 spin_unlock(&rbio->bio_list_lock);
2478
2479 bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
2480
2481 index_rbio_pages(rbio);
2482
2483 /*
2484 * We don't cache full rbios because we're assuming
2485 * the higher layers are unlikely to use this area of
2486 * the disk again soon. If they do use it again,
2487 * hopefully they will send another full bio.
2488 */
2489 if (!rbio_is_full(rbio))
2490 cache_rbio_pages(rbio);
2491 else
2492 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2493
2494 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++)
2495 generate_pq_vertical(rbio, sectornr);
2496
2497 bio_list_init(&bio_list);
2498 ret = rmw_assemble_write_bios(rbio, &bio_list);
2499 if (ret < 0)
2500 goto out;
2501
2502 /* We should have at least one bio assembled. */
2503 ASSERT(bio_list_size(&bio_list));
2504 submit_write_bios(rbio, &bio_list);
2505 wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
2506
2507 /* We may have more errors than our tolerance during the read. */
2508 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
2509 int found_errors;
2510
2511 found_errors = get_rbio_vertical_errors(rbio, sectornr, NULL, NULL);
2512 if (unlikely(found_errors > rbio->bioc->max_errors)) {
2513 ret = -EIO;
2514 break;
2515 }
2516 }
2517 out:
2518 rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2519 }
2520
2521 static void rmw_rbio_work(struct work_struct *work)
2522 {
2523 struct btrfs_raid_bio *rbio;
2524
2525 rbio = container_of(work, struct btrfs_raid_bio, work);
2526 if (lock_stripe_add(rbio) == 0)
2527 rmw_rbio(rbio);
2528 }
2529
2530 static void rmw_rbio_work_locked(struct work_struct *work)
2531 {
2532 rmw_rbio(container_of(work, struct btrfs_raid_bio, work));
2533 }
2534
2535 /*
2536 * The following code is used to scrub/replace the parity stripe
2537 *
2538 * Caller must have already increased bio_counter for getting @bioc.
2539 *
2540 * Note: We need to make sure all the pages added into the scrub/replace
2541 * raid bio are correct and are not changed during the scrub/replace.
2542 * That is, those pages just hold metadata or file data with checksums.
2543 */
2544
2545 struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
2546 struct btrfs_io_context *bioc,
2547 struct btrfs_device *scrub_dev,
2548 unsigned long *dbitmap, int stripe_nsectors)
2549 {
2550 struct btrfs_fs_info *fs_info = bioc->fs_info;
2551 struct btrfs_raid_bio *rbio;
2552 int i;
2553
2554 rbio = alloc_rbio(fs_info, bioc);
2555 if (IS_ERR(rbio))
2556 return NULL;
2557 bio_list_add(&rbio->bio_list, bio);
2558 /*
2559 * This is a special bio which is used to hold the completion handler
2560 * and make the scrub rbio similar to the other types.
2561 */
2562 ASSERT(!bio->bi_iter.bi_size);
2563 rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2564
2565 /*
2566 * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
2567 * to the end position, so this search can start from the first parity
2568 * stripe.
2569 */
2570 for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
2571 if (bioc->stripes[i].dev == scrub_dev) {
2572 rbio->scrubp = i;
2573 break;
2574 }
2575 }
2576 ASSERT_RBIO_STRIPE(i < rbio->real_stripes, rbio, i);
2577
2578 bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
2579 return rbio;
2580 }
2581
2582 static int alloc_rbio_sector_pages(struct btrfs_raid_bio *rbio,
2583 int sector_nr)
2584 {
2585 const u32 step = min(PAGE_SIZE, rbio->bioc->fs_info->sectorsize);
2586 const u32 base = sector_nr * rbio->sector_nsteps;
2587
2588 for (int i = base; i < base + rbio->sector_nsteps; i++) {
2589 const unsigned int page_index = (i * step) >> PAGE_SHIFT;
2590 struct page *page;
2591
2592 if (rbio->stripe_pages[page_index])
2593 continue;
2594 page = alloc_page(GFP_NOFS);
2595 if (!page)
2596 return -ENOMEM;
2597 rbio->stripe_pages[page_index] = page;
2598 }
2599 return 0;
2600 }
2601
2602 /*
2603 * We only scrub the parity where we have correct data on the same
2604 * horizontal stripe, so we needn't allocate all pages for all the stripes.
2605 */
2606 static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2607 {
2608 int total_sector_nr;
2609
2610 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2611 total_sector_nr++) {
2612 int sectornr = total_sector_nr % rbio->stripe_nsectors;
2613 int ret;
2614
2615 if (!test_bit(sectornr, &rbio->dbitmap))
2616 continue;
2617 ret = alloc_rbio_sector_pages(rbio, total_sector_nr);
2618 if (ret < 0)
2619 return ret;
2620 }
2621 index_stripe_sectors(rbio);
2622 return 0;
2623 }
2624
2625 /* Return true if the content of the step matches the calculated one. */
2626 static bool verify_one_parity_step(struct btrfs_raid_bio *rbio,
2627 void *pointers[], unsigned int sector_nr,
2628 unsigned int step_nr)
2629 {
2630 const unsigned int nr_data = rbio->nr_data;
2631 const bool has_qstripe = (rbio->real_stripes - rbio->nr_data == 2);
2632 const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);
2633 void *parity;
2634 bool ret = false;
2635
2636 ASSERT(step_nr < rbio->sector_nsteps);
2637
2638 /* First collect one page from each data stripe. */
2639 for (int stripe = 0; stripe < nr_data; stripe++)
2640 pointers[stripe] = kmap_local_paddr(
2641 sector_paddr_in_rbio(rbio, stripe, sector_nr,
2642 step_nr, 0));
2643
2644 if (has_qstripe) {
2645 assert_rbio(rbio);
2646 /* RAID6, call the library function to fill in our P/Q. */
2647 raid6_call.gen_syndrome(rbio->real_stripes, step, pointers);
2648 } else {
2649 /* RAID5. */
2650 memcpy(pointers[nr_data], pointers[0], step);
2651 xor_gen(pointers[nr_data], pointers + 1, nr_data - 1, step);
2652 }
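/*
 * E.g. (illustration only) with three data stripes d0, d1 and d2 in the
 * raid5 branch above: the memcpy() leaves a copy of d0 in the scratch
 * buffer and xor_gen() then folds d1 and d2 into it, so the buffer ends
 * up holding d0 ^ d1 ^ d2, the expected P parity for this step.
 */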
2653
2654 /* Check scrubbing parity and repair it. */
2655 parity = kmap_local_paddr(rbio_stripe_paddr(rbio, rbio->scrubp, sector_nr, step_nr));
2656 if (memcmp(parity, pointers[rbio->scrubp], step) != 0)
2657 memcpy(parity, pointers[rbio->scrubp], step);
2658 else
2659 ret = true;
2660 kunmap_local(parity);
2661
2662 for (int stripe = nr_data - 1; stripe >= 0; stripe--)
2663 kunmap_local(pointers[stripe]);
2664 return ret;
2665 }
2666
2667 /*
2668 * The @pointers array should have the P/Q parity already mapped.
2669 */
2670 static void verify_one_parity_sector(struct btrfs_raid_bio *rbio,
2671 void *pointers[], unsigned int sector_nr)
2672 {
2673 bool found_error = false;
2674
2675 for (int step_nr = 0; step_nr < rbio->sector_nsteps; step_nr++) {
2676 bool match;
2677
2678 match = verify_one_parity_step(rbio, pointers, sector_nr, step_nr);
2679 if (!match)
2680 found_error = true;
2681 }
2682 if (!found_error)
2683 bitmap_clear(&rbio->dbitmap, sector_nr, 1);
2684 }
2685
2686 static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
2687 {
2688 struct btrfs_io_context *bioc = rbio->bioc;
2689 void **pointers = rbio->finish_pointers;
2690 unsigned long *pbitmap = &rbio->finish_pbitmap;
2691 int nr_data = rbio->nr_data;
2692 int sectornr;
2693 bool has_qstripe;
2694 struct page *page;
2695 phys_addr_t p_paddr = INVALID_PADDR;
2696 phys_addr_t q_paddr = INVALID_PADDR;
2697 struct bio_list bio_list;
2698 int is_replace = 0;
2699 int ret;
2700
2701 bio_list_init(&bio_list);
2702
2703 if (rbio->real_stripes - rbio->nr_data == 1)
2704 has_qstripe = false;
2705 else if (rbio->real_stripes - rbio->nr_data == 2)
2706 has_qstripe = true;
2707 else
2708 BUG();
2709
2710 /*
2711 * If replace is running and our P/Q stripe is being replaced, then we
2712 * need to duplicate the final write to the replace target.
2713 */
2714 if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) {
2715 is_replace = 1;
2716 bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
2717 }
2718
2719 /*
2720 * The higher layers (the scrubber) are unlikely to
2721 * use this area of the disk again soon, so don't cache
2722 * it.
2723 */
2724 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2725
2726 page = alloc_page(GFP_NOFS);
2727 if (!page)
2728 return -ENOMEM;
2729 p_paddr = page_to_phys(page);
2730 page = NULL;
2731 pointers[nr_data] = kmap_local_paddr(p_paddr);
2732
2733 if (has_qstripe) {
2734 /* RAID6, allocate and map temp space for the Q stripe */
2735 page = alloc_page(GFP_NOFS);
2736 if (!page) {
2737 __free_page(phys_to_page(p_paddr));
2738 p_paddr = INVALID_PADDR;
2739 return -ENOMEM;
2740 }
2741 q_paddr = page_to_phys(page);
2742 page = NULL;
2743 pointers[rbio->real_stripes - 1] = kmap_local_paddr(q_paddr);
2744 }
2745
2746 bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
2747
2748 /* Map the parity stripe just once */
2749
2750 for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors)
2751 verify_one_parity_sector(rbio, pointers, sectornr);
2752
2753 kunmap_local(pointers[nr_data]);
2754 __free_page(phys_to_page(p_paddr));
2755 p_paddr = INVALID_PADDR;
2756 if (q_paddr != INVALID_PADDR) {
2757 __free_page(phys_to_page(q_paddr));
2758 q_paddr = INVALID_PADDR;
2759 }
2760
2761 /*
2762 * time to start writing. Make bios for everything from the
2763 * higher layers (the bio_list in our rbio) and our p/q. Ignore
2764 * everything else.
2765 */
2766 for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
2767 phys_addr_t *paddrs;
2768
2769 paddrs = rbio_stripe_paddrs(rbio, rbio->scrubp, sectornr);
2770 ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, rbio->scrubp,
2771 sectornr, REQ_OP_WRITE);
2772 if (ret)
2773 goto cleanup;
2774 }
2775
2776 if (!is_replace)
2777 goto submit_write;
2778
2779 /*
2780 * Replace is running and our parity stripe needs to be duplicated to
2781 * the target device. Check we have a valid source stripe number.
2782 */
2783 ASSERT_RBIO(rbio->bioc->replace_stripe_src >= 0, rbio);
2784 for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
2785 phys_addr_t *paddrs;
2786
2787 paddrs = rbio_stripe_paddrs(rbio, rbio->scrubp, sectornr);
2788 ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, rbio->real_stripes,
2789 sectornr, REQ_OP_WRITE);
2790 if (ret)
2791 goto cleanup;
2792 }
2793
2794 submit_write:
2795 submit_write_bios(rbio, &bio_list);
2796 return 0;
2797
2798 cleanup:
2799 bio_list_put(&bio_list);
2800 return ret;
2801 }
2802
2803 static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2804 {
2805 if (stripe >= 0 && stripe < rbio->nr_data)
2806 return 1;
2807 return 0;
2808 }
2809
2810 static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
2811 {
2812 void **pointers = NULL;
2813 void **unmap_array = NULL;
2814 int sector_nr;
2815 int ret = 0;
2816
2817 /*
2818 * @pointers array stores the pointer for each sector.
2819 *
2820 * @unmap_array stores copy of pointers that does not get reordered
2821 * during reconstruction so that kunmap_local works.
2822 */
2823 pointers = kzalloc_objs(void *, rbio->real_stripes, GFP_NOFS);
2824 unmap_array = kzalloc_objs(void *, rbio->real_stripes, GFP_NOFS);
2825 if (!pointers || !unmap_array) {
2826 ret = -ENOMEM;
2827 goto out;
2828 }
2829
2830 for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2831 int dfail = 0, failp = -1;
2832 int faila;
2833 int failb;
2834 int found_errors;
2835
2836 found_errors = get_rbio_vertical_errors(rbio, sector_nr,
2837 &faila, &failb);
2838 if (unlikely(found_errors > rbio->bioc->max_errors)) {
2839 ret = -EIO;
2840 goto out;
2841 }
2842 if (found_errors == 0)
2843 continue;
2844
2845 /* We should have at least one error here. */
2846 ASSERT(faila >= 0 || failb >= 0);
2847
2848 if (is_data_stripe(rbio, faila))
2849 dfail++;
2850 else if (is_parity_stripe(faila))
2851 failp = faila;
2852
2853 if (is_data_stripe(rbio, failb))
2854 dfail++;
2855 else if (is_parity_stripe(failb))
2856 failp = failb;
2857 /*
2858 * Because we can not use a scrubbing parity to repair the
2859 * data, the repair capability is reduced. (In the case of
2860 * RAID5, we can not repair anything.)
2861 */
2862 if (unlikely(dfail > rbio->bioc->max_errors - 1)) {
2863 ret = -EIO;
2864 goto out;
2865 }
2866 /*
2867 * If all the data is good and only the parity is bad, just repair
2868 * the parity, no need to recover data stripes.
2869 */
2870 if (dfail == 0)
2871 continue;
2872
2873 /*
2874 * At this point we have one corrupted data stripe and one
2875 * corrupted parity on RAID6. If the corrupted parity is the
2876 * scrubbing parity, we can use the other parity to repair the
2877 * data; otherwise we can not repair the data stripe.
2878 */
2879 if (unlikely(failp != rbio->scrubp)) {
2880 ret = -EIO;
2881 goto out;
2882 }
2883
2884 ret = recover_vertical(rbio, sector_nr, pointers, unmap_array);
2885 if (ret < 0)
2886 goto out;
2887 }
2888 out:
2889 kfree(pointers);
2890 kfree(unmap_array);
2891 return ret;
2892 }
2893
2894 static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio)
2895 {
2896 struct bio_list bio_list = BIO_EMPTY_LIST;
2897 int total_sector_nr;
2898 int ret = 0;
2899
2900 /* Build a list of bios to read all the missing parts. */
2901 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2902 total_sector_nr++) {
2903 int sectornr = total_sector_nr % rbio->stripe_nsectors;
2904 int stripe = total_sector_nr / rbio->stripe_nsectors;
2905 phys_addr_t *paddrs;
2906
2907 /* No data in the vertical stripe, no need to read. */
2908 if (!test_bit(sectornr, &rbio->dbitmap))
2909 continue;
2910
2911 /*
2912 * We want to find all the sectors missing from the rbio and
2913 * read them from the disk. If sector_paddr_in_rbio() finds a sector
2914 * in the bio list we don't need to read it off the stripe.
2915 */
2916 paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1);
2917 if (paddrs == NULL)
2918 continue;
2919
2920 paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr);
2921 /*
2922 * The bio cache may have handed us an uptodate sector. If so,
2923 * use it.
2924 */
2925 if (test_bit(rbio_sector_index(rbio, stripe, sectornr),
2926 rbio->stripe_uptodate_bitmap))
2927 continue;
2928
2929 ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe,
2930 sectornr, REQ_OP_READ);
2931 if (ret) {
2932 bio_list_put(&bio_list);
2933 return ret;
2934 }
2935 }
2936
2937 submit_read_wait_bio_list(rbio, &bio_list);
2938 return 0;
2939 }
2940
2941 static void scrub_rbio(struct btrfs_raid_bio *rbio)
2942 {
2943 int sector_nr;
2944 int ret;
2945
2946 ret = alloc_rbio_essential_pages(rbio);
2947 if (ret)
2948 goto out;
2949
2950 bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
2951
2952 ret = scrub_assemble_read_bios(rbio);
2953 if (ret < 0)
2954 goto out;
2955
2956 /* We may have some failures, recover the failed sectors first. */
2957 ret = recover_scrub_rbio(rbio);
2958 if (ret < 0)
2959 goto out;
2960
2961 /*
2962 * We have every sector properly prepared. We can now finish the scrub
2963 * and write back the good content.
2964 */
2965 ret = finish_parity_scrub(rbio);
2966 wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
2967 for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2968 int found_errors;
2969
2970 found_errors = get_rbio_vertical_errors(rbio, sector_nr, NULL, NULL);
2971 if (unlikely(found_errors > rbio->bioc->max_errors)) {
2972 ret = -EIO;
2973 break;
2974 }
2975 }
2976 out:
2977 rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2978 }
2979
2980 static void scrub_rbio_work_locked(struct work_struct *work)
2981 {
2982 scrub_rbio(container_of(work, struct btrfs_raid_bio, work));
2983 }
2984
2985 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
2986 {
2987 if (!lock_stripe_add(rbio))
2988 start_async_work(rbio, scrub_rbio_work_locked);
2989 }
2990
2991 /*
2992 * This is for scrub call sites where we already have correct data contents.
2993 * This allows us to avoid reading data stripes again.
2994 *
2995 * Unfortunately here we have to copy the folios rather than reusing the
2996 * pages, because the rbio has its own page management for its cache.
2997 */
2998 void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio,
2999 struct folio **data_folios, u64 data_logical)
3000 {
3001 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
3002 const u64 offset_in_full_stripe = data_logical -
3003 rbio->bioc->full_stripe_logical;
3004 unsigned int findex = 0;
3005 unsigned int foffset = 0;
3006 int ret;
3007
3008 /*
3009 * If we hit ENOMEM temporarily, but the later allocation at
3010 * raid56_parity_submit_scrub_rbio() time succeeds, we just do
3011 * the extra read, not a big deal.
3012 *
3013 * If we hit ENOMEM again at raid56_parity_submit_scrub_rbio() time,
3014 * the bio will get a proper error number set.
3015 */
3016 ret = alloc_rbio_data_pages(rbio);
3017 if (ret < 0)
3018 return;
3019
3020 /* data_logical must be at stripe boundary and inside the full stripe. */
3021 ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN));
3022 ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT));
3023
3024 for (unsigned int cur_off = offset_in_full_stripe;
3025 cur_off < offset_in_full_stripe + BTRFS_STRIPE_LEN;
3026 cur_off += PAGE_SIZE) {
3027 const unsigned int pindex = cur_off >> PAGE_SHIFT;
3028 void *kaddr;
3029
3030 kaddr = kmap_local_page(rbio->stripe_pages[pindex]);
3031 memcpy_from_folio(kaddr, data_folios[findex], foffset, PAGE_SIZE);
3032 kunmap_local(kaddr);
3033
3034 foffset += PAGE_SIZE;
3035 ASSERT(foffset <= folio_size(data_folios[findex]));
3036 if (foffset == folio_size(data_folios[findex])) {
3037 findex++;
3038 foffset = 0;
3039 }
3040 }
3041 bitmap_set(rbio->stripe_uptodate_bitmap,
3042 offset_in_full_stripe >> fs_info->sectorsize_bits,
3043 BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);
3044 }
3045