1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (C) 2012 Fusion-io All rights reserved.
4 * Copyright (C) 2012 Intel Corp. All rights reserved.
5 */
6
7 #include <linux/sched.h>
8 #include <linux/bio.h>
9 #include <linux/slab.h>
10 #include <linux/blkdev.h>
11 #include <linux/raid/pq.h>
12 #include <linux/hash.h>
13 #include <linux/list_sort.h>
14 #include <linux/raid/xor.h>
15 #include <linux/mm.h>
16 #include "messages.h"
17 #include "ctree.h"
18 #include "disk-io.h"
19 #include "volumes.h"
20 #include "raid56.h"
21 #include "async-thread.h"
22 #include "file-item.h"
23 #include "btrfs_inode.h"
24
/*
 * Bit numbers for the rbio flags word (tested and modified with test_bit(),
 * set_bit() and friends on &rbio->flags).
 */

/* Set when additional merges to this rbio are not allowed. */
#define RBIO_RMW_LOCKED_BIT 1

/*
 * Set when this rbio is sitting in the hash, but it is just a cache
 * of past RMW.
 */
#define RBIO_CACHE_BIT 2

/*
 * Set when it is safe to trust the stripe_pages for caching.
 */
#define RBIO_CACHE_READY_BIT 3

/*
 * Once the stripe cache list holds more than this many rbios, cache_rbio()
 * prunes the entry at the tail of the list.
 */
#define RBIO_CACHE_SIZE 1024

/*
 * The stripe hash table has (1 << BTRFS_STRIPE_HASH_TABLE_BITS) buckets; the
 * same value is the number of hash bits requested from hash_64().
 */
#define BTRFS_STRIPE_HASH_TABLE_BITS 11
42
dump_bioc(const struct btrfs_fs_info * fs_info,const struct btrfs_io_context * bioc)43 static void dump_bioc(const struct btrfs_fs_info *fs_info, const struct btrfs_io_context *bioc)
44 {
45 if (unlikely(!bioc)) {
46 btrfs_crit(fs_info, "bioc=NULL");
47 return;
48 }
49 btrfs_crit(fs_info,
50 "bioc logical=%llu full_stripe=%llu size=%llu map_type=0x%llx mirror=%u replace_nr_stripes=%u replace_stripe_src=%d num_stripes=%u",
51 bioc->logical, bioc->full_stripe_logical, bioc->size,
52 bioc->map_type, bioc->mirror_num, bioc->replace_nr_stripes,
53 bioc->replace_stripe_src, bioc->num_stripes);
54 for (int i = 0; i < bioc->num_stripes; i++) {
55 btrfs_crit(fs_info, " nr=%d devid=%llu physical=%llu",
56 i, bioc->stripes[i].dev->devid,
57 bioc->stripes[i].physical);
58 }
59 }
60
btrfs_dump_rbio(const struct btrfs_fs_info * fs_info,const struct btrfs_raid_bio * rbio)61 static void btrfs_dump_rbio(const struct btrfs_fs_info *fs_info,
62 const struct btrfs_raid_bio *rbio)
63 {
64 if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
65 return;
66
67 dump_bioc(fs_info, rbio->bioc);
68 btrfs_crit(fs_info,
69 "rbio flags=0x%lx nr_sectors=%u nr_data=%u real_stripes=%u stripe_nsectors=%u scrubp=%u dbitmap=0x%lx",
70 rbio->flags, rbio->nr_sectors, rbio->nr_data,
71 rbio->real_stripes, rbio->stripe_nsectors,
72 rbio->scrubp, rbio->dbitmap);
73 }
74
/*
 * Assert @expr.  When CONFIG_BTRFS_ASSERT is enabled and @expr is false, the
 * state of @rbio is dumped with btrfs_dump_rbio() before ASSERT() is reached.
 *
 * @expr appears twice in the expansion, so it may be evaluated more than once
 * and must be free of side effects.
 */
#define ASSERT_RBIO(expr, rbio)						\
({									\
	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
					(rbio)->bioc->fs_info : NULL;	\
									\
		btrfs_dump_rbio(__fs_info, (rbio));			\
	}								\
	ASSERT((expr));							\
})
85
/*
 * Assert @expr.  When CONFIG_BTRFS_ASSERT is enabled and @expr is false, the
 * state of @rbio and the offending @stripe_nr are logged before ASSERT() is
 * reached.
 *
 * @expr appears twice in the expansion, so it may be evaluated more than once
 * and must be free of side effects.
 */
#define ASSERT_RBIO_STRIPE(expr, rbio, stripe_nr)			\
({									\
	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
					(rbio)->bioc->fs_info : NULL;	\
									\
		btrfs_dump_rbio(__fs_info, (rbio));			\
		btrfs_crit(__fs_info, "stripe_nr=%d", (stripe_nr));	\
	}								\
	ASSERT((expr));							\
})
97
/*
 * Assert @expr.  When CONFIG_BTRFS_ASSERT is enabled and @expr is false, the
 * state of @rbio and the offending @sector_nr are logged before ASSERT() is
 * reached.
 *
 * @expr appears twice in the expansion, so it may be evaluated more than once
 * and must be free of side effects.
 */
#define ASSERT_RBIO_SECTOR(expr, rbio, sector_nr)			\
({									\
	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
					(rbio)->bioc->fs_info : NULL;	\
									\
		btrfs_dump_rbio(__fs_info, (rbio));			\
		btrfs_crit(__fs_info, "sector_nr=%d", (sector_nr));	\
	}								\
	ASSERT((expr));							\
})
109
/*
 * Assert @expr.  When CONFIG_BTRFS_ASSERT is enabled and @expr is false, the
 * state of @rbio and the offending @logical address are logged before
 * ASSERT() is reached.
 *
 * @expr appears twice in the expansion, so it may be evaluated more than once
 * and must be free of side effects.
 */
#define ASSERT_RBIO_LOGICAL(expr, rbio, logical)			\
({									\
	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
					(rbio)->bioc->fs_info : NULL;	\
									\
		btrfs_dump_rbio(__fs_info, (rbio));			\
		btrfs_crit(__fs_info, "logical=%llu", (logical));	\
	}								\
	ASSERT((expr));							\
})
121
/*
 * One bucket of the stripe hash table.  Used by the raid56 code to lock
 * stripes for read/modify/write.
 */
struct btrfs_stripe_hash {
	/* rbios hashed to this bucket, linked through rbio->hash_list. */
	struct list_head hash_list;
	/* Taken around every walk or modification of @hash_list. */
	spinlock_t lock;
};
127
/*
 * The per-filesystem stripe hash table: the hash buckets used for stripe
 * locking plus the list of cached rbios.
 */
struct btrfs_stripe_hash_table {
	/* Cached rbios, linked through rbio->stripe_cache. */
	struct list_head stripe_cache;
	/* Taken around changes to @stripe_cache and @cache_size. */
	spinlock_t cache_lock;
	/* Number of rbios currently on @stripe_cache. */
	int cache_size;
	/* Hash buckets; (1 << BTRFS_STRIPE_HASH_TABLE_BITS) entries. */
	struct btrfs_stripe_hash table[];
};
135
/*
 * A structure to represent a sector inside a page, the length is fixed to
 * sectorsize.
 */
struct sector_ptr {
	/*
	 * Blocks from the bio list can still be highmem.
	 * So here we use the physical address to represent a page and the
	 * offset inside it.
	 */
	phys_addr_t paddr;
	/* True once @paddr has been filled in and may be used. */
	bool has_paddr;
	/* True when the sector content is considered valid. */
	bool uptodate;
};
149
/*
 * Forward declarations of static functions, so that they can be referenced
 * ahead of their definitions.
 */
static void rmw_rbio_work(struct work_struct *work);
static void rmw_rbio_work_locked(struct work_struct *work);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);

static int finish_parity_scrub(struct btrfs_raid_bio *rbio);
static void scrub_rbio_work_locked(struct work_struct *work);
157
free_raid_bio_pointers(struct btrfs_raid_bio * rbio)158 static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio)
159 {
160 bitmap_free(rbio->error_bitmap);
161 kfree(rbio->stripe_pages);
162 kfree(rbio->bio_sectors);
163 kfree(rbio->stripe_sectors);
164 kfree(rbio->finish_pointers);
165 }
166
free_raid_bio(struct btrfs_raid_bio * rbio)167 static void free_raid_bio(struct btrfs_raid_bio *rbio)
168 {
169 int i;
170
171 if (!refcount_dec_and_test(&rbio->refs))
172 return;
173
174 WARN_ON(!list_empty(&rbio->stripe_cache));
175 WARN_ON(!list_empty(&rbio->hash_list));
176 WARN_ON(!bio_list_empty(&rbio->bio_list));
177
178 for (i = 0; i < rbio->nr_pages; i++) {
179 if (rbio->stripe_pages[i]) {
180 __free_page(rbio->stripe_pages[i]);
181 rbio->stripe_pages[i] = NULL;
182 }
183 }
184
185 btrfs_put_bioc(rbio->bioc);
186 free_raid_bio_pointers(rbio);
187 kfree(rbio);
188 }
189
/*
 * Set up @rbio's work item to run @work_func and queue it on the filesystem's
 * rmw_workers workqueue.
 */
static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
{
	struct work_struct *work = &rbio->work;

	INIT_WORK(work, work_func);
	queue_work(rbio->bioc->fs_info->rmw_workers, work);
}
195
196 /*
197 * the stripe hash table is used for locking, and to collect
198 * bios in hopes of making a full stripe
199 */
btrfs_alloc_stripe_hash_table(struct btrfs_fs_info * info)200 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
201 {
202 struct btrfs_stripe_hash_table *table;
203 struct btrfs_stripe_hash_table *x;
204 struct btrfs_stripe_hash *cur;
205 struct btrfs_stripe_hash *h;
206 unsigned int num_entries = 1U << BTRFS_STRIPE_HASH_TABLE_BITS;
207
208 if (info->stripe_hash_table)
209 return 0;
210
211 /*
212 * The table is large, starting with order 4 and can go as high as
213 * order 7 in case lock debugging is turned on.
214 *
215 * Try harder to allocate and fallback to vmalloc to lower the chance
216 * of a failing mount.
217 */
218 table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
219 if (!table)
220 return -ENOMEM;
221
222 spin_lock_init(&table->cache_lock);
223 INIT_LIST_HEAD(&table->stripe_cache);
224
225 h = table->table;
226
227 for (unsigned int i = 0; i < num_entries; i++) {
228 cur = h + i;
229 INIT_LIST_HEAD(&cur->hash_list);
230 spin_lock_init(&cur->lock);
231 }
232
233 x = cmpxchg(&info->stripe_hash_table, NULL, table);
234 kvfree(x);
235 return 0;
236 }
237
/*
 * Copy @blocksize bytes from the sector described by @src to the sector
 * described by @dst.  Both sectors are addressed through their physical
 * address, split into a page and an offset inside that page.
 */
static void memcpy_sectors(const struct sector_ptr *dst,
			   const struct sector_ptr *src, u32 blocksize)
{
	struct page *dst_page = phys_to_page(dst->paddr);
	struct page *src_page = phys_to_page(src->paddr);

	memcpy_page(dst_page, offset_in_page(dst->paddr),
		    src_page, offset_in_page(src->paddr),
		    blocksize);
}
245
246 /*
247 * caching an rbio means to copy anything from the
248 * bio_sectors array into the stripe_pages array. We
249 * use the page uptodate bit in the stripe cache array
250 * to indicate if it has valid data
251 *
252 * once the caching is done, we set the cache ready
253 * bit.
254 */
cache_rbio_pages(struct btrfs_raid_bio * rbio)255 static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
256 {
257 int i;
258 int ret;
259
260 ret = alloc_rbio_pages(rbio);
261 if (ret)
262 return;
263
264 for (i = 0; i < rbio->nr_sectors; i++) {
265 /* Some range not covered by bio (partial write), skip it */
266 if (!rbio->bio_sectors[i].has_paddr) {
267 /*
268 * Even if the sector is not covered by bio, if it is
269 * a data sector it should still be uptodate as it is
270 * read from disk.
271 */
272 if (i < rbio->nr_data * rbio->stripe_nsectors)
273 ASSERT(rbio->stripe_sectors[i].uptodate);
274 continue;
275 }
276
277 memcpy_sectors(&rbio->stripe_sectors[i], &rbio->bio_sectors[i],
278 rbio->bioc->fs_info->sectorsize);
279 rbio->stripe_sectors[i].uptodate = 1;
280 }
281 set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
282 }
283
284 /*
285 * we hash on the first logical address of the stripe
286 */
rbio_bucket(struct btrfs_raid_bio * rbio)287 static int rbio_bucket(struct btrfs_raid_bio *rbio)
288 {
289 u64 num = rbio->bioc->full_stripe_logical;
290
291 /*
292 * we shift down quite a bit. We're using byte
293 * addressing, and most of the lower bits are zeros.
294 * This tends to upset hash_64, and it consistently
295 * returns just one or two different values.
296 *
297 * shifting off the lower bits fixes things.
298 */
299 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
300 }
301
full_page_sectors_uptodate(struct btrfs_raid_bio * rbio,unsigned int page_nr)302 static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
303 unsigned int page_nr)
304 {
305 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
306 const u32 sectors_per_page = PAGE_SIZE / sectorsize;
307 int i;
308
309 ASSERT(page_nr < rbio->nr_pages);
310
311 for (i = sectors_per_page * page_nr;
312 i < sectors_per_page * page_nr + sectors_per_page;
313 i++) {
314 if (!rbio->stripe_sectors[i].uptodate)
315 return false;
316 }
317 return true;
318 }
319
320 /*
321 * Update the stripe_sectors[] array to use correct page and pgoff
322 *
323 * Should be called every time any page pointer in stripes_pages[] got modified.
324 */
index_stripe_sectors(struct btrfs_raid_bio * rbio)325 static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
326 {
327 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
328 u32 offset;
329 int i;
330
331 for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
332 int page_index = offset >> PAGE_SHIFT;
333
334 ASSERT(page_index < rbio->nr_pages);
335 if (!rbio->stripe_pages[page_index])
336 continue;
337
338 rbio->stripe_sectors[i].has_paddr = true;
339 rbio->stripe_sectors[i].paddr =
340 page_to_phys(rbio->stripe_pages[page_index]) +
341 offset_in_page(offset);
342 }
343 }
344
steal_rbio_page(struct btrfs_raid_bio * src,struct btrfs_raid_bio * dest,int page_nr)345 static void steal_rbio_page(struct btrfs_raid_bio *src,
346 struct btrfs_raid_bio *dest, int page_nr)
347 {
348 const u32 sectorsize = src->bioc->fs_info->sectorsize;
349 const u32 sectors_per_page = PAGE_SIZE / sectorsize;
350 int i;
351
352 if (dest->stripe_pages[page_nr])
353 __free_page(dest->stripe_pages[page_nr]);
354 dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
355 src->stripe_pages[page_nr] = NULL;
356
357 /* Also update the sector->uptodate bits. */
358 for (i = sectors_per_page * page_nr;
359 i < sectors_per_page * page_nr + sectors_per_page; i++)
360 dest->stripe_sectors[i].uptodate = true;
361 }
362
is_data_stripe_page(struct btrfs_raid_bio * rbio,int page_nr)363 static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr)
364 {
365 const int sector_nr = (page_nr << PAGE_SHIFT) >>
366 rbio->bioc->fs_info->sectorsize_bits;
367
368 /*
369 * We have ensured PAGE_SIZE is aligned with sectorsize, thus
370 * we won't have a page which is half data half parity.
371 *
372 * Thus if the first sector of the page belongs to data stripes, then
373 * the full page belongs to data stripes.
374 */
375 return (sector_nr < rbio->nr_data * rbio->stripe_nsectors);
376 }
377
378 /*
379 * Stealing an rbio means taking all the uptodate pages from the stripe array
380 * in the source rbio and putting them into the destination rbio.
381 *
382 * This will also update the involved stripe_sectors[] which are referring to
383 * the old pages.
384 */
steal_rbio(struct btrfs_raid_bio * src,struct btrfs_raid_bio * dest)385 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
386 {
387 int i;
388
389 if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
390 return;
391
392 for (i = 0; i < dest->nr_pages; i++) {
393 struct page *p = src->stripe_pages[i];
394
395 /*
396 * We don't need to steal P/Q pages as they will always be
397 * regenerated for RMW or full write anyway.
398 */
399 if (!is_data_stripe_page(src, i))
400 continue;
401
402 /*
403 * If @src already has RBIO_CACHE_READY_BIT, it should have
404 * all data stripe pages present and uptodate.
405 */
406 ASSERT(p);
407 ASSERT(full_page_sectors_uptodate(src, i));
408 steal_rbio_page(src, dest, i);
409 }
410 index_stripe_sectors(dest);
411 index_stripe_sectors(src);
412 }
413
414 /*
415 * merging means we take the bio_list from the victim and
416 * splice it into the destination. The victim should
417 * be discarded afterwards.
418 *
419 * must be called with dest->rbio_list_lock held
420 */
merge_rbio(struct btrfs_raid_bio * dest,struct btrfs_raid_bio * victim)421 static void merge_rbio(struct btrfs_raid_bio *dest,
422 struct btrfs_raid_bio *victim)
423 {
424 bio_list_merge_init(&dest->bio_list, &victim->bio_list);
425 dest->bio_list_bytes += victim->bio_list_bytes;
426 /* Also inherit the bitmaps from @victim. */
427 bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
428 dest->stripe_nsectors);
429 }
430
431 /*
432 * used to prune items that are in the cache. The caller
433 * must hold the hash table lock.
434 */
__remove_rbio_from_cache(struct btrfs_raid_bio * rbio)435 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
436 {
437 int bucket = rbio_bucket(rbio);
438 struct btrfs_stripe_hash_table *table;
439 struct btrfs_stripe_hash *h;
440 int freeit = 0;
441
442 /*
443 * check the bit again under the hash table lock.
444 */
445 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
446 return;
447
448 table = rbio->bioc->fs_info->stripe_hash_table;
449 h = table->table + bucket;
450
451 /* hold the lock for the bucket because we may be
452 * removing it from the hash table
453 */
454 spin_lock(&h->lock);
455
456 /*
457 * hold the lock for the bio list because we need
458 * to make sure the bio list is empty
459 */
460 spin_lock(&rbio->bio_list_lock);
461
462 if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
463 list_del_init(&rbio->stripe_cache);
464 table->cache_size -= 1;
465 freeit = 1;
466
467 /* if the bio list isn't empty, this rbio is
468 * still involved in an IO. We take it out
469 * of the cache list, and drop the ref that
470 * was held for the list.
471 *
472 * If the bio_list was empty, we also remove
473 * the rbio from the hash_table, and drop
474 * the corresponding ref
475 */
476 if (bio_list_empty(&rbio->bio_list)) {
477 if (!list_empty(&rbio->hash_list)) {
478 list_del_init(&rbio->hash_list);
479 refcount_dec(&rbio->refs);
480 BUG_ON(!list_empty(&rbio->plug_list));
481 }
482 }
483 }
484
485 spin_unlock(&rbio->bio_list_lock);
486 spin_unlock(&h->lock);
487
488 if (freeit)
489 free_raid_bio(rbio);
490 }
491
492 /*
493 * prune a given rbio from the cache
494 */
remove_rbio_from_cache(struct btrfs_raid_bio * rbio)495 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
496 {
497 struct btrfs_stripe_hash_table *table;
498
499 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
500 return;
501
502 table = rbio->bioc->fs_info->stripe_hash_table;
503
504 spin_lock(&table->cache_lock);
505 __remove_rbio_from_cache(rbio);
506 spin_unlock(&table->cache_lock);
507 }
508
509 /*
510 * remove everything in the cache
511 */
btrfs_clear_rbio_cache(struct btrfs_fs_info * info)512 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
513 {
514 struct btrfs_stripe_hash_table *table;
515 struct btrfs_raid_bio *rbio;
516
517 table = info->stripe_hash_table;
518
519 spin_lock(&table->cache_lock);
520 while (!list_empty(&table->stripe_cache)) {
521 rbio = list_first_entry(&table->stripe_cache,
522 struct btrfs_raid_bio, stripe_cache);
523 __remove_rbio_from_cache(rbio);
524 }
525 spin_unlock(&table->cache_lock);
526 }
527
528 /*
529 * remove all cached entries and free the hash table
530 * used by unmount
531 */
btrfs_free_stripe_hash_table(struct btrfs_fs_info * info)532 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
533 {
534 if (!info->stripe_hash_table)
535 return;
536 btrfs_clear_rbio_cache(info);
537 kvfree(info->stripe_hash_table);
538 info->stripe_hash_table = NULL;
539 }
540
541 /*
542 * insert an rbio into the stripe cache. It
543 * must have already been prepared by calling
544 * cache_rbio_pages
545 *
546 * If this rbio was already cached, it gets
547 * moved to the front of the lru.
548 *
549 * If the size of the rbio cache is too big, we
550 * prune an item.
551 */
cache_rbio(struct btrfs_raid_bio * rbio)552 static void cache_rbio(struct btrfs_raid_bio *rbio)
553 {
554 struct btrfs_stripe_hash_table *table;
555
556 if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
557 return;
558
559 table = rbio->bioc->fs_info->stripe_hash_table;
560
561 spin_lock(&table->cache_lock);
562 spin_lock(&rbio->bio_list_lock);
563
564 /* bump our ref if we were not in the list before */
565 if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
566 refcount_inc(&rbio->refs);
567
568 if (!list_empty(&rbio->stripe_cache)){
569 list_move(&rbio->stripe_cache, &table->stripe_cache);
570 } else {
571 list_add(&rbio->stripe_cache, &table->stripe_cache);
572 table->cache_size += 1;
573 }
574
575 spin_unlock(&rbio->bio_list_lock);
576
577 if (table->cache_size > RBIO_CACHE_SIZE) {
578 struct btrfs_raid_bio *found;
579
580 found = list_last_entry(&table->stripe_cache,
581 struct btrfs_raid_bio,
582 stripe_cache);
583
584 if (found != rbio)
585 __remove_rbio_from_cache(found);
586 }
587
588 spin_unlock(&table->cache_lock);
589 }
590
591 /*
592 * helper function to run the xor_blocks api. It is only
593 * able to do MAX_XOR_BLOCKS at a time, so we need to
594 * loop through.
595 */
run_xor(void ** pages,int src_cnt,ssize_t len)596 static void run_xor(void **pages, int src_cnt, ssize_t len)
597 {
598 int src_off = 0;
599 int xor_src_cnt = 0;
600 void *dest = pages[src_cnt];
601
602 while(src_cnt > 0) {
603 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
604 xor_blocks(xor_src_cnt, len, dest, pages + src_off);
605
606 src_cnt -= xor_src_cnt;
607 src_off += xor_src_cnt;
608 }
609 }
610
611 /*
612 * Returns true if the bio list inside this rbio covers an entire stripe (no
613 * rmw required).
614 */
rbio_is_full(struct btrfs_raid_bio * rbio)615 static int rbio_is_full(struct btrfs_raid_bio *rbio)
616 {
617 unsigned long size = rbio->bio_list_bytes;
618 int ret = 1;
619
620 spin_lock(&rbio->bio_list_lock);
621 if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
622 ret = 0;
623 BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
624 spin_unlock(&rbio->bio_list_lock);
625
626 return ret;
627 }
628
629 /*
630 * returns 1 if it is safe to merge two rbios together.
631 * The merging is safe if the two rbios correspond to
632 * the same stripe and if they are both going in the same
633 * direction (read vs write), and if neither one is
634 * locked for final IO
635 *
636 * The caller is responsible for locking such that
637 * rmw_locked is safe to test
638 */
rbio_can_merge(struct btrfs_raid_bio * last,struct btrfs_raid_bio * cur)639 static int rbio_can_merge(struct btrfs_raid_bio *last,
640 struct btrfs_raid_bio *cur)
641 {
642 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
643 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
644 return 0;
645
646 /*
647 * we can't merge with cached rbios, since the
648 * idea is that when we merge the destination
649 * rbio is going to run our IO for us. We can
650 * steal from cached rbios though, other functions
651 * handle that.
652 */
653 if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
654 test_bit(RBIO_CACHE_BIT, &cur->flags))
655 return 0;
656
657 if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical)
658 return 0;
659
660 /* we can't merge with different operations */
661 if (last->operation != cur->operation)
662 return 0;
663 /*
664 * We've need read the full stripe from the drive.
665 * check and repair the parity and write the new results.
666 *
667 * We're not allowed to add any new bios to the
668 * bio list here, anyone else that wants to
669 * change this stripe needs to do their own rmw.
670 */
671 if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
672 return 0;
673
674 if (last->operation == BTRFS_RBIO_READ_REBUILD)
675 return 0;
676
677 return 1;
678 }
679
rbio_stripe_sector_index(const struct btrfs_raid_bio * rbio,unsigned int stripe_nr,unsigned int sector_nr)680 static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
681 unsigned int stripe_nr,
682 unsigned int sector_nr)
683 {
684 ASSERT_RBIO_STRIPE(stripe_nr < rbio->real_stripes, rbio, stripe_nr);
685 ASSERT_RBIO_SECTOR(sector_nr < rbio->stripe_nsectors, rbio, sector_nr);
686
687 return stripe_nr * rbio->stripe_nsectors + sector_nr;
688 }
689
690 /* Return a sector from rbio->stripe_sectors, not from the bio list */
rbio_stripe_sector(const struct btrfs_raid_bio * rbio,unsigned int stripe_nr,unsigned int sector_nr)691 static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
692 unsigned int stripe_nr,
693 unsigned int sector_nr)
694 {
695 return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
696 sector_nr)];
697 }
698
699 /* Grab a sector inside P stripe */
rbio_pstripe_sector(const struct btrfs_raid_bio * rbio,unsigned int sector_nr)700 static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
701 unsigned int sector_nr)
702 {
703 return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
704 }
705
706 /* Grab a sector inside Q stripe, return NULL if not RAID6 */
rbio_qstripe_sector(const struct btrfs_raid_bio * rbio,unsigned int sector_nr)707 static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
708 unsigned int sector_nr)
709 {
710 if (rbio->nr_data + 1 == rbio->real_stripes)
711 return NULL;
712 return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
713 }
714
/*
 * Try to take the stripe lock for @rbio.
 *
 * The first rbio in the hash bucket for a given full stripe logical address
 * has the lock.  rbios are added in one of three ways:
 *
 * 1) Nobody has the stripe locked yet.  The rbio is given
 * the lock and 0 is returned.  The caller must start the IO
 * themselves.  This is also what happens when the only rbio found for the
 * stripe is an idle cached one: its pages are stolen, it is unhashed, and
 * @rbio takes its place.
 *
 * 2) Someone has the stripe locked, but we're able to merge
 * with the lock owner.  The rbio is freed and the IO will
 * start automatically along with the existing rbio.  1 is returned.
 *
 * 3) Someone has the stripe locked, but we're not able to merge.
 * The rbio is added to the lock owner's plug list, or merged into
 * an rbio already on the plug list.  When the lock owner unlocks,
 * the next rbio on the list is run and the IO is started automatically.
 * 1 is returned.
 *
 * If we return 0, the caller still owns the rbio and must continue with
 * IO submission.  If we return 1, the caller must assume the rbio has
 * already been freed.
 */
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash *h;
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *pending;
	/* rbio to drop a reference on once the bucket lock is released. */
	struct btrfs_raid_bio *freeit = NULL;
	/* Cached rbio we stole from; pruned from the cache after unlocking. */
	struct btrfs_raid_bio *cache_drop = NULL;
	int ret = 0;

	h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);

	spin_lock(&h->lock);
	list_for_each_entry(cur, &h->hash_list, hash_list) {
		/* Different stripes can share a bucket, skip the others. */
		if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical)
			continue;

		spin_lock(&cur->bio_list_lock);

		/* Can we steal this cached rbio's pages? */
		if (bio_list_empty(&cur->bio_list) &&
		    list_empty(&cur->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
		    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
			/* Unhash it and drop the reference held by the hash. */
			list_del_init(&cur->hash_list);
			refcount_dec(&cur->refs);

			steal_rbio(cur, rbio);
			cache_drop = cur;
			spin_unlock(&cur->bio_list_lock);

			goto lockit;
		}

		/* Can we merge into the lock owner? */
		if (rbio_can_merge(cur, rbio)) {
			merge_rbio(cur, rbio);
			spin_unlock(&cur->bio_list_lock);
			freeit = rbio;
			ret = 1;
			goto out;
		}


		/*
		 * We couldn't merge with the running rbio, see if we can merge
		 * with the pending ones.  We don't have to check for rmw_locked
		 * because there is no way they are inside finish_rmw right now
		 */
		list_for_each_entry(pending, &cur->plug_list, plug_list) {
			if (rbio_can_merge(pending, rbio)) {
				merge_rbio(pending, rbio);
				spin_unlock(&cur->bio_list_lock);
				freeit = rbio;
				ret = 1;
				goto out;
			}
		}

		/*
		 * No merging, put us on the tail of the plug list, our rbio
		 * will be started when the currently running rbio unlocks
		 */
		list_add_tail(&rbio->plug_list, &cur->plug_list);
		spin_unlock(&cur->bio_list_lock);
		ret = 1;
		goto out;
	}
lockit:
	/* We own the stripe now; the hash list holds a reference on us. */
	refcount_inc(&rbio->refs);
	list_add(&rbio->hash_list, &h->hash_list);
out:
	spin_unlock(&h->lock);
	/* Both helpers below are called only after the bucket lock is dropped. */
	if (cache_drop)
		remove_rbio_from_cache(cache_drop);
	if (freeit)
		free_raid_bio(freeit);
	return ret;
}
815
/* Forward declaration; referenced by unlock_stripe() below. */
static void recover_rbio_work_locked(struct work_struct *work);

/*
 * Called as rmw or parity rebuild is completed.  If the plug list has more
 * rbios waiting for this stripe, the next one on the list will be started.
 *
 * If nobody is waiting, the rbio is offered to the stripe cache first and, if
 * it ends up cached, stays in the hash table for others to steal from.
 * Otherwise it is removed from the hash table and from the cache.
 */
static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
{
	int bucket;
	struct btrfs_stripe_hash *h;
	/* Set when the rbio stays hashed as a cache entry. */
	int keep_cache = 0;

	bucket = rbio_bucket(rbio);
	h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;

	/* Nobody is waiting on us: try to keep our pages around as cache. */
	if (list_empty(&rbio->plug_list))
		cache_rbio(rbio);

	spin_lock(&h->lock);
	spin_lock(&rbio->bio_list_lock);

	if (!list_empty(&rbio->hash_list)) {
		/*
		 * if we're still cached and there is no other IO
		 * to perform, just leave this rbio here for others
		 * to steal from later
		 */
		if (list_empty(&rbio->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
			keep_cache = 1;
			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
			BUG_ON(!bio_list_empty(&rbio->bio_list));
			goto done;
		}

		/* Leave the hash table and drop the reference it held. */
		list_del_init(&rbio->hash_list);
		refcount_dec(&rbio->refs);

		/*
		 * we use the plug list to hold all the rbios
		 * waiting for the chance to lock this stripe.
		 * hand the lock over to one of them.
		 */
		if (!list_empty(&rbio->plug_list)) {
			struct btrfs_raid_bio *next;
			struct list_head *head = rbio->plug_list.next;

			next = list_entry(head, struct btrfs_raid_bio,
					  plug_list);

			/*
			 * Unlink our own list node; the remaining waiters stay
			 * chained together through @next's plug_list.
			 */
			list_del_init(&rbio->plug_list);

			/* @next becomes the lock owner for this stripe. */
			list_add(&next->hash_list, &h->hash_list);
			refcount_inc(&next->refs);
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock(&h->lock);

			/*
			 * Start @next according to its operation.  Writes and
			 * parity scrubs take over our data pages first.
			 */
			if (next->operation == BTRFS_RBIO_READ_REBUILD) {
				start_async_work(next, recover_rbio_work_locked);
			} else if (next->operation == BTRFS_RBIO_WRITE) {
				steal_rbio(rbio, next);
				start_async_work(next, rmw_rbio_work_locked);
			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
				steal_rbio(rbio, next);
				start_async_work(next, scrub_rbio_work_locked);
			}

			/* Both locks have been released above already. */
			goto done_nolock;
		}
	}
done:
	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);

done_nolock:
	if (!keep_cache)
		remove_rbio_from_cache(rbio);
}
894
/*
 * Complete every bio of the singly linked list starting at @cur with
 * @status.  Each bio is unlinked from the list before bio_endio() is called
 * on it.
 */
static void rbio_endio_bio_list(struct bio *cur, blk_status_t status)
{
	struct bio *next;

	for (; cur; cur = next) {
		/* Grab the successor first, the bio is handed off below. */
		next = cur->bi_next;

		cur->bi_next = NULL;
		cur->bi_status = status;
		bio_endio(cur);
	}
}
907
908 /*
909 * this frees the rbio and runs through all the bios in the
910 * bio_list and calls end_io on them
911 */
rbio_orig_end_io(struct btrfs_raid_bio * rbio,blk_status_t status)912 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t status)
913 {
914 struct bio *cur = bio_list_get(&rbio->bio_list);
915 struct bio *extra;
916
917 kfree(rbio->csum_buf);
918 bitmap_free(rbio->csum_bitmap);
919 rbio->csum_buf = NULL;
920 rbio->csum_bitmap = NULL;
921
922 /*
923 * Clear the data bitmap, as the rbio may be cached for later usage.
924 * do this before before unlock_stripe() so there will be no new bio
925 * for this bio.
926 */
927 bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);
928
929 /*
930 * At this moment, rbio->bio_list is empty, however since rbio does not
931 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
932 * hash list, rbio may be merged with others so that rbio->bio_list
933 * becomes non-empty.
934 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
935 * more and we can call bio_endio() on all queued bios.
936 */
937 unlock_stripe(rbio);
938 extra = bio_list_get(&rbio->bio_list);
939 free_raid_bio(rbio);
940
941 rbio_endio_bio_list(cur, status);
942 if (extra)
943 rbio_endio_bio_list(extra, status);
944 }
945
946 /*
947 * Get a sector pointer specified by its @stripe_nr and @sector_nr.
948 *
949 * @rbio: The raid bio
950 * @stripe_nr: Stripe number, valid range [0, real_stripe)
951 * @sector_nr: Sector number inside the stripe,
952 * valid range [0, stripe_nsectors)
953 * @bio_list_only: Whether to use sectors inside the bio list only.
954 *
955 * The read/modify/write code wants to reuse the original bio page as much
956 * as possible, and only use stripe_sectors as fallback.
957 */
sector_in_rbio(struct btrfs_raid_bio * rbio,int stripe_nr,int sector_nr,bool bio_list_only)958 static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
959 int stripe_nr, int sector_nr,
960 bool bio_list_only)
961 {
962 struct sector_ptr *sector;
963 int index;
964
965 ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->real_stripes,
966 rbio, stripe_nr);
967 ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors,
968 rbio, sector_nr);
969
970 index = stripe_nr * rbio->stripe_nsectors + sector_nr;
971 ASSERT(index >= 0 && index < rbio->nr_sectors);
972
973 spin_lock(&rbio->bio_list_lock);
974 sector = &rbio->bio_sectors[index];
975 if (sector->has_paddr || bio_list_only) {
976 /* Don't return sector without a valid page pointer */
977 if (!sector->has_paddr)
978 sector = NULL;
979 spin_unlock(&rbio->bio_list_lock);
980 return sector;
981 }
982 spin_unlock(&rbio->bio_list_lock);
983
984 return &rbio->stripe_sectors[index];
985 }
986
987 /*
988 * allocation and initial setup for the btrfs_raid_bio. Not
989 * this does not allocate any pages for rbio->pages.
990 */
alloc_rbio(struct btrfs_fs_info * fs_info,struct btrfs_io_context * bioc)991 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
992 struct btrfs_io_context *bioc)
993 {
994 const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes;
995 const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
996 const unsigned int num_pages = stripe_npages * real_stripes;
997 const unsigned int stripe_nsectors =
998 BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
999 const unsigned int num_sectors = stripe_nsectors * real_stripes;
1000 struct btrfs_raid_bio *rbio;
1001
1002 /* PAGE_SIZE must also be aligned to sectorsize for subpage support */
1003 ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
1004 /*
1005 * Our current stripe len should be fixed to 64k thus stripe_nsectors
1006 * (at most 16) should be no larger than BITS_PER_LONG.
1007 */
1008 ASSERT(stripe_nsectors <= BITS_PER_LONG);
1009
1010 /*
1011 * Real stripes must be between 2 (2 disks RAID5, aka RAID1) and 256
1012 * (limited by u8).
1013 */
1014 ASSERT(real_stripes >= 2);
1015 ASSERT(real_stripes <= U8_MAX);
1016
1017 rbio = kzalloc(sizeof(*rbio), GFP_NOFS);
1018 if (!rbio)
1019 return ERR_PTR(-ENOMEM);
1020 rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *),
1021 GFP_NOFS);
1022 rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
1023 GFP_NOFS);
1024 rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
1025 GFP_NOFS);
1026 rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS);
1027 rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);
1028
1029 if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors ||
1030 !rbio->finish_pointers || !rbio->error_bitmap) {
1031 free_raid_bio_pointers(rbio);
1032 kfree(rbio);
1033 return ERR_PTR(-ENOMEM);
1034 }
1035
1036 bio_list_init(&rbio->bio_list);
1037 init_waitqueue_head(&rbio->io_wait);
1038 INIT_LIST_HEAD(&rbio->plug_list);
1039 spin_lock_init(&rbio->bio_list_lock);
1040 INIT_LIST_HEAD(&rbio->stripe_cache);
1041 INIT_LIST_HEAD(&rbio->hash_list);
1042 btrfs_get_bioc(bioc);
1043 rbio->bioc = bioc;
1044 rbio->nr_pages = num_pages;
1045 rbio->nr_sectors = num_sectors;
1046 rbio->real_stripes = real_stripes;
1047 rbio->stripe_npages = stripe_npages;
1048 rbio->stripe_nsectors = stripe_nsectors;
1049 refcount_set(&rbio->refs, 1);
1050 atomic_set(&rbio->stripes_pending, 0);
1051
1052 ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
1053 rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);
1054 ASSERT(rbio->nr_data > 0);
1055
1056 return rbio;
1057 }
1058
1059 /* allocate pages for all the stripes in the bio, including parity */
alloc_rbio_pages(struct btrfs_raid_bio * rbio)1060 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
1061 {
1062 int ret;
1063
1064 ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, false);
1065 if (ret < 0)
1066 return ret;
1067 /* Mapping all sectors */
1068 index_stripe_sectors(rbio);
1069 return 0;
1070 }
1071
1072 /* only allocate pages for p/q stripes */
alloc_rbio_parity_pages(struct btrfs_raid_bio * rbio)1073 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
1074 {
1075 const int data_pages = rbio->nr_data * rbio->stripe_npages;
1076 int ret;
1077
1078 ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
1079 rbio->stripe_pages + data_pages, false);
1080 if (ret < 0)
1081 return ret;
1082
1083 index_stripe_sectors(rbio);
1084 return 0;
1085 }
1086
1087 /*
1088 * Return the total number of errors found in the vertical stripe of @sector_nr.
1089 *
1090 * @faila and @failb will also be updated to the first and second stripe
1091 * number of the errors.
1092 */
get_rbio_veritical_errors(struct btrfs_raid_bio * rbio,int sector_nr,int * faila,int * failb)1093 static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
1094 int *faila, int *failb)
1095 {
1096 int stripe_nr;
1097 int found_errors = 0;
1098
1099 if (faila || failb) {
1100 /*
1101 * Both @faila and @failb should be valid pointers if any of
1102 * them is specified.
1103 */
1104 ASSERT(faila && failb);
1105 *faila = -1;
1106 *failb = -1;
1107 }
1108
1109 for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
1110 int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr;
1111
1112 if (test_bit(total_sector_nr, rbio->error_bitmap)) {
1113 found_errors++;
1114 if (faila) {
1115 /* Update faila and failb. */
1116 if (*faila < 0)
1117 *faila = stripe_nr;
1118 else if (*failb < 0)
1119 *failb = stripe_nr;
1120 }
1121 }
1122 }
1123 return found_errors;
1124 }
1125
1126 /*
1127 * Add a single sector @sector into our list of bios for IO.
1128 *
1129 * Return 0 if everything went well.
1130 * Return <0 for error.
1131 */
rbio_add_io_sector(struct btrfs_raid_bio * rbio,struct bio_list * bio_list,struct sector_ptr * sector,unsigned int stripe_nr,unsigned int sector_nr,enum req_op op)1132 static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
1133 struct bio_list *bio_list,
1134 struct sector_ptr *sector,
1135 unsigned int stripe_nr,
1136 unsigned int sector_nr,
1137 enum req_op op)
1138 {
1139 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1140 struct bio *last = bio_list->tail;
1141 int ret;
1142 struct bio *bio;
1143 struct btrfs_io_stripe *stripe;
1144 u64 disk_start;
1145
1146 /*
1147 * Note: here stripe_nr has taken device replace into consideration,
1148 * thus it can be larger than rbio->real_stripe.
1149 * So here we check against bioc->num_stripes, not rbio->real_stripes.
1150 */
1151 ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes,
1152 rbio, stripe_nr);
1153 ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors,
1154 rbio, sector_nr);
1155 ASSERT(sector->has_paddr);
1156
1157 stripe = &rbio->bioc->stripes[stripe_nr];
1158 disk_start = stripe->physical + sector_nr * sectorsize;
1159
1160 /* if the device is missing, just fail this stripe */
1161 if (!stripe->dev->bdev) {
1162 int found_errors;
1163
1164 set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr,
1165 rbio->error_bitmap);
1166
1167 /* Check if we have reached tolerance early. */
1168 found_errors = get_rbio_veritical_errors(rbio, sector_nr,
1169 NULL, NULL);
1170 if (unlikely(found_errors > rbio->bioc->max_errors))
1171 return -EIO;
1172 return 0;
1173 }
1174
1175 /* see if we can add this page onto our existing bio */
1176 if (last) {
1177 u64 last_end = last->bi_iter.bi_sector << SECTOR_SHIFT;
1178 last_end += last->bi_iter.bi_size;
1179
1180 /*
1181 * we can't merge these if they are from different
1182 * devices or if they are not contiguous
1183 */
1184 if (last_end == disk_start && !last->bi_status &&
1185 last->bi_bdev == stripe->dev->bdev) {
1186 ret = bio_add_page(last, phys_to_page(sector->paddr),
1187 sectorsize, offset_in_page(sector->paddr));
1188 if (ret == sectorsize)
1189 return 0;
1190 }
1191 }
1192
1193 /* put a new bio on the list */
1194 bio = bio_alloc(stripe->dev->bdev,
1195 max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1),
1196 op, GFP_NOFS);
1197 bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT;
1198 bio->bi_private = rbio;
1199
1200 __bio_add_page(bio, phys_to_page(sector->paddr), sectorsize,
1201 offset_in_page(sector->paddr));
1202 bio_list_add(bio_list, bio);
1203 return 0;
1204 }
1205
index_one_bio(struct btrfs_raid_bio * rbio,struct bio * bio)1206 static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
1207 {
1208 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1209 const u32 sectorsize_bits = rbio->bioc->fs_info->sectorsize_bits;
1210 struct bvec_iter iter = bio->bi_iter;
1211 phys_addr_t paddr;
1212 u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1213 rbio->bioc->full_stripe_logical;
1214
1215 btrfs_bio_for_each_block(paddr, bio, &iter, sectorsize) {
1216 unsigned int index = (offset >> sectorsize_bits);
1217 struct sector_ptr *sector = &rbio->bio_sectors[index];
1218
1219 sector->has_paddr = true;
1220 sector->paddr = paddr;
1221 offset += sectorsize;
1222 }
1223 }
1224
1225 /*
1226 * helper function to walk our bio list and populate the bio_pages array with
1227 * the result. This seems expensive, but it is faster than constantly
1228 * searching through the bio list as we setup the IO in finish_rmw or stripe
1229 * reconstruction.
1230 *
1231 * This must be called before you trust the answers from page_in_rbio
1232 */
index_rbio_pages(struct btrfs_raid_bio * rbio)1233 static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1234 {
1235 struct bio *bio;
1236
1237 spin_lock(&rbio->bio_list_lock);
1238 bio_list_for_each(bio, &rbio->bio_list)
1239 index_one_bio(rbio, bio);
1240
1241 spin_unlock(&rbio->bio_list_lock);
1242 }
1243
bio_get_trace_info(struct btrfs_raid_bio * rbio,struct bio * bio,struct raid56_bio_trace_info * trace_info)1244 static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
1245 struct raid56_bio_trace_info *trace_info)
1246 {
1247 const struct btrfs_io_context *bioc = rbio->bioc;
1248 int i;
1249
1250 ASSERT(bioc);
1251
1252 /* We rely on bio->bi_bdev to find the stripe number. */
1253 if (!bio->bi_bdev)
1254 goto not_found;
1255
1256 for (i = 0; i < bioc->num_stripes; i++) {
1257 if (bio->bi_bdev != bioc->stripes[i].dev->bdev)
1258 continue;
1259 trace_info->stripe_nr = i;
1260 trace_info->devid = bioc->stripes[i].dev->devid;
1261 trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1262 bioc->stripes[i].physical;
1263 return;
1264 }
1265
1266 not_found:
1267 trace_info->devid = -1;
1268 trace_info->offset = -1;
1269 trace_info->stripe_nr = -1;
1270 }
1271
/* Pop every bio off @bio_list and drop a reference on each, emptying the list. */
static inline void bio_list_put(struct bio_list *bio_list)
{
	for (;;) {
		struct bio *bio = bio_list_pop(bio_list);

		if (!bio)
			break;
		bio_put(bio);
	}
}
1279
assert_rbio(struct btrfs_raid_bio * rbio)1280 static void assert_rbio(struct btrfs_raid_bio *rbio)
1281 {
1282 if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
1283 return;
1284
1285 /*
1286 * At least two stripes (2 disks RAID5), and since real_stripes is U8,
1287 * we won't go beyond 256 disks anyway.
1288 */
1289 ASSERT_RBIO(rbio->real_stripes >= 2, rbio);
1290 ASSERT_RBIO(rbio->nr_data > 0, rbio);
1291
1292 /*
1293 * This is another check to make sure nr data stripes is smaller
1294 * than total stripes.
1295 */
1296 ASSERT_RBIO(rbio->nr_data < rbio->real_stripes, rbio);
1297 }
1298
kmap_local_sector(const struct sector_ptr * sector)1299 static inline void *kmap_local_sector(const struct sector_ptr *sector)
1300 {
1301 /* The sector pointer must have a page mapped to it. */
1302 ASSERT(sector->has_paddr);
1303
1304 return kmap_local_page(phys_to_page(sector->paddr)) +
1305 offset_in_page(sector->paddr);
1306 }
1307
1308 /* Generate PQ for one vertical stripe. */
generate_pq_vertical(struct btrfs_raid_bio * rbio,int sectornr)1309 static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
1310 {
1311 void **pointers = rbio->finish_pointers;
1312 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1313 struct sector_ptr *sector;
1314 int stripe;
1315 const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6;
1316
1317 /* First collect one sector from each data stripe */
1318 for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1319 sector = sector_in_rbio(rbio, stripe, sectornr, 0);
1320 pointers[stripe] = kmap_local_sector(sector);
1321 }
1322
1323 /* Then add the parity stripe */
1324 sector = rbio_pstripe_sector(rbio, sectornr);
1325 sector->uptodate = 1;
1326 pointers[stripe++] = kmap_local_sector(sector);
1327
1328 if (has_qstripe) {
1329 /*
1330 * RAID6, add the qstripe and call the library function
1331 * to fill in our p/q
1332 */
1333 sector = rbio_qstripe_sector(rbio, sectornr);
1334 sector->uptodate = 1;
1335 pointers[stripe++] = kmap_local_sector(sector);
1336
1337 assert_rbio(rbio);
1338 raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
1339 pointers);
1340 } else {
1341 /* raid5 */
1342 memcpy(pointers[rbio->nr_data], pointers[0], sectorsize);
1343 run_xor(pointers + 1, rbio->nr_data - 1, sectorsize);
1344 }
1345 for (stripe = stripe - 1; stripe >= 0; stripe--)
1346 kunmap_local(pointers[stripe]);
1347 }
1348
rmw_assemble_write_bios(struct btrfs_raid_bio * rbio,struct bio_list * bio_list)1349 static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
1350 struct bio_list *bio_list)
1351 {
1352 /* The total sector number inside the full stripe. */
1353 int total_sector_nr;
1354 int sectornr;
1355 int stripe;
1356 int ret;
1357
1358 ASSERT(bio_list_size(bio_list) == 0);
1359
1360 /* We should have at least one data sector. */
1361 ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));
1362
1363 /*
1364 * Reset errors, as we may have errors inherited from from degraded
1365 * write.
1366 */
1367 bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
1368
1369 /*
1370 * Start assembly. Make bios for everything from the higher layers (the
1371 * bio_list in our rbio) and our P/Q. Ignore everything else.
1372 */
1373 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
1374 total_sector_nr++) {
1375 struct sector_ptr *sector;
1376
1377 stripe = total_sector_nr / rbio->stripe_nsectors;
1378 sectornr = total_sector_nr % rbio->stripe_nsectors;
1379
1380 /* This vertical stripe has no data, skip it. */
1381 if (!test_bit(sectornr, &rbio->dbitmap))
1382 continue;
1383
1384 if (stripe < rbio->nr_data) {
1385 sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1386 if (!sector)
1387 continue;
1388 } else {
1389 sector = rbio_stripe_sector(rbio, stripe, sectornr);
1390 }
1391
1392 ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
1393 sectornr, REQ_OP_WRITE);
1394 if (ret)
1395 goto error;
1396 }
1397
1398 if (likely(!rbio->bioc->replace_nr_stripes))
1399 return 0;
1400
1401 /*
1402 * Make a copy for the replace target device.
1403 *
1404 * Thus the source stripe number (in replace_stripe_src) should be valid.
1405 */
1406 ASSERT(rbio->bioc->replace_stripe_src >= 0);
1407
1408 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
1409 total_sector_nr++) {
1410 struct sector_ptr *sector;
1411
1412 stripe = total_sector_nr / rbio->stripe_nsectors;
1413 sectornr = total_sector_nr % rbio->stripe_nsectors;
1414
1415 /*
1416 * For RAID56, there is only one device that can be replaced,
1417 * and replace_stripe_src[0] indicates the stripe number we
1418 * need to copy from.
1419 */
1420 if (stripe != rbio->bioc->replace_stripe_src) {
1421 /*
1422 * We can skip the whole stripe completely, note
1423 * total_sector_nr will be increased by one anyway.
1424 */
1425 ASSERT(sectornr == 0);
1426 total_sector_nr += rbio->stripe_nsectors - 1;
1427 continue;
1428 }
1429
1430 /* This vertical stripe has no data, skip it. */
1431 if (!test_bit(sectornr, &rbio->dbitmap))
1432 continue;
1433
1434 if (stripe < rbio->nr_data) {
1435 sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1436 if (!sector)
1437 continue;
1438 } else {
1439 sector = rbio_stripe_sector(rbio, stripe, sectornr);
1440 }
1441
1442 ret = rbio_add_io_sector(rbio, bio_list, sector,
1443 rbio->real_stripes,
1444 sectornr, REQ_OP_WRITE);
1445 if (ret)
1446 goto error;
1447 }
1448
1449 return 0;
1450 error:
1451 bio_list_put(bio_list);
1452 return -EIO;
1453 }
1454
set_rbio_range_error(struct btrfs_raid_bio * rbio,struct bio * bio)1455 static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio)
1456 {
1457 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1458 u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1459 rbio->bioc->full_stripe_logical;
1460 int total_nr_sector = offset >> fs_info->sectorsize_bits;
1461
1462 ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors);
1463
1464 bitmap_set(rbio->error_bitmap, total_nr_sector,
1465 bio->bi_iter.bi_size >> fs_info->sectorsize_bits);
1466
1467 /*
1468 * Special handling for raid56_alloc_missing_rbio() used by
1469 * scrub/replace. Unlike call path in raid56_parity_recover(), they
1470 * pass an empty bio here. Thus we have to find out the missing device
1471 * and mark the stripe error instead.
1472 */
1473 if (bio->bi_iter.bi_size == 0) {
1474 bool found_missing = false;
1475 int stripe_nr;
1476
1477 for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
1478 if (!rbio->bioc->stripes[stripe_nr].dev->bdev) {
1479 found_missing = true;
1480 bitmap_set(rbio->error_bitmap,
1481 stripe_nr * rbio->stripe_nsectors,
1482 rbio->stripe_nsectors);
1483 }
1484 }
1485 ASSERT(found_missing);
1486 }
1487 }
1488
1489 /*
1490 * For subpage case, we can no longer set page Up-to-date directly for
1491 * stripe_pages[], thus we need to locate the sector.
1492 */
find_stripe_sector(struct btrfs_raid_bio * rbio,phys_addr_t paddr)1493 static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
1494 phys_addr_t paddr)
1495 {
1496 int i;
1497
1498 for (i = 0; i < rbio->nr_sectors; i++) {
1499 struct sector_ptr *sector = &rbio->stripe_sectors[i];
1500
1501 if (sector->has_paddr && sector->paddr == paddr)
1502 return sector;
1503 }
1504 return NULL;
1505 }
1506
1507 /*
1508 * this sets each page in the bio uptodate. It should only be used on private
1509 * rbio pages, nothing that comes in from the higher layers
1510 */
set_bio_pages_uptodate(struct btrfs_raid_bio * rbio,struct bio * bio)1511 static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
1512 {
1513 const u32 blocksize = rbio->bioc->fs_info->sectorsize;
1514 phys_addr_t paddr;
1515
1516 ASSERT(!bio_flagged(bio, BIO_CLONED));
1517
1518 btrfs_bio_for_each_block_all(paddr, bio, blocksize) {
1519 struct sector_ptr *sector = find_stripe_sector(rbio, paddr);
1520
1521 ASSERT(sector);
1522 if (sector)
1523 sector->uptodate = 1;
1524 }
1525 }
1526
get_bio_sector_nr(struct btrfs_raid_bio * rbio,struct bio * bio)1527 static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio)
1528 {
1529 phys_addr_t bvec_paddr = bvec_phys(bio_first_bvec_all(bio));
1530 int i;
1531
1532 for (i = 0; i < rbio->nr_sectors; i++) {
1533 if (rbio->stripe_sectors[i].paddr == bvec_paddr)
1534 break;
1535 if (rbio->bio_sectors[i].has_paddr &&
1536 rbio->bio_sectors[i].paddr == bvec_paddr)
1537 break;
1538 }
1539 ASSERT(i < rbio->nr_sectors);
1540 return i;
1541 }
1542
rbio_update_error_bitmap(struct btrfs_raid_bio * rbio,struct bio * bio)1543 static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio)
1544 {
1545 int total_sector_nr = get_bio_sector_nr(rbio, bio);
1546 u32 bio_size = 0;
1547 struct bio_vec *bvec;
1548 int i;
1549
1550 bio_for_each_bvec_all(bvec, bio, i)
1551 bio_size += bvec->bv_len;
1552
1553 /*
1554 * Since we can have multiple bios touching the error_bitmap, we cannot
1555 * call bitmap_set() without protection.
1556 *
1557 * Instead use set_bit() for each bit, as set_bit() itself is atomic.
1558 */
1559 for (i = total_sector_nr; i < total_sector_nr +
1560 (bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++)
1561 set_bit(i, rbio->error_bitmap);
1562 }
1563
1564 /* Verify the data sectors at read time. */
verify_bio_data_sectors(struct btrfs_raid_bio * rbio,struct bio * bio)1565 static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio,
1566 struct bio *bio)
1567 {
1568 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1569 int total_sector_nr = get_bio_sector_nr(rbio, bio);
1570 phys_addr_t paddr;
1571
1572 /* No data csum for the whole stripe, no need to verify. */
1573 if (!rbio->csum_bitmap || !rbio->csum_buf)
1574 return;
1575
1576 /* P/Q stripes, they have no data csum to verify against. */
1577 if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors)
1578 return;
1579
1580 btrfs_bio_for_each_block_all(paddr, bio, fs_info->sectorsize) {
1581 u8 csum_buf[BTRFS_CSUM_SIZE];
1582 u8 *expected_csum = rbio->csum_buf + total_sector_nr * fs_info->csum_size;
1583 int ret;
1584
1585 /* No csum for this sector, skip to the next sector. */
1586 if (!test_bit(total_sector_nr, rbio->csum_bitmap))
1587 continue;
1588
1589 ret = btrfs_check_block_csum(fs_info, paddr,
1590 csum_buf, expected_csum);
1591 if (ret < 0)
1592 set_bit(total_sector_nr, rbio->error_bitmap);
1593 total_sector_nr++;
1594 }
1595 }
1596
raid_wait_read_end_io(struct bio * bio)1597 static void raid_wait_read_end_io(struct bio *bio)
1598 {
1599 struct btrfs_raid_bio *rbio = bio->bi_private;
1600
1601 if (bio->bi_status) {
1602 rbio_update_error_bitmap(rbio, bio);
1603 } else {
1604 set_bio_pages_uptodate(rbio, bio);
1605 verify_bio_data_sectors(rbio, bio);
1606 }
1607
1608 bio_put(bio);
1609 if (atomic_dec_and_test(&rbio->stripes_pending))
1610 wake_up(&rbio->io_wait);
1611 }
1612
submit_read_wait_bio_list(struct btrfs_raid_bio * rbio,struct bio_list * bio_list)1613 static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio,
1614 struct bio_list *bio_list)
1615 {
1616 struct bio *bio;
1617
1618 atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
1619 while ((bio = bio_list_pop(bio_list))) {
1620 bio->bi_end_io = raid_wait_read_end_io;
1621
1622 if (trace_raid56_read_enabled()) {
1623 struct raid56_bio_trace_info trace_info = { 0 };
1624
1625 bio_get_trace_info(rbio, bio, &trace_info);
1626 trace_raid56_read(rbio, bio, &trace_info);
1627 }
1628 submit_bio(bio);
1629 }
1630
1631 wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
1632 }
1633
alloc_rbio_data_pages(struct btrfs_raid_bio * rbio)1634 static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
1635 {
1636 const int data_pages = rbio->nr_data * rbio->stripe_npages;
1637 int ret;
1638
1639 ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, false);
1640 if (ret < 0)
1641 return ret;
1642
1643 index_stripe_sectors(rbio);
1644 return 0;
1645 }
1646
1647 /*
1648 * We use plugging call backs to collect full stripes.
1649 * Any time we get a partial stripe write while plugged
1650 * we collect it into a list. When the unplug comes down,
1651 * we sort the list by logical block number and merge
1652 * everything we can into the same rbios
1653 */
1654 struct btrfs_plug_cb {
1655 struct blk_plug_cb cb;
1656 struct btrfs_fs_info *info;
1657 struct list_head rbio_list;
1658 };
1659
1660 /*
1661 * rbios on the plug list are sorted for easier merging.
1662 */
plug_cmp(void * priv,const struct list_head * a,const struct list_head * b)1663 static int plug_cmp(void *priv, const struct list_head *a,
1664 const struct list_head *b)
1665 {
1666 const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1667 plug_list);
1668 const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1669 plug_list);
1670 u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1671 u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
1672
1673 if (a_sector < b_sector)
1674 return -1;
1675 if (a_sector > b_sector)
1676 return 1;
1677 return 0;
1678 }
1679
/*
 * Unplug callback: flush all rbios collected on the plug.
 *
 * The rbios are sorted, then walked in order.  Full stripes are queued right
 * away.  A partial rbio is kept pending so that the next one can be merged
 * into it; it is queued once the next partial rbio cannot be merged, or when
 * the list runs out.  The plug itself is freed at the end.
 */
static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
	struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb);
	/* Partial rbio still waiting for a merge candidate. */
	struct btrfs_raid_bio *pending = NULL;
	struct btrfs_raid_bio *cur;

	list_sort(NULL, &plug->rbio_list, plug_cmp);

	while (!list_empty(&plug->rbio_list)) {
		cur = list_first_entry(&plug->rbio_list,
				       struct btrfs_raid_bio, plug_list);
		list_del_init(&cur->plug_list);

		/* We have a full stripe, queue it down. */
		if (rbio_is_full(cur)) {
			start_async_work(cur, rmw_rbio_work);
			continue;
		}

		/* Fold @cur into the pending rbio when possible. */
		if (pending && rbio_can_merge(pending, cur)) {
			merge_rbio(pending, cur);
			free_raid_bio(cur);
			continue;
		}

		/* No merge, the pending rbio will not grow any further. */
		if (pending)
			start_async_work(pending, rmw_rbio_work);
		pending = cur;
	}

	if (pending)
		start_async_work(pending, rmw_rbio_work);
	kfree(plug);
}
1712
1713 /* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
rbio_add_bio(struct btrfs_raid_bio * rbio,struct bio * orig_bio)1714 static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
1715 {
1716 const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1717 const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
1718 const u64 full_stripe_start = rbio->bioc->full_stripe_logical;
1719 const u32 orig_len = orig_bio->bi_iter.bi_size;
1720 const u32 sectorsize = fs_info->sectorsize;
1721 u64 cur_logical;
1722
1723 ASSERT_RBIO_LOGICAL(orig_logical >= full_stripe_start &&
1724 orig_logical + orig_len <= full_stripe_start +
1725 rbio->nr_data * BTRFS_STRIPE_LEN,
1726 rbio, orig_logical);
1727
1728 bio_list_add(&rbio->bio_list, orig_bio);
1729 rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
1730
1731 /* Update the dbitmap. */
1732 for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
1733 cur_logical += sectorsize) {
1734 int bit = ((u32)(cur_logical - full_stripe_start) >>
1735 fs_info->sectorsize_bits) % rbio->stripe_nsectors;
1736
1737 set_bit(bit, &rbio->dbitmap);
1738 }
1739 }
1740
1741 /*
1742 * our main entry point for writes from the rest of the FS.
1743 */
raid56_parity_write(struct bio * bio,struct btrfs_io_context * bioc)1744 void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
1745 {
1746 struct btrfs_fs_info *fs_info = bioc->fs_info;
1747 struct btrfs_raid_bio *rbio;
1748 struct btrfs_plug_cb *plug = NULL;
1749 struct blk_plug_cb *cb;
1750
1751 rbio = alloc_rbio(fs_info, bioc);
1752 if (IS_ERR(rbio)) {
1753 bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
1754 bio_endio(bio);
1755 return;
1756 }
1757 rbio->operation = BTRFS_RBIO_WRITE;
1758 rbio_add_bio(rbio, bio);
1759
1760 /*
1761 * Don't plug on full rbios, just get them out the door
1762 * as quickly as we can
1763 */
1764 if (!rbio_is_full(rbio)) {
1765 cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug));
1766 if (cb) {
1767 plug = container_of(cb, struct btrfs_plug_cb, cb);
1768 if (!plug->info) {
1769 plug->info = fs_info;
1770 INIT_LIST_HEAD(&plug->rbio_list);
1771 }
1772 list_add_tail(&rbio->plug_list, &plug->rbio_list);
1773 return;
1774 }
1775 }
1776
1777 /*
1778 * Either we don't have any existing plug, or we're doing a full stripe,
1779 * queue the rmw work now.
1780 */
1781 start_async_work(rbio, rmw_rbio_work);
1782 }
1783
verify_one_sector(struct btrfs_raid_bio * rbio,int stripe_nr,int sector_nr)1784 static int verify_one_sector(struct btrfs_raid_bio *rbio,
1785 int stripe_nr, int sector_nr)
1786 {
1787 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1788 struct sector_ptr *sector;
1789 u8 csum_buf[BTRFS_CSUM_SIZE];
1790 u8 *csum_expected;
1791 int ret;
1792
1793 if (!rbio->csum_bitmap || !rbio->csum_buf)
1794 return 0;
1795
1796 /* No way to verify P/Q as they are not covered by data csum. */
1797 if (stripe_nr >= rbio->nr_data)
1798 return 0;
1799 /*
1800 * If we're rebuilding a read, we have to use pages from the
1801 * bio list if possible.
1802 */
1803 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
1804 sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
1805 } else {
1806 sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
1807 }
1808
1809 csum_expected = rbio->csum_buf +
1810 (stripe_nr * rbio->stripe_nsectors + sector_nr) *
1811 fs_info->csum_size;
1812 ret = btrfs_check_block_csum(fs_info, sector->paddr, csum_buf, csum_expected);
1813 return ret;
1814 }
1815
1816 /*
1817 * Recover a vertical stripe specified by @sector_nr.
1818 * @*pointers are the pre-allocated pointers by the caller, so we don't
1819 * need to allocate/free the pointers again and again.
1820 */
recover_vertical(struct btrfs_raid_bio * rbio,int sector_nr,void ** pointers,void ** unmap_array)1821 static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
1822 void **pointers, void **unmap_array)
1823 {
1824 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1825 struct sector_ptr *sector;
1826 const u32 sectorsize = fs_info->sectorsize;
1827 int found_errors;
1828 int faila;
1829 int failb;
1830 int stripe_nr;
1831 int ret = 0;
1832
1833 /*
1834 * Now we just use bitmap to mark the horizontal stripes in
1835 * which we have data when doing parity scrub.
1836 */
1837 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
1838 !test_bit(sector_nr, &rbio->dbitmap))
1839 return 0;
1840
1841 found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila,
1842 &failb);
1843 /*
1844 * No errors in the vertical stripe, skip it. Can happen for recovery
1845 * which only part of a stripe failed csum check.
1846 */
1847 if (!found_errors)
1848 return 0;
1849
1850 if (unlikely(found_errors > rbio->bioc->max_errors))
1851 return -EIO;
1852
1853 /*
1854 * Setup our array of pointers with sectors from each stripe
1855 *
1856 * NOTE: store a duplicate array of pointers to preserve the
1857 * pointer order.
1858 */
1859 for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
1860 /*
1861 * If we're rebuilding a read, we have to use pages from the
1862 * bio list if possible.
1863 */
1864 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
1865 sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
1866 } else {
1867 sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
1868 }
1869 pointers[stripe_nr] = kmap_local_sector(sector);
1870 unmap_array[stripe_nr] = pointers[stripe_nr];
1871 }
1872
1873 /* All raid6 handling here */
1874 if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
1875 /* Single failure, rebuild from parity raid5 style */
1876 if (failb < 0) {
1877 if (faila == rbio->nr_data)
1878 /*
1879 * Just the P stripe has failed, without
1880 * a bad data or Q stripe.
1881 * We have nothing to do, just skip the
1882 * recovery for this stripe.
1883 */
1884 goto cleanup;
1885 /*
1886 * a single failure in raid6 is rebuilt
1887 * in the pstripe code below
1888 */
1889 goto pstripe;
1890 }
1891
1892 /*
1893 * If the q stripe is failed, do a pstripe reconstruction from
1894 * the xors.
1895 * If both the q stripe and the P stripe are failed, we're
1896 * here due to a crc mismatch and we can't give them the
1897 * data they want.
1898 */
1899 if (failb == rbio->real_stripes - 1) {
1900 if (faila == rbio->real_stripes - 2)
1901 /*
1902 * Only P and Q are corrupted.
1903 * We only care about data stripes recovery,
1904 * can skip this vertical stripe.
1905 */
1906 goto cleanup;
1907 /*
1908 * Otherwise we have one bad data stripe and
1909 * a good P stripe. raid5!
1910 */
1911 goto pstripe;
1912 }
1913
1914 if (failb == rbio->real_stripes - 2) {
1915 raid6_datap_recov(rbio->real_stripes, sectorsize,
1916 faila, pointers);
1917 } else {
1918 raid6_2data_recov(rbio->real_stripes, sectorsize,
1919 faila, failb, pointers);
1920 }
1921 } else {
1922 void *p;
1923
1924 /* Rebuild from P stripe here (raid5 or raid6). */
1925 ASSERT(failb == -1);
1926 pstripe:
1927 /* Copy parity block into failed block to start with */
1928 memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);
1929
1930 /* Rearrange the pointer array */
1931 p = pointers[faila];
1932 for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1;
1933 stripe_nr++)
1934 pointers[stripe_nr] = pointers[stripe_nr + 1];
1935 pointers[rbio->nr_data - 1] = p;
1936
1937 /* Xor in the rest */
1938 run_xor(pointers, rbio->nr_data - 1, sectorsize);
1939
1940 }
1941
1942 /*
1943 * No matter if this is a RMW or recovery, we should have all
1944 * failed sectors repaired in the vertical stripe, thus they are now
1945 * uptodate.
1946 * Especially if we determine to cache the rbio, we need to
1947 * have at least all data sectors uptodate.
1948 *
1949 * If possible, also check if the repaired sector matches its data
1950 * checksum.
1951 */
1952 if (faila >= 0) {
1953 ret = verify_one_sector(rbio, faila, sector_nr);
1954 if (ret < 0)
1955 goto cleanup;
1956
1957 sector = rbio_stripe_sector(rbio, faila, sector_nr);
1958 sector->uptodate = 1;
1959 }
1960 if (failb >= 0) {
1961 ret = verify_one_sector(rbio, failb, sector_nr);
1962 if (ret < 0)
1963 goto cleanup;
1964
1965 sector = rbio_stripe_sector(rbio, failb, sector_nr);
1966 sector->uptodate = 1;
1967 }
1968
1969 cleanup:
1970 for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--)
1971 kunmap_local(unmap_array[stripe_nr]);
1972 return ret;
1973 }
1974
recover_sectors(struct btrfs_raid_bio * rbio)1975 static int recover_sectors(struct btrfs_raid_bio *rbio)
1976 {
1977 void **pointers = NULL;
1978 void **unmap_array = NULL;
1979 int sectornr;
1980 int ret = 0;
1981
1982 /*
1983 * @pointers array stores the pointer for each sector.
1984 *
1985 * @unmap_array stores copy of pointers that does not get reordered
1986 * during reconstruction so that kunmap_local works.
1987 */
1988 pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1989 unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1990 if (!pointers || !unmap_array) {
1991 ret = -ENOMEM;
1992 goto out;
1993 }
1994
1995 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
1996 spin_lock(&rbio->bio_list_lock);
1997 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1998 spin_unlock(&rbio->bio_list_lock);
1999 }
2000
2001 index_rbio_pages(rbio);
2002
2003 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
2004 ret = recover_vertical(rbio, sectornr, pointers, unmap_array);
2005 if (ret < 0)
2006 break;
2007 }
2008
2009 out:
2010 kfree(pointers);
2011 kfree(unmap_array);
2012 return ret;
2013 }
2014
recover_rbio(struct btrfs_raid_bio * rbio)2015 static void recover_rbio(struct btrfs_raid_bio *rbio)
2016 {
2017 struct bio_list bio_list = BIO_EMPTY_LIST;
2018 int total_sector_nr;
2019 int ret = 0;
2020
2021 /*
2022 * Either we're doing recover for a read failure or degraded write,
2023 * caller should have set error bitmap correctly.
2024 */
2025 ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors));
2026
2027 /* For recovery, we need to read all sectors including P/Q. */
2028 ret = alloc_rbio_pages(rbio);
2029 if (ret < 0)
2030 goto out;
2031
2032 index_rbio_pages(rbio);
2033
2034 /*
2035 * Read everything that hasn't failed. However this time we will
2036 * not trust any cached sector.
2037 * As we may read out some stale data but higher layer is not reading
2038 * that stale part.
2039 *
2040 * So here we always re-read everything in recovery path.
2041 */
2042 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2043 total_sector_nr++) {
2044 int stripe = total_sector_nr / rbio->stripe_nsectors;
2045 int sectornr = total_sector_nr % rbio->stripe_nsectors;
2046 struct sector_ptr *sector;
2047
2048 /*
2049 * Skip the range which has error. It can be a range which is
2050 * marked error (for csum mismatch), or it can be a missing
2051 * device.
2052 */
2053 if (!rbio->bioc->stripes[stripe].dev->bdev ||
2054 test_bit(total_sector_nr, rbio->error_bitmap)) {
2055 /*
2056 * Also set the error bit for missing device, which
2057 * may not yet have its error bit set.
2058 */
2059 set_bit(total_sector_nr, rbio->error_bitmap);
2060 continue;
2061 }
2062
2063 sector = rbio_stripe_sector(rbio, stripe, sectornr);
2064 ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
2065 sectornr, REQ_OP_READ);
2066 if (ret < 0) {
2067 bio_list_put(&bio_list);
2068 goto out;
2069 }
2070 }
2071
2072 submit_read_wait_bio_list(rbio, &bio_list);
2073 ret = recover_sectors(rbio);
2074 out:
2075 rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2076 }
2077
recover_rbio_work(struct work_struct * work)2078 static void recover_rbio_work(struct work_struct *work)
2079 {
2080 struct btrfs_raid_bio *rbio;
2081
2082 rbio = container_of(work, struct btrfs_raid_bio, work);
2083 if (!lock_stripe_add(rbio))
2084 recover_rbio(rbio);
2085 }
2086
recover_rbio_work_locked(struct work_struct * work)2087 static void recover_rbio_work_locked(struct work_struct *work)
2088 {
2089 recover_rbio(container_of(work, struct btrfs_raid_bio, work));
2090 }
2091
set_rbio_raid6_extra_error(struct btrfs_raid_bio * rbio,int mirror_num)2092 static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num)
2093 {
2094 bool found = false;
2095 int sector_nr;
2096
2097 /*
2098 * This is for RAID6 extra recovery tries, thus mirror number should
2099 * be large than 2.
2100 * Mirror 1 means read from data stripes. Mirror 2 means rebuild using
2101 * RAID5 methods.
2102 */
2103 ASSERT(mirror_num > 2);
2104 for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2105 int found_errors;
2106 int faila;
2107 int failb;
2108
2109 found_errors = get_rbio_veritical_errors(rbio, sector_nr,
2110 &faila, &failb);
2111 /* This vertical stripe doesn't have errors. */
2112 if (!found_errors)
2113 continue;
2114
2115 /*
2116 * If we found errors, there should be only one error marked
2117 * by previous set_rbio_range_error().
2118 */
2119 ASSERT(found_errors == 1);
2120 found = true;
2121
2122 /* Now select another stripe to mark as error. */
2123 failb = rbio->real_stripes - (mirror_num - 1);
2124 if (failb <= faila)
2125 failb--;
2126
2127 /* Set the extra bit in error bitmap. */
2128 if (failb >= 0)
2129 set_bit(failb * rbio->stripe_nsectors + sector_nr,
2130 rbio->error_bitmap);
2131 }
2132
2133 /* We should found at least one vertical stripe with error.*/
2134 ASSERT(found);
2135 }
2136
2137 /*
2138 * the main entry point for reads from the higher layers. This
2139 * is really only called when the normal read path had a failure,
2140 * so we assume the bio they send down corresponds to a failed part
2141 * of the drive.
2142 */
raid56_parity_recover(struct bio * bio,struct btrfs_io_context * bioc,int mirror_num)2143 void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
2144 int mirror_num)
2145 {
2146 struct btrfs_fs_info *fs_info = bioc->fs_info;
2147 struct btrfs_raid_bio *rbio;
2148
2149 rbio = alloc_rbio(fs_info, bioc);
2150 if (IS_ERR(rbio)) {
2151 bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
2152 bio_endio(bio);
2153 return;
2154 }
2155
2156 rbio->operation = BTRFS_RBIO_READ_REBUILD;
2157 rbio_add_bio(rbio, bio);
2158
2159 set_rbio_range_error(rbio, bio);
2160
2161 /*
2162 * Loop retry:
2163 * for 'mirror == 2', reconstruct from all other stripes.
2164 * for 'mirror_num > 2', select a stripe to fail on every retry.
2165 */
2166 if (mirror_num > 2)
2167 set_rbio_raid6_extra_error(rbio, mirror_num);
2168
2169 start_async_work(rbio, recover_rbio_work);
2170 }
2171
fill_data_csums(struct btrfs_raid_bio * rbio)2172 static void fill_data_csums(struct btrfs_raid_bio *rbio)
2173 {
2174 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
2175 struct btrfs_root *csum_root = btrfs_csum_root(fs_info,
2176 rbio->bioc->full_stripe_logical);
2177 const u64 start = rbio->bioc->full_stripe_logical;
2178 const u32 len = (rbio->nr_data * rbio->stripe_nsectors) <<
2179 fs_info->sectorsize_bits;
2180 int ret;
2181
2182 /* The rbio should not have its csum buffer initialized. */
2183 ASSERT(!rbio->csum_buf && !rbio->csum_bitmap);
2184
2185 /*
2186 * Skip the csum search if:
2187 *
2188 * - The rbio doesn't belong to data block groups
2189 * Then we are doing IO for tree blocks, no need to search csums.
2190 *
2191 * - The rbio belongs to mixed block groups
2192 * This is to avoid deadlock, as we're already holding the full
2193 * stripe lock, if we trigger a metadata read, and it needs to do
2194 * raid56 recovery, we will deadlock.
2195 */
2196 if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) ||
2197 rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA)
2198 return;
2199
2200 rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors *
2201 fs_info->csum_size, GFP_NOFS);
2202 rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors,
2203 GFP_NOFS);
2204 if (!rbio->csum_buf || !rbio->csum_bitmap) {
2205 ret = -ENOMEM;
2206 goto error;
2207 }
2208
2209 ret = btrfs_lookup_csums_bitmap(csum_root, NULL, start, start + len - 1,
2210 rbio->csum_buf, rbio->csum_bitmap);
2211 if (ret < 0)
2212 goto error;
2213 if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits))
2214 goto no_csum;
2215 return;
2216
2217 error:
2218 /*
2219 * We failed to allocate memory or grab the csum, but it's not fatal,
2220 * we can still continue. But better to warn users that RMW is no
2221 * longer safe for this particular sub-stripe write.
2222 */
2223 btrfs_warn_rl(fs_info,
2224 "sub-stripe write for full stripe %llu is not safe, failed to get csum: %d",
2225 rbio->bioc->full_stripe_logical, ret);
2226 no_csum:
2227 kfree(rbio->csum_buf);
2228 bitmap_free(rbio->csum_bitmap);
2229 rbio->csum_buf = NULL;
2230 rbio->csum_bitmap = NULL;
2231 }
2232
rmw_read_wait_recover(struct btrfs_raid_bio * rbio)2233 static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio)
2234 {
2235 struct bio_list bio_list = BIO_EMPTY_LIST;
2236 int total_sector_nr;
2237 int ret = 0;
2238
2239 /*
2240 * Fill the data csums we need for data verification. We need to fill
2241 * the csum_bitmap/csum_buf first, as our endio function will try to
2242 * verify the data sectors.
2243 */
2244 fill_data_csums(rbio);
2245
2246 /*
2247 * Build a list of bios to read all sectors (including data and P/Q).
2248 *
2249 * This behavior is to compensate the later csum verification and recovery.
2250 */
2251 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2252 total_sector_nr++) {
2253 struct sector_ptr *sector;
2254 int stripe = total_sector_nr / rbio->stripe_nsectors;
2255 int sectornr = total_sector_nr % rbio->stripe_nsectors;
2256
2257 sector = rbio_stripe_sector(rbio, stripe, sectornr);
2258 ret = rbio_add_io_sector(rbio, &bio_list, sector,
2259 stripe, sectornr, REQ_OP_READ);
2260 if (ret) {
2261 bio_list_put(&bio_list);
2262 return ret;
2263 }
2264 }
2265
2266 /*
2267 * We may or may not have any corrupted sectors (including missing dev
2268 * and csum mismatch), just let recover_sectors() to handle them all.
2269 */
2270 submit_read_wait_bio_list(rbio, &bio_list);
2271 return recover_sectors(rbio);
2272 }
2273
raid_wait_write_end_io(struct bio * bio)2274 static void raid_wait_write_end_io(struct bio *bio)
2275 {
2276 struct btrfs_raid_bio *rbio = bio->bi_private;
2277
2278 if (bio->bi_status)
2279 rbio_update_error_bitmap(rbio, bio);
2280 bio_put(bio);
2281 if (atomic_dec_and_test(&rbio->stripes_pending))
2282 wake_up(&rbio->io_wait);
2283 }
2284
submit_write_bios(struct btrfs_raid_bio * rbio,struct bio_list * bio_list)2285 static void submit_write_bios(struct btrfs_raid_bio *rbio,
2286 struct bio_list *bio_list)
2287 {
2288 struct bio *bio;
2289
2290 atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
2291 while ((bio = bio_list_pop(bio_list))) {
2292 bio->bi_end_io = raid_wait_write_end_io;
2293
2294 if (trace_raid56_write_enabled()) {
2295 struct raid56_bio_trace_info trace_info = { 0 };
2296
2297 bio_get_trace_info(rbio, bio, &trace_info);
2298 trace_raid56_write(rbio, bio, &trace_info);
2299 }
2300 submit_bio(bio);
2301 }
2302 }
2303
2304 /*
2305 * To determine if we need to read any sector from the disk.
2306 * Should only be utilized in RMW path, to skip cached rbio.
2307 */
need_read_stripe_sectors(struct btrfs_raid_bio * rbio)2308 static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio)
2309 {
2310 int i;
2311
2312 for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) {
2313 struct sector_ptr *sector = &rbio->stripe_sectors[i];
2314
2315 /*
2316 * We have a sector which doesn't have page nor uptodate,
2317 * thus this rbio can not be cached one, as cached one must
2318 * have all its data sectors present and uptodate.
2319 */
2320 if (!sector->has_paddr || !sector->uptodate)
2321 return true;
2322 }
2323 return false;
2324 }
2325
rmw_rbio(struct btrfs_raid_bio * rbio)2326 static void rmw_rbio(struct btrfs_raid_bio *rbio)
2327 {
2328 struct bio_list bio_list;
2329 int sectornr;
2330 int ret = 0;
2331
2332 /*
2333 * Allocate the pages for parity first, as P/Q pages will always be
2334 * needed for both full-stripe and sub-stripe writes.
2335 */
2336 ret = alloc_rbio_parity_pages(rbio);
2337 if (ret < 0)
2338 goto out;
2339
2340 /*
2341 * Either full stripe write, or we have every data sector already
2342 * cached, can go to write path immediately.
2343 */
2344 if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) {
2345 /*
2346 * Now we're doing sub-stripe write, also need all data stripes
2347 * to do the full RMW.
2348 */
2349 ret = alloc_rbio_data_pages(rbio);
2350 if (ret < 0)
2351 goto out;
2352
2353 index_rbio_pages(rbio);
2354
2355 ret = rmw_read_wait_recover(rbio);
2356 if (ret < 0)
2357 goto out;
2358 }
2359
2360 /*
2361 * At this stage we're not allowed to add any new bios to the
2362 * bio list any more, anyone else that wants to change this stripe
2363 * needs to do their own rmw.
2364 */
2365 spin_lock(&rbio->bio_list_lock);
2366 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
2367 spin_unlock(&rbio->bio_list_lock);
2368
2369 bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
2370
2371 index_rbio_pages(rbio);
2372
2373 /*
2374 * We don't cache full rbios because we're assuming
2375 * the higher layers are unlikely to use this area of
2376 * the disk again soon. If they do use it again,
2377 * hopefully they will send another full bio.
2378 */
2379 if (!rbio_is_full(rbio))
2380 cache_rbio_pages(rbio);
2381 else
2382 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2383
2384 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++)
2385 generate_pq_vertical(rbio, sectornr);
2386
2387 bio_list_init(&bio_list);
2388 ret = rmw_assemble_write_bios(rbio, &bio_list);
2389 if (ret < 0)
2390 goto out;
2391
2392 /* We should have at least one bio assembled. */
2393 ASSERT(bio_list_size(&bio_list));
2394 submit_write_bios(rbio, &bio_list);
2395 wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
2396
2397 /* We may have more errors than our tolerance during the read. */
2398 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
2399 int found_errors;
2400
2401 found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL);
2402 if (unlikely(found_errors > rbio->bioc->max_errors)) {
2403 ret = -EIO;
2404 break;
2405 }
2406 }
2407 out:
2408 rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2409 }
2410
rmw_rbio_work(struct work_struct * work)2411 static void rmw_rbio_work(struct work_struct *work)
2412 {
2413 struct btrfs_raid_bio *rbio;
2414
2415 rbio = container_of(work, struct btrfs_raid_bio, work);
2416 if (lock_stripe_add(rbio) == 0)
2417 rmw_rbio(rbio);
2418 }
2419
rmw_rbio_work_locked(struct work_struct * work)2420 static void rmw_rbio_work_locked(struct work_struct *work)
2421 {
2422 rmw_rbio(container_of(work, struct btrfs_raid_bio, work));
2423 }
2424
/*
 * The following code is used to scrub/replace the parity stripe.
 *
 * Caller must have already increased bio_counter for getting @bioc.
 *
 * Note: We need to make sure all the pages added to the scrub/replace
 * raid bio are correct and do not change during the scrub/replace. That
 * is, those pages only hold metadata or file data with checksum.
 */
2434
raid56_parity_alloc_scrub_rbio(struct bio * bio,struct btrfs_io_context * bioc,struct btrfs_device * scrub_dev,unsigned long * dbitmap,int stripe_nsectors)2435 struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
2436 struct btrfs_io_context *bioc,
2437 struct btrfs_device *scrub_dev,
2438 unsigned long *dbitmap, int stripe_nsectors)
2439 {
2440 struct btrfs_fs_info *fs_info = bioc->fs_info;
2441 struct btrfs_raid_bio *rbio;
2442 int i;
2443
2444 rbio = alloc_rbio(fs_info, bioc);
2445 if (IS_ERR(rbio))
2446 return NULL;
2447 bio_list_add(&rbio->bio_list, bio);
2448 /*
2449 * This is a special bio which is used to hold the completion handler
2450 * and make the scrub rbio is similar to the other types
2451 */
2452 ASSERT(!bio->bi_iter.bi_size);
2453 rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2454
2455 /*
2456 * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
2457 * to the end position, so this search can start from the first parity
2458 * stripe.
2459 */
2460 for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
2461 if (bioc->stripes[i].dev == scrub_dev) {
2462 rbio->scrubp = i;
2463 break;
2464 }
2465 }
2466 ASSERT_RBIO_STRIPE(i < rbio->real_stripes, rbio, i);
2467
2468 bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
2469 return rbio;
2470 }
2471
2472 /*
2473 * We just scrub the parity that we have correct data on the same horizontal,
2474 * so we needn't allocate all pages for all the stripes.
2475 */
alloc_rbio_essential_pages(struct btrfs_raid_bio * rbio)2476 static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2477 {
2478 const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
2479 int total_sector_nr;
2480
2481 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2482 total_sector_nr++) {
2483 struct page *page;
2484 int sectornr = total_sector_nr % rbio->stripe_nsectors;
2485 int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT;
2486
2487 if (!test_bit(sectornr, &rbio->dbitmap))
2488 continue;
2489 if (rbio->stripe_pages[index])
2490 continue;
2491 page = alloc_page(GFP_NOFS);
2492 if (!page)
2493 return -ENOMEM;
2494 rbio->stripe_pages[index] = page;
2495 }
2496 index_stripe_sectors(rbio);
2497 return 0;
2498 }
2499
finish_parity_scrub(struct btrfs_raid_bio * rbio)2500 static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
2501 {
2502 struct btrfs_io_context *bioc = rbio->bioc;
2503 const u32 sectorsize = bioc->fs_info->sectorsize;
2504 void **pointers = rbio->finish_pointers;
2505 unsigned long *pbitmap = &rbio->finish_pbitmap;
2506 int nr_data = rbio->nr_data;
2507 int stripe;
2508 int sectornr;
2509 bool has_qstripe;
2510 struct page *page;
2511 struct sector_ptr p_sector = { 0 };
2512 struct sector_ptr q_sector = { 0 };
2513 struct bio_list bio_list;
2514 int is_replace = 0;
2515 int ret;
2516
2517 bio_list_init(&bio_list);
2518
2519 if (rbio->real_stripes - rbio->nr_data == 1)
2520 has_qstripe = false;
2521 else if (rbio->real_stripes - rbio->nr_data == 2)
2522 has_qstripe = true;
2523 else
2524 BUG();
2525
2526 /*
2527 * Replace is running and our P/Q stripe is being replaced, then we
2528 * need to duplicate the final write to replace target.
2529 */
2530 if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) {
2531 is_replace = 1;
2532 bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
2533 }
2534
2535 /*
2536 * Because the higher layers(scrubber) are unlikely to
2537 * use this area of the disk again soon, so don't cache
2538 * it.
2539 */
2540 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2541
2542 page = alloc_page(GFP_NOFS);
2543 if (!page)
2544 return -ENOMEM;
2545 p_sector.has_paddr = true;
2546 p_sector.paddr = page_to_phys(page);
2547 p_sector.uptodate = 1;
2548 page = NULL;
2549
2550 if (has_qstripe) {
2551 /* RAID6, allocate and map temp space for the Q stripe */
2552 page = alloc_page(GFP_NOFS);
2553 if (!page) {
2554 __free_page(phys_to_page(p_sector.paddr));
2555 p_sector.has_paddr = false;
2556 return -ENOMEM;
2557 }
2558 q_sector.has_paddr = true;
2559 q_sector.paddr = page_to_phys(page);
2560 q_sector.uptodate = 1;
2561 page = NULL;
2562 pointers[rbio->real_stripes - 1] = kmap_local_sector(&q_sector);
2563 }
2564
2565 bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
2566
2567 /* Map the parity stripe just once */
2568 pointers[nr_data] = kmap_local_sector(&p_sector);
2569
2570 for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
2571 struct sector_ptr *sector;
2572 void *parity;
2573
2574 /* first collect one page from each data stripe */
2575 for (stripe = 0; stripe < nr_data; stripe++) {
2576 sector = sector_in_rbio(rbio, stripe, sectornr, 0);
2577 pointers[stripe] = kmap_local_sector(sector);
2578 }
2579
2580 if (has_qstripe) {
2581 assert_rbio(rbio);
2582 /* RAID6, call the library function to fill in our P/Q */
2583 raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
2584 pointers);
2585 } else {
2586 /* raid5 */
2587 memcpy(pointers[nr_data], pointers[0], sectorsize);
2588 run_xor(pointers + 1, nr_data - 1, sectorsize);
2589 }
2590
2591 /* Check scrubbing parity and repair it */
2592 sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2593 parity = kmap_local_sector(sector);
2594 if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
2595 memcpy(parity, pointers[rbio->scrubp], sectorsize);
2596 else
2597 /* Parity is right, needn't writeback */
2598 bitmap_clear(&rbio->dbitmap, sectornr, 1);
2599 kunmap_local(parity);
2600
2601 for (stripe = nr_data - 1; stripe >= 0; stripe--)
2602 kunmap_local(pointers[stripe]);
2603 }
2604
2605 kunmap_local(pointers[nr_data]);
2606 __free_page(phys_to_page(p_sector.paddr));
2607 p_sector.has_paddr = false;
2608 if (q_sector.has_paddr) {
2609 __free_page(phys_to_page(q_sector.paddr));
2610 q_sector.has_paddr = false;
2611 }
2612
2613 /*
2614 * time to start writing. Make bios for everything from the
2615 * higher layers (the bio_list in our rbio) and our p/q. Ignore
2616 * everything else.
2617 */
2618 for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
2619 struct sector_ptr *sector;
2620
2621 sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2622 ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
2623 sectornr, REQ_OP_WRITE);
2624 if (ret)
2625 goto cleanup;
2626 }
2627
2628 if (!is_replace)
2629 goto submit_write;
2630
2631 /*
2632 * Replace is running and our parity stripe needs to be duplicated to
2633 * the target device. Check we have a valid source stripe number.
2634 */
2635 ASSERT_RBIO(rbio->bioc->replace_stripe_src >= 0, rbio);
2636 for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
2637 struct sector_ptr *sector;
2638
2639 sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2640 ret = rbio_add_io_sector(rbio, &bio_list, sector,
2641 rbio->real_stripes,
2642 sectornr, REQ_OP_WRITE);
2643 if (ret)
2644 goto cleanup;
2645 }
2646
2647 submit_write:
2648 submit_write_bios(rbio, &bio_list);
2649 return 0;
2650
2651 cleanup:
2652 bio_list_put(&bio_list);
2653 return ret;
2654 }
2655
is_data_stripe(struct btrfs_raid_bio * rbio,int stripe)2656 static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2657 {
2658 if (stripe >= 0 && stripe < rbio->nr_data)
2659 return 1;
2660 return 0;
2661 }
2662
recover_scrub_rbio(struct btrfs_raid_bio * rbio)2663 static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
2664 {
2665 void **pointers = NULL;
2666 void **unmap_array = NULL;
2667 int sector_nr;
2668 int ret = 0;
2669
2670 /*
2671 * @pointers array stores the pointer for each sector.
2672 *
2673 * @unmap_array stores copy of pointers that does not get reordered
2674 * during reconstruction so that kunmap_local works.
2675 */
2676 pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
2677 unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
2678 if (!pointers || !unmap_array) {
2679 ret = -ENOMEM;
2680 goto out;
2681 }
2682
2683 for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2684 int dfail = 0, failp = -1;
2685 int faila;
2686 int failb;
2687 int found_errors;
2688
2689 found_errors = get_rbio_veritical_errors(rbio, sector_nr,
2690 &faila, &failb);
2691 if (unlikely(found_errors > rbio->bioc->max_errors)) {
2692 ret = -EIO;
2693 goto out;
2694 }
2695 if (found_errors == 0)
2696 continue;
2697
2698 /* We should have at least one error here. */
2699 ASSERT(faila >= 0 || failb >= 0);
2700
2701 if (is_data_stripe(rbio, faila))
2702 dfail++;
2703 else if (is_parity_stripe(faila))
2704 failp = faila;
2705
2706 if (is_data_stripe(rbio, failb))
2707 dfail++;
2708 else if (is_parity_stripe(failb))
2709 failp = failb;
2710 /*
2711 * Because we can not use a scrubbing parity to repair the
2712 * data, so the capability of the repair is declined. (In the
2713 * case of RAID5, we can not repair anything.)
2714 */
2715 if (unlikely(dfail > rbio->bioc->max_errors - 1)) {
2716 ret = -EIO;
2717 goto out;
2718 }
2719 /*
2720 * If all data is good, only parity is correctly, just repair
2721 * the parity, no need to recover data stripes.
2722 */
2723 if (dfail == 0)
2724 continue;
2725
2726 /*
2727 * Here means we got one corrupted data stripe and one
2728 * corrupted parity on RAID6, if the corrupted parity is
2729 * scrubbing parity, luckily, use the other one to repair the
2730 * data, or we can not repair the data stripe.
2731 */
2732 if (unlikely(failp != rbio->scrubp)) {
2733 ret = -EIO;
2734 goto out;
2735 }
2736
2737 ret = recover_vertical(rbio, sector_nr, pointers, unmap_array);
2738 if (ret < 0)
2739 goto out;
2740 }
2741 out:
2742 kfree(pointers);
2743 kfree(unmap_array);
2744 return ret;
2745 }
2746
scrub_assemble_read_bios(struct btrfs_raid_bio * rbio)2747 static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio)
2748 {
2749 struct bio_list bio_list = BIO_EMPTY_LIST;
2750 int total_sector_nr;
2751 int ret = 0;
2752
2753 /* Build a list of bios to read all the missing parts. */
2754 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2755 total_sector_nr++) {
2756 int sectornr = total_sector_nr % rbio->stripe_nsectors;
2757 int stripe = total_sector_nr / rbio->stripe_nsectors;
2758 struct sector_ptr *sector;
2759
2760 /* No data in the vertical stripe, no need to read. */
2761 if (!test_bit(sectornr, &rbio->dbitmap))
2762 continue;
2763
2764 /*
2765 * We want to find all the sectors missing from the rbio and
2766 * read them from the disk. If sector_in_rbio() finds a sector
2767 * in the bio list we don't need to read it off the stripe.
2768 */
2769 sector = sector_in_rbio(rbio, stripe, sectornr, 1);
2770 if (sector)
2771 continue;
2772
2773 sector = rbio_stripe_sector(rbio, stripe, sectornr);
2774 /*
2775 * The bio cache may have handed us an uptodate sector. If so,
2776 * use it.
2777 */
2778 if (sector->uptodate)
2779 continue;
2780
2781 ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
2782 sectornr, REQ_OP_READ);
2783 if (ret) {
2784 bio_list_put(&bio_list);
2785 return ret;
2786 }
2787 }
2788
2789 submit_read_wait_bio_list(rbio, &bio_list);
2790 return 0;
2791 }
2792
scrub_rbio(struct btrfs_raid_bio * rbio)2793 static void scrub_rbio(struct btrfs_raid_bio *rbio)
2794 {
2795 int sector_nr;
2796 int ret;
2797
2798 ret = alloc_rbio_essential_pages(rbio);
2799 if (ret)
2800 goto out;
2801
2802 bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
2803
2804 ret = scrub_assemble_read_bios(rbio);
2805 if (ret < 0)
2806 goto out;
2807
2808 /* We may have some failures, recover the failed sectors first. */
2809 ret = recover_scrub_rbio(rbio);
2810 if (ret < 0)
2811 goto out;
2812
2813 /*
2814 * We have every sector properly prepared. Can finish the scrub
2815 * and writeback the good content.
2816 */
2817 ret = finish_parity_scrub(rbio);
2818 wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
2819 for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2820 int found_errors;
2821
2822 found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL);
2823 if (unlikely(found_errors > rbio->bioc->max_errors)) {
2824 ret = -EIO;
2825 break;
2826 }
2827 }
2828 out:
2829 rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2830 }
2831
scrub_rbio_work_locked(struct work_struct * work)2832 static void scrub_rbio_work_locked(struct work_struct *work)
2833 {
2834 scrub_rbio(container_of(work, struct btrfs_raid_bio, work));
2835 }
2836
raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio * rbio)2837 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
2838 {
2839 if (!lock_stripe_add(rbio))
2840 start_async_work(rbio, scrub_rbio_work_locked);
2841 }
2842
2843 /*
2844 * This is for scrub call sites where we already have correct data contents.
2845 * This allows us to avoid reading data stripes again.
2846 *
2847 * Unfortunately here we have to do folio copy, other than reusing the pages.
2848 * This is due to the fact rbio has its own page management for its cache.
2849 */
raid56_parity_cache_data_folios(struct btrfs_raid_bio * rbio,struct folio ** data_folios,u64 data_logical)2850 void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio,
2851 struct folio **data_folios, u64 data_logical)
2852 {
2853 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
2854 const u64 offset_in_full_stripe = data_logical -
2855 rbio->bioc->full_stripe_logical;
2856 unsigned int findex = 0;
2857 unsigned int foffset = 0;
2858 int ret;
2859
2860 /* We shouldn't hit RAID56 for bs > ps cases for now. */
2861 ASSERT(fs_info->sectorsize <= PAGE_SIZE);
2862
2863 /*
2864 * If we hit ENOMEM temporarily, but later at
2865 * raid56_parity_submit_scrub_rbio() time it succeeded, we just do
2866 * the extra read, not a big deal.
2867 *
2868 * If we hit ENOMEM later at raid56_parity_submit_scrub_rbio() time,
2869 * the bio would got proper error number set.
2870 */
2871 ret = alloc_rbio_data_pages(rbio);
2872 if (ret < 0)
2873 return;
2874
2875 /* data_logical must be at stripe boundary and inside the full stripe. */
2876 ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN));
2877 ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT));
2878
2879 for (unsigned int cur_off = offset_in_full_stripe;
2880 cur_off < offset_in_full_stripe + BTRFS_STRIPE_LEN;
2881 cur_off += PAGE_SIZE) {
2882 const unsigned int pindex = cur_off >> PAGE_SHIFT;
2883 void *kaddr;
2884
2885 kaddr = kmap_local_page(rbio->stripe_pages[pindex]);
2886 memcpy_from_folio(kaddr, data_folios[findex], foffset, PAGE_SIZE);
2887 kunmap_local(kaddr);
2888
2889 foffset += PAGE_SIZE;
2890 ASSERT(foffset <= folio_size(data_folios[findex]));
2891 if (foffset == folio_size(data_folios[findex])) {
2892 findex++;
2893 foffset = 0;
2894 }
2895 }
2896 for (unsigned int sector_nr = offset_in_full_stripe >> fs_info->sectorsize_bits;
2897 sector_nr < (offset_in_full_stripe + BTRFS_STRIPE_LEN) >> fs_info->sectorsize_bits;
2898 sector_nr++)
2899 rbio->stripe_sectors[sector_nr].uptodate = true;
2900 }
2901