// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2012 Fusion-io  All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/mm.h>
#include "messages.h"
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "file-item.h"
#include "btrfs_inode.h"

/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT	1

/*
 * set when this rbio is sitting in the hash, but it is just a cache
 * of past RMW
 */
#define RBIO_CACHE_BIT		2

/*
 * set when it is safe to trust the stripe_pages for caching
 */
#define RBIO_CACHE_READY_BIT	3

#define RBIO_CACHE_SIZE 1024

#define BTRFS_STRIPE_HASH_TABLE_BITS	11

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash {
	struct list_head hash_list;
	spinlock_t lock;
};

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash_table {
	struct list_head stripe_cache;
	spinlock_t cache_lock;
	int cache_size;
	struct btrfs_stripe_hash table[];
};

/*
 * A bvec like structure to present a sector inside a page.
 *
 * Unlike bvec we don't need bvlen, as it's fixed to sectorsize.
 */
struct sector_ptr {
	struct page *page;
	unsigned int pgoff:24;
	unsigned int uptodate:8;
};

static void rmw_rbio_work(struct work_struct *work);
static void rmw_rbio_work_locked(struct work_struct *work);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);

static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check);
static void scrub_rbio_work_locked(struct work_struct *work);

static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio)
{
	bitmap_free(rbio->error_bitmap);
	kfree(rbio->stripe_pages);
	kfree(rbio->bio_sectors);
	kfree(rbio->stripe_sectors);
	kfree(rbio->finish_pointers);
}

static void free_raid_bio(struct btrfs_raid_bio *rbio)
{
	int i;

	if (!refcount_dec_and_test(&rbio->refs))
		return;

	WARN_ON(!list_empty(&rbio->stripe_cache));
	WARN_ON(!list_empty(&rbio->hash_list));
	WARN_ON(!bio_list_empty(&rbio->bio_list));

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i]) {
			__free_page(rbio->stripe_pages[i]);
			rbio->stripe_pages[i] = NULL;
		}
	}

	btrfs_put_bioc(rbio->bioc);
	free_raid_bio_pointers(rbio);
	kfree(rbio);
}

static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
{
	INIT_WORK(&rbio->work, work_func);
	queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
}

/*
 * the stripe hash table is used for locking, and to collect
 * bios in hopes of making a full stripe
 */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash_table *x;
	struct btrfs_stripe_hash *cur;
	struct btrfs_stripe_hash *h;
	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
	int i;

	if (info->stripe_hash_table)
		return 0;

	/*
	 * The table is large, starting with order 4 and can go as high as
	 * order 7 in case lock debugging is turned on.
	 *
	 * Try harder to allocate and fallback to vmalloc to lower the chance
	 * of a failing mount.
	 */
	table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
	if (!table)
		return -ENOMEM;

	spin_lock_init(&table->cache_lock);
	INIT_LIST_HEAD(&table->stripe_cache);

	h = table->table;

	for (i = 0; i < num_entries; i++) {
		cur = h + i;
		INIT_LIST_HEAD(&cur->hash_list);
		spin_lock_init(&cur->lock);
	}

	x = cmpxchg(&info->stripe_hash_table, NULL, table);
	kvfree(x);
	return 0;
}
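
/*
 * Rough sizing sketch for the allocation above (assuming 4K pages, not
 * from the original source): 2048 buckets times sizeof(struct
 * btrfs_stripe_hash) is around 48K, hence order 4.  With lock debugging
 * each spinlock grows several-fold, pushing the table into the
 * hundreds of kilobytes, hence order 7.
 */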

/*
 * caching an rbio means to copy anything from the
 * bio_sectors array into the stripe_pages array.  We
 * use the page uptodate bit in the stripe cache array
 * to indicate if it has valid data
 *
 * once the caching is done, we set the cache ready
 * bit.
 */
static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	int ret;

	ret = alloc_rbio_pages(rbio);
	if (ret)
		return;

	for (i = 0; i < rbio->nr_sectors; i++) {
		/* Some range not covered by bio (partial write), skip it */
		if (!rbio->bio_sectors[i].page) {
			/*
			 * Even if the sector is not covered by bio, if it is
			 * a data sector it should still be uptodate as it is
			 * read from disk.
			 */
			if (i < rbio->nr_data * rbio->stripe_nsectors)
				ASSERT(rbio->stripe_sectors[i].uptodate);
			continue;
		}

		ASSERT(rbio->stripe_sectors[i].page);
		memcpy_page(rbio->stripe_sectors[i].page,
			    rbio->stripe_sectors[i].pgoff,
			    rbio->bio_sectors[i].page,
			    rbio->bio_sectors[i].pgoff,
			    rbio->bioc->fs_info->sectorsize);
		rbio->stripe_sectors[i].uptodate = 1;
	}
	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
}

/*
 * we hash on the first logical address of the stripe
 */
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
	u64 num = rbio->bioc->raid_map[0];

	/*
	 * we shift down quite a bit.  We're using byte
	 * addressing, and most of the lower bits are zeros.
	 * This tends to upset hash_64, and it consistently
	 * returns just one or two different values.
	 *
	 * shifting off the lower bits fixes things.
	 */
	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}

static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
				       unsigned int page_nr)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
	int i;

	ASSERT(page_nr < rbio->nr_pages);

	for (i = sectors_per_page * page_nr;
	     i < sectors_per_page * page_nr + sectors_per_page;
	     i++) {
		if (!rbio->stripe_sectors[i].uptodate)
			return false;
	}
	return true;
}

/*
 * Update the stripe_sectors[] array to use correct page and pgoff
 *
 * Should be called every time any page pointer in stripe_pages[] is modified.
 */
static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	u32 offset;
	int i;

	for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
		int page_index = offset >> PAGE_SHIFT;

		ASSERT(page_index < rbio->nr_pages);
		rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index];
		rbio->stripe_sectors[i].pgoff = offset_in_page(offset);
	}
}
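
/*
 * Example of the mapping above (not from the original source), assuming
 * 4K sectorsize: with 4K pages it degenerates to sector i ->
 * stripe_pages[i] at pgoff 0, while with 64K pages sectors 0-15 all map
 * to stripe_pages[0] at pgoff 0, 4K, 8K, ..., 60K, and sector 16 starts
 * stripe_pages[1].
 */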

static void steal_rbio_page(struct btrfs_raid_bio *src,
			    struct btrfs_raid_bio *dest, int page_nr)
{
	const u32 sectorsize = src->bioc->fs_info->sectorsize;
	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
	int i;

	if (dest->stripe_pages[page_nr])
		__free_page(dest->stripe_pages[page_nr]);
	dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
	src->stripe_pages[page_nr] = NULL;

	/* Also update the sector->uptodate bits. */
	for (i = sectors_per_page * page_nr;
	     i < sectors_per_page * page_nr + sectors_per_page; i++)
		dest->stripe_sectors[i].uptodate = true;
}

static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr)
{
	const int sector_nr = (page_nr << PAGE_SHIFT) >>
			      rbio->bioc->fs_info->sectorsize_bits;

	/*
	 * We have ensured PAGE_SIZE is aligned with sectorsize, thus
	 * we won't have a page which is half data half parity.
	 *
	 * Thus if the first sector of the page belongs to data stripes, then
	 * the full page belongs to data stripes.
	 */
	return (sector_nr < rbio->nr_data * rbio->stripe_nsectors);
}

/*
 * Stealing an rbio means taking all the uptodate pages from the stripe array
 * in the source rbio and putting them into the destination rbio.
 *
 * This will also update the involved stripe_sectors[] which are referring to
 * the old pages.
 */
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
	int i;

	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
		return;

	for (i = 0; i < dest->nr_pages; i++) {
		struct page *p = src->stripe_pages[i];

		/*
		 * We don't need to steal P/Q pages as they will always be
		 * regenerated for RMW or full write anyway.
		 */
		if (!is_data_stripe_page(src, i))
			continue;

		/*
		 * If @src already has RBIO_CACHE_READY_BIT, it should have
		 * all data stripe pages present and uptodate.
		 */
		ASSERT(p);
		ASSERT(full_page_sectors_uptodate(src, i));
		steal_rbio_page(src, dest, i);
	}
	index_stripe_sectors(dest);
	index_stripe_sectors(src);
}

/*
 * merging means we take the bio_list from the victim and
 * splice it into the destination.  The victim should
 * be discarded afterwards.
 *
 * must be called with dest->rbio_list_lock held
 */
static void merge_rbio(struct btrfs_raid_bio *dest,
		       struct btrfs_raid_bio *victim)
{
	bio_list_merge(&dest->bio_list, &victim->bio_list);
	dest->bio_list_bytes += victim->bio_list_bytes;
	/* Also inherit the bitmaps from @victim. */
	bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
		  dest->stripe_nsectors);
	bio_list_init(&victim->bio_list);
}

/*
 * used to prune items that are in the cache.  The caller
 * must hold the hash table lock.
 */
static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	int bucket = rbio_bucket(rbio);
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash *h;
	int freeit = 0;

	/*
	 * check the bit again under the hash table lock.
	 */
	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;
	h = table->table + bucket;

	/* hold the lock for the bucket because we may be
	 * removing it from the hash table
	 */
	spin_lock(&h->lock);

	/*
	 * hold the lock for the bio list because we need
	 * to make sure the bio list is empty
	 */
	spin_lock(&rbio->bio_list_lock);

	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
		list_del_init(&rbio->stripe_cache);
		table->cache_size -= 1;
		freeit = 1;

		/* if the bio list isn't empty, this rbio is
		 * still involved in an IO.  We take it out
		 * of the cache list, and drop the ref that
		 * was held for the list.
		 *
		 * If the bio_list was empty, we also remove
		 * the rbio from the hash_table, and drop
		 * the corresponding ref
		 */
		if (bio_list_empty(&rbio->bio_list)) {
			if (!list_empty(&rbio->hash_list)) {
				list_del_init(&rbio->hash_list);
				refcount_dec(&rbio->refs);
				BUG_ON(!list_empty(&rbio->plug_list));
			}
		}
	}

	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);

	if (freeit)
		free_raid_bio(rbio);
}

/*
 * prune a given rbio from the cache
 */
static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;

	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	__remove_rbio_from_cache(rbio);
	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * remove everything in the cache
 */
static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;
	struct btrfs_raid_bio *rbio;

	table = info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	while (!list_empty(&table->stripe_cache)) {
		rbio = list_entry(table->stripe_cache.next,
				  struct btrfs_raid_bio,
				  stripe_cache);
		__remove_rbio_from_cache(rbio);
	}
	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * remove all cached entries and free the hash table
 * used by unmount
 */
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
{
	if (!info->stripe_hash_table)
		return;
	btrfs_clear_rbio_cache(info);
	kvfree(info->stripe_hash_table);
	info->stripe_hash_table = NULL;
}

/*
 * insert an rbio into the stripe cache.  It
 * must have already been prepared by calling
 * cache_rbio_pages
 *
 * If this rbio was already cached, it gets
 * moved to the front of the lru.
 *
 * If the size of the rbio cache is too big, we
 * prune an item.
 */
static void cache_rbio(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;

	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	spin_lock(&rbio->bio_list_lock);

	/* bump our ref if we were not in the list before */
	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
		refcount_inc(&rbio->refs);

	if (!list_empty(&rbio->stripe_cache)) {
		list_move(&rbio->stripe_cache, &table->stripe_cache);
	} else {
		list_add(&rbio->stripe_cache, &table->stripe_cache);
		table->cache_size += 1;
	}

	spin_unlock(&rbio->bio_list_lock);

	if (table->cache_size > RBIO_CACHE_SIZE) {
		struct btrfs_raid_bio *found;

		found = list_entry(table->stripe_cache.prev,
				   struct btrfs_raid_bio,
				   stripe_cache);

		if (found != rbio)
			__remove_rbio_from_cache(found);
	}

	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * helper function to run the xor_blocks api.  It is only
 * able to do MAX_XOR_BLOCKS at a time, so we need to
 * loop through.
 */
static void run_xor(void **pages, int src_cnt, ssize_t len)
{
	int src_off = 0;
	int xor_src_cnt = 0;
	void *dest = pages[src_cnt];

	while (src_cnt > 0) {
		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
		xor_blocks(xor_src_cnt, len, dest, pages + src_off);

		src_cnt -= xor_src_cnt;
		src_off += xor_src_cnt;
	}
}

/*
 * Returns true if the bio list inside this rbio covers an entire stripe (no
 * rmw required).
 */
static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long flags;
	unsigned long size = rbio->bio_list_bytes;
	int ret = 1;

	spin_lock_irqsave(&rbio->bio_list_lock, flags);
	if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
		ret = 0;
	BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);

	return ret;
}
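
/*
 * Example (not from the original source): on a 4-disk RAID5 (3 data +
 * 1 parity) the full stripe covers 3 * BTRFS_STRIPE_LEN = 192K of
 * logical space, so the rbio is only "full" once the queued bios add up
 * to exactly 192K and the stripe can be written without reading the
 * missing sectors first.
 */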

/*
 * returns 1 if it is safe to merge two rbios together.
 * The merging is safe if the two rbios correspond to
 * the same stripe and if they are both going in the same
 * direction (read vs write), and if neither one is
 * locked for final IO
 *
 * The caller is responsible for locking such that
 * rmw_locked is safe to test
 */
static int rbio_can_merge(struct btrfs_raid_bio *last,
			  struct btrfs_raid_bio *cur)
{
	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
		return 0;

	/*
	 * we can't merge with cached rbios, since the
	 * idea is that when we merge the destination
	 * rbio is going to run our IO for us.  We can
	 * steal from cached rbios though, other functions
	 * handle that.
	 */
	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
	    test_bit(RBIO_CACHE_BIT, &cur->flags))
		return 0;

	if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
		return 0;

	/* we can't merge with different operations */
	if (last->operation != cur->operation)
		return 0;
	/*
	 * For parity scrub we need to read the full stripe from the drive,
	 * then check and repair the parity and write out the new results.
	 *
	 * We're not allowed to add any new bios to the bio list here;
	 * anyone else that wants to change this stripe needs to do their
	 * own rmw.
	 */
	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
		return 0;

	if (last->operation == BTRFS_RBIO_REBUILD_MISSING ||
	    last->operation == BTRFS_RBIO_READ_REBUILD)
		return 0;

	return 1;
}

static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
					     unsigned int stripe_nr,
					     unsigned int sector_nr)
{
	ASSERT(stripe_nr < rbio->real_stripes);
	ASSERT(sector_nr < rbio->stripe_nsectors);

	return stripe_nr * rbio->stripe_nsectors + sector_nr;
}

/* Return a sector from rbio->stripe_sectors, not from the bio list */
static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
					     unsigned int stripe_nr,
					     unsigned int sector_nr)
{
	return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
							      sector_nr)];
}

/* Grab a sector inside P stripe */
static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
					      unsigned int sector_nr)
{
	return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
}

/* Grab a sector inside Q stripe, return NULL if not RAID6 */
static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
					      unsigned int sector_nr)
{
	if (rbio->nr_data + 1 == rbio->real_stripes)
		return NULL;
	return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
}
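
/*
 * Index example for the helpers above (not from the original source):
 * with 4K sectorsize (stripe_nsectors = 16) on a 6-disk RAID6
 * (nr_data = 4), the sector at stripe_nr = 2, sector_nr = 5 lives at
 * stripe_sectors[2 * 16 + 5] = stripe_sectors[37], the P stripe starts
 * at index 4 * 16 = 64 and the Q stripe at 5 * 16 = 80.
 */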

/*
 * The first stripe in the table for a logical address
 * has the lock.  rbios are added in one of three ways:
 *
 * 1) Nobody has the stripe locked yet.  The rbio is given
 * the lock and 0 is returned.  The caller must start the IO
 * themselves.
 *
 * 2) Someone has the stripe locked, but we're able to merge
 * with the lock owner.  The rbio is freed and the IO will
 * start automatically along with the existing rbio.  1 is returned.
 *
 * 3) Someone has the stripe locked, but we're not able to merge.
 * The rbio is added to the lock owner's plug list, or merged into
 * an rbio already on the plug list.  When the lock owner unlocks,
 * the next rbio on the list is run and the IO is started automatically.
 * 1 is returned
 *
 * If we return 0, the caller still owns the rbio and must continue with
 * IO submission.  If we return 1, the caller must assume the rbio has
 * already been freed.
 */
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash *h;
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *pending;
	unsigned long flags;
	struct btrfs_raid_bio *freeit = NULL;
	struct btrfs_raid_bio *cache_drop = NULL;
	int ret = 0;

	h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);

	spin_lock_irqsave(&h->lock, flags);
	list_for_each_entry(cur, &h->hash_list, hash_list) {
		if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0])
			continue;

		spin_lock(&cur->bio_list_lock);

		/* Can we steal this cached rbio's pages? */
		if (bio_list_empty(&cur->bio_list) &&
		    list_empty(&cur->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
		    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
			list_del_init(&cur->hash_list);
			refcount_dec(&cur->refs);

			steal_rbio(cur, rbio);
			cache_drop = cur;
			spin_unlock(&cur->bio_list_lock);

			goto lockit;
		}

		/* Can we merge into the lock owner? */
		if (rbio_can_merge(cur, rbio)) {
			merge_rbio(cur, rbio);
			spin_unlock(&cur->bio_list_lock);
			freeit = rbio;
			ret = 1;
			goto out;
		}

		/*
		 * We couldn't merge with the running rbio, see if we can merge
		 * with the pending ones.  We don't have to check for rmw_locked
		 * because there is no way they are inside finish_rmw right now
		 */
		list_for_each_entry(pending, &cur->plug_list, plug_list) {
			if (rbio_can_merge(pending, rbio)) {
				merge_rbio(pending, rbio);
				spin_unlock(&cur->bio_list_lock);
				freeit = rbio;
				ret = 1;
				goto out;
			}
		}

		/*
		 * No merging, put us on the tail of the plug list, our rbio
		 * will be started when the currently running rbio unlocks
		 */
		list_add_tail(&rbio->plug_list, &cur->plug_list);
		spin_unlock(&cur->bio_list_lock);
		ret = 1;
		goto out;
	}
lockit:
	refcount_inc(&rbio->refs);
	list_add(&rbio->hash_list, &h->hash_list);
out:
	spin_unlock_irqrestore(&h->lock, flags);
	if (cache_drop)
		remove_rbio_from_cache(cache_drop);
	if (freeit)
		free_raid_bio(freeit);
	return ret;
}

static void recover_rbio_work_locked(struct work_struct *work);

/*
 * called as rmw or parity rebuild is completed.  If the plug list has more
 * rbios waiting for this stripe, the next one on the list will be started
 */
static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
{
	int bucket;
	struct btrfs_stripe_hash *h;
	unsigned long flags;
	int keep_cache = 0;

	bucket = rbio_bucket(rbio);
	h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;

	if (list_empty(&rbio->plug_list))
		cache_rbio(rbio);

	spin_lock_irqsave(&h->lock, flags);
	spin_lock(&rbio->bio_list_lock);

	if (!list_empty(&rbio->hash_list)) {
		/*
		 * if we're still cached and there is no other IO
		 * to perform, just leave this rbio here for others
		 * to steal from later
		 */
		if (list_empty(&rbio->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
			keep_cache = 1;
			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
			BUG_ON(!bio_list_empty(&rbio->bio_list));
			goto done;
		}

		list_del_init(&rbio->hash_list);
		refcount_dec(&rbio->refs);

		/*
		 * we use the plug list to hold all the rbios
		 * waiting for the chance to lock this stripe.
		 * hand the lock over to one of them.
		 */
		if (!list_empty(&rbio->plug_list)) {
			struct btrfs_raid_bio *next;
			struct list_head *head = rbio->plug_list.next;

			next = list_entry(head, struct btrfs_raid_bio,
					  plug_list);

			list_del_init(&rbio->plug_list);

			list_add(&next->hash_list, &h->hash_list);
			refcount_inc(&next->refs);
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock_irqrestore(&h->lock, flags);

			if (next->operation == BTRFS_RBIO_READ_REBUILD)
				start_async_work(next, recover_rbio_work_locked);
			else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
				steal_rbio(rbio, next);
				start_async_work(next, recover_rbio_work_locked);
			} else if (next->operation == BTRFS_RBIO_WRITE) {
				steal_rbio(rbio, next);
				start_async_work(next, rmw_rbio_work_locked);
			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
				steal_rbio(rbio, next);
				start_async_work(next, scrub_rbio_work_locked);
			}

			goto done_nolock;
		}
	}
done:
	spin_unlock(&rbio->bio_list_lock);
	spin_unlock_irqrestore(&h->lock, flags);

done_nolock:
	if (!keep_cache)
		remove_rbio_from_cache(rbio);
}

static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
{
	struct bio *next;

	while (cur) {
		next = cur->bi_next;
		cur->bi_next = NULL;
		cur->bi_status = err;
		bio_endio(cur);
		cur = next;
	}
}

/*
 * this frees the rbio and runs through all the bios in the
 * bio_list and calls end_io on them
 */
static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
{
	struct bio *cur = bio_list_get(&rbio->bio_list);
	struct bio *extra;

	kfree(rbio->csum_buf);
	bitmap_free(rbio->csum_bitmap);
	rbio->csum_buf = NULL;
	rbio->csum_bitmap = NULL;

	/*
	 * Clear the data bitmap, as the rbio may be cached for later usage.
	 * Do this before unlock_stripe() so there will be no new bio
	 * for this rbio.
	 */
	bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);

	/*
	 * At this moment, rbio->bio_list is empty, however since rbio does not
	 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
	 * hash list, rbio may be merged with others so that rbio->bio_list
	 * becomes non-empty.
	 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
	 * more and we can call bio_endio() on all queued bios.
	 */
	unlock_stripe(rbio);
	extra = bio_list_get(&rbio->bio_list);
	free_raid_bio(rbio);

	rbio_endio_bio_list(cur, err);
	if (extra)
		rbio_endio_bio_list(extra, err);
}

/*
 * Get a sector pointer specified by its @stripe_nr and @sector_nr.
 *
 * @rbio:               The raid bio
 * @stripe_nr:          Stripe number, valid range [0, real_stripes)
 * @sector_nr:          Sector number inside the stripe,
 *                      valid range [0, stripe_nsectors)
 * @bio_list_only:      Whether to use sectors inside the bio list only.
 *
 * The read/modify/write code wants to reuse the original bio page as much
 * as possible, and only use stripe_sectors as fallback.
 */
static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
					 int stripe_nr, int sector_nr,
					 bool bio_list_only)
{
	struct sector_ptr *sector;
	int index;

	ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes);
	ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);

	index = stripe_nr * rbio->stripe_nsectors + sector_nr;
	ASSERT(index >= 0 && index < rbio->nr_sectors);

	spin_lock_irq(&rbio->bio_list_lock);
	sector = &rbio->bio_sectors[index];
	if (sector->page || bio_list_only) {
		/* Don't return sector without a valid page pointer */
		if (!sector->page)
			sector = NULL;
		spin_unlock_irq(&rbio->bio_list_lock);
		return sector;
	}
	spin_unlock_irq(&rbio->bio_list_lock);

	return &rbio->stripe_sectors[index];
}
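
/*
 * Usage sketch (not from the original source): the RMW path calls
 * sector_in_rbio(rbio, stripe, nr, 0) so it works on the caller's pages
 * directly when the bio list covers the sector and falls back to the
 * stripe_sectors copy otherwise, while bio_list_only = 1 answers "did
 * the higher layer hand us this sector at all?", returning NULL when it
 * did not.
 */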

/*
 * allocation and initial setup for the btrfs_raid_bio.  Note that
 * this does not allocate any pages for rbio->stripe_pages.
 */
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
					 struct btrfs_io_context *bioc)
{
	const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
	const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
	const unsigned int num_pages = stripe_npages * real_stripes;
	const unsigned int stripe_nsectors =
		BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
	const unsigned int num_sectors = stripe_nsectors * real_stripes;
	struct btrfs_raid_bio *rbio;

	/* PAGE_SIZE must also be aligned to sectorsize for subpage support */
	ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
	/*
	 * Our current stripe len should be fixed to 64k thus stripe_nsectors
	 * (at most 16) should be no larger than BITS_PER_LONG.
	 */
	ASSERT(stripe_nsectors <= BITS_PER_LONG);

	rbio = kzalloc(sizeof(*rbio), GFP_NOFS);
	if (!rbio)
		return ERR_PTR(-ENOMEM);
	rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *),
				     GFP_NOFS);
	rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
				    GFP_NOFS);
	rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
				       GFP_NOFS);
	rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS);
	rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);

	if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors ||
	    !rbio->finish_pointers || !rbio->error_bitmap) {
		free_raid_bio_pointers(rbio);
		kfree(rbio);
		return ERR_PTR(-ENOMEM);
	}

	bio_list_init(&rbio->bio_list);
	init_waitqueue_head(&rbio->io_wait);
	INIT_LIST_HEAD(&rbio->plug_list);
	spin_lock_init(&rbio->bio_list_lock);
	INIT_LIST_HEAD(&rbio->stripe_cache);
	INIT_LIST_HEAD(&rbio->hash_list);
	btrfs_get_bioc(bioc);
	rbio->bioc = bioc;
	rbio->nr_pages = num_pages;
	rbio->nr_sectors = num_sectors;
	rbio->real_stripes = real_stripes;
	rbio->stripe_npages = stripe_npages;
	rbio->stripe_nsectors = stripe_nsectors;
	refcount_set(&rbio->refs, 1);
	atomic_set(&rbio->stripes_pending, 0);

	ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
	rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);

	return rbio;
}
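
/*
 * Size example (not from the original source), assuming 4K pages and 4K
 * sectorsize on a 6-disk RAID6 without a replace target:
 * real_stripes = 6, stripe_npages = 16, num_pages = 96,
 * stripe_nsectors = 16 and num_sectors = 96, i.e. the rbio tracks one
 * full 64K stripe per disk.
 */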

/* allocate pages for all the stripes in the bio, including parity */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages);
	if (ret < 0)
		return ret;
	/* Mapping all sectors */
	index_stripe_sectors(rbio);
	return 0;
}

/* only allocate pages for p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
	const int data_pages = rbio->nr_data * rbio->stripe_npages;
	int ret;

	ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
				     rbio->stripe_pages + data_pages);
	if (ret < 0)
		return ret;

	index_stripe_sectors(rbio);
	return 0;
}

/*
 * Return the total number of errors found in the vertical stripe of @sector_nr.
 *
 * @faila and @failb will also be updated to the first and second stripe
 * number of the errors.
 */
static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
				     int *faila, int *failb)
{
	int stripe_nr;
	int found_errors = 0;

	if (faila || failb) {
		/*
		 * Both @faila and @failb should be valid pointers if any of
		 * them is specified.
		 */
		ASSERT(faila && failb);
		*faila = -1;
		*failb = -1;
	}

	for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
		int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr;

		if (test_bit(total_sector_nr, rbio->error_bitmap)) {
			found_errors++;
			if (faila) {
				/* Update faila and failb. */
				if (*faila < 0)
					*faila = stripe_nr;
				else if (*failb < 0)
					*failb = stripe_nr;
			}
		}
	}
	return found_errors;
}
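
/*
 * Example (not from the original source): for RAID6 bioc->max_errors is
 * 2, so a vertical stripe with one bad data sector and one bad parity
 * sector is still recoverable (faila/failb name the two failed stripe
 * numbers), while a third error in the same vertical stripe makes it
 * unrecoverable.
 */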

/*
 * Add a single sector @sector into our list of bios for IO.
 *
 * Return 0 if everything went well.
 * Return <0 for error.
 */
static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
			      struct bio_list *bio_list,
			      struct sector_ptr *sector,
			      unsigned int stripe_nr,
			      unsigned int sector_nr,
			      enum req_op op)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	struct bio *last = bio_list->tail;
	int ret;
	struct bio *bio;
	struct btrfs_io_stripe *stripe;
	u64 disk_start;

	/*
	 * Note: here stripe_nr has taken device replace into consideration,
	 * thus it can be larger than rbio->real_stripes.
	 * So here we check against bioc->num_stripes, not rbio->real_stripes.
	 */
	ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes);
	ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
	ASSERT(sector->page);

	stripe = &rbio->bioc->stripes[stripe_nr];
	disk_start = stripe->physical + sector_nr * sectorsize;

	/* if the device is missing, just fail this stripe */
	if (!stripe->dev->bdev) {
		int found_errors;

		set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr,
			rbio->error_bitmap);

		/* Check if we have reached tolerance early. */
		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
							 NULL, NULL);
		if (found_errors > rbio->bioc->max_errors)
			return -EIO;
		return 0;
	}

	/* see if we can add this page onto our existing bio */
	if (last) {
		u64 last_end = last->bi_iter.bi_sector << 9;
		last_end += last->bi_iter.bi_size;

		/*
		 * we can't merge these if they are from different
		 * devices or if they are not contiguous
		 */
		if (last_end == disk_start && !last->bi_status &&
		    last->bi_bdev == stripe->dev->bdev) {
			ret = bio_add_page(last, sector->page, sectorsize,
					   sector->pgoff);
			if (ret == sectorsize)
				return 0;
		}
	}

	/* put a new bio on the list */
	bio = bio_alloc(stripe->dev->bdev,
			max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1),
			op, GFP_NOFS);
	bio->bi_iter.bi_sector = disk_start >> 9;
	bio->bi_private = rbio;

	bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
	bio_list_add(bio_list, bio);
	return 0;
}
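
/*
 * Merging example (not from the original source): queueing sectors 0
 * and 1 of the same stripe back to back yields contiguous disk_start
 * values (physical + 0 and physical + 4K with 4K sectorsize), so the
 * second sector is appended to the first bio instead of allocating a
 * new one.
 */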
114453b381b3SDavid Woodhouse *
114553b381b3SDavid Woodhouse * This must be called before you trust the answers from sector_in_rbio
114653b381b3SDavid Woodhouse */
114753b381b3SDavid Woodhouse static void index_rbio_pages(struct btrfs_raid_bio *rbio)
114853b381b3SDavid Woodhouse {
114953b381b3SDavid Woodhouse struct bio *bio;
115053b381b3SDavid Woodhouse 
115153b381b3SDavid Woodhouse spin_lock_irq(&rbio->bio_list_lock);
115200425dd9SQu Wenruo bio_list_for_each(bio, &rbio->bio_list)
115300425dd9SQu Wenruo index_one_bio(rbio, bio);
115400425dd9SQu Wenruo 
115553b381b3SDavid Woodhouse spin_unlock_irq(&rbio->bio_list_lock);
115653b381b3SDavid Woodhouse }
115753b381b3SDavid Woodhouse 
1158b8bea09aSQu Wenruo static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
1159b8bea09aSQu Wenruo struct raid56_bio_trace_info *trace_info)
1160b8bea09aSQu Wenruo {
1161b8bea09aSQu Wenruo const struct btrfs_io_context *bioc = rbio->bioc;
1162b8bea09aSQu Wenruo int i;
1163b8bea09aSQu Wenruo 
1164b8bea09aSQu Wenruo ASSERT(bioc);
1165b8bea09aSQu Wenruo 
1166b8bea09aSQu Wenruo /* We rely on bio->bi_bdev to find the stripe number. */
1167b8bea09aSQu Wenruo if (!bio->bi_bdev)
1168b8bea09aSQu Wenruo goto not_found;
1169b8bea09aSQu Wenruo 
1170b8bea09aSQu Wenruo for (i = 0; i < bioc->num_stripes; i++) {
1171b8bea09aSQu Wenruo if (bio->bi_bdev != bioc->stripes[i].dev->bdev)
1172b8bea09aSQu Wenruo continue;
1173b8bea09aSQu Wenruo trace_info->stripe_nr = i;
1174b8bea09aSQu Wenruo trace_info->devid = bioc->stripes[i].dev->devid;
1175b8bea09aSQu Wenruo trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1176b8bea09aSQu Wenruo bioc->stripes[i].physical;
1177b8bea09aSQu Wenruo return;
1178b8bea09aSQu Wenruo }
1179b8bea09aSQu Wenruo 
1180b8bea09aSQu Wenruo not_found:
1181b8bea09aSQu Wenruo trace_info->devid = -1;
1182b8bea09aSQu Wenruo trace_info->offset = -1;
1183b8bea09aSQu Wenruo trace_info->stripe_nr = -1;
1184b8bea09aSQu Wenruo }
1185b8bea09aSQu Wenruo 
118667da05b3SColin Ian King /* Generate PQ for one vertical stripe.
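* For RAID5 this is a plain XOR of the data sectors into P; for RAID6,
* raid6_call.gen_syndrome() fills both P and Q in one pass.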
*/
118730e3c897SQu Wenruo static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
118830e3c897SQu Wenruo {
118930e3c897SQu Wenruo void **pointers = rbio->finish_pointers;
119030e3c897SQu Wenruo const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
119130e3c897SQu Wenruo struct sector_ptr *sector;
119230e3c897SQu Wenruo int stripe;
119330e3c897SQu Wenruo const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6;
119430e3c897SQu Wenruo 
119530e3c897SQu Wenruo /* First collect one sector from each data stripe */
119630e3c897SQu Wenruo for (stripe = 0; stripe < rbio->nr_data; stripe++) {
119730e3c897SQu Wenruo sector = sector_in_rbio(rbio, stripe, sectornr, 0);
119830e3c897SQu Wenruo pointers[stripe] = kmap_local_page(sector->page) +
119930e3c897SQu Wenruo sector->pgoff;
120030e3c897SQu Wenruo }
120130e3c897SQu Wenruo 
120230e3c897SQu Wenruo /* Then add the parity stripe */
120330e3c897SQu Wenruo sector = rbio_pstripe_sector(rbio, sectornr);
120430e3c897SQu Wenruo sector->uptodate = 1;
120530e3c897SQu Wenruo pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;
120630e3c897SQu Wenruo 
120730e3c897SQu Wenruo if (has_qstripe) {
120830e3c897SQu Wenruo /*
120930e3c897SQu Wenruo * RAID6, add the qstripe and call the library function
121030e3c897SQu Wenruo * to fill in our p/q
121130e3c897SQu Wenruo */
121230e3c897SQu Wenruo sector = rbio_qstripe_sector(rbio, sectornr);
121330e3c897SQu Wenruo sector->uptodate = 1;
121430e3c897SQu Wenruo pointers[stripe++] = kmap_local_page(sector->page) +
121530e3c897SQu Wenruo sector->pgoff;
121630e3c897SQu Wenruo 
121730e3c897SQu Wenruo raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
121830e3c897SQu Wenruo pointers);
121930e3c897SQu Wenruo } else {
122030e3c897SQu Wenruo /* raid5 */
122130e3c897SQu Wenruo memcpy(pointers[rbio->nr_data], pointers[0], sectorsize);
122230e3c897SQu Wenruo run_xor(pointers + 1, rbio->nr_data - 1, sectorsize);
122330e3c897SQu Wenruo }
122430e3c897SQu Wenruo for (stripe = stripe - 1; stripe >= 0; stripe--)
122530e3c897SQu Wenruo kunmap_local(pointers[stripe]);
122630e3c897SQu Wenruo }
122730e3c897SQu Wenruo 
12286486d21cSQu Wenruo static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
12296486d21cSQu Wenruo struct bio_list *bio_list)
12306486d21cSQu Wenruo {
12316486d21cSQu Wenruo struct bio *bio;
12326486d21cSQu Wenruo /* The total sector number inside the full stripe. */
12336486d21cSQu Wenruo int total_sector_nr;
12346486d21cSQu Wenruo int sectornr;
12356486d21cSQu Wenruo int stripe;
12366486d21cSQu Wenruo int ret;
12376486d21cSQu Wenruo 
12386486d21cSQu Wenruo ASSERT(bio_list_size(bio_list) == 0);
12396486d21cSQu Wenruo 
12406486d21cSQu Wenruo /* We should have at least one data sector. */
12416486d21cSQu Wenruo ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));
12426486d21cSQu Wenruo 
12436486d21cSQu Wenruo /*
12445eb30ee2SQu Wenruo * Reset errors, as we may have errors inherited from a degraded
12455eb30ee2SQu Wenruo * write.
12465eb30ee2SQu Wenruo */
12472942a50dSQu Wenruo bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
12485eb30ee2SQu Wenruo 
12495eb30ee2SQu Wenruo /*
12506486d21cSQu Wenruo * Start assembly. Make bios for everything from the higher layers (the
12516486d21cSQu Wenruo * bio_list in our rbio) and our P/Q. Ignore everything else.
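* The first loop below writes the original stripes; when a dev-replace
* is running (bioc->num_tgtdevs != 0), a second loop duplicates the
* same writes to the replace target through bioc->tgtdev_map.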
12526486d21cSQu Wenruo */ 12536486d21cSQu Wenruo for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; 12546486d21cSQu Wenruo total_sector_nr++) { 12556486d21cSQu Wenruo struct sector_ptr *sector; 12566486d21cSQu Wenruo 12576486d21cSQu Wenruo stripe = total_sector_nr / rbio->stripe_nsectors; 12586486d21cSQu Wenruo sectornr = total_sector_nr % rbio->stripe_nsectors; 12596486d21cSQu Wenruo 12606486d21cSQu Wenruo /* This vertical stripe has no data, skip it. */ 12616486d21cSQu Wenruo if (!test_bit(sectornr, &rbio->dbitmap)) 12626486d21cSQu Wenruo continue; 12636486d21cSQu Wenruo 12646486d21cSQu Wenruo if (stripe < rbio->nr_data) { 12656486d21cSQu Wenruo sector = sector_in_rbio(rbio, stripe, sectornr, 1); 12666486d21cSQu Wenruo if (!sector) 12676486d21cSQu Wenruo continue; 12686486d21cSQu Wenruo } else { 12696486d21cSQu Wenruo sector = rbio_stripe_sector(rbio, stripe, sectornr); 12706486d21cSQu Wenruo } 12716486d21cSQu Wenruo 12726486d21cSQu Wenruo ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, 12736486d21cSQu Wenruo sectornr, REQ_OP_WRITE); 12746486d21cSQu Wenruo if (ret) 12756486d21cSQu Wenruo goto error; 12766486d21cSQu Wenruo } 12776486d21cSQu Wenruo 12786486d21cSQu Wenruo if (likely(!rbio->bioc->num_tgtdevs)) 12796486d21cSQu Wenruo return 0; 12806486d21cSQu Wenruo 12816486d21cSQu Wenruo /* Make a copy for the replace target device. */ 12826486d21cSQu Wenruo for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; 12836486d21cSQu Wenruo total_sector_nr++) { 12846486d21cSQu Wenruo struct sector_ptr *sector; 12856486d21cSQu Wenruo 12866486d21cSQu Wenruo stripe = total_sector_nr / rbio->stripe_nsectors; 12876486d21cSQu Wenruo sectornr = total_sector_nr % rbio->stripe_nsectors; 12886486d21cSQu Wenruo 12896486d21cSQu Wenruo if (!rbio->bioc->tgtdev_map[stripe]) { 12906486d21cSQu Wenruo /* 12916486d21cSQu Wenruo * We can skip the whole stripe completely, note 12926486d21cSQu Wenruo * total_sector_nr will be increased by one anyway. 12936486d21cSQu Wenruo */ 12946486d21cSQu Wenruo ASSERT(sectornr == 0); 12956486d21cSQu Wenruo total_sector_nr += rbio->stripe_nsectors - 1; 12966486d21cSQu Wenruo continue; 12976486d21cSQu Wenruo } 12986486d21cSQu Wenruo 12996486d21cSQu Wenruo /* This vertical stripe has no data, skip it. 
*/ 13006486d21cSQu Wenruo if (!test_bit(sectornr, &rbio->dbitmap)) 13016486d21cSQu Wenruo continue; 13026486d21cSQu Wenruo 13036486d21cSQu Wenruo if (stripe < rbio->nr_data) { 13046486d21cSQu Wenruo sector = sector_in_rbio(rbio, stripe, sectornr, 1); 13056486d21cSQu Wenruo if (!sector) 13066486d21cSQu Wenruo continue; 13076486d21cSQu Wenruo } else { 13086486d21cSQu Wenruo sector = rbio_stripe_sector(rbio, stripe, sectornr); 13096486d21cSQu Wenruo } 13106486d21cSQu Wenruo 13116486d21cSQu Wenruo ret = rbio_add_io_sector(rbio, bio_list, sector, 13126486d21cSQu Wenruo rbio->bioc->tgtdev_map[stripe], 13136486d21cSQu Wenruo sectornr, REQ_OP_WRITE); 13146486d21cSQu Wenruo if (ret) 13156486d21cSQu Wenruo goto error; 13166486d21cSQu Wenruo } 13176486d21cSQu Wenruo 13186486d21cSQu Wenruo return 0; 13196486d21cSQu Wenruo error: 13206486d21cSQu Wenruo while ((bio = bio_list_pop(bio_list))) 13216486d21cSQu Wenruo bio_put(bio); 13226486d21cSQu Wenruo return -EIO; 13236486d21cSQu Wenruo } 13246486d21cSQu Wenruo 13252942a50dSQu Wenruo static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) 13262942a50dSQu Wenruo { 13272942a50dSQu Wenruo struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; 13282942a50dSQu Wenruo u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - 13292942a50dSQu Wenruo rbio->bioc->raid_map[0]; 13302942a50dSQu Wenruo int total_nr_sector = offset >> fs_info->sectorsize_bits; 13312942a50dSQu Wenruo 13322942a50dSQu Wenruo ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors); 13332942a50dSQu Wenruo 13342942a50dSQu Wenruo bitmap_set(rbio->error_bitmap, total_nr_sector, 13352942a50dSQu Wenruo bio->bi_iter.bi_size >> fs_info->sectorsize_bits); 13362942a50dSQu Wenruo 13372942a50dSQu Wenruo /* 13382942a50dSQu Wenruo * Special handling for raid56_alloc_missing_rbio() used by 13392942a50dSQu Wenruo * scrub/replace. Unlike call path in raid56_parity_recover(), they 13402942a50dSQu Wenruo * pass an empty bio here. Thus we have to find out the missing device 13412942a50dSQu Wenruo * and mark the stripe error instead. 13422942a50dSQu Wenruo */ 13432942a50dSQu Wenruo if (bio->bi_iter.bi_size == 0) { 13442942a50dSQu Wenruo bool found_missing = false; 13452942a50dSQu Wenruo int stripe_nr; 13462942a50dSQu Wenruo 13472942a50dSQu Wenruo for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { 13482942a50dSQu Wenruo if (!rbio->bioc->stripes[stripe_nr].dev->bdev) { 13492942a50dSQu Wenruo found_missing = true; 13502942a50dSQu Wenruo bitmap_set(rbio->error_bitmap, 13512942a50dSQu Wenruo stripe_nr * rbio->stripe_nsectors, 13522942a50dSQu Wenruo rbio->stripe_nsectors); 13532942a50dSQu Wenruo } 13542942a50dSQu Wenruo } 13552942a50dSQu Wenruo ASSERT(found_missing); 13562942a50dSQu Wenruo } 13572942a50dSQu Wenruo } 13582942a50dSQu Wenruo 135953b381b3SDavid Woodhouse /* 136067da05b3SColin Ian King * For subpage case, we can no longer set page Up-to-date directly for 13615fdb7afcSQu Wenruo * stripe_pages[], thus we need to locate the sector. 
13625fdb7afcSQu Wenruo */ 13635fdb7afcSQu Wenruo static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, 13645fdb7afcSQu Wenruo struct page *page, 13655fdb7afcSQu Wenruo unsigned int pgoff) 13665fdb7afcSQu Wenruo { 13675fdb7afcSQu Wenruo int i; 13685fdb7afcSQu Wenruo 13695fdb7afcSQu Wenruo for (i = 0; i < rbio->nr_sectors; i++) { 13705fdb7afcSQu Wenruo struct sector_ptr *sector = &rbio->stripe_sectors[i]; 13715fdb7afcSQu Wenruo 13725fdb7afcSQu Wenruo if (sector->page == page && sector->pgoff == pgoff) 13735fdb7afcSQu Wenruo return sector; 13745fdb7afcSQu Wenruo } 13755fdb7afcSQu Wenruo return NULL; 13765fdb7afcSQu Wenruo } 13775fdb7afcSQu Wenruo 13785fdb7afcSQu Wenruo /* 137953b381b3SDavid Woodhouse * this sets each page in the bio uptodate. It should only be used on private 138053b381b3SDavid Woodhouse * rbio pages, nothing that comes in from the higher layers 138153b381b3SDavid Woodhouse */ 13825fdb7afcSQu Wenruo static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) 138353b381b3SDavid Woodhouse { 13845fdb7afcSQu Wenruo const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 13850198e5b7SLiu Bo struct bio_vec *bvec; 13866dc4f100SMing Lei struct bvec_iter_all iter_all; 138753b381b3SDavid Woodhouse 13880198e5b7SLiu Bo ASSERT(!bio_flagged(bio, BIO_CLONED)); 13896592e58cSFilipe Manana 13905fdb7afcSQu Wenruo bio_for_each_segment_all(bvec, bio, iter_all) { 13915fdb7afcSQu Wenruo struct sector_ptr *sector; 13925fdb7afcSQu Wenruo int pgoff; 13935fdb7afcSQu Wenruo 13945fdb7afcSQu Wenruo for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len; 13955fdb7afcSQu Wenruo pgoff += sectorsize) { 13965fdb7afcSQu Wenruo sector = find_stripe_sector(rbio, bvec->bv_page, pgoff); 13975fdb7afcSQu Wenruo ASSERT(sector); 13985fdb7afcSQu Wenruo if (sector) 13995fdb7afcSQu Wenruo sector->uptodate = 1; 14005fdb7afcSQu Wenruo } 14015fdb7afcSQu Wenruo } 140253b381b3SDavid Woodhouse } 140353b381b3SDavid Woodhouse 14042942a50dSQu Wenruo static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio) 14052942a50dSQu Wenruo { 14062942a50dSQu Wenruo struct bio_vec *bv = bio_first_bvec_all(bio); 14072942a50dSQu Wenruo int i; 14082942a50dSQu Wenruo 14092942a50dSQu Wenruo for (i = 0; i < rbio->nr_sectors; i++) { 14102942a50dSQu Wenruo struct sector_ptr *sector; 14112942a50dSQu Wenruo 14122942a50dSQu Wenruo sector = &rbio->stripe_sectors[i]; 14132942a50dSQu Wenruo if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) 14142942a50dSQu Wenruo break; 14152942a50dSQu Wenruo sector = &rbio->bio_sectors[i]; 14162942a50dSQu Wenruo if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) 14172942a50dSQu Wenruo break; 14182942a50dSQu Wenruo } 14192942a50dSQu Wenruo ASSERT(i < rbio->nr_sectors); 14202942a50dSQu Wenruo return i; 14212942a50dSQu Wenruo } 14222942a50dSQu Wenruo 14232942a50dSQu Wenruo static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio) 14242942a50dSQu Wenruo { 14252942a50dSQu Wenruo int total_sector_nr = get_bio_sector_nr(rbio, bio); 14262942a50dSQu Wenruo u32 bio_size = 0; 14272942a50dSQu Wenruo struct bio_vec *bvec; 1428a9ad4d87SQu Wenruo int i; 14292942a50dSQu Wenruo 1430*c9a43aafSQu Wenruo bio_for_each_bvec_all(bvec, bio, i) 14312942a50dSQu Wenruo bio_size += bvec->bv_len; 14322942a50dSQu Wenruo 1433a9ad4d87SQu Wenruo /* 1434a9ad4d87SQu Wenruo * Since we can have multiple bios touching the error_bitmap, we cannot 1435a9ad4d87SQu Wenruo * call bitmap_set() without protection. 
1436a9ad4d87SQu Wenruo * 1437a9ad4d87SQu Wenruo * Instead use set_bit() for each bit, as set_bit() itself is atomic. 1438a9ad4d87SQu Wenruo */ 1439a9ad4d87SQu Wenruo for (i = total_sector_nr; i < total_sector_nr + 1440a9ad4d87SQu Wenruo (bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++) 1441a9ad4d87SQu Wenruo set_bit(i, rbio->error_bitmap); 14422942a50dSQu Wenruo } 14432942a50dSQu Wenruo 14447a315072SQu Wenruo /* Verify the data sectors at read time. */ 14457a315072SQu Wenruo static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, 14467a315072SQu Wenruo struct bio *bio) 14477a315072SQu Wenruo { 14487a315072SQu Wenruo struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; 14497a315072SQu Wenruo int total_sector_nr = get_bio_sector_nr(rbio, bio); 14507a315072SQu Wenruo struct bio_vec *bvec; 14517a315072SQu Wenruo struct bvec_iter_all iter_all; 14527a315072SQu Wenruo 14537a315072SQu Wenruo /* No data csum for the whole stripe, no need to verify. */ 14547a315072SQu Wenruo if (!rbio->csum_bitmap || !rbio->csum_buf) 14557a315072SQu Wenruo return; 14567a315072SQu Wenruo 14577a315072SQu Wenruo /* P/Q stripes, they have no data csum to verify against. */ 14587a315072SQu Wenruo if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors) 14597a315072SQu Wenruo return; 14607a315072SQu Wenruo 14617a315072SQu Wenruo bio_for_each_segment_all(bvec, bio, iter_all) { 14627a315072SQu Wenruo int bv_offset; 14637a315072SQu Wenruo 14647a315072SQu Wenruo for (bv_offset = bvec->bv_offset; 14657a315072SQu Wenruo bv_offset < bvec->bv_offset + bvec->bv_len; 14667a315072SQu Wenruo bv_offset += fs_info->sectorsize, total_sector_nr++) { 14677a315072SQu Wenruo u8 csum_buf[BTRFS_CSUM_SIZE]; 14687a315072SQu Wenruo u8 *expected_csum = rbio->csum_buf + 14697a315072SQu Wenruo total_sector_nr * fs_info->csum_size; 14707a315072SQu Wenruo int ret; 14717a315072SQu Wenruo 14727a315072SQu Wenruo /* No csum for this sector, skip to the next sector. 
*/
14737a315072SQu Wenruo if (!test_bit(total_sector_nr, rbio->csum_bitmap))
14747a315072SQu Wenruo continue;
14757a315072SQu Wenruo 
14767a315072SQu Wenruo ret = btrfs_check_sector_csum(fs_info, bvec->bv_page,
14777a315072SQu Wenruo bv_offset, csum_buf, expected_csum);
14787a315072SQu Wenruo if (ret < 0)
14797a315072SQu Wenruo set_bit(total_sector_nr, rbio->error_bitmap);
14807a315072SQu Wenruo }
14817a315072SQu Wenruo }
14827a315072SQu Wenruo }
14837a315072SQu Wenruo 
1484d817ce35SQu Wenruo static void raid_wait_read_end_io(struct bio *bio)
1485d817ce35SQu Wenruo {
1486d817ce35SQu Wenruo struct btrfs_raid_bio *rbio = bio->bi_private;
1487d817ce35SQu Wenruo 
14887a315072SQu Wenruo if (bio->bi_status) {
14892942a50dSQu Wenruo rbio_update_error_bitmap(rbio, bio);
14907a315072SQu Wenruo } else {
1491d817ce35SQu Wenruo set_bio_pages_uptodate(rbio, bio);
14927a315072SQu Wenruo verify_bio_data_sectors(rbio, bio);
14937a315072SQu Wenruo }
1494d817ce35SQu Wenruo 
1495d817ce35SQu Wenruo bio_put(bio);
1496d817ce35SQu Wenruo if (atomic_dec_and_test(&rbio->stripes_pending))
1497d817ce35SQu Wenruo wake_up(&rbio->io_wait);
1498d817ce35SQu Wenruo }
1499d817ce35SQu Wenruo 
1500d817ce35SQu Wenruo static void submit_read_bios(struct btrfs_raid_bio *rbio,
1501d817ce35SQu Wenruo struct bio_list *bio_list)
1502d817ce35SQu Wenruo {
1503d817ce35SQu Wenruo struct bio *bio;
1504d817ce35SQu Wenruo 
1505d817ce35SQu Wenruo atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
1506d817ce35SQu Wenruo while ((bio = bio_list_pop(bio_list))) {
1507d817ce35SQu Wenruo bio->bi_end_io = raid_wait_read_end_io;
1508d817ce35SQu Wenruo 
1509d817ce35SQu Wenruo if (trace_raid56_scrub_read_recover_enabled()) {
1510d817ce35SQu Wenruo struct raid56_bio_trace_info trace_info = { 0 };
1511d817ce35SQu Wenruo 
1512d817ce35SQu Wenruo bio_get_trace_info(rbio, bio, &trace_info);
1513d817ce35SQu Wenruo trace_raid56_scrub_read_recover(rbio, bio, &trace_info);
1514d817ce35SQu Wenruo }
1515d817ce35SQu Wenruo submit_bio(bio);
1516d817ce35SQu Wenruo }
1517d817ce35SQu Wenruo }
1518d817ce35SQu Wenruo 
1519509c27aaSQu Wenruo static int rmw_assemble_read_bios(struct btrfs_raid_bio *rbio,
1520509c27aaSQu Wenruo struct bio_list *bio_list)
152153b381b3SDavid Woodhouse {
152253b381b3SDavid Woodhouse struct bio *bio;
1523509c27aaSQu Wenruo int total_sector_nr;
1524509c27aaSQu Wenruo int ret = 0;
152553b381b3SDavid Woodhouse 
1526509c27aaSQu Wenruo ASSERT(bio_list_size(bio_list) == 0);
152753b381b3SDavid Woodhouse 
15287a315072SQu Wenruo /*
15297a315072SQu Wenruo * Build a list of bios to read all sectors (including data and P/Q).
15307a315072SQu Wenruo *
153167da05b3SColin Ian King * This behavior is to compensate for the later csum verification and
15327a315072SQu Wenruo * recovery.
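* Reading P/Q up front means a csum failure caught at endio time can
* be repaired from parity right away, without another read round trip.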
15337a315072SQu Wenruo */ 15347a315072SQu Wenruo for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; 1535550cdeb3SQu Wenruo total_sector_nr++) { 15363e77605dSQu Wenruo struct sector_ptr *sector; 1537550cdeb3SQu Wenruo int stripe = total_sector_nr / rbio->stripe_nsectors; 1538550cdeb3SQu Wenruo int sectornr = total_sector_nr % rbio->stripe_nsectors; 15393e77605dSQu Wenruo 15403e77605dSQu Wenruo sector = rbio_stripe_sector(rbio, stripe, sectornr); 1541509c27aaSQu Wenruo ret = rbio_add_io_sector(rbio, bio_list, sector, 1542ff18a4afSChristoph Hellwig stripe, sectornr, REQ_OP_READ); 154353b381b3SDavid Woodhouse if (ret) 154453b381b3SDavid Woodhouse goto cleanup; 154553b381b3SDavid Woodhouse } 1546509c27aaSQu Wenruo return 0; 1547509c27aaSQu Wenruo 1548509c27aaSQu Wenruo cleanup: 1549509c27aaSQu Wenruo while ((bio = bio_list_pop(bio_list))) 1550509c27aaSQu Wenruo bio_put(bio); 1551509c27aaSQu Wenruo return ret; 1552509c27aaSQu Wenruo } 1553509c27aaSQu Wenruo 15545eb30ee2SQu Wenruo static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio) 15555eb30ee2SQu Wenruo { 15565eb30ee2SQu Wenruo const int data_pages = rbio->nr_data * rbio->stripe_npages; 15575eb30ee2SQu Wenruo int ret; 15585eb30ee2SQu Wenruo 15595eb30ee2SQu Wenruo ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages); 15605eb30ee2SQu Wenruo if (ret < 0) 15615eb30ee2SQu Wenruo return ret; 15625eb30ee2SQu Wenruo 15635eb30ee2SQu Wenruo index_stripe_sectors(rbio); 15645eb30ee2SQu Wenruo return 0; 15655eb30ee2SQu Wenruo } 15665eb30ee2SQu Wenruo 1567509c27aaSQu Wenruo /* 15686ac0f488SChris Mason * We use plugging call backs to collect full stripes. 15696ac0f488SChris Mason * Any time we get a partial stripe write while plugged 15706ac0f488SChris Mason * we collect it into a list. When the unplug comes down, 15716ac0f488SChris Mason * we sort the list by logical block number and merge 15726ac0f488SChris Mason * everything we can into the same rbios 15736ac0f488SChris Mason */ 15746ac0f488SChris Mason struct btrfs_plug_cb { 15756ac0f488SChris Mason struct blk_plug_cb cb; 15766ac0f488SChris Mason struct btrfs_fs_info *info; 15776ac0f488SChris Mason struct list_head rbio_list; 1578385de0efSChristoph Hellwig struct work_struct work; 15796ac0f488SChris Mason }; 15806ac0f488SChris Mason 15816ac0f488SChris Mason /* 15826ac0f488SChris Mason * rbios on the plug list are sorted for easier merging. 
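* Sorting by each rbio's starting logical sector makes rbios that touch
* the same full stripe adjacent, so raid_unplug() can merge them in a
* single pass over the list.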
15836ac0f488SChris Mason */ 15844f0f586bSSami Tolvanen static int plug_cmp(void *priv, const struct list_head *a, 15854f0f586bSSami Tolvanen const struct list_head *b) 15866ac0f488SChris Mason { 1587214cc184SDavid Sterba const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, 15886ac0f488SChris Mason plug_list); 1589214cc184SDavid Sterba const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, 15906ac0f488SChris Mason plug_list); 15914f024f37SKent Overstreet u64 a_sector = ra->bio_list.head->bi_iter.bi_sector; 15924f024f37SKent Overstreet u64 b_sector = rb->bio_list.head->bi_iter.bi_sector; 15936ac0f488SChris Mason 15946ac0f488SChris Mason if (a_sector < b_sector) 15956ac0f488SChris Mason return -1; 15966ac0f488SChris Mason if (a_sector > b_sector) 15976ac0f488SChris Mason return 1; 15986ac0f488SChris Mason return 0; 15996ac0f488SChris Mason } 16006ac0f488SChris Mason 160193723095SQu Wenruo static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule) 16026ac0f488SChris Mason { 160393723095SQu Wenruo struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb); 16046ac0f488SChris Mason struct btrfs_raid_bio *cur; 16056ac0f488SChris Mason struct btrfs_raid_bio *last = NULL; 16066ac0f488SChris Mason 16076ac0f488SChris Mason list_sort(NULL, &plug->rbio_list, plug_cmp); 160893723095SQu Wenruo 16096ac0f488SChris Mason while (!list_empty(&plug->rbio_list)) { 16106ac0f488SChris Mason cur = list_entry(plug->rbio_list.next, 16116ac0f488SChris Mason struct btrfs_raid_bio, plug_list); 16126ac0f488SChris Mason list_del_init(&cur->plug_list); 16136ac0f488SChris Mason 16146ac0f488SChris Mason if (rbio_is_full(cur)) { 161593723095SQu Wenruo /* We have a full stripe, queue it down. */ 161693723095SQu Wenruo start_async_work(cur, rmw_rbio_work); 16176ac0f488SChris Mason continue; 16186ac0f488SChris Mason } 16196ac0f488SChris Mason if (last) { 16206ac0f488SChris Mason if (rbio_can_merge(last, cur)) { 16216ac0f488SChris Mason merge_rbio(last, cur); 1622ff2b64a2SQu Wenruo free_raid_bio(cur); 16236ac0f488SChris Mason continue; 16246ac0f488SChris Mason } 162593723095SQu Wenruo start_async_work(last, rmw_rbio_work); 16266ac0f488SChris Mason } 16276ac0f488SChris Mason last = cur; 16286ac0f488SChris Mason } 162993723095SQu Wenruo if (last) 163093723095SQu Wenruo start_async_work(last, rmw_rbio_work); 16316ac0f488SChris Mason kfree(plug); 16326ac0f488SChris Mason } 16336ac0f488SChris Mason 1634bd8f7e62SQu Wenruo /* Add the original bio into rbio->bio_list, and update rbio::dbitmap. 
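* For example, with 4K sectors and the 64K stripe length
* (stripe_nsectors == 16), a bio covering bytes 8K-16K of the full
* stripe sets dbitmap bits 2 and 3.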
*/ 1635bd8f7e62SQu Wenruo static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) 1636bd8f7e62SQu Wenruo { 1637bd8f7e62SQu Wenruo const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; 1638bd8f7e62SQu Wenruo const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT; 1639bd8f7e62SQu Wenruo const u64 full_stripe_start = rbio->bioc->raid_map[0]; 1640bd8f7e62SQu Wenruo const u32 orig_len = orig_bio->bi_iter.bi_size; 1641bd8f7e62SQu Wenruo const u32 sectorsize = fs_info->sectorsize; 1642bd8f7e62SQu Wenruo u64 cur_logical; 1643bd8f7e62SQu Wenruo 1644bd8f7e62SQu Wenruo ASSERT(orig_logical >= full_stripe_start && 1645bd8f7e62SQu Wenruo orig_logical + orig_len <= full_stripe_start + 1646ff18a4afSChristoph Hellwig rbio->nr_data * BTRFS_STRIPE_LEN); 1647bd8f7e62SQu Wenruo 1648bd8f7e62SQu Wenruo bio_list_add(&rbio->bio_list, orig_bio); 1649bd8f7e62SQu Wenruo rbio->bio_list_bytes += orig_bio->bi_iter.bi_size; 1650bd8f7e62SQu Wenruo 1651bd8f7e62SQu Wenruo /* Update the dbitmap. */ 1652bd8f7e62SQu Wenruo for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len; 1653bd8f7e62SQu Wenruo cur_logical += sectorsize) { 1654bd8f7e62SQu Wenruo int bit = ((u32)(cur_logical - full_stripe_start) >> 1655bd8f7e62SQu Wenruo fs_info->sectorsize_bits) % rbio->stripe_nsectors; 1656bd8f7e62SQu Wenruo 1657bd8f7e62SQu Wenruo set_bit(bit, &rbio->dbitmap); 1658bd8f7e62SQu Wenruo } 1659bd8f7e62SQu Wenruo } 1660bd8f7e62SQu Wenruo 16616ac0f488SChris Mason /* 166253b381b3SDavid Woodhouse * our main entry point for writes from the rest of the FS. 166353b381b3SDavid Woodhouse */ 166431683f4aSChristoph Hellwig void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) 166553b381b3SDavid Woodhouse { 16666a258d72SQu Wenruo struct btrfs_fs_info *fs_info = bioc->fs_info; 166753b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio; 16686ac0f488SChris Mason struct btrfs_plug_cb *plug = NULL; 16696ac0f488SChris Mason struct blk_plug_cb *cb; 167031683f4aSChristoph Hellwig int ret = 0; 167153b381b3SDavid Woodhouse 1672ff18a4afSChristoph Hellwig rbio = alloc_rbio(fs_info, bioc); 1673af8e2d1dSMiao Xie if (IS_ERR(rbio)) { 167431683f4aSChristoph Hellwig ret = PTR_ERR(rbio); 1675f1c29379SChristoph Hellwig goto fail; 1676af8e2d1dSMiao Xie } 16771b94b556SMiao Xie rbio->operation = BTRFS_RBIO_WRITE; 1678bd8f7e62SQu Wenruo rbio_add_bio(rbio, bio); 16796ac0f488SChris Mason 16806ac0f488SChris Mason /* 168193723095SQu Wenruo * Don't plug on full rbios, just get them out the door 16826ac0f488SChris Mason * as quickly as we can 16836ac0f488SChris Mason */ 168493723095SQu Wenruo if (rbio_is_full(rbio)) 168593723095SQu Wenruo goto queue_rbio; 16866ac0f488SChris Mason 168793723095SQu Wenruo cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); 16886ac0f488SChris Mason if (cb) { 16896ac0f488SChris Mason plug = container_of(cb, struct btrfs_plug_cb, cb); 16906ac0f488SChris Mason if (!plug->info) { 16910b246afaSJeff Mahoney plug->info = fs_info; 16926ac0f488SChris Mason INIT_LIST_HEAD(&plug->rbio_list); 16936ac0f488SChris Mason } 16946ac0f488SChris Mason list_add_tail(&rbio->plug_list, &plug->rbio_list); 169593723095SQu Wenruo return; 169653b381b3SDavid Woodhouse } 169793723095SQu Wenruo queue_rbio: 169893723095SQu Wenruo /* 169993723095SQu Wenruo * Either we don't have any existing plug, or we're doing a full stripe, 170093723095SQu Wenruo * can queue the rmw work now. 
170193723095SQu Wenruo */ 170293723095SQu Wenruo start_async_work(rbio, rmw_rbio_work); 170331683f4aSChristoph Hellwig 170431683f4aSChristoph Hellwig return; 170531683f4aSChristoph Hellwig 1706f1c29379SChristoph Hellwig fail: 170731683f4aSChristoph Hellwig bio->bi_status = errno_to_blk_status(ret); 170831683f4aSChristoph Hellwig bio_endio(bio); 17096ac0f488SChris Mason } 171053b381b3SDavid Woodhouse 17117a315072SQu Wenruo static int verify_one_sector(struct btrfs_raid_bio *rbio, 17127a315072SQu Wenruo int stripe_nr, int sector_nr) 17137a315072SQu Wenruo { 17147a315072SQu Wenruo struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; 17157a315072SQu Wenruo struct sector_ptr *sector; 17167a315072SQu Wenruo u8 csum_buf[BTRFS_CSUM_SIZE]; 17177a315072SQu Wenruo u8 *csum_expected; 17187a315072SQu Wenruo int ret; 17197a315072SQu Wenruo 17207a315072SQu Wenruo if (!rbio->csum_bitmap || !rbio->csum_buf) 17217a315072SQu Wenruo return 0; 17227a315072SQu Wenruo 17237a315072SQu Wenruo /* No way to verify P/Q as they are not covered by data csum. */ 17247a315072SQu Wenruo if (stripe_nr >= rbio->nr_data) 17257a315072SQu Wenruo return 0; 17267a315072SQu Wenruo /* 17277a315072SQu Wenruo * If we're rebuilding a read, we have to use pages from the 17287a315072SQu Wenruo * bio list if possible. 17297a315072SQu Wenruo */ 17307a315072SQu Wenruo if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || 17317a315072SQu Wenruo rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) { 17327a315072SQu Wenruo sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); 17337a315072SQu Wenruo } else { 17347a315072SQu Wenruo sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); 17357a315072SQu Wenruo } 17367a315072SQu Wenruo 17377a315072SQu Wenruo ASSERT(sector->page); 17387a315072SQu Wenruo 17397a315072SQu Wenruo csum_expected = rbio->csum_buf + 17407a315072SQu Wenruo (stripe_nr * rbio->stripe_nsectors + sector_nr) * 17417a315072SQu Wenruo fs_info->csum_size; 17427a315072SQu Wenruo ret = btrfs_check_sector_csum(fs_info, sector->page, sector->pgoff, 17437a315072SQu Wenruo csum_buf, csum_expected); 17447a315072SQu Wenruo return ret; 17457a315072SQu Wenruo } 17467a315072SQu Wenruo 174753b381b3SDavid Woodhouse /* 17489c5ff9b4SQu Wenruo * Recover a vertical stripe specified by @sector_nr. 17499c5ff9b4SQu Wenruo * @*pointers are the pre-allocated pointers by the caller, so we don't 17509c5ff9b4SQu Wenruo * need to allocate/free the pointers again and again. 17519c5ff9b4SQu Wenruo */ 175275b47033SQu Wenruo static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, 17539c5ff9b4SQu Wenruo void **pointers, void **unmap_array) 17549c5ff9b4SQu Wenruo { 17559c5ff9b4SQu Wenruo struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; 17569c5ff9b4SQu Wenruo struct sector_ptr *sector; 17579c5ff9b4SQu Wenruo const u32 sectorsize = fs_info->sectorsize; 175875b47033SQu Wenruo int found_errors; 175975b47033SQu Wenruo int faila; 176075b47033SQu Wenruo int failb; 17619c5ff9b4SQu Wenruo int stripe_nr; 17627a315072SQu Wenruo int ret = 0; 17639c5ff9b4SQu Wenruo 17649c5ff9b4SQu Wenruo /* 17659c5ff9b4SQu Wenruo * Now we just use bitmap to mark the horizontal stripes in 17669c5ff9b4SQu Wenruo * which we have data when doing parity scrub. 
17679c5ff9b4SQu Wenruo */
17689c5ff9b4SQu Wenruo if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
17699c5ff9b4SQu Wenruo !test_bit(sector_nr, &rbio->dbitmap))
177075b47033SQu Wenruo return 0;
177175b47033SQu Wenruo 
177275b47033SQu Wenruo found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila,
177375b47033SQu Wenruo &failb);
177475b47033SQu Wenruo /*
177567da05b3SColin Ian King * No errors in the vertical stripe, skip it. This can happen for
177675b47033SQu Wenruo * recovery where only part of a stripe failed the csum check.
177775b47033SQu Wenruo */
177875b47033SQu Wenruo if (!found_errors)
177975b47033SQu Wenruo return 0;
178075b47033SQu Wenruo 
178175b47033SQu Wenruo if (found_errors > rbio->bioc->max_errors)
178275b47033SQu Wenruo return -EIO;
17839c5ff9b4SQu Wenruo 
17849c5ff9b4SQu Wenruo /*
17859c5ff9b4SQu Wenruo * Set up our array of pointers with sectors from each stripe
17869c5ff9b4SQu Wenruo *
17879c5ff9b4SQu Wenruo * NOTE: store a duplicate array of pointers to preserve the
17889c5ff9b4SQu Wenruo * pointer order.
17899c5ff9b4SQu Wenruo */
17909c5ff9b4SQu Wenruo for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
17919c5ff9b4SQu Wenruo /*
179275b47033SQu Wenruo * If we're rebuilding a read, we have to use pages from the
179375b47033SQu Wenruo * bio list if possible.
17949c5ff9b4SQu Wenruo */
17959c5ff9b4SQu Wenruo if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
179675b47033SQu Wenruo rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) {
17979c5ff9b4SQu Wenruo sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
17989c5ff9b4SQu Wenruo } else {
17999c5ff9b4SQu Wenruo sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
18009c5ff9b4SQu Wenruo }
18019c5ff9b4SQu Wenruo ASSERT(sector->page);
18029c5ff9b4SQu Wenruo pointers[stripe_nr] = kmap_local_page(sector->page) +
18039c5ff9b4SQu Wenruo sector->pgoff;
18049c5ff9b4SQu Wenruo unmap_array[stripe_nr] = pointers[stripe_nr];
18059c5ff9b4SQu Wenruo }
18069c5ff9b4SQu Wenruo 
18079c5ff9b4SQu Wenruo /* All raid6 handling here */
18089c5ff9b4SQu Wenruo if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
18099c5ff9b4SQu Wenruo /* Single failure, rebuild from parity raid5 style */
18109c5ff9b4SQu Wenruo if (failb < 0) {
18119c5ff9b4SQu Wenruo if (faila == rbio->nr_data)
18129c5ff9b4SQu Wenruo /*
18139c5ff9b4SQu Wenruo * Just the P stripe has failed, without
18149c5ff9b4SQu Wenruo * a bad data or Q stripe.
18159c5ff9b4SQu Wenruo * We have nothing to do, just skip the
18169c5ff9b4SQu Wenruo * recovery for this stripe.
18179c5ff9b4SQu Wenruo */
18189c5ff9b4SQu Wenruo goto cleanup;
18199c5ff9b4SQu Wenruo /*
18209c5ff9b4SQu Wenruo * a single failure in raid6 is rebuilt
18219c5ff9b4SQu Wenruo * in the pstripe code below
18229c5ff9b4SQu Wenruo */
18239c5ff9b4SQu Wenruo goto pstripe;
18249c5ff9b4SQu Wenruo }
18259c5ff9b4SQu Wenruo 
18269c5ff9b4SQu Wenruo /*
18279c5ff9b4SQu Wenruo * If the q stripe is failed, do a pstripe reconstruction from
18289c5ff9b4SQu Wenruo * the xors.
18299c5ff9b4SQu Wenruo * If both the q stripe and the P stripe are failed, we're
18309c5ff9b4SQu Wenruo * here due to a crc mismatch and we can't give them the
18319c5ff9b4SQu Wenruo * data they want.
18329c5ff9b4SQu Wenruo */
18339c5ff9b4SQu Wenruo if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
18349c5ff9b4SQu Wenruo if (rbio->bioc->raid_map[faila] ==
18359c5ff9b4SQu Wenruo RAID5_P_STRIPE)
18369c5ff9b4SQu Wenruo /*
18379c5ff9b4SQu Wenruo * Only P and Q are corrupted.
18389c5ff9b4SQu Wenruo * We only care about data stripes recovery, 18399c5ff9b4SQu Wenruo * can skip this vertical stripe. 18409c5ff9b4SQu Wenruo */ 18419c5ff9b4SQu Wenruo goto cleanup; 18429c5ff9b4SQu Wenruo /* 18439c5ff9b4SQu Wenruo * Otherwise we have one bad data stripe and 18449c5ff9b4SQu Wenruo * a good P stripe. raid5! 18459c5ff9b4SQu Wenruo */ 18469c5ff9b4SQu Wenruo goto pstripe; 18479c5ff9b4SQu Wenruo } 18489c5ff9b4SQu Wenruo 18499c5ff9b4SQu Wenruo if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) { 18509c5ff9b4SQu Wenruo raid6_datap_recov(rbio->real_stripes, sectorsize, 18519c5ff9b4SQu Wenruo faila, pointers); 18529c5ff9b4SQu Wenruo } else { 18539c5ff9b4SQu Wenruo raid6_2data_recov(rbio->real_stripes, sectorsize, 18549c5ff9b4SQu Wenruo faila, failb, pointers); 18559c5ff9b4SQu Wenruo } 18569c5ff9b4SQu Wenruo } else { 18579c5ff9b4SQu Wenruo void *p; 18589c5ff9b4SQu Wenruo 18599c5ff9b4SQu Wenruo /* Rebuild from P stripe here (raid5 or raid6). */ 18609c5ff9b4SQu Wenruo ASSERT(failb == -1); 18619c5ff9b4SQu Wenruo pstripe: 18629c5ff9b4SQu Wenruo /* Copy parity block into failed block to start with */ 18639c5ff9b4SQu Wenruo memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize); 18649c5ff9b4SQu Wenruo 18659c5ff9b4SQu Wenruo /* Rearrange the pointer array */ 18669c5ff9b4SQu Wenruo p = pointers[faila]; 18679c5ff9b4SQu Wenruo for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1; 18689c5ff9b4SQu Wenruo stripe_nr++) 18699c5ff9b4SQu Wenruo pointers[stripe_nr] = pointers[stripe_nr + 1]; 18709c5ff9b4SQu Wenruo pointers[rbio->nr_data - 1] = p; 18719c5ff9b4SQu Wenruo 18729c5ff9b4SQu Wenruo /* Xor in the rest */ 18739c5ff9b4SQu Wenruo run_xor(pointers, rbio->nr_data - 1, sectorsize); 18749c5ff9b4SQu Wenruo 18759c5ff9b4SQu Wenruo } 18769c5ff9b4SQu Wenruo 18779c5ff9b4SQu Wenruo /* 18789c5ff9b4SQu Wenruo * No matter if this is a RMW or recovery, we should have all 18799c5ff9b4SQu Wenruo * failed sectors repaired in the vertical stripe, thus they are now 18809c5ff9b4SQu Wenruo * uptodate. 18819c5ff9b4SQu Wenruo * Especially if we determine to cache the rbio, we need to 18829c5ff9b4SQu Wenruo * have at least all data sectors uptodate. 18837a315072SQu Wenruo * 18847a315072SQu Wenruo * If possible, also check if the repaired sector matches its data 18857a315072SQu Wenruo * checksum. 
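* A sector rebuilt from parity that still fails its csum check is
* reported as an error rather than marked uptodate, as the rebuilt
* content itself cannot be trusted.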
18869c5ff9b4SQu Wenruo */ 188775b47033SQu Wenruo if (faila >= 0) { 18887a315072SQu Wenruo ret = verify_one_sector(rbio, faila, sector_nr); 18897a315072SQu Wenruo if (ret < 0) 18907a315072SQu Wenruo goto cleanup; 18917a315072SQu Wenruo 189275b47033SQu Wenruo sector = rbio_stripe_sector(rbio, faila, sector_nr); 18939c5ff9b4SQu Wenruo sector->uptodate = 1; 18949c5ff9b4SQu Wenruo } 189575b47033SQu Wenruo if (failb >= 0) { 1896f7c11affSTanmay Bhushan ret = verify_one_sector(rbio, failb, sector_nr); 18977a315072SQu Wenruo if (ret < 0) 18987a315072SQu Wenruo goto cleanup; 18997a315072SQu Wenruo 190075b47033SQu Wenruo sector = rbio_stripe_sector(rbio, failb, sector_nr); 19019c5ff9b4SQu Wenruo sector->uptodate = 1; 19029c5ff9b4SQu Wenruo } 19039c5ff9b4SQu Wenruo 19049c5ff9b4SQu Wenruo cleanup: 19059c5ff9b4SQu Wenruo for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--) 19069c5ff9b4SQu Wenruo kunmap_local(unmap_array[stripe_nr]); 19077a315072SQu Wenruo return ret; 19089c5ff9b4SQu Wenruo } 19099c5ff9b4SQu Wenruo 1910ec936b03SQu Wenruo static int recover_sectors(struct btrfs_raid_bio *rbio) 191153b381b3SDavid Woodhouse { 19129c5ff9b4SQu Wenruo void **pointers = NULL; 19139c5ff9b4SQu Wenruo void **unmap_array = NULL; 1914ec936b03SQu Wenruo int sectornr; 1915ec936b03SQu Wenruo int ret = 0; 191653b381b3SDavid Woodhouse 191707e4d380SQu Wenruo /* 1918ec936b03SQu Wenruo * @pointers array stores the pointer for each sector. 1919ec936b03SQu Wenruo * 1920ec936b03SQu Wenruo * @unmap_array stores copy of pointers that does not get reordered 1921ec936b03SQu Wenruo * during reconstruction so that kunmap_local works. 192207e4d380SQu Wenruo */ 192331e818feSDavid Sterba pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 192494a0b58dSIra Weiny unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 1925ec936b03SQu Wenruo if (!pointers || !unmap_array) { 1926ec936b03SQu Wenruo ret = -ENOMEM; 1927ec936b03SQu Wenruo goto out; 192894a0b58dSIra Weiny } 192994a0b58dSIra Weiny 1930b4ee1782SOmar Sandoval if (rbio->operation == BTRFS_RBIO_READ_REBUILD || 1931b4ee1782SOmar Sandoval rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { 193253b381b3SDavid Woodhouse spin_lock_irq(&rbio->bio_list_lock); 193353b381b3SDavid Woodhouse set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 193453b381b3SDavid Woodhouse spin_unlock_irq(&rbio->bio_list_lock); 193553b381b3SDavid Woodhouse } 193653b381b3SDavid Woodhouse 193753b381b3SDavid Woodhouse index_rbio_pages(rbio); 193853b381b3SDavid Woodhouse 193975b47033SQu Wenruo for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { 194075b47033SQu Wenruo ret = recover_vertical(rbio, sectornr, pointers, unmap_array); 194175b47033SQu Wenruo if (ret < 0) 194275b47033SQu Wenruo break; 194375b47033SQu Wenruo } 194453b381b3SDavid Woodhouse 1945ec936b03SQu Wenruo out: 194653b381b3SDavid Woodhouse kfree(pointers); 1947ec936b03SQu Wenruo kfree(unmap_array); 1948ec936b03SQu Wenruo return ret; 1949ec936b03SQu Wenruo } 1950ec936b03SQu Wenruo 1951d31968d9SQu Wenruo static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, 1952d31968d9SQu Wenruo struct bio_list *bio_list) 195353b381b3SDavid Woodhouse { 195453b381b3SDavid Woodhouse struct bio *bio; 1955d31968d9SQu Wenruo int total_sector_nr; 1956d31968d9SQu Wenruo int ret = 0; 195753b381b3SDavid Woodhouse 1958d31968d9SQu Wenruo ASSERT(bio_list_size(bio_list) == 0); 195953b381b3SDavid Woodhouse /* 1960f6065f8eSQu Wenruo * Read everything that hasn't failed. 
However, this time we do
1961f6065f8eSQu Wenruo * not trust any cached sector.
1962f6065f8eSQu Wenruo * The cache may hold stale data in ranges that the higher layer
1963f6065f8eSQu Wenruo * never reads, so stale content could go unnoticed.
1964f6065f8eSQu Wenruo *
1965f6065f8eSQu Wenruo * So in the recovery path we always re-read everything from disk.
196653b381b3SDavid Woodhouse */
1967ef340fccSQu Wenruo for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
1968ef340fccSQu Wenruo total_sector_nr++) {
1969ef340fccSQu Wenruo int stripe = total_sector_nr / rbio->stripe_nsectors;
1970ef340fccSQu Wenruo int sectornr = total_sector_nr % rbio->stripe_nsectors;
19713e77605dSQu Wenruo struct sector_ptr *sector;
197253b381b3SDavid Woodhouse 
197375b47033SQu Wenruo /*
197475b47033SQu Wenruo * Skip any range that has an error. It can be a range marked
197575b47033SQu Wenruo * as an error (for a csum mismatch), or it can be on a missing
197675b47033SQu Wenruo * device.
197775b47033SQu Wenruo */
197875b47033SQu Wenruo if (!rbio->bioc->stripes[stripe].dev->bdev ||
197975b47033SQu Wenruo test_bit(total_sector_nr, rbio->error_bitmap)) {
198075b47033SQu Wenruo /*
198175b47033SQu Wenruo * Also set the error bit for missing device, which
198275b47033SQu Wenruo * may not yet have its error bit set.
198375b47033SQu Wenruo */
198475b47033SQu Wenruo set_bit(total_sector_nr, rbio->error_bitmap);
198553b381b3SDavid Woodhouse continue;
1986ef340fccSQu Wenruo }
198775b47033SQu Wenruo 
198853b381b3SDavid Woodhouse sector = rbio_stripe_sector(rbio, stripe, sectornr);
1989d31968d9SQu Wenruo ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
1990ff18a4afSChristoph Hellwig sectornr, REQ_OP_READ);
199153b381b3SDavid Woodhouse if (ret < 0)
1992d31968d9SQu Wenruo goto error;
199353b381b3SDavid Woodhouse }
1994d31968d9SQu Wenruo return 0;
1995d31968d9SQu Wenruo error:
1996d31968d9SQu Wenruo while ((bio = bio_list_pop(bio_list)))
1997d31968d9SQu Wenruo bio_put(bio);
1998d31968d9SQu Wenruo 
1999d31968d9SQu Wenruo return -EIO;
2000d31968d9SQu Wenruo }
2001d31968d9SQu Wenruo 
2002d817ce35SQu Wenruo static int recover_rbio(struct btrfs_raid_bio *rbio)
2003d817ce35SQu Wenruo {
2004d817ce35SQu Wenruo struct bio_list bio_list;
2005d817ce35SQu Wenruo struct bio *bio;
2006d817ce35SQu Wenruo int ret;
2007d817ce35SQu Wenruo 
2008d817ce35SQu Wenruo /*
2009d817ce35SQu Wenruo * Either we're doing recovery for a read failure or a degraded write;
201075b47033SQu Wenruo * the caller should have set the error bitmap correctly.
2011d817ce35SQu Wenruo */
20122942a50dSQu Wenruo ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors));
2013d817ce35SQu Wenruo bio_list_init(&bio_list);
2014d817ce35SQu Wenruo 
2015d817ce35SQu Wenruo /* For recovery, we need to read all sectors including P/Q.
*/
2016d817ce35SQu Wenruo ret = alloc_rbio_pages(rbio);
2017d817ce35SQu Wenruo if (ret < 0)
2018d817ce35SQu Wenruo goto out;
2019d817ce35SQu Wenruo 
2020d817ce35SQu Wenruo index_rbio_pages(rbio);
2021d817ce35SQu Wenruo 
2022d817ce35SQu Wenruo ret = recover_assemble_read_bios(rbio, &bio_list);
2023d817ce35SQu Wenruo if (ret < 0)
2024d817ce35SQu Wenruo goto out;
2025d817ce35SQu Wenruo 
2026d817ce35SQu Wenruo submit_read_bios(rbio, &bio_list);
2027d817ce35SQu Wenruo wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
2028d817ce35SQu Wenruo 
2029d817ce35SQu Wenruo ret = recover_sectors(rbio);
2030d817ce35SQu Wenruo 
2031d817ce35SQu Wenruo out:
2032d817ce35SQu Wenruo while ((bio = bio_list_pop(&bio_list)))
2033d817ce35SQu Wenruo bio_put(bio);
2034d817ce35SQu Wenruo 
2035d817ce35SQu Wenruo return ret;
2036d817ce35SQu Wenruo }
2037d817ce35SQu Wenruo 
2038d817ce35SQu Wenruo static void recover_rbio_work(struct work_struct *work)
2039d817ce35SQu Wenruo {
2040d817ce35SQu Wenruo struct btrfs_raid_bio *rbio;
2041d817ce35SQu Wenruo int ret;
2042d817ce35SQu Wenruo 
2043d817ce35SQu Wenruo rbio = container_of(work, struct btrfs_raid_bio, work);
2044d817ce35SQu Wenruo 
2045d817ce35SQu Wenruo ret = lock_stripe_add(rbio);
2046d817ce35SQu Wenruo if (ret == 0) {
2047d817ce35SQu Wenruo ret = recover_rbio(rbio);
2048d817ce35SQu Wenruo rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2049d817ce35SQu Wenruo }
2050d817ce35SQu Wenruo }
2051d817ce35SQu Wenruo 
2052d817ce35SQu Wenruo static void recover_rbio_work_locked(struct work_struct *work)
2053d817ce35SQu Wenruo {
2054d817ce35SQu Wenruo struct btrfs_raid_bio *rbio;
2055d817ce35SQu Wenruo int ret;
2056d817ce35SQu Wenruo 
2057d817ce35SQu Wenruo rbio = container_of(work, struct btrfs_raid_bio, work);
2058d817ce35SQu Wenruo 
2059d817ce35SQu Wenruo ret = recover_rbio(rbio);
2060d817ce35SQu Wenruo rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2061d817ce35SQu Wenruo }
2062d817ce35SQu Wenruo 
206375b47033SQu Wenruo static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num)
206475b47033SQu Wenruo {
206575b47033SQu Wenruo bool found = false;
206675b47033SQu Wenruo int sector_nr;
206775b47033SQu Wenruo 
206875b47033SQu Wenruo /*
206975b47033SQu Wenruo * This is for RAID6 extra recovery tries, thus the mirror number should
207075b47033SQu Wenruo * be larger than 2.
207175b47033SQu Wenruo * Mirror 1 means read from data stripes. Mirror 2 means rebuild using
207275b47033SQu Wenruo * RAID5 methods.
207375b47033SQu Wenruo */
207475b47033SQu Wenruo ASSERT(mirror_num > 2);
207575b47033SQu Wenruo for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
207675b47033SQu Wenruo int found_errors;
207775b47033SQu Wenruo int faila;
207875b47033SQu Wenruo int failb;
207975b47033SQu Wenruo 
208075b47033SQu Wenruo found_errors = get_rbio_veritical_errors(rbio, sector_nr,
208175b47033SQu Wenruo &faila, &failb);
208275b47033SQu Wenruo /* This vertical stripe doesn't have errors. */
208375b47033SQu Wenruo if (!found_errors)
208475b47033SQu Wenruo continue;
208575b47033SQu Wenruo 
208675b47033SQu Wenruo /*
208775b47033SQu Wenruo * If we found errors, there should be only one error marked
208875b47033SQu Wenruo * by previous set_rbio_range_error().
208975b47033SQu Wenruo */
209075b47033SQu Wenruo ASSERT(found_errors == 1);
209175b47033SQu Wenruo found = true;
209275b47033SQu Wenruo 
209375b47033SQu Wenruo /* Now select another stripe to mark as error.
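* For example, with real_stripes == 4 (RAID6 with two data stripes),
* mirror_num == 3 selects failb == 2; if that lands on or before faila
* it is shifted down by one, so each retry fails a different stripe.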
*/
209475b47033SQu Wenruo failb = rbio->real_stripes - (mirror_num - 1);
209575b47033SQu Wenruo if (failb <= faila)
209675b47033SQu Wenruo failb--;
209775b47033SQu Wenruo 
209875b47033SQu Wenruo /* Set the extra bit in error bitmap. */
209975b47033SQu Wenruo if (failb >= 0)
210075b47033SQu Wenruo set_bit(failb * rbio->stripe_nsectors + sector_nr,
210175b47033SQu Wenruo rbio->error_bitmap);
210275b47033SQu Wenruo }
210375b47033SQu Wenruo 
210475b47033SQu Wenruo /* We should have found at least one vertical stripe with an error. */
210575b47033SQu Wenruo ASSERT(found);
210675b47033SQu Wenruo }
210775b47033SQu Wenruo 
2108d31968d9SQu Wenruo /*
210953b381b3SDavid Woodhouse * the main entry point for reads from the higher layers. This
211053b381b3SDavid Woodhouse * is really only called when the normal read path had a failure,
211153b381b3SDavid Woodhouse * so we assume the bio they send down corresponds to a failed part
211253b381b3SDavid Woodhouse * of the drive.
211353b381b3SDavid Woodhouse */
21146065fd95SChristoph Hellwig void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
2115f1c29379SChristoph Hellwig int mirror_num)
211653b381b3SDavid Woodhouse {
21176a258d72SQu Wenruo struct btrfs_fs_info *fs_info = bioc->fs_info;
211853b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio;
211953b381b3SDavid Woodhouse 
2120ff18a4afSChristoph Hellwig rbio = alloc_rbio(fs_info, bioc);
2121af8e2d1dSMiao Xie if (IS_ERR(rbio)) {
21226065fd95SChristoph Hellwig bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
2123d817ce35SQu Wenruo bio_endio(bio);
2124d817ce35SQu Wenruo return;
2125af8e2d1dSMiao Xie }
212653b381b3SDavid Woodhouse 
21271b94b556SMiao Xie rbio->operation = BTRFS_RBIO_READ_REBUILD;
2128bd8f7e62SQu Wenruo rbio_add_bio(rbio, bio);
212953b381b3SDavid Woodhouse 
21302942a50dSQu Wenruo set_rbio_range_error(rbio, bio);
21312942a50dSQu Wenruo 
213253b381b3SDavid Woodhouse /*
21338810f751SLiu Bo * Loop retry:
21348810f751SLiu Bo * for 'mirror == 2', reconstruct from all other stripes.
21358810f751SLiu Bo * for 'mirror_num > 2', select a stripe to fail on every retry.
213653b381b3SDavid Woodhouse */
2137ad3daf1cSQu Wenruo if (mirror_num > 2)
213875b47033SQu Wenruo set_rbio_raid6_extra_error(rbio, mirror_num);
213953b381b3SDavid Woodhouse 
2140d817ce35SQu Wenruo start_async_work(rbio, recover_rbio_work);
214153b381b3SDavid Woodhouse }
214253b381b3SDavid Woodhouse 
2143c5a41562SQu Wenruo static void fill_data_csums(struct btrfs_raid_bio *rbio)
2144c5a41562SQu Wenruo {
2145c5a41562SQu Wenruo struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
2146c5a41562SQu Wenruo struct btrfs_root *csum_root = btrfs_csum_root(fs_info,
2147c5a41562SQu Wenruo rbio->bioc->raid_map[0]);
2148c5a41562SQu Wenruo const u64 start = rbio->bioc->raid_map[0];
2149c5a41562SQu Wenruo const u32 len = (rbio->nr_data * rbio->stripe_nsectors) <<
2150c5a41562SQu Wenruo fs_info->sectorsize_bits;
2151c5a41562SQu Wenruo int ret;
2152c5a41562SQu Wenruo 
2153c5a41562SQu Wenruo /* The rbio should not have its csum buffer initialized. */
2154c5a41562SQu Wenruo ASSERT(!rbio->csum_buf && !rbio->csum_bitmap);
2155c5a41562SQu Wenruo 
2156c5a41562SQu Wenruo /*
2157c5a41562SQu Wenruo * Skip the csum search if:
2158c5a41562SQu Wenruo *
2159c5a41562SQu Wenruo * - The rbio doesn't belong to data block groups
2160c5a41562SQu Wenruo * Then we are doing IO for tree blocks, no need to search csums.
2161c5a41562SQu Wenruo *
2162c5a41562SQu Wenruo * - The rbio belongs to mixed block groups
2163c5a41562SQu Wenruo * This is to avoid a deadlock: we are already holding the full
2164c5a41562SQu Wenruo * stripe lock, and if a metadata read triggered here needed to do
2165c5a41562SQu Wenruo * raid56 recovery, it would have to take that same lock.
2166c5a41562SQu Wenruo */
2167c5a41562SQu Wenruo if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) ||
2168c5a41562SQu Wenruo rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA)
2169c5a41562SQu Wenruo return;
2170c5a41562SQu Wenruo 
2171c5a41562SQu Wenruo rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors *
2172c5a41562SQu Wenruo fs_info->csum_size, GFP_NOFS);
2173c5a41562SQu Wenruo rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors,
2174c5a41562SQu Wenruo GFP_NOFS);
2175c5a41562SQu Wenruo if (!rbio->csum_buf || !rbio->csum_bitmap) {
2176c5a41562SQu Wenruo ret = -ENOMEM;
2177c5a41562SQu Wenruo goto error;
2178c5a41562SQu Wenruo }
2179c5a41562SQu Wenruo 
2180c5a41562SQu Wenruo ret = btrfs_lookup_csums_bitmap(csum_root, start, start + len - 1,
2181c5a41562SQu Wenruo rbio->csum_buf, rbio->csum_bitmap);
2182c5a41562SQu Wenruo if (ret < 0)
2183c5a41562SQu Wenruo goto error;
2184c5a41562SQu Wenruo if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits))
2185c5a41562SQu Wenruo goto no_csum;
2186c5a41562SQu Wenruo return;
2187c5a41562SQu Wenruo 
2188c5a41562SQu Wenruo error:
2189c5a41562SQu Wenruo /*
2190c5a41562SQu Wenruo * We failed to allocate memory or grab the csum, but it's not fatal,
2191c5a41562SQu Wenruo * we can still continue. But better to warn users that RMW is no
2192c5a41562SQu Wenruo * longer safe for this particular sub-stripe write.
2193c5a41562SQu Wenruo */
2194c5a41562SQu Wenruo btrfs_warn_rl(fs_info,
2195c5a41562SQu Wenruo "sub-stripe write for full stripe %llu is not safe, failed to get csum: %d",
2196c5a41562SQu Wenruo rbio->bioc->raid_map[0], ret);
2197c5a41562SQu Wenruo no_csum:
2198c5a41562SQu Wenruo kfree(rbio->csum_buf);
2199c5a41562SQu Wenruo bitmap_free(rbio->csum_bitmap);
2200c5a41562SQu Wenruo rbio->csum_buf = NULL;
2201c5a41562SQu Wenruo rbio->csum_bitmap = NULL;
2202c5a41562SQu Wenruo }
2203c5a41562SQu Wenruo 
22047a315072SQu Wenruo static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio)
22055eb30ee2SQu Wenruo {
22065eb30ee2SQu Wenruo struct bio_list bio_list;
22075eb30ee2SQu Wenruo struct bio *bio;
22085eb30ee2SQu Wenruo int ret;
22095eb30ee2SQu Wenruo 
22105eb30ee2SQu Wenruo bio_list_init(&bio_list);
22115eb30ee2SQu Wenruo 
2212c5a41562SQu Wenruo /*
2213c5a41562SQu Wenruo * Fill the data csums we need for data verification. We need to fill
2214c5a41562SQu Wenruo * the csum_bitmap/csum_buf first, as our endio function will try to
2215c5a41562SQu Wenruo * verify the data sectors.
2216c5a41562SQu Wenruo */
2217c5a41562SQu Wenruo fill_data_csums(rbio);
2218c5a41562SQu Wenruo 
22195eb30ee2SQu Wenruo ret = rmw_assemble_read_bios(rbio, &bio_list);
22205eb30ee2SQu Wenruo if (ret < 0)
22215eb30ee2SQu Wenruo goto out;
22225eb30ee2SQu Wenruo 
22235eb30ee2SQu Wenruo submit_read_bios(rbio, &bio_list);
22245eb30ee2SQu Wenruo wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
22257a315072SQu Wenruo 
22267a315072SQu Wenruo /*
22277a315072SQu Wenruo * We may or may not have any corrupted sectors (including missing dev
22287a315072SQu Wenruo * and csum mismatch), just let recover_sectors() handle them all.
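* recover_sectors() only rebuilds vertical stripes that have error bits
* set, so a fully clean read adds no extra work here.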
22297a315072SQu Wenruo */ 22307a315072SQu Wenruo ret = recover_sectors(rbio); 22315eb30ee2SQu Wenruo return ret; 22325eb30ee2SQu Wenruo out: 22335eb30ee2SQu Wenruo while ((bio = bio_list_pop(&bio_list))) 22345eb30ee2SQu Wenruo bio_put(bio); 22355eb30ee2SQu Wenruo 22365eb30ee2SQu Wenruo return ret; 22375eb30ee2SQu Wenruo } 22385eb30ee2SQu Wenruo 22395eb30ee2SQu Wenruo static void raid_wait_write_end_io(struct bio *bio) 22405eb30ee2SQu Wenruo { 22415eb30ee2SQu Wenruo struct btrfs_raid_bio *rbio = bio->bi_private; 22425eb30ee2SQu Wenruo blk_status_t err = bio->bi_status; 22435eb30ee2SQu Wenruo 2244ad3daf1cSQu Wenruo if (err) 22452942a50dSQu Wenruo rbio_update_error_bitmap(rbio, bio); 22465eb30ee2SQu Wenruo bio_put(bio); 22475eb30ee2SQu Wenruo if (atomic_dec_and_test(&rbio->stripes_pending)) 22485eb30ee2SQu Wenruo wake_up(&rbio->io_wait); 22495eb30ee2SQu Wenruo } 22505eb30ee2SQu Wenruo 22515eb30ee2SQu Wenruo static void submit_write_bios(struct btrfs_raid_bio *rbio, 22525eb30ee2SQu Wenruo struct bio_list *bio_list) 22535eb30ee2SQu Wenruo { 22545eb30ee2SQu Wenruo struct bio *bio; 22555eb30ee2SQu Wenruo 22565eb30ee2SQu Wenruo atomic_set(&rbio->stripes_pending, bio_list_size(bio_list)); 22575eb30ee2SQu Wenruo while ((bio = bio_list_pop(bio_list))) { 22585eb30ee2SQu Wenruo bio->bi_end_io = raid_wait_write_end_io; 22595eb30ee2SQu Wenruo 22605eb30ee2SQu Wenruo if (trace_raid56_write_stripe_enabled()) { 22615eb30ee2SQu Wenruo struct raid56_bio_trace_info trace_info = { 0 }; 22625eb30ee2SQu Wenruo 22635eb30ee2SQu Wenruo bio_get_trace_info(rbio, bio, &trace_info); 22645eb30ee2SQu Wenruo trace_raid56_write_stripe(rbio, bio, &trace_info); 22655eb30ee2SQu Wenruo } 22665eb30ee2SQu Wenruo submit_bio(bio); 22675eb30ee2SQu Wenruo } 22685eb30ee2SQu Wenruo } 22695eb30ee2SQu Wenruo 22707a315072SQu Wenruo /* 22717a315072SQu Wenruo * To determine if we need to read any sector from the disk. 22727a315072SQu Wenruo * Should only be utilized in RMW path, to skip cached rbio. 22737a315072SQu Wenruo */ 22747a315072SQu Wenruo static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) 22757a315072SQu Wenruo { 22767a315072SQu Wenruo int i; 22777a315072SQu Wenruo 22787a315072SQu Wenruo for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) { 22797a315072SQu Wenruo struct sector_ptr *sector = &rbio->stripe_sectors[i]; 22807a315072SQu Wenruo 22817a315072SQu Wenruo /* 22827a315072SQu Wenruo * We have a sector which doesn't have page nor uptodate, 22837a315072SQu Wenruo * thus this rbio can not be cached one, as cached one must 22847a315072SQu Wenruo * have all its data sectors present and uptodate. 22857a315072SQu Wenruo */ 22867a315072SQu Wenruo if (!sector->page || !sector->uptodate) 22877a315072SQu Wenruo return true; 22887a315072SQu Wenruo } 22897a315072SQu Wenruo return false; 22907a315072SQu Wenruo } 22917a315072SQu Wenruo 229293723095SQu Wenruo static int rmw_rbio(struct btrfs_raid_bio *rbio) 22935eb30ee2SQu Wenruo { 22945eb30ee2SQu Wenruo struct bio_list bio_list; 22955eb30ee2SQu Wenruo int sectornr; 22965eb30ee2SQu Wenruo int ret = 0; 22975eb30ee2SQu Wenruo 22985eb30ee2SQu Wenruo /* 22995eb30ee2SQu Wenruo * Allocate the pages for parity first, as P/Q pages will always be 23005eb30ee2SQu Wenruo * needed for both full-stripe and sub-stripe writes. 
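* Data pages are only allocated further down, when a sub-stripe write
* forces the full RMW read; a full-stripe write takes all of its data
* from the bio list instead.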
23015eb30ee2SQu Wenruo */ 23025eb30ee2SQu Wenruo ret = alloc_rbio_parity_pages(rbio); 23035eb30ee2SQu Wenruo if (ret < 0) 23045eb30ee2SQu Wenruo return ret; 23055eb30ee2SQu Wenruo 23067a315072SQu Wenruo /* 23077a315072SQu Wenruo * Either full stripe write, or we have every data sector already 23087a315072SQu Wenruo * cached, can go to write path immediately. 23097a315072SQu Wenruo */ 23107a315072SQu Wenruo if (rbio_is_full(rbio) || !need_read_stripe_sectors(rbio)) 23115eb30ee2SQu Wenruo goto write; 23127a315072SQu Wenruo 23135eb30ee2SQu Wenruo /* 23145eb30ee2SQu Wenruo * Now we're doing sub-stripe write, also need all data stripes to do 23155eb30ee2SQu Wenruo * the full RMW. 23165eb30ee2SQu Wenruo */ 23175eb30ee2SQu Wenruo ret = alloc_rbio_data_pages(rbio); 23185eb30ee2SQu Wenruo if (ret < 0) 23195eb30ee2SQu Wenruo return ret; 23205eb30ee2SQu Wenruo 23215eb30ee2SQu Wenruo index_rbio_pages(rbio); 23225eb30ee2SQu Wenruo 23237a315072SQu Wenruo ret = rmw_read_wait_recover(rbio); 23245eb30ee2SQu Wenruo if (ret < 0) 23255eb30ee2SQu Wenruo return ret; 23265eb30ee2SQu Wenruo 23275eb30ee2SQu Wenruo write: 23285eb30ee2SQu Wenruo /* 23295eb30ee2SQu Wenruo * At this stage we're not allowed to add any new bios to the 23305eb30ee2SQu Wenruo * bio list any more, anyone else that wants to change this stripe 23315eb30ee2SQu Wenruo * needs to do their own rmw. 23325eb30ee2SQu Wenruo */ 23335eb30ee2SQu Wenruo spin_lock_irq(&rbio->bio_list_lock); 23345eb30ee2SQu Wenruo set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 23355eb30ee2SQu Wenruo spin_unlock_irq(&rbio->bio_list_lock); 23365eb30ee2SQu Wenruo 23372942a50dSQu Wenruo bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); 23385eb30ee2SQu Wenruo 23395eb30ee2SQu Wenruo index_rbio_pages(rbio); 23405eb30ee2SQu Wenruo 23415eb30ee2SQu Wenruo /* 23425eb30ee2SQu Wenruo * We don't cache full rbios because we're assuming 23435eb30ee2SQu Wenruo * the higher layers are unlikely to use this area of 23445eb30ee2SQu Wenruo * the disk again soon. If they do use it again, 23455eb30ee2SQu Wenruo * hopefully they will send another full bio. 23465eb30ee2SQu Wenruo */ 23475eb30ee2SQu Wenruo if (!rbio_is_full(rbio)) 23485eb30ee2SQu Wenruo cache_rbio_pages(rbio); 23495eb30ee2SQu Wenruo else 23505eb30ee2SQu Wenruo clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 23515eb30ee2SQu Wenruo 23525eb30ee2SQu Wenruo for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) 23535eb30ee2SQu Wenruo generate_pq_vertical(rbio, sectornr); 23545eb30ee2SQu Wenruo 23555eb30ee2SQu Wenruo bio_list_init(&bio_list); 23565eb30ee2SQu Wenruo ret = rmw_assemble_write_bios(rbio, &bio_list); 23575eb30ee2SQu Wenruo if (ret < 0) 23585eb30ee2SQu Wenruo return ret; 23595eb30ee2SQu Wenruo 23605eb30ee2SQu Wenruo /* We should have at least one bio assembled. */ 23615eb30ee2SQu Wenruo ASSERT(bio_list_size(&bio_list)); 23625eb30ee2SQu Wenruo submit_write_bios(rbio, &bio_list); 23635eb30ee2SQu Wenruo wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); 23645eb30ee2SQu Wenruo 2365ad3daf1cSQu Wenruo /* We may have more errors than our tolerance during the read. 
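* The check below walks every vertical stripe; if any one of them has
* more errors than bioc->max_errors, the whole rbio fails with -EIO.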
	 */
2366ad3daf1cSQu Wenruo 	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
2367ad3daf1cSQu Wenruo 		int found_errors;
2368ad3daf1cSQu Wenruo 
2369ad3daf1cSQu Wenruo 		found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL);
2370ad3daf1cSQu Wenruo 		if (found_errors > rbio->bioc->max_errors) {
23715eb30ee2SQu Wenruo 			ret = -EIO;
2372ad3daf1cSQu Wenruo 			break;
2373ad3daf1cSQu Wenruo 		}
2374ad3daf1cSQu Wenruo 	}
23755eb30ee2SQu Wenruo 	return ret;
23765eb30ee2SQu Wenruo }
23775eb30ee2SQu Wenruo 
237893723095SQu Wenruo static void rmw_rbio_work(struct work_struct *work)
237953b381b3SDavid Woodhouse {
238053b381b3SDavid Woodhouse 	struct btrfs_raid_bio *rbio;
238193723095SQu Wenruo 	int ret;
238253b381b3SDavid Woodhouse 
238353b381b3SDavid Woodhouse 	rbio = container_of(work, struct btrfs_raid_bio, work);
238493723095SQu Wenruo 
238593723095SQu Wenruo 	ret = lock_stripe_add(rbio);
238693723095SQu Wenruo 	if (ret == 0) {
238793723095SQu Wenruo 		ret = rmw_rbio(rbio);
238893723095SQu Wenruo 		rbio_orig_end_io(rbio, errno_to_blk_status(ret));
238993723095SQu Wenruo 	}
239093723095SQu Wenruo }
239193723095SQu Wenruo 
239293723095SQu Wenruo static void rmw_rbio_work_locked(struct work_struct *work)
239393723095SQu Wenruo {
239493723095SQu Wenruo 	struct btrfs_raid_bio *rbio;
239593723095SQu Wenruo 	int ret;
239693723095SQu Wenruo 
239793723095SQu Wenruo 	rbio = container_of(work, struct btrfs_raid_bio, work);
239893723095SQu Wenruo 
239993723095SQu Wenruo 	ret = rmw_rbio(rbio);
240093723095SQu Wenruo 	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
240153b381b3SDavid Woodhouse }
240253b381b3SDavid Woodhouse 
24035a6ac9eaSMiao Xie /*
24045a6ac9eaSMiao Xie  * The following code is used to scrub/replace the parity stripe
24055a6ac9eaSMiao Xie  *
24064c664611SQu Wenruo  * Caller must have already increased bio_counter for getting @bioc.
2407ae6529c3SQu Wenruo  *
24085a6ac9eaSMiao Xie  * Note: We need to make sure that all the pages added into the scrub/replace
24095a6ac9eaSMiao Xie  * raid bio are correct and do not change during the scrub/replace. That
24105a6ac9eaSMiao Xie  * is, those pages only hold metadata or file data with checksum.
24115a6ac9eaSMiao Xie  */
24125a6ac9eaSMiao Xie 
24136a258d72SQu Wenruo struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
24146a258d72SQu Wenruo 				struct btrfs_io_context *bioc,
2415ff18a4afSChristoph Hellwig 				struct btrfs_device *scrub_dev,
24165a6ac9eaSMiao Xie 				unsigned long *dbitmap, int stripe_nsectors)
24175a6ac9eaSMiao Xie {
24186a258d72SQu Wenruo 	struct btrfs_fs_info *fs_info = bioc->fs_info;
24195a6ac9eaSMiao Xie 	struct btrfs_raid_bio *rbio;
24205a6ac9eaSMiao Xie 	int i;
24215a6ac9eaSMiao Xie 
2422ff18a4afSChristoph Hellwig 	rbio = alloc_rbio(fs_info, bioc);
24235a6ac9eaSMiao Xie 	if (IS_ERR(rbio))
24245a6ac9eaSMiao Xie 		return NULL;
24255a6ac9eaSMiao Xie 	bio_list_add(&rbio->bio_list, bio);
24265a6ac9eaSMiao Xie 	/*
24275a6ac9eaSMiao Xie 	 * This is a special bio which is used to hold the completion handler
24285a6ac9eaSMiao Xie 	 * and make the scrub rbio similar to the other types.
24295a6ac9eaSMiao Xie 	 */
24305a6ac9eaSMiao Xie 	ASSERT(!bio->bi_iter.bi_size);
24315a6ac9eaSMiao Xie 	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
24325a6ac9eaSMiao Xie 
24339cd3a7ebSLiu Bo 	/*
24344c664611SQu Wenruo 	 * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
24359cd3a7ebSLiu Bo 	 * to the end position, so this search can start from the first parity
24369cd3a7ebSLiu Bo 	 * stripe.
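	 *
	 * (That is, after the sort the stripe array looks like
	 * D0 .. D(nr_data - 1), P, and for RAID6 also Q, so data occupies
	 * [0, nr_data) and parity occupies [nr_data, real_stripes).)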
24379cd3a7ebSLiu Bo 	 */
24389cd3a7ebSLiu Bo 	for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
24394c664611SQu Wenruo 		if (bioc->stripes[i].dev == scrub_dev) {
24405a6ac9eaSMiao Xie 			rbio->scrubp = i;
24415a6ac9eaSMiao Xie 			break;
24425a6ac9eaSMiao Xie 		}
24435a6ac9eaSMiao Xie 	}
24449cd3a7ebSLiu Bo 	ASSERT(i < rbio->real_stripes);
24455a6ac9eaSMiao Xie 
2446c67c68ebSQu Wenruo 	bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
24475a6ac9eaSMiao Xie 	return rbio;
24485a6ac9eaSMiao Xie }
24495a6ac9eaSMiao Xie 
2450b4ee1782SOmar Sandoval /* Used for both parity scrub and missing. */
2451b4ee1782SOmar Sandoval void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
24526346f6bfSQu Wenruo 			    unsigned int pgoff, u64 logical)
24535a6ac9eaSMiao Xie {
24546346f6bfSQu Wenruo 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
24555a6ac9eaSMiao Xie 	int stripe_offset;
24565a6ac9eaSMiao Xie 	int index;
24575a6ac9eaSMiao Xie 
24584c664611SQu Wenruo 	ASSERT(logical >= rbio->bioc->raid_map[0]);
24596346f6bfSQu Wenruo 	ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] +
2460ff18a4afSChristoph Hellwig 	       BTRFS_STRIPE_LEN * rbio->nr_data);
24614c664611SQu Wenruo 	stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
24626346f6bfSQu Wenruo 	index = stripe_offset / sectorsize;
24636346f6bfSQu Wenruo 	rbio->bio_sectors[index].page = page;
24646346f6bfSQu Wenruo 	rbio->bio_sectors[index].pgoff = pgoff;
24655a6ac9eaSMiao Xie }
24665a6ac9eaSMiao Xie 
24675a6ac9eaSMiao Xie /*
24685a6ac9eaSMiao Xie  * We only scrub the parity covered by correct data on the same horizontal
24695a6ac9eaSMiao Xie  * stripe, so we don't need to allocate pages for all the stripes.
24705a6ac9eaSMiao Xie  */
24715a6ac9eaSMiao Xie static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
24725a6ac9eaSMiao Xie {
24733907ce29SQu Wenruo 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
2474aee35e4bSQu Wenruo 	int total_sector_nr;
24755a6ac9eaSMiao Xie 
2476aee35e4bSQu Wenruo 	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2477aee35e4bSQu Wenruo 	     total_sector_nr++) {
24783907ce29SQu Wenruo 		struct page *page;
2479aee35e4bSQu Wenruo 		int sectornr = total_sector_nr % rbio->stripe_nsectors;
2480aee35e4bSQu Wenruo 		int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT;
24813907ce29SQu Wenruo 
2482aee35e4bSQu Wenruo 		if (!test_bit(sectornr, &rbio->dbitmap))
2483aee35e4bSQu Wenruo 			continue;
24845a6ac9eaSMiao Xie 		if (rbio->stripe_pages[index])
24855a6ac9eaSMiao Xie 			continue;
2486b0ee5e1eSDavid Sterba 		page = alloc_page(GFP_NOFS);
24875a6ac9eaSMiao Xie 		if (!page)
24885a6ac9eaSMiao Xie 			return -ENOMEM;
24895a6ac9eaSMiao Xie 		rbio->stripe_pages[index] = page;
24905a6ac9eaSMiao Xie 	}
2491eb357060SQu Wenruo 	index_stripe_sectors(rbio);
24925a6ac9eaSMiao Xie 	return 0;
24935a6ac9eaSMiao Xie }
24945a6ac9eaSMiao Xie 
24956bfd0133SQu Wenruo static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check)
24965a6ac9eaSMiao Xie {
24974c664611SQu Wenruo 	struct btrfs_io_context *bioc = rbio->bioc;
249846900662SQu Wenruo 	const u32 sectorsize = bioc->fs_info->sectorsize;
24991389053eSKees Cook 	void **pointers = rbio->finish_pointers;
2500c67c68ebSQu Wenruo 	unsigned long *pbitmap = &rbio->finish_pbitmap;
25015a6ac9eaSMiao Xie 	int nr_data = rbio->nr_data;
25025a6ac9eaSMiao Xie 	int stripe;
25033e77605dSQu Wenruo 	int sectornr;
2504c17af965SDavid Sterba 	bool has_qstripe;
250546900662SQu Wenruo 	struct sector_ptr p_sector = { 0 };
250646900662SQu Wenruo 	struct sector_ptr q_sector = { 0 };
25075a6ac9eaSMiao Xie 	struct bio_list bio_list;
25085a6ac9eaSMiao Xie 	struct bio *bio;
250976035976SMiao Xie 	int is_replace = 0;
25105a6ac9eaSMiao Xie 	int ret;
25115a6ac9eaSMiao Xie 
25125a6ac9eaSMiao Xie 	bio_list_init(&bio_list);
25135a6ac9eaSMiao Xie 
2514c17af965SDavid Sterba 	if (rbio->real_stripes - rbio->nr_data == 1)
2515c17af965SDavid Sterba 		has_qstripe = false;
2516c17af965SDavid Sterba 	else if (rbio->real_stripes - rbio->nr_data == 2)
2517c17af965SDavid Sterba 		has_qstripe = true;
2518c17af965SDavid Sterba 	else
25195a6ac9eaSMiao Xie 		BUG();
25205a6ac9eaSMiao Xie 
25214c664611SQu Wenruo 	if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
252276035976SMiao Xie 		is_replace = 1;
2523c67c68ebSQu Wenruo 		bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
252476035976SMiao Xie 	}
252576035976SMiao Xie 
25265a6ac9eaSMiao Xie 	/*
25275a6ac9eaSMiao Xie 	 * Because the higher layers (the scrubber) are unlikely to
25285a6ac9eaSMiao Xie 	 * use this area of the disk again soon, don't cache
25295a6ac9eaSMiao Xie 	 * it.
25305a6ac9eaSMiao Xie 	 */
25315a6ac9eaSMiao Xie 	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
25325a6ac9eaSMiao Xie 
25335a6ac9eaSMiao Xie 	if (!need_check)
25345a6ac9eaSMiao Xie 		goto writeback;
25355a6ac9eaSMiao Xie 
253646900662SQu Wenruo 	p_sector.page = alloc_page(GFP_NOFS);
253746900662SQu Wenruo 	if (!p_sector.page)
25386bfd0133SQu Wenruo 		return -ENOMEM;
253946900662SQu Wenruo 	p_sector.pgoff = 0;
254046900662SQu Wenruo 	p_sector.uptodate = 1;
25415a6ac9eaSMiao Xie 
2542c17af965SDavid Sterba 	if (has_qstripe) {
2543d70cef0dSIra Weiny 		/* RAID6, allocate and map temp space for the Q stripe */
254446900662SQu Wenruo 		q_sector.page = alloc_page(GFP_NOFS);
254546900662SQu Wenruo 		if (!q_sector.page) {
254646900662SQu Wenruo 			__free_page(p_sector.page);
254746900662SQu Wenruo 			p_sector.page = NULL;
25486bfd0133SQu Wenruo 			return -ENOMEM;
25495a6ac9eaSMiao Xie 		}
255046900662SQu Wenruo 		q_sector.pgoff = 0;
255146900662SQu Wenruo 		q_sector.uptodate = 1;
255246900662SQu Wenruo 		pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
25535a6ac9eaSMiao Xie 	}
25545a6ac9eaSMiao Xie 
25552942a50dSQu Wenruo 	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
25565a6ac9eaSMiao Xie 
2557d70cef0dSIra Weiny 	/* Map the parity stripe just once */
255846900662SQu Wenruo 	pointers[nr_data] = kmap_local_page(p_sector.page);
2559d70cef0dSIra Weiny 
2560c67c68ebSQu Wenruo 	for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
256146900662SQu Wenruo 		struct sector_ptr *sector;
25625a6ac9eaSMiao Xie 		void *parity;
256346900662SQu Wenruo 
25645a6ac9eaSMiao Xie 		/* first collect one page from each data stripe */
25655a6ac9eaSMiao Xie 		for (stripe = 0; stripe < nr_data; stripe++) {
256646900662SQu Wenruo 			sector = sector_in_rbio(rbio, stripe, sectornr, 0);
256746900662SQu Wenruo 			pointers[stripe] = kmap_local_page(sector->page) +
256846900662SQu Wenruo 					   sector->pgoff;
25695a6ac9eaSMiao Xie 		}
25705a6ac9eaSMiao Xie 
2571c17af965SDavid Sterba 		if (has_qstripe) {
2572d70cef0dSIra Weiny 			/* RAID6, call the library function to fill in our P/Q */
257346900662SQu Wenruo 			raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
25745a6ac9eaSMiao Xie 						pointers);
25755a6ac9eaSMiao Xie 		} else {
25765a6ac9eaSMiao Xie 			/* raid5 */
257746900662SQu Wenruo 			memcpy(pointers[nr_data], pointers[0], sectorsize);
257846900662SQu Wenruo 			run_xor(pointers + 1, nr_data - 1, sectorsize);
25795a6ac9eaSMiao Xie 		}
25805a6ac9eaSMiao Xie 
258101327610SNicholas D Steeves 		/* Check scrubbing parity and repair it */
258246900662SQu Wenruo 		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
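		/*
		 * At this point pointers[rbio->scrubp] holds the parity just
		 * computed from the data sectors, while @sector is the parity
		 * sector that was read from disk.  The check below rewrites
		 * the on-disk copy only on mismatch; if it already matches,
		 * the dbitmap bit is cleared so the writeback loop skips it.
		 */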
258346900662SQu Wenruo 		parity = kmap_local_page(sector->page) + sector->pgoff;
258446900662SQu Wenruo 		if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
258546900662SQu Wenruo 			memcpy(parity, pointers[rbio->scrubp], sectorsize);
25865a6ac9eaSMiao Xie 		else
25875a6ac9eaSMiao Xie 			/* The parity is right, no need to write it back. */
2588c67c68ebSQu Wenruo 			bitmap_clear(&rbio->dbitmap, sectornr, 1);
258958c1a35cSIra Weiny 		kunmap_local(parity);
25905a6ac9eaSMiao Xie 
259194a0b58dSIra Weiny 		for (stripe = nr_data - 1; stripe >= 0; stripe--)
259294a0b58dSIra Weiny 			kunmap_local(pointers[stripe]);
25935a6ac9eaSMiao Xie 	}
25945a6ac9eaSMiao Xie 
259594a0b58dSIra Weiny 	kunmap_local(pointers[nr_data]);
259646900662SQu Wenruo 	__free_page(p_sector.page);
259746900662SQu Wenruo 	p_sector.page = NULL;
259846900662SQu Wenruo 	if (q_sector.page) {
259994a0b58dSIra Weiny 		kunmap_local(pointers[rbio->real_stripes - 1]);
260046900662SQu Wenruo 		__free_page(q_sector.page);
260146900662SQu Wenruo 		q_sector.page = NULL;
2602d70cef0dSIra Weiny 	}
26035a6ac9eaSMiao Xie 
26045a6ac9eaSMiao Xie writeback:
26055a6ac9eaSMiao Xie 	/*
26065a6ac9eaSMiao Xie 	 * Time to start writing. Make bios for everything from the
26075a6ac9eaSMiao Xie 	 * higher layers (the bio_list in our rbio) and our p/q. Ignore
26085a6ac9eaSMiao Xie 	 * everything else.
26095a6ac9eaSMiao Xie 	 */
2610c67c68ebSQu Wenruo 	for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
26113e77605dSQu Wenruo 		struct sector_ptr *sector;
26125a6ac9eaSMiao Xie 
26133e77605dSQu Wenruo 		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
26143e77605dSQu Wenruo 		ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
2615ff18a4afSChristoph Hellwig 					 sectornr, REQ_OP_WRITE);
26165a6ac9eaSMiao Xie 		if (ret)
26175a6ac9eaSMiao Xie 			goto cleanup;
26185a6ac9eaSMiao Xie 	}
26195a6ac9eaSMiao Xie 
262076035976SMiao Xie 	if (!is_replace)
262176035976SMiao Xie 		goto submit_write;
262276035976SMiao Xie 
26233e77605dSQu Wenruo 	for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
26243e77605dSQu Wenruo 		struct sector_ptr *sector;
262576035976SMiao Xie 
26263e77605dSQu Wenruo 		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
26273e77605dSQu Wenruo 		ret = rbio_add_io_sector(rbio, &bio_list, sector,
26284c664611SQu Wenruo 					 bioc->tgtdev_map[rbio->scrubp],
2629ff18a4afSChristoph Hellwig 					 sectornr, REQ_OP_WRITE);
263076035976SMiao Xie 		if (ret)
263176035976SMiao Xie 			goto cleanup;
263276035976SMiao Xie 	}
263376035976SMiao Xie 
263476035976SMiao Xie submit_write:
26356bfd0133SQu Wenruo 	submit_write_bios(rbio, &bio_list);
26366bfd0133SQu Wenruo 	return 0;
26375a6ac9eaSMiao Xie 
26385a6ac9eaSMiao Xie cleanup:
2639785884fcSLiu Bo 	while ((bio = bio_list_pop(&bio_list)))
2640785884fcSLiu Bo 		bio_put(bio);
26416bfd0133SQu Wenruo 	return ret;
26425a6ac9eaSMiao Xie }
26435a6ac9eaSMiao Xie 
26445a6ac9eaSMiao Xie static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
26455a6ac9eaSMiao Xie {
26465a6ac9eaSMiao Xie 	if (stripe >= 0 && stripe < rbio->nr_data)
26475a6ac9eaSMiao Xie 		return 1;
26485a6ac9eaSMiao Xie 	return 0;
26495a6ac9eaSMiao Xie }
26505a6ac9eaSMiao Xie 
26516bfd0133SQu Wenruo static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
26525a6ac9eaSMiao Xie {
265375b47033SQu Wenruo 	void **pointers = NULL;
265475b47033SQu Wenruo 	void **unmap_array = NULL;
265575b47033SQu Wenruo 	int sector_nr;
2656e7fc357eSJosef Bacik 	int ret = 0;
26576bfd0133SQu Wenruo 
26585a6ac9eaSMiao Xie 	/*
265975b47033SQu Wenruo 	 * @pointers array stores the pointer for each sector.
266075b47033SQu Wenruo 	 *
266175b47033SQu Wenruo 	 * @unmap_array stores a copy of the pointers that does not get
266275b47033SQu Wenruo 	 * reordered during reconstruction, so that kunmap_local() works.
26635a6ac9eaSMiao Xie 	 */
266475b47033SQu Wenruo 	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
266575b47033SQu Wenruo 	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
266675b47033SQu Wenruo 	if (!pointers || !unmap_array) {
266775b47033SQu Wenruo 		ret = -ENOMEM;
266875b47033SQu Wenruo 		goto out;
266975b47033SQu Wenruo 	}
26705a6ac9eaSMiao Xie 
267175b47033SQu Wenruo 	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
267275b47033SQu Wenruo 		int dfail = 0, failp = -1;
267375b47033SQu Wenruo 		int faila;
267475b47033SQu Wenruo 		int failb;
267575b47033SQu Wenruo 		int found_errors;
267675b47033SQu Wenruo 
267775b47033SQu Wenruo 		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
267875b47033SQu Wenruo 							 &faila, &failb);
267975b47033SQu Wenruo 		if (found_errors > rbio->bioc->max_errors) {
268075b47033SQu Wenruo 			ret = -EIO;
268175b47033SQu Wenruo 			goto out;
268275b47033SQu Wenruo 		}
268375b47033SQu Wenruo 		if (found_errors == 0)
268475b47033SQu Wenruo 			continue;
268575b47033SQu Wenruo 
268675b47033SQu Wenruo 		/* We should have at least one error here. */
268775b47033SQu Wenruo 		ASSERT(faila >= 0 || failb >= 0);
268875b47033SQu Wenruo 
268975b47033SQu Wenruo 		if (is_data_stripe(rbio, faila))
269075b47033SQu Wenruo 			dfail++;
269175b47033SQu Wenruo 		else if (is_parity_stripe(faila))
269275b47033SQu Wenruo 			failp = faila;
269375b47033SQu Wenruo 
269475b47033SQu Wenruo 		if (is_data_stripe(rbio, failb))
269575b47033SQu Wenruo 			dfail++;
269675b47033SQu Wenruo 		else if (is_parity_stripe(failb))
269775b47033SQu Wenruo 			failp = failb;
26985a6ac9eaSMiao Xie 		/*
269975b47033SQu Wenruo 		 * Because we cannot use a scrubbing parity to repair data,
270075b47033SQu Wenruo 		 * the repair capability is reduced. (In the case of RAID5,
270175b47033SQu Wenruo 		 * we cannot repair anything.)
270275b47033SQu Wenruo 		 */
270375b47033SQu Wenruo 		if (dfail > rbio->bioc->max_errors - 1) {
270475b47033SQu Wenruo 			ret = -EIO;
270575b47033SQu Wenruo 			goto out;
270675b47033SQu Wenruo 		}
270775b47033SQu Wenruo 		/*
270875b47033SQu Wenruo 		 * If all the data is good, then only the parity is bad: just
270975b47033SQu Wenruo 		 * repair the parity, no need to recover data stripes.
27105a6ac9eaSMiao Xie 		 */
27116bfd0133SQu Wenruo 		if (dfail == 0)
271275b47033SQu Wenruo 			continue;
27135a6ac9eaSMiao Xie 
27145a6ac9eaSMiao Xie 		/*
27155a6ac9eaSMiao Xie 		 * Here we have one corrupted data stripe and one corrupted
271675b47033SQu Wenruo 		 * parity stripe on RAID6. If the corrupted parity is the one
271775b47033SQu Wenruo 		 * being scrubbed, we can luckily use the other parity to
271875b47033SQu Wenruo 		 * repair the data; otherwise the data stripe cannot be repaired.
27195a6ac9eaSMiao Xie 		 */
272075b47033SQu Wenruo 		if (failp != rbio->scrubp) {
272175b47033SQu Wenruo 			ret = -EIO;
272275b47033SQu Wenruo 			goto out;
272375b47033SQu Wenruo 		}
27245a6ac9eaSMiao Xie 
272575b47033SQu Wenruo 		ret = recover_vertical(rbio, sector_nr, pointers, unmap_array);
272675b47033SQu Wenruo 		if (ret < 0)
272775b47033SQu Wenruo 			goto out;
272875b47033SQu Wenruo 	}
272975b47033SQu Wenruo out:
273075b47033SQu Wenruo 	kfree(pointers);
273175b47033SQu Wenruo 	kfree(unmap_array);
27326bfd0133SQu Wenruo 	return ret;
27335a6ac9eaSMiao Xie }
27345a6ac9eaSMiao Xie 
2735cb3450b7SQu Wenruo static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio,
2736cb3450b7SQu Wenruo 				    struct bio_list *bio_list)
27375a6ac9eaSMiao Xie {
27385a6ac9eaSMiao Xie 	struct bio *bio;
2739cb3450b7SQu Wenruo 	int total_sector_nr;
2740cb3450b7SQu Wenruo 	int ret = 0;
27415a6ac9eaSMiao Xie 
2742cb3450b7SQu Wenruo 	ASSERT(bio_list_size(bio_list) == 0);
2743785884fcSLiu Bo 
27441c10702eSQu Wenruo 	/* Build a list of bios to read all the missing parts. */
27451c10702eSQu Wenruo 	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
27461c10702eSQu Wenruo 	     total_sector_nr++) {
27471c10702eSQu Wenruo 		int sectornr = total_sector_nr % rbio->stripe_nsectors;
27481c10702eSQu Wenruo 		int stripe = total_sector_nr / rbio->stripe_nsectors;
27493e77605dSQu Wenruo 		struct sector_ptr *sector;
27501c10702eSQu Wenruo 
27511c10702eSQu Wenruo 		/* No data in the vertical stripe, no need to read. */
27521c10702eSQu Wenruo 		if (!test_bit(sectornr, &rbio->dbitmap))
27531c10702eSQu Wenruo 			continue;
27541c10702eSQu Wenruo 
27555a6ac9eaSMiao Xie 		/*
27561c10702eSQu Wenruo 		 * We want to find all the sectors missing from the rbio and
27571c10702eSQu Wenruo 		 * read them from the disk. If sector_in_rbio() finds a sector
27581c10702eSQu Wenruo 		 * in the bio list we don't need to read it off the stripe.
27595a6ac9eaSMiao Xie 		 */
27603e77605dSQu Wenruo 		sector = sector_in_rbio(rbio, stripe, sectornr, 1);
27613e77605dSQu Wenruo 		if (sector)
27625a6ac9eaSMiao Xie 			continue;
27635a6ac9eaSMiao Xie 
27643e77605dSQu Wenruo 		sector = rbio_stripe_sector(rbio, stripe, sectornr);
27655a6ac9eaSMiao Xie 		/*
27661c10702eSQu Wenruo 		 * The bio cache may have handed us an uptodate sector. If so,
27671c10702eSQu Wenruo 		 * use it.
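		 * (A stripe sector can already be uptodate here when this
		 * rbio was stolen from the stripe cache of a previous RMW,
		 * or when an earlier read has filled it.)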
27685a6ac9eaSMiao Xie 		 */
27693e77605dSQu Wenruo 		if (sector->uptodate)
27705a6ac9eaSMiao Xie 			continue;
27715a6ac9eaSMiao Xie 
2772cb3450b7SQu Wenruo 		ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
2773ff18a4afSChristoph Hellwig 					 sectornr, REQ_OP_READ);
27745a6ac9eaSMiao Xie 		if (ret)
2775cb3450b7SQu Wenruo 			goto error;
27765a6ac9eaSMiao Xie 	}
2777cb3450b7SQu Wenruo 	return 0;
2778cb3450b7SQu Wenruo error:
2779cb3450b7SQu Wenruo 	while ((bio = bio_list_pop(bio_list)))
2780cb3450b7SQu Wenruo 		bio_put(bio);
2781cb3450b7SQu Wenruo 	return ret;
2782cb3450b7SQu Wenruo }
2783cb3450b7SQu Wenruo 
27846bfd0133SQu Wenruo static int scrub_rbio(struct btrfs_raid_bio *rbio)
2785cb3450b7SQu Wenruo {
27866bfd0133SQu Wenruo 	bool need_check = false;
2787cb3450b7SQu Wenruo 	struct bio_list bio_list;
2788ad3daf1cSQu Wenruo 	int sector_nr;
2789cb3450b7SQu Wenruo 	int ret;
2790cb3450b7SQu Wenruo 	struct bio *bio;
2791cb3450b7SQu Wenruo 
2792cb3450b7SQu Wenruo 	bio_list_init(&bio_list);
2793cb3450b7SQu Wenruo 
2794cb3450b7SQu Wenruo 	ret = alloc_rbio_essential_pages(rbio);
2795cb3450b7SQu Wenruo 	if (ret)
2796cb3450b7SQu Wenruo 		goto cleanup;
2797cb3450b7SQu Wenruo 
27982942a50dSQu Wenruo 	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
27992942a50dSQu Wenruo 
2800cb3450b7SQu Wenruo 	ret = scrub_assemble_read_bios(rbio, &bio_list);
2801cb3450b7SQu Wenruo 	if (ret < 0)
2802cb3450b7SQu Wenruo 		goto cleanup;
28035a6ac9eaSMiao Xie 
28046bfd0133SQu Wenruo 	submit_read_bios(rbio, &bio_list);
28056bfd0133SQu Wenruo 	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
28066bfd0133SQu Wenruo 
280775b47033SQu Wenruo 	/* We may have some failures, recover the failed sectors first. */
28086bfd0133SQu Wenruo 	ret = recover_scrub_rbio(rbio);
28096bfd0133SQu Wenruo 	if (ret < 0)
28106bfd0133SQu Wenruo 		goto cleanup;
28116bfd0133SQu Wenruo 
28125a6ac9eaSMiao Xie 	/*
28136bfd0133SQu Wenruo 	 * We have every sector properly prepared. Now we can finish the
28146bfd0133SQu Wenruo 	 * scrub and write back the good content.
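	 *
	 * The same per-vertical-stripe error check as in the RMW write path
	 * runs after these writes complete, so write failures beyond
	 * max_errors are still reported as -EIO.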
28155a6ac9eaSMiao Xie 	 */
28166bfd0133SQu Wenruo 	ret = finish_parity_scrub(rbio, need_check);
28176bfd0133SQu Wenruo 	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
2818ad3daf1cSQu Wenruo 	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2819ad3daf1cSQu Wenruo 		int found_errors;
2820ad3daf1cSQu Wenruo 
2821ad3daf1cSQu Wenruo 		found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL);
2822ad3daf1cSQu Wenruo 		if (found_errors > rbio->bioc->max_errors) {
28236bfd0133SQu Wenruo 			ret = -EIO;
2824ad3daf1cSQu Wenruo 			break;
2825ad3daf1cSQu Wenruo 		}
2826ad3daf1cSQu Wenruo 	}
28276bfd0133SQu Wenruo 	return ret;
28285a6ac9eaSMiao Xie 
28295a6ac9eaSMiao Xie cleanup:
2830785884fcSLiu Bo 	while ((bio = bio_list_pop(&bio_list)))
2831785884fcSLiu Bo 		bio_put(bio);
2832785884fcSLiu Bo 
28336bfd0133SQu Wenruo 	return ret;
28345a6ac9eaSMiao Xie }
28355a6ac9eaSMiao Xie 
28366bfd0133SQu Wenruo static void scrub_rbio_work_locked(struct work_struct *work)
28375a6ac9eaSMiao Xie {
28385a6ac9eaSMiao Xie 	struct btrfs_raid_bio *rbio;
28396bfd0133SQu Wenruo 	int ret;
28405a6ac9eaSMiao Xie 
28415a6ac9eaSMiao Xie 	rbio = container_of(work, struct btrfs_raid_bio, work);
28426bfd0133SQu Wenruo 	ret = scrub_rbio(rbio);
28436bfd0133SQu Wenruo 	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
28445a6ac9eaSMiao Xie }
28455a6ac9eaSMiao Xie 
28465a6ac9eaSMiao Xie void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
28475a6ac9eaSMiao Xie {
28485a6ac9eaSMiao Xie 	if (!lock_stripe_add(rbio))
28496bfd0133SQu Wenruo 		start_async_work(rbio, scrub_rbio_work_locked);
28505a6ac9eaSMiao Xie }
2851b4ee1782SOmar Sandoval 
2852b4ee1782SOmar Sandoval /* The following code is used for dev replace of a missing RAID 5/6 device. */
2853b4ee1782SOmar Sandoval 
2854b4ee1782SOmar Sandoval struct btrfs_raid_bio *
2855ff18a4afSChristoph Hellwig raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc)
2856b4ee1782SOmar Sandoval {
28576a258d72SQu Wenruo 	struct btrfs_fs_info *fs_info = bioc->fs_info;
2858b4ee1782SOmar Sandoval 	struct btrfs_raid_bio *rbio;
2859b4ee1782SOmar Sandoval 
2860ff18a4afSChristoph Hellwig 	rbio = alloc_rbio(fs_info, bioc);
2861b4ee1782SOmar Sandoval 	if (IS_ERR(rbio))
2862b4ee1782SOmar Sandoval 		return NULL;
2863b4ee1782SOmar Sandoval 
2864b4ee1782SOmar Sandoval 	rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
2865b4ee1782SOmar Sandoval 	bio_list_add(&rbio->bio_list, bio);
2866b4ee1782SOmar Sandoval 	/*
2867b4ee1782SOmar Sandoval 	 * This is a special bio which is used to hold the completion handler
2868b4ee1782SOmar Sandoval 	 * and make this rbio similar to the other types.
2869b4ee1782SOmar Sandoval 	 */
2870b4ee1782SOmar Sandoval 	ASSERT(!bio->bi_iter.bi_size);
2871b4ee1782SOmar Sandoval 
28722942a50dSQu Wenruo 	set_rbio_range_error(rbio, bio);
2873b4ee1782SOmar Sandoval 
2874b4ee1782SOmar Sandoval 	return rbio;
2875b4ee1782SOmar Sandoval }
2876b4ee1782SOmar Sandoval 
2877b4ee1782SOmar Sandoval void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
2878b4ee1782SOmar Sandoval {
2879d817ce35SQu Wenruo 	start_async_work(rbio, recover_rbio_work);
2880b4ee1782SOmar Sandoval }
2881
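
/*
 * A minimal, self-contained user-space sketch of the RAID5 parity
 * check-and-repair that finish_parity_scrub() performs for one vertical
 * stripe.  All names below are hypothetical illustrations, not kernel API;
 * the real code works on mapped pages, uses run_xor() or
 * raid6_call.gen_syndrome(), and only writes back sectors whose dbitmap
 * bit stays set.
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	#define SECTORSIZE	4096
 *	#define NR_DATA		3
 *
 *	// Recompute P = D0 ^ D1 ^ ... ^ D(n-1) and repair @parity on
 *	// mismatch.  Returns 1 when the on-disk parity had to be rewritten
 *	// (the dbitmap bit would stay set), 0 when it was already correct.
 *	static int scrub_parity_sector(const uint8_t data[NR_DATA][SECTORSIZE],
 *				       uint8_t parity[SECTORSIZE])
 *	{
 *		uint8_t good[SECTORSIZE];
 *
 *		memcpy(good, data[0], SECTORSIZE);
 *		for (int i = 1; i < NR_DATA; i++)
 *			for (int j = 0; j < SECTORSIZE; j++)
 *				good[j] ^= data[i][j];
 *
 *		if (memcmp(parity, good, SECTORSIZE) == 0)
 *			return 0;
 *		memcpy(parity, good, SECTORSIZE);
 *		return 1;
 *	}
 */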