1c1d7c514SDavid Sterba // SPDX-License-Identifier: GPL-2.0 253b381b3SDavid Woodhouse /* 353b381b3SDavid Woodhouse * Copyright (C) 2012 Fusion-io All rights reserved. 453b381b3SDavid Woodhouse * Copyright (C) 2012 Intel Corp. All rights reserved. 553b381b3SDavid Woodhouse */ 6c1d7c514SDavid Sterba 753b381b3SDavid Woodhouse #include <linux/sched.h> 853b381b3SDavid Woodhouse #include <linux/bio.h> 953b381b3SDavid Woodhouse #include <linux/slab.h> 1053b381b3SDavid Woodhouse #include <linux/blkdev.h> 1153b381b3SDavid Woodhouse #include <linux/raid/pq.h> 1253b381b3SDavid Woodhouse #include <linux/hash.h> 1353b381b3SDavid Woodhouse #include <linux/list_sort.h> 1453b381b3SDavid Woodhouse #include <linux/raid/xor.h> 15818e010bSDavid Sterba #include <linux/mm.h> 169b569ea0SJosef Bacik #include "messages.h" 17cea62800SJohannes Thumshirn #include "misc.h" 1853b381b3SDavid Woodhouse #include "ctree.h" 1953b381b3SDavid Woodhouse #include "disk-io.h" 2053b381b3SDavid Woodhouse #include "volumes.h" 2153b381b3SDavid Woodhouse #include "raid56.h" 2253b381b3SDavid Woodhouse #include "async-thread.h" 2353b381b3SDavid Woodhouse 2453b381b3SDavid Woodhouse /* set when additional merges to this rbio are not allowed */ 2553b381b3SDavid Woodhouse #define RBIO_RMW_LOCKED_BIT 1 2653b381b3SDavid Woodhouse 274ae10b3aSChris Mason /* 284ae10b3aSChris Mason * set when this rbio is sitting in the hash, but it is just a cache 294ae10b3aSChris Mason * of past RMW 304ae10b3aSChris Mason */ 314ae10b3aSChris Mason #define RBIO_CACHE_BIT 2 324ae10b3aSChris Mason 334ae10b3aSChris Mason /* 344ae10b3aSChris Mason * set when it is safe to trust the stripe_pages for caching 354ae10b3aSChris Mason */ 364ae10b3aSChris Mason #define RBIO_CACHE_READY_BIT 3 374ae10b3aSChris Mason 384ae10b3aSChris Mason #define RBIO_CACHE_SIZE 1024 394ae10b3aSChris Mason 408a953348SDavid Sterba #define BTRFS_STRIPE_HASH_TABLE_BITS 11 418a953348SDavid Sterba 428a953348SDavid Sterba /* Used by the raid56 code to lock stripes for read/modify/write */ 438a953348SDavid Sterba struct btrfs_stripe_hash { 448a953348SDavid Sterba struct list_head hash_list; 458a953348SDavid Sterba spinlock_t lock; 468a953348SDavid Sterba }; 478a953348SDavid Sterba 488a953348SDavid Sterba /* Used by the raid56 code to lock stripes for read/modify/write */ 498a953348SDavid Sterba struct btrfs_stripe_hash_table { 508a953348SDavid Sterba struct list_head stripe_cache; 518a953348SDavid Sterba spinlock_t cache_lock; 528a953348SDavid Sterba int cache_size; 538a953348SDavid Sterba struct btrfs_stripe_hash table[]; 548a953348SDavid Sterba }; 558a953348SDavid Sterba 56eb357060SQu Wenruo /* 57eb357060SQu Wenruo * A bvec like structure to present a sector inside a page. 58eb357060SQu Wenruo * 59eb357060SQu Wenruo * Unlike bvec we don't need bvlen, as it's fixed to sectorsize. 
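 * With a 4K sectorsize and 4K pages, for example, each sector occupies a whole
 * page at pgoff 0; with 64K pages a single page holds 16 such sectors and
 * pgoff picks one of them.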
60eb357060SQu Wenruo */ 61eb357060SQu Wenruo struct sector_ptr { 62eb357060SQu Wenruo struct page *page; 6300425dd9SQu Wenruo unsigned int pgoff:24; 6400425dd9SQu Wenruo unsigned int uptodate:8; 65eb357060SQu Wenruo }; 66eb357060SQu Wenruo 6753b381b3SDavid Woodhouse static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); 6853b381b3SDavid Woodhouse static noinline void finish_rmw(struct btrfs_raid_bio *rbio); 69385de0efSChristoph Hellwig static void rmw_work(struct work_struct *work); 7053b381b3SDavid Woodhouse static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); 7153b381b3SDavid Woodhouse static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); 7253b381b3SDavid Woodhouse static void index_rbio_pages(struct btrfs_raid_bio *rbio); 7353b381b3SDavid Woodhouse static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); 7453b381b3SDavid Woodhouse 755a6ac9eaSMiao Xie static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, 765a6ac9eaSMiao Xie int need_check); 77385de0efSChristoph Hellwig static void scrub_parity_work(struct work_struct *work); 785a6ac9eaSMiao Xie 79797d74b7SQu Wenruo static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio) 80797d74b7SQu Wenruo { 81797d74b7SQu Wenruo kfree(rbio->stripe_pages); 82797d74b7SQu Wenruo kfree(rbio->bio_sectors); 83797d74b7SQu Wenruo kfree(rbio->stripe_sectors); 84797d74b7SQu Wenruo kfree(rbio->finish_pointers); 85797d74b7SQu Wenruo } 86797d74b7SQu Wenruo 87ff2b64a2SQu Wenruo static void free_raid_bio(struct btrfs_raid_bio *rbio) 88ff2b64a2SQu Wenruo { 89ff2b64a2SQu Wenruo int i; 90ff2b64a2SQu Wenruo 91ff2b64a2SQu Wenruo if (!refcount_dec_and_test(&rbio->refs)) 92ff2b64a2SQu Wenruo return; 93ff2b64a2SQu Wenruo 94ff2b64a2SQu Wenruo WARN_ON(!list_empty(&rbio->stripe_cache)); 95ff2b64a2SQu Wenruo WARN_ON(!list_empty(&rbio->hash_list)); 96ff2b64a2SQu Wenruo WARN_ON(!bio_list_empty(&rbio->bio_list)); 97ff2b64a2SQu Wenruo 98ff2b64a2SQu Wenruo for (i = 0; i < rbio->nr_pages; i++) { 99ff2b64a2SQu Wenruo if (rbio->stripe_pages[i]) { 100ff2b64a2SQu Wenruo __free_page(rbio->stripe_pages[i]); 101ff2b64a2SQu Wenruo rbio->stripe_pages[i] = NULL; 102ff2b64a2SQu Wenruo } 103ff2b64a2SQu Wenruo } 104ff2b64a2SQu Wenruo 105ff2b64a2SQu Wenruo btrfs_put_bioc(rbio->bioc); 106797d74b7SQu Wenruo free_raid_bio_pointers(rbio); 107ff2b64a2SQu Wenruo kfree(rbio); 108ff2b64a2SQu Wenruo } 109ff2b64a2SQu Wenruo 110385de0efSChristoph Hellwig static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func) 111ac638859SDavid Sterba { 112385de0efSChristoph Hellwig INIT_WORK(&rbio->work, work_func); 113385de0efSChristoph Hellwig queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work); 114ac638859SDavid Sterba } 115ac638859SDavid Sterba 11653b381b3SDavid Woodhouse /* 11753b381b3SDavid Woodhouse * the stripe hash table is used for locking, and to collect 11853b381b3SDavid Woodhouse * bios in hopes of making a full stripe 11953b381b3SDavid Woodhouse */ 12053b381b3SDavid Woodhouse int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) 12153b381b3SDavid Woodhouse { 12253b381b3SDavid Woodhouse struct btrfs_stripe_hash_table *table; 12353b381b3SDavid Woodhouse struct btrfs_stripe_hash_table *x; 12453b381b3SDavid Woodhouse struct btrfs_stripe_hash *cur; 12553b381b3SDavid Woodhouse struct btrfs_stripe_hash *h; 12653b381b3SDavid Woodhouse int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; 12753b381b3SDavid Woodhouse int i; 12853b381b3SDavid Woodhouse 12953b381b3SDavid Woodhouse if (info->stripe_hash_table) 
13053b381b3SDavid Woodhouse return 0; 13153b381b3SDavid Woodhouse 13283c8266aSDavid Sterba /* 13383c8266aSDavid Sterba * The table is large, starting with order 4 and can go as high as 13483c8266aSDavid Sterba * order 7 in case lock debugging is turned on. 13583c8266aSDavid Sterba * 13683c8266aSDavid Sterba * Try harder to allocate and fallback to vmalloc to lower the chance 13783c8266aSDavid Sterba * of a failing mount. 13883c8266aSDavid Sterba */ 139ee787f95SDavid Sterba table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL); 14053b381b3SDavid Woodhouse if (!table) 14153b381b3SDavid Woodhouse return -ENOMEM; 14253b381b3SDavid Woodhouse 1434ae10b3aSChris Mason spin_lock_init(&table->cache_lock); 1444ae10b3aSChris Mason INIT_LIST_HEAD(&table->stripe_cache); 1454ae10b3aSChris Mason 14653b381b3SDavid Woodhouse h = table->table; 14753b381b3SDavid Woodhouse 14853b381b3SDavid Woodhouse for (i = 0; i < num_entries; i++) { 14953b381b3SDavid Woodhouse cur = h + i; 15053b381b3SDavid Woodhouse INIT_LIST_HEAD(&cur->hash_list); 15153b381b3SDavid Woodhouse spin_lock_init(&cur->lock); 15253b381b3SDavid Woodhouse } 15353b381b3SDavid Woodhouse 15453b381b3SDavid Woodhouse x = cmpxchg(&info->stripe_hash_table, NULL, table); 155f749303bSWang Shilong kvfree(x); 15653b381b3SDavid Woodhouse return 0; 15753b381b3SDavid Woodhouse } 15853b381b3SDavid Woodhouse 15953b381b3SDavid Woodhouse /* 1604ae10b3aSChris Mason * caching an rbio means to copy anything from the 161ac26df8bSQu Wenruo * bio_sectors array into the stripe_pages array. We 1624ae10b3aSChris Mason * use the page uptodate bit in the stripe cache array 1634ae10b3aSChris Mason * to indicate if it has valid data 1644ae10b3aSChris Mason * 1654ae10b3aSChris Mason * once the caching is done, we set the cache ready 1664ae10b3aSChris Mason * bit. 1674ae10b3aSChris Mason */ 1684ae10b3aSChris Mason static void cache_rbio_pages(struct btrfs_raid_bio *rbio) 1694ae10b3aSChris Mason { 1704ae10b3aSChris Mason int i; 1714ae10b3aSChris Mason int ret; 1724ae10b3aSChris Mason 1734ae10b3aSChris Mason ret = alloc_rbio_pages(rbio); 1744ae10b3aSChris Mason if (ret) 1754ae10b3aSChris Mason return; 1764ae10b3aSChris Mason 17700425dd9SQu Wenruo for (i = 0; i < rbio->nr_sectors; i++) { 17800425dd9SQu Wenruo /* Some range not covered by bio (partial write), skip it */ 17988074c8bSQu Wenruo if (!rbio->bio_sectors[i].page) { 18088074c8bSQu Wenruo /* 18188074c8bSQu Wenruo * Even if the sector is not covered by bio, if it is 18288074c8bSQu Wenruo * a data sector it should still be uptodate as it is 18388074c8bSQu Wenruo * read from disk. 
18488074c8bSQu Wenruo */ 18588074c8bSQu Wenruo if (i < rbio->nr_data * rbio->stripe_nsectors) 18688074c8bSQu Wenruo ASSERT(rbio->stripe_sectors[i].uptodate); 18700425dd9SQu Wenruo continue; 18888074c8bSQu Wenruo } 18900425dd9SQu Wenruo 19000425dd9SQu Wenruo ASSERT(rbio->stripe_sectors[i].page); 19100425dd9SQu Wenruo memcpy_page(rbio->stripe_sectors[i].page, 19200425dd9SQu Wenruo rbio->stripe_sectors[i].pgoff, 19300425dd9SQu Wenruo rbio->bio_sectors[i].page, 19400425dd9SQu Wenruo rbio->bio_sectors[i].pgoff, 19500425dd9SQu Wenruo rbio->bioc->fs_info->sectorsize); 19600425dd9SQu Wenruo rbio->stripe_sectors[i].uptodate = 1; 19700425dd9SQu Wenruo } 1984ae10b3aSChris Mason set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 1994ae10b3aSChris Mason } 2004ae10b3aSChris Mason 2014ae10b3aSChris Mason /* 20253b381b3SDavid Woodhouse * we hash on the first logical address of the stripe 20353b381b3SDavid Woodhouse */ 20453b381b3SDavid Woodhouse static int rbio_bucket(struct btrfs_raid_bio *rbio) 20553b381b3SDavid Woodhouse { 2064c664611SQu Wenruo u64 num = rbio->bioc->raid_map[0]; 20753b381b3SDavid Woodhouse 20853b381b3SDavid Woodhouse /* 20953b381b3SDavid Woodhouse * we shift down quite a bit. We're using byte 21053b381b3SDavid Woodhouse * addressing, and most of the lower bits are zeros. 21153b381b3SDavid Woodhouse * This tends to upset hash_64, and it consistently 21253b381b3SDavid Woodhouse * returns just one or two different values. 21353b381b3SDavid Woodhouse * 21453b381b3SDavid Woodhouse * shifting off the lower bits fixes things. 21553b381b3SDavid Woodhouse */ 21653b381b3SDavid Woodhouse return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); 21753b381b3SDavid Woodhouse } 21853b381b3SDavid Woodhouse 219d4e28d9bSQu Wenruo static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio, 220d4e28d9bSQu Wenruo unsigned int page_nr) 221d4e28d9bSQu Wenruo { 222d4e28d9bSQu Wenruo const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 223d4e28d9bSQu Wenruo const u32 sectors_per_page = PAGE_SIZE / sectorsize; 224d4e28d9bSQu Wenruo int i; 225d4e28d9bSQu Wenruo 226d4e28d9bSQu Wenruo ASSERT(page_nr < rbio->nr_pages); 227d4e28d9bSQu Wenruo 228d4e28d9bSQu Wenruo for (i = sectors_per_page * page_nr; 229d4e28d9bSQu Wenruo i < sectors_per_page * page_nr + sectors_per_page; 230d4e28d9bSQu Wenruo i++) { 231d4e28d9bSQu Wenruo if (!rbio->stripe_sectors[i].uptodate) 232d4e28d9bSQu Wenruo return false; 233d4e28d9bSQu Wenruo } 234d4e28d9bSQu Wenruo return true; 235d4e28d9bSQu Wenruo } 236d4e28d9bSQu Wenruo 23753b381b3SDavid Woodhouse /* 238eb357060SQu Wenruo * Update the stripe_sectors[] array to use correct page and pgoff 239eb357060SQu Wenruo * 240eb357060SQu Wenruo * Should be called every time any page pointer in stripes_pages[] got modified. 
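 * (For example after alloc_rbio_pages(), alloc_rbio_parity_pages() or
 * steal_rbio(), all of which call this helper.)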
241eb357060SQu Wenruo */ 242eb357060SQu Wenruo static void index_stripe_sectors(struct btrfs_raid_bio *rbio) 243eb357060SQu Wenruo { 244eb357060SQu Wenruo const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 245eb357060SQu Wenruo u32 offset; 246eb357060SQu Wenruo int i; 247eb357060SQu Wenruo 248eb357060SQu Wenruo for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) { 249eb357060SQu Wenruo int page_index = offset >> PAGE_SHIFT; 250eb357060SQu Wenruo 251eb357060SQu Wenruo ASSERT(page_index < rbio->nr_pages); 252eb357060SQu Wenruo rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index]; 253eb357060SQu Wenruo rbio->stripe_sectors[i].pgoff = offset_in_page(offset); 254eb357060SQu Wenruo } 255eb357060SQu Wenruo } 256eb357060SQu Wenruo 2574d100466SQu Wenruo static void steal_rbio_page(struct btrfs_raid_bio *src, 2584d100466SQu Wenruo struct btrfs_raid_bio *dest, int page_nr) 2594d100466SQu Wenruo { 2604d100466SQu Wenruo const u32 sectorsize = src->bioc->fs_info->sectorsize; 2614d100466SQu Wenruo const u32 sectors_per_page = PAGE_SIZE / sectorsize; 2624d100466SQu Wenruo int i; 2634d100466SQu Wenruo 2644d100466SQu Wenruo if (dest->stripe_pages[page_nr]) 2654d100466SQu Wenruo __free_page(dest->stripe_pages[page_nr]); 2664d100466SQu Wenruo dest->stripe_pages[page_nr] = src->stripe_pages[page_nr]; 2674d100466SQu Wenruo src->stripe_pages[page_nr] = NULL; 2684d100466SQu Wenruo 2694d100466SQu Wenruo /* Also update the sector->uptodate bits. */ 2704d100466SQu Wenruo for (i = sectors_per_page * page_nr; 2714d100466SQu Wenruo i < sectors_per_page * page_nr + sectors_per_page; i++) 2724d100466SQu Wenruo dest->stripe_sectors[i].uptodate = true; 2734d100466SQu Wenruo } 2744d100466SQu Wenruo 27588074c8bSQu Wenruo static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr) 27688074c8bSQu Wenruo { 27788074c8bSQu Wenruo const int sector_nr = (page_nr << PAGE_SHIFT) >> 27888074c8bSQu Wenruo rbio->bioc->fs_info->sectorsize_bits; 27988074c8bSQu Wenruo 28088074c8bSQu Wenruo /* 28188074c8bSQu Wenruo * We have ensured PAGE_SIZE is aligned with sectorsize, thus 28288074c8bSQu Wenruo * we won't have a page which is half data half parity. 28388074c8bSQu Wenruo * 28488074c8bSQu Wenruo * Thus if the first sector of the page belongs to data stripes, then 28588074c8bSQu Wenruo * the full page belongs to data stripes. 28688074c8bSQu Wenruo */ 28788074c8bSQu Wenruo return (sector_nr < rbio->nr_data * rbio->stripe_nsectors); 28888074c8bSQu Wenruo } 28988074c8bSQu Wenruo 290eb357060SQu Wenruo /* 291d4e28d9bSQu Wenruo * Stealing an rbio means taking all the uptodate pages from the stripe array 292d4e28d9bSQu Wenruo * in the source rbio and putting them into the destination rbio. 293d4e28d9bSQu Wenruo * 294d4e28d9bSQu Wenruo * This will also update the involved stripe_sectors[] which are referring to 295d4e28d9bSQu Wenruo * the old pages. 2964ae10b3aSChris Mason */ 2974ae10b3aSChris Mason static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) 2984ae10b3aSChris Mason { 2994ae10b3aSChris Mason int i; 3004ae10b3aSChris Mason 3014ae10b3aSChris Mason if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) 3024ae10b3aSChris Mason return; 3034ae10b3aSChris Mason 3044ae10b3aSChris Mason for (i = 0; i < dest->nr_pages; i++) { 30588074c8bSQu Wenruo struct page *p = src->stripe_pages[i]; 30688074c8bSQu Wenruo 30788074c8bSQu Wenruo /* 30888074c8bSQu Wenruo * We don't need to steal P/Q pages as they will always be 30988074c8bSQu Wenruo * regenerated for RMW or full write anyway. 
31088074c8bSQu Wenruo */ 31188074c8bSQu Wenruo if (!is_data_stripe_page(src, i)) 3124ae10b3aSChris Mason continue; 3134ae10b3aSChris Mason 31488074c8bSQu Wenruo /* 31588074c8bSQu Wenruo * If @src already has RBIO_CACHE_READY_BIT, it should have 31688074c8bSQu Wenruo * all data stripe pages present and uptodate. 31788074c8bSQu Wenruo */ 31888074c8bSQu Wenruo ASSERT(p); 31988074c8bSQu Wenruo ASSERT(full_page_sectors_uptodate(src, i)); 3204d100466SQu Wenruo steal_rbio_page(src, dest, i); 3214ae10b3aSChris Mason } 322eb357060SQu Wenruo index_stripe_sectors(dest); 323eb357060SQu Wenruo index_stripe_sectors(src); 3244ae10b3aSChris Mason } 3254ae10b3aSChris Mason 3264ae10b3aSChris Mason /* 32753b381b3SDavid Woodhouse * merging means we take the bio_list from the victim and 32853b381b3SDavid Woodhouse * splice it into the destination. The victim should 32953b381b3SDavid Woodhouse * be discarded afterwards. 33053b381b3SDavid Woodhouse * 33153b381b3SDavid Woodhouse * must be called with dest->rbio_list_lock held 33253b381b3SDavid Woodhouse */ 33353b381b3SDavid Woodhouse static void merge_rbio(struct btrfs_raid_bio *dest, 33453b381b3SDavid Woodhouse struct btrfs_raid_bio *victim) 33553b381b3SDavid Woodhouse { 33653b381b3SDavid Woodhouse bio_list_merge(&dest->bio_list, &victim->bio_list); 33753b381b3SDavid Woodhouse dest->bio_list_bytes += victim->bio_list_bytes; 338bd8f7e62SQu Wenruo /* Also inherit the bitmaps from @victim. */ 339bd8f7e62SQu Wenruo bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap, 340bd8f7e62SQu Wenruo dest->stripe_nsectors); 34153b381b3SDavid Woodhouse bio_list_init(&victim->bio_list); 34253b381b3SDavid Woodhouse } 34353b381b3SDavid Woodhouse 34453b381b3SDavid Woodhouse /* 3454ae10b3aSChris Mason * used to prune items that are in the cache. The caller 3464ae10b3aSChris Mason * must hold the hash table lock. 3474ae10b3aSChris Mason */ 3484ae10b3aSChris Mason static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) 3494ae10b3aSChris Mason { 3504ae10b3aSChris Mason int bucket = rbio_bucket(rbio); 3514ae10b3aSChris Mason struct btrfs_stripe_hash_table *table; 3524ae10b3aSChris Mason struct btrfs_stripe_hash *h; 3534ae10b3aSChris Mason int freeit = 0; 3544ae10b3aSChris Mason 3554ae10b3aSChris Mason /* 3564ae10b3aSChris Mason * check the bit again under the hash table lock. 3574ae10b3aSChris Mason */ 3584ae10b3aSChris Mason if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) 3594ae10b3aSChris Mason return; 3604ae10b3aSChris Mason 3616a258d72SQu Wenruo table = rbio->bioc->fs_info->stripe_hash_table; 3624ae10b3aSChris Mason h = table->table + bucket; 3634ae10b3aSChris Mason 3644ae10b3aSChris Mason /* hold the lock for the bucket because we may be 3654ae10b3aSChris Mason * removing it from the hash table 3664ae10b3aSChris Mason */ 3674ae10b3aSChris Mason spin_lock(&h->lock); 3684ae10b3aSChris Mason 3694ae10b3aSChris Mason /* 3704ae10b3aSChris Mason * hold the lock for the bio list because we need 3714ae10b3aSChris Mason * to make sure the bio list is empty 3724ae10b3aSChris Mason */ 3734ae10b3aSChris Mason spin_lock(&rbio->bio_list_lock); 3744ae10b3aSChris Mason 3754ae10b3aSChris Mason if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) { 3764ae10b3aSChris Mason list_del_init(&rbio->stripe_cache); 3774ae10b3aSChris Mason table->cache_size -= 1; 3784ae10b3aSChris Mason freeit = 1; 3794ae10b3aSChris Mason 3804ae10b3aSChris Mason /* if the bio list isn't empty, this rbio is 3814ae10b3aSChris Mason * still involved in an IO. 
We take it out 3824ae10b3aSChris Mason * of the cache list, and drop the ref that 3834ae10b3aSChris Mason * was held for the list. 3844ae10b3aSChris Mason * 3854ae10b3aSChris Mason * If the bio_list was empty, we also remove 3864ae10b3aSChris Mason * the rbio from the hash_table, and drop 3874ae10b3aSChris Mason * the corresponding ref 3884ae10b3aSChris Mason */ 3894ae10b3aSChris Mason if (bio_list_empty(&rbio->bio_list)) { 3904ae10b3aSChris Mason if (!list_empty(&rbio->hash_list)) { 3914ae10b3aSChris Mason list_del_init(&rbio->hash_list); 392dec95574SElena Reshetova refcount_dec(&rbio->refs); 3934ae10b3aSChris Mason BUG_ON(!list_empty(&rbio->plug_list)); 3944ae10b3aSChris Mason } 3954ae10b3aSChris Mason } 3964ae10b3aSChris Mason } 3974ae10b3aSChris Mason 3984ae10b3aSChris Mason spin_unlock(&rbio->bio_list_lock); 3994ae10b3aSChris Mason spin_unlock(&h->lock); 4004ae10b3aSChris Mason 4014ae10b3aSChris Mason if (freeit) 402ff2b64a2SQu Wenruo free_raid_bio(rbio); 4034ae10b3aSChris Mason } 4044ae10b3aSChris Mason 4054ae10b3aSChris Mason /* 4064ae10b3aSChris Mason * prune a given rbio from the cache 4074ae10b3aSChris Mason */ 4084ae10b3aSChris Mason static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) 4094ae10b3aSChris Mason { 4104ae10b3aSChris Mason struct btrfs_stripe_hash_table *table; 4114ae10b3aSChris Mason unsigned long flags; 4124ae10b3aSChris Mason 4134ae10b3aSChris Mason if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) 4144ae10b3aSChris Mason return; 4154ae10b3aSChris Mason 4166a258d72SQu Wenruo table = rbio->bioc->fs_info->stripe_hash_table; 4174ae10b3aSChris Mason 4184ae10b3aSChris Mason spin_lock_irqsave(&table->cache_lock, flags); 4194ae10b3aSChris Mason __remove_rbio_from_cache(rbio); 4204ae10b3aSChris Mason spin_unlock_irqrestore(&table->cache_lock, flags); 4214ae10b3aSChris Mason } 4224ae10b3aSChris Mason 4234ae10b3aSChris Mason /* 4244ae10b3aSChris Mason * remove everything in the cache 4254ae10b3aSChris Mason */ 42648a3b636SEric Sandeen static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) 4274ae10b3aSChris Mason { 4284ae10b3aSChris Mason struct btrfs_stripe_hash_table *table; 4294ae10b3aSChris Mason unsigned long flags; 4304ae10b3aSChris Mason struct btrfs_raid_bio *rbio; 4314ae10b3aSChris Mason 4324ae10b3aSChris Mason table = info->stripe_hash_table; 4334ae10b3aSChris Mason 4344ae10b3aSChris Mason spin_lock_irqsave(&table->cache_lock, flags); 4354ae10b3aSChris Mason while (!list_empty(&table->stripe_cache)) { 4364ae10b3aSChris Mason rbio = list_entry(table->stripe_cache.next, 4374ae10b3aSChris Mason struct btrfs_raid_bio, 4384ae10b3aSChris Mason stripe_cache); 4394ae10b3aSChris Mason __remove_rbio_from_cache(rbio); 4404ae10b3aSChris Mason } 4414ae10b3aSChris Mason spin_unlock_irqrestore(&table->cache_lock, flags); 4424ae10b3aSChris Mason } 4434ae10b3aSChris Mason 4444ae10b3aSChris Mason /* 4454ae10b3aSChris Mason * remove all cached entries and free the hash table 4464ae10b3aSChris Mason * used by unmount 44753b381b3SDavid Woodhouse */ 44853b381b3SDavid Woodhouse void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) 44953b381b3SDavid Woodhouse { 45053b381b3SDavid Woodhouse if (!info->stripe_hash_table) 45153b381b3SDavid Woodhouse return; 4524ae10b3aSChris Mason btrfs_clear_rbio_cache(info); 453f749303bSWang Shilong kvfree(info->stripe_hash_table); 45453b381b3SDavid Woodhouse info->stripe_hash_table = NULL; 45553b381b3SDavid Woodhouse } 45653b381b3SDavid Woodhouse 45753b381b3SDavid Woodhouse /* 4584ae10b3aSChris Mason * insert an rbio into the stripe 
cache. It 4594ae10b3aSChris Mason * must have already been prepared by calling 4604ae10b3aSChris Mason * cache_rbio_pages 4614ae10b3aSChris Mason * 4624ae10b3aSChris Mason * If this rbio was already cached, it gets 4634ae10b3aSChris Mason * moved to the front of the lru. 4644ae10b3aSChris Mason * 4654ae10b3aSChris Mason * If the size of the rbio cache is too big, we 4664ae10b3aSChris Mason * prune an item. 4674ae10b3aSChris Mason */ 4684ae10b3aSChris Mason static void cache_rbio(struct btrfs_raid_bio *rbio) 4694ae10b3aSChris Mason { 4704ae10b3aSChris Mason struct btrfs_stripe_hash_table *table; 4714ae10b3aSChris Mason unsigned long flags; 4724ae10b3aSChris Mason 4734ae10b3aSChris Mason if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) 4744ae10b3aSChris Mason return; 4754ae10b3aSChris Mason 4766a258d72SQu Wenruo table = rbio->bioc->fs_info->stripe_hash_table; 4774ae10b3aSChris Mason 4784ae10b3aSChris Mason spin_lock_irqsave(&table->cache_lock, flags); 4794ae10b3aSChris Mason spin_lock(&rbio->bio_list_lock); 4804ae10b3aSChris Mason 4814ae10b3aSChris Mason /* bump our ref if we were not in the list before */ 4824ae10b3aSChris Mason if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags)) 483dec95574SElena Reshetova refcount_inc(&rbio->refs); 4844ae10b3aSChris Mason 4854ae10b3aSChris Mason if (!list_empty(&rbio->stripe_cache)){ 4864ae10b3aSChris Mason list_move(&rbio->stripe_cache, &table->stripe_cache); 4874ae10b3aSChris Mason } else { 4884ae10b3aSChris Mason list_add(&rbio->stripe_cache, &table->stripe_cache); 4894ae10b3aSChris Mason table->cache_size += 1; 4904ae10b3aSChris Mason } 4914ae10b3aSChris Mason 4924ae10b3aSChris Mason spin_unlock(&rbio->bio_list_lock); 4934ae10b3aSChris Mason 4944ae10b3aSChris Mason if (table->cache_size > RBIO_CACHE_SIZE) { 4954ae10b3aSChris Mason struct btrfs_raid_bio *found; 4964ae10b3aSChris Mason 4974ae10b3aSChris Mason found = list_entry(table->stripe_cache.prev, 4984ae10b3aSChris Mason struct btrfs_raid_bio, 4994ae10b3aSChris Mason stripe_cache); 5004ae10b3aSChris Mason 5014ae10b3aSChris Mason if (found != rbio) 5024ae10b3aSChris Mason __remove_rbio_from_cache(found); 5034ae10b3aSChris Mason } 5044ae10b3aSChris Mason 5054ae10b3aSChris Mason spin_unlock_irqrestore(&table->cache_lock, flags); 5064ae10b3aSChris Mason } 5074ae10b3aSChris Mason 5084ae10b3aSChris Mason /* 50953b381b3SDavid Woodhouse * helper function to run the xor_blocks api. It is only 51053b381b3SDavid Woodhouse * able to do MAX_XOR_BLOCKS at a time, so we need to 51153b381b3SDavid Woodhouse * loop through. 51253b381b3SDavid Woodhouse */ 51353b381b3SDavid Woodhouse static void run_xor(void **pages, int src_cnt, ssize_t len) 51453b381b3SDavid Woodhouse { 51553b381b3SDavid Woodhouse int src_off = 0; 51653b381b3SDavid Woodhouse int xor_src_cnt = 0; 51753b381b3SDavid Woodhouse void *dest = pages[src_cnt]; 51853b381b3SDavid Woodhouse 51953b381b3SDavid Woodhouse while(src_cnt > 0) { 52053b381b3SDavid Woodhouse xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS); 52153b381b3SDavid Woodhouse xor_blocks(xor_src_cnt, len, dest, pages + src_off); 52253b381b3SDavid Woodhouse 52353b381b3SDavid Woodhouse src_cnt -= xor_src_cnt; 52453b381b3SDavid Woodhouse src_off += xor_src_cnt; 52553b381b3SDavid Woodhouse } 52653b381b3SDavid Woodhouse } 52753b381b3SDavid Woodhouse 52853b381b3SDavid Woodhouse /* 529176571a1SDavid Sterba * Returns true if the bio list inside this rbio covers an entire stripe (no 530176571a1SDavid Sterba * rmw required). 
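 * In other words, the bio list carries exactly nr_data * BTRFS_STRIPE_LEN
 * bytes of new data, so the parity can be computed without reading anything
 * back from disk.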
53153b381b3SDavid Woodhouse */
53253b381b3SDavid Woodhouse static int rbio_is_full(struct btrfs_raid_bio *rbio)
53353b381b3SDavid Woodhouse {
53453b381b3SDavid Woodhouse unsigned long flags;
535176571a1SDavid Sterba unsigned long size = rbio->bio_list_bytes;
536176571a1SDavid Sterba int ret = 1;
53753b381b3SDavid Woodhouse
53853b381b3SDavid Woodhouse spin_lock_irqsave(&rbio->bio_list_lock, flags);
539ff18a4afSChristoph Hellwig if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
540176571a1SDavid Sterba ret = 0;
541ff18a4afSChristoph Hellwig BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
54253b381b3SDavid Woodhouse spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
543176571a1SDavid Sterba
54453b381b3SDavid Woodhouse return ret;
54553b381b3SDavid Woodhouse }
54653b381b3SDavid Woodhouse
54753b381b3SDavid Woodhouse /*
54853b381b3SDavid Woodhouse * returns 1 if it is safe to merge two rbios together.
54953b381b3SDavid Woodhouse * The merging is safe if the two rbios correspond to
55053b381b3SDavid Woodhouse * the same stripe and if they are both going in the same
55153b381b3SDavid Woodhouse * direction (read vs write), and if neither one is
55253b381b3SDavid Woodhouse * locked for final IO
55353b381b3SDavid Woodhouse *
55453b381b3SDavid Woodhouse * The caller is responsible for locking such that
55553b381b3SDavid Woodhouse * rmw_locked is safe to test
55653b381b3SDavid Woodhouse */
55753b381b3SDavid Woodhouse static int rbio_can_merge(struct btrfs_raid_bio *last,
55853b381b3SDavid Woodhouse struct btrfs_raid_bio *cur)
55953b381b3SDavid Woodhouse {
56053b381b3SDavid Woodhouse if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
56153b381b3SDavid Woodhouse test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
56253b381b3SDavid Woodhouse return 0;
56353b381b3SDavid Woodhouse
5644ae10b3aSChris Mason /*
5654ae10b3aSChris Mason * we can't merge with cached rbios, since the
5664ae10b3aSChris Mason * idea is that when we merge the destination
5674ae10b3aSChris Mason * rbio is going to run our IO for us. We can
56801327610SNicholas D Steeves * steal from cached rbios though, other functions
5694ae10b3aSChris Mason * handle that.
5704ae10b3aSChris Mason */
5714ae10b3aSChris Mason if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
5724ae10b3aSChris Mason test_bit(RBIO_CACHE_BIT, &cur->flags))
5734ae10b3aSChris Mason return 0;
5744ae10b3aSChris Mason
5754c664611SQu Wenruo if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
57653b381b3SDavid Woodhouse return 0;
57753b381b3SDavid Woodhouse
5785a6ac9eaSMiao Xie /* we can't merge with different operations */
5795a6ac9eaSMiao Xie if (last->operation != cur->operation)
58053b381b3SDavid Woodhouse return 0;
5815a6ac9eaSMiao Xie /*
5825a6ac9eaSMiao Xie * For a parity scrub we read the full stripe from the drive, then check
5835a6ac9eaSMiao Xie * and repair the parity and write out the new results.
5845a6ac9eaSMiao Xie *
5855a6ac9eaSMiao Xie * We're not allowed to add any new bios to the
5865a6ac9eaSMiao Xie * bio list here, anyone else that wants to
5875a6ac9eaSMiao Xie * change this stripe needs to do their own rmw.
5885a6ac9eaSMiao Xie */ 589db34be19SLiu Bo if (last->operation == BTRFS_RBIO_PARITY_SCRUB) 5905a6ac9eaSMiao Xie return 0; 59153b381b3SDavid Woodhouse 592db34be19SLiu Bo if (last->operation == BTRFS_RBIO_REBUILD_MISSING) 593b4ee1782SOmar Sandoval return 0; 594b4ee1782SOmar Sandoval 595cc54ff62SLiu Bo if (last->operation == BTRFS_RBIO_READ_REBUILD) { 596cc54ff62SLiu Bo int fa = last->faila; 597cc54ff62SLiu Bo int fb = last->failb; 598cc54ff62SLiu Bo int cur_fa = cur->faila; 599cc54ff62SLiu Bo int cur_fb = cur->failb; 600cc54ff62SLiu Bo 601cc54ff62SLiu Bo if (last->faila >= last->failb) { 602cc54ff62SLiu Bo fa = last->failb; 603cc54ff62SLiu Bo fb = last->faila; 604cc54ff62SLiu Bo } 605cc54ff62SLiu Bo 606cc54ff62SLiu Bo if (cur->faila >= cur->failb) { 607cc54ff62SLiu Bo cur_fa = cur->failb; 608cc54ff62SLiu Bo cur_fb = cur->faila; 609cc54ff62SLiu Bo } 610cc54ff62SLiu Bo 611cc54ff62SLiu Bo if (fa != cur_fa || fb != cur_fb) 612cc54ff62SLiu Bo return 0; 613cc54ff62SLiu Bo } 61453b381b3SDavid Woodhouse return 1; 61553b381b3SDavid Woodhouse } 61653b381b3SDavid Woodhouse 6173e77605dSQu Wenruo static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio, 6183e77605dSQu Wenruo unsigned int stripe_nr, 6193e77605dSQu Wenruo unsigned int sector_nr) 6203e77605dSQu Wenruo { 6213e77605dSQu Wenruo ASSERT(stripe_nr < rbio->real_stripes); 6223e77605dSQu Wenruo ASSERT(sector_nr < rbio->stripe_nsectors); 6233e77605dSQu Wenruo 6243e77605dSQu Wenruo return stripe_nr * rbio->stripe_nsectors + sector_nr; 6253e77605dSQu Wenruo } 6263e77605dSQu Wenruo 6273e77605dSQu Wenruo /* Return a sector from rbio->stripe_sectors, not from the bio list */ 6283e77605dSQu Wenruo static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio, 6293e77605dSQu Wenruo unsigned int stripe_nr, 6303e77605dSQu Wenruo unsigned int sector_nr) 6313e77605dSQu Wenruo { 6323e77605dSQu Wenruo return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr, 6333e77605dSQu Wenruo sector_nr)]; 6343e77605dSQu Wenruo } 6353e77605dSQu Wenruo 6361145059aSQu Wenruo /* Grab a sector inside P stripe */ 6371145059aSQu Wenruo static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio, 6381145059aSQu Wenruo unsigned int sector_nr) 639b7178a5fSZhao Lei { 6401145059aSQu Wenruo return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr); 641b7178a5fSZhao Lei } 642b7178a5fSZhao Lei 6431145059aSQu Wenruo /* Grab a sector inside Q stripe, return NULL if not RAID6 */ 6441145059aSQu Wenruo static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio, 6451145059aSQu Wenruo unsigned int sector_nr) 64653b381b3SDavid Woodhouse { 6472c8cdd6eSMiao Xie if (rbio->nr_data + 1 == rbio->real_stripes) 64853b381b3SDavid Woodhouse return NULL; 6491145059aSQu Wenruo return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr); 6501145059aSQu Wenruo } 6511145059aSQu Wenruo 65253b381b3SDavid Woodhouse /* 65353b381b3SDavid Woodhouse * The first stripe in the table for a logical address 65453b381b3SDavid Woodhouse * has the lock. rbios are added in one of three ways: 65553b381b3SDavid Woodhouse * 65653b381b3SDavid Woodhouse * 1) Nobody has the stripe locked yet. The rbio is given 65753b381b3SDavid Woodhouse * the lock and 0 is returned. The caller must start the IO 65853b381b3SDavid Woodhouse * themselves. 65953b381b3SDavid Woodhouse * 66053b381b3SDavid Woodhouse * 2) Someone has the stripe locked, but we're able to merge 66153b381b3SDavid Woodhouse * with the lock owner. 
The rbio is freed and the IO will 66253b381b3SDavid Woodhouse * start automatically along with the existing rbio. 1 is returned. 66353b381b3SDavid Woodhouse * 66453b381b3SDavid Woodhouse * 3) Someone has the stripe locked, but we're not able to merge. 66553b381b3SDavid Woodhouse * The rbio is added to the lock owner's plug list, or merged into 66653b381b3SDavid Woodhouse * an rbio already on the plug list. When the lock owner unlocks, 66753b381b3SDavid Woodhouse * the next rbio on the list is run and the IO is started automatically. 66853b381b3SDavid Woodhouse * 1 is returned 66953b381b3SDavid Woodhouse * 67053b381b3SDavid Woodhouse * If we return 0, the caller still owns the rbio and must continue with 67153b381b3SDavid Woodhouse * IO submission. If we return 1, the caller must assume the rbio has 67253b381b3SDavid Woodhouse * already been freed. 67353b381b3SDavid Woodhouse */ 67453b381b3SDavid Woodhouse static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) 67553b381b3SDavid Woodhouse { 676721860d5SJohannes Thumshirn struct btrfs_stripe_hash *h; 67753b381b3SDavid Woodhouse struct btrfs_raid_bio *cur; 67853b381b3SDavid Woodhouse struct btrfs_raid_bio *pending; 67953b381b3SDavid Woodhouse unsigned long flags; 68053b381b3SDavid Woodhouse struct btrfs_raid_bio *freeit = NULL; 6814ae10b3aSChris Mason struct btrfs_raid_bio *cache_drop = NULL; 68253b381b3SDavid Woodhouse int ret = 0; 68353b381b3SDavid Woodhouse 6846a258d72SQu Wenruo h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio); 685721860d5SJohannes Thumshirn 68653b381b3SDavid Woodhouse spin_lock_irqsave(&h->lock, flags); 68753b381b3SDavid Woodhouse list_for_each_entry(cur, &h->hash_list, hash_list) { 6884c664611SQu Wenruo if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0]) 6899d6cb1b0SJohannes Thumshirn continue; 6909d6cb1b0SJohannes Thumshirn 69153b381b3SDavid Woodhouse spin_lock(&cur->bio_list_lock); 69253b381b3SDavid Woodhouse 6939d6cb1b0SJohannes Thumshirn /* Can we steal this cached rbio's pages? */ 6944ae10b3aSChris Mason if (bio_list_empty(&cur->bio_list) && 6954ae10b3aSChris Mason list_empty(&cur->plug_list) && 6964ae10b3aSChris Mason test_bit(RBIO_CACHE_BIT, &cur->flags) && 6974ae10b3aSChris Mason !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { 6984ae10b3aSChris Mason list_del_init(&cur->hash_list); 699dec95574SElena Reshetova refcount_dec(&cur->refs); 7004ae10b3aSChris Mason 7014ae10b3aSChris Mason steal_rbio(cur, rbio); 7024ae10b3aSChris Mason cache_drop = cur; 7034ae10b3aSChris Mason spin_unlock(&cur->bio_list_lock); 7044ae10b3aSChris Mason 7054ae10b3aSChris Mason goto lockit; 7064ae10b3aSChris Mason } 7074ae10b3aSChris Mason 7089d6cb1b0SJohannes Thumshirn /* Can we merge into the lock owner? */ 70953b381b3SDavid Woodhouse if (rbio_can_merge(cur, rbio)) { 71053b381b3SDavid Woodhouse merge_rbio(cur, rbio); 71153b381b3SDavid Woodhouse spin_unlock(&cur->bio_list_lock); 71253b381b3SDavid Woodhouse freeit = rbio; 71353b381b3SDavid Woodhouse ret = 1; 71453b381b3SDavid Woodhouse goto out; 71553b381b3SDavid Woodhouse } 71653b381b3SDavid Woodhouse 7174ae10b3aSChris Mason 71853b381b3SDavid Woodhouse /* 7199d6cb1b0SJohannes Thumshirn * We couldn't merge with the running rbio, see if we can merge 7209d6cb1b0SJohannes Thumshirn * with the pending ones. 
We don't have to check for rmw_locked
7219d6cb1b0SJohannes Thumshirn * because there is no way they are inside finish_rmw right now
72253b381b3SDavid Woodhouse */
7239d6cb1b0SJohannes Thumshirn list_for_each_entry(pending, &cur->plug_list, plug_list) {
72453b381b3SDavid Woodhouse if (rbio_can_merge(pending, rbio)) {
72553b381b3SDavid Woodhouse merge_rbio(pending, rbio);
72653b381b3SDavid Woodhouse spin_unlock(&cur->bio_list_lock);
72753b381b3SDavid Woodhouse freeit = rbio;
72853b381b3SDavid Woodhouse ret = 1;
72953b381b3SDavid Woodhouse goto out;
73053b381b3SDavid Woodhouse }
73153b381b3SDavid Woodhouse }
73253b381b3SDavid Woodhouse
7339d6cb1b0SJohannes Thumshirn /*
7349d6cb1b0SJohannes Thumshirn * No merging, put us on the tail of the plug list, our rbio
7359d6cb1b0SJohannes Thumshirn * will be started when the currently running rbio unlocks
73653b381b3SDavid Woodhouse */
73753b381b3SDavid Woodhouse list_add_tail(&rbio->plug_list, &cur->plug_list);
73853b381b3SDavid Woodhouse spin_unlock(&cur->bio_list_lock);
73953b381b3SDavid Woodhouse ret = 1;
74053b381b3SDavid Woodhouse goto out;
74153b381b3SDavid Woodhouse }
7424ae10b3aSChris Mason lockit:
743dec95574SElena Reshetova refcount_inc(&rbio->refs);
74453b381b3SDavid Woodhouse list_add(&rbio->hash_list, &h->hash_list);
74553b381b3SDavid Woodhouse out:
74653b381b3SDavid Woodhouse spin_unlock_irqrestore(&h->lock, flags);
7474ae10b3aSChris Mason if (cache_drop)
7484ae10b3aSChris Mason remove_rbio_from_cache(cache_drop);
74953b381b3SDavid Woodhouse if (freeit)
750ff2b64a2SQu Wenruo free_raid_bio(freeit);
75153b381b3SDavid Woodhouse return ret;
75253b381b3SDavid Woodhouse }
75353b381b3SDavid Woodhouse
754*d817ce35SQu Wenruo static void recover_rbio_work_locked(struct work_struct *work);
755*d817ce35SQu Wenruo
75653b381b3SDavid Woodhouse /*
75753b381b3SDavid Woodhouse * called as rmw or parity rebuild is completed.
If the plug list has more 75853b381b3SDavid Woodhouse * rbios waiting for this stripe, the next one on the list will be started 75953b381b3SDavid Woodhouse */ 76053b381b3SDavid Woodhouse static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) 76153b381b3SDavid Woodhouse { 76253b381b3SDavid Woodhouse int bucket; 76353b381b3SDavid Woodhouse struct btrfs_stripe_hash *h; 76453b381b3SDavid Woodhouse unsigned long flags; 7654ae10b3aSChris Mason int keep_cache = 0; 76653b381b3SDavid Woodhouse 76753b381b3SDavid Woodhouse bucket = rbio_bucket(rbio); 7686a258d72SQu Wenruo h = rbio->bioc->fs_info->stripe_hash_table->table + bucket; 76953b381b3SDavid Woodhouse 7704ae10b3aSChris Mason if (list_empty(&rbio->plug_list)) 7714ae10b3aSChris Mason cache_rbio(rbio); 7724ae10b3aSChris Mason 77353b381b3SDavid Woodhouse spin_lock_irqsave(&h->lock, flags); 77453b381b3SDavid Woodhouse spin_lock(&rbio->bio_list_lock); 77553b381b3SDavid Woodhouse 77653b381b3SDavid Woodhouse if (!list_empty(&rbio->hash_list)) { 7774ae10b3aSChris Mason /* 7784ae10b3aSChris Mason * if we're still cached and there is no other IO 7794ae10b3aSChris Mason * to perform, just leave this rbio here for others 7804ae10b3aSChris Mason * to steal from later 7814ae10b3aSChris Mason */ 7824ae10b3aSChris Mason if (list_empty(&rbio->plug_list) && 7834ae10b3aSChris Mason test_bit(RBIO_CACHE_BIT, &rbio->flags)) { 7844ae10b3aSChris Mason keep_cache = 1; 7854ae10b3aSChris Mason clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 7864ae10b3aSChris Mason BUG_ON(!bio_list_empty(&rbio->bio_list)); 7874ae10b3aSChris Mason goto done; 7884ae10b3aSChris Mason } 78953b381b3SDavid Woodhouse 79053b381b3SDavid Woodhouse list_del_init(&rbio->hash_list); 791dec95574SElena Reshetova refcount_dec(&rbio->refs); 79253b381b3SDavid Woodhouse 79353b381b3SDavid Woodhouse /* 79453b381b3SDavid Woodhouse * we use the plug list to hold all the rbios 79553b381b3SDavid Woodhouse * waiting for the chance to lock this stripe. 79653b381b3SDavid Woodhouse * hand the lock over to one of them. 
79753b381b3SDavid Woodhouse */
79853b381b3SDavid Woodhouse if (!list_empty(&rbio->plug_list)) {
79953b381b3SDavid Woodhouse struct btrfs_raid_bio *next;
80053b381b3SDavid Woodhouse struct list_head *head = rbio->plug_list.next;
80153b381b3SDavid Woodhouse
80253b381b3SDavid Woodhouse next = list_entry(head, struct btrfs_raid_bio,
80353b381b3SDavid Woodhouse plug_list);
80453b381b3SDavid Woodhouse
80553b381b3SDavid Woodhouse list_del_init(&rbio->plug_list);
80653b381b3SDavid Woodhouse
80753b381b3SDavid Woodhouse list_add(&next->hash_list, &h->hash_list);
808dec95574SElena Reshetova refcount_inc(&next->refs);
80953b381b3SDavid Woodhouse spin_unlock(&rbio->bio_list_lock);
81053b381b3SDavid Woodhouse spin_unlock_irqrestore(&h->lock, flags);
81153b381b3SDavid Woodhouse
8121b94b556SMiao Xie if (next->operation == BTRFS_RBIO_READ_REBUILD)
813*d817ce35SQu Wenruo start_async_work(next, recover_rbio_work_locked);
814b4ee1782SOmar Sandoval else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
815b4ee1782SOmar Sandoval steal_rbio(rbio, next);
816*d817ce35SQu Wenruo start_async_work(next, recover_rbio_work_locked);
817b4ee1782SOmar Sandoval } else if (next->operation == BTRFS_RBIO_WRITE) {
8184ae10b3aSChris Mason steal_rbio(rbio, next);
819cf6a4a75SDavid Sterba start_async_work(next, rmw_work);
8205a6ac9eaSMiao Xie } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
8215a6ac9eaSMiao Xie steal_rbio(rbio, next);
822a81b747dSDavid Sterba start_async_work(next, scrub_parity_work);
8234ae10b3aSChris Mason }
82453b381b3SDavid Woodhouse
82553b381b3SDavid Woodhouse goto done_nolock;
82653b381b3SDavid Woodhouse }
82753b381b3SDavid Woodhouse }
8284ae10b3aSChris Mason done:
82953b381b3SDavid Woodhouse spin_unlock(&rbio->bio_list_lock);
83053b381b3SDavid Woodhouse spin_unlock_irqrestore(&h->lock, flags);
83153b381b3SDavid Woodhouse
83253b381b3SDavid Woodhouse done_nolock:
8334ae10b3aSChris Mason if (!keep_cache)
8344ae10b3aSChris Mason remove_rbio_from_cache(rbio);
83553b381b3SDavid Woodhouse }
83653b381b3SDavid Woodhouse
8377583d8d0SLiu Bo static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
83853b381b3SDavid Woodhouse {
8397583d8d0SLiu Bo struct bio *next;
8407583d8d0SLiu Bo
8417583d8d0SLiu Bo while (cur) {
8427583d8d0SLiu Bo next = cur->bi_next;
8437583d8d0SLiu Bo cur->bi_next = NULL;
8447583d8d0SLiu Bo cur->bi_status = err;
8457583d8d0SLiu Bo bio_endio(cur);
8467583d8d0SLiu Bo cur = next;
8477583d8d0SLiu Bo }
84853b381b3SDavid Woodhouse }
84953b381b3SDavid Woodhouse
85053b381b3SDavid Woodhouse /*
85153b381b3SDavid Woodhouse * this frees the rbio and runs through all the bios in the
85253b381b3SDavid Woodhouse * bio_list and calls end_io on them
85353b381b3SDavid Woodhouse */
8544e4cbee9SChristoph Hellwig static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
85553b381b3SDavid Woodhouse {
85653b381b3SDavid Woodhouse struct bio *cur = bio_list_get(&rbio->bio_list);
8577583d8d0SLiu Bo struct bio *extra;
8584245215dSMiao Xie
859bd8f7e62SQu Wenruo /*
860bd8f7e62SQu Wenruo * Clear the data bitmap, as the rbio may be cached for later usage.
861bd8f7e62SQu Wenruo * Do this before unlock_stripe() so there will be no new bio
862bd8f7e62SQu Wenruo * for this rbio.
863bd8f7e62SQu Wenruo */ 864bd8f7e62SQu Wenruo bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors); 8654245215dSMiao Xie 8667583d8d0SLiu Bo /* 8677583d8d0SLiu Bo * At this moment, rbio->bio_list is empty, however since rbio does not 8687583d8d0SLiu Bo * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the 8697583d8d0SLiu Bo * hash list, rbio may be merged with others so that rbio->bio_list 8707583d8d0SLiu Bo * becomes non-empty. 8717583d8d0SLiu Bo * Once unlock_stripe() is done, rbio->bio_list will not be updated any 8727583d8d0SLiu Bo * more and we can call bio_endio() on all queued bios. 8737583d8d0SLiu Bo */ 8747583d8d0SLiu Bo unlock_stripe(rbio); 8757583d8d0SLiu Bo extra = bio_list_get(&rbio->bio_list); 876ff2b64a2SQu Wenruo free_raid_bio(rbio); 87753b381b3SDavid Woodhouse 8787583d8d0SLiu Bo rbio_endio_bio_list(cur, err); 8797583d8d0SLiu Bo if (extra) 8807583d8d0SLiu Bo rbio_endio_bio_list(extra, err); 88153b381b3SDavid Woodhouse } 88253b381b3SDavid Woodhouse 88353b381b3SDavid Woodhouse /* 88453b381b3SDavid Woodhouse * end io function used by finish_rmw. When we finally 88553b381b3SDavid Woodhouse * get here, we've written a full stripe 88653b381b3SDavid Woodhouse */ 8874246a0b6SChristoph Hellwig static void raid_write_end_io(struct bio *bio) 88853b381b3SDavid Woodhouse { 88953b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio = bio->bi_private; 8904e4cbee9SChristoph Hellwig blk_status_t err = bio->bi_status; 891a6111d11SZhao Lei int max_errors; 89253b381b3SDavid Woodhouse 89353b381b3SDavid Woodhouse if (err) 89453b381b3SDavid Woodhouse fail_bio_stripe(rbio, bio); 89553b381b3SDavid Woodhouse 89653b381b3SDavid Woodhouse bio_put(bio); 89753b381b3SDavid Woodhouse 898b89e1b01SMiao Xie if (!atomic_dec_and_test(&rbio->stripes_pending)) 89953b381b3SDavid Woodhouse return; 90053b381b3SDavid Woodhouse 90158efbc9fSOmar Sandoval err = BLK_STS_OK; 90253b381b3SDavid Woodhouse 90353b381b3SDavid Woodhouse /* OK, we have read all the stripes we need to. */ 904a6111d11SZhao Lei max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? 9054c664611SQu Wenruo 0 : rbio->bioc->max_errors; 906a6111d11SZhao Lei if (atomic_read(&rbio->error) > max_errors) 9074e4cbee9SChristoph Hellwig err = BLK_STS_IOERR; 90853b381b3SDavid Woodhouse 9094246a0b6SChristoph Hellwig rbio_orig_end_io(rbio, err); 91053b381b3SDavid Woodhouse } 91153b381b3SDavid Woodhouse 91243dd529aSDavid Sterba /* 91343dd529aSDavid Sterba * Get a sector pointer specified by its @stripe_nr and @sector_nr. 9143e77605dSQu Wenruo * 9153e77605dSQu Wenruo * @rbio: The raid bio 9163e77605dSQu Wenruo * @stripe_nr: Stripe number, valid range [0, real_stripe) 9173e77605dSQu Wenruo * @sector_nr: Sector number inside the stripe, 9183e77605dSQu Wenruo * valid range [0, stripe_nsectors) 9193e77605dSQu Wenruo * @bio_list_only: Whether to use sectors inside the bio list only. 9203e77605dSQu Wenruo * 9213e77605dSQu Wenruo * The read/modify/write code wants to reuse the original bio page as much 9223e77605dSQu Wenruo * as possible, and only use stripe_sectors as fallback. 
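 * A bio list sector is returned when it already has a page attached; with
 * @bio_list_only set a missing page yields NULL, otherwise we fall back to
 * the copy in stripe_sectors[].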
9233e77605dSQu Wenruo */
9243e77605dSQu Wenruo static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
9253e77605dSQu Wenruo int stripe_nr, int sector_nr,
9263e77605dSQu Wenruo bool bio_list_only)
9273e77605dSQu Wenruo {
9283e77605dSQu Wenruo struct sector_ptr *sector;
9293e77605dSQu Wenruo int index;
9303e77605dSQu Wenruo
9313e77605dSQu Wenruo ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes);
9323e77605dSQu Wenruo ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
9333e77605dSQu Wenruo
9343e77605dSQu Wenruo index = stripe_nr * rbio->stripe_nsectors + sector_nr;
9353e77605dSQu Wenruo ASSERT(index >= 0 && index < rbio->nr_sectors);
9363e77605dSQu Wenruo
9373e77605dSQu Wenruo spin_lock_irq(&rbio->bio_list_lock);
9383e77605dSQu Wenruo sector = &rbio->bio_sectors[index];
9393e77605dSQu Wenruo if (sector->page || bio_list_only) {
9403e77605dSQu Wenruo /* Don't return sector without a valid page pointer */
9413e77605dSQu Wenruo if (!sector->page)
9423e77605dSQu Wenruo sector = NULL;
9433e77605dSQu Wenruo spin_unlock_irq(&rbio->bio_list_lock);
9443e77605dSQu Wenruo return sector;
9453e77605dSQu Wenruo }
9463e77605dSQu Wenruo spin_unlock_irq(&rbio->bio_list_lock);
9473e77605dSQu Wenruo
9483e77605dSQu Wenruo return &rbio->stripe_sectors[index];
9493e77605dSQu Wenruo }
9503e77605dSQu Wenruo
95153b381b3SDavid Woodhouse /*
95253b381b3SDavid Woodhouse * allocation and initial setup for the btrfs_raid_bio. Note
95353b381b3SDavid Woodhouse * this does not allocate any pages for rbio->stripe_pages.
95453b381b3SDavid Woodhouse */
9552ff7e61eSJeff Mahoney static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
956ff18a4afSChristoph Hellwig struct btrfs_io_context *bioc)
95753b381b3SDavid Woodhouse {
958843de58bSQu Wenruo const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
959ff18a4afSChristoph Hellwig const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
960843de58bSQu Wenruo const unsigned int num_pages = stripe_npages * real_stripes;
961ff18a4afSChristoph Hellwig const unsigned int stripe_nsectors =
962ff18a4afSChristoph Hellwig BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
96394efbe19SQu Wenruo const unsigned int num_sectors = stripe_nsectors * real_stripes;
96453b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio;
96553b381b3SDavid Woodhouse
96694efbe19SQu Wenruo /* PAGE_SIZE must also be aligned to sectorsize for subpage support */
96794efbe19SQu Wenruo ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
968c67c68ebSQu Wenruo /*
969c67c68ebSQu Wenruo * Our current stripe len should be fixed to 64k thus stripe_nsectors
970c67c68ebSQu Wenruo * (at most 16) should be no larger than BITS_PER_LONG.
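 * (With the default 4K sectorsize that is 64K / 4K = 16 sectors per stripe.)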
971c67c68ebSQu Wenruo */ 972c67c68ebSQu Wenruo ASSERT(stripe_nsectors <= BITS_PER_LONG); 973843de58bSQu Wenruo 974797d74b7SQu Wenruo rbio = kzalloc(sizeof(*rbio), GFP_NOFS); 975af8e2d1dSMiao Xie if (!rbio) 97653b381b3SDavid Woodhouse return ERR_PTR(-ENOMEM); 977797d74b7SQu Wenruo rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *), 978797d74b7SQu Wenruo GFP_NOFS); 979797d74b7SQu Wenruo rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr), 980797d74b7SQu Wenruo GFP_NOFS); 981797d74b7SQu Wenruo rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr), 982797d74b7SQu Wenruo GFP_NOFS); 983797d74b7SQu Wenruo rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS); 984797d74b7SQu Wenruo 985797d74b7SQu Wenruo if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors || 986797d74b7SQu Wenruo !rbio->finish_pointers) { 987797d74b7SQu Wenruo free_raid_bio_pointers(rbio); 988797d74b7SQu Wenruo kfree(rbio); 989797d74b7SQu Wenruo return ERR_PTR(-ENOMEM); 990797d74b7SQu Wenruo } 99153b381b3SDavid Woodhouse 99253b381b3SDavid Woodhouse bio_list_init(&rbio->bio_list); 993*d817ce35SQu Wenruo init_waitqueue_head(&rbio->io_wait); 99453b381b3SDavid Woodhouse INIT_LIST_HEAD(&rbio->plug_list); 99553b381b3SDavid Woodhouse spin_lock_init(&rbio->bio_list_lock); 9964ae10b3aSChris Mason INIT_LIST_HEAD(&rbio->stripe_cache); 99753b381b3SDavid Woodhouse INIT_LIST_HEAD(&rbio->hash_list); 998f1c29379SChristoph Hellwig btrfs_get_bioc(bioc); 9994c664611SQu Wenruo rbio->bioc = bioc; 100053b381b3SDavid Woodhouse rbio->nr_pages = num_pages; 100194efbe19SQu Wenruo rbio->nr_sectors = num_sectors; 10022c8cdd6eSMiao Xie rbio->real_stripes = real_stripes; 10035a6ac9eaSMiao Xie rbio->stripe_npages = stripe_npages; 100494efbe19SQu Wenruo rbio->stripe_nsectors = stripe_nsectors; 100553b381b3SDavid Woodhouse rbio->faila = -1; 100653b381b3SDavid Woodhouse rbio->failb = -1; 1007dec95574SElena Reshetova refcount_set(&rbio->refs, 1); 1008b89e1b01SMiao Xie atomic_set(&rbio->error, 0); 1009b89e1b01SMiao Xie atomic_set(&rbio->stripes_pending, 0); 101053b381b3SDavid Woodhouse 10110b30f719SQu Wenruo ASSERT(btrfs_nr_parity_stripes(bioc->map_type)); 10120b30f719SQu Wenruo rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type); 101353b381b3SDavid Woodhouse 101453b381b3SDavid Woodhouse return rbio; 101553b381b3SDavid Woodhouse } 101653b381b3SDavid Woodhouse 101753b381b3SDavid Woodhouse /* allocate pages for all the stripes in the bio, including parity */ 101853b381b3SDavid Woodhouse static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) 101953b381b3SDavid Woodhouse { 1020eb357060SQu Wenruo int ret; 1021eb357060SQu Wenruo 1022eb357060SQu Wenruo ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages); 1023eb357060SQu Wenruo if (ret < 0) 1024eb357060SQu Wenruo return ret; 1025eb357060SQu Wenruo /* Mapping all sectors */ 1026eb357060SQu Wenruo index_stripe_sectors(rbio); 1027eb357060SQu Wenruo return 0; 102853b381b3SDavid Woodhouse } 102953b381b3SDavid Woodhouse 1030b7178a5fSZhao Lei /* only allocate pages for p/q stripes */ 103153b381b3SDavid Woodhouse static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) 103253b381b3SDavid Woodhouse { 1033f77183dcSQu Wenruo const int data_pages = rbio->nr_data * rbio->stripe_npages; 1034eb357060SQu Wenruo int ret; 103553b381b3SDavid Woodhouse 1036eb357060SQu Wenruo ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages, 1037dd137dd1SSweet Tea Dorminy rbio->stripe_pages + data_pages); 1038eb357060SQu Wenruo if (ret 
< 0) 1039eb357060SQu Wenruo return ret; 1040eb357060SQu Wenruo 1041eb357060SQu Wenruo index_stripe_sectors(rbio); 1042eb357060SQu Wenruo return 0; 104353b381b3SDavid Woodhouse } 104453b381b3SDavid Woodhouse 104553b381b3SDavid Woodhouse /* 10463e77605dSQu Wenruo * Add a single sector @sector into our list of bios for IO. 10473e77605dSQu Wenruo * 10483e77605dSQu Wenruo * Return 0 if everything went well. 10493e77605dSQu Wenruo * Return <0 for error. 105053b381b3SDavid Woodhouse */ 10513e77605dSQu Wenruo static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, 105253b381b3SDavid Woodhouse struct bio_list *bio_list, 10533e77605dSQu Wenruo struct sector_ptr *sector, 10543e77605dSQu Wenruo unsigned int stripe_nr, 10553e77605dSQu Wenruo unsigned int sector_nr, 1056bf9486d6SBart Van Assche enum req_op op) 105753b381b3SDavid Woodhouse { 10583e77605dSQu Wenruo const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 105953b381b3SDavid Woodhouse struct bio *last = bio_list->tail; 106053b381b3SDavid Woodhouse int ret; 106153b381b3SDavid Woodhouse struct bio *bio; 10624c664611SQu Wenruo struct btrfs_io_stripe *stripe; 106353b381b3SDavid Woodhouse u64 disk_start; 106453b381b3SDavid Woodhouse 10653e77605dSQu Wenruo /* 10663e77605dSQu Wenruo * Note: here stripe_nr has taken device replace into consideration, 10673e77605dSQu Wenruo * thus it can be larger than rbio->real_stripe. 10683e77605dSQu Wenruo * So here we check against bioc->num_stripes, not rbio->real_stripes. 10693e77605dSQu Wenruo */ 10703e77605dSQu Wenruo ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes); 10713e77605dSQu Wenruo ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors); 10723e77605dSQu Wenruo ASSERT(sector->page); 10733e77605dSQu Wenruo 10744c664611SQu Wenruo stripe = &rbio->bioc->stripes[stripe_nr]; 10753e77605dSQu Wenruo disk_start = stripe->physical + sector_nr * sectorsize; 107653b381b3SDavid Woodhouse 107753b381b3SDavid Woodhouse /* if the device is missing, just fail this stripe */ 107853b381b3SDavid Woodhouse if (!stripe->dev->bdev) 107953b381b3SDavid Woodhouse return fail_rbio_index(rbio, stripe_nr); 108053b381b3SDavid Woodhouse 108153b381b3SDavid Woodhouse /* see if we can add this page onto our existing bio */ 108253b381b3SDavid Woodhouse if (last) { 10831201b58bSDavid Sterba u64 last_end = last->bi_iter.bi_sector << 9; 10844f024f37SKent Overstreet last_end += last->bi_iter.bi_size; 108553b381b3SDavid Woodhouse 108653b381b3SDavid Woodhouse /* 108753b381b3SDavid Woodhouse * we can't merge these if they are from different 108853b381b3SDavid Woodhouse * devices or if they are not contiguous 108953b381b3SDavid Woodhouse */ 1090f90ae76aSNikolay Borisov if (last_end == disk_start && !last->bi_status && 1091309dca30SChristoph Hellwig last->bi_bdev == stripe->dev->bdev) { 10923e77605dSQu Wenruo ret = bio_add_page(last, sector->page, sectorsize, 10933e77605dSQu Wenruo sector->pgoff); 10943e77605dSQu Wenruo if (ret == sectorsize) 109553b381b3SDavid Woodhouse return 0; 109653b381b3SDavid Woodhouse } 109753b381b3SDavid Woodhouse } 109853b381b3SDavid Woodhouse 109953b381b3SDavid Woodhouse /* put a new bio on the list */ 1100ff18a4afSChristoph Hellwig bio = bio_alloc(stripe->dev->bdev, 1101ff18a4afSChristoph Hellwig max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1), 1102bf9486d6SBart Van Assche op, GFP_NOFS); 11034f024f37SKent Overstreet bio->bi_iter.bi_sector = disk_start >> 9; 1104e01bf588SChristoph Hellwig bio->bi_private = rbio; 110553b381b3SDavid Woodhouse 11063e77605dSQu Wenruo bio_add_page(bio, sector->page, sectorsize, 
sector->pgoff); 110753b381b3SDavid Woodhouse bio_list_add(bio_list, bio); 110853b381b3SDavid Woodhouse return 0; 110953b381b3SDavid Woodhouse } 111053b381b3SDavid Woodhouse 111153b381b3SDavid Woodhouse /* 111253b381b3SDavid Woodhouse * while we're doing the read/modify/write cycle, we could 111353b381b3SDavid Woodhouse * have errors in reading pages off the disk. This checks 111453b381b3SDavid Woodhouse * for errors and if we're not able to read the page it'll 111553b381b3SDavid Woodhouse * trigger parity reconstruction. The rmw will be finished 111653b381b3SDavid Woodhouse * after we've reconstructed the failed stripes 111753b381b3SDavid Woodhouse */ 111853b381b3SDavid Woodhouse static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) 111953b381b3SDavid Woodhouse { 112053b381b3SDavid Woodhouse if (rbio->faila >= 0 || rbio->failb >= 0) { 11212c8cdd6eSMiao Xie BUG_ON(rbio->faila == rbio->real_stripes - 1); 112253b381b3SDavid Woodhouse __raid56_parity_recover(rbio); 112353b381b3SDavid Woodhouse } else { 112453b381b3SDavid Woodhouse finish_rmw(rbio); 112553b381b3SDavid Woodhouse } 112653b381b3SDavid Woodhouse } 112753b381b3SDavid Woodhouse 112800425dd9SQu Wenruo static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) 112900425dd9SQu Wenruo { 113000425dd9SQu Wenruo const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 113100425dd9SQu Wenruo struct bio_vec bvec; 113200425dd9SQu Wenruo struct bvec_iter iter; 113300425dd9SQu Wenruo u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - 113400425dd9SQu Wenruo rbio->bioc->raid_map[0]; 113500425dd9SQu Wenruo 113600425dd9SQu Wenruo bio_for_each_segment(bvec, bio, iter) { 113700425dd9SQu Wenruo u32 bvec_offset; 113800425dd9SQu Wenruo 113900425dd9SQu Wenruo for (bvec_offset = 0; bvec_offset < bvec.bv_len; 114000425dd9SQu Wenruo bvec_offset += sectorsize, offset += sectorsize) { 114100425dd9SQu Wenruo int index = offset / sectorsize; 114200425dd9SQu Wenruo struct sector_ptr *sector = &rbio->bio_sectors[index]; 114300425dd9SQu Wenruo 114400425dd9SQu Wenruo sector->page = bvec.bv_page; 114500425dd9SQu Wenruo sector->pgoff = bvec.bv_offset + bvec_offset; 114600425dd9SQu Wenruo ASSERT(sector->pgoff < PAGE_SIZE); 114700425dd9SQu Wenruo } 114800425dd9SQu Wenruo } 114900425dd9SQu Wenruo } 115000425dd9SQu Wenruo 115153b381b3SDavid Woodhouse /* 115253b381b3SDavid Woodhouse * helper function to walk our bio list and populate the bio_pages array with 115353b381b3SDavid Woodhouse * the result. This seems expensive, but it is faster than constantly 115453b381b3SDavid Woodhouse * searching through the bio list as we setup the IO in finish_rmw or stripe 115553b381b3SDavid Woodhouse * reconstruction. 
115653b381b3SDavid Woodhouse  *
115753b381b3SDavid Woodhouse  * This must be called before you trust the answers from page_in_rbio
115853b381b3SDavid Woodhouse  */
115953b381b3SDavid Woodhouse static void index_rbio_pages(struct btrfs_raid_bio *rbio)
116053b381b3SDavid Woodhouse {
116153b381b3SDavid Woodhouse 	struct bio *bio;
116253b381b3SDavid Woodhouse 
116353b381b3SDavid Woodhouse 	spin_lock_irq(&rbio->bio_list_lock);
116400425dd9SQu Wenruo 	bio_list_for_each(bio, &rbio->bio_list)
116500425dd9SQu Wenruo 		index_one_bio(rbio, bio);
116600425dd9SQu Wenruo 
116753b381b3SDavid Woodhouse 	spin_unlock_irq(&rbio->bio_list_lock);
116853b381b3SDavid Woodhouse }
116953b381b3SDavid Woodhouse 
1170b8bea09aSQu Wenruo static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
1171b8bea09aSQu Wenruo 			       struct raid56_bio_trace_info *trace_info)
1172b8bea09aSQu Wenruo {
1173b8bea09aSQu Wenruo 	const struct btrfs_io_context *bioc = rbio->bioc;
1174b8bea09aSQu Wenruo 	int i;
1175b8bea09aSQu Wenruo 
1176b8bea09aSQu Wenruo 	ASSERT(bioc);
1177b8bea09aSQu Wenruo 
1178b8bea09aSQu Wenruo 	/* We rely on bio->bi_bdev to find the stripe number. */
1179b8bea09aSQu Wenruo 	if (!bio->bi_bdev)
1180b8bea09aSQu Wenruo 		goto not_found;
1181b8bea09aSQu Wenruo 
1182b8bea09aSQu Wenruo 	for (i = 0; i < bioc->num_stripes; i++) {
1183b8bea09aSQu Wenruo 		if (bio->bi_bdev != bioc->stripes[i].dev->bdev)
1184b8bea09aSQu Wenruo 			continue;
1185b8bea09aSQu Wenruo 		trace_info->stripe_nr = i;
1186b8bea09aSQu Wenruo 		trace_info->devid = bioc->stripes[i].dev->devid;
1187b8bea09aSQu Wenruo 		trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1188b8bea09aSQu Wenruo 				     bioc->stripes[i].physical;
1189b8bea09aSQu Wenruo 		return;
1190b8bea09aSQu Wenruo 	}
1191b8bea09aSQu Wenruo 
1192b8bea09aSQu Wenruo not_found:
1193b8bea09aSQu Wenruo 	trace_info->devid = -1;
1194b8bea09aSQu Wenruo 	trace_info->offset = -1;
1195b8bea09aSQu Wenruo 	trace_info->stripe_nr = -1;
1196b8bea09aSQu Wenruo }
1197b8bea09aSQu Wenruo 
119830e3c897SQu Wenruo /* Generate PQ for one vertical stripe.
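 * P is the byte-wise XOR of all data sectors in this vertical stripe; for
 * RAID6 the Q syndrome is generated by raid6_call.gen_syndrome() from the
 * same pointers.  Rough sketch:
 *
 *   P = D0 ^ D1 ^ ... ^ D(n-1)
 *   Q = g^0*D0 ^ g^1*D1 ^ ... ^ g^(n-1)*D(n-1)   (GF(2^8) math, done by the
 *                                                 raid6 library)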
*/ 119930e3c897SQu Wenruo static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) 120030e3c897SQu Wenruo { 120130e3c897SQu Wenruo void **pointers = rbio->finish_pointers; 120230e3c897SQu Wenruo const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 120330e3c897SQu Wenruo struct sector_ptr *sector; 120430e3c897SQu Wenruo int stripe; 120530e3c897SQu Wenruo const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6; 120630e3c897SQu Wenruo 120730e3c897SQu Wenruo /* First collect one sector from each data stripe */ 120830e3c897SQu Wenruo for (stripe = 0; stripe < rbio->nr_data; stripe++) { 120930e3c897SQu Wenruo sector = sector_in_rbio(rbio, stripe, sectornr, 0); 121030e3c897SQu Wenruo pointers[stripe] = kmap_local_page(sector->page) + 121130e3c897SQu Wenruo sector->pgoff; 121230e3c897SQu Wenruo } 121330e3c897SQu Wenruo 121430e3c897SQu Wenruo /* Then add the parity stripe */ 121530e3c897SQu Wenruo sector = rbio_pstripe_sector(rbio, sectornr); 121630e3c897SQu Wenruo sector->uptodate = 1; 121730e3c897SQu Wenruo pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; 121830e3c897SQu Wenruo 121930e3c897SQu Wenruo if (has_qstripe) { 122030e3c897SQu Wenruo /* 122130e3c897SQu Wenruo * RAID6, add the qstripe and call the library function 122230e3c897SQu Wenruo * to fill in our p/q 122330e3c897SQu Wenruo */ 122430e3c897SQu Wenruo sector = rbio_qstripe_sector(rbio, sectornr); 122530e3c897SQu Wenruo sector->uptodate = 1; 122630e3c897SQu Wenruo pointers[stripe++] = kmap_local_page(sector->page) + 122730e3c897SQu Wenruo sector->pgoff; 122830e3c897SQu Wenruo 122930e3c897SQu Wenruo raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, 123030e3c897SQu Wenruo pointers); 123130e3c897SQu Wenruo } else { 123230e3c897SQu Wenruo /* raid5 */ 123330e3c897SQu Wenruo memcpy(pointers[rbio->nr_data], pointers[0], sectorsize); 123430e3c897SQu Wenruo run_xor(pointers + 1, rbio->nr_data - 1, sectorsize); 123530e3c897SQu Wenruo } 123630e3c897SQu Wenruo for (stripe = stripe - 1; stripe >= 0; stripe--) 123730e3c897SQu Wenruo kunmap_local(pointers[stripe]); 123830e3c897SQu Wenruo } 123930e3c897SQu Wenruo 124053b381b3SDavid Woodhouse /* 124153b381b3SDavid Woodhouse * this is called from one of two situations. We either 124253b381b3SDavid Woodhouse * have a full stripe from the higher layers, or we've read all 124353b381b3SDavid Woodhouse * the missing bits off disk. 124453b381b3SDavid Woodhouse * 124553b381b3SDavid Woodhouse * This will calculate the parity and then send down any 124653b381b3SDavid Woodhouse * changed blocks. 124753b381b3SDavid Woodhouse */ 124853b381b3SDavid Woodhouse static noinline void finish_rmw(struct btrfs_raid_bio *rbio) 124953b381b3SDavid Woodhouse { 12504c664611SQu Wenruo struct btrfs_io_context *bioc = rbio->bioc; 125136920044SQu Wenruo /* The total sector number inside the full stripe. */ 125236920044SQu Wenruo int total_sector_nr; 125353b381b3SDavid Woodhouse int stripe; 125436920044SQu Wenruo /* Sector number inside a stripe. */ 12553e77605dSQu Wenruo int sectornr; 125653b381b3SDavid Woodhouse struct bio_list bio_list; 125753b381b3SDavid Woodhouse struct bio *bio; 125853b381b3SDavid Woodhouse int ret; 125953b381b3SDavid Woodhouse 126053b381b3SDavid Woodhouse bio_list_init(&bio_list); 126153b381b3SDavid Woodhouse 1262bd8f7e62SQu Wenruo /* We should have at least one data sector. 
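 * (rbio->dbitmap has one bit per vertical stripe; rbio_add_bio() sets a bit
 * for every sector the upper layer handed us, so an empty bitmap here would
 * mean we locked a full stripe with nothing to write.)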
*/ 1263bd8f7e62SQu Wenruo ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors)); 1264bd8f7e62SQu Wenruo 126553b381b3SDavid Woodhouse /* at this point we either have a full stripe, 126653b381b3SDavid Woodhouse * or we've read the full stripe from the drive. 126753b381b3SDavid Woodhouse * recalculate the parity and write the new results. 126853b381b3SDavid Woodhouse * 126953b381b3SDavid Woodhouse * We're not allowed to add any new bios to the 127053b381b3SDavid Woodhouse * bio list here, anyone else that wants to 127153b381b3SDavid Woodhouse * change this stripe needs to do their own rmw. 127253b381b3SDavid Woodhouse */ 127353b381b3SDavid Woodhouse spin_lock_irq(&rbio->bio_list_lock); 127453b381b3SDavid Woodhouse set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 127553b381b3SDavid Woodhouse spin_unlock_irq(&rbio->bio_list_lock); 127653b381b3SDavid Woodhouse 1277b89e1b01SMiao Xie atomic_set(&rbio->error, 0); 127853b381b3SDavid Woodhouse 127953b381b3SDavid Woodhouse /* 128053b381b3SDavid Woodhouse * now that we've set rmw_locked, run through the 128153b381b3SDavid Woodhouse * bio list one last time and map the page pointers 12824ae10b3aSChris Mason * 12834ae10b3aSChris Mason * We don't cache full rbios because we're assuming 12844ae10b3aSChris Mason * the higher layers are unlikely to use this area of 12854ae10b3aSChris Mason * the disk again soon. If they do use it again, 12864ae10b3aSChris Mason * hopefully they will send another full bio. 128753b381b3SDavid Woodhouse */ 128853b381b3SDavid Woodhouse index_rbio_pages(rbio); 12894ae10b3aSChris Mason if (!rbio_is_full(rbio)) 12904ae10b3aSChris Mason cache_rbio_pages(rbio); 12914ae10b3aSChris Mason else 12924ae10b3aSChris Mason clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 129353b381b3SDavid Woodhouse 129430e3c897SQu Wenruo for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) 129530e3c897SQu Wenruo generate_pq_vertical(rbio, sectornr); 129653b381b3SDavid Woodhouse 129753b381b3SDavid Woodhouse /* 129836920044SQu Wenruo * Start writing. Make bios for everything from the higher layers (the 129936920044SQu Wenruo * bio_list in our rbio) and our P/Q. Ignore everything else. 130053b381b3SDavid Woodhouse */ 130136920044SQu Wenruo for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; 130236920044SQu Wenruo total_sector_nr++) { 13033e77605dSQu Wenruo struct sector_ptr *sector; 13043e77605dSQu Wenruo 130536920044SQu Wenruo stripe = total_sector_nr / rbio->stripe_nsectors; 130636920044SQu Wenruo sectornr = total_sector_nr % rbio->stripe_nsectors; 130736920044SQu Wenruo 1308bd8f7e62SQu Wenruo /* This vertical stripe has no data, skip it. 
*/ 1309bd8f7e62SQu Wenruo if (!test_bit(sectornr, &rbio->dbitmap)) 1310bd8f7e62SQu Wenruo continue; 1311bd8f7e62SQu Wenruo 131253b381b3SDavid Woodhouse if (stripe < rbio->nr_data) { 13133e77605dSQu Wenruo sector = sector_in_rbio(rbio, stripe, sectornr, 1); 13143e77605dSQu Wenruo if (!sector) 131553b381b3SDavid Woodhouse continue; 131653b381b3SDavid Woodhouse } else { 13173e77605dSQu Wenruo sector = rbio_stripe_sector(rbio, stripe, sectornr); 131853b381b3SDavid Woodhouse } 131953b381b3SDavid Woodhouse 13203e77605dSQu Wenruo ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, 1321ff18a4afSChristoph Hellwig sectornr, REQ_OP_WRITE); 132253b381b3SDavid Woodhouse if (ret) 132353b381b3SDavid Woodhouse goto cleanup; 132453b381b3SDavid Woodhouse } 132553b381b3SDavid Woodhouse 13264c664611SQu Wenruo if (likely(!bioc->num_tgtdevs)) 13272c8cdd6eSMiao Xie goto write_data; 13282c8cdd6eSMiao Xie 132936920044SQu Wenruo for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; 133036920044SQu Wenruo total_sector_nr++) { 13313e77605dSQu Wenruo struct sector_ptr *sector; 13323e77605dSQu Wenruo 133336920044SQu Wenruo stripe = total_sector_nr / rbio->stripe_nsectors; 133436920044SQu Wenruo sectornr = total_sector_nr % rbio->stripe_nsectors; 133536920044SQu Wenruo 133636920044SQu Wenruo if (!bioc->tgtdev_map[stripe]) { 133736920044SQu Wenruo /* 133836920044SQu Wenruo * We can skip the whole stripe completely, note 133936920044SQu Wenruo * total_sector_nr will be increased by one anyway. 134036920044SQu Wenruo */ 134136920044SQu Wenruo ASSERT(sectornr == 0); 134236920044SQu Wenruo total_sector_nr += rbio->stripe_nsectors - 1; 134336920044SQu Wenruo continue; 134436920044SQu Wenruo } 134536920044SQu Wenruo 1346bd8f7e62SQu Wenruo /* This vertical stripe has no data, skip it. 
*/ 1347bd8f7e62SQu Wenruo if (!test_bit(sectornr, &rbio->dbitmap)) 1348bd8f7e62SQu Wenruo continue; 1349bd8f7e62SQu Wenruo 13502c8cdd6eSMiao Xie if (stripe < rbio->nr_data) { 13513e77605dSQu Wenruo sector = sector_in_rbio(rbio, stripe, sectornr, 1); 13523e77605dSQu Wenruo if (!sector) 13532c8cdd6eSMiao Xie continue; 13542c8cdd6eSMiao Xie } else { 13553e77605dSQu Wenruo sector = rbio_stripe_sector(rbio, stripe, sectornr); 13562c8cdd6eSMiao Xie } 13572c8cdd6eSMiao Xie 13583e77605dSQu Wenruo ret = rbio_add_io_sector(rbio, &bio_list, sector, 13594c664611SQu Wenruo rbio->bioc->tgtdev_map[stripe], 1360ff18a4afSChristoph Hellwig sectornr, REQ_OP_WRITE); 13612c8cdd6eSMiao Xie if (ret) 13622c8cdd6eSMiao Xie goto cleanup; 13632c8cdd6eSMiao Xie } 13642c8cdd6eSMiao Xie 13652c8cdd6eSMiao Xie write_data: 1366b89e1b01SMiao Xie atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list)); 1367b89e1b01SMiao Xie BUG_ON(atomic_read(&rbio->stripes_pending) == 0); 136853b381b3SDavid Woodhouse 1369bf28a605SNikolay Borisov while ((bio = bio_list_pop(&bio_list))) { 137053b381b3SDavid Woodhouse bio->bi_end_io = raid_write_end_io; 13714e49ea4aSMike Christie 1372b8bea09aSQu Wenruo if (trace_raid56_write_stripe_enabled()) { 1373b8bea09aSQu Wenruo struct raid56_bio_trace_info trace_info = { 0 }; 1374b8bea09aSQu Wenruo 1375b8bea09aSQu Wenruo bio_get_trace_info(rbio, bio, &trace_info); 1376b8bea09aSQu Wenruo trace_raid56_write_stripe(rbio, bio, &trace_info); 1377b8bea09aSQu Wenruo } 13784e49ea4aSMike Christie submit_bio(bio); 137953b381b3SDavid Woodhouse } 138053b381b3SDavid Woodhouse return; 138153b381b3SDavid Woodhouse 138253b381b3SDavid Woodhouse cleanup: 138358efbc9fSOmar Sandoval rbio_orig_end_io(rbio, BLK_STS_IOERR); 1384785884fcSLiu Bo 1385785884fcSLiu Bo while ((bio = bio_list_pop(&bio_list))) 1386785884fcSLiu Bo bio_put(bio); 138753b381b3SDavid Woodhouse } 138853b381b3SDavid Woodhouse 138953b381b3SDavid Woodhouse /* 139053b381b3SDavid Woodhouse * helper to find the stripe number for a given bio. Used to figure out which 139153b381b3SDavid Woodhouse * stripe has failed. This expects the bio to correspond to a physical disk, 139253b381b3SDavid Woodhouse * so it looks up based on physical sector numbers. 139353b381b3SDavid Woodhouse */ 139453b381b3SDavid Woodhouse static int find_bio_stripe(struct btrfs_raid_bio *rbio, 139553b381b3SDavid Woodhouse struct bio *bio) 139653b381b3SDavid Woodhouse { 13974f024f37SKent Overstreet u64 physical = bio->bi_iter.bi_sector; 139853b381b3SDavid Woodhouse int i; 13994c664611SQu Wenruo struct btrfs_io_stripe *stripe; 140053b381b3SDavid Woodhouse 140153b381b3SDavid Woodhouse physical <<= 9; 140253b381b3SDavid Woodhouse 14034c664611SQu Wenruo for (i = 0; i < rbio->bioc->num_stripes; i++) { 14044c664611SQu Wenruo stripe = &rbio->bioc->stripes[i]; 1405ff18a4afSChristoph Hellwig if (in_range(physical, stripe->physical, BTRFS_STRIPE_LEN) && 1406309dca30SChristoph Hellwig stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) { 140753b381b3SDavid Woodhouse return i; 140853b381b3SDavid Woodhouse } 140953b381b3SDavid Woodhouse } 141053b381b3SDavid Woodhouse return -1; 141153b381b3SDavid Woodhouse } 141253b381b3SDavid Woodhouse 141353b381b3SDavid Woodhouse /* 141453b381b3SDavid Woodhouse * helper to find the stripe number for a given 141553b381b3SDavid Woodhouse * bio (before mapping). Used to figure out which stripe has 141653b381b3SDavid Woodhouse * failed. This looks up based on logical block numbers. 
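 * For example, a bio whose logical start falls inside
 * [raid_map[i], raid_map[i] + BTRFS_STRIPE_LEN) came from data stripe i,
 * which is exactly the in_range() check below.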
141753b381b3SDavid Woodhouse */ 141853b381b3SDavid Woodhouse static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, 141953b381b3SDavid Woodhouse struct bio *bio) 142053b381b3SDavid Woodhouse { 14211201b58bSDavid Sterba u64 logical = bio->bi_iter.bi_sector << 9; 142253b381b3SDavid Woodhouse int i; 142353b381b3SDavid Woodhouse 142453b381b3SDavid Woodhouse for (i = 0; i < rbio->nr_data; i++) { 14254c664611SQu Wenruo u64 stripe_start = rbio->bioc->raid_map[i]; 142683025863SNikolay Borisov 1427ff18a4afSChristoph Hellwig if (in_range(logical, stripe_start, BTRFS_STRIPE_LEN)) 142853b381b3SDavid Woodhouse return i; 142953b381b3SDavid Woodhouse } 143053b381b3SDavid Woodhouse return -1; 143153b381b3SDavid Woodhouse } 143253b381b3SDavid Woodhouse 143353b381b3SDavid Woodhouse /* 143453b381b3SDavid Woodhouse * returns -EIO if we had too many failures 143553b381b3SDavid Woodhouse */ 143653b381b3SDavid Woodhouse static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) 143753b381b3SDavid Woodhouse { 143853b381b3SDavid Woodhouse unsigned long flags; 143953b381b3SDavid Woodhouse int ret = 0; 144053b381b3SDavid Woodhouse 144153b381b3SDavid Woodhouse spin_lock_irqsave(&rbio->bio_list_lock, flags); 144253b381b3SDavid Woodhouse 144353b381b3SDavid Woodhouse /* we already know this stripe is bad, move on */ 144453b381b3SDavid Woodhouse if (rbio->faila == failed || rbio->failb == failed) 144553b381b3SDavid Woodhouse goto out; 144653b381b3SDavid Woodhouse 144753b381b3SDavid Woodhouse if (rbio->faila == -1) { 144853b381b3SDavid Woodhouse /* first failure on this rbio */ 144953b381b3SDavid Woodhouse rbio->faila = failed; 1450b89e1b01SMiao Xie atomic_inc(&rbio->error); 145153b381b3SDavid Woodhouse } else if (rbio->failb == -1) { 145253b381b3SDavid Woodhouse /* second failure on this rbio */ 145353b381b3SDavid Woodhouse rbio->failb = failed; 1454b89e1b01SMiao Xie atomic_inc(&rbio->error); 145553b381b3SDavid Woodhouse } else { 145653b381b3SDavid Woodhouse ret = -EIO; 145753b381b3SDavid Woodhouse } 145853b381b3SDavid Woodhouse out: 145953b381b3SDavid Woodhouse spin_unlock_irqrestore(&rbio->bio_list_lock, flags); 146053b381b3SDavid Woodhouse 146153b381b3SDavid Woodhouse return ret; 146253b381b3SDavid Woodhouse } 146353b381b3SDavid Woodhouse 146453b381b3SDavid Woodhouse /* 146553b381b3SDavid Woodhouse * helper to fail a stripe based on a physical disk 146653b381b3SDavid Woodhouse * bio. 146753b381b3SDavid Woodhouse */ 146853b381b3SDavid Woodhouse static int fail_bio_stripe(struct btrfs_raid_bio *rbio, 146953b381b3SDavid Woodhouse struct bio *bio) 147053b381b3SDavid Woodhouse { 147153b381b3SDavid Woodhouse int failed = find_bio_stripe(rbio, bio); 147253b381b3SDavid Woodhouse 147353b381b3SDavid Woodhouse if (failed < 0) 147453b381b3SDavid Woodhouse return -EIO; 147553b381b3SDavid Woodhouse 147653b381b3SDavid Woodhouse return fail_rbio_index(rbio, failed); 147753b381b3SDavid Woodhouse } 147853b381b3SDavid Woodhouse 147953b381b3SDavid Woodhouse /* 14805fdb7afcSQu Wenruo * For subpage case, we can no longer set page Uptodate directly for 14815fdb7afcSQu Wenruo * stripe_pages[], thus we need to locate the sector. 
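 * (e.g. with 64K pages and a 4K sectorsize one stripe page backs 16 sectors,
 * so uptodate has to be tracked per sector_ptr rather than per page)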
14825fdb7afcSQu Wenruo */ 14835fdb7afcSQu Wenruo static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, 14845fdb7afcSQu Wenruo struct page *page, 14855fdb7afcSQu Wenruo unsigned int pgoff) 14865fdb7afcSQu Wenruo { 14875fdb7afcSQu Wenruo int i; 14885fdb7afcSQu Wenruo 14895fdb7afcSQu Wenruo for (i = 0; i < rbio->nr_sectors; i++) { 14905fdb7afcSQu Wenruo struct sector_ptr *sector = &rbio->stripe_sectors[i]; 14915fdb7afcSQu Wenruo 14925fdb7afcSQu Wenruo if (sector->page == page && sector->pgoff == pgoff) 14935fdb7afcSQu Wenruo return sector; 14945fdb7afcSQu Wenruo } 14955fdb7afcSQu Wenruo return NULL; 14965fdb7afcSQu Wenruo } 14975fdb7afcSQu Wenruo 14985fdb7afcSQu Wenruo /* 149953b381b3SDavid Woodhouse * this sets each page in the bio uptodate. It should only be used on private 150053b381b3SDavid Woodhouse * rbio pages, nothing that comes in from the higher layers 150153b381b3SDavid Woodhouse */ 15025fdb7afcSQu Wenruo static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) 150353b381b3SDavid Woodhouse { 15045fdb7afcSQu Wenruo const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 15050198e5b7SLiu Bo struct bio_vec *bvec; 15066dc4f100SMing Lei struct bvec_iter_all iter_all; 150753b381b3SDavid Woodhouse 15080198e5b7SLiu Bo ASSERT(!bio_flagged(bio, BIO_CLONED)); 15096592e58cSFilipe Manana 15105fdb7afcSQu Wenruo bio_for_each_segment_all(bvec, bio, iter_all) { 15115fdb7afcSQu Wenruo struct sector_ptr *sector; 15125fdb7afcSQu Wenruo int pgoff; 15135fdb7afcSQu Wenruo 15145fdb7afcSQu Wenruo for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len; 15155fdb7afcSQu Wenruo pgoff += sectorsize) { 15165fdb7afcSQu Wenruo sector = find_stripe_sector(rbio, bvec->bv_page, pgoff); 15175fdb7afcSQu Wenruo ASSERT(sector); 15185fdb7afcSQu Wenruo if (sector) 15195fdb7afcSQu Wenruo sector->uptodate = 1; 15205fdb7afcSQu Wenruo } 15215fdb7afcSQu Wenruo } 152253b381b3SDavid Woodhouse } 152353b381b3SDavid Woodhouse 1524*d817ce35SQu Wenruo static void raid_wait_read_end_io(struct bio *bio) 1525*d817ce35SQu Wenruo { 1526*d817ce35SQu Wenruo struct btrfs_raid_bio *rbio = bio->bi_private; 1527*d817ce35SQu Wenruo 1528*d817ce35SQu Wenruo if (bio->bi_status) 1529*d817ce35SQu Wenruo fail_bio_stripe(rbio, bio); 1530*d817ce35SQu Wenruo else 1531*d817ce35SQu Wenruo set_bio_pages_uptodate(rbio, bio); 1532*d817ce35SQu Wenruo 1533*d817ce35SQu Wenruo bio_put(bio); 1534*d817ce35SQu Wenruo if (atomic_dec_and_test(&rbio->stripes_pending)) 1535*d817ce35SQu Wenruo wake_up(&rbio->io_wait); 1536*d817ce35SQu Wenruo } 1537*d817ce35SQu Wenruo 1538*d817ce35SQu Wenruo static void submit_read_bios(struct btrfs_raid_bio *rbio, 1539*d817ce35SQu Wenruo struct bio_list *bio_list) 1540*d817ce35SQu Wenruo { 1541*d817ce35SQu Wenruo struct bio *bio; 1542*d817ce35SQu Wenruo 1543*d817ce35SQu Wenruo atomic_set(&rbio->stripes_pending, bio_list_size(bio_list)); 1544*d817ce35SQu Wenruo while ((bio = bio_list_pop(bio_list))) { 1545*d817ce35SQu Wenruo bio->bi_end_io = raid_wait_read_end_io; 1546*d817ce35SQu Wenruo 1547*d817ce35SQu Wenruo if (trace_raid56_scrub_read_recover_enabled()) { 1548*d817ce35SQu Wenruo struct raid56_bio_trace_info trace_info = { 0 }; 1549*d817ce35SQu Wenruo 1550*d817ce35SQu Wenruo bio_get_trace_info(rbio, bio, &trace_info); 1551*d817ce35SQu Wenruo trace_raid56_scrub_read_recover(rbio, bio, &trace_info); 1552*d817ce35SQu Wenruo } 1553*d817ce35SQu Wenruo submit_bio(bio); 1554*d817ce35SQu Wenruo } 1555*d817ce35SQu Wenruo } 1556*d817ce35SQu Wenruo 1557d34e123dSChristoph Hellwig static void 
raid56_bio_end_io(struct bio *bio) 155853b381b3SDavid Woodhouse { 155953b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio = bio->bi_private; 156053b381b3SDavid Woodhouse 15614e4cbee9SChristoph Hellwig if (bio->bi_status) 156253b381b3SDavid Woodhouse fail_bio_stripe(rbio, bio); 156353b381b3SDavid Woodhouse else 15645fdb7afcSQu Wenruo set_bio_pages_uptodate(rbio, bio); 156553b381b3SDavid Woodhouse 156653b381b3SDavid Woodhouse bio_put(bio); 156753b381b3SDavid Woodhouse 1568d34e123dSChristoph Hellwig if (atomic_dec_and_test(&rbio->stripes_pending)) 1569d34e123dSChristoph Hellwig queue_work(rbio->bioc->fs_info->endio_raid56_workers, 1570d34e123dSChristoph Hellwig &rbio->end_io_work); 1571d34e123dSChristoph Hellwig } 157253b381b3SDavid Woodhouse 157353b381b3SDavid Woodhouse /* 1574d34e123dSChristoph Hellwig * End io handler for the read phase of the RMW cycle. All the bios here are 1575d34e123dSChristoph Hellwig * physical stripe bios we've read from the disk so we can recalculate the 1576d34e123dSChristoph Hellwig * parity of the stripe. 1577d34e123dSChristoph Hellwig * 1578d34e123dSChristoph Hellwig * This will usually kick off finish_rmw once all the bios are read in, but it 1579d34e123dSChristoph Hellwig * may trigger parity reconstruction if we had any errors along the way 1580d34e123dSChristoph Hellwig */ 1581d34e123dSChristoph Hellwig static void raid56_rmw_end_io_work(struct work_struct *work) 1582d34e123dSChristoph Hellwig { 1583d34e123dSChristoph Hellwig struct btrfs_raid_bio *rbio = 1584d34e123dSChristoph Hellwig container_of(work, struct btrfs_raid_bio, end_io_work); 1585d34e123dSChristoph Hellwig 1586d34e123dSChristoph Hellwig if (atomic_read(&rbio->error) > rbio->bioc->max_errors) { 1587d34e123dSChristoph Hellwig rbio_orig_end_io(rbio, BLK_STS_IOERR); 1588d34e123dSChristoph Hellwig return; 1589d34e123dSChristoph Hellwig } 1590d34e123dSChristoph Hellwig 1591d34e123dSChristoph Hellwig /* 1592d34e123dSChristoph Hellwig * This will normally call finish_rmw to start our write but if there 1593d34e123dSChristoph Hellwig * are any failed stripes we'll reconstruct from parity first. 159453b381b3SDavid Woodhouse */ 159553b381b3SDavid Woodhouse validate_rbio_for_rmw(rbio); 159653b381b3SDavid Woodhouse } 159753b381b3SDavid Woodhouse 159853b381b3SDavid Woodhouse /* 159953b381b3SDavid Woodhouse * the stripe must be locked by the caller. It will 160053b381b3SDavid Woodhouse * unlock after all the writes are done 160153b381b3SDavid Woodhouse */ 160253b381b3SDavid Woodhouse static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) 160353b381b3SDavid Woodhouse { 160453b381b3SDavid Woodhouse int bios_to_read = 0; 160553b381b3SDavid Woodhouse struct bio_list bio_list; 1606550cdeb3SQu Wenruo const int nr_data_sectors = rbio->stripe_nsectors * rbio->nr_data; 160753b381b3SDavid Woodhouse int ret; 1608550cdeb3SQu Wenruo int total_sector_nr; 160953b381b3SDavid Woodhouse struct bio *bio; 161053b381b3SDavid Woodhouse 161153b381b3SDavid Woodhouse bio_list_init(&bio_list); 161253b381b3SDavid Woodhouse 161353b381b3SDavid Woodhouse ret = alloc_rbio_pages(rbio); 161453b381b3SDavid Woodhouse if (ret) 161553b381b3SDavid Woodhouse goto cleanup; 161653b381b3SDavid Woodhouse 161753b381b3SDavid Woodhouse index_rbio_pages(rbio); 161853b381b3SDavid Woodhouse 1619b89e1b01SMiao Xie atomic_set(&rbio->error, 0); 1620550cdeb3SQu Wenruo /* Build a list of bios to read all the missing data sectors. 
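 * "Missing" here means the sector is neither provided by the bio_list
 * (sector_in_rbio() returns NULL for it) nor already uptodate from the
 * stripe cache; only those sectors are read from disk.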
*/ 1621550cdeb3SQu Wenruo for (total_sector_nr = 0; total_sector_nr < nr_data_sectors; 1622550cdeb3SQu Wenruo total_sector_nr++) { 16233e77605dSQu Wenruo struct sector_ptr *sector; 1624550cdeb3SQu Wenruo int stripe = total_sector_nr / rbio->stripe_nsectors; 1625550cdeb3SQu Wenruo int sectornr = total_sector_nr % rbio->stripe_nsectors; 16263e77605dSQu Wenruo 162753b381b3SDavid Woodhouse /* 1628550cdeb3SQu Wenruo * We want to find all the sectors missing from the rbio and 1629550cdeb3SQu Wenruo * read them from the disk. If sector_in_rbio() finds a page 1630550cdeb3SQu Wenruo * in the bio list we don't need to read it off the stripe. 163153b381b3SDavid Woodhouse */ 16323e77605dSQu Wenruo sector = sector_in_rbio(rbio, stripe, sectornr, 1); 16333e77605dSQu Wenruo if (sector) 163453b381b3SDavid Woodhouse continue; 163553b381b3SDavid Woodhouse 16363e77605dSQu Wenruo sector = rbio_stripe_sector(rbio, stripe, sectornr); 16374ae10b3aSChris Mason /* 1638550cdeb3SQu Wenruo * The bio cache may have handed us an uptodate page. If so, 1639550cdeb3SQu Wenruo * use it. 16404ae10b3aSChris Mason */ 16413e77605dSQu Wenruo if (sector->uptodate) 16424ae10b3aSChris Mason continue; 16434ae10b3aSChris Mason 16443e77605dSQu Wenruo ret = rbio_add_io_sector(rbio, &bio_list, sector, 1645ff18a4afSChristoph Hellwig stripe, sectornr, REQ_OP_READ); 164653b381b3SDavid Woodhouse if (ret) 164753b381b3SDavid Woodhouse goto cleanup; 164853b381b3SDavid Woodhouse } 164953b381b3SDavid Woodhouse 165053b381b3SDavid Woodhouse bios_to_read = bio_list_size(&bio_list); 165153b381b3SDavid Woodhouse if (!bios_to_read) { 165253b381b3SDavid Woodhouse /* 165353b381b3SDavid Woodhouse * this can happen if others have merged with 165453b381b3SDavid Woodhouse * us, it means there is nothing left to read. 165553b381b3SDavid Woodhouse * But if there are missing devices it may not be 165653b381b3SDavid Woodhouse * safe to do the full stripe write yet. 165753b381b3SDavid Woodhouse */ 165853b381b3SDavid Woodhouse goto finish; 165953b381b3SDavid Woodhouse } 166053b381b3SDavid Woodhouse 166153b381b3SDavid Woodhouse /* 16624c664611SQu Wenruo * The bioc may be freed once we submit the last bio. Make sure not to 16634c664611SQu Wenruo * touch it after that. 
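 * (which is why stripes_pending is set to the full bio count before any of
 * the read bios below are submitted)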
166453b381b3SDavid Woodhouse */ 1665b89e1b01SMiao Xie atomic_set(&rbio->stripes_pending, bios_to_read); 1666d34e123dSChristoph Hellwig INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work); 1667bf28a605SNikolay Borisov while ((bio = bio_list_pop(&bio_list))) { 1668d34e123dSChristoph Hellwig bio->bi_end_io = raid56_bio_end_io; 166953b381b3SDavid Woodhouse 1670b8bea09aSQu Wenruo if (trace_raid56_read_partial_enabled()) { 1671b8bea09aSQu Wenruo struct raid56_bio_trace_info trace_info = { 0 }; 167253b381b3SDavid Woodhouse 1673b8bea09aSQu Wenruo bio_get_trace_info(rbio, bio, &trace_info); 1674b8bea09aSQu Wenruo trace_raid56_read_partial(rbio, bio, &trace_info); 1675b8bea09aSQu Wenruo } 16764e49ea4aSMike Christie submit_bio(bio); 167753b381b3SDavid Woodhouse } 167853b381b3SDavid Woodhouse /* the actual write will happen once the reads are done */ 167953b381b3SDavid Woodhouse return 0; 168053b381b3SDavid Woodhouse 168153b381b3SDavid Woodhouse cleanup: 168258efbc9fSOmar Sandoval rbio_orig_end_io(rbio, BLK_STS_IOERR); 1683785884fcSLiu Bo 1684785884fcSLiu Bo while ((bio = bio_list_pop(&bio_list))) 1685785884fcSLiu Bo bio_put(bio); 1686785884fcSLiu Bo 168753b381b3SDavid Woodhouse return -EIO; 168853b381b3SDavid Woodhouse 168953b381b3SDavid Woodhouse finish: 169053b381b3SDavid Woodhouse validate_rbio_for_rmw(rbio); 169153b381b3SDavid Woodhouse return 0; 169253b381b3SDavid Woodhouse } 169353b381b3SDavid Woodhouse 169453b381b3SDavid Woodhouse /* 169553b381b3SDavid Woodhouse * if the upper layers pass in a full stripe, we thank them by only allocating 169653b381b3SDavid Woodhouse * enough pages to hold the parity, and sending it all down quickly. 169753b381b3SDavid Woodhouse */ 169853b381b3SDavid Woodhouse static int full_stripe_write(struct btrfs_raid_bio *rbio) 169953b381b3SDavid Woodhouse { 170053b381b3SDavid Woodhouse int ret; 170153b381b3SDavid Woodhouse 170253b381b3SDavid Woodhouse ret = alloc_rbio_parity_pages(rbio); 1703ab4c54c6SQu Wenruo if (ret) 170453b381b3SDavid Woodhouse return ret; 170553b381b3SDavid Woodhouse 170653b381b3SDavid Woodhouse ret = lock_stripe_add(rbio); 170753b381b3SDavid Woodhouse if (ret == 0) 170853b381b3SDavid Woodhouse finish_rmw(rbio); 170953b381b3SDavid Woodhouse return 0; 171053b381b3SDavid Woodhouse } 171153b381b3SDavid Woodhouse 171253b381b3SDavid Woodhouse /* 171353b381b3SDavid Woodhouse * partial stripe writes get handed over to async helpers. 171453b381b3SDavid Woodhouse * We're really hoping to merge a few more writes into this 171553b381b3SDavid Woodhouse * rbio before calculating new parity 171653b381b3SDavid Woodhouse */ 171753b381b3SDavid Woodhouse static int partial_stripe_write(struct btrfs_raid_bio *rbio) 171853b381b3SDavid Woodhouse { 171953b381b3SDavid Woodhouse int ret; 172053b381b3SDavid Woodhouse 172153b381b3SDavid Woodhouse ret = lock_stripe_add(rbio); 172253b381b3SDavid Woodhouse if (ret == 0) 1723cf6a4a75SDavid Sterba start_async_work(rbio, rmw_work); 172453b381b3SDavid Woodhouse return 0; 172553b381b3SDavid Woodhouse } 172653b381b3SDavid Woodhouse 172753b381b3SDavid Woodhouse /* 172853b381b3SDavid Woodhouse * sometimes while we were reading from the drive to 172953b381b3SDavid Woodhouse * recalculate parity, enough new bios come into create 173053b381b3SDavid Woodhouse * a full stripe. 
So we do a check here to see if we can 173153b381b3SDavid Woodhouse * go directly to finish_rmw 173253b381b3SDavid Woodhouse */ 173353b381b3SDavid Woodhouse static int __raid56_parity_write(struct btrfs_raid_bio *rbio) 173453b381b3SDavid Woodhouse { 173553b381b3SDavid Woodhouse /* head off into rmw land if we don't have a full stripe */ 173653b381b3SDavid Woodhouse if (!rbio_is_full(rbio)) 173753b381b3SDavid Woodhouse return partial_stripe_write(rbio); 173853b381b3SDavid Woodhouse return full_stripe_write(rbio); 173953b381b3SDavid Woodhouse } 174053b381b3SDavid Woodhouse 174153b381b3SDavid Woodhouse /* 17426ac0f488SChris Mason * We use plugging call backs to collect full stripes. 17436ac0f488SChris Mason * Any time we get a partial stripe write while plugged 17446ac0f488SChris Mason * we collect it into a list. When the unplug comes down, 17456ac0f488SChris Mason * we sort the list by logical block number and merge 17466ac0f488SChris Mason * everything we can into the same rbios 17476ac0f488SChris Mason */ 17486ac0f488SChris Mason struct btrfs_plug_cb { 17496ac0f488SChris Mason struct blk_plug_cb cb; 17506ac0f488SChris Mason struct btrfs_fs_info *info; 17516ac0f488SChris Mason struct list_head rbio_list; 1752385de0efSChristoph Hellwig struct work_struct work; 17536ac0f488SChris Mason }; 17546ac0f488SChris Mason 17556ac0f488SChris Mason /* 17566ac0f488SChris Mason * rbios on the plug list are sorted for easier merging. 17576ac0f488SChris Mason */ 17584f0f586bSSami Tolvanen static int plug_cmp(void *priv, const struct list_head *a, 17594f0f586bSSami Tolvanen const struct list_head *b) 17606ac0f488SChris Mason { 1761214cc184SDavid Sterba const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, 17626ac0f488SChris Mason plug_list); 1763214cc184SDavid Sterba const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, 17646ac0f488SChris Mason plug_list); 17654f024f37SKent Overstreet u64 a_sector = ra->bio_list.head->bi_iter.bi_sector; 17664f024f37SKent Overstreet u64 b_sector = rb->bio_list.head->bi_iter.bi_sector; 17676ac0f488SChris Mason 17686ac0f488SChris Mason if (a_sector < b_sector) 17696ac0f488SChris Mason return -1; 17706ac0f488SChris Mason if (a_sector > b_sector) 17716ac0f488SChris Mason return 1; 17726ac0f488SChris Mason return 0; 17736ac0f488SChris Mason } 17746ac0f488SChris Mason 17756ac0f488SChris Mason static void run_plug(struct btrfs_plug_cb *plug) 17766ac0f488SChris Mason { 17776ac0f488SChris Mason struct btrfs_raid_bio *cur; 17786ac0f488SChris Mason struct btrfs_raid_bio *last = NULL; 17796ac0f488SChris Mason 17806ac0f488SChris Mason /* 17816ac0f488SChris Mason * sort our plug list then try to merge 17826ac0f488SChris Mason * everything we can in hopes of creating full 17836ac0f488SChris Mason * stripes. 
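 * plug_cmp() orders the rbios by the logical sector of their first bio, so
 * rbios that target the same full stripe end up next to each other and can
 * be merged by rbio_can_merge()/merge_rbio() below.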
17846ac0f488SChris Mason */ 17856ac0f488SChris Mason list_sort(NULL, &plug->rbio_list, plug_cmp); 17866ac0f488SChris Mason while (!list_empty(&plug->rbio_list)) { 17876ac0f488SChris Mason cur = list_entry(plug->rbio_list.next, 17886ac0f488SChris Mason struct btrfs_raid_bio, plug_list); 17896ac0f488SChris Mason list_del_init(&cur->plug_list); 17906ac0f488SChris Mason 17916ac0f488SChris Mason if (rbio_is_full(cur)) { 1792c7b562c5SDavid Sterba int ret; 1793c7b562c5SDavid Sterba 17946ac0f488SChris Mason /* we have a full stripe, send it down */ 1795c7b562c5SDavid Sterba ret = full_stripe_write(cur); 1796c7b562c5SDavid Sterba BUG_ON(ret); 17976ac0f488SChris Mason continue; 17986ac0f488SChris Mason } 17996ac0f488SChris Mason if (last) { 18006ac0f488SChris Mason if (rbio_can_merge(last, cur)) { 18016ac0f488SChris Mason merge_rbio(last, cur); 1802ff2b64a2SQu Wenruo free_raid_bio(cur); 18036ac0f488SChris Mason continue; 18046ac0f488SChris Mason 18056ac0f488SChris Mason } 18066ac0f488SChris Mason __raid56_parity_write(last); 18076ac0f488SChris Mason } 18086ac0f488SChris Mason last = cur; 18096ac0f488SChris Mason } 18106ac0f488SChris Mason if (last) { 18116ac0f488SChris Mason __raid56_parity_write(last); 18126ac0f488SChris Mason } 18136ac0f488SChris Mason kfree(plug); 18146ac0f488SChris Mason } 18156ac0f488SChris Mason 18166ac0f488SChris Mason /* 18176ac0f488SChris Mason * if the unplug comes from schedule, we have to push the 18186ac0f488SChris Mason * work off to a helper thread 18196ac0f488SChris Mason */ 1820385de0efSChristoph Hellwig static void unplug_work(struct work_struct *work) 18216ac0f488SChris Mason { 18226ac0f488SChris Mason struct btrfs_plug_cb *plug; 18236ac0f488SChris Mason plug = container_of(work, struct btrfs_plug_cb, work); 18246ac0f488SChris Mason run_plug(plug); 18256ac0f488SChris Mason } 18266ac0f488SChris Mason 18276ac0f488SChris Mason static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) 18286ac0f488SChris Mason { 18296ac0f488SChris Mason struct btrfs_plug_cb *plug; 18306ac0f488SChris Mason plug = container_of(cb, struct btrfs_plug_cb, cb); 18316ac0f488SChris Mason 18326ac0f488SChris Mason if (from_schedule) { 1833385de0efSChristoph Hellwig INIT_WORK(&plug->work, unplug_work); 1834385de0efSChristoph Hellwig queue_work(plug->info->rmw_workers, &plug->work); 18356ac0f488SChris Mason return; 18366ac0f488SChris Mason } 18376ac0f488SChris Mason run_plug(plug); 18386ac0f488SChris Mason } 18396ac0f488SChris Mason 1840bd8f7e62SQu Wenruo /* Add the original bio into rbio->bio_list, and update rbio::dbitmap. 
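 * Roughly, for each sectorsize chunk of the bio:
 *   bit = ((logical - full_stripe_start) >> sectorsize_bits) % stripe_nsectors
 * i.e. the bit index is the sector's column inside the full stripe.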
*/ 1841bd8f7e62SQu Wenruo static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) 1842bd8f7e62SQu Wenruo { 1843bd8f7e62SQu Wenruo const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; 1844bd8f7e62SQu Wenruo const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT; 1845bd8f7e62SQu Wenruo const u64 full_stripe_start = rbio->bioc->raid_map[0]; 1846bd8f7e62SQu Wenruo const u32 orig_len = orig_bio->bi_iter.bi_size; 1847bd8f7e62SQu Wenruo const u32 sectorsize = fs_info->sectorsize; 1848bd8f7e62SQu Wenruo u64 cur_logical; 1849bd8f7e62SQu Wenruo 1850bd8f7e62SQu Wenruo ASSERT(orig_logical >= full_stripe_start && 1851bd8f7e62SQu Wenruo orig_logical + orig_len <= full_stripe_start + 1852ff18a4afSChristoph Hellwig rbio->nr_data * BTRFS_STRIPE_LEN); 1853bd8f7e62SQu Wenruo 1854bd8f7e62SQu Wenruo bio_list_add(&rbio->bio_list, orig_bio); 1855bd8f7e62SQu Wenruo rbio->bio_list_bytes += orig_bio->bi_iter.bi_size; 1856bd8f7e62SQu Wenruo 1857bd8f7e62SQu Wenruo /* Update the dbitmap. */ 1858bd8f7e62SQu Wenruo for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len; 1859bd8f7e62SQu Wenruo cur_logical += sectorsize) { 1860bd8f7e62SQu Wenruo int bit = ((u32)(cur_logical - full_stripe_start) >> 1861bd8f7e62SQu Wenruo fs_info->sectorsize_bits) % rbio->stripe_nsectors; 1862bd8f7e62SQu Wenruo 1863bd8f7e62SQu Wenruo set_bit(bit, &rbio->dbitmap); 1864bd8f7e62SQu Wenruo } 1865bd8f7e62SQu Wenruo } 1866bd8f7e62SQu Wenruo 18676ac0f488SChris Mason /* 186853b381b3SDavid Woodhouse * our main entry point for writes from the rest of the FS. 186953b381b3SDavid Woodhouse */ 187031683f4aSChristoph Hellwig void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) 187153b381b3SDavid Woodhouse { 18726a258d72SQu Wenruo struct btrfs_fs_info *fs_info = bioc->fs_info; 187353b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio; 18746ac0f488SChris Mason struct btrfs_plug_cb *plug = NULL; 18756ac0f488SChris Mason struct blk_plug_cb *cb; 187631683f4aSChristoph Hellwig int ret = 0; 187753b381b3SDavid Woodhouse 1878ff18a4afSChristoph Hellwig rbio = alloc_rbio(fs_info, bioc); 1879af8e2d1dSMiao Xie if (IS_ERR(rbio)) { 188031683f4aSChristoph Hellwig ret = PTR_ERR(rbio); 1881f1c29379SChristoph Hellwig goto fail; 1882af8e2d1dSMiao Xie } 18831b94b556SMiao Xie rbio->operation = BTRFS_RBIO_WRITE; 1884bd8f7e62SQu Wenruo rbio_add_bio(rbio, bio); 18856ac0f488SChris Mason 18866ac0f488SChris Mason /* 18876ac0f488SChris Mason * don't plug on full rbios, just get them out the door 18886ac0f488SChris Mason * as quickly as we can 18896ac0f488SChris Mason */ 18904245215dSMiao Xie if (rbio_is_full(rbio)) { 18914245215dSMiao Xie ret = full_stripe_write(rbio); 1892ab4c54c6SQu Wenruo if (ret) { 1893ff2b64a2SQu Wenruo free_raid_bio(rbio); 1894f1c29379SChristoph Hellwig goto fail; 1895ab4c54c6SQu Wenruo } 189631683f4aSChristoph Hellwig return; 18974245215dSMiao Xie } 18986ac0f488SChris Mason 18990b246afaSJeff Mahoney cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug)); 19006ac0f488SChris Mason if (cb) { 19016ac0f488SChris Mason plug = container_of(cb, struct btrfs_plug_cb, cb); 19026ac0f488SChris Mason if (!plug->info) { 19030b246afaSJeff Mahoney plug->info = fs_info; 19046ac0f488SChris Mason INIT_LIST_HEAD(&plug->rbio_list); 19056ac0f488SChris Mason } 19066ac0f488SChris Mason list_add_tail(&rbio->plug_list, &plug->rbio_list); 19076ac0f488SChris Mason } else { 19084245215dSMiao Xie ret = __raid56_parity_write(rbio); 1909ab4c54c6SQu Wenruo if (ret) { 1910ff2b64a2SQu Wenruo 
free_raid_bio(rbio); 1911f1c29379SChristoph Hellwig goto fail; 191253b381b3SDavid Woodhouse } 1913ab4c54c6SQu Wenruo } 191431683f4aSChristoph Hellwig 191531683f4aSChristoph Hellwig return; 191631683f4aSChristoph Hellwig 1917f1c29379SChristoph Hellwig fail: 191831683f4aSChristoph Hellwig bio->bi_status = errno_to_blk_status(ret); 191931683f4aSChristoph Hellwig bio_endio(bio); 19206ac0f488SChris Mason } 192153b381b3SDavid Woodhouse 192253b381b3SDavid Woodhouse /* 19239c5ff9b4SQu Wenruo * Recover a vertical stripe specified by @sector_nr. 19249c5ff9b4SQu Wenruo * @*pointers are the pre-allocated pointers by the caller, so we don't 19259c5ff9b4SQu Wenruo * need to allocate/free the pointers again and again. 19269c5ff9b4SQu Wenruo */ 19279c5ff9b4SQu Wenruo static void recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, 19289c5ff9b4SQu Wenruo void **pointers, void **unmap_array) 19299c5ff9b4SQu Wenruo { 19309c5ff9b4SQu Wenruo struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; 19319c5ff9b4SQu Wenruo struct sector_ptr *sector; 19329c5ff9b4SQu Wenruo const u32 sectorsize = fs_info->sectorsize; 19339c5ff9b4SQu Wenruo const int faila = rbio->faila; 19349c5ff9b4SQu Wenruo const int failb = rbio->failb; 19359c5ff9b4SQu Wenruo int stripe_nr; 19369c5ff9b4SQu Wenruo 19379c5ff9b4SQu Wenruo /* 19389c5ff9b4SQu Wenruo * Now we just use bitmap to mark the horizontal stripes in 19399c5ff9b4SQu Wenruo * which we have data when doing parity scrub. 19409c5ff9b4SQu Wenruo */ 19419c5ff9b4SQu Wenruo if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && 19429c5ff9b4SQu Wenruo !test_bit(sector_nr, &rbio->dbitmap)) 19439c5ff9b4SQu Wenruo return; 19449c5ff9b4SQu Wenruo 19459c5ff9b4SQu Wenruo /* 19469c5ff9b4SQu Wenruo * Setup our array of pointers with sectors from each stripe 19479c5ff9b4SQu Wenruo * 19489c5ff9b4SQu Wenruo * NOTE: store a duplicate array of pointers to preserve the 19499c5ff9b4SQu Wenruo * pointer order. 19509c5ff9b4SQu Wenruo */ 19519c5ff9b4SQu Wenruo for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { 19529c5ff9b4SQu Wenruo /* 19539c5ff9b4SQu Wenruo * If we're rebuilding a read, we have to use 19549c5ff9b4SQu Wenruo * pages from the bio list 19559c5ff9b4SQu Wenruo */ 19569c5ff9b4SQu Wenruo if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || 19579c5ff9b4SQu Wenruo rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && 19589c5ff9b4SQu Wenruo (stripe_nr == faila || stripe_nr == failb)) { 19599c5ff9b4SQu Wenruo sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); 19609c5ff9b4SQu Wenruo } else { 19619c5ff9b4SQu Wenruo sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); 19629c5ff9b4SQu Wenruo } 19639c5ff9b4SQu Wenruo ASSERT(sector->page); 19649c5ff9b4SQu Wenruo pointers[stripe_nr] = kmap_local_page(sector->page) + 19659c5ff9b4SQu Wenruo sector->pgoff; 19669c5ff9b4SQu Wenruo unmap_array[stripe_nr] = pointers[stripe_nr]; 19679c5ff9b4SQu Wenruo } 19689c5ff9b4SQu Wenruo 19699c5ff9b4SQu Wenruo /* All raid6 handling here */ 19709c5ff9b4SQu Wenruo if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { 19719c5ff9b4SQu Wenruo /* Single failure, rebuild from parity raid5 style */ 19729c5ff9b4SQu Wenruo if (failb < 0) { 19739c5ff9b4SQu Wenruo if (faila == rbio->nr_data) 19749c5ff9b4SQu Wenruo /* 19759c5ff9b4SQu Wenruo * Just the P stripe has failed, without 19769c5ff9b4SQu Wenruo * a bad data or Q stripe. 19779c5ff9b4SQu Wenruo * We have nothing to do, just skip the 19789c5ff9b4SQu Wenruo * recovery for this stripe. 
19799c5ff9b4SQu Wenruo */ 19809c5ff9b4SQu Wenruo goto cleanup; 19819c5ff9b4SQu Wenruo /* 19829c5ff9b4SQu Wenruo * a single failure in raid6 is rebuilt 19839c5ff9b4SQu Wenruo * in the pstripe code below 19849c5ff9b4SQu Wenruo */ 19859c5ff9b4SQu Wenruo goto pstripe; 19869c5ff9b4SQu Wenruo } 19879c5ff9b4SQu Wenruo 19889c5ff9b4SQu Wenruo /* 19899c5ff9b4SQu Wenruo * If the q stripe is failed, do a pstripe reconstruction from 19909c5ff9b4SQu Wenruo * the xors. 19919c5ff9b4SQu Wenruo * If both the q stripe and the P stripe are failed, we're 19929c5ff9b4SQu Wenruo * here due to a crc mismatch and we can't give them the 19939c5ff9b4SQu Wenruo * data they want. 19949c5ff9b4SQu Wenruo */ 19959c5ff9b4SQu Wenruo if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) { 19969c5ff9b4SQu Wenruo if (rbio->bioc->raid_map[faila] == 19979c5ff9b4SQu Wenruo RAID5_P_STRIPE) 19989c5ff9b4SQu Wenruo /* 19999c5ff9b4SQu Wenruo * Only P and Q are corrupted. 20009c5ff9b4SQu Wenruo * We only care about data stripes recovery, 20019c5ff9b4SQu Wenruo * can skip this vertical stripe. 20029c5ff9b4SQu Wenruo */ 20039c5ff9b4SQu Wenruo goto cleanup; 20049c5ff9b4SQu Wenruo /* 20059c5ff9b4SQu Wenruo * Otherwise we have one bad data stripe and 20069c5ff9b4SQu Wenruo * a good P stripe. raid5! 20079c5ff9b4SQu Wenruo */ 20089c5ff9b4SQu Wenruo goto pstripe; 20099c5ff9b4SQu Wenruo } 20109c5ff9b4SQu Wenruo 20119c5ff9b4SQu Wenruo if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) { 20129c5ff9b4SQu Wenruo raid6_datap_recov(rbio->real_stripes, sectorsize, 20139c5ff9b4SQu Wenruo faila, pointers); 20149c5ff9b4SQu Wenruo } else { 20159c5ff9b4SQu Wenruo raid6_2data_recov(rbio->real_stripes, sectorsize, 20169c5ff9b4SQu Wenruo faila, failb, pointers); 20179c5ff9b4SQu Wenruo } 20189c5ff9b4SQu Wenruo } else { 20199c5ff9b4SQu Wenruo void *p; 20209c5ff9b4SQu Wenruo 20219c5ff9b4SQu Wenruo /* Rebuild from P stripe here (raid5 or raid6). */ 20229c5ff9b4SQu Wenruo ASSERT(failb == -1); 20239c5ff9b4SQu Wenruo pstripe: 20249c5ff9b4SQu Wenruo /* Copy parity block into failed block to start with */ 20259c5ff9b4SQu Wenruo memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize); 20269c5ff9b4SQu Wenruo 20279c5ff9b4SQu Wenruo /* Rearrange the pointer array */ 20289c5ff9b4SQu Wenruo p = pointers[faila]; 20299c5ff9b4SQu Wenruo for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1; 20309c5ff9b4SQu Wenruo stripe_nr++) 20319c5ff9b4SQu Wenruo pointers[stripe_nr] = pointers[stripe_nr + 1]; 20329c5ff9b4SQu Wenruo pointers[rbio->nr_data - 1] = p; 20339c5ff9b4SQu Wenruo 20349c5ff9b4SQu Wenruo /* Xor in the rest */ 20359c5ff9b4SQu Wenruo run_xor(pointers, rbio->nr_data - 1, sectorsize); 20369c5ff9b4SQu Wenruo 20379c5ff9b4SQu Wenruo } 20389c5ff9b4SQu Wenruo 20399c5ff9b4SQu Wenruo /* 20409c5ff9b4SQu Wenruo * No matter if this is a RMW or recovery, we should have all 20419c5ff9b4SQu Wenruo * failed sectors repaired in the vertical stripe, thus they are now 20429c5ff9b4SQu Wenruo * uptodate. 20439c5ff9b4SQu Wenruo * Especially if we determine to cache the rbio, we need to 20449c5ff9b4SQu Wenruo * have at least all data sectors uptodate. 
20459c5ff9b4SQu Wenruo */ 20469c5ff9b4SQu Wenruo if (rbio->faila >= 0) { 20479c5ff9b4SQu Wenruo sector = rbio_stripe_sector(rbio, rbio->faila, sector_nr); 20489c5ff9b4SQu Wenruo sector->uptodate = 1; 20499c5ff9b4SQu Wenruo } 20509c5ff9b4SQu Wenruo if (rbio->failb >= 0) { 20519c5ff9b4SQu Wenruo sector = rbio_stripe_sector(rbio, rbio->failb, sector_nr); 20529c5ff9b4SQu Wenruo sector->uptodate = 1; 20539c5ff9b4SQu Wenruo } 20549c5ff9b4SQu Wenruo 20559c5ff9b4SQu Wenruo cleanup: 20569c5ff9b4SQu Wenruo for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--) 20579c5ff9b4SQu Wenruo kunmap_local(unmap_array[stripe_nr]); 20589c5ff9b4SQu Wenruo } 20599c5ff9b4SQu Wenruo 2060ec936b03SQu Wenruo static int recover_sectors(struct btrfs_raid_bio *rbio) 206153b381b3SDavid Woodhouse { 20629c5ff9b4SQu Wenruo void **pointers = NULL; 20639c5ff9b4SQu Wenruo void **unmap_array = NULL; 2064ec936b03SQu Wenruo int sectornr; 2065ec936b03SQu Wenruo int ret = 0; 206653b381b3SDavid Woodhouse 206707e4d380SQu Wenruo /* 2068ec936b03SQu Wenruo * @pointers array stores the pointer for each sector. 2069ec936b03SQu Wenruo * 2070ec936b03SQu Wenruo * @unmap_array stores copy of pointers that does not get reordered 2071ec936b03SQu Wenruo * during reconstruction so that kunmap_local works. 207207e4d380SQu Wenruo */ 207331e818feSDavid Sterba pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 207494a0b58dSIra Weiny unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 2075ec936b03SQu Wenruo if (!pointers || !unmap_array) { 2076ec936b03SQu Wenruo ret = -ENOMEM; 2077ec936b03SQu Wenruo goto out; 207894a0b58dSIra Weiny } 207994a0b58dSIra Weiny 20809c5ff9b4SQu Wenruo /* Make sure faila and fail b are in order. */ 20819c5ff9b4SQu Wenruo if (rbio->faila >= 0 && rbio->failb >= 0 && rbio->faila > rbio->failb) 20829c5ff9b4SQu Wenruo swap(rbio->faila, rbio->failb); 208353b381b3SDavid Woodhouse 2084b4ee1782SOmar Sandoval if (rbio->operation == BTRFS_RBIO_READ_REBUILD || 2085b4ee1782SOmar Sandoval rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { 208653b381b3SDavid Woodhouse spin_lock_irq(&rbio->bio_list_lock); 208753b381b3SDavid Woodhouse set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 208853b381b3SDavid Woodhouse spin_unlock_irq(&rbio->bio_list_lock); 208953b381b3SDavid Woodhouse } 209053b381b3SDavid Woodhouse 209153b381b3SDavid Woodhouse index_rbio_pages(rbio); 209253b381b3SDavid Woodhouse 20939c5ff9b4SQu Wenruo for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) 20949c5ff9b4SQu Wenruo recover_vertical(rbio, sectornr, pointers, unmap_array); 209553b381b3SDavid Woodhouse 2096ec936b03SQu Wenruo out: 209753b381b3SDavid Woodhouse kfree(pointers); 2098ec936b03SQu Wenruo kfree(unmap_array); 2099ec936b03SQu Wenruo return ret; 2100ec936b03SQu Wenruo } 2101ec936b03SQu Wenruo 2102ec936b03SQu Wenruo /* 2103ec936b03SQu Wenruo * all parity reconstruction happens here. We've read in everything 2104ec936b03SQu Wenruo * we can find from the drives and this does the heavy lifting of 2105ec936b03SQu Wenruo * sorting the good from the bad. 
2106ec936b03SQu Wenruo */ 2107ec936b03SQu Wenruo static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) 2108ec936b03SQu Wenruo { 2109ec936b03SQu Wenruo int ret; 2110ec936b03SQu Wenruo 2111ec936b03SQu Wenruo ret = recover_sectors(rbio); 211253b381b3SDavid Woodhouse 2113580c6efaSLiu Bo /* 2114580c6efaSLiu Bo * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a 2115580c6efaSLiu Bo * valid rbio which is consistent with ondisk content, thus such a 2116580c6efaSLiu Bo * valid rbio can be cached to avoid further disk reads. 2117580c6efaSLiu Bo */ 2118580c6efaSLiu Bo if (rbio->operation == BTRFS_RBIO_READ_REBUILD || 2119580c6efaSLiu Bo rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { 212044ac474dSLiu Bo /* 212144ac474dSLiu Bo * - In case of two failures, where rbio->failb != -1: 212244ac474dSLiu Bo * 212344ac474dSLiu Bo * Do not cache this rbio since the above read reconstruction 212444ac474dSLiu Bo * (raid6_datap_recov() or raid6_2data_recov()) may have 212544ac474dSLiu Bo * changed some content of stripes which are not identical to 212644ac474dSLiu Bo * on-disk content any more, otherwise, a later write/recover 212744ac474dSLiu Bo * may steal stripe_pages from this rbio and end up with 212844ac474dSLiu Bo * corruptions or rebuild failures. 212944ac474dSLiu Bo * 213044ac474dSLiu Bo * - In case of single failure, where rbio->failb == -1: 213144ac474dSLiu Bo * 213244ac474dSLiu Bo * Cache this rbio iff the above read reconstruction is 213352042d8eSAndrea Gelmini * executed without problems. 213444ac474dSLiu Bo */ 2135ec936b03SQu Wenruo if (!ret && rbio->failb < 0) 21364ae10b3aSChris Mason cache_rbio_pages(rbio); 21374ae10b3aSChris Mason else 21384ae10b3aSChris Mason clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 21394ae10b3aSChris Mason 2140ec936b03SQu Wenruo rbio_orig_end_io(rbio, errno_to_blk_status(ret)); 2141ec936b03SQu Wenruo } else if (!ret) { 214253b381b3SDavid Woodhouse rbio->faila = -1; 214353b381b3SDavid Woodhouse rbio->failb = -1; 21445a6ac9eaSMiao Xie 21455a6ac9eaSMiao Xie if (rbio->operation == BTRFS_RBIO_WRITE) 214653b381b3SDavid Woodhouse finish_rmw(rbio); 21475a6ac9eaSMiao Xie else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) 21485a6ac9eaSMiao Xie finish_parity_scrub(rbio, 0); 21495a6ac9eaSMiao Xie else 21505a6ac9eaSMiao Xie BUG(); 215153b381b3SDavid Woodhouse } else { 2152ec936b03SQu Wenruo rbio_orig_end_io(rbio, errno_to_blk_status(ret)); 215353b381b3SDavid Woodhouse } 215453b381b3SDavid Woodhouse } 215553b381b3SDavid Woodhouse 215653b381b3SDavid Woodhouse /* 2157d34e123dSChristoph Hellwig * This is called only for stripes we've read from disk to reconstruct the 2158d34e123dSChristoph Hellwig * parity. 
215953b381b3SDavid Woodhouse */ 2160d34e123dSChristoph Hellwig static void raid_recover_end_io_work(struct work_struct *work) 216153b381b3SDavid Woodhouse { 2162d34e123dSChristoph Hellwig struct btrfs_raid_bio *rbio = 2163d34e123dSChristoph Hellwig container_of(work, struct btrfs_raid_bio, end_io_work); 216453b381b3SDavid Woodhouse 21654c664611SQu Wenruo if (atomic_read(&rbio->error) > rbio->bioc->max_errors) 216658efbc9fSOmar Sandoval rbio_orig_end_io(rbio, BLK_STS_IOERR); 216753b381b3SDavid Woodhouse else 216853b381b3SDavid Woodhouse __raid_recover_end_io(rbio); 216953b381b3SDavid Woodhouse } 217053b381b3SDavid Woodhouse 2171d31968d9SQu Wenruo static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, 2172d31968d9SQu Wenruo struct bio_list *bio_list) 217353b381b3SDavid Woodhouse { 217453b381b3SDavid Woodhouse struct bio *bio; 2175d31968d9SQu Wenruo int total_sector_nr; 2176d31968d9SQu Wenruo int ret = 0; 217753b381b3SDavid Woodhouse 2178d31968d9SQu Wenruo ASSERT(bio_list_size(bio_list) == 0); 217953b381b3SDavid Woodhouse /* 2180f6065f8eSQu Wenruo * Read everything that hasn't failed. However this time we will 2181f6065f8eSQu Wenruo * not trust any cached sector. 2182f6065f8eSQu Wenruo * As we may read out some stale data but higher layer is not reading 2183f6065f8eSQu Wenruo * that stale part. 2184f6065f8eSQu Wenruo * 2185f6065f8eSQu Wenruo * So here we always re-read everything in recovery path. 218653b381b3SDavid Woodhouse */ 2187ef340fccSQu Wenruo for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; 2188ef340fccSQu Wenruo total_sector_nr++) { 2189ef340fccSQu Wenruo int stripe = total_sector_nr / rbio->stripe_nsectors; 2190ef340fccSQu Wenruo int sectornr = total_sector_nr % rbio->stripe_nsectors; 21913e77605dSQu Wenruo struct sector_ptr *sector; 219253b381b3SDavid Woodhouse 2193ef340fccSQu Wenruo if (rbio->faila == stripe || rbio->failb == stripe) { 2194ef340fccSQu Wenruo atomic_inc(&rbio->error); 2195ef340fccSQu Wenruo /* Skip the current stripe. */ 2196ef340fccSQu Wenruo ASSERT(sectornr == 0); 2197ef340fccSQu Wenruo total_sector_nr += rbio->stripe_nsectors - 1; 219853b381b3SDavid Woodhouse continue; 2199ef340fccSQu Wenruo } 220053b381b3SDavid Woodhouse sector = rbio_stripe_sector(rbio, stripe, sectornr); 2201d31968d9SQu Wenruo ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, 2202ff18a4afSChristoph Hellwig sectornr, REQ_OP_READ); 220353b381b3SDavid Woodhouse if (ret < 0) 2204d31968d9SQu Wenruo goto error; 220553b381b3SDavid Woodhouse } 2206d31968d9SQu Wenruo return 0; 2207d31968d9SQu Wenruo error: 2208d31968d9SQu Wenruo while ((bio = bio_list_pop(bio_list))) 2209d31968d9SQu Wenruo bio_put(bio); 2210d31968d9SQu Wenruo 2211d31968d9SQu Wenruo return -EIO; 2212d31968d9SQu Wenruo } 2213d31968d9SQu Wenruo 2214*d817ce35SQu Wenruo static int recover_rbio(struct btrfs_raid_bio *rbio) 2215*d817ce35SQu Wenruo { 2216*d817ce35SQu Wenruo struct bio_list bio_list; 2217*d817ce35SQu Wenruo struct bio *bio; 2218*d817ce35SQu Wenruo int ret; 2219*d817ce35SQu Wenruo 2220*d817ce35SQu Wenruo /* 2221*d817ce35SQu Wenruo * Either we're doing recover for a read failure or degraded write, 2222*d817ce35SQu Wenruo * caller should have set faila/b correctly. 2223*d817ce35SQu Wenruo */ 2224*d817ce35SQu Wenruo ASSERT(rbio->faila >= 0 || rbio->failb >= 0); 2225*d817ce35SQu Wenruo bio_list_init(&bio_list); 2226*d817ce35SQu Wenruo 2227*d817ce35SQu Wenruo /* 2228*d817ce35SQu Wenruo * Reset error to 0, as we will later increase error for missing 2229*d817ce35SQu Wenruo * devices. 
2230*d817ce35SQu Wenruo */ 2231*d817ce35SQu Wenruo atomic_set(&rbio->error, 0); 2232*d817ce35SQu Wenruo 2233*d817ce35SQu Wenruo /* For recovery, we need to read all sectors including P/Q. */ 2234*d817ce35SQu Wenruo ret = alloc_rbio_pages(rbio); 2235*d817ce35SQu Wenruo if (ret < 0) 2236*d817ce35SQu Wenruo goto out; 2237*d817ce35SQu Wenruo 2238*d817ce35SQu Wenruo index_rbio_pages(rbio); 2239*d817ce35SQu Wenruo 2240*d817ce35SQu Wenruo ret = recover_assemble_read_bios(rbio, &bio_list); 2241*d817ce35SQu Wenruo if (ret < 0) 2242*d817ce35SQu Wenruo goto out; 2243*d817ce35SQu Wenruo 2244*d817ce35SQu Wenruo submit_read_bios(rbio, &bio_list); 2245*d817ce35SQu Wenruo wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); 2246*d817ce35SQu Wenruo 2247*d817ce35SQu Wenruo /* We have more errors than our tolerance during the read. */ 2248*d817ce35SQu Wenruo if (atomic_read(&rbio->error) > rbio->bioc->max_errors) { 2249*d817ce35SQu Wenruo ret = -EIO; 2250*d817ce35SQu Wenruo goto out; 2251*d817ce35SQu Wenruo } 2252*d817ce35SQu Wenruo 2253*d817ce35SQu Wenruo ret = recover_sectors(rbio); 2254*d817ce35SQu Wenruo 2255*d817ce35SQu Wenruo out: 2256*d817ce35SQu Wenruo while ((bio = bio_list_pop(&bio_list))) 2257*d817ce35SQu Wenruo bio_put(bio); 2258*d817ce35SQu Wenruo 2259*d817ce35SQu Wenruo return ret; 2260*d817ce35SQu Wenruo } 2261*d817ce35SQu Wenruo 2262*d817ce35SQu Wenruo static void recover_rbio_work(struct work_struct *work) 2263*d817ce35SQu Wenruo { 2264*d817ce35SQu Wenruo struct btrfs_raid_bio *rbio; 2265*d817ce35SQu Wenruo int ret; 2266*d817ce35SQu Wenruo 2267*d817ce35SQu Wenruo rbio = container_of(work, struct btrfs_raid_bio, work); 2268*d817ce35SQu Wenruo 2269*d817ce35SQu Wenruo ret = lock_stripe_add(rbio); 2270*d817ce35SQu Wenruo if (ret == 0) { 2271*d817ce35SQu Wenruo ret = recover_rbio(rbio); 2272*d817ce35SQu Wenruo rbio_orig_end_io(rbio, errno_to_blk_status(ret)); 2273*d817ce35SQu Wenruo } 2274*d817ce35SQu Wenruo } 2275*d817ce35SQu Wenruo 2276*d817ce35SQu Wenruo static void recover_rbio_work_locked(struct work_struct *work) 2277*d817ce35SQu Wenruo { 2278*d817ce35SQu Wenruo struct btrfs_raid_bio *rbio; 2279*d817ce35SQu Wenruo int ret; 2280*d817ce35SQu Wenruo 2281*d817ce35SQu Wenruo rbio = container_of(work, struct btrfs_raid_bio, work); 2282*d817ce35SQu Wenruo 2283*d817ce35SQu Wenruo ret = recover_rbio(rbio); 2284*d817ce35SQu Wenruo rbio_orig_end_io(rbio, errno_to_blk_status(ret)); 2285*d817ce35SQu Wenruo } 2286*d817ce35SQu Wenruo 2287d31968d9SQu Wenruo /* 2288d31968d9SQu Wenruo * reads everything we need off the disk to reconstruct 2289d31968d9SQu Wenruo * the parity. endio handlers trigger final reconstruction 2290d31968d9SQu Wenruo * when the IO is done. 2291d31968d9SQu Wenruo * 2292d31968d9SQu Wenruo * This is used both for reads from the higher layers and for 2293d31968d9SQu Wenruo * parity construction required to finish a rmw cycle. 
2294d31968d9SQu Wenruo */ 2295d31968d9SQu Wenruo static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) 2296d31968d9SQu Wenruo { 2297d31968d9SQu Wenruo int bios_to_read = 0; 2298d31968d9SQu Wenruo struct bio_list bio_list; 2299d31968d9SQu Wenruo int ret; 2300d31968d9SQu Wenruo struct bio *bio; 2301d31968d9SQu Wenruo 2302d31968d9SQu Wenruo bio_list_init(&bio_list); 2303d31968d9SQu Wenruo 2304d31968d9SQu Wenruo ret = alloc_rbio_pages(rbio); 2305d31968d9SQu Wenruo if (ret) 2306d31968d9SQu Wenruo goto cleanup; 2307d31968d9SQu Wenruo 2308d31968d9SQu Wenruo atomic_set(&rbio->error, 0); 2309d31968d9SQu Wenruo 2310d31968d9SQu Wenruo ret = recover_assemble_read_bios(rbio, &bio_list); 2311d31968d9SQu Wenruo if (ret < 0) 2312d31968d9SQu Wenruo goto cleanup; 231353b381b3SDavid Woodhouse 231453b381b3SDavid Woodhouse bios_to_read = bio_list_size(&bio_list); 231553b381b3SDavid Woodhouse if (!bios_to_read) { 231653b381b3SDavid Woodhouse /* 231753b381b3SDavid Woodhouse * we might have no bios to read just because the pages 231853b381b3SDavid Woodhouse * were up to date, or we might have no bios to read because 231953b381b3SDavid Woodhouse * the devices were gone. 232053b381b3SDavid Woodhouse */ 23214c664611SQu Wenruo if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) { 232253b381b3SDavid Woodhouse __raid_recover_end_io(rbio); 2323813f8a0eSNikolay Borisov return 0; 232453b381b3SDavid Woodhouse } else { 232553b381b3SDavid Woodhouse goto cleanup; 232653b381b3SDavid Woodhouse } 232753b381b3SDavid Woodhouse } 232853b381b3SDavid Woodhouse 232953b381b3SDavid Woodhouse /* 23304c664611SQu Wenruo * The bioc may be freed once we submit the last bio. Make sure not to 23314c664611SQu Wenruo * touch it after that. 233253b381b3SDavid Woodhouse */ 2333b89e1b01SMiao Xie atomic_set(&rbio->stripes_pending, bios_to_read); 2334d34e123dSChristoph Hellwig INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work); 2335bf28a605SNikolay Borisov while ((bio = bio_list_pop(&bio_list))) { 2336d34e123dSChristoph Hellwig bio->bi_end_io = raid56_bio_end_io; 233753b381b3SDavid Woodhouse 2338b8bea09aSQu Wenruo if (trace_raid56_scrub_read_recover_enabled()) { 2339b8bea09aSQu Wenruo struct raid56_bio_trace_info trace_info = { 0 }; 234053b381b3SDavid Woodhouse 2341b8bea09aSQu Wenruo bio_get_trace_info(rbio, bio, &trace_info); 2342b8bea09aSQu Wenruo trace_raid56_scrub_read_recover(rbio, bio, &trace_info); 2343b8bea09aSQu Wenruo } 23444e49ea4aSMike Christie submit_bio(bio); 234553b381b3SDavid Woodhouse } 2346813f8a0eSNikolay Borisov 234753b381b3SDavid Woodhouse return 0; 234853b381b3SDavid Woodhouse 234953b381b3SDavid Woodhouse cleanup: 2350b4ee1782SOmar Sandoval if (rbio->operation == BTRFS_RBIO_READ_REBUILD || 2351b4ee1782SOmar Sandoval rbio->operation == BTRFS_RBIO_REBUILD_MISSING) 235258efbc9fSOmar Sandoval rbio_orig_end_io(rbio, BLK_STS_IOERR); 2353785884fcSLiu Bo 2354785884fcSLiu Bo while ((bio = bio_list_pop(&bio_list))) 2355785884fcSLiu Bo bio_put(bio); 2356785884fcSLiu Bo 235753b381b3SDavid Woodhouse return -EIO; 235853b381b3SDavid Woodhouse } 235953b381b3SDavid Woodhouse 236053b381b3SDavid Woodhouse /* 236153b381b3SDavid Woodhouse * the main entry point for reads from the higher layers. This 236253b381b3SDavid Woodhouse * is really only called when the normal read path had a failure, 236353b381b3SDavid Woodhouse * so we assume the bio they send down corresponds to a failed part 236453b381b3SDavid Woodhouse * of the drive. 
236553b381b3SDavid Woodhouse */ 23666065fd95SChristoph Hellwig void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, 2367f1c29379SChristoph Hellwig int mirror_num) 236853b381b3SDavid Woodhouse { 23696a258d72SQu Wenruo struct btrfs_fs_info *fs_info = bioc->fs_info; 237053b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio; 237153b381b3SDavid Woodhouse 2372ff18a4afSChristoph Hellwig rbio = alloc_rbio(fs_info, bioc); 2373af8e2d1dSMiao Xie if (IS_ERR(rbio)) { 23746065fd95SChristoph Hellwig bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); 2375*d817ce35SQu Wenruo bio_endio(bio); 2376*d817ce35SQu Wenruo return; 2377af8e2d1dSMiao Xie } 237853b381b3SDavid Woodhouse 23791b94b556SMiao Xie rbio->operation = BTRFS_RBIO_READ_REBUILD; 2380bd8f7e62SQu Wenruo rbio_add_bio(rbio, bio); 238153b381b3SDavid Woodhouse 238253b381b3SDavid Woodhouse rbio->faila = find_logical_bio_stripe(rbio, bio); 238353b381b3SDavid Woodhouse if (rbio->faila == -1) { 23840b246afaSJeff Mahoney btrfs_warn(fs_info, 23854c664611SQu Wenruo "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)", 23861201b58bSDavid Sterba __func__, bio->bi_iter.bi_sector << 9, 23874c664611SQu Wenruo (u64)bio->bi_iter.bi_size, bioc->map_type); 2388ff2b64a2SQu Wenruo free_raid_bio(rbio); 23896065fd95SChristoph Hellwig bio->bi_status = BLK_STS_IOERR; 2390*d817ce35SQu Wenruo bio_endio(bio); 2391*d817ce35SQu Wenruo return; 239253b381b3SDavid Woodhouse } 239353b381b3SDavid Woodhouse 239453b381b3SDavid Woodhouse /* 23958810f751SLiu Bo * Loop retry: 23968810f751SLiu Bo * for 'mirror == 2', reconstruct from all other stripes. 23978810f751SLiu Bo * for 'mirror_num > 2', select a stripe to fail on every retry. 239853b381b3SDavid Woodhouse */ 23998810f751SLiu Bo if (mirror_num > 2) { 24008810f751SLiu Bo /* 24018810f751SLiu Bo * 'mirror == 3' is to fail the p stripe and 24028810f751SLiu Bo * reconstruct from the q stripe. 'mirror > 3' is to 24038810f751SLiu Bo * fail a data stripe and reconstruct from p+q stripe. 24048810f751SLiu Bo */ 24058810f751SLiu Bo rbio->failb = rbio->real_stripes - (mirror_num - 1); 24068810f751SLiu Bo ASSERT(rbio->failb > 0); 24078810f751SLiu Bo if (rbio->failb <= rbio->faila) 24088810f751SLiu Bo rbio->failb--; 24098810f751SLiu Bo } 241053b381b3SDavid Woodhouse 2411*d817ce35SQu Wenruo start_async_work(rbio, recover_rbio_work); 241253b381b3SDavid Woodhouse } 241353b381b3SDavid Woodhouse 2414385de0efSChristoph Hellwig static void rmw_work(struct work_struct *work) 241553b381b3SDavid Woodhouse { 241653b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio; 241753b381b3SDavid Woodhouse 241853b381b3SDavid Woodhouse rbio = container_of(work, struct btrfs_raid_bio, work); 241953b381b3SDavid Woodhouse raid56_rmw_stripe(rbio); 242053b381b3SDavid Woodhouse } 242153b381b3SDavid Woodhouse 24225a6ac9eaSMiao Xie /* 24235a6ac9eaSMiao Xie * The following code is used to scrub/replace the parity stripe 24245a6ac9eaSMiao Xie * 24254c664611SQu Wenruo * Caller must have already increased bio_counter for getting @bioc. 2426ae6529c3SQu Wenruo * 24275a6ac9eaSMiao Xie * Note: We need make sure all the pages that add into the scrub/replace 24285a6ac9eaSMiao Xie * raid bio are correct and not be changed during the scrub/replace. That 24295a6ac9eaSMiao Xie * is those pages just hold metadata or file data with checksum. 
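 *
 * A sketch of the expected call sequence, inferred from how the helpers
 * below fit together rather than quoted from any particular caller: build
 * a zero-size bio to carry the completion, then
 *
 *	rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, scrub_dev,
 *					      dbitmap, stripe_nsectors);
 *	raid56_add_scrub_pages(rbio, page, pgoff, logical);
 *	raid56_parity_submit_scrub_rbio(rbio);
 *
 * where raid56_add_scrub_pages() is optional per sector: it only donates
 * data pages the caller already holds so the rbio does not have to read
 * them back from disk.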
24305a6ac9eaSMiao Xie */ 24315a6ac9eaSMiao Xie 24326a258d72SQu Wenruo struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, 24336a258d72SQu Wenruo struct btrfs_io_context *bioc, 2434ff18a4afSChristoph Hellwig struct btrfs_device *scrub_dev, 24355a6ac9eaSMiao Xie unsigned long *dbitmap, int stripe_nsectors) 24365a6ac9eaSMiao Xie { 24376a258d72SQu Wenruo struct btrfs_fs_info *fs_info = bioc->fs_info; 24385a6ac9eaSMiao Xie struct btrfs_raid_bio *rbio; 24395a6ac9eaSMiao Xie int i; 24405a6ac9eaSMiao Xie 2441ff18a4afSChristoph Hellwig rbio = alloc_rbio(fs_info, bioc); 24425a6ac9eaSMiao Xie if (IS_ERR(rbio)) 24435a6ac9eaSMiao Xie return NULL; 24445a6ac9eaSMiao Xie bio_list_add(&rbio->bio_list, bio); 24455a6ac9eaSMiao Xie /* 24465a6ac9eaSMiao Xie * This is a special bio which is used to hold the completion handler 24475a6ac9eaSMiao Xie * and make the scrub rbio is similar to the other types 24485a6ac9eaSMiao Xie */ 24495a6ac9eaSMiao Xie ASSERT(!bio->bi_iter.bi_size); 24505a6ac9eaSMiao Xie rbio->operation = BTRFS_RBIO_PARITY_SCRUB; 24515a6ac9eaSMiao Xie 24529cd3a7ebSLiu Bo /* 24534c664611SQu Wenruo * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted 24549cd3a7ebSLiu Bo * to the end position, so this search can start from the first parity 24559cd3a7ebSLiu Bo * stripe. 24569cd3a7ebSLiu Bo */ 24579cd3a7ebSLiu Bo for (i = rbio->nr_data; i < rbio->real_stripes; i++) { 24584c664611SQu Wenruo if (bioc->stripes[i].dev == scrub_dev) { 24595a6ac9eaSMiao Xie rbio->scrubp = i; 24605a6ac9eaSMiao Xie break; 24615a6ac9eaSMiao Xie } 24625a6ac9eaSMiao Xie } 24639cd3a7ebSLiu Bo ASSERT(i < rbio->real_stripes); 24645a6ac9eaSMiao Xie 2465c67c68ebSQu Wenruo bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors); 24665a6ac9eaSMiao Xie return rbio; 24675a6ac9eaSMiao Xie } 24685a6ac9eaSMiao Xie 2469b4ee1782SOmar Sandoval /* Used for both parity scrub and missing. */ 2470b4ee1782SOmar Sandoval void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, 24716346f6bfSQu Wenruo unsigned int pgoff, u64 logical) 24725a6ac9eaSMiao Xie { 24736346f6bfSQu Wenruo const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 24745a6ac9eaSMiao Xie int stripe_offset; 24755a6ac9eaSMiao Xie int index; 24765a6ac9eaSMiao Xie 24774c664611SQu Wenruo ASSERT(logical >= rbio->bioc->raid_map[0]); 24786346f6bfSQu Wenruo ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] + 2479ff18a4afSChristoph Hellwig BTRFS_STRIPE_LEN * rbio->nr_data); 24804c664611SQu Wenruo stripe_offset = (int)(logical - rbio->bioc->raid_map[0]); 24816346f6bfSQu Wenruo index = stripe_offset / sectorsize; 24826346f6bfSQu Wenruo rbio->bio_sectors[index].page = page; 24836346f6bfSQu Wenruo rbio->bio_sectors[index].pgoff = pgoff; 24845a6ac9eaSMiao Xie } 24855a6ac9eaSMiao Xie 24865a6ac9eaSMiao Xie /* 24875a6ac9eaSMiao Xie * We just scrub the parity that we have correct data on the same horizontal, 24885a6ac9eaSMiao Xie * so we needn't allocate all pages for all the stripes. 
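 *
 * A worked example of the allocation below, assuming a 4K sectorsize, a 4K
 * PAGE_SIZE (so one sector per page) and 16 sectors per stripe:
 * total_sector_nr walks every (stripe, sector) pair, sectornr =
 * total_sector_nr % 16 selects the bit tested in dbitmap, and index =
 * (total_sector_nr * 4096) >> PAGE_SHIFT collapses back to total_sector_nr,
 * so exactly one page per stripe is allocated for each sector number whose
 * bit is set in dbitmap and nothing is allocated for the rest.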
24895a6ac9eaSMiao Xie */ 24905a6ac9eaSMiao Xie static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) 24915a6ac9eaSMiao Xie { 24923907ce29SQu Wenruo const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 2493aee35e4bSQu Wenruo int total_sector_nr; 24945a6ac9eaSMiao Xie 2495aee35e4bSQu Wenruo for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; 2496aee35e4bSQu Wenruo total_sector_nr++) { 24973907ce29SQu Wenruo struct page *page; 2498aee35e4bSQu Wenruo int sectornr = total_sector_nr % rbio->stripe_nsectors; 2499aee35e4bSQu Wenruo int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT; 25003907ce29SQu Wenruo 2501aee35e4bSQu Wenruo if (!test_bit(sectornr, &rbio->dbitmap)) 2502aee35e4bSQu Wenruo continue; 25035a6ac9eaSMiao Xie if (rbio->stripe_pages[index]) 25045a6ac9eaSMiao Xie continue; 2505b0ee5e1eSDavid Sterba page = alloc_page(GFP_NOFS); 25065a6ac9eaSMiao Xie if (!page) 25075a6ac9eaSMiao Xie return -ENOMEM; 25085a6ac9eaSMiao Xie rbio->stripe_pages[index] = page; 25095a6ac9eaSMiao Xie } 2510eb357060SQu Wenruo index_stripe_sectors(rbio); 25115a6ac9eaSMiao Xie return 0; 25125a6ac9eaSMiao Xie } 25135a6ac9eaSMiao Xie 25145a6ac9eaSMiao Xie static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, 25155a6ac9eaSMiao Xie int need_check) 25165a6ac9eaSMiao Xie { 25174c664611SQu Wenruo struct btrfs_io_context *bioc = rbio->bioc; 251846900662SQu Wenruo const u32 sectorsize = bioc->fs_info->sectorsize; 25191389053eSKees Cook void **pointers = rbio->finish_pointers; 2520c67c68ebSQu Wenruo unsigned long *pbitmap = &rbio->finish_pbitmap; 25215a6ac9eaSMiao Xie int nr_data = rbio->nr_data; 25225a6ac9eaSMiao Xie int stripe; 25233e77605dSQu Wenruo int sectornr; 2524c17af965SDavid Sterba bool has_qstripe; 252546900662SQu Wenruo struct sector_ptr p_sector = { 0 }; 252646900662SQu Wenruo struct sector_ptr q_sector = { 0 }; 25275a6ac9eaSMiao Xie struct bio_list bio_list; 25285a6ac9eaSMiao Xie struct bio *bio; 252976035976SMiao Xie int is_replace = 0; 25305a6ac9eaSMiao Xie int ret; 25315a6ac9eaSMiao Xie 25325a6ac9eaSMiao Xie bio_list_init(&bio_list); 25335a6ac9eaSMiao Xie 2534c17af965SDavid Sterba if (rbio->real_stripes - rbio->nr_data == 1) 2535c17af965SDavid Sterba has_qstripe = false; 2536c17af965SDavid Sterba else if (rbio->real_stripes - rbio->nr_data == 2) 2537c17af965SDavid Sterba has_qstripe = true; 2538c17af965SDavid Sterba else 25395a6ac9eaSMiao Xie BUG(); 25405a6ac9eaSMiao Xie 25414c664611SQu Wenruo if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) { 254276035976SMiao Xie is_replace = 1; 2543c67c68ebSQu Wenruo bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors); 254476035976SMiao Xie } 254576035976SMiao Xie 25465a6ac9eaSMiao Xie /* 25475a6ac9eaSMiao Xie * Because the higher layers(scrubber) are unlikely to 25485a6ac9eaSMiao Xie * use this area of the disk again soon, so don't cache 25495a6ac9eaSMiao Xie * it. 
25505a6ac9eaSMiao Xie */ 25515a6ac9eaSMiao Xie clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 25525a6ac9eaSMiao Xie 25535a6ac9eaSMiao Xie if (!need_check) 25545a6ac9eaSMiao Xie goto writeback; 25555a6ac9eaSMiao Xie 255646900662SQu Wenruo p_sector.page = alloc_page(GFP_NOFS); 255746900662SQu Wenruo if (!p_sector.page) 25585a6ac9eaSMiao Xie goto cleanup; 255946900662SQu Wenruo p_sector.pgoff = 0; 256046900662SQu Wenruo p_sector.uptodate = 1; 25615a6ac9eaSMiao Xie 2562c17af965SDavid Sterba if (has_qstripe) { 2563d70cef0dSIra Weiny /* RAID6, allocate and map temp space for the Q stripe */ 256446900662SQu Wenruo q_sector.page = alloc_page(GFP_NOFS); 256546900662SQu Wenruo if (!q_sector.page) { 256646900662SQu Wenruo __free_page(p_sector.page); 256746900662SQu Wenruo p_sector.page = NULL; 25685a6ac9eaSMiao Xie goto cleanup; 25695a6ac9eaSMiao Xie } 257046900662SQu Wenruo q_sector.pgoff = 0; 257146900662SQu Wenruo q_sector.uptodate = 1; 257246900662SQu Wenruo pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page); 25735a6ac9eaSMiao Xie } 25745a6ac9eaSMiao Xie 25755a6ac9eaSMiao Xie atomic_set(&rbio->error, 0); 25765a6ac9eaSMiao Xie 2577d70cef0dSIra Weiny /* Map the parity stripe just once */ 257846900662SQu Wenruo pointers[nr_data] = kmap_local_page(p_sector.page); 2579d70cef0dSIra Weiny 2580c67c68ebSQu Wenruo for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { 258146900662SQu Wenruo struct sector_ptr *sector; 25825a6ac9eaSMiao Xie void *parity; 258346900662SQu Wenruo 25845a6ac9eaSMiao Xie /* first collect one page from each data stripe */ 25855a6ac9eaSMiao Xie for (stripe = 0; stripe < nr_data; stripe++) { 258646900662SQu Wenruo sector = sector_in_rbio(rbio, stripe, sectornr, 0); 258746900662SQu Wenruo pointers[stripe] = kmap_local_page(sector->page) + 258846900662SQu Wenruo sector->pgoff; 25895a6ac9eaSMiao Xie } 25905a6ac9eaSMiao Xie 2591c17af965SDavid Sterba if (has_qstripe) { 2592d70cef0dSIra Weiny /* RAID6, call the library function to fill in our P/Q */ 259346900662SQu Wenruo raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, 25945a6ac9eaSMiao Xie pointers); 25955a6ac9eaSMiao Xie } else { 25965a6ac9eaSMiao Xie /* raid5 */ 259746900662SQu Wenruo memcpy(pointers[nr_data], pointers[0], sectorsize); 259846900662SQu Wenruo run_xor(pointers + 1, nr_data - 1, sectorsize); 25995a6ac9eaSMiao Xie } 26005a6ac9eaSMiao Xie 260101327610SNicholas D Steeves /* Check scrubbing parity and repair it */ 260246900662SQu Wenruo sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); 260346900662SQu Wenruo parity = kmap_local_page(sector->page) + sector->pgoff; 260446900662SQu Wenruo if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0) 260546900662SQu Wenruo memcpy(parity, pointers[rbio->scrubp], sectorsize); 26065a6ac9eaSMiao Xie else 26075a6ac9eaSMiao Xie /* Parity is right, needn't writeback */ 2608c67c68ebSQu Wenruo bitmap_clear(&rbio->dbitmap, sectornr, 1); 260958c1a35cSIra Weiny kunmap_local(parity); 26105a6ac9eaSMiao Xie 261194a0b58dSIra Weiny for (stripe = nr_data - 1; stripe >= 0; stripe--) 261294a0b58dSIra Weiny kunmap_local(pointers[stripe]); 26135a6ac9eaSMiao Xie } 26145a6ac9eaSMiao Xie 261594a0b58dSIra Weiny kunmap_local(pointers[nr_data]); 261646900662SQu Wenruo __free_page(p_sector.page); 261746900662SQu Wenruo p_sector.page = NULL; 261846900662SQu Wenruo if (q_sector.page) { 261994a0b58dSIra Weiny kunmap_local(pointers[rbio->real_stripes - 1]); 262046900662SQu Wenruo __free_page(q_sector.page); 262146900662SQu Wenruo q_sector.page = NULL; 2622d70cef0dSIra 
Weiny } 26235a6ac9eaSMiao Xie 26245a6ac9eaSMiao Xie writeback: 26255a6ac9eaSMiao Xie /* 26265a6ac9eaSMiao Xie * time to start writing. Make bios for everything from the 26275a6ac9eaSMiao Xie * higher layers (the bio_list in our rbio) and our p/q. Ignore 26285a6ac9eaSMiao Xie * everything else. 26295a6ac9eaSMiao Xie */ 2630c67c68ebSQu Wenruo for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { 26313e77605dSQu Wenruo struct sector_ptr *sector; 26325a6ac9eaSMiao Xie 26333e77605dSQu Wenruo sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); 26343e77605dSQu Wenruo ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp, 2635ff18a4afSChristoph Hellwig sectornr, REQ_OP_WRITE); 26365a6ac9eaSMiao Xie if (ret) 26375a6ac9eaSMiao Xie goto cleanup; 26385a6ac9eaSMiao Xie } 26395a6ac9eaSMiao Xie 264076035976SMiao Xie if (!is_replace) 264176035976SMiao Xie goto submit_write; 264276035976SMiao Xie 26433e77605dSQu Wenruo for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) { 26443e77605dSQu Wenruo struct sector_ptr *sector; 264576035976SMiao Xie 26463e77605dSQu Wenruo sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); 26473e77605dSQu Wenruo ret = rbio_add_io_sector(rbio, &bio_list, sector, 26484c664611SQu Wenruo bioc->tgtdev_map[rbio->scrubp], 2649ff18a4afSChristoph Hellwig sectornr, REQ_OP_WRITE); 265076035976SMiao Xie if (ret) 265176035976SMiao Xie goto cleanup; 265276035976SMiao Xie } 265376035976SMiao Xie 265476035976SMiao Xie submit_write: 26555a6ac9eaSMiao Xie nr_data = bio_list_size(&bio_list); 26565a6ac9eaSMiao Xie if (!nr_data) { 26575a6ac9eaSMiao Xie /* Every parity is right */ 265858efbc9fSOmar Sandoval rbio_orig_end_io(rbio, BLK_STS_OK); 26595a6ac9eaSMiao Xie return; 26605a6ac9eaSMiao Xie } 26615a6ac9eaSMiao Xie 26625a6ac9eaSMiao Xie atomic_set(&rbio->stripes_pending, nr_data); 26635a6ac9eaSMiao Xie 2664bf28a605SNikolay Borisov while ((bio = bio_list_pop(&bio_list))) { 2665a6111d11SZhao Lei bio->bi_end_io = raid_write_end_io; 26664e49ea4aSMike Christie 2667b8bea09aSQu Wenruo if (trace_raid56_scrub_write_stripe_enabled()) { 2668b8bea09aSQu Wenruo struct raid56_bio_trace_info trace_info = { 0 }; 2669b8bea09aSQu Wenruo 2670b8bea09aSQu Wenruo bio_get_trace_info(rbio, bio, &trace_info); 2671b8bea09aSQu Wenruo trace_raid56_scrub_write_stripe(rbio, bio, &trace_info); 2672b8bea09aSQu Wenruo } 26734e49ea4aSMike Christie submit_bio(bio); 26745a6ac9eaSMiao Xie } 26755a6ac9eaSMiao Xie return; 26765a6ac9eaSMiao Xie 26775a6ac9eaSMiao Xie cleanup: 267858efbc9fSOmar Sandoval rbio_orig_end_io(rbio, BLK_STS_IOERR); 2679785884fcSLiu Bo 2680785884fcSLiu Bo while ((bio = bio_list_pop(&bio_list))) 2681785884fcSLiu Bo bio_put(bio); 26825a6ac9eaSMiao Xie } 26835a6ac9eaSMiao Xie 26845a6ac9eaSMiao Xie static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) 26855a6ac9eaSMiao Xie { 26865a6ac9eaSMiao Xie if (stripe >= 0 && stripe < rbio->nr_data) 26875a6ac9eaSMiao Xie return 1; 26885a6ac9eaSMiao Xie return 0; 26895a6ac9eaSMiao Xie } 26905a6ac9eaSMiao Xie 26915a6ac9eaSMiao Xie /* 26925a6ac9eaSMiao Xie * While we're doing the parity check and repair, we could have errors 26935a6ac9eaSMiao Xie * in reading pages off the disk. This checks for errors and if we're 26945a6ac9eaSMiao Xie * not able to read the page it'll trigger parity reconstruction. 
The 26955a6ac9eaSMiao Xie * parity scrub will be finished after we've reconstructed the failed 26965a6ac9eaSMiao Xie * stripes 26975a6ac9eaSMiao Xie */ 26985a6ac9eaSMiao Xie static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) 26995a6ac9eaSMiao Xie { 27004c664611SQu Wenruo if (atomic_read(&rbio->error) > rbio->bioc->max_errors) 27015a6ac9eaSMiao Xie goto cleanup; 27025a6ac9eaSMiao Xie 27035a6ac9eaSMiao Xie if (rbio->faila >= 0 || rbio->failb >= 0) { 27045a6ac9eaSMiao Xie int dfail = 0, failp = -1; 27055a6ac9eaSMiao Xie 27065a6ac9eaSMiao Xie if (is_data_stripe(rbio, rbio->faila)) 27075a6ac9eaSMiao Xie dfail++; 27085a6ac9eaSMiao Xie else if (is_parity_stripe(rbio->faila)) 27095a6ac9eaSMiao Xie failp = rbio->faila; 27105a6ac9eaSMiao Xie 27115a6ac9eaSMiao Xie if (is_data_stripe(rbio, rbio->failb)) 27125a6ac9eaSMiao Xie dfail++; 27135a6ac9eaSMiao Xie else if (is_parity_stripe(rbio->failb)) 27145a6ac9eaSMiao Xie failp = rbio->failb; 27155a6ac9eaSMiao Xie 27165a6ac9eaSMiao Xie /* 27175a6ac9eaSMiao Xie * Because we can not use a scrubbing parity to repair 27185a6ac9eaSMiao Xie * the data, so the capability of the repair is declined. 27195a6ac9eaSMiao Xie * (In the case of RAID5, we can not repair anything) 27205a6ac9eaSMiao Xie */ 27214c664611SQu Wenruo if (dfail > rbio->bioc->max_errors - 1) 27225a6ac9eaSMiao Xie goto cleanup; 27235a6ac9eaSMiao Xie 27245a6ac9eaSMiao Xie /* 27255a6ac9eaSMiao Xie * If all data is good, only parity is correctly, just 27265a6ac9eaSMiao Xie * repair the parity. 27275a6ac9eaSMiao Xie */ 27285a6ac9eaSMiao Xie if (dfail == 0) { 27295a6ac9eaSMiao Xie finish_parity_scrub(rbio, 0); 27305a6ac9eaSMiao Xie return; 27315a6ac9eaSMiao Xie } 27325a6ac9eaSMiao Xie 27335a6ac9eaSMiao Xie /* 27345a6ac9eaSMiao Xie * Here means we got one corrupted data stripe and one 27355a6ac9eaSMiao Xie * corrupted parity on RAID6, if the corrupted parity 273601327610SNicholas D Steeves * is scrubbing parity, luckily, use the other one to repair 27375a6ac9eaSMiao Xie * the data, or we can not repair the data stripe. 27385a6ac9eaSMiao Xie */ 27395a6ac9eaSMiao Xie if (failp != rbio->scrubp) 27405a6ac9eaSMiao Xie goto cleanup; 27415a6ac9eaSMiao Xie 27425a6ac9eaSMiao Xie __raid_recover_end_io(rbio); 27435a6ac9eaSMiao Xie } else { 27445a6ac9eaSMiao Xie finish_parity_scrub(rbio, 1); 27455a6ac9eaSMiao Xie } 27465a6ac9eaSMiao Xie return; 27475a6ac9eaSMiao Xie 27485a6ac9eaSMiao Xie cleanup: 274958efbc9fSOmar Sandoval rbio_orig_end_io(rbio, BLK_STS_IOERR); 27505a6ac9eaSMiao Xie } 27515a6ac9eaSMiao Xie 27525a6ac9eaSMiao Xie /* 27535a6ac9eaSMiao Xie * end io for the read phase of the rmw cycle. All the bios here are physical 27545a6ac9eaSMiao Xie * stripe bios we've read from the disk so we can recalculate the parity of the 27555a6ac9eaSMiao Xie * stripe. 
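 *
 * Stated as a table (this is only a restatement of
 * validate_rbio_for_parity_scrub() above, checked in roughly this order):
 *
 *   more failed stripes than bioc->max_errors      -> fail with -EIO
 *   more failed data stripes than max_errors - 1   -> fail with -EIO, since
 *                                                     the parity under scrub
 *                                                     cannot serve as a
 *                                                     rebuild source
 *   no stripe failed at all                        -> check and repair parity
 *   only parity failed, all data fine              -> just rewrite the parity
 *   a data stripe failed together with the parity
 *   being scrubbed                                 -> rebuild the data through
 *                                                     the recovery code first
 *   a data stripe failed in any other combination  -> fail with -EIO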
27565a6ac9eaSMiao Xie * 27575a6ac9eaSMiao Xie * This will usually kick off finish_rmw once all the bios are read in, but it 27585a6ac9eaSMiao Xie * may trigger parity reconstruction if we had any errors along the way 27595a6ac9eaSMiao Xie */ 2760d34e123dSChristoph Hellwig static void raid56_parity_scrub_end_io_work(struct work_struct *work) 27615a6ac9eaSMiao Xie { 2762d34e123dSChristoph Hellwig struct btrfs_raid_bio *rbio = 2763d34e123dSChristoph Hellwig container_of(work, struct btrfs_raid_bio, end_io_work); 27645a6ac9eaSMiao Xie 27655a6ac9eaSMiao Xie /* 2766d34e123dSChristoph Hellwig * This will normally call finish_rmw to start our write, but if there 2767d34e123dSChristoph Hellwig * are any failed stripes we'll reconstruct from parity first 27685a6ac9eaSMiao Xie */ 27695a6ac9eaSMiao Xie validate_rbio_for_parity_scrub(rbio); 27705a6ac9eaSMiao Xie } 27715a6ac9eaSMiao Xie 27725a6ac9eaSMiao Xie static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) 27735a6ac9eaSMiao Xie { 27745a6ac9eaSMiao Xie int bios_to_read = 0; 27755a6ac9eaSMiao Xie struct bio_list bio_list; 27765a6ac9eaSMiao Xie int ret; 27771c10702eSQu Wenruo int total_sector_nr; 27785a6ac9eaSMiao Xie struct bio *bio; 27795a6ac9eaSMiao Xie 2780785884fcSLiu Bo bio_list_init(&bio_list); 2781785884fcSLiu Bo 27825a6ac9eaSMiao Xie ret = alloc_rbio_essential_pages(rbio); 27835a6ac9eaSMiao Xie if (ret) 27845a6ac9eaSMiao Xie goto cleanup; 27855a6ac9eaSMiao Xie 27865a6ac9eaSMiao Xie atomic_set(&rbio->error, 0); 27871c10702eSQu Wenruo /* Build a list of bios to read all the missing parts. */ 27881c10702eSQu Wenruo for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; 27891c10702eSQu Wenruo total_sector_nr++) { 27901c10702eSQu Wenruo int sectornr = total_sector_nr % rbio->stripe_nsectors; 27911c10702eSQu Wenruo int stripe = total_sector_nr / rbio->stripe_nsectors; 27923e77605dSQu Wenruo struct sector_ptr *sector; 27931c10702eSQu Wenruo 27941c10702eSQu Wenruo /* No data in the vertical stripe, no need to read. */ 27951c10702eSQu Wenruo if (!test_bit(sectornr, &rbio->dbitmap)) 27961c10702eSQu Wenruo continue; 27971c10702eSQu Wenruo 27985a6ac9eaSMiao Xie /* 27991c10702eSQu Wenruo * We want to find all the sectors missing from the rbio and 28001c10702eSQu Wenruo * read them from the disk. If sector_in_rbio() finds a sector 28011c10702eSQu Wenruo * in the bio list we don't need to read it off the stripe. 28025a6ac9eaSMiao Xie */ 28033e77605dSQu Wenruo sector = sector_in_rbio(rbio, stripe, sectornr, 1); 28043e77605dSQu Wenruo if (sector) 28055a6ac9eaSMiao Xie continue; 28065a6ac9eaSMiao Xie 28073e77605dSQu Wenruo sector = rbio_stripe_sector(rbio, stripe, sectornr); 28085a6ac9eaSMiao Xie /* 28091c10702eSQu Wenruo * The bio cache may have handed us an uptodate sector. If so, 28101c10702eSQu Wenruo * use it. 28115a6ac9eaSMiao Xie */ 28123e77605dSQu Wenruo if (sector->uptodate) 28135a6ac9eaSMiao Xie continue; 28145a6ac9eaSMiao Xie 28151c10702eSQu Wenruo ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, 2816ff18a4afSChristoph Hellwig sectornr, REQ_OP_READ); 28175a6ac9eaSMiao Xie if (ret) 28185a6ac9eaSMiao Xie goto cleanup; 28195a6ac9eaSMiao Xie } 28205a6ac9eaSMiao Xie 28215a6ac9eaSMiao Xie bios_to_read = bio_list_size(&bio_list); 28225a6ac9eaSMiao Xie if (!bios_to_read) { 28235a6ac9eaSMiao Xie /* 28245a6ac9eaSMiao Xie * this can happen if others have merged with 28255a6ac9eaSMiao Xie * us, it means there is nothing left to read. 
28265a6ac9eaSMiao Xie * But if there are missing devices it may not be 28275a6ac9eaSMiao Xie * safe to do the full stripe write yet. 28285a6ac9eaSMiao Xie */ 28295a6ac9eaSMiao Xie goto finish; 28305a6ac9eaSMiao Xie } 28315a6ac9eaSMiao Xie 28325a6ac9eaSMiao Xie /* 28334c664611SQu Wenruo * The bioc may be freed once we submit the last bio. Make sure not to 28344c664611SQu Wenruo * touch it after that. 28355a6ac9eaSMiao Xie */ 28365a6ac9eaSMiao Xie atomic_set(&rbio->stripes_pending, bios_to_read); 2837d34e123dSChristoph Hellwig INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work); 2838bf28a605SNikolay Borisov while ((bio = bio_list_pop(&bio_list))) { 2839d34e123dSChristoph Hellwig bio->bi_end_io = raid56_bio_end_io; 28405a6ac9eaSMiao Xie 2841b8bea09aSQu Wenruo if (trace_raid56_scrub_read_enabled()) { 2842b8bea09aSQu Wenruo struct raid56_bio_trace_info trace_info = { 0 }; 28435a6ac9eaSMiao Xie 2844b8bea09aSQu Wenruo bio_get_trace_info(rbio, bio, &trace_info); 2845b8bea09aSQu Wenruo trace_raid56_scrub_read(rbio, bio, &trace_info); 2846b8bea09aSQu Wenruo } 28474e49ea4aSMike Christie submit_bio(bio); 28485a6ac9eaSMiao Xie } 28495a6ac9eaSMiao Xie /* the actual write will happen once the reads are done */ 28505a6ac9eaSMiao Xie return; 28515a6ac9eaSMiao Xie 28525a6ac9eaSMiao Xie cleanup: 285358efbc9fSOmar Sandoval rbio_orig_end_io(rbio, BLK_STS_IOERR); 2854785884fcSLiu Bo 2855785884fcSLiu Bo while ((bio = bio_list_pop(&bio_list))) 2856785884fcSLiu Bo bio_put(bio); 2857785884fcSLiu Bo 28585a6ac9eaSMiao Xie return; 28595a6ac9eaSMiao Xie 28605a6ac9eaSMiao Xie finish: 28615a6ac9eaSMiao Xie validate_rbio_for_parity_scrub(rbio); 28625a6ac9eaSMiao Xie } 28635a6ac9eaSMiao Xie 2864385de0efSChristoph Hellwig static void scrub_parity_work(struct work_struct *work) 28655a6ac9eaSMiao Xie { 28665a6ac9eaSMiao Xie struct btrfs_raid_bio *rbio; 28675a6ac9eaSMiao Xie 28685a6ac9eaSMiao Xie rbio = container_of(work, struct btrfs_raid_bio, work); 28695a6ac9eaSMiao Xie raid56_parity_scrub_stripe(rbio); 28705a6ac9eaSMiao Xie } 28715a6ac9eaSMiao Xie 28725a6ac9eaSMiao Xie void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) 28735a6ac9eaSMiao Xie { 28745a6ac9eaSMiao Xie if (!lock_stripe_add(rbio)) 2875a81b747dSDavid Sterba start_async_work(rbio, scrub_parity_work); 28765a6ac9eaSMiao Xie } 2877b4ee1782SOmar Sandoval 2878b4ee1782SOmar Sandoval /* The following code is used for dev replace of a missing RAID 5/6 device. 
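 *
 * A sketch of the intended usage, assumed from the helpers themselves
 * rather than quoted from a specific caller: allocate a zero-size bio to
 * carry the completion, then
 *
 *	rbio = raid56_alloc_missing_rbio(bio, bioc);
 *	raid56_add_scrub_pages(rbio, page, pgoff, logical);
 *	raid56_submit_missing_rbio(rbio);
 *
 * with raid56_add_scrub_pages() donating the caller's pages for the sectors
 * it cares about, so the recovery machinery above can rebuild what lived on
 * the missing device.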
*/ 2879b4ee1782SOmar Sandoval 2880b4ee1782SOmar Sandoval struct btrfs_raid_bio * 2881ff18a4afSChristoph Hellwig raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc) 2882b4ee1782SOmar Sandoval { 28836a258d72SQu Wenruo struct btrfs_fs_info *fs_info = bioc->fs_info; 2884b4ee1782SOmar Sandoval struct btrfs_raid_bio *rbio; 2885b4ee1782SOmar Sandoval 2886ff18a4afSChristoph Hellwig rbio = alloc_rbio(fs_info, bioc); 2887b4ee1782SOmar Sandoval if (IS_ERR(rbio)) 2888b4ee1782SOmar Sandoval return NULL; 2889b4ee1782SOmar Sandoval 2890b4ee1782SOmar Sandoval rbio->operation = BTRFS_RBIO_REBUILD_MISSING; 2891b4ee1782SOmar Sandoval bio_list_add(&rbio->bio_list, bio); 2892b4ee1782SOmar Sandoval /* 2893b4ee1782SOmar Sandoval * This is a special bio which is used to hold the completion handler 2894b4ee1782SOmar Sandoval * and to make the scrub rbio similar to the other types 2895b4ee1782SOmar Sandoval */ 2896b4ee1782SOmar Sandoval ASSERT(!bio->bi_iter.bi_size); 2897b4ee1782SOmar Sandoval 2898b4ee1782SOmar Sandoval rbio->faila = find_logical_bio_stripe(rbio, bio); 2899b4ee1782SOmar Sandoval if (rbio->faila == -1) { 2900f15fb2cdSQu Wenruo btrfs_warn_rl(fs_info, 2901f15fb2cdSQu Wenruo "can not determine the failed stripe number for full stripe %llu", 2902f15fb2cdSQu Wenruo bioc->raid_map[0]); 2903ff2b64a2SQu Wenruo free_raid_bio(rbio); 2904b4ee1782SOmar Sandoval return NULL; 2905b4ee1782SOmar Sandoval } 2906b4ee1782SOmar Sandoval 2907b4ee1782SOmar Sandoval return rbio; 2908b4ee1782SOmar Sandoval } 2909b4ee1782SOmar Sandoval 2910b4ee1782SOmar Sandoval void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio) 2911b4ee1782SOmar Sandoval { 2912*d817ce35SQu Wenruo start_async_work(rbio, recover_rbio_work); 2913b4ee1782SOmar Sandoval } 2914
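/*
 * Illustrative sketch only, not part of the driver: the per-sector parity
 * check that finish_parity_scrub() performs, reduced to plain C for a
 * RAID5-style stripe.  It assumes nr_data buffers of sectorsize bytes each
 * plus the parity sector as read from disk, with sectorsize capped at 4K
 * for the local buffer; the function name is hypothetical.
 *
 *	static bool parity_sector_matches(void **data, int nr_data,
 *					  const u8 *ondisk_parity,
 *					  u32 sectorsize)
 *	{
 *		u8 expected[SZ_4K];
 *		int d;
 *		u32 i;
 *
 *		memcpy(expected, data[0], sectorsize);
 *		for (d = 1; d < nr_data; d++)
 *			for (i = 0; i < sectorsize; i++)
 *				expected[i] ^= ((const u8 *)data[d])[i];
 *
 *		return memcmp(expected, ondisk_parity, sectorsize) == 0;
 *	}
 *
 * finish_parity_scrub() does the same with run_xor() (or gen_syndrome() for
 * RAID6) on kmapped sector pointers; when the compare fails it copies the
 * recomputed parity over the on-disk copy and leaves the bit set in dbitmap
 * so that sector gets written back, otherwise it clears the bit.
 */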